Dendro  5.01
Dendro in Greek language means tree. The Dendro library is a large scale (262K cores on ORNL's Titan) distributed memory adaptive octree framework. The main goal of Dendro is to perform large scale multiphysics simulations efficeiently in mordern supercomputers. Dendro consists of efficient parallel data structures and algorithms to perform variational ( finite element) methods and finite difference mthods on 2:1 balanced arbitary adaptive octrees which enables the users to perform simulations raning from black holes (binary black hole mergers) to blood flow in human body, where applications ranging from relativity, astrophysics to biomedical engineering.
sfcSort.h
1 //
2 // Created by milinda on 6/29/16.
3 
16 #ifndef SFCSORTBENCH_SFCSORT_H
17 #define SFCSORTBENCH_SFCSORT_H
18 
19 
20 #include <iostream>
21 #include <vector>
22 #include <assert.h>
23 #include "hcurvedata.h"
24 #include "TreeNode.h"
25 #include "treenode2vtk.h"
26 #include "dendro.h"
27 #include "testUtils.h"
28 #include "ompUtils.h"
29 
30 #include <mpi.h>
31 #include <chrono>
32 #include "dtypes.h"
33 #include "parUtils.h"
34 #include <set>
35 #include <unordered_set>
36 #include "dollar.hpp"
37 #include <stdio.h>
38 
39 #ifdef PROFILE_TREE_SORT
40 #include <chrono>
41 // for timer
42 std::vector<double> stats_previous; // stats for the 1st pass of the treeSort.
43 std::vector<double> stats; // contains the last run of the SFC::parSort::SFC_TreeSort.
44 
45 /*
46  * For option 1, 2, 4:
47  * inputSz (min mean max)
48  * splitter_fix_all (min mean max)
49  * splitter_time (min mean max)
50  * all2all1_time (min mean max)
51  * all2all2_time (min mean max)
52  * localSort_time (min mean max)
53  * remove_duplicates_seq (min mean max)
54  * remove duplicates_par (min mean max)
55  * auxBalOct (min mean max)
56  * 2ndPass (min mean max)
57  * Overall total (min mean mex)
58  *
59  * */
60 
61 
62 unsigned int inputSz[3];
63 
64 double splitter_fix_all=0;
65 double splitter_time=0;
66 double all2all1_time=0;
67 double all2all2_time=0;
68 
69 double localSort_time=0;
70 double remove_duplicates_seq=0;
71 double remove_duplicates_par=0;
72 
73 double auxBalOCt_time=0;
74 
75 double construction_time=0;
76 double balancing_time=0;
77 double total_rd=0;
78 
79 auto t1=std::chrono::high_resolution_clock::now();
80 auto t2=std::chrono::high_resolution_clock::now();
81 auto t3=std::chrono::high_resolution_clock::now();
82 auto t4=std::chrono::high_resolution_clock::now();
83 auto t5_sf_staged=std::chrono::high_resolution_clock::now();
84 
85 double stat_property[3]={};
86 
87 #endif
88 
89 
90 typedef unsigned __int128 uint128_t;
91 
92 #ifdef DIM_2
93  #define NUM_CHILDREN 4
94  #define ROTATION_OFFSET 8
95  #define m_uiDim 2
96 #else
97 #define NUM_CHILDREN 8
98 #define ROTATION_OFFSET 16
99 #define m_uiDim 3
100 #endif
101 
102 #define MILLISECOND_CONVERSION 1e3
103 #define ROOT_ROTATION 0
104 
105 
119 #define TS_SORT_ONLY 0
120 #define TS_REMOVE_DUPLICATES 1
121 #define TS_CONSTRUCT_OCTREE 2
122 #define TS_BALANCE_OCTREE 4 // which ensures that balance octree cannote be called without construct octree function.
123 
124 
125 template <typename T>
127 {
128  unsigned char rot_id;
129  unsigned char lev;
130  DendroIntL begin;
131  DendroIntL end;
132 
133  BucketInfo()
134  {
135  rot_id=0;
136  lev=0;
137 
138  }
139 
140  BucketInfo(unsigned char p_rot_id,unsigned char p_lev,DendroIntL p_begin, DendroIntL p_end)
141  {
142  rot_id=p_rot_id;
143  lev=p_lev;
144  begin=p_begin;
145  end=p_end;
146 
147  }
148 
149 };
150 
151 
152 template<typename T>
154 {
155 
156  inline uint64_t splitBy3(unsigned int a) const{
157  uint64_t x=a;
158  x = (x | (x << 16)) & 0x0000FFFF0000FFFF;
159  x = (x | (x << 8)) & 0x00FF00FF00FF00FF;
160  x = (x | (x << 4)) & 0x0F0F0F0F0F0F0F0F;
161  x = (x | (x << 2)) & 0x3333333333333333;
162  x = (x | (x << 1)) & 0x5555555555555555;
163  return x;
164  }
165 
166 
167  inline bool operator()(const T & first, const T &other) const
168  {//$
169 
170 /* return (((first.getLevel()!=other.getLevel()) && (first.getLevel()>other.getLevel()))
171  || ((first.getLevel()==other.getLevel()) && ((first.getX() != other.getX())) && (first.getX() < other.getX()))
172  || ((first.getLevel()==other.getLevel()) &&(first.getX() == other.getX()) && (first.getY() != other.getY()) && (first.getY() < other.getY()))
173  || ((first.getLevel()==other.getLevel()) && (first.getX() == other.getX()) && (first.getY() == other.getY()) && (first.getZ() < other.getZ())));*/
174 
175 
176  /*return (((first.getLevel()==other.getLevel()) || (first.getLevel()<other.getLevel()))
177  && ((first.getLevel()!=other.getLevel()) || ((first.getX() == other.getX())) || (first.getX() > other.getX()))
178  && ((first.getLevel()!=other.getLevel()) || (first.getX() != other.getX()) || (first.getY() == other.getY()) || (first.getY() > other.getY()))
179  && ((first.getLevel()!=other.getLevel()) || (first.getX() != other.getX()) || (first.getY() != other.getY()) || (first.getZ() > other.getZ())));*/
180 
181 
182  uint128_t a1=0;
183  uint128_t a2=0;
184  a1=(((uint128_t)first.getLevel())<<96u) | (((uint128_t)first.getX())<<64u) | (((uint128_t)first.getY())<<32u) |(((uint128_t)first.getZ()));
185  a2=(((uint128_t)other.getLevel())<<96u) | (((uint128_t)other.getX())<<64u) | (((uint128_t)other.getY())<<32u) |(((uint128_t)other.getZ()));
186  return a1<a2;
187 
188 /*
189  if(first.getLevel()!=other.getLevel())
190  {
191  if(state!=(first.getLevel()>other.getLevel())) std::cout<<" lev first"<<first<<" other: "<<other<<std::endl;
192  return (first.getLevel()>other.getLevel());
193  }else
194  {
195  if(first.getX()!=other.getX())
196  {
197  if(state!=(first.getX()<other.getX())) std::cout<<" x first"<<first<<" other: "<<other<<std::endl;
198  return first.getX()<other.getX();
199  }else
200  {
201  if(first.getY()!=other.getY())
202  {
203  if(state!=(first.getY()<other.getY())) std::cout<<" y first"<<first<<" other: "<<other<<std::endl;
204  return first.getY()<other.getY();
205  }else
206  {
207  if(state!=(first.getZ()<other.getZ())) std::cout<<" z first"<<first<<" other: "<<other<<std::endl;
208  return first.getZ()<other.getZ();
209  }
210  }
211  }*/
212 
213  }
214 
215 };
216 template<typename T>
217 struct OctreeHash{
218 
219  inline uint128_t operator() (const T &first) const {
220 
221 /* uint64_t answer1 = 0;
222  answer1 |= splitBy3(node.getX()) | splitBy3(node.getY()) << 1 | splitBy3(node.getZ()) << 2;
223  return answer1;*/
224  /*size_t h1 = std::hash<unsigned int>{}(node.getX());
225  size_t h2 = std::hash<unsigned int>{}(node.getY());
226  size_t h3 = std::hash<unsigned int>{}(node.getZ());
227  size_t h4 = std::hash<unsigned int>{}(node.getLevel());
228  return (h4^(h3^((h2 ^ (h1 << 16))<<16))<<16); // or use boost::hash_combine*/
229  uint128_t a1=0;
230  a1=(((uint128_t)first.getLevel())<<96u) | (((uint128_t)first.getX())<<64u) | (((uint128_t)first.getY())<<32u) |(((uint128_t)first.getZ()));
231  return a1;
232  //return (((uint64_t)h4<<48u)|((uint64_t)h3<<32u)|((uint64_t)h2<<16u)|((uint64_t)h1));
233  }
234 };
235 
236 
237 
238 
239 namespace SFC {
240 
241  namespace seqSort {
242 
243 
244  /*
245  *
246  * Contains the sequential utils and treeSort function implementations.
247  *
248  */
249 
250 
251  //========================================================= Function declaration begin.=========================================================================================
252 
259  template<typename T>
260  inline void SFC_bottomUpBalOctantCreation(std::vector<T> & pNodes);
261 
262 
275  template<typename T>
276  void SFC_treeSort(T* pNodes , DendroIntL n ,std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced, unsigned int pMaxDepthBit,unsigned int pMaxDepth, T& parent, unsigned int rot_id,unsigned int k, unsigned int options);
277 
278 
284  template<typename T>
285  inline void SFC_bucketing(T *pNodes, int lev, unsigned int maxDepth,unsigned char rot_id,DendroIntL &begin, DendroIntL &end, DendroIntL *splitters);
286 
299  template<typename T>
300  void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n , unsigned int pMaxDepthBit,unsigned int pMaxDepth, T& parent, unsigned int rot_id,bool minimum,T& optimal);
301 
302 
315  template<typename T>
316  void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n , unsigned int pMaxDepthBit,unsigned int pMaxDepth, T& parent, unsigned int rot_id,T&min,T&max);
317 
318 
319  //========================================================= Function declaration end.==========================================================================================
320 
321 
322 
323 
324  template<typename T>
325  inline void SFC_bottomUpBalOctantCreation(std::vector<T> & pNodes)
326  {//$
327 
328  if(pNodes.empty()){ return; }
329 
330  T tmpParent;
331  /*std::vector<T> tmpSorted;*/
332  T root(m_uiDim,m_uiMaxDepth);
333  std::set<T,OctreeComp<T> > auxOct(pNodes.begin(),pNodes.end());
334  std::set<T,OctreeComp<T> > parentAux;
335  unsigned int neighbourCount=0;
336 
337  unsigned int newCount=0;
338  unsigned int preSz=0;
339  unsigned int curr=0;
340  unsigned int d_max=(1u<<m_uiMaxDepth);
341 
342 #ifdef DIM_2
343  neighbourCount=8;
344 #else
345  neighbourCount=26;
346 #endif
347 
348  std::pair<typename std::set<T,OctreeComp<T>>::iterator,bool> hint[neighbourCount];
349  std::pair<typename std::set<T,OctreeComp<T>>::iterator,bool> hintParent;
350 
351  /* std::unordered_set<T,OctreeHash<T>> auxOct(pNodes.begin(),pNodes.end());
352  std::pair<typename std::unordered_set<T,OctreeHash<T>>::iterator,bool> hint;*/
353 
354 
355  for(unsigned int w=0;w<pNodes.size();++w){
356  hintParent=parentAux.emplace(pNodes[w].getParent());
357  if(hintParent.second) {
358  tmpParent=*(hintParent.first);
359 
360 #ifdef DIM_2
361 
362  auxOct.emplace(tmpParent.getLeft());
363  auxOct.emplace(tmpParent.getRight());
364  auxOct.emplace(tmpParent.getFront());
365  auxOct.emplace(tmpParent.getBack());
366  auxOct.emplace(tmpParent.getLeftBack());
367  auxOct.emplace(tmpParent.getRightBack());
368  auxOct.emplace(tmpParent.getLeftFront());
369  auxOct.emplace(tmpParent.getRightFront());
370 
371 #else
372 
373 
374  preSz=auxOct.size();
375  hint[0] = auxOct.emplace(tmpParent.getLeft());
376  hint[1] = auxOct.emplace(tmpParent.getLeftBack());
377  hint[2] = auxOct.emplace(tmpParent.getLeftFront());
378  hint[3] = auxOct.emplace(tmpParent.getRight());
379  hint[4] = auxOct.emplace(tmpParent.getRightBack());
380  hint[5] = auxOct.emplace(tmpParent.getRightFront());
381  hint[6] = auxOct.emplace(tmpParent.getBack());
382  hint[7] = auxOct.emplace(tmpParent.getFront());
383  hint[8] = auxOct.emplace(tmpParent.getBottom());
384  hint[9] = auxOct.emplace(tmpParent.getBottomLeft());
385  hint[10] = auxOct.emplace(tmpParent.getBottomLeftBack());
386  hint[11] = auxOct.emplace(tmpParent.getBottomLeftFront());
387  hint[12] = auxOct.emplace(tmpParent.getBottomRight());
388  hint[13] = auxOct.emplace(tmpParent.getBottomRightBack());
389  hint[14] = auxOct.emplace(tmpParent.getBottomRightFront());
390  hint[15] = auxOct.emplace(tmpParent.getBottomBack());
391  hint[16] = auxOct.emplace(tmpParent.getBottomFront());
392  hint[17] = auxOct.emplace(tmpParent.getTop());
393  hint[18] = auxOct.emplace(tmpParent.getTopLeft());
394  hint[19] = auxOct.emplace(tmpParent.getTopLeftBack());
395  hint[20] = auxOct.emplace(tmpParent.getTopLeftFront());
396  hint[21] = auxOct.emplace(tmpParent.getTopRight());
397  hint[22] = auxOct.emplace(tmpParent.getTopRightBack());
398  hint[23] = auxOct.emplace(tmpParent.getTopRightFront());
399  hint[24] = auxOct.emplace(tmpParent.getTopBack());
400  hint[25] = auxOct.emplace(tmpParent.getTopFront());
401 #endif
402 
403  if(auxOct.size()>preSz)
404  {
405  curr=0;
406  pNodes.resize(pNodes.size()+auxOct.size()-preSz);
407  for(unsigned int kk=0;kk<neighbourCount;kk++)
408  {
409  if(hint[kk].second) {
410  pNodes[preSz + curr] = *(hint[kk].first);
411  //pNodes.push_back(*(hint[kk].first));
412  curr++;
413  }
414 
415  }
416  }
417  //std::cout<<"aux Size: "<<auxOct.size()<<" pNodes size: "<<pNodes.size()<<std::endl;
418  }
419 
420 
421  }
422 
423 
424 
425  }
426 
427 
428  template<typename T>
429  void SFC_treeSort(T* pNodes , DendroIntL n ,std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced, unsigned int pMaxDepthBit,unsigned int pMaxDepth, T& parent, unsigned int rot_id,unsigned int k, unsigned int options)
430  {
431 
432  if(n==0) return;
433  register unsigned int cnum;
434  register unsigned int cnum_prev=0;
435  //register unsigned int n=0;
436  unsigned int rotation=0;
437  DendroIntL count[(NUM_CHILDREN+2)]={};
438  unsigned int lev=pMaxDepth-pMaxDepthBit;
439  pMaxDepthBit--;
440  unsigned int x,y,z;
441  T temp;
442  count[0]=0;
443 
444  for (DendroIntL i=0; i< n; ++i) {
445 
446  cnum = (lev < pNodes[i].getLevel())? 1 +( (((pNodes[i].getZ() >> pMaxDepthBit) & 1u) << 2u) | (((pNodes[i].getY() >> pMaxDepthBit) & 1u) << 1u) | ((pNodes[i].getX() >>pMaxDepthBit) & 1u)):0;
447  count[cnum+1]++;
448 
449  }
450  //std::cout<<"initial Count finished"<<std::endl;
451  DendroIntL loc[NUM_CHILDREN+1];
452  T unsorted[NUM_CHILDREN+1];
453  unsigned int live = 0;
454 
455 
456  for (unsigned int i=0; i<(NUM_CHILDREN+1); ++i) {
457  if(i==0)
458  {
459  loc[0]=count[0];
460  count[1]+=count[0];
461  unsorted[live] = pNodes[loc[0]];
462  if (loc[0] < count[1]) {live++; /*std::cout<<i<<" Live: "<<live<<std::endl;*/}
463  }else
464  {
465  cnum=(rotations[ROTATION_OFFSET * rot_id+ i-1] - '0');
466  (i>1) ? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id+i-2] - '0')+2): cnum_prev=1;
467  loc[cnum+1]=count[cnum_prev];
468  count[cnum+2] += count[cnum_prev];
469  //std::cout<<" loc[cnum+1]: "<<loc[cnum+1]<<std::endl;
470  (loc[cnum+1]==n) ? unsorted[live] = pNodes[loc[cnum+1]-1]: unsorted[live] = pNodes[loc[cnum+1]];
471  if (loc[cnum+1] < count[cnum+2]) {live++; /*std::cout<<i<<" Live after increment: "<<live<<std::endl;*/}
472  }
473 
474  }
475  //std::cout<<"second pass finished "<<std::endl;
476 
477  if(live>0)
478  {
479 
480  live--;
481 
482  for (DendroIntL i=0; i < n ; ++i) {
483 
484 
485  cnum = (lev < unsorted[live].getLevel()) ? ((((unsorted[live].getZ() >> pMaxDepthBit) & 1u) << 2u) | (((unsorted[live].getY() >> pMaxDepthBit) & 1u) << 1u) | ((unsorted[live].getX() >> pMaxDepthBit) & 1u))+ 1: 0 ;
486 
487  pNodes[loc[(cnum )]++] = unsorted[live];
488  (loc[cnum]==n) ? unsorted[live] = pNodes[loc[cnum]-1] : unsorted[live] = pNodes[loc[cnum]];
489  if ((loc[cnum] == count[cnum + 1])) {
490  /* if(i<(n-1)) assert(live>0);
491  if(live==0) assert(i==(n-1));*/
492  live--;
493 
494  }
495 
496 
497  }
498  }
499 
500  if((options==TS_SORT_ONLY) || (options==TS_REMOVE_DUPLICATES))
501  {
502  if (pMaxDepthBit > 0) {
503 
504  DendroIntL numElements=0;
505  for (unsigned int i=1; i<(NUM_CHILDREN+1); i++) {
506  cnum=(rotations[ROTATION_OFFSET*rot_id+i-1]-'0');
507  (i>1)? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id+i-2] - '0')+2) : cnum_prev=1;
508  numElements = count[cnum+2] - count[cnum_prev];
509  if (numElements > k) {
510  rotation=HILBERT_TABLE[NUM_CHILDREN * rot_id + cnum];
511  if(numElements)SFC_treeSort(pNodes+count[cnum_prev],numElements,pOutSorted,pOutConstruct,pOutBalanced,(pMaxDepthBit),pMaxDepth,temp,rotation,k,options);
512  }
513 
514  }
515  }
516 
517  }else
518  {
519  if (pMaxDepthBit > MAXDEAPTH_LEVEL_DIFF) {
520 
521  DendroIntL numElements=0;
522  for (unsigned int i=1; i<(NUM_CHILDREN+1); i++) {
523  cnum=(rotations[ROTATION_OFFSET*rot_id+i-1]-'0');
524  (i>1)? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id+i-2] - '0')+2) : cnum_prev=1;
525  numElements = count[cnum+2] - count[cnum_prev];
526  if((options & TS_CONSTRUCT_OCTREE) | (options & TS_BALANCE_OCTREE))
527  {
528  x=parent.getX() +(((int)((bool)(cnum & 1u)))<<(pMaxDepthBit));
529  y=parent.getY() +(((int)((bool)(cnum & 2u)))<<(pMaxDepthBit));
530  z=parent.getZ() +(((int)((bool)(cnum & 4u)))<<(pMaxDepthBit));
531  temp=T(x,y,z,(lev+1),parent.getDim(),pMaxDepth);
532 
533  }
534  if (numElements > k) {
535  rotation=HILBERT_TABLE[NUM_CHILDREN * rot_id + cnum];
536  SFC_treeSort(pNodes+count[cnum_prev],numElements,pOutSorted,pOutConstruct,pOutBalanced,(pMaxDepthBit),pMaxDepth,temp,rotation,k,options);
537  }else if((options & TS_CONSTRUCT_OCTREE) | (options & TS_BALANCE_OCTREE))
538  {
539  if(options & TS_CONSTRUCT_OCTREE) {
540  pOutConstruct.push_back(temp);
541  }
542 
543  if (options & TS_BALANCE_OCTREE) {
544  //generate all the neighbours and add them to unordered map. use pOutBalaced to push balanced octree.
545  pOutBalanced.push_back(temp);
546 
547 
548  }
549 
550  }
551 
552  }
553  }
554 
555  }
556 
557 
558 
559  if(lev==0)
560  {
561 
562  // !!!! Note: Please note that all the code here executed only once. In the final stage of the recursion.
563 
564  if((options & TS_REMOVE_DUPLICATES)) {
565 
566 #ifdef PROFILE_TREE_SORT
567  t1=std::chrono::high_resolution_clock::now();//MPI_Wtime();
568 #endif
569 
570  // Note: This is executed only once. In the final stage of the recursion.
571  // Do the remove duplicates here.
572  if (n >= 1) {
573  std::vector<T> tmp(n);
574  T *tmpPtr = (&(*(tmp.begin())));
575 
576  tmpPtr[0] = pNodes[0];
577 
578  unsigned int tmpSize = 1;
579  unsigned int vecTsz = static_cast<unsigned int>(n);
580 
581  for (unsigned int i = 1; i < n; i++) {
582  if ( /*(!tmpPtr[tmpSize-1].isAncestor(pNodes[i])) &*/ (tmpPtr[tmpSize - 1] != pNodes[i])) { // It is efficient to do this rather than marking all the elements in sorting. (Which will cause a performance degradation. )
583  tmpPtr[tmpSize] = pNodes[i];
584  tmpSize++;
585  }
586  }//end for
587 
588 
589  // Remove ancestor loop for removing local ancestors.
590  // Assumes that we have removed all the duplicates after the first iteration.
591 
592  tmp.resize(tmpSize);
593  std::vector<T> tmp_rmvAncestors(tmp.size());
594  tmpPtr = (&(*(tmp_rmvAncestors.begin())));
595  tmpPtr[0]=tmp[0];
596  tmpSize=0;
597 
598  for(unsigned int i=1;i<tmp.size();i++)
599  {
600  if(tmpPtr[tmpSize].isAncestor(tmp[i]))
601  tmpPtr[tmpSize]=tmp[i];
602  else {
603  tmpPtr[tmpSize+1]=tmp[i];
604  tmpSize++;
605  }
606 
607  }
608  tmp_rmvAncestors.resize(tmpSize+1);
609  std::swap(pOutSorted, tmp_rmvAncestors);
610 
611  tmp_rmvAncestors.clear();
612  tmp.clear();
613  }
614 
615 #ifdef PROFILE_TREE_SORT
616  remove_duplicates_seq=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t1).count();
617 
618 #endif
619 
620  }
621 
622  if(options & TS_BALANCE_OCTREE)
623  {
624  // Bottom up balancing octant creation.
625  //std::cout<<"balOCt: before Aux octants: "<<pOutBalanced.size()<<std::endl;
626 #ifdef PROFILE_TREE_SORT
627  t1=std::chrono::high_resolution_clock::now();//MPI_Wtime();
628 #endif
629  /*int rank;
630  MPI_Comm_rank(MPI_COMM_WORLD,&rank);*/
631 
632  SFC::seqSort::SFC_bottomUpBalOctantCreation(pOutBalanced);
633 
634 #ifdef PROFILE_TREE_SORT
635  auxBalOCt_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t1).count();
636 
637 #endif
638  std::vector<T> tmpSorted;
639  std::vector<T> tmpConstruct;
640  std::vector<T> tmpBalanced;
641  T root =T(0,0,0,0,m_uiDim,pMaxDepth);
642  //std::cout<<"bal with aux octants: "<<pOutBalanced.size()<<std::endl;
643  SFC::seqSort::SFC_treeSort(&(*(pOutBalanced.begin())),pOutBalanced.size(),tmpSorted,tmpConstruct,tmpBalanced,pMaxDepth,pMaxDepth,root,0,k,2);
644  std::swap(tmpConstruct,pOutBalanced);
645  tmpConstruct.clear();
646  }
647 
648 
649  }
650 
651  } // end of function SFC_treeSort
652 
653 
654  template<typename T>
655  inline void SFC_bucketing(T *pNodes, int lev, unsigned int maxDepth,unsigned char rot_id,DendroIntL &begin, DendroIntL &end, DendroIntL *splitters)
656  {
657 
658 
659  /* int rank,npes;
660  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
661  MPI_Comm_size(MPI_COMM_WORLD,&npes);
662 
663  MPI_Barrier(MPI_COMM_WORLD);*/
664 
665  //std::cout<<rank<<" Bucketing : begin: "<<begin<<" end: "<<end<<" lev: "<<lev<<" rotaion: "<<(int)rot_id<<std::endl;
666 
667 
668  // Special case that needs to be handled when performing the load balancing.
669  if ((lev >= maxDepth) || (begin == end)) {
670  // Special Case when the considering level exceeds the max depth.
671 
672  for (int ii = 0; ii < NUM_CHILDREN; ii++) {
673  int index = (rotations[2 * NUM_CHILDREN * rot_id + ii] - '0');
674  int nextIndex = 0;
675  if (ii == (NUM_CHILDREN-1))
676  nextIndex = ii + 1;
677  else
678  nextIndex = (rotations[2 * NUM_CHILDREN * rot_id + ii + 1] - '0');
679 
680  if (ii == 0) {
681  splitters[index] = begin;
682  splitters[nextIndex] = end;
683  continue;
684  }
685  splitters[nextIndex] = splitters[index];
686  }
687  //std::cout<<"End return "<<"maxDepth "<<maxDepth<<" Lev: "<<lev<< " Begin "<<begin <<" End "<<end<<std::endl;
688  return;
689 
690  }
691 
692  register unsigned int cnum;
693  register unsigned int cnum_prev=0;
694  DendroIntL num_elements=0;
695  unsigned int rotation=0;
696  DendroIntL count[(NUM_CHILDREN+2)]={};
697  //unsigned int pMaxDepth=(lev);
698  //pMaxDepth--;
699  unsigned int mid_bit = maxDepth - lev - 1;
700  count[0]=begin;
701  for (DendroIntL i=begin; i<end; ++i) {
702 
703  /*cnum = (lev < pNodes[i].getLevel())? 1 +(((((pNodes[i].getZ() & (1u << mid_bit)) >> mid_bit) << 2u) |
704  (((pNodes[i].getY() & (1u << mid_bit)) >> mid_bit) << 1u) |
705  ((pNodes[i].getX() & (1u << mid_bit)) >>
706  mid_bit))):0;*/
707 
708  cnum = (lev < pNodes[i].getLevel())? 1 +( (((pNodes[i].getZ() >> mid_bit) & 1u) << 2u) | (((pNodes[i].getY() >> mid_bit) & 1u) << 1u) | ((pNodes[i].getX() >>mid_bit) & 1u)):0;
709  count[cnum+1]++;
710 
711 
712  }
713 
714  DendroIntL loc[NUM_CHILDREN+1];
715  T unsorted[NUM_CHILDREN+1];
716  unsigned int live = 0;
717 
718  //if(count[1]>0) std::cout<<"For rank: "<<rank<<" count [1]: "<<count[1]<<std::endl;
719 
720  for (unsigned int ii = 0; ii < NUM_CHILDREN; ii++) {
721  int index = (rotations[2 * NUM_CHILDREN * rot_id + ii] - '0');
722  int nextIndex = 0;
723  if (ii == (NUM_CHILDREN-1))
724  nextIndex = ii + 1;
725  else
726  nextIndex = (rotations[2 * NUM_CHILDREN * rot_id + ii + 1] - '0');
727 
728  if (ii == 0) {
729  splitters[index] = begin;
730  splitters[nextIndex] = splitters[index]+count[1]+ count[(index+2)]; // number of elements which needs to come before the others due to level constraint.
731 
732  }else {
733  splitters[nextIndex] = splitters[index] + count[(index + 2)];
734  }
735  // if(count[1]>0 & !rank) std::cout<<" Spliter B:"<<index <<" "<<splitters[index]<<" Splitters E "<<nextIndex<<" "<<splitters[nextIndex]<<std::endl;
736 
737  }
738 
739 
740  for (unsigned int i=0; i<(NUM_CHILDREN+1); ++i) {
741  if(i==0)
742  {
743  loc[0]=count[0];
744  count[1]+=count[0];
745  unsorted[live] = pNodes[loc[0]];
746  if (loc[0] < count[1]) {live++; /*std::cout<<" level buck Live ++ : "<<live<<std::endl;*/}
747  }else {
748 
749  cnum = (rotations[ROTATION_OFFSET * rot_id + i-1] - '0');
750  (i > 1) ? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id + i - 2] - '0') + 2) : cnum_prev = 1;
751 
752  loc[cnum+1] = count[cnum_prev];
753  count[cnum + 2] += count[cnum_prev];
754  if (loc[cnum+1] < count[cnum + 2]) { unsorted[live] = pNodes[loc[(cnum+1)]]; live++; /*std::cout<<i<<" Live ++: "<<live<<std::endl;*/}
755  }
756  }
757 
758  live--;
759  //if(live <0) std::cout<<"live count overflow. " <<std::endl; // Note: This could be a potential bug.
760  for (DendroIntL i=begin; i<end; ++i) {
761 
762  //cnum = (lev < unsorted[live].getLevel()) ? (((((unsorted[live].getZ() & (1u << mid_bit)) >> mid_bit) << 2u) | (((unsorted[live].getY() & (1u << mid_bit)) >> mid_bit) << 1u) | ((unsorted[live].getX() & (1u << mid_bit)) >> mid_bit)))+ 1: 0 ;
763  cnum=(lev < unsorted[live].getLevel()) ? ((((unsorted[live].getZ() >> mid_bit) & 1u) << 2u) | (((unsorted[live].getY() >> mid_bit) & 1u) << 1u) | ((unsorted[live].getX() >> mid_bit) & 1u))+ 1: 0 ;
764  if(loc[cnum]<count[cnum+1]) {
765 
766  pNodes[loc[cnum]++] = unsorted[live];
767  if ((loc[cnum] == count[cnum + 1])) {
768  live--;
769  continue;
770  }
771  unsorted[live] = pNodes[loc[cnum]];
772 
773  }
774  /*if(live<0)
775  {
776  std::cout<<"begin: "<<begin<<" end: "<<end<<" i: "<<i<<" loc[0]: "<<loc[0]<<" live : "<<live<<std::endl;
777  }*/
778 
779  }
780 
781 
782 
783  }// end of function SFC_bucketing.
784 
785 
786  template<typename T>
787  void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n , unsigned int pMaxDepthBit,unsigned int pMaxDepth, T& parent, unsigned int rot_id,bool minimum,T& optimal)
788  {
789 
790  if(n==0 && minimum)
791  {
792  optimal=T(0,0,0,0,m_uiDim,m_uiMaxDepth);
793  return ;
794  }else if(n==0 && (!minimum))
795  {
796  optimal=T(((1u<<m_uiMaxDepth)-1),((1u<<m_uiMaxDepth)-1),((1u<<m_uiMaxDepth)-1),m_uiMaxDepth,m_uiDim,m_uiMaxDepth);
797  return ;
798  }
799 
800  unsigned int lev=pMaxDepth-pMaxDepthBit;
801  DendroIntL nNodeBegin=0;
802  DendroIntL nNodeEnd=n;
803 
804  unsigned int hindex = 0;
805  unsigned int hindexN = 0;
806  unsigned int index = 0;
807  DendroIntL splitterNodes[(NUM_CHILDREN + 1)];
808  unsigned int bucketIndex=0;
809  if(!minimum) bucketIndex=(NUM_CHILDREN-1);
810 
811  if(((nNodeEnd-nNodeBegin)==1))
812  {
813  optimal=pNodes[nNodeBegin];
814  return ;
815  }
816 
817  if( (pMaxDepthBit) && ((nNodeEnd-nNodeBegin)>1)) {
818 
819  SFC::seqSort::SFC_bucketing(pNodes,lev,pMaxDepth,rot_id,nNodeBegin,nNodeEnd,splitterNodes);
820  hindex = (rotations[2 * NUM_CHILDREN * rot_id + bucketIndex] - '0');
821  index=HILBERT_TABLE[NUM_CHILDREN * rot_id + bucketIndex];
822  (bucketIndex == (NUM_CHILDREN - 1)) ? hindexN = bucketIndex + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + bucketIndex + 1] - '0');
823 
824  assert(splitterNodes[hindex] <= splitterNodes[hindexN]);
825  if(splitterNodes[hindex]!=splitterNodes[hindexN]) SFC_treeSortLocalOptimal(pNodes+splitterNodes[hindex],(splitterNodes[hindexN]-splitterNodes[hindex]),(pMaxDepthBit-1),pMaxDepth,parent,index,minimum,optimal);
826  else
827  {
828  //std::cout<<"Calling Seq: sort at Lev: "<<lev<<"size : "<<n<<std::endl;
829  std::vector<T>tmpVec;
830  SFC::seqSort::SFC_treeSort(pNodes,n,tmpVec,tmpVec,tmpVec,pMaxDepth,pMaxDepth,parent,ROOT_ROTATION,1,TS_SORT_ONLY);
831  (minimum) ? optimal=pNodes[0]: optimal=pNodes[n-1];
832  return ;
833  }
834 
835 
836 
837  }
838 
839 
840 
841  }
842 
843  template<typename T>
844  void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n , unsigned int pMaxDepthBit,unsigned int pMaxDepth, T& parent, unsigned int rot_id,T&min,T&max)
845  {
846  unsigned int lev=pMaxDepth-pMaxDepthBit;
847  DendroIntL splitterNodes[(NUM_CHILDREN + 1)];
848  unsigned int hindex = 0;
849  unsigned int hindexN = 0;
850  unsigned int index = 0;
851  DendroIntL nNodeBegin=0;
852  DendroIntL nNodeEnd=n;
853 
854  SFC::seqSort::SFC_bucketing(pNodes,lev,pMaxDepth,rot_id,nNodeBegin,nNodeEnd,splitterNodes);
855 
856  unsigned int min_b=0,max_b=NUM_CHILDREN-1;
857  for(unsigned int i=0;i<NUM_CHILDREN;i++)
858  {
859  hindex = (rotations[2 * NUM_CHILDREN * rot_id + min_b] - '0');
860  (min_b == (NUM_CHILDREN - 1)) ? hindexN = min_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + min_b + 1] - '0');
861  if(splitterNodes[hindex]!=splitterNodes[hindexN])break;
862  min_b++;
863 
864 
865  }
866 
867  for(unsigned int i=0;i<(NUM_CHILDREN);i++)
868  {
869  hindex = (rotations[2 * NUM_CHILDREN * rot_id + max_b] - '0');
870  (max_b == (NUM_CHILDREN - 1)) ? hindexN = max_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + max_b + 1] - '0');
871  if(splitterNodes[hindex]!=splitterNodes[hindexN])break;
872  max_b--;
873  }
874 
875 
876  hindex = (rotations[2 * NUM_CHILDREN * rot_id + min_b] - '0');
877  (min_b == (NUM_CHILDREN - 1)) ? hindexN = min_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + min_b + 1] - '0');
878  index=HILBERT_TABLE[NUM_CHILDREN * rot_id + min_b];
879  DendroIntL numElem=splitterNodes[hindexN]-splitterNodes[hindex];
880  SFC::seqSort::SFC_treeSortLocalOptimal(&pNodes[splitterNodes[hindex]],numElem,pMaxDepthBit-1,pMaxDepth,parent,index,true,min);
881  //SFC::seqSort::SFC_treeSortLocalOptimal(pNodes,n,pMaxDepthBit,pMaxDepth,parent,rot_id,true,min);
882 
883  hindex = (rotations[2 * NUM_CHILDREN * rot_id + max_b] - '0');
884  (max_b == (NUM_CHILDREN - 1)) ? hindexN = max_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + max_b + 1] - '0');
885  index=HILBERT_TABLE[NUM_CHILDREN * rot_id + max_b];
886  numElem=splitterNodes[hindexN]-splitterNodes[hindex];
887  SFC::seqSort::SFC_treeSortLocalOptimal(&pNodes[splitterNodes[hindex]],numElem,pMaxDepthBit-1,pMaxDepth,parent,index,false,max);
888  //SFC::seqSort::SFC_treeSortLocalOptimal(pNodes,n,pMaxDepthBit,pMaxDepth,parent,rot_id,false,max);
889 
890 
891 
892 
893 
894 
895 
896  }
897 
898 
899  //========================================================= Function definition end =========================================================================================
900 
901 
902  }
903 }
904 
905 namespace SFC
906 {
907 
908  namespace parSort
909  {
910 
911  //========================================================= Function declaration begin.=========================================================================================
916  template<typename T>
917  inline void SFC_SplitterFix(std::vector<T>& pNodes,unsigned int pMaxDepth,double loadFlexibility,unsigned int sf_k,MPI_Comm comm,MPI_Comm * newComm);
918 
919 
932  template <typename T>
933  void SFC_treeSort(std::vector<T> &pNodes, std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced , double loadFlexibility,unsigned int pMaxDepth, T& parent, unsigned int rot_id,unsigned int k, unsigned int options, unsigned int sf_k,MPI_Comm pcomm);
934 
935 
936  /* template <typename T>
937  void SFC_PartitionW(std::vector<T>&pNodes,double loadFlexibility, unsigned int maxDepth,MPI_Comm comm);*/
938 
939 
940  //========================================================= Function declaration end.=========================================================================================
941 
942 
943 
944  //========================================================= Function definition begin.=========================================================================================
945 
946  template<typename T>
947  inline void SFC_SplitterFix(std::vector<T>& pNodes,unsigned int pMaxDepth,double loadFlexibility,unsigned int sf_k,MPI_Comm comm,MPI_Comm * newComm) {
948 
949 #ifdef SPLITTER_SELECTION_FIX
950 
951  int rank, npes;
952  MPI_Comm_rank(comm, &rank);
953  MPI_Comm_size(comm, &npes);
954 
955 
956 
957 //#pragma message ("Splitter selection FIX ON")
958 
959  if (npes > NUM_NPES_THRESHOLD) {
960 
961  unsigned int npes_sqrt = 1u << (binOp::fastLog2(npes) / 2);
962 
963  const unsigned int a=sf_k;
964  const unsigned int b=npes/a;
965  const unsigned int p_mod_a=npes%a;
966 
967 
968  unsigned int dim = 3;
969 #ifdef DIM_2
970  dim=2;
971 #else
972  dim = 3;
973 #endif
974 
975  unsigned int firstSplitLevel = std::ceil(binOp::fastLog2(a) / (double) dim);
976  unsigned int totalNumBuckets = 1u << (dim * firstSplitLevel);
977 
978 
979  DendroIntL localSz = pNodes.size();
980  DendroIntL globalSz = 0;
981  MPI_Allreduce(&localSz, &globalSz, 1, MPI_LONG_LONG, MPI_SUM, comm);
982 
983  // Number of initial buckets. This should be larger than npes.
984 
985  // maintain the splitters and buckets for splitting for further splitting.
986  std::vector<DendroIntL> bucketCounts;
987  std::vector<BucketInfo<T>> bucketInfo; // Stores the buckets info of the buckets where the initial buckets was splitted.
988  std::vector<DendroIntL> bucketSplitter;
989 
990 
991  std::vector<BucketInfo<T>> nodeStack; // rotation id stack
992  BucketInfo<T> root(0, 0, 0, pNodes.size());
993  nodeStack.push_back(root);
994  BucketInfo<T> tmp = root;
995  unsigned int levSplitCount = 0;
996 
997  // Used repetitively in rotation computations.
998  unsigned int hindex = 0;
999  unsigned int hindexN = 0;
1000 
1001  unsigned int index = 0;
1002  //bool *updateState = new bool[pNodes.size()];
1003  unsigned int numLeafBuckets = 0;
1004 
1005  unsigned int begin_loc = 0;
1006 
1007 
1008  DendroIntL spliterstemp[(NUM_CHILDREN + 1)];
1009  while (numLeafBuckets < totalNumBuckets) {
1010 
1011  tmp = nodeStack[0];
1012  nodeStack.erase(nodeStack.begin());
1013 
1014  SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())), tmp.lev, pMaxDepth, tmp.rot_id, tmp.begin,
1015  tmp.end, spliterstemp);
1016 
1017  for (int i = 0; i < NUM_CHILDREN; i++) {
1018  hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] - '0');
1019  if (i == (NUM_CHILDREN - 1))
1020  hindexN = i + 1;
1021  else
1022  hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] - '0');
1023  assert(spliterstemp[hindex] <= spliterstemp[hindexN]);
1024  index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1025 
1026  BucketInfo<T> child(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1027  nodeStack.push_back(child);
1028 
1029  if (tmp.lev == (firstSplitLevel - 1)) {
1030  BucketInfo<T> bucket(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1031  bucketCounts.push_back((spliterstemp[hindexN] - spliterstemp[hindex]));
1032  bucketSplitter.push_back(spliterstemp[hindex]);
1033  bucketInfo.push_back(bucket);
1034  numLeafBuckets++;
1035 
1036  }
1037 
1038  }
1039 
1040 
1041  }
1042 
1043  /*MPI_Barrier(MPI_COMM_WORLD);
1044  if(!rank)std::cout<<" intial buckets created: "<<std::endl;*/
1045 
1046  std::vector<DendroIntL> bucketCounts_g(bucketCounts.size());
1047  std::vector<DendroIntL> bucketCounts_gScan(bucketCounts.size());
1048 
1049  par::Mpi_Allreduce<DendroIntL>(&(*(bucketCounts.begin())), &(*(bucketCounts_g.begin())),
1050  bucketCounts.size(), MPI_SUM, comm);
1051 
1052 #ifdef DEBUG_TREE_SORT
1053  assert(totalNumBuckets);
1054 #endif
1055  bucketCounts_gScan[0] = bucketCounts_g[0];
1056  for (int k = 1; k < bucketCounts_g.size(); k++) {
1057  bucketCounts_gScan[k] = bucketCounts_gScan[k - 1] + bucketCounts_g[k];
1058  }
1059 
1060 
1061 #ifdef DEBUG_TREE_SORT
1062  if(!rank) {
1063  for (int i = 0; i < totalNumBuckets; i++) {
1064  //std::cout << "Bucket count G : " << i << " : " << bucketCounts_g[i] << std::endl;
1065  std::cout << "Bucket initial count scan G : " << i << " : " << bucketCounts_gScan[i] << std::endl;
1066  }
1067  }
1068 #endif
1069  DendroIntL *localSplitterTmp = new DendroIntL[a];
1070  DendroIntL idealLoadBalance = 0;
1071  begin_loc = 0;
1072 
1073  std::vector<unsigned int> splitBucketIndex;
1074  begin_loc = 0;
1075  for (int i = 0; i < a - 1; i++) {
1076  idealLoadBalance += ((i + 1) * globalSz / a - i * globalSz / a);
1077  DendroIntL toleranceLoadBalance = ((i + 1) * globalSz / a - i * globalSz / a) * loadFlexibility;
1078 
1079  unsigned int loc = (
1080  std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) -
1081  bucketCounts_gScan.begin());
1082  //std::cout<<rank<<" Searching: "<<idealLoadBalance<<"found: "<<loc<<std::endl;
1083 
1084  if (abs(bucketCounts_gScan[loc] - idealLoadBalance) > toleranceLoadBalance) {
1085 
1086  if (splitBucketIndex.empty() || splitBucketIndex.back() != loc)
1087  splitBucketIndex.push_back(loc);
1088  /*if(!rank)
1089  std::cout<<"Bucket index : "<<loc << " Needs a split "<<std::endl;*/
1090  } else {
1091  if ((loc + 1) < bucketSplitter.size())
1092  localSplitterTmp[i] = bucketSplitter[loc + 1];
1093  else
1094  localSplitterTmp[i] = bucketSplitter[loc];
1095  }
1096 
1097  /* if(loc+1<bucketCounts_gScan.size())
1098  begin_loc=loc+1;
1099  else
1100  begin_loc=loc;*/
1101 
1102  }
1103  localSplitterTmp[a - 1] = pNodes.size();
1104 
1105 
1106 #ifdef DEBUG_TREE_SORT
1107  for(int i=0;i<splitBucketIndex.size()-1;i++)
1108  {
1109  assert(pNodes[bucketSplitter[splitBucketIndex[i]]]<pNodes[bucketSplitter[splitBucketIndex[i+1]]]);
1110  }
1111 #endif
1112 
1113 
1114  std::vector<DendroIntL> newBucketCounts;
1115  std::vector<DendroIntL> newBucketCounts_g;
1116  std::vector<BucketInfo<T>> newBucketInfo;
1117  std::vector<DendroIntL> newBucketSplitters;
1118 
1119 
1120  std::vector<DendroIntL> bucketCount_gMerge;
1121  std::vector<DendroIntL> bucketSplitterMerge;
1122  std::vector<BucketInfo<T>> bucketInfoMerge;
1123 
1124  DendroIntL splitterTemp[(NUM_CHILDREN + 1)];
1125  while (!splitBucketIndex.empty()) {
1126 
1127 
1128  newBucketCounts.clear();
1129  newBucketCounts_g.clear();
1130  newBucketInfo.clear();
1131  newBucketSplitters.clear();
1132 
1133  bucketCount_gMerge.clear();
1134  bucketSplitterMerge.clear();
1135  bucketInfoMerge.clear();
1136 
1137  if (bucketInfo[splitBucketIndex[0]].lev < pMaxDepth) {
1138 
1139  BucketInfo<T> tmp;
1140  // unsigned int numSplitBuckets = NUM_CHILDREN * splitBucketIndex.size();
1141 
1142 #ifdef DEBUG_TREE_SORT
1143  if (!rank)
1144  for (int i = 0; i < splitBucketIndex.size(); i++)
1145  std::cout << "Splitter Bucket Index: " << i << " : " << splitBucketIndex[i] << std::endl;
1146 #endif
1147 
1148 
1149  //unsigned int maxDepthBuckets = 0;
1150 
1151 #ifdef DEBUG_TREE_SORT
1152  if(!rank) {
1153  for (int i = 0; i <bucketSplitter.size(); i++) {
1154  std::cout<<" Bucket Splitter : "<<i<<" : "<<bucketSplitter[i]<<std::endl;
1155  }
1156  }
1157 #endif
1158  for (int k = 0; k < splitBucketIndex.size(); k++) {
1159 #ifdef DEBUG_TREE_SORT
1160  if(!rank)
1161  std::cout<<"Splitting Bucket index "<<splitBucketIndex[k]<<std::endl;
1162 #endif
1163 
1164  tmp = bucketInfo[splitBucketIndex[k]];
1165  SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())), tmp.lev, pMaxDepth, tmp.rot_id, tmp.begin,
1166  tmp.end, splitterTemp);
1167 
1168 
1169  for (int i = 0; i < NUM_CHILDREN; i++) {
1170  hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] - '0');
1171  if (i == (NUM_CHILDREN - 1))
1172  hindexN = i + 1;
1173  else
1174  hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] - '0');
1175 
1176  //newBucketCounts[NUM_CHILDREN * k + i] = (splitterTemp[hindexN] - splitterTemp[hindex]);
1177  newBucketCounts.push_back((splitterTemp[hindexN] - splitterTemp[hindex]));
1178 
1179  index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1180  BucketInfo<T> bucket(index, (tmp.lev + 1), splitterTemp[hindex], splitterTemp[hindexN]);
1181  // newBucketInfo[NUM_CHILDREN * k + i] = bucket;
1182  // newBucketSplitters[NUM_CHILDREN * k + i] = splitterTemp[hindex];
1183  newBucketInfo.push_back(bucket);
1184  newBucketSplitters.push_back(splitterTemp[hindex]);
1185 #ifdef DEBUG_TREE_SORT
1186  assert(pNodes[splitterTemp[hindex]]<=pNodes[splitterTemp[hindexN]]);
1187 #endif
1188  }
1189 
1190  }
1191 
1192  newBucketCounts_g.resize(newBucketCounts.size());
1193  par::Mpi_Allreduce(&(*(newBucketCounts.begin())), &(*(newBucketCounts_g.begin())),
1194  newBucketCounts.size(), MPI_SUM, comm);
1195  //MPI_Allreduce(&newBucketCounts[0], &newBucketCounts_g[0], newBucketCounts.size(), MPI_LONG_LONG, MPI_SUM, comm);
1196 
1197 
1198 #ifdef DEBUG_TREE_SORT
1199  for(int i=0;i<newBucketSplitters.size()-1;i++)
1200  {
1201  assert(pNodes[newBucketSplitters[i]]<=pNodes[newBucketSplitters[i+1]]);
1202  }
1203 #endif
1204 
1205 
1206 #ifdef DEBUG_TREE_SORT
1207  for (int k = 0; k < splitBucketIndex.size(); k++) {
1208  unsigned int sum = 0;
1209  for (int i = 0; i < NUM_CHILDREN; i++)
1210  sum += newBucketCounts_g[NUM_CHILDREN * k + i];
1211  if (!rank) if (bucketCounts_g[splitBucketIndex[k]] != sum) {
1212  assert(false);
1213  }
1214 
1215  }
1216 #endif
1217 
1218  //int count = 0;
1219  for (int k = 0; k < splitBucketIndex.size(); k++) {
1220 
1221  unsigned int bucketBegin = 0;
1222 
1223  (k == 0) ? bucketBegin = 0 : bucketBegin = splitBucketIndex[k - 1] + 1;
1224 
1225 
1226  for (int i = bucketBegin; i < splitBucketIndex[k]; i++) {
1227 
1228  bucketCount_gMerge.push_back(bucketCounts_g[i]);
1229  bucketInfoMerge.push_back(bucketInfo[i]);
1230  bucketSplitterMerge.push_back(bucketSplitter[i]);
1231 
1232  }
1233 
1234  tmp = bucketInfo[splitBucketIndex[k]];
1235  for (int i = 0; i < NUM_CHILDREN; i++) {
1236  bucketCount_gMerge.push_back(newBucketCounts_g[NUM_CHILDREN * k + i]);
1237  bucketInfoMerge.push_back(newBucketInfo[NUM_CHILDREN * k + i]);
1238  bucketSplitterMerge.push_back(newBucketSplitters[NUM_CHILDREN * k + i]);
1239 
1240  }
1241 
1242 
1243  if (k == (splitBucketIndex.size() - 1)) {
1244  for (int i = splitBucketIndex[k] + 1; i < bucketCounts_g.size(); i++) {
1245  bucketCount_gMerge.push_back(bucketCounts_g[i]);
1246  bucketInfoMerge.push_back(bucketInfo[i]);
1247  bucketSplitterMerge.push_back(bucketSplitter[i]);
1248  }
1249 
1250 
1251  }
1252 
1253 
1254  }
1255 
1256  std::swap(bucketCounts_g, bucketCount_gMerge);
1257  std::swap(bucketInfo, bucketInfoMerge);
1258  std::swap(bucketSplitter, bucketSplitterMerge);
1259 
1260 
1261  bucketCounts_gScan.resize(bucketCounts_g.size());
1262 
1263  bucketCounts_gScan[0] = bucketCounts_g[0];
1264  for (int k = 1; k < bucketCounts_g.size(); k++) {
1265  bucketCounts_gScan[k] = bucketCounts_gScan[k - 1] + bucketCounts_g[k];
1266  }
1267 #ifdef DEBUG_TREE_SORT
1268  assert(bucketCounts_gScan.back()==globalSz);
1269 #endif
1270  splitBucketIndex.clear();
1271 #ifdef DEBUG_TREE_SORT
1272  for(int i=0;i<bucketSplitter.size()-2;i++)
1273  {
1274  std::cout<<"Bucket Splitter : "<<bucketSplitter[i]<<std::endl;
1275  assert( bucketSplitter[i+1]!=(pNodes.size()) && pNodes[bucketSplitter[i]]<=pNodes[bucketSplitter[i+1]]);
1276  }
1277 #endif
1278 
1279  idealLoadBalance = 0;
1280  //begin_loc=0;
1281  for (unsigned int i = 0; i < a - 1; i++) {
1282  idealLoadBalance += ((i + 1) * globalSz / a - i * globalSz / a);
1283  DendroIntL toleranceLoadBalance = (((i + 1) * globalSz / a - i * globalSz / a) *
1284  loadFlexibility);
1285  unsigned int loc = (std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(),
1286  idealLoadBalance) -
1287  bucketCounts_gScan.begin());
1288 
1289  if (abs(bucketCounts_gScan[loc] - idealLoadBalance) > toleranceLoadBalance) {
1290  if (splitBucketIndex.empty() || splitBucketIndex.back() != loc)
1291  splitBucketIndex.push_back(loc);
1292 
1293 
1294  } else {
1295  if ((loc + 1) < bucketSplitter.size())
1296  localSplitterTmp[i] = bucketSplitter[loc + 1];
1297  else
1298  localSplitterTmp[i] = bucketSplitter[loc];
1299 
1300  }
1301 
1302  /* if(loc+1<bucketCounts_gScan.size())
1303  begin_loc=loc+1;
1304  else
1305  begin_loc=loc;*/
1306 
1307  }
1308  localSplitterTmp[a - 1] = pNodes.size();
1309 
1310  } else {
1311  //begin_loc=0;
1312  idealLoadBalance = 0;
1313  for (unsigned int i = 0; i < a - 1; i++) {
1314 
1315  idealLoadBalance += ((i + 1) * globalSz / a - i * globalSz / a);
1316  //DendroIntL toleranceLoadBalance = ((i + 1) * globalSz / npes - i * globalSz / npes) * loadFlexibility;
1317  unsigned int loc = (
1318  std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(),
1319  idealLoadBalance) -
1320  bucketCounts_gScan.begin());
1321 
1322 
1323  if ((loc + 1) < bucketSplitter.size())
1324  localSplitterTmp[i] = bucketSplitter[loc + 1];
1325  else
1326  localSplitterTmp[i] = bucketSplitter[loc];
1327 
1328  /* if(loc+1<bucketCounts_gScan.size())
1329  begin_loc=loc+1;
1330  else
1331  begin_loc=loc;*/
1332 
1333  }
1334  localSplitterTmp[a - 1] = pNodes.size();
1335 
1336 
1337  break;
1338  }
1339 
1340  }
1341 
1342  bucketCount_gMerge.clear();
1343  bucketCounts.clear();
1344  bucketCounts_gScan.clear();
1345  bucketInfoMerge.clear();
1346  bucketInfo.clear();
1347  bucketCounts_g.clear();
1348  bucketSplitter.clear();
1349  bucketSplitterMerge.clear();
1350  newBucketCounts.clear();
1351  newBucketCounts_g.clear();
1352  newBucketInfo.clear();
1353  newBucketSplitters.clear();
1354 
1355 
1356 #ifdef DEBUG_TREE_SORT
1357  if(!rank) {
1358  for (int i = 0; i <a; i++) {
1359  //std::cout << "Bucket count G : " << i << " : " << bucketCounts_g[i] << std::endl;
1360  std::cout << "Local Splitter Tmp : " << i << " : " << localSplitterTmp[i] << std::endl;
1361  }
1362  }
1363 #endif
1364 
1365 
1366  std::vector<unsigned int> blockCounts;
1367  blockCounts.resize(a,b);
1368 
1369  for(unsigned int k=0;k<p_mod_a;k++)
1370  blockCounts[k]=(b+1);
1371 
1372 
1373 
1374 
1375  std::vector<unsigned int > blockOffset;
1376  blockOffset.resize(a,0);
1377 
1378  blockOffset[0]=0;
1379  omp_par::scan(&(*(blockCounts.begin())),&(*(blockOffset.begin())),a);
1380 
1381 
1382 
1383  // compute the block ids.
1384  unsigned int blk_id;
1385  for(unsigned int k=0;k<a;k++)
1386  {
1387  if( (rank>=blockOffset[k]) && rank<(blockOffset[k]+blockCounts[k]))
1388  {
1389  (blockOffset[k]!=0) ? blk_id=rank%blockOffset[k] : blk_id=rank;
1390  }
1391  }
1392 
1393  /* if(!rank)
1394  for(unsigned int k=0;k<a;k++)
1395  std::cout<<" bck count: "<<k<<" val: "<<blockCounts[k]<<" offsets: "<<blockOffset[k]<<std::endl;*/
1396 
1397 
1398  //std::cout<<"rank: "<<rank<<" block_id: "<<blk_id<<std::endl;
1399 
1400 
1401  unsigned int *sendCnt = new unsigned int[a];
1402  unsigned int owner_blk=0;
1403  (rank<(b+1)*p_mod_a) ? owner_blk=rank/(b+1) : owner_blk =(((rank-(b+1)*p_mod_a)/b) + p_mod_a);
1404 
1405  //std::cout<<" rank: "<<rank<<" owner_blk: "<<owner_blk<<std::endl;
1406 
1407  sendCnt[0] = localSplitterTmp[0];
1408  for (unsigned int i = 1; i < a; i++)
1409  {
1410  assert(localSplitterTmp[i]>=localSplitterTmp[i-1]);
1411  sendCnt[i]= (localSplitterTmp[i] - localSplitterTmp[i - 1]);
1412  }
1413 
1414 
1415  // computed a-splitters.
1416 
1417  std::vector<unsigned int> send_count;
1418  std::vector<unsigned int> send_offset;
1419  std::vector<unsigned int> recv_count;
1420  std::vector<unsigned int> recv_offset;
1421 
1422  send_count.resize(npes,0);
1423  send_offset.resize(npes,0);
1424  recv_count.resize(npes,0);
1425  recv_offset.resize(npes,0);
1426 
1427  /*if(rank==4)
1428  for(unsigned int p=0;p<a;p++)
1429  std::cout<<" sendCnt["<<p<<" ]: "<<sendCnt[p]<<" spliter: ["<<p<<" ] : value: "<<localSplitterTmp[p]<<std::endl;*/
1430 
1431  for(unsigned int k=0;k<a;k++)
1432  {
1433  if(owner_blk==k) continue;
1434  if( (rank<(b+1)*p_mod_a) && blockCounts[k]==(b+1))
1435  {
1436  assert((blockOffset[k]+blk_id)<npes);
1437  if(k>0)send_offset[(blockOffset[k]+blk_id)]=localSplitterTmp[k-1];
1438  send_count[(blockOffset[k]+blk_id)]=sendCnt[k];
1439  }else
1440  {
1441  assert((blockOffset[k]+(blk_id%b))<npes);
1442  if(k>0)send_offset[(blockOffset[k]+(blk_id%b))]=localSplitterTmp[k-1];
1443  send_count[(blockOffset[k]+(blk_id%b))]=sendCnt[k];
1444  }
1445 
1446  }
1447 
1448 
1449  par::Mpi_Alltoall(&(*(send_count.begin())),&(*(recv_count.begin())),1,comm);
1450 
1451  recv_offset[0]=0;
1452  omp_par::scan(&(*(recv_count.begin())),&(*(recv_offset.begin())),npes);
1453 
1454  std::vector<T> recvBuf;
1455  recvBuf.resize(recv_offset[npes-1]+recv_count[npes-1]);
1456 
1457 
1458  par::Mpi_Alltoallv_sparse(&(*(pNodes.begin())),(int *) &(*(send_count.begin())),(int *) &(*(send_offset.begin())),&(*(recvBuf.begin())),(int *) &(*(recv_count.begin())),(int *) &(*(recv_offset.begin())),comm);
1459  (owner_blk>0) ? recvBuf.insert(recvBuf.end(),pNodes.begin()+localSplitterTmp[owner_blk-1],pNodes.begin()+localSplitterTmp[owner_blk-1]+sendCnt[owner_blk]) : recvBuf.insert(recvBuf.end(),pNodes.begin(),pNodes.begin()+sendCnt[owner_blk]);
1460 
1461 
1462  delete [] sendCnt;
1463 
1464  std::swap(pNodes,recvBuf);
1465  recvBuf.clear();
1466 
1467 
1468 
1469 #ifdef DEBUG_TREE_SORT
1470  if(!rank) {
1471  for (int i = 0; i <a; i++) {
1472  //std::cout << "Bucket count G : " << i << " : " << bucketCounts_g[i] << std::endl;
1473  std::cout << "Send Cnt : " << i << " : " << sendCnt[i] << std::endl;
1474  }
1475  }
1476 #endif
1477 
1478 
1479 
1480 #ifdef PROFILE_TREE_SORT
1481  t1=std::chrono::high_resolution_clock::now();//MPI_Wtime();
1482 #endif
1483 
1484 
1485 
1486  unsigned int col=owner_blk;
1487  MPI_Comm_split(comm,col,rank,newComm);
1488 
1489 
1490 
1491 
1492  }
1493 
1494 #endif
1495 
1496 
1497 
1498  return;
1499 
1500  }// end of function.
1501 
1502 
1503 
1504  template <typename T>
1505  void SFC_treeSort(std::vector<T> &pNodes, std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced , double loadFlexibility,unsigned int pMaxDepth, T& parent, unsigned int rot_id,unsigned int k, unsigned int options, unsigned int sf_k,MPI_Comm pcomm)
1506  {
1507 
1508  int rank, npes;
1509  MPI_Comm_rank(pcomm, &rank);
1510  MPI_Comm_size(pcomm, &npes);
1511 
1512  MPI_Comm comm;
1513  MPI_Comm_dup(pcomm,&comm);
1514 
1515 #ifdef PROFILE_TREE_SORT
1516  stats.clear();
1517 
1518  splitter_fix_all=0;
1519  splitter_time=0;
1520  all2all1_time=0;
1521  all2all2_time=0;
1522  localSort_time=0;
1523  remove_duplicates_seq=0;
1524  remove_duplicates_par=0;
1525  auxBalOCt_time=0;
1526  construction_time=0;
1527  balancing_time=0;
1528  total_rd=0;
1529 
1530  //MPI_Barrier(pcomm);
1531 
1532  t4=std::chrono::high_resolution_clock::now();//MPI_Wtime();
1533  t2=std::chrono::high_resolution_clock::now();//MPI_Wtime();
1534 
1535 #endif
1536 
1537 
1538  //SFC_SplitterFix(pNodes,pMaxDepth,loadFlexibility,pcomm,&comm);
1539 
1540 
1541  MPI_Comm SF_comm=pcomm;
1542  unsigned int SF_Stages=0;
1543 
1544 #ifdef PROFILE_TREE_SORT
1545  double * sf_full;
1546  double * sf_all2all;
1547  double * sf_splitters;
1548 #endif
1549 
1550 
1551  if(npes==1)
1552  {
1553  MPI_Comm_free(&comm);
1554  //call the sequential case
1555  SFC::seqSort::SFC_treeSort(&(*(pNodes.begin())),pNodes.size(),pOutSorted,pOutConstruct,pOutBalanced,pMaxDepth,pMaxDepth,parent,rot_id,k,options);
1556  return ;
1557 
1558  }
1559 
1560 
1561 
1562 #ifdef SPLITTER_SELECTION_FIX
1563  if(npes > sf_k)
1564  {
1565 
1566  SF_Stages= std::ceil((binOp::fastLog2(npes)/(double)binOp::fastLog2(sf_k))) - 1;
1567  //if(!rank) std::cout<<" sf_stages: "<<SF_Stages<<std::endl;
1568 
1569 #ifdef PROFILE_TREE_SORT
1570  sf_full=new double[SF_Stages];
1571  sf_all2all=new double[SF_Stages];
1572  sf_splitters=new double[SF_Stages];
1573 #endif
1574 
1575  MPI_Comm * sf_comms=new MPI_Comm[SF_Stages];
1576 
1577  for(int i=0;i<SF_Stages;i++)
1578  {
1579 #ifdef PROFILE_TREE_SORT
1580  t5_sf_staged=std::chrono::high_resolution_clock::now();
1581 #endif
1582  //if(!rank) std::cout<<"sf_stage: "<<i<<"of "<<SF_Stages<<std::endl;
1583  SFC_SplitterFix(pNodes,pMaxDepth,loadFlexibility,sf_k,SF_comm,&sf_comms[i]);
1584 #ifdef PROFILE_TREE_SORT
1585  sf_full[i]=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t5_sf_staged).count();
1586  sf_all2all[i]=all2all1_time;
1587  sf_splitters[i]=sf_full[i]-sf_all2all[i];
1588 #endif
1589 
1590  SF_comm=sf_comms[i];
1591  }
1592 
1593  MPI_Comm_free(&comm);
1594  MPI_Comm_dup(sf_comms[SF_Stages-1],&comm);
1595  for(int i=0;i<SF_Stages;i++)
1596  {
1597  MPI_Comm_free(&sf_comms[i]);
1598  }
1599 
1600  delete [] sf_comms;
1601 
1602  }
1603 
1604 #endif
1605 
1606 #ifdef PROFILE_TREE_SORT
1607  splitter_fix_all=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
1608 
1609 #endif
1610 
1611  MPI_Comm_rank(comm, &rank);
1612  MPI_Comm_size(comm, &npes);
1613 
1614  unsigned int dim=3;
1615 #ifdef DIM_2
1616  dim=2;
1617 #else
1618  dim=3;
1619 #endif
1620 
1621 #ifdef PROFILE_TREE_SORT
1622  //MPI_Barrier(pcomm);
1623  t2=std::chrono::high_resolution_clock::now();//MPI_Wtime();
1624 #endif
1625  unsigned int firstSplitLevel = std::ceil(binOp::fastLog2(npes)/(double)(dim));
1626  unsigned int totalNumBuckets =1u << (dim * firstSplitLevel);
1627  DendroIntL localSz=pNodes.size();
1628  DendroIntL globalSz=0;
1629  MPI_Allreduce(&localSz,&globalSz,1,MPI_LONG_LONG,MPI_SUM,comm);
1630  //if(!rank) std::cout<<"First Split Level : "<<firstSplitLevel<<" Total number of buckets: "<<totalNumBuckets <<std::endl;
1631  //if(!rank) std::cout<<"NUM_CHILDREN: "<<NUM_CHILDREN<<std::endl;
1632  // Number of initial buckets. This should be larger than npes.
1633 
1634  // maintain the splitters and buckets for splitting for further splitting.
1635  std::vector<DendroIntL> bucketCounts;
1636  std::vector<BucketInfo<T>> bucketInfo; // Stores the buckets info of the buckets where the initial buckets was splitted.
1637  std::vector<DendroIntL > bucketSplitter;
1638 
1639  std::vector<BucketInfo<T>> nodeStack; // rotation id stack
1640  BucketInfo<T> root(0, 0, 0, pNodes.size());
1641  nodeStack.push_back(root);
1642  BucketInfo<T> tmp = root;
1643  unsigned int levSplitCount = 0;
1644 
1645  // Used repetitively in rotation computations.
1646  unsigned int hindex = 0;
1647  unsigned int hindexN = 0;
1648 
1649  unsigned int index = 0;
1650  //bool *updateState = new bool[pNodes.size()];
1651  unsigned int numLeafBuckets =0;
1652 
1653  unsigned int begin_loc=0;
1654  /*
1655  * We need to split the array into buckets until it is we number of buckets is slightly higher than the number of processors.
1656  */
1657 
1658 
1659  // 1. ===================Initial Splitting Start===============================
1660  DendroIntL spliterstemp[(NUM_CHILDREN+1)];
1661  while(numLeafBuckets<totalNumBuckets) {
1662 
1663  tmp = nodeStack[0];
1664  nodeStack.erase(nodeStack.begin());
1665 
1666 
1667  SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())),tmp.lev,pMaxDepth,tmp.rot_id,tmp.begin,tmp.end,spliterstemp);
1668 
1669 
1670  for (int i = 0; i < NUM_CHILDREN; i++) {
1671  hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] - '0');
1672  if (i == (NUM_CHILDREN-1))
1673  hindexN = i + 1;
1674  else
1675  hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] - '0');
1676  assert(spliterstemp[hindex] <= spliterstemp[hindexN]);
1677  index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1678 
1679  BucketInfo<T> child(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1680  nodeStack.push_back(child);
1681 
1682 
1683  if(tmp.lev==(firstSplitLevel-1))
1684  {
1685  BucketInfo<T> bucket(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1686  bucketCounts.push_back((spliterstemp[hindexN] - spliterstemp[hindex]));
1687  bucketSplitter.push_back(spliterstemp[hindex]);
1688  bucketInfo.push_back(bucket);
1689  numLeafBuckets++;
1690 
1691  }
1692 
1693  }
1694 
1695 
1696  }
1697 
1698 
1699 
1700 #ifdef DEBUG_TREE_SORT
1701  for(int i=0;i<bucketSplitter.size()-2;i++)
1702  {
1703  std::cout<<"Bucket Splitter : "<<bucketSplitter[i]<<std::endl;
1704  //assert(pNodes[bucketSplitter[i]]<=pNodes[bucketSplitter[i+1]]);
1705  }
1706 #endif
1707 
1708 
1709  // #ifdef DEBUG_TREE_SORT
1710  //std::cout<<rank<<" Initial Splitter Calculation ended "<<numLeafBuckets<<std::endl;
1711  //#endif
1712 
1713  //1=================== Initial Splitting END=========================================================================
1714 
1715 
1716  // (2) =================== Pick npes splitters form the bucket splitters.
1717  std::vector<DendroIntL >bucketCounts_g(bucketCounts.size());
1718  std::vector<DendroIntL >bucketCounts_gScan(bucketCounts.size());
1719 
1720 
1721  par::Mpi_Allreduce<DendroIntL>(&(*(bucketCounts.begin())),&(*(bucketCounts_g.begin())),bucketCounts.size(),MPI_SUM,comm);
1722 
1723  /* if(!rank)
1724  std::cout<<"All Reduction Time for : "<<npes<<" : "<<allReduceTime<<std::endl;*/
1725 
1726  //MPI_Allreduce(&bucketCounts[0], &bucketCounts_g[0], bucketCounts.size(), MPI_LONG_LONG, MPI_SUM, comm);
1727  //std::cout<<"All to all ended. "<<rank<<std::endl;
1728 
1729 #ifdef DEBUG_TREE_SORT
1730  assert(totalNumBuckets);
1731 #endif
1732  bucketCounts_gScan[0]=bucketCounts_g[0];
1733  for(int k=1;k<bucketCounts_g.size();k++){
1734  bucketCounts_gScan[k]=bucketCounts_gScan[k-1]+bucketCounts_g[k];
1735  }
1736 
1737 #ifdef DEBUG_TREE_SORT
1738  if(!rank) {
1739  for (int i = 0; i < totalNumBuckets; i++) {
1740  //std::cout << "Bucket count G : " << i << " : " << bucketCounts_g[i] << std::endl;
1741  std::cout << "Bucket initial count scan G : " << i << " : " << bucketCounts_gScan[i] << std::endl;
1742  }
1743  }
1744 #endif
1745 
1746 #ifdef DEBUG_TREE_SORT
1747  assert(bucketCounts_gScan.back()==globalSz);
1748 #endif
1749  DendroIntL* localSplitter=new DendroIntL[npes];
1750  std::vector<unsigned int> splitBucketIndex;
1751  DendroIntL idealLoadBalance=0;
1752  //begin_loc=0;
1753  for(int i=0;i<npes-1;i++) {
1754  idealLoadBalance+=((i+1)*globalSz/npes -i*globalSz/npes);
1755  DendroIntL toleranceLoadBalance = ((i+1)*globalSz/npes -i*globalSz/npes) * loadFlexibility;
1756 
1757  unsigned int loc=(std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) - bucketCounts_gScan.begin());
1758  //std::cout<<rank<<" Searching: "<<idealLoadBalance<<"found: "<<loc<<std::endl;
1759 
1760  if(abs(bucketCounts_gScan[loc]-idealLoadBalance) > toleranceLoadBalance)
1761  {
1762 
1763  if(splitBucketIndex.empty() || splitBucketIndex.back()!=loc)
1764  splitBucketIndex.push_back(loc);
1765  /*if(!rank)
1766  std::cout<<"Bucket index : "<<loc << " Needs a split "<<std::endl;*/
1767  }else
1768  {
1769  if ((loc + 1) < bucketSplitter.size())
1770  localSplitter[i] = bucketSplitter[loc + 1];
1771  else
1772  localSplitter[i] = bucketSplitter[loc];
1773  }
1774 
1775  /* if(loc+1<bucketCounts_gScan.size())
1776  begin_loc=loc+1;
1777  else
1778  begin_loc=loc;*/
1779 
1780  }
1781 
1782  localSplitter[npes-1]=pNodes.size();
1783 
1784 
1785 #ifdef DEBUG_TREE_SORT
1786  for(int i=0;i<splitBucketIndex.size()-1;i++)
1787  {
1788  assert(pNodes[bucketSplitter[splitBucketIndex[i]]]<pNodes[bucketSplitter[splitBucketIndex[i+1]]]);
1789  }
1790 #endif
1791  std::vector<DendroIntL> newBucketCounts;
1792  std::vector<DendroIntL> newBucketCounts_g;
1793  std::vector<BucketInfo<T>> newBucketInfo;
1794  std::vector<DendroIntL> newBucketSplitters;
1795 
1796 
1797  std::vector<DendroIntL> bucketCount_gMerge;
1798  std::vector<DendroIntL> bucketSplitterMerge;
1799  std::vector<BucketInfo<T>> bucketInfoMerge;
1800 
1801  DendroIntL splitterTemp[(NUM_CHILDREN+1)];
1802  while(!splitBucketIndex.empty()) {
1803 
1804 
1805  newBucketCounts.clear();
1806  newBucketCounts_g.clear();
1807  newBucketInfo.clear();
1808  newBucketSplitters.clear();
1809 
1810  bucketCount_gMerge.clear();
1811  bucketSplitterMerge.clear();
1812  bucketInfoMerge.clear();
1813 
1814  if (bucketInfo[splitBucketIndex[0]].lev < pMaxDepth) {
1815 
1816  BucketInfo<T> tmp;
1817  // unsigned int numSplitBuckets = NUM_CHILDREN * splitBucketIndex.size();
1818 
1819 #ifdef DEBUG_TREE_SORT
1820  if (!rank)
1821  for (int i = 0; i < splitBucketIndex.size(); i++)
1822  std::cout << "Splitter Bucket Index: " << i << " : " << splitBucketIndex[i] << std::endl;
1823 #endif
1824 
1825  //unsigned int maxDepthBuckets = 0;
1826 
1827 #ifdef DEBUG_TREE_SORT
1828  if(!rank) {
1829  for (int i = 0; i <bucketSplitter.size(); i++) {
1830  std::cout<<" Bucket Splitter : "<<i<<" : "<<bucketSplitter[i]<<std::endl;
1831  }
1832  }
1833 #endif
1834  for (int k = 0; k < splitBucketIndex.size(); k++) {
1835 
1836  tmp = bucketInfo[splitBucketIndex[k]];
1837 #ifdef DEBUG_TREE_SORT
1838  if(!rank)
1839  std::cout<<"Splitting Bucket index "<<splitBucketIndex[k]<<"begin: "<<tmp.begin <<" end: "<<tmp.end<<" rot_id: "<< (int)tmp.rot_id<<std::endl;
1840 #endif
1841 
1842  SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())),tmp.lev,pMaxDepth,tmp.rot_id,tmp.begin,tmp.end,splitterTemp);
1843 
1844 
1845 
1846  for (int i = 0; i < NUM_CHILDREN; i++) {
1847  hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] - '0');
1848  if (i == (NUM_CHILDREN-1))
1849  hindexN = i + 1;
1850  else
1851  hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] - '0');
1852 
1853  //newBucketCounts[NUM_CHILDREN * k + i] = (splitterTemp[hindexN] - splitterTemp[hindex]);
1854  newBucketCounts.push_back((splitterTemp[hindexN] - splitterTemp[hindex]));
1855 
1856  index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1857  BucketInfo<T> bucket(index, (tmp.lev + 1), splitterTemp[hindex], splitterTemp[hindexN]);
1858 // newBucketInfo[NUM_CHILDREN * k + i] = bucket;
1859 // newBucketSplitters[NUM_CHILDREN * k + i] = splitterTemp[hindex];
1860  newBucketInfo.push_back(bucket);
1861  newBucketSplitters.push_back(splitterTemp[hindex]);
1862 #ifdef DEBUG_TREE_SORT
1863  assert(pNodes[splitterTemp[hindex]]<=pNodes[splitterTemp[hindexN]]);
1864 #endif
1865  }
1866 
1867  }
1868 
1869  newBucketCounts_g.resize(newBucketCounts.size());
1870  par::Mpi_Allreduce(&(*(newBucketCounts.begin())),&(*(newBucketCounts_g.begin())),newBucketCounts.size(),MPI_SUM,comm);
1871 
1872 
1873 
1874 #ifdef DEBUG_TREE_SORT
1875  for(int i=0;i<newBucketSplitters.size()-1;i++)
1876  {
1877  assert(pNodes[newBucketSplitters[i]]<=pNodes[newBucketSplitters[i+1]]);
1878  }
1879 #endif
1880 
1881 
1882 
1883 #ifdef DEBUG_TREE_SORT
1884 
1885  for (int k = 0; k < splitBucketIndex.size(); k++) {
1886  unsigned int sum = 0;
1887  for (int i = 0; i < NUM_CHILDREN; i++)
1888  sum += newBucketCounts_g[NUM_CHILDREN * k + i];
1889  if (!rank) if (bucketCounts_g[splitBucketIndex[k]] != sum) {
1890  assert(false);
1891  }
1892 
1893  }
1894 #endif
1895 
1896  //int count = 0;
1897  for (int k = 0; k < splitBucketIndex.size(); k++) {
1898 
1899  unsigned int bucketBegin = 0;
1900 
1901  (k == 0) ? bucketBegin = 0 : bucketBegin = splitBucketIndex[k - 1] + 1;
1902 
1903 
1904  for (int i = bucketBegin; i < splitBucketIndex[k]; i++) {
1905 
1906  bucketCount_gMerge.push_back(bucketCounts_g[i]);
1907  bucketInfoMerge.push_back(bucketInfo[i]);
1908  bucketSplitterMerge.push_back(bucketSplitter[i]);
1909 
1910  }
1911 
1912  tmp = bucketInfo[splitBucketIndex[k]];
1913  for (int i = 0; i < NUM_CHILDREN; i++) {
1914  bucketCount_gMerge.push_back(newBucketCounts_g[NUM_CHILDREN * k + i]);
1915  bucketInfoMerge.push_back(newBucketInfo[NUM_CHILDREN * k + i]);
1916  bucketSplitterMerge.push_back(newBucketSplitters[NUM_CHILDREN * k + i]);
1917 
1918  }
1919 
1920 
1921  if (k == (splitBucketIndex.size() - 1)) {
1922  for (int i = splitBucketIndex[k] + 1; i < bucketCounts_g.size(); i++) {
1923  bucketCount_gMerge.push_back(bucketCounts_g[i]);
1924  bucketInfoMerge.push_back(bucketInfo[i]);
1925  bucketSplitterMerge.push_back(bucketSplitter[i]);
1926  }
1927 
1928 
1929  }
1930 
1931 
1932  }
1933 
1934  std::swap(bucketCounts_g,bucketCount_gMerge);
1935  std::swap(bucketInfo,bucketInfoMerge);
1936  std::swap(bucketSplitter,bucketSplitterMerge);
1937 
1938 
1939  bucketCounts_gScan.resize(bucketCounts_g.size());
1940 
1941  bucketCounts_gScan[0] = bucketCounts_g[0];
1942  for (int k = 1; k < bucketCounts_g.size(); k++) {
1943  bucketCounts_gScan[k] = bucketCounts_gScan[k - 1] + bucketCounts_g[k];
1944  }
1945 #ifdef DEBUG_TREE_SORT
1946  assert(bucketCounts_gScan.back()==globalSz);
1947 #endif
1948  splitBucketIndex.clear();
1949 #ifdef DEBUG_TREE_SORT
1950  for(int i=0;i<bucketSplitter.size()-2;i++)
1951  {
1952  std::cout<<"Bucket Splitter : "<<bucketSplitter[i]<<std::endl;
1953  assert( bucketSplitter[i+1]!=(pNodes.size()) && pNodes[bucketSplitter[i]]<=pNodes[bucketSplitter[i+1]]);
1954  }
1955 #endif
1956 
1957  idealLoadBalance = 0;
1958  //begin_loc=0;
1959  for (unsigned int i = 0; i < npes-1; i++) {
1960  idealLoadBalance += ((i + 1) * globalSz / npes - i * globalSz / npes);
1961  DendroIntL toleranceLoadBalance = (((i + 1) * globalSz / npes - i * globalSz / npes) * loadFlexibility);
1962  unsigned int loc = (std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) -
1963  bucketCounts_gScan.begin());
1964 
1965  if (abs(bucketCounts_gScan[loc] - idealLoadBalance) > toleranceLoadBalance) {
1966  if (splitBucketIndex.empty() || splitBucketIndex.back() != loc)
1967  splitBucketIndex.push_back(loc);
1968 
1969 
1970  } else {
1971  if ((loc + 1) < bucketSplitter.size())
1972  localSplitter[i] = bucketSplitter[loc + 1];
1973  else
1974  localSplitter[i] = bucketSplitter[loc];
1975 
1976  }
1977 
1978  /* if(loc+1<bucketCounts_gScan.size())
1979  begin_loc=loc+1;
1980  else
1981  begin_loc=loc;*/
1982 
1983  }
1984  localSplitter[npes-1]=pNodes.size();
1985 
1986  } else {
1987  //begin_loc=0;
1988  idealLoadBalance = 0;
1989  for (unsigned int i = 0; i < npes-1; i++) {
1990 
1991  idealLoadBalance += ((i + 1) * globalSz / npes - i * globalSz / npes);
1992  //DendroIntL toleranceLoadBalance = ((i + 1) * globalSz / npes - i * globalSz / npes) * loadFlexibility;
1993  unsigned int loc = (
1994  std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) -
1995  bucketCounts_gScan.begin());
1996 
1997 
1998  if ((loc + 1) < bucketSplitter.size())
1999  localSplitter[i] = bucketSplitter[loc + 1];
2000  else
2001  localSplitter[i] = bucketSplitter[loc];
2002 
2003  /* if(loc+1<bucketCounts_gScan.size())
2004  begin_loc=loc+1;
2005  else
2006  begin_loc=loc;*/
2007 
2008  }
2009  localSplitter[npes-1]=pNodes.size();
2010 
2011 
2012 
2013  break;
2014  }
2015 
2016  }
2017 
2018 
2019 
2020  bucketCount_gMerge.clear();
2021  bucketCounts.clear();
2022  bucketCounts_gScan.clear();
2023  bucketInfoMerge.clear();
2024  bucketInfo.clear();
2025  bucketCounts_g.clear();
2026  bucketSplitter.clear();
2027  bucketSplitterMerge.clear();
2028  newBucketCounts.clear();
2029  newBucketCounts_g.clear();
2030  newBucketInfo.clear();
2031  newBucketSplitters.clear();
2032 
2033 
2034 //#ifdef DEBUG_TREE_SORT
2035  // if(!rank) std::cout<<"Splitter Calculation ended "<<std::endl;
2036 //#endif
2037 
2038 #ifdef DEBUG_TREE_SORT
2039  for(int i=0;i<npes;i++)
2040  {
2041  for(int j=i+1 ;j<npes -1;j++)
2042  assert(pNodes[localSplitter[i]]<=pNodes[localSplitter[j]]);
2043  }
2044 #endif
2045 
2046 
2047 #ifdef DEBUG_TREE_SORT
2048  if(!rank)
2049  {
2050  for(int i=0;i<npes;i++)
2051  std::cout<<"Rank "<<rank<<" Local Splitter: "<<i<<": "<<localSplitter[i]<<std::endl;
2052  }
2053 #endif
2054 
2055 // 3. All to all communication
2056 
2057 #ifdef PROFILE_TREE_SORT
2058  splitter_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
2059  //MPI_Barrier(pcomm);
2060  t2=std::chrono::high_resolution_clock::now();//MPI_Wtime();
2061 #endif
2062 
2063 
2064  int * sendCounts = new int[npes];
2065  int * recvCounts = new int[npes];
2066 
2067 
2068  sendCounts[0] = localSplitter[0];
2069 
2070  for(unsigned int i=1;i<npes; ++i)
2071  {
2072  sendCounts[i] = localSplitter[i] - localSplitter[i-1];
2073  }
2074 
2075 
2076 
2077  par::Mpi_Alltoall(sendCounts,recvCounts,1,comm);
2078  //MPI_Alltoall(sendCounts, 1, MPI_INT,recvCounts,1,MPI_INT,comm);
2079  //std::cout<<"rank "<<rank<<" MPI_ALL TO ALL END"<<std::endl;
2080 
2081  int * sendDispl =new int [npes];
2082  int * recvDispl =new int [npes];
2083 
2084 
2085  sendDispl[0] = 0;
2086  recvDispl[0] = 0;
2087 
2088  for(int i=1;i<npes;i++)
2089  {
2090  sendDispl[i] = sendCounts[i-1] + sendDispl[i - 1];
2091  recvDispl[i] =recvCounts[i-1] +recvDispl[i-1];
2092  }
2093 
2094 
2095 #ifdef DEBUG_TREE_SORT
2096  /*if (!rank)*/ std::cout << rank << " : send = " << sendCounts[0] << ", " << sendCounts[1] << std::endl;
2097  /*if (!rank)*/ std::cout << rank << " : recv = " << recvCounts[0] << ", " << recvCounts[1] << std::endl;
2098 
2099  /* if (!rank) std::cout << rank << " : send offset = " << sendDispl[0] << ", " << sendDispl[1] << std::endl;
2100  if (!rank) std::cout << rank << " : recv offset = " << recvDispl[0] << ", " << recvDispl[1] << std::endl;*/
2101 #endif
2102 
2103 
2104 
2105  std::vector<T> pNodesRecv;
2106  DendroIntL recvTotalCnt=recvDispl[npes-1]+recvCounts[npes-1];
2107  if(recvTotalCnt) pNodesRecv.resize(recvTotalCnt);
2108 
2109  //par::Mpi_Alltoallv(&pNodes[0],sendCounts,sendDispl,&pNodesRecv[0],recvCounts,recvDispl,comm);
2110  // MPI_Alltoallv(&pNodes[0],sendCounts,sendDispl,MPI_TREENODE,&pNodesRecv[0],recvCounts,recvDispl,MPI_TREENODE,comm);
2111  par::Mpi_Alltoallv_Kway(&pNodes[0],sendCounts,sendDispl,&pNodesRecv[0],recvCounts,recvDispl,comm);
2112 
2113 #ifdef PROFILE_TREE_SORT
2114  all2all2_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
2115  //MPI_Barrier(pcomm);
2116 #endif
2117 
2118 #ifdef DEBUG_TREE_SORT
2119  if(!rank) std::cout<<"All2All Communication Ended "<<std::endl;
2120 #endif
2121 
2122  delete[](localSplitter);
2123  localSplitter=NULL;
2124 
2125 
2126  pNodes.clear();
2127  //pNodes=pNodesRecv;
2128  std::swap(pNodes,pNodesRecv);
2129  pNodesRecv.clear();
2130 
2131  delete[](sendCounts);
2132  delete[](sendDispl);
2133  delete[](recvCounts);
2134  delete[](recvDispl);
2135 
2136 
2137  //std::cout<<"Rank: "<<rank<<"executing local Sort for size: "<<pNodes.size()<<std::endl;
2138 #ifdef PROFILE_TREE_SORT
2139  //MPI_Barrier(pcomm);
2140  t2=std::chrono::high_resolution_clock::now();//MPI_Wtime();
2141 #endif
2142  SFC::seqSort::SFC_treeSort(&(*(pNodes.begin())),pNodes.size(),pOutSorted,pOutConstruct,pOutBalanced,pMaxDepth,pMaxDepth,parent,rot_id,k,options);
2143 
2144 #ifdef PROFILE_TREE_SORT
2145  localSort_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();//MPI_Wtime()-t2;
2146  //MPI_Barrier(pcomm);
2147 #endif
2148 
2149  /* assert(seq::test::isUniqueAndSorted(pOutSorted));
2150  assert(seq::test::isUniqueAndSorted(pOutConstruct));
2151  assert(seq::test::isUniqueAndSorted(pOutBalanced));*/
2152  // freee the comm from the last stage.
2153  MPI_Comm_free(&comm);
2154 
2155  if(options & TS_REMOVE_DUPLICATES) {
2156 
2157  //if(!rank) std::cout<<"Executing par::RD begin"<<std::endl;
2158 
2159 #ifdef PROFILE_TREE_SORT
2160  //MPI_Barrier(pcomm);
2161  t2=std::chrono::high_resolution_clock::now();//MPI_Wtime();
2162 #endif
2163 
2164  int new_rank, new_size;
2165  MPI_Comm new_comm;
2166  // very quick and dirty solution -- assert that tmpVec is non-emply at every processor (repetetive calls to splitComm2way exhaust MPI resources)
2167  par::splitComm2way(pOutSorted.empty(), &new_comm, pcomm);
2168  //new_comm = pcomm;
2169  //assert(!pNodes.empty());
2170  MPI_Comm_rank(new_comm, &new_rank);
2171  MPI_Comm_size(new_comm, &new_size);
2172 
2173 
2174 #ifdef __DEBUG_PAR__
2175  MPI_Barrier(comm);
2176  if(!rank) {
2177  std::cout<<"RemDup: Stage-4 passed."<<std::endl;
2178  }
2179  MPI_Barrier(comm);
2180 #endif
2181 
2182  //Checking boundaries...
2183  if (!pOutSorted.empty()) {
2184 
2185  T begin =pOutSorted[0];
2186  T end = pOutSorted[pOutSorted.size() - 1];
2187  T endRecv;
2188  T beginRecv;
2189 
2190  //communicate end to the next processor.
2191  MPI_Status status;
2192 
2193  par::Mpi_Sendrecv<T, T>(&end, 1, ((new_rank < (new_size - 1)) ? (new_rank + 1) : 0), 1, &endRecv,
2194  1, ((new_rank > 0) ? (new_rank - 1) : (new_size - 1)), 1, new_comm,
2195  &status);
2196 
2197 
2198 
2199  //Remove endRecv if it exists (There can be no more than one copy of this)
2200  if (new_rank) {
2201  typename std::vector<T>::iterator Iter = std::find(pOutSorted.begin(), pOutSorted.end(), endRecv);
2202  if (Iter != pOutSorted.end()) {
2203  pOutSorted.erase(Iter);
2204  }//end if found
2205 
2206 
2207 
2208  }//end if p not 0
2209 
2210 
2211  bool state=true;
2212  bool state_global=true;
2213  unsigned int count=0;
2214  //@milindasf : Possible location for MPI hang, if for a one processor if above is not true, then followign sendrecv will get hanged.
2215 
2216  while(count<pOutSorted.size() & state_global ) {
2217 
2218  begin=pOutSorted[count];
2219  end=pOutSorted.back();
2220  state=false;
2221  par::Mpi_Sendrecv<T, T>(&begin, 1, ((new_rank > 0) ? (new_rank - 1) : (new_size - 1)), 1,
2222  &beginRecv, 1, ((new_rank < (new_size - 1)) ? (new_rank + 1) : 0), 1, new_comm,
2223  &status);
2224 
2225  /* if(beginRecv.isAncestor(end))
2226  {
2227  std::cout<<"for rank: "<<new_rank<<" beginRecv: "<<beginRecv<<" is ancestor to : "<<end<<std::endl;
2228  }*/
2229 
2230  while (end.isAncestor(beginRecv)) {
2231  state = true;
2232  pOutSorted.pop_back();
2233  end = pOutSorted.back();
2234  }
2235  count++;
2236  MPI_Allreduce(&state,&state_global,1,MPI_CXX_BOOL,MPI_LOR,new_comm);
2237 
2238 
2239  }
2240 
2241  }//end if not empty
2242 
2243  // free the new comm.
2244  MPI_Comm_free(&new_comm);
2245 
2246  //if(!rank) std::cout<<"Executing par::RD end"<<std::endl;
2247 #ifdef PROFILE_TREE_SORT
2248  remove_duplicates_par=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();//MPI_Wtime()-t2;
2249  //MPI_Barrier(pcomm);
2250 #endif
2251 
2252  }
2253 
2254 #ifdef PROFILE_TREE_SORT
2255  total_rd=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t4).count();
2256 
2257 
2258  int rank_g,npes_g;
2259  MPI_Comm_rank(pcomm,&rank_g);
2260  MPI_Comm_size(pcomm,&npes_g);
2261 
2262  par::Mpi_Reduce(&splitter_fix_all,&stat_property[0],1,MPI_MIN,0,pcomm);
2263  par::Mpi_Reduce(&splitter_fix_all,&stat_property[1],1,MPI_SUM,0,pcomm);
2264  par::Mpi_Reduce(&splitter_fix_all,&stat_property[2],1,MPI_MAX,0,pcomm);
2265 
2266  if(!rank_g) {
2267  stat_property[1] = stat_property[1] / (double) npes_g;
2268  //std::cout<<"Rank: "<<rank_g<<"splitter_fix_all: "<<stat_property[0]<< ": "<<stat_property[1]<<" : "<<stat_property[2]<<std::endl;
2269  stats.push_back(stat_property[0]);
2270  stats.push_back(stat_property[1]);
2271  stats.push_back(stat_property[2]);
2272  }
2273 
2274 
2275  par::Mpi_Reduce(&splitter_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2276  par::Mpi_Reduce(&splitter_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2277  par::Mpi_Reduce(&splitter_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2278 
2279  if(!rank_g)
2280  {
2281  stat_property[1] = stat_property[1] / (double) npes_g;
2282 
2283  stats.push_back(stat_property[0]);
2284  stats.push_back(stat_property[1]);
2285  stats.push_back(stat_property[2]);
2286 
2287  }
2288 
2289 
2290  par::Mpi_Reduce(&all2all1_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2291  par::Mpi_Reduce(&all2all1_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2292  par::Mpi_Reduce(&all2all1_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2293 
2294  if(!rank_g)
2295  {
2296  stat_property[1] = stat_property[1] / (double) npes_g;
2297 
2298  stats.push_back(stat_property[0]);
2299  stats.push_back(stat_property[1]);
2300  stats.push_back(stat_property[2]);
2301 
2302  }
2303 
2304 
2305  par::Mpi_Reduce(&all2all2_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2306  par::Mpi_Reduce(&all2all2_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2307  par::Mpi_Reduce(&all2all2_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2308 
2309  if(!rank_g)
2310  {
2311  stat_property[1] = stat_property[1] / (double) npes_g;
2312 
2313  stats.push_back(stat_property[0]);
2314  stats.push_back(stat_property[1]);
2315  stats.push_back(stat_property[2]);
2316 
2317  }
2318 
2319  par::Mpi_Reduce(&localSort_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2320  par::Mpi_Reduce(&localSort_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2321  par::Mpi_Reduce(&localSort_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2322 
2323  if(!rank_g)
2324  {
2325  stat_property[1] = stat_property[1] / (double) npes_g;
2326 
2327  stats.push_back(stat_property[0]);
2328  stats.push_back(stat_property[1]);
2329  stats.push_back(stat_property[2]);
2330 
2331  }
2332 
2333 
2334  par::Mpi_Reduce(&remove_duplicates_seq,&stat_property[0],1,MPI_MIN,0,pcomm);
2335  par::Mpi_Reduce(&remove_duplicates_seq,&stat_property[1],1,MPI_SUM,0,pcomm);
2336  par::Mpi_Reduce(&remove_duplicates_seq,&stat_property[2],1,MPI_MAX,0,pcomm);
2337 
2338  if(!rank_g)
2339  {
2340  stat_property[1] = stat_property[1] / (double) npes_g;
2341 
2342  stats.push_back(stat_property[0]);
2343  stats.push_back(stat_property[1]);
2344  stats.push_back(stat_property[2]);
2345 
2346  }
2347 
2348 
2349  par::Mpi_Reduce(&auxBalOCt_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2350  par::Mpi_Reduce(&auxBalOCt_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2351  par::Mpi_Reduce(&auxBalOCt_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2352 
2353  if(!rank_g)
2354  {
2355  stat_property[1] = stat_property[1] / (double) npes_g;
2356 
2357  stats.push_back(stat_property[0]);
2358  stats.push_back(stat_property[1]);
2359  stats.push_back(stat_property[2]);
2360 
2361  }
2362 
2363 
2364  par::Mpi_Reduce(&remove_duplicates_par,&stat_property[0],1,MPI_MIN,0,pcomm);
2365  par::Mpi_Reduce(&remove_duplicates_par,&stat_property[1],1,MPI_SUM,0,pcomm);
2366  par::Mpi_Reduce(&remove_duplicates_par,&stat_property[2],1,MPI_MAX,0,pcomm);
2367 
2368  if(!rank_g)
2369  {
2370  stat_property[1] = stat_property[1] / (double) npes_g;
2371 
2372  stats.push_back(stat_property[0]);
2373  stats.push_back(stat_property[1]);
2374  stats.push_back(stat_property[2]);
2375 
2376 
2377  }
2378 
2379 
2380 
2381  par::Mpi_Reduce(&total_rd,&stat_property[0],1,MPI_MIN,0,pcomm);
2382  par::Mpi_Reduce(&total_rd,&stat_property[1],1,MPI_SUM,0,pcomm);
2383  par::Mpi_Reduce(&total_rd,&stat_property[2],1,MPI_MAX,0,pcomm);
2384 
2385  if(!rank_g)
2386  {
2387  stat_property[1] = stat_property[1] / (double) npes_g;
2388 
2389  stats.push_back(stat_property[0]);
2390  stats.push_back(stat_property[1]);
2391  stats.push_back(stat_property[2]);
2392 
2393 
2394  }
2395 
2396  for(int i=0;i<SF_Stages;i++)
2397  {
2398  par::Mpi_Reduce(&sf_full[i],&stat_property[0],1,MPI_MIN,0,pcomm);
2399  par::Mpi_Reduce(&sf_full[i],&stat_property[1],1,MPI_SUM,0,pcomm);
2400  par::Mpi_Reduce(&sf_full[i],&stat_property[2],1,MPI_MAX,0,pcomm);
2401 
2402  stat_property[1] = stat_property[1] / (double) npes_g;
2403 
2404  if(!rank_g)
2405  {
2406  stats.push_back(stat_property[0]);
2407  stats.push_back(stat_property[1]);
2408  stats.push_back(stat_property[2]);
2409  }
2410 
2411 
2412  par::Mpi_Reduce(&sf_all2all[i],&stat_property[0],1,MPI_MIN,0,pcomm);
2413  par::Mpi_Reduce(&sf_all2all[i],&stat_property[1],1,MPI_SUM,0,pcomm);
2414  par::Mpi_Reduce(&sf_all2all[i],&stat_property[2],1,MPI_MAX,0,pcomm);
2415 
2416  stat_property[1] = stat_property[1] / (double) npes_g;
2417 
2418  if(!rank_g)
2419  {
2420  stats.push_back(stat_property[0]);
2421  stats.push_back(stat_property[1]);
2422  stats.push_back(stat_property[2]);
2423  }
2424 
2425 
2426  par::Mpi_Reduce(&sf_splitters[i],&stat_property[0],1,MPI_MIN,0,pcomm);
2427  par::Mpi_Reduce(&sf_splitters[i],&stat_property[1],1,MPI_SUM,0,pcomm);
2428  par::Mpi_Reduce(&sf_splitters[i],&stat_property[2],1,MPI_MAX,0,pcomm);
2429 
2430  stat_property[1] = stat_property[1] / (double) npes_g;
2431 
2432  if(!rank_g)
2433  {
2434  stats.push_back(stat_property[0]);
2435  stats.push_back(stat_property[1]);
2436  stats.push_back(stat_property[2]);
2437  }
2438 
2439  }
2440 
2441 
2442 
2443 #endif
2444 
2445 
2446 
2447  if((options & TS_CONSTRUCT_OCTREE) | (options & TS_BALANCE_OCTREE))
2448  {
2449 
2450 
2451  MPI_Comm_rank(pcomm,&rank);
2452  MPI_Comm_size(pcomm,&npes);
2453 
2454  /*std::cout<<"rank: "<<rank<<" Remove Duplicates"<<std::endl;*/
2455 #ifdef PROFILE_TREE_SORT
2456  stats_previous.clear();
2457  stats_previous.insert(stats_previous.end(),stats.begin(),stats.end());
2458 #endif
2459  std::vector<T> tmpSorted;
2460  std::vector<T> tmpConstructed;
2461  std::vector<T> tmpBalanced;
2462  T root(0,0,0,0,m_uiDim,pMaxDepth);
2463  if(options & TS_CONSTRUCT_OCTREE) {
2464 
2465 
2466 #ifdef PROFILE_TREE_SORT
2467  //MPI_Barrier(pcomm);
2468  t3=std::chrono::high_resolution_clock::now();//MPI_Wtime();
2469 #endif
2470  par::partitionW<T>(pOutConstruct,NULL,pcomm);
2471  SFC::parSort::SFC_treeSort(pOutConstruct,tmpSorted,tmpConstructed,tmpBalanced,loadFlexibility,pMaxDepth,root,rot_id,k,1,sf_k,pcomm);
2472 
2473 #ifdef PROFILE_TREE_SORT
2474  construction_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t3).count();
2475  //MPI_Wtime()-t3;
2476 
2477  //MPI_Barrier(pcomm);
2478 
2479  int rank_g,npes_g;
2480  MPI_Comm_rank(pcomm,&rank_g);
2481  MPI_Comm_size(pcomm,&npes_g);
2482 
2483 
2484  par::Mpi_Reduce(&construction_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2485  par::Mpi_Reduce(&construction_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2486  par::Mpi_Reduce(&construction_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2487 
2488  if(!rank_g)
2489  {
2490  stat_property[1] = stat_property[1] / (double) npes_g;
2491 
2492  stats.push_back(stat_property[0]);
2493  stats.push_back(stat_property[1]);
2494  stats.push_back(stat_property[2]);
2495 
2496  }
2497 
2498 
2499 #endif
2500 
2501  std::swap(tmpSorted,pOutConstruct);
2502  tmpSorted.clear();
2503 
2504  //if(!rank) std::cout<<"Executing Construct end "<<std::endl;
2505 
2506  }
2507  if(options & TS_BALANCE_OCTREE) {
2508  //if(!rank) std::cout<<"Executing balancing"<<std::endl;
2509 #ifdef PROFILE_TREE_SORT
2510  //MPI_Barrier(pcomm);
2511  t3=std::chrono::high_resolution_clock::now();//MPI_Wtime();
2512 #endif
2513  par::partitionW<T>(pOutBalanced,NULL,pcomm);
2514  SFC::parSort::SFC_treeSort(pOutBalanced,tmpSorted,tmpConstructed,tmpBalanced,loadFlexibility,pMaxDepth,root,rot_id,k,1,sf_k,pcomm);
2515 
2516 #ifdef PROFILE_TREE_SORT
2517  balancing_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t3).count();//MPI_Wtime()-t3;
2518  int rank_g,npes_g;
2519  MPI_Comm_rank(pcomm,&rank_g);
2520  MPI_Comm_size(pcomm,&npes_g);
2521 
2522  par::Mpi_Reduce(&balancing_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2523  par::Mpi_Reduce(&balancing_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2524  par::Mpi_Reduce(&balancing_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2525 
2526  if(!rank_g)
2527  {
2528  stat_property[1] = stat_property[1] / (double) npes_g;
2529 
2530  stats.push_back(stat_property[0]);
2531  stats.push_back(stat_property[1]);
2532  stats.push_back(stat_property[2]);
2533 
2534  }
2535 
2536  //MPI_Barrier(pcomm);
2537 
2538 #endif
2539  std::swap(tmpSorted,pOutBalanced);
2540  tmpSorted.clear();
2541 
2542 
2543 
2544  }
2545 
2546  }
2547 
2548 
2549 
2550  }
2551 
2552 
2553 
2554 
2555 
2556  /* template <typename T>
2557  void SFC_PartitionW(std::vector<T>&pNodes,double loadFlexibility, unsigned int maxDepth,MPI_Comm comm)
2558  {
2559  T root=T(m_uiDim,maxDepth);
2560  std::vector<T> tmp;
2561  SFC_treeSort(pNodes,tmp,tmp,tmp,loadFlexibility,maxDepth,root,ROOT_ROTATION,1,0,NUM_NPES_THRESHOLD,comm);
2562  tmp.clear();
2563  }*/
2564 
2565 
2566 
2567 
2568 
2569  //========================================================= Function definition end.=========================================================================================
2570 
2571 
2572 
2573  }// end of namespace parSort
2574 
2575 
2576 }// end of namespace SFC
2577 
2578 
2579 
2580 
2581 
2582 #endif //SFCSORTBENCH_SFCSORT_H
Traits to determine MPI_DATATYPE from a C++ datatype.
int Mpi_Alltoallv_sparse(T *sendbuf, int *sendcnts, int *sdispls, T *recvbuf, int *recvcnts, int *rdispls, MPI_Comm comm)
int Mpi_Reduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, int root, MPI_Comm comm)
int Mpi_Alltoall(T *sendbuf, T *recvbuf, int count, MPI_Comm comm)
Definition: sfcSort.h:217
Definition: sfcSort.h:153
A set of parallel utilities.
unsigned int fastLog2(unsigned int num)
Definition: binUtils.cpp:15
Definition: sfcSort.h:126
int Mpi_Allreduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm)
A Set of utilities to test octrees.
int splitComm2way(bool iAmEmpty, MPI_Comm *new_comm, MPI_Comm orig_comm)
Removes duplicates in parallel. If the input is not sorted, sample sort will be called within the fun...
Definition: parUtils.cpp:128
Definition: sfcSearch.h:22