// ---------------------------------------------------------------------------
// Header guard, includes, and profiling globals for an SFC (space-filling
// curve) tree-sort header.  NOTE(review): this view of the file elides many
// original lines and fuses original line numbers into the text; all code is
// preserved byte-for-byte below and only annotated.
// ---------------------------------------------------------------------------
16 #ifndef SFCSORTBENCH_SFCSORT_H 17 #define SFCSORTBENCH_SFCSORT_H 23 #include "hcurvedata.h" 25 #include "treenode2vtk.h" 35 #include <unordered_set> 39 #ifdef PROFILE_TREE_SORT 42 std::vector<double> stats_previous;
43 std::vector<double> stats;
// Problem sizes recorded for profiling (meaning of the 3 slots not visible
// in this view — TODO confirm).
62 unsigned int inputSz[3];
// Timing accumulators; later code stores duration_cast<milliseconds> counts
// into these, so they hold milliseconds.
64 double splitter_fix_all=0;
65 double splitter_time=0;
66 double all2all1_time=0;
67 double all2all2_time=0;
69 double localSort_time=0;
70 double remove_duplicates_seq=0;
71 double remove_duplicates_par=0;
73 double auxBalOCt_time=0;
75 double construction_time=0;
76 double balancing_time=0;
// Reusable high-resolution timestamps for the PROFILE_TREE_SORT blocks below.
79 auto t1=std::chrono::high_resolution_clock::now();
80 auto t2=std::chrono::high_resolution_clock::now();
81 auto t3=std::chrono::high_resolution_clock::now();
82 auto t4=std::chrono::high_resolution_clock::now();
83 auto t5_sf_staged=std::chrono::high_resolution_clock::now();
85 double stat_property[3]={};
// 128-bit unsigned integer (GCC/Clang extension) used below to pack
// (level, x, y, z) into a single comparable key.
90 typedef unsigned __int128 uint128_t;
// NUM_CHILDREN / ROTATION_OFFSET are defined twice (4/8 and 8/16) —
// presumably selected by an elided #ifdef on the dimension (quadtree vs
// octree); TODO confirm.  TS_* values are bit-flags for the `options`
// parameter of SFC_treeSort (TS_SORT_ONLY==0 is tested with == rather than &).
93 #define NUM_CHILDREN 4 94 #define ROTATION_OFFSET 8 97 #define NUM_CHILDREN 8 98 #define ROTATION_OFFSET 16 102 #define MILLISECOND_CONVERSION 1e3 103 #define ROOT_ROTATION 0 119 #define TS_SORT_ONLY 0 120 #define TS_REMOVE_DUPLICATES 1 121 #define TS_CONSTRUCT_OCTREE 2 122 #define TS_BALANCE_OCTREE 4 // which ensures that balance octree cannote be called without construct octree function. 125 template <
typename T>
// Fragment of the BucketInfo<T> helper used by the bucketing/splitter code
// below: a bucket is a Hilbert rotation id, a tree level and a [begin,end)
// index range into the node array.  (Most members are elided in this view.)
128 unsigned char rot_id;
140 BucketInfo(
unsigned char p_rot_id,
unsigned char p_lev,DendroIntL p_begin, DendroIntL p_end)
// Morton-code helper: spreads the bits of `a` apart so that two zero bits sit
// between consecutive data bits (the classic magic-number bit-interleave used
// to build 3D Morton keys).  NOTE(review): the declaration/initialization of
// `x` (original line 157) and the return statement are elided in this view.
156 inline uint64_t splitBy3(
unsigned int a)
const{
158 x = (x | (x << 16)) & 0x0000FFFF0000FFFF;
159 x = (x | (x << 8)) & 0x00FF00FF00FF00FF;
160 x = (x | (x << 4)) & 0x0F0F0F0F0F0F0F0F;
161 x = (x | (x << 2)) & 0x3333333333333333;
162 x = (x | (x << 1)) & 0x5555555555555555;
// Comparator fragment: packs (level, x, y, z) of each octant into one 128-bit
// key (level in the top 32 bits, then x, y, z) and — presumably, in elided
// code — compares a1 and a2 to produce the ordering.  TODO confirm the elided
// comparison direction.
167 inline bool operator()(
const T & first,
const T &other)
const 184 a1=(((uint128_t)first.getLevel())<<96u) | (((uint128_t)first.getX())<<64u) | (((uint128_t)first.getY())<<32u) |(((uint128_t)first.getZ()));
185 a2=(((uint128_t)other.getLevel())<<96u) | (((uint128_t)other.getX())<<64u) | (((uint128_t)other.getY())<<32u) |(((uint128_t)other.getZ()));
// Key-extractor fragment: builds the same packed 128-bit (level,x,y,z) key
// for a single octant — the functor counterpart of the comparator above
// (return statement elided in this view).
219 inline uint128_t operator() (
const T &first)
const {
230 a1=(((uint128_t)first.getLevel())<<96u) | (((uint128_t)first.getX())<<64u) | (((uint128_t)first.getY())<<32u) |(((uint128_t)first.getZ()));
// --- Sequential (seqSort) interface declarations ---------------------------
// SFC_bottomUpBalOctantCreation: inserts the parent-neighbour octants needed
// for balancing (definition appears below).
260 inline void SFC_bottomUpBalOctantCreation(std::vector<T> & pNodes);
// SFC_treeSort: in-place SFC sort of pNodes[0..n); depending on the TS_*
// bit-flags in `options` it also fills pOutSorted (de-duplicated),
// pOutConstruct (constructed octree) and/or pOutBalanced (balanced octree).
// `k` is the bucket-size cut-off below which recursion stops.
276 void SFC_treeSort(T* pNodes , DendroIntL n ,std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced,
unsigned int pMaxDepthBit,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,
unsigned int k,
unsigned int options);
// SFC_bucketing: one level of counting-sort bucketing over [begin,end);
// writes the NUM_CHILDREN+1 child-bucket boundaries into `splitters`.
285 inline void SFC_bucketing(T *pNodes,
int lev,
unsigned int maxDepth,
unsigned char rot_id,DendroIntL &begin, DendroIntL &end, DendroIntL *splitters);
// SFC_treeSortLocalOptimal, single-optimal overload: finds the SFC-minimum
// (minimum==true) or SFC-maximum octant without fully sorting the input.
300 void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n ,
unsigned int pMaxDepthBit,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,
bool minimum,T& optimal);
// SFC_treeSortLocalOptimal, min+max overload: computes both extrema in one
// bucketing pass (definition below).
316 void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n ,
unsigned int pMaxDepthBit,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,T&min,T&max);
// Bottom-up auxiliary-octant creation for balancing: for every node's parent
// (the first time that parent is seen) emplace the parent's neighbour octants
// into auxOct, then append the newly inserted octants to pNodes.
// NOTE(review): many original lines are elided in this view (loop closings,
// the outer iteration, the 2D/3D #ifdef, updates of preSz/curr/neighbourCount).
325 inline void SFC_bottomUpBalOctantCreation(std::vector<T> & pNodes)
328 if(pNodes.empty()){
return; }
332 T root(m_uiDim,m_uiMaxDepth);
// auxOct starts as the set of all input octants; parentAux tracks which
// parents have already had their neighbourhood generated.
333 std::set<T,OctreeComp<T> > auxOct(pNodes.begin(),pNodes.end());
334 std::set<T,OctreeComp<T> > parentAux;
335 unsigned int neighbourCount=0;
337 unsigned int newCount=0;
338 unsigned int preSz=0;
340 unsigned int d_max=(1u<<m_uiMaxDepth);
// NOTE(review): `hint` is a variable-length array (non-standard C++) sized by
// neighbourCount, which is 0 just above — the elided lines 336..347 presumably
// set it (8 neighbours in the first branch, 26 in the second); TODO confirm.
348 std::pair<typename std::set<T,OctreeComp<T>>::iterator,
bool> hint[neighbourCount];
349 std::pair<typename std::set<T,OctreeComp<T>>::iterator,
bool> hintParent;
355 for(
unsigned int w=0;w<pNodes.size();++w){
// set::emplace returns pair<iterator,bool>; .second==true means this parent
// was not seen before, so its neighbourhood is generated exactly once.
356 hintParent=parentAux.emplace(pNodes[w].getParent());
357 if(hintParent.second) {
358 tmpParent=*(hintParent.first);
// First branch (presumably the 2D case): the 8 in-plane neighbours.
362 auxOct.emplace(tmpParent.getLeft());
363 auxOct.emplace(tmpParent.getRight());
364 auxOct.emplace(tmpParent.getFront());
365 auxOct.emplace(tmpParent.getBack());
366 auxOct.emplace(tmpParent.getLeftBack());
367 auxOct.emplace(tmpParent.getRightBack());
368 auxOct.emplace(tmpParent.getLeftFront());
369 auxOct.emplace(tmpParent.getRightFront());
// Second branch (presumably the 3D case): all 26 neighbours, keeping each
// emplace result so newly inserted octants can be appended to pNodes below.
375 hint[0] = auxOct.emplace(tmpParent.getLeft());
376 hint[1] = auxOct.emplace(tmpParent.getLeftBack());
377 hint[2] = auxOct.emplace(tmpParent.getLeftFront());
378 hint[3] = auxOct.emplace(tmpParent.getRight());
379 hint[4] = auxOct.emplace(tmpParent.getRightBack());
380 hint[5] = auxOct.emplace(tmpParent.getRightFront());
381 hint[6] = auxOct.emplace(tmpParent.getBack());
382 hint[7] = auxOct.emplace(tmpParent.getFront());
383 hint[8] = auxOct.emplace(tmpParent.getBottom());
384 hint[9] = auxOct.emplace(tmpParent.getBottomLeft());
385 hint[10] = auxOct.emplace(tmpParent.getBottomLeftBack());
386 hint[11] = auxOct.emplace(tmpParent.getBottomLeftFront());
387 hint[12] = auxOct.emplace(tmpParent.getBottomRight());
388 hint[13] = auxOct.emplace(tmpParent.getBottomRightBack());
389 hint[14] = auxOct.emplace(tmpParent.getBottomRightFront());
390 hint[15] = auxOct.emplace(tmpParent.getBottomBack());
391 hint[16] = auxOct.emplace(tmpParent.getBottomFront());
392 hint[17] = auxOct.emplace(tmpParent.getTop());
393 hint[18] = auxOct.emplace(tmpParent.getTopLeft());
394 hint[19] = auxOct.emplace(tmpParent.getTopLeftBack());
395 hint[20] = auxOct.emplace(tmpParent.getTopLeftFront());
396 hint[21] = auxOct.emplace(tmpParent.getTopRight());
397 hint[22] = auxOct.emplace(tmpParent.getTopRightBack());
398 hint[23] = auxOct.emplace(tmpParent.getTopRightFront());
399 hint[24] = auxOct.emplace(tmpParent.getTopBack());
400 hint[25] = auxOct.emplace(tmpParent.getTopFront());
// Grow pNodes by the number of octants newly added to auxOct and copy the
// fresh ones (hint[kk].second==true) into the new tail slots.  NOTE(review):
// `curr` is declared/advanced in elided lines.
403 if(auxOct.size()>preSz)
406 pNodes.resize(pNodes.size()+auxOct.size()-preSz);
407 for(
unsigned int kk=0;kk<neighbourCount;kk++)
409 if(hint[kk].second) {
410 pNodes[preSz + curr] = *(hint[kk].first);
// Sequential SFC tree sort (MSD-radix style over the space-filling curve):
// one level of in-place bucketing per call, then recursion into each child
// bucket in Hilbert order (rotation taken from HILBERT_TABLE).  The TS_*
// option flags select sort / duplicate-removal / octree-construction /
// balancing outputs.  NOTE(review): many original lines are elided in this
// view (closing braces, some declarations such as `temp`, x/y/z).
429 void SFC_treeSort(T* pNodes , DendroIntL n ,std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced,
unsigned int pMaxDepthBit,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,
unsigned int k,
unsigned int options)
433 register unsigned int cnum;
434 register unsigned int cnum_prev=0;
436 unsigned int rotation=0;
// Histogram with two extra slots: bucket 0/1 hold the "ancestor" octants
// (level <= current level); slots 2.. hold the NUM_CHILDREN children.
437 DendroIntL count[(NUM_CHILDREN+2)]={};
438 unsigned int lev=pMaxDepth-pMaxDepthBit;
// Pass 1: histogram each node's child number at the current bit level
// (cnum 0 = ancestor, otherwise 1 + zyx-bit child index).
444 for (DendroIntL i=0; i< n; ++i) {
446 cnum = (lev < pNodes[i].getLevel())? 1 +( (((pNodes[i].getZ() >> pMaxDepthBit) & 1u) << 2u) | (((pNodes[i].getY() >> pMaxDepthBit) & 1u) << 1u) | ((pNodes[i].getX() >>pMaxDepthBit) & 1u)):0;
// In-place permutation bookkeeping: loc[] is the next write slot per bucket,
// unsorted[] holds displaced elements, `live` counts buckets still pending.
451 DendroIntL loc[NUM_CHILDREN+1];
452 T unsorted[NUM_CHILDREN+1];
453 unsigned int live = 0;
456 for (
unsigned int i=0; i<(NUM_CHILDREN+1); ++i) {
461 unsorted[live] = pNodes[loc[0]];
462 if (loc[0] < count[1]) {live++; }
// rotations[] maps Hilbert traversal position -> child number for this
// rotation id; subtracting '0' converts the table's char digits.
465 cnum=(rotations[ROTATION_OFFSET * rot_id+ i-1] -
'0');
466 (i>1) ? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id+i-2] -
'0')+2): cnum_prev=1;
467 loc[cnum+1]=count[cnum_prev];
468 count[cnum+2] += count[cnum_prev];
470 (loc[cnum+1]==n) ? unsorted[live] = pNodes[loc[cnum+1]-1]: unsorted[live] = pNodes[loc[cnum+1]];
471 if (loc[cnum+1] < count[cnum+2]) {live++; }
// Pass 2: scatter every element into its bucket in place (cycle-chasing).
482 for (DendroIntL i=0; i < n ; ++i) {
485 cnum = (lev < unsorted[live].getLevel()) ? ((((unsorted[live].getZ() >> pMaxDepthBit) & 1u) << 2u) | (((unsorted[live].getY() >> pMaxDepthBit) & 1u) << 1u) | ((unsorted[live].getX() >> pMaxDepthBit) & 1u))+ 1: 0 ;
487 pNodes[loc[(cnum )]++] = unsorted[live];
488 (loc[cnum]==n) ? unsorted[live] = pNodes[loc[cnum]-1] : unsorted[live] = pNodes[loc[cnum]];
489 if ((loc[cnum] == count[cnum + 1])) {
// Recursion, sort / remove-duplicates variants: recurse into each Hilbert-
// ordered child bucket holding more than k elements.
500 if((options==TS_SORT_ONLY) || (options==TS_REMOVE_DUPLICATES))
502 if (pMaxDepthBit > 0) {
504 DendroIntL numElements=0;
505 for (
unsigned int i=1; i<(NUM_CHILDREN+1); i++) {
506 cnum=(rotations[ROTATION_OFFSET*rot_id+i-1]-
'0');
507 (i>1)? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id+i-2] -
'0')+2) : cnum_prev=1;
508 numElements = count[cnum+2] - count[cnum_prev];
509 if (numElements > k) {
510 rotation=HILBERT_TABLE[NUM_CHILDREN * rot_id + cnum];
// NOTE(review): recursion passes (pMaxDepthBit) unchanged here; the
// decrement, if any, happens in elided code — TODO confirm.
511 if(numElements)SFC_treeSort(pNodes+count[cnum_prev],numElements,pOutSorted,pOutConstruct,pOutBalanced,(pMaxDepthBit),pMaxDepth,temp,rotation,k,options);
// Recursion, construct/balance variants: additionally materialize the child
// octant `temp` from the parent's anchor + cnum bits so cut-off buckets can
// be emitted as octree leaves.
519 if (pMaxDepthBit > MAXDEAPTH_LEVEL_DIFF) {
521 DendroIntL numElements=0;
522 for (
unsigned int i=1; i<(NUM_CHILDREN+1); i++) {
523 cnum=(rotations[ROTATION_OFFSET*rot_id+i-1]-
'0');
524 (i>1)? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id+i-2] -
'0')+2) : cnum_prev=1;
525 numElements = count[cnum+2] - count[cnum_prev];
526 if((options & TS_CONSTRUCT_OCTREE) | (options & TS_BALANCE_OCTREE))
528 x=parent.getX() +(((int)((
bool)(cnum & 1u)))<<(pMaxDepthBit));
529 y=parent.getY() +(((int)((
bool)(cnum & 2u)))<<(pMaxDepthBit));
530 z=parent.getZ() +(((int)((
bool)(cnum & 4u)))<<(pMaxDepthBit));
531 temp=T(x,y,z,(lev+1),parent.getDim(),pMaxDepth);
534 if (numElements > k) {
535 rotation=HILBERT_TABLE[NUM_CHILDREN * rot_id + cnum];
536 SFC_treeSort(pNodes+count[cnum_prev],numElements,pOutSorted,pOutConstruct,pOutBalanced,(pMaxDepthBit),pMaxDepth,temp,rotation,k,options);
537 }
else if((options & TS_CONSTRUCT_OCTREE) | (options & TS_BALANCE_OCTREE))
539 if(options & TS_CONSTRUCT_OCTREE) {
540 pOutConstruct.push_back(temp);
543 if (options & TS_BALANCE_OCTREE) {
545 pOutBalanced.push_back(temp);
// Duplicate-removal post-pass: drop exact duplicates, then drop ancestors so
// only the finest overlapping octants survive; result swapped into pOutSorted.
564 if((options & TS_REMOVE_DUPLICATES)) {
566 #ifdef PROFILE_TREE_SORT 567 t1=std::chrono::high_resolution_clock::now();
573 std::vector<T> tmp(n);
574 T *tmpPtr = (&(*(tmp.begin())));
576 tmpPtr[0] = pNodes[0];
578 unsigned int tmpSize = 1;
579 unsigned int vecTsz =
static_cast<unsigned int>(n);
581 for (
unsigned int i = 1; i < n; i++) {
582 if ( (tmpPtr[tmpSize - 1] != pNodes[i])) {
583 tmpPtr[tmpSize] = pNodes[i];
// Ancestor removal: an ancestor is overwritten by its descendant; otherwise
// the element is appended.  NOTE(review): tmpSize bookkeeping for this loop
// is partially elided.
593 std::vector<T> tmp_rmvAncestors(tmp.size());
594 tmpPtr = (&(*(tmp_rmvAncestors.begin())));
598 for(
unsigned int i=1;i<tmp.size();i++)
600 if(tmpPtr[tmpSize].isAncestor(tmp[i]))
601 tmpPtr[tmpSize]=tmp[i];
603 tmpPtr[tmpSize+1]=tmp[i];
608 tmp_rmvAncestors.resize(tmpSize+1);
609 std::swap(pOutSorted, tmp_rmvAncestors);
611 tmp_rmvAncestors.clear();
615 #ifdef PROFILE_TREE_SORT 616 remove_duplicates_seq=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t1).count();
// Balancing post-pass: generate auxiliary balancing octants, then re-run
// this function in construction mode (options==2 == TS_CONSTRUCT_OCTREE)
// over them and swap the constructed result back into pOutBalanced.
622 if(options & TS_BALANCE_OCTREE)
626 #ifdef PROFILE_TREE_SORT 627 t1=std::chrono::high_resolution_clock::now();
632 SFC::seqSort::SFC_bottomUpBalOctantCreation(pOutBalanced);
634 #ifdef PROFILE_TREE_SORT 635 auxBalOCt_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t1).count();
638 std::vector<T> tmpSorted;
639 std::vector<T> tmpConstruct;
640 std::vector<T> tmpBalanced;
641 T root =T(0,0,0,0,m_uiDim,pMaxDepth);
643 SFC::seqSort::SFC_treeSort(&(*(pOutBalanced.begin())),pOutBalanced.size(),tmpSorted,tmpConstruct,tmpBalanced,pMaxDepth,pMaxDepth,root,0,k,2);
644 std::swap(tmpConstruct,pOutBalanced);
645 tmpConstruct.clear();
// One level of SFC bucketing over pNodes[begin,end): histogram the child
// occupancy, write Hilbert-ordered bucket boundaries into splitters[], and
// permute the range in place so each bucket is contiguous.  NOTE(review):
// several original lines (closing braces, `nextIndex` declaration, loc[]
// seeding) are elided in this view.
655 inline void SFC_bucketing(T *pNodes,
int lev,
unsigned int maxDepth,
unsigned char rot_id,DendroIntL &begin, DendroIntL &end, DendroIntL *splitters)
// Base case: nothing to split — every splitter collapses onto [begin,end).
669 if ((lev >= maxDepth) || (begin == end)) {
672 for (
int ii = 0; ii < NUM_CHILDREN; ii++) {
673 int index = (rotations[2 * NUM_CHILDREN * rot_id + ii] -
'0');
675 if (ii == (NUM_CHILDREN-1))
678 nextIndex = (rotations[2 * NUM_CHILDREN * rot_id + ii + 1] -
'0');
681 splitters[index] = begin;
682 splitters[nextIndex] = end;
685 splitters[nextIndex] = splitters[index];
692 register unsigned int cnum;
693 register unsigned int cnum_prev=0;
694 DendroIntL num_elements=0;
695 unsigned int rotation=0;
696 DendroIntL count[(NUM_CHILDREN+2)]={};
// Bit position that separates the children of the current level.
699 unsigned int mid_bit = maxDepth - lev - 1;
// Pass 1: histogram (bucket 0 = octants at/above this level).
701 for (DendroIntL i=begin; i<end; ++i) {
708 cnum = (lev < pNodes[i].getLevel())? 1 +( (((pNodes[i].getZ() >> mid_bit) & 1u) << 2u) | (((pNodes[i].getY() >> mid_bit) & 1u) << 1u) | ((pNodes[i].getX() >>mid_bit) & 1u)):0;
714 DendroIntL loc[NUM_CHILDREN+1];
715 T unsorted[NUM_CHILDREN+1];
716 unsigned int live = 0;
// Derive splitters from the counts in Hilbert (rotation) order; the ancestor
// bucket (count[1]) is folded into the first child's range.
720 for (
unsigned int ii = 0; ii < NUM_CHILDREN; ii++) {
721 int index = (rotations[2 * NUM_CHILDREN * rot_id + ii] -
'0');
723 if (ii == (NUM_CHILDREN-1))
726 nextIndex = (rotations[2 * NUM_CHILDREN * rot_id + ii + 1] -
'0');
729 splitters[index] = begin;
730 splitters[nextIndex] = splitters[index]+count[1]+ count[(index+2)];
733 splitters[nextIndex] = splitters[index] + count[(index + 2)];
// Seed the in-place cycle permutation (same scheme as SFC_treeSort above).
740 for (
unsigned int i=0; i<(NUM_CHILDREN+1); ++i) {
745 unsorted[live] = pNodes[loc[0]];
746 if (loc[0] < count[1]) {live++; }
749 cnum = (rotations[ROTATION_OFFSET * rot_id + i-1] -
'0');
750 (i > 1) ? cnum_prev = ((rotations[ROTATION_OFFSET * rot_id + i - 2] -
'0') + 2) : cnum_prev = 1;
752 loc[cnum+1] = count[cnum_prev];
753 count[cnum + 2] += count[cnum_prev];
754 if (loc[cnum+1] < count[cnum + 2]) { unsorted[live] = pNodes[loc[(cnum+1)]]; live++; }
// Pass 2: scatter elements into their buckets in place.
760 for (DendroIntL i=begin; i<end; ++i) {
763 cnum=(lev < unsorted[live].getLevel()) ? ((((unsorted[live].getZ() >> mid_bit) & 1u) << 2u) | (((unsorted[live].getY() >> mid_bit) & 1u) << 1u) | ((unsorted[live].getX() >> mid_bit) & 1u))+ 1: 0 ;
764 if(loc[cnum]<count[cnum+1]) {
766 pNodes[loc[cnum]++] = unsorted[live];
767 if ((loc[cnum] == count[cnum + 1])) {
771 unsorted[live] = pNodes[loc[cnum]];
// Find the SFC-minimum (minimum==true) or SFC-maximum octant of pNodes[0..n)
// without fully sorting: bucket one level, then recurse only into the first
// (or last) Hilbert bucket that is non-empty.  NOTE(review): the guarding
// conditions around the two sentinel assignments below (original lines
// 788..795) are partially elided in this view.
787 void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n ,
unsigned int pMaxDepthBit,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,
bool minimum,T& optimal)
// Sentinel for the minimum of an empty range: the first possible octant.
792 optimal=T(0,0,0,0,m_uiDim,m_uiMaxDepth);
794 }
else if(n==0 && (!minimum))
// Sentinel for the maximum of an empty range: the last possible octant.
796 optimal=T(((1u<<m_uiMaxDepth)-1),((1u<<m_uiMaxDepth)-1),((1u<<m_uiMaxDepth)-1),m_uiMaxDepth,m_uiDim,m_uiMaxDepth);
800 unsigned int lev=pMaxDepth-pMaxDepthBit;
801 DendroIntL nNodeBegin=0;
802 DendroIntL nNodeEnd=n;
804 unsigned int hindex = 0;
805 unsigned int hindexN = 0;
806 unsigned int index = 0;
807 DendroIntL splitterNodes[(NUM_CHILDREN + 1)];
// Bucket 0 for the minimum search, the last bucket for the maximum search.
808 unsigned int bucketIndex=0;
809 if(!minimum) bucketIndex=(NUM_CHILDREN-1);
811 if(((nNodeEnd-nNodeBegin)==1))
813 optimal=pNodes[nNodeBegin];
817 if( (pMaxDepthBit) && ((nNodeEnd-nNodeBegin)>1)) {
819 SFC::seqSort::SFC_bucketing(pNodes,lev,pMaxDepth,rot_id,nNodeBegin,nNodeEnd,splitterNodes);
820 hindex = (rotations[2 * NUM_CHILDREN * rot_id + bucketIndex] -
'0');
821 index=HILBERT_TABLE[NUM_CHILDREN * rot_id + bucketIndex];
822 (bucketIndex == (NUM_CHILDREN - 1)) ? hindexN = bucketIndex + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + bucketIndex + 1] -
'0');
824 assert(splitterNodes[hindex] <= splitterNodes[hindexN]);
// Recurse into the chosen bucket only if it is non-empty.
825 if(splitterNodes[hindex]!=splitterNodes[hindexN]) SFC_treeSortLocalOptimal(pNodes+splitterNodes[hindex],(splitterNodes[hindexN]-splitterNodes[hindex]),(pMaxDepthBit-1),pMaxDepth,parent,index,minimum,optimal);
// Fallback at the depth limit: fully sort the remainder and take an end.
829 std::vector<T>tmpVec;
830 SFC::seqSort::SFC_treeSort(pNodes,n,tmpVec,tmpVec,tmpVec,pMaxDepth,pMaxDepth,parent,ROOT_ROTATION,1,TS_SORT_ONLY);
831 (minimum) ? optimal=pNodes[0]: optimal=pNodes[n-1];
// Combined min/max variant: bucket once, locate the first (min_b) and last
// (max_b) non-empty Hilbert buckets, then recurse into each with the
// single-optimal overload above.
844 void SFC_treeSortLocalOptimal(T* pNodes , DendroIntL n ,
unsigned int pMaxDepthBit,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,T&min,T&max)
846 unsigned int lev=pMaxDepth-pMaxDepthBit;
847 DendroIntL splitterNodes[(NUM_CHILDREN + 1)];
848 unsigned int hindex = 0;
849 unsigned int hindexN = 0;
850 unsigned int index = 0;
851 DendroIntL nNodeBegin=0;
852 DendroIntL nNodeEnd=n;
854 SFC::seqSort::SFC_bucketing(pNodes,lev,pMaxDepth,rot_id,nNodeBegin,nNodeEnd,splitterNodes);
// Forward scan for the first non-empty bucket.  NOTE(review): the increment
// of min_b inside the loop is in elided lines — TODO confirm.
856 unsigned int min_b=0,max_b=NUM_CHILDREN-1;
857 for(
unsigned int i=0;i<NUM_CHILDREN;i++)
859 hindex = (rotations[2 * NUM_CHILDREN * rot_id + min_b] -
'0');
860 (min_b == (NUM_CHILDREN - 1)) ? hindexN = min_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + min_b + 1] -
'0');
861 if(splitterNodes[hindex]!=splitterNodes[hindexN])
break;
// Backward scan for the last non-empty bucket (max_b decrement elided).
867 for(
unsigned int i=0;i<(NUM_CHILDREN);i++)
869 hindex = (rotations[2 * NUM_CHILDREN * rot_id + max_b] -
'0');
870 (max_b == (NUM_CHILDREN - 1)) ? hindexN = max_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + max_b + 1] -
'0');
871 if(splitterNodes[hindex]!=splitterNodes[hindexN])
break;
// Recurse for the minimum inside the first non-empty bucket...
876 hindex = (rotations[2 * NUM_CHILDREN * rot_id + min_b] -
'0');
877 (min_b == (NUM_CHILDREN - 1)) ? hindexN = min_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + min_b + 1] -
'0');
878 index=HILBERT_TABLE[NUM_CHILDREN * rot_id + min_b];
879 DendroIntL numElem=splitterNodes[hindexN]-splitterNodes[hindex];
880 SFC::seqSort::SFC_treeSortLocalOptimal(&pNodes[splitterNodes[hindex]],numElem,pMaxDepthBit-1,pMaxDepth,parent,index,
true,min);
// ...and for the maximum inside the last non-empty bucket.
883 hindex = (rotations[2 * NUM_CHILDREN * rot_id + max_b] -
'0');
884 (max_b == (NUM_CHILDREN - 1)) ? hindexN = max_b + 1: hindexN = (rotations[2 * NUM_CHILDREN * rot_id + max_b + 1] -
'0');
885 index=HILBERT_TABLE[NUM_CHILDREN * rot_id + max_b];
886 numElem=splitterNodes[hindexN]-splitterNodes[hindex];
887 SFC::seqSort::SFC_treeSortLocalOptimal(&pNodes[splitterNodes[hindex]],numElem,pMaxDepthBit-1,pMaxDepth,parent,index,
false,max);
// --- Distributed (par) interface declarations ------------------------------
// SFC_SplitterFix: staged splitter selection + data exchange that re-partitions
// pNodes with roughly balanced load (within loadFlexibility) and hands back a
// sub-communicator via newComm (definition below).
917 inline void SFC_SplitterFix(std::vector<T>& pNodes,
unsigned int pMaxDepth,
double loadFlexibility,
unsigned int sf_k,MPI_Comm comm,MPI_Comm * newComm);
// Distributed SFC_treeSort: splitter fix stage(s) followed by the sequential
// SFC_treeSort on each rank's local partition (definition below).
932 template <
typename T>
933 void SFC_treeSort(std::vector<T> &pNodes, std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced ,
double loadFlexibility,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,
unsigned int k,
unsigned int options,
unsigned int sf_k,MPI_Comm pcomm);
// Distributed splitter fix: only active when SPLITTER_SELECTION_FIX is
// defined and npes exceeds NUM_NPES_THRESHOLD.  Splits the npes ranks into
// a=sf_k blocks, computes a global splitters over the SFC ordering (refining
// buckets until each splitter is within loadFlexibility of the ideal load),
// exchanges the data with a sparse all-to-all, and returns a per-block
// sub-communicator via MPI_Comm_split.  NOTE(review): many original lines are
// elided in this view (declarations of rank/npes/root/tmp/curr, closing
// braces, some Allreduce calls); code kept byte-identical.
947 inline void SFC_SplitterFix(std::vector<T>& pNodes,
unsigned int pMaxDepth,
double loadFlexibility,
unsigned int sf_k,MPI_Comm comm,MPI_Comm * newComm) {
949 #ifdef SPLITTER_SELECTION_FIX 952 MPI_Comm_rank(comm, &rank);
953 MPI_Comm_size(comm, &npes);
959 if (npes > NUM_NPES_THRESHOLD) {
// a = number of blocks, b = ranks per block, p_mod_a = leftover ranks that
// make the first p_mod_a blocks one rank larger.
963 const unsigned int a=sf_k;
964 const unsigned int b=npes/a;
965 const unsigned int p_mod_a=npes%a;
968 unsigned int dim = 3;
// Initial bucket refinement depth: enough SFC levels to produce >= a buckets.
975 unsigned int firstSplitLevel = std::ceil(
binOp::fastLog2(a) / (
double) dim);
976 unsigned int totalNumBuckets = 1u << (dim * firstSplitLevel);
979 DendroIntL localSz = pNodes.size();
980 DendroIntL globalSz = 0;
981 MPI_Allreduce(&localSz, &globalSz, 1, MPI_LONG_LONG, MPI_SUM, comm);
// BFS over the SFC tree (nodeStack) down to firstSplitLevel, recording the
// leaf buckets' counts, start indices and BucketInfo.
986 std::vector<DendroIntL> bucketCounts;
987 std::vector<BucketInfo<T>> bucketInfo;
988 std::vector<DendroIntL> bucketSplitter;
991 std::vector<BucketInfo<T>> nodeStack;
993 nodeStack.push_back(root);
995 unsigned int levSplitCount = 0;
998 unsigned int hindex = 0;
999 unsigned int hindexN = 0;
1001 unsigned int index = 0;
1003 unsigned int numLeafBuckets = 0;
1005 unsigned int begin_loc = 0;
1008 DendroIntL spliterstemp[(NUM_CHILDREN + 1)];
1009 while (numLeafBuckets < totalNumBuckets) {
1012 nodeStack.erase(nodeStack.begin());
1014 SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())), tmp.lev, pMaxDepth, tmp.rot_id, tmp.begin,
1015 tmp.end, spliterstemp);
1017 for (
int i = 0; i < NUM_CHILDREN; i++) {
1018 hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] -
'0');
1019 if (i == (NUM_CHILDREN - 1))
1022 hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] -
'0');
1023 assert(spliterstemp[hindex] <= spliterstemp[hindexN]);
1024 index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1026 BucketInfo<T> child(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1027 nodeStack.push_back(child);
1029 if (tmp.lev == (firstSplitLevel - 1)) {
1030 BucketInfo<T> bucket(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1031 bucketCounts.push_back((spliterstemp[hindexN] - spliterstemp[hindex]));
1032 bucketSplitter.push_back(spliterstemp[hindex]);
1033 bucketInfo.push_back(bucket);
// Global bucket counts (sum over ranks) and their inclusive prefix scan.
1046 std::vector<DendroIntL> bucketCounts_g(bucketCounts.size());
1047 std::vector<DendroIntL> bucketCounts_gScan(bucketCounts.size());
1049 par::Mpi_Allreduce<DendroIntL>(&(*(bucketCounts.begin())), &(*(bucketCounts_g.begin())),
1050 bucketCounts.size(), MPI_SUM, comm);
1052 #ifdef DEBUG_TREE_SORT 1053 assert(totalNumBuckets);
1055 bucketCounts_gScan[0] = bucketCounts_g[0];
1056 for (
int k = 1; k < bucketCounts_g.size(); k++) {
1057 bucketCounts_gScan[k] = bucketCounts_gScan[k - 1] + bucketCounts_g[k];
1061 #ifdef DEBUG_TREE_SORT 1063 for (
int i = 0; i < totalNumBuckets; i++) {
1065 std::cout <<
"Bucket initial count scan G : " << i <<
" : " << bucketCounts_gScan[i] << std::endl;
// First splitter pass: pick, per block i, the bucket whose scanned count is
// closest to the ideal load; buckets outside tolerance go to splitBucketIndex
// for refinement.
1069 DendroIntL *localSplitterTmp =
new DendroIntL[a];
1070 DendroIntL idealLoadBalance = 0;
1073 std::vector<unsigned int> splitBucketIndex;
1075 for (
int i = 0; i < a - 1; i++) {
1076 idealLoadBalance += ((i + 1) * globalSz / a - i * globalSz / a);
1077 DendroIntL toleranceLoadBalance = ((i + 1) * globalSz / a - i * globalSz / a) * loadFlexibility;
1079 unsigned int loc = (
1080 std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) -
1081 bucketCounts_gScan.begin());
1084 if (abs(bucketCounts_gScan[loc] - idealLoadBalance) > toleranceLoadBalance) {
1086 if (splitBucketIndex.empty() || splitBucketIndex.back() != loc)
1087 splitBucketIndex.push_back(loc);
1091 if ((loc + 1) < bucketSplitter.size())
1092 localSplitterTmp[i] = bucketSplitter[loc + 1];
1094 localSplitterTmp[i] = bucketSplitter[loc];
1103 localSplitterTmp[a - 1] = pNodes.size();
1106 #ifdef DEBUG_TREE_SORT 1107 for(
int i=0;i<splitBucketIndex.size()-1;i++)
1109 assert(pNodes[bucketSplitter[splitBucketIndex[i]]]<pNodes[bucketSplitter[splitBucketIndex[i+1]]]);
// Iterative refinement: split each out-of-tolerance bucket one more SFC
// level, merge the refined buckets back into the global bucket lists, rescan,
// and repeat until all splitters fall within tolerance (or pMaxDepth).
1114 std::vector<DendroIntL> newBucketCounts;
1115 std::vector<DendroIntL> newBucketCounts_g;
1116 std::vector<BucketInfo<T>> newBucketInfo;
1117 std::vector<DendroIntL> newBucketSplitters;
1120 std::vector<DendroIntL> bucketCount_gMerge;
1121 std::vector<DendroIntL> bucketSplitterMerge;
1122 std::vector<BucketInfo<T>> bucketInfoMerge;
1124 DendroIntL splitterTemp[(NUM_CHILDREN + 1)];
1125 while (!splitBucketIndex.empty()) {
1128 newBucketCounts.clear();
1129 newBucketCounts_g.clear();
1130 newBucketInfo.clear();
1131 newBucketSplitters.clear();
1133 bucketCount_gMerge.clear();
1134 bucketSplitterMerge.clear();
1135 bucketInfoMerge.clear();
1137 if (bucketInfo[splitBucketIndex[0]].lev < pMaxDepth) {
1142 #ifdef DEBUG_TREE_SORT 1144 for (
int i = 0; i < splitBucketIndex.size(); i++)
1145 std::cout <<
"Splitter Bucket Index: " << i <<
" : " << splitBucketIndex[i] << std::endl;
1151 #ifdef DEBUG_TREE_SORT 1153 for (
int i = 0; i <bucketSplitter.size(); i++) {
1154 std::cout<<
" Bucket Splitter : "<<i<<
" : "<<bucketSplitter[i]<<std::endl;
// Re-bucket every flagged bucket one level deeper.
1158 for (
int k = 0; k < splitBucketIndex.size(); k++) {
1159 #ifdef DEBUG_TREE_SORT 1161 std::cout<<
"Splitting Bucket index "<<splitBucketIndex[k]<<std::endl;
1164 tmp = bucketInfo[splitBucketIndex[k]];
1165 SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())), tmp.lev, pMaxDepth, tmp.rot_id, tmp.begin,
1166 tmp.end, splitterTemp);
1169 for (
int i = 0; i < NUM_CHILDREN; i++) {
1170 hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] -
'0');
1171 if (i == (NUM_CHILDREN - 1))
1174 hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] -
'0');
1177 newBucketCounts.push_back((splitterTemp[hindexN] - splitterTemp[hindex]));
1179 index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1180 BucketInfo<T> bucket(index, (tmp.lev + 1), splitterTemp[hindex], splitterTemp[hindexN]);
1183 newBucketInfo.push_back(bucket);
1184 newBucketSplitters.push_back(splitterTemp[hindex]);
1185 #ifdef DEBUG_TREE_SORT 1186 assert(pNodes[splitterTemp[hindex]]<=pNodes[splitterTemp[hindexN]]);
// Global counts for the refined buckets (the Allreduce call itself is
// partially elided — only its tail arguments are visible here).
1192 newBucketCounts_g.resize(newBucketCounts.size());
1194 newBucketCounts.size(), MPI_SUM, comm);
1198 #ifdef DEBUG_TREE_SORT 1199 for(
int i=0;i<newBucketSplitters.size()-1;i++)
1201 assert(pNodes[newBucketSplitters[i]]<=pNodes[newBucketSplitters[i+1]]);
1206 #ifdef DEBUG_TREE_SORT 1207 for (
int k = 0; k < splitBucketIndex.size(); k++) {
1208 unsigned int sum = 0;
1209 for (
int i = 0; i < NUM_CHILDREN; i++)
1210 sum += newBucketCounts_g[NUM_CHILDREN * k + i];
1211 if (!rank)
if (bucketCounts_g[splitBucketIndex[k]] != sum) {
// Merge: keep unrefined buckets, splice the NUM_CHILDREN refined children in
// place of each split bucket, preserving global order.
1219 for (
int k = 0; k < splitBucketIndex.size(); k++) {
1221 unsigned int bucketBegin = 0;
1223 (k == 0) ? bucketBegin = 0 : bucketBegin = splitBucketIndex[k - 1] + 1;
1226 for (
int i = bucketBegin; i < splitBucketIndex[k]; i++) {
1228 bucketCount_gMerge.push_back(bucketCounts_g[i]);
1229 bucketInfoMerge.push_back(bucketInfo[i]);
1230 bucketSplitterMerge.push_back(bucketSplitter[i]);
1234 tmp = bucketInfo[splitBucketIndex[k]];
1235 for (
int i = 0; i < NUM_CHILDREN; i++) {
1236 bucketCount_gMerge.push_back(newBucketCounts_g[NUM_CHILDREN * k + i]);
1237 bucketInfoMerge.push_back(newBucketInfo[NUM_CHILDREN * k + i]);
1238 bucketSplitterMerge.push_back(newBucketSplitters[NUM_CHILDREN * k + i]);
1243 if (k == (splitBucketIndex.size() - 1)) {
1244 for (
int i = splitBucketIndex[k] + 1; i < bucketCounts_g.size(); i++) {
1245 bucketCount_gMerge.push_back(bucketCounts_g[i]);
1246 bucketInfoMerge.push_back(bucketInfo[i]);
1247 bucketSplitterMerge.push_back(bucketSplitter[i]);
1256 std::swap(bucketCounts_g, bucketCount_gMerge);
1257 std::swap(bucketInfo, bucketInfoMerge);
1258 std::swap(bucketSplitter, bucketSplitterMerge);
// Rebuild the prefix scan over the merged buckets.
1261 bucketCounts_gScan.resize(bucketCounts_g.size());
1263 bucketCounts_gScan[0] = bucketCounts_g[0];
1264 for (
int k = 1; k < bucketCounts_g.size(); k++) {
1265 bucketCounts_gScan[k] = bucketCounts_gScan[k - 1] + bucketCounts_g[k];
1267 #ifdef DEBUG_TREE_SORT 1268 assert(bucketCounts_gScan.back()==globalSz);
1270 splitBucketIndex.clear();
1271 #ifdef DEBUG_TREE_SORT 1272 for(
int i=0;i<bucketSplitter.size()-2;i++)
1274 std::cout<<
"Bucket Splitter : "<<bucketSplitter[i]<<std::endl;
1275 assert( bucketSplitter[i+1]!=(pNodes.size()) && pNodes[bucketSplitter[i]]<=pNodes[bucketSplitter[i+1]]);
// Re-derive splitters against the refined buckets (same scheme as above).
1279 idealLoadBalance = 0;
1281 for (
unsigned int i = 0; i < a - 1; i++) {
1282 idealLoadBalance += ((i + 1) * globalSz / a - i * globalSz / a);
1283 DendroIntL toleranceLoadBalance = (((i + 1) * globalSz / a - i * globalSz / a) *
1285 unsigned int loc = (std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(),
1287 bucketCounts_gScan.begin());
1289 if (abs(bucketCounts_gScan[loc] - idealLoadBalance) > toleranceLoadBalance) {
1290 if (splitBucketIndex.empty() || splitBucketIndex.back() != loc)
1291 splitBucketIndex.push_back(loc);
1295 if ((loc + 1) < bucketSplitter.size())
1296 localSplitterTmp[i] = bucketSplitter[loc + 1];
1298 localSplitterTmp[i] = bucketSplitter[loc];
1308 localSplitterTmp[a - 1] = pNodes.size();
// Max-depth fallback: accept nearest bucket boundaries without refinement.
1312 idealLoadBalance = 0;
1313 for (
unsigned int i = 0; i < a - 1; i++) {
1315 idealLoadBalance += ((i + 1) * globalSz / a - i * globalSz / a);
1317 unsigned int loc = (
1318 std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(),
1320 bucketCounts_gScan.begin());
1323 if ((loc + 1) < bucketSplitter.size())
1324 localSplitterTmp[i] = bucketSplitter[loc + 1];
1326 localSplitterTmp[i] = bucketSplitter[loc];
1334 localSplitterTmp[a - 1] = pNodes.size();
// Release all working storage before the exchange phase.
1342 bucketCount_gMerge.clear();
1343 bucketCounts.clear();
1344 bucketCounts_gScan.clear();
1345 bucketInfoMerge.clear();
1347 bucketCounts_g.clear();
1348 bucketSplitter.clear();
1349 bucketSplitterMerge.clear();
1350 newBucketCounts.clear();
1351 newBucketCounts_g.clear();
1352 newBucketInfo.clear();
1353 newBucketSplitters.clear();
1356 #ifdef DEBUG_TREE_SORT 1358 for (
int i = 0; i <a; i++) {
1360 std::cout <<
"Local Splitter Tmp : " << i <<
" : " << localSplitterTmp[i] << std::endl;
// Block layout: blockCounts[k] ranks in block k (first p_mod_a blocks get
// b+1), blockOffset = exclusive scan of blockCounts.
1366 std::vector<unsigned int> blockCounts;
1367 blockCounts.resize(a,b);
1369 for(
unsigned int k=0;k<p_mod_a;k++)
1370 blockCounts[k]=(b+1);
1375 std::vector<unsigned int > blockOffset;
1376 blockOffset.resize(a,0);
1379 omp_par::scan(&(*(blockCounts.begin())),&(*(blockOffset.begin())),a);
1384 unsigned int blk_id;
1385 for(
unsigned int k=0;k<a;k++)
1387 if( (rank>=blockOffset[k]) && rank<(blockOffset[k]+blockCounts[k]))
1389 (blockOffset[k]!=0) ? blk_id=rank%blockOffset[k] : blk_id=rank;
// Per-block send counts derived from consecutive splitter differences.
1401 unsigned int *sendCnt =
new unsigned int[a];
1402 unsigned int owner_blk=0;
1403 (rank<(b+1)*p_mod_a) ? owner_blk=rank/(b+1) : owner_blk =(((rank-(b+1)*p_mod_a)/b) + p_mod_a);
1407 sendCnt[0] = localSplitterTmp[0];
1408 for (
unsigned int i = 1; i < a; i++)
1410 assert(localSplitterTmp[i]>=localSplitterTmp[i-1]);
1411 sendCnt[i]= (localSplitterTmp[i] - localSplitterTmp[i - 1]);
1417 std::vector<unsigned int> send_count;
1418 std::vector<unsigned int> send_offset;
1419 std::vector<unsigned int> recv_count;
1420 std::vector<unsigned int> recv_offset;
1422 send_count.resize(npes,0);
1423 send_offset.resize(npes,0);
1424 recv_count.resize(npes,0);
1425 recv_offset.resize(npes,0);
// Map each non-owned block's data to one destination rank inside that block
// (the rank with matching blk_id); the owned block is kept locally below.
1431 for(
unsigned int k=0;k<a;k++)
1433 if(owner_blk==k)
continue;
1434 if( (rank<(b+1)*p_mod_a) && blockCounts[k]==(b+1))
1436 assert((blockOffset[k]+blk_id)<npes);
1437 if(k>0)send_offset[(blockOffset[k]+blk_id)]=localSplitterTmp[k-1];
1438 send_count[(blockOffset[k]+blk_id)]=sendCnt[k];
1441 assert((blockOffset[k]+(blk_id%b))<npes);
1442 if(k>0)send_offset[(blockOffset[k]+(blk_id%b))]=localSplitterTmp[k-1];
1443 send_count[(blockOffset[k]+(blk_id%b))]=sendCnt[k];
// Exchange: scan recv counts into offsets, then sparse all-to-all; finally
// append the locally-kept (owner block) slice and swap into pNodes.
1452 omp_par::scan(&(*(recv_count.begin())),&(*(recv_offset.begin())),npes);
1454 std::vector<T> recvBuf;
1455 recvBuf.resize(recv_offset[npes-1]+recv_count[npes-1]);
1458 par::Mpi_Alltoallv_sparse(&(*(pNodes.begin())),(
int *) &(*(send_count.begin())),(
int *) &(*(send_offset.begin())),&(*(recvBuf.begin())),(
int *) &(*(recv_count.begin())),(
int *) &(*(recv_offset.begin())),comm);
1459 (owner_blk>0) ? recvBuf.insert(recvBuf.end(),pNodes.begin()+localSplitterTmp[owner_blk-1],pNodes.begin()+localSplitterTmp[owner_blk-1]+sendCnt[owner_blk]) : recvBuf.insert(recvBuf.end(),pNodes.begin(),pNodes.begin()+sendCnt[owner_blk]);
1464 std::swap(pNodes,recvBuf);
1469 #ifdef DEBUG_TREE_SORT 1471 for (
int i = 0; i <a; i++) {
1473 std::cout <<
"Send Cnt : " << i <<
" : " << sendCnt[i] << std::endl;
1480 #ifdef PROFILE_TREE_SORT 1481 t1=std::chrono::high_resolution_clock::now();
// Hand back the per-block sub-communicator (color = owner block).
1486 unsigned int col=owner_blk;
1487 MPI_Comm_split(comm,col,rank,newComm);
// Distributed SFC tree sort: runs the staged SFC_SplitterFix passes (when
// SPLITTER_SELECTION_FIX is enabled), then buckets and exchanges data so the
// local partitions are globally SFC-ordered, and finally calls the sequential
// SFC_treeSort on the local data.  NOTE(review): this definition is truncated
// at the end of the visible chunk and many interior lines (declarations of
// rank/npes/comm/dim/root/tmp, SF_Stages computation, closing braces) are
// elided; code kept byte-identical.
1504 template <
typename T>
1505 void SFC_treeSort(std::vector<T> &pNodes, std::vector<T>& pOutSorted,std::vector<T>& pOutConstruct,std::vector<T>& pOutBalanced ,
double loadFlexibility,
unsigned int pMaxDepth, T& parent,
unsigned int rot_id,
unsigned int k,
unsigned int options,
unsigned int sf_k,MPI_Comm pcomm)
1509 MPI_Comm_rank(pcomm, &rank);
1510 MPI_Comm_size(pcomm, &npes);
// Work on a duplicate of the caller's communicator.
1513 MPI_Comm_dup(pcomm,&comm);
1515 #ifdef PROFILE_TREE_SORT 1523 remove_duplicates_seq=0;
1524 remove_duplicates_par=0;
1526 construction_time=0;
1532 t4=std::chrono::high_resolution_clock::now();
1533 t2=std::chrono::high_resolution_clock::now();
1541 MPI_Comm SF_comm=pcomm;
1542 unsigned int SF_Stages=0;
1544 #ifdef PROFILE_TREE_SORT 1546 double * sf_all2all;
1547 double * sf_splitters;
// Single-rank shortcut (presumably npes==1 in elided guard): free the dup
// and fall back to the purely sequential sort.
1553 MPI_Comm_free(&comm);
1555 SFC::seqSort::SFC_treeSort(&(*(pNodes.begin())),pNodes.size(),pOutSorted,pOutConstruct,pOutBalanced,pMaxDepth,pMaxDepth,parent,rot_id,k,options);
// Staged splitter fixing: each stage shrinks the communicator; per-stage
// timings recorded under PROFILE_TREE_SORT.
1562 #ifdef SPLITTER_SELECTION_FIX 1569 #ifdef PROFILE_TREE_SORT 1570 sf_full=
new double[SF_Stages];
1571 sf_all2all=
new double[SF_Stages];
1572 sf_splitters=
new double[SF_Stages];
1575 MPI_Comm * sf_comms=
new MPI_Comm[SF_Stages];
1577 for(
int i=0;i<SF_Stages;i++)
1579 #ifdef PROFILE_TREE_SORT 1580 t5_sf_staged=std::chrono::high_resolution_clock::now();
1583 SFC_SplitterFix(pNodes,pMaxDepth,loadFlexibility,sf_k,SF_comm,&sf_comms[i]);
1584 #ifdef PROFILE_TREE_SORT 1585 sf_full[i]=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t5_sf_staged).count();
1586 sf_all2all[i]=all2all1_time;
1587 sf_splitters[i]=sf_full[i]-sf_all2all[i];
1590 SF_comm=sf_comms[i];
// Adopt the final stage's communicator and release the intermediates.
1593 MPI_Comm_free(&comm);
1594 MPI_Comm_dup(sf_comms[SF_Stages-1],&comm);
1595 for(
int i=0;i<SF_Stages;i++)
1597 MPI_Comm_free(&sf_comms[i]);
1606 #ifdef PROFILE_TREE_SORT 1607 splitter_fix_all=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
// Refresh rank/size on the (possibly shrunk) working communicator.
1611 MPI_Comm_rank(comm, &rank);
1612 MPI_Comm_size(comm, &npes);
1621 #ifdef PROFILE_TREE_SORT 1623 t2=std::chrono::high_resolution_clock::now();
// Same bucket construction as SFC_SplitterFix, but sized for npes ranks.
1625 unsigned int firstSplitLevel = std::ceil(
binOp::fastLog2(npes)/(
double)(dim));
1626 unsigned int totalNumBuckets =1u << (dim * firstSplitLevel);
1627 DendroIntL localSz=pNodes.size();
1628 DendroIntL globalSz=0;
1629 MPI_Allreduce(&localSz,&globalSz,1,MPI_LONG_LONG,MPI_SUM,comm);
1635 std::vector<DendroIntL> bucketCounts;
1636 std::vector<BucketInfo<T>> bucketInfo;
1637 std::vector<DendroIntL > bucketSplitter;
1639 std::vector<BucketInfo<T>> nodeStack;
1641 nodeStack.push_back(root);
1643 unsigned int levSplitCount = 0;
1646 unsigned int hindex = 0;
1647 unsigned int hindexN = 0;
1649 unsigned int index = 0;
1651 unsigned int numLeafBuckets =0;
1653 unsigned int begin_loc=0;
1660 DendroIntL spliterstemp[(NUM_CHILDREN+1)];
// BFS refinement of the SFC buckets down to firstSplitLevel (view ends
// inside this loop).
1661 while(numLeafBuckets<totalNumBuckets) {
1664 nodeStack.erase(nodeStack.begin());
1667 SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())),tmp.lev,pMaxDepth,tmp.rot_id,tmp.begin,tmp.end,spliterstemp);
1670 for (
int i = 0; i < NUM_CHILDREN; i++) {
1671 hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] -
'0');
1672 if (i == (NUM_CHILDREN-1))
1675 hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] -
'0');
1676 assert(spliterstemp[hindex] <= spliterstemp[hindexN]);
1677 index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1679 BucketInfo<T> child(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1680 nodeStack.push_back(child);
1683 if(tmp.lev==(firstSplitLevel-1))
1685 BucketInfo<T> bucket(index, (tmp.lev + 1), spliterstemp[hindex], spliterstemp[hindexN]);
1686 bucketCounts.push_back((spliterstemp[hindexN] - spliterstemp[hindex]));
1687 bucketSplitter.push_back(spliterstemp[hindex]);
1688 bucketInfo.push_back(bucket);
1700 #ifdef DEBUG_TREE_SORT 1701 for(
int i=0;i<bucketSplitter.size()-2;i++)
1703 std::cout<<
"Bucket Splitter : "<<bucketSplitter[i]<<std::endl;
1717 std::vector<DendroIntL >bucketCounts_g(bucketCounts.size());
1718 std::vector<DendroIntL >bucketCounts_gScan(bucketCounts.size());
1721 par::Mpi_Allreduce<DendroIntL>(&(*(bucketCounts.begin())),&(*(bucketCounts_g.begin())),bucketCounts.size(),MPI_SUM,comm);
1729 #ifdef DEBUG_TREE_SORT 1730 assert(totalNumBuckets);
1732 bucketCounts_gScan[0]=bucketCounts_g[0];
1733 for(
int k=1;k<bucketCounts_g.size();k++){
1734 bucketCounts_gScan[k]=bucketCounts_gScan[k-1]+bucketCounts_g[k];
1737 #ifdef DEBUG_TREE_SORT 1739 for (
int i = 0; i < totalNumBuckets; i++) {
1741 std::cout <<
"Bucket initial count scan G : " << i <<
" : " << bucketCounts_gScan[i] << std::endl;
1746 #ifdef DEBUG_TREE_SORT 1747 assert(bucketCounts_gScan.back()==globalSz);
1749 DendroIntL* localSplitter=
new DendroIntL[npes];
1750 std::vector<unsigned int> splitBucketIndex;
1751 DendroIntL idealLoadBalance=0;
1753 for(
int i=0;i<npes-1;i++) {
1754 idealLoadBalance+=((i+1)*globalSz/npes -i*globalSz/npes);
1755 DendroIntL toleranceLoadBalance = ((i+1)*globalSz/npes -i*globalSz/npes) * loadFlexibility;
1757 unsigned int loc=(std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) - bucketCounts_gScan.begin());
1760 if(abs(bucketCounts_gScan[loc]-idealLoadBalance) > toleranceLoadBalance)
1763 if(splitBucketIndex.empty() || splitBucketIndex.back()!=loc)
1764 splitBucketIndex.push_back(loc);
1769 if ((loc + 1) < bucketSplitter.size())
1770 localSplitter[i] = bucketSplitter[loc + 1];
1772 localSplitter[i] = bucketSplitter[loc];
1782 localSplitter[npes-1]=pNodes.size();
1785 #ifdef DEBUG_TREE_SORT 1786 for(
int i=0;i<splitBucketIndex.size()-1;i++)
1788 assert(pNodes[bucketSplitter[splitBucketIndex[i]]]<pNodes[bucketSplitter[splitBucketIndex[i+1]]]);
1791 std::vector<DendroIntL> newBucketCounts;
1792 std::vector<DendroIntL> newBucketCounts_g;
1793 std::vector<BucketInfo<T>> newBucketInfo;
1794 std::vector<DendroIntL> newBucketSplitters;
1797 std::vector<DendroIntL> bucketCount_gMerge;
1798 std::vector<DendroIntL> bucketSplitterMerge;
1799 std::vector<BucketInfo<T>> bucketInfoMerge;
1801 DendroIntL splitterTemp[(NUM_CHILDREN+1)];
1802 while(!splitBucketIndex.empty()) {
1805 newBucketCounts.clear();
1806 newBucketCounts_g.clear();
1807 newBucketInfo.clear();
1808 newBucketSplitters.clear();
1810 bucketCount_gMerge.clear();
1811 bucketSplitterMerge.clear();
1812 bucketInfoMerge.clear();
1814 if (bucketInfo[splitBucketIndex[0]].lev < pMaxDepth) {
1819 #ifdef DEBUG_TREE_SORT 1821 for (
int i = 0; i < splitBucketIndex.size(); i++)
1822 std::cout <<
"Splitter Bucket Index: " << i <<
" : " << splitBucketIndex[i] << std::endl;
1827 #ifdef DEBUG_TREE_SORT 1829 for (
int i = 0; i <bucketSplitter.size(); i++) {
1830 std::cout<<
" Bucket Splitter : "<<i<<
" : "<<bucketSplitter[i]<<std::endl;
1834 for (
int k = 0; k < splitBucketIndex.size(); k++) {
1836 tmp = bucketInfo[splitBucketIndex[k]];
1837 #ifdef DEBUG_TREE_SORT 1839 std::cout<<
"Splitting Bucket index "<<splitBucketIndex[k]<<
"begin: "<<tmp.begin <<
" end: "<<tmp.end<<
" rot_id: "<< (int)tmp.rot_id<<std::endl;
1842 SFC::seqSort::SFC_bucketing(&(*(pNodes.begin())),tmp.lev,pMaxDepth,tmp.rot_id,tmp.begin,tmp.end,splitterTemp);
1846 for (
int i = 0; i < NUM_CHILDREN; i++) {
1847 hindex = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i] -
'0');
1848 if (i == (NUM_CHILDREN-1))
1851 hindexN = (rotations[2 * NUM_CHILDREN * tmp.rot_id + i + 1] -
'0');
1854 newBucketCounts.push_back((splitterTemp[hindexN] - splitterTemp[hindex]));
1856 index = HILBERT_TABLE[NUM_CHILDREN * tmp.rot_id + hindex];
1857 BucketInfo<T> bucket(index, (tmp.lev + 1), splitterTemp[hindex], splitterTemp[hindexN]);
1860 newBucketInfo.push_back(bucket);
1861 newBucketSplitters.push_back(splitterTemp[hindex]);
1862 #ifdef DEBUG_TREE_SORT 1863 assert(pNodes[splitterTemp[hindex]]<=pNodes[splitterTemp[hindexN]]);
1869 newBucketCounts_g.resize(newBucketCounts.size());
1870 par::Mpi_Allreduce(&(*(newBucketCounts.begin())),&(*(newBucketCounts_g.begin())),newBucketCounts.size(),MPI_SUM,comm);
1874 #ifdef DEBUG_TREE_SORT 1875 for(
int i=0;i<newBucketSplitters.size()-1;i++)
1877 assert(pNodes[newBucketSplitters[i]]<=pNodes[newBucketSplitters[i+1]]);
1883 #ifdef DEBUG_TREE_SORT 1885 for (
int k = 0; k < splitBucketIndex.size(); k++) {
1886 unsigned int sum = 0;
1887 for (
int i = 0; i < NUM_CHILDREN; i++)
1888 sum += newBucketCounts_g[NUM_CHILDREN * k + i];
1889 if (!rank)
if (bucketCounts_g[splitBucketIndex[k]] != sum) {
1897 for (
int k = 0; k < splitBucketIndex.size(); k++) {
1899 unsigned int bucketBegin = 0;
1901 (k == 0) ? bucketBegin = 0 : bucketBegin = splitBucketIndex[k - 1] + 1;
1904 for (
int i = bucketBegin; i < splitBucketIndex[k]; i++) {
1906 bucketCount_gMerge.push_back(bucketCounts_g[i]);
1907 bucketInfoMerge.push_back(bucketInfo[i]);
1908 bucketSplitterMerge.push_back(bucketSplitter[i]);
1912 tmp = bucketInfo[splitBucketIndex[k]];
1913 for (
int i = 0; i < NUM_CHILDREN; i++) {
1914 bucketCount_gMerge.push_back(newBucketCounts_g[NUM_CHILDREN * k + i]);
1915 bucketInfoMerge.push_back(newBucketInfo[NUM_CHILDREN * k + i]);
1916 bucketSplitterMerge.push_back(newBucketSplitters[NUM_CHILDREN * k + i]);
1921 if (k == (splitBucketIndex.size() - 1)) {
1922 for (
int i = splitBucketIndex[k] + 1; i < bucketCounts_g.size(); i++) {
1923 bucketCount_gMerge.push_back(bucketCounts_g[i]);
1924 bucketInfoMerge.push_back(bucketInfo[i]);
1925 bucketSplitterMerge.push_back(bucketSplitter[i]);
1934 std::swap(bucketCounts_g,bucketCount_gMerge);
1935 std::swap(bucketInfo,bucketInfoMerge);
1936 std::swap(bucketSplitter,bucketSplitterMerge);
1939 bucketCounts_gScan.resize(bucketCounts_g.size());
1941 bucketCounts_gScan[0] = bucketCounts_g[0];
1942 for (
int k = 1; k < bucketCounts_g.size(); k++) {
1943 bucketCounts_gScan[k] = bucketCounts_gScan[k - 1] + bucketCounts_g[k];
1945 #ifdef DEBUG_TREE_SORT 1946 assert(bucketCounts_gScan.back()==globalSz);
1948 splitBucketIndex.clear();
1949 #ifdef DEBUG_TREE_SORT 1950 for(
int i=0;i<bucketSplitter.size()-2;i++)
1952 std::cout<<
"Bucket Splitter : "<<bucketSplitter[i]<<std::endl;
1953 assert( bucketSplitter[i+1]!=(pNodes.size()) && pNodes[bucketSplitter[i]]<=pNodes[bucketSplitter[i+1]]);
1957 idealLoadBalance = 0;
1959 for (
unsigned int i = 0; i < npes-1; i++) {
1960 idealLoadBalance += ((i + 1) * globalSz / npes - i * globalSz / npes);
1961 DendroIntL toleranceLoadBalance = (((i + 1) * globalSz / npes - i * globalSz / npes) * loadFlexibility);
1962 unsigned int loc = (std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) -
1963 bucketCounts_gScan.begin());
1965 if (abs(bucketCounts_gScan[loc] - idealLoadBalance) > toleranceLoadBalance) {
1966 if (splitBucketIndex.empty() || splitBucketIndex.back() != loc)
1967 splitBucketIndex.push_back(loc);
1971 if ((loc + 1) < bucketSplitter.size())
1972 localSplitter[i] = bucketSplitter[loc + 1];
1974 localSplitter[i] = bucketSplitter[loc];
1984 localSplitter[npes-1]=pNodes.size();
1988 idealLoadBalance = 0;
1989 for (
unsigned int i = 0; i < npes-1; i++) {
1991 idealLoadBalance += ((i + 1) * globalSz / npes - i * globalSz / npes);
1993 unsigned int loc = (
1994 std::lower_bound(bucketCounts_gScan.begin(), bucketCounts_gScan.end(), idealLoadBalance) -
1995 bucketCounts_gScan.begin());
1998 if ((loc + 1) < bucketSplitter.size())
1999 localSplitter[i] = bucketSplitter[loc + 1];
2001 localSplitter[i] = bucketSplitter[loc];
2009 localSplitter[npes-1]=pNodes.size();
2020 bucketCount_gMerge.clear();
2021 bucketCounts.clear();
2022 bucketCounts_gScan.clear();
2023 bucketInfoMerge.clear();
2025 bucketCounts_g.clear();
2026 bucketSplitter.clear();
2027 bucketSplitterMerge.clear();
2028 newBucketCounts.clear();
2029 newBucketCounts_g.clear();
2030 newBucketInfo.clear();
2031 newBucketSplitters.clear();
2038 #ifdef DEBUG_TREE_SORT 2039 for(
int i=0;i<npes;i++)
2041 for(
int j=i+1 ;j<npes -1;j++)
2042 assert(pNodes[localSplitter[i]]<=pNodes[localSplitter[j]]);
2047 #ifdef DEBUG_TREE_SORT 2050 for(
int i=0;i<npes;i++)
2051 std::cout<<
"Rank "<<rank<<
" Local Splitter: "<<i<<
": "<<localSplitter[i]<<std::endl;
2057 #ifdef PROFILE_TREE_SORT 2058 splitter_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
2060 t2=std::chrono::high_resolution_clock::now();
2064 int * sendCounts =
new int[npes];
2065 int * recvCounts =
new int[npes];
2068 sendCounts[0] = localSplitter[0];
2070 for(
unsigned int i=1;i<npes; ++i)
2072 sendCounts[i] = localSplitter[i] - localSplitter[i-1];
2081 int * sendDispl =
new int [npes];
2082 int * recvDispl =
new int [npes];
2088 for(
int i=1;i<npes;i++)
2090 sendDispl[i] = sendCounts[i-1] + sendDispl[i - 1];
2091 recvDispl[i] =recvCounts[i-1] +recvDispl[i-1];
2095 #ifdef DEBUG_TREE_SORT 2096 std::cout << rank <<
" : send = " << sendCounts[0] <<
", " << sendCounts[1] << std::endl;
2097 std::cout << rank <<
" : recv = " << recvCounts[0] <<
", " << recvCounts[1] << std::endl;
2105 std::vector<T> pNodesRecv;
2106 DendroIntL recvTotalCnt=recvDispl[npes-1]+recvCounts[npes-1];
2107 if(recvTotalCnt) pNodesRecv.resize(recvTotalCnt);
2111 par::Mpi_Alltoallv_Kway(&pNodes[0],sendCounts,sendDispl,&pNodesRecv[0],recvCounts,recvDispl,comm);
2113 #ifdef PROFILE_TREE_SORT 2114 all2all2_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
2118 #ifdef DEBUG_TREE_SORT 2119 if(!rank) std::cout<<
"All2All Communication Ended "<<std::endl;
2122 delete[](localSplitter);
2128 std::swap(pNodes,pNodesRecv);
2131 delete[](sendCounts);
2132 delete[](sendDispl);
2133 delete[](recvCounts);
2134 delete[](recvDispl);
2138 #ifdef PROFILE_TREE_SORT 2140 t2=std::chrono::high_resolution_clock::now();
2142 SFC::seqSort::SFC_treeSort(&(*(pNodes.begin())),pNodes.size(),pOutSorted,pOutConstruct,pOutBalanced,pMaxDepth,pMaxDepth,parent,rot_id,k,options);
2144 #ifdef PROFILE_TREE_SORT 2145 localSort_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
2153 MPI_Comm_free(&comm);
2155 if(options & TS_REMOVE_DUPLICATES) {
2159 #ifdef PROFILE_TREE_SORT 2161 t2=std::chrono::high_resolution_clock::now();
2164 int new_rank, new_size;
2170 MPI_Comm_rank(new_comm, &new_rank);
2171 MPI_Comm_size(new_comm, &new_size);
2174 #ifdef __DEBUG_PAR__ 2177 std::cout<<
"RemDup: Stage-4 passed."<<std::endl;
2183 if (!pOutSorted.empty()) {
2185 T begin =pOutSorted[0];
2186 T end = pOutSorted[pOutSorted.size() - 1];
2193 par::Mpi_Sendrecv<T, T>(&end, 1, ((new_rank < (new_size - 1)) ? (new_rank + 1) : 0), 1, &endRecv,
2194 1, ((new_rank > 0) ? (new_rank - 1) : (new_size - 1)), 1, new_comm,
2201 typename std::vector<T>::iterator Iter = std::find(pOutSorted.begin(), pOutSorted.end(), endRecv);
2202 if (Iter != pOutSorted.end()) {
2203 pOutSorted.erase(Iter);
2212 bool state_global=
true;
2213 unsigned int count=0;
2216 while(count<pOutSorted.size() & state_global ) {
2218 begin=pOutSorted[count];
2219 end=pOutSorted.back();
2221 par::Mpi_Sendrecv<T, T>(&begin, 1, ((new_rank > 0) ? (new_rank - 1) : (new_size - 1)), 1,
2222 &beginRecv, 1, ((new_rank < (new_size - 1)) ? (new_rank + 1) : 0), 1, new_comm,
2230 while (end.isAncestor(beginRecv)) {
2232 pOutSorted.pop_back();
2233 end = pOutSorted.back();
2236 MPI_Allreduce(&state,&state_global,1,MPI_CXX_BOOL,MPI_LOR,new_comm);
2244 MPI_Comm_free(&new_comm);
2247 #ifdef PROFILE_TREE_SORT 2248 remove_duplicates_par=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t2).count();
2254 #ifdef PROFILE_TREE_SORT 2255 total_rd=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t4).count();
2259 MPI_Comm_rank(pcomm,&rank_g);
2260 MPI_Comm_size(pcomm,&npes_g);
2262 par::Mpi_Reduce(&splitter_fix_all,&stat_property[0],1,MPI_MIN,0,pcomm);
2263 par::Mpi_Reduce(&splitter_fix_all,&stat_property[1],1,MPI_SUM,0,pcomm);
2264 par::Mpi_Reduce(&splitter_fix_all,&stat_property[2],1,MPI_MAX,0,pcomm);
2267 stat_property[1] = stat_property[1] / (double) npes_g;
2269 stats.push_back(stat_property[0]);
2270 stats.push_back(stat_property[1]);
2271 stats.push_back(stat_property[2]);
2281 stat_property[1] = stat_property[1] / (double) npes_g;
2283 stats.push_back(stat_property[0]);
2284 stats.push_back(stat_property[1]);
2285 stats.push_back(stat_property[2]);
2296 stat_property[1] = stat_property[1] / (double) npes_g;
2298 stats.push_back(stat_property[0]);
2299 stats.push_back(stat_property[1]);
2300 stats.push_back(stat_property[2]);
2311 stat_property[1] = stat_property[1] / (double) npes_g;
2313 stats.push_back(stat_property[0]);
2314 stats.push_back(stat_property[1]);
2315 stats.push_back(stat_property[2]);
2325 stat_property[1] = stat_property[1] / (double) npes_g;
2327 stats.push_back(stat_property[0]);
2328 stats.push_back(stat_property[1]);
2329 stats.push_back(stat_property[2]);
2334 par::Mpi_Reduce(&remove_duplicates_seq,&stat_property[0],1,MPI_MIN,0,pcomm);
2335 par::Mpi_Reduce(&remove_duplicates_seq,&stat_property[1],1,MPI_SUM,0,pcomm);
2336 par::Mpi_Reduce(&remove_duplicates_seq,&stat_property[2],1,MPI_MAX,0,pcomm);
2340 stat_property[1] = stat_property[1] / (double) npes_g;
2342 stats.push_back(stat_property[0]);
2343 stats.push_back(stat_property[1]);
2344 stats.push_back(stat_property[2]);
2355 stat_property[1] = stat_property[1] / (double) npes_g;
2357 stats.push_back(stat_property[0]);
2358 stats.push_back(stat_property[1]);
2359 stats.push_back(stat_property[2]);
2364 par::Mpi_Reduce(&remove_duplicates_par,&stat_property[0],1,MPI_MIN,0,pcomm);
2365 par::Mpi_Reduce(&remove_duplicates_par,&stat_property[1],1,MPI_SUM,0,pcomm);
2366 par::Mpi_Reduce(&remove_duplicates_par,&stat_property[2],1,MPI_MAX,0,pcomm);
2370 stat_property[1] = stat_property[1] / (double) npes_g;
2372 stats.push_back(stat_property[0]);
2373 stats.push_back(stat_property[1]);
2374 stats.push_back(stat_property[2]);
2387 stat_property[1] = stat_property[1] / (double) npes_g;
2389 stats.push_back(stat_property[0]);
2390 stats.push_back(stat_property[1]);
2391 stats.push_back(stat_property[2]);
2396 for(
int i=0;i<SF_Stages;i++)
2402 stat_property[1] = stat_property[1] / (double) npes_g;
2406 stats.push_back(stat_property[0]);
2407 stats.push_back(stat_property[1]);
2408 stats.push_back(stat_property[2]);
2416 stat_property[1] = stat_property[1] / (double) npes_g;
2420 stats.push_back(stat_property[0]);
2421 stats.push_back(stat_property[1]);
2422 stats.push_back(stat_property[2]);
2426 par::Mpi_Reduce(&sf_splitters[i],&stat_property[0],1,MPI_MIN,0,pcomm);
2427 par::Mpi_Reduce(&sf_splitters[i],&stat_property[1],1,MPI_SUM,0,pcomm);
2428 par::Mpi_Reduce(&sf_splitters[i],&stat_property[2],1,MPI_MAX,0,pcomm);
2430 stat_property[1] = stat_property[1] / (double) npes_g;
2434 stats.push_back(stat_property[0]);
2435 stats.push_back(stat_property[1]);
2436 stats.push_back(stat_property[2]);
2447 if((options & TS_CONSTRUCT_OCTREE) | (options & TS_BALANCE_OCTREE))
2451 MPI_Comm_rank(pcomm,&rank);
2452 MPI_Comm_size(pcomm,&npes);
2455 #ifdef PROFILE_TREE_SORT 2456 stats_previous.clear();
2457 stats_previous.insert(stats_previous.end(),stats.begin(),stats.end());
2459 std::vector<T> tmpSorted;
2460 std::vector<T> tmpConstructed;
2461 std::vector<T> tmpBalanced;
2462 T root(0,0,0,0,m_uiDim,pMaxDepth);
2463 if(options & TS_CONSTRUCT_OCTREE) {
2466 #ifdef PROFILE_TREE_SORT 2468 t3=std::chrono::high_resolution_clock::now();
2470 par::partitionW<T>(pOutConstruct,NULL,pcomm);
2471 SFC::parSort::SFC_treeSort(pOutConstruct,tmpSorted,tmpConstructed,tmpBalanced,loadFlexibility,pMaxDepth,root,rot_id,k,1,sf_k,pcomm);
2473 #ifdef PROFILE_TREE_SORT 2474 construction_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t3).count();
2480 MPI_Comm_rank(pcomm,&rank_g);
2481 MPI_Comm_size(pcomm,&npes_g);
2484 par::Mpi_Reduce(&construction_time,&stat_property[0],1,MPI_MIN,0,pcomm);
2485 par::Mpi_Reduce(&construction_time,&stat_property[1],1,MPI_SUM,0,pcomm);
2486 par::Mpi_Reduce(&construction_time,&stat_property[2],1,MPI_MAX,0,pcomm);
2490 stat_property[1] = stat_property[1] / (double) npes_g;
2492 stats.push_back(stat_property[0]);
2493 stats.push_back(stat_property[1]);
2494 stats.push_back(stat_property[2]);
2501 std::swap(tmpSorted,pOutConstruct);
2507 if(options & TS_BALANCE_OCTREE) {
2509 #ifdef PROFILE_TREE_SORT 2511 t3=std::chrono::high_resolution_clock::now();
2513 par::partitionW<T>(pOutBalanced,NULL,pcomm);
2514 SFC::parSort::SFC_treeSort(pOutBalanced,tmpSorted,tmpConstructed,tmpBalanced,loadFlexibility,pMaxDepth,root,rot_id,k,1,sf_k,pcomm);
2516 #ifdef PROFILE_TREE_SORT 2517 balancing_time=std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t3).count();
2519 MPI_Comm_rank(pcomm,&rank_g);
2520 MPI_Comm_size(pcomm,&npes_g);
2528 stat_property[1] = stat_property[1] / (double) npes_g;
2530 stats.push_back(stat_property[0]);
2531 stats.push_back(stat_property[1]);
2532 stats.push_back(stat_property[2]);
2539 std::swap(tmpSorted,pOutBalanced);
2582 #endif //SFCSORTBENCH_SFCSORT_H Traits to determine MPI_DATATYPE from a C++ datatype.
int Mpi_Alltoallv_sparse(T *sendbuf, int *sendcnts, int *sdispls, T *recvbuf, int *recvcnts, int *rdispls, MPI_Comm comm)
int Mpi_Reduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, int root, MPI_Comm comm)
int Mpi_Alltoall(T *sendbuf, T *recvbuf, int count, MPI_Comm comm)
Definition: sfcSort.h:217
Definition: sfcSort.h:153
A set of parallel utilities.
unsigned int fastLog2(unsigned int num)
Definition: binUtils.cpp:15
Definition: sfcSort.h:126
int Mpi_Allreduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm)
A Set of utilities to test octrees.
int splitComm2way(bool iAmEmpty, MPI_Comm *new_comm, MPI_Comm orig_comm)
Removes duplicates in parallel. If the input is not sorted, sample sort will be called within the function before duplicates are removed.
Definition: parUtils.cpp:128
Definition: sfcSearch.h:22