Dendro 5.01
Dendro in Greek means tree. The Dendro library is a large-scale (262K cores on ORNL's Titan) distributed-memory adaptive octree framework. The main goal of Dendro is to perform large-scale multiphysics simulations efficiently on modern supercomputers. Dendro consists of efficient parallel data structures and algorithms for variational (finite element) and finite difference methods on 2:1-balanced, arbitrary adaptive octrees. This enables simulations ranging from binary black hole mergers to blood flow in the human body, with applications in relativity, astrophysics, and biomedical engineering.
parUtils.h File Reference

A set of parallel utilities. More...

#include "mpi.h"
#include <vector>
#include "dendro.h"
#include "parUtils.tcc"
Include dependency graph for parUtils.h (graph not shown).
Included-by graph, i.e. the files that directly or indirectly include this file (graph not shown).


Classes

struct  _T< T >
 A data structure used to compute parallel rank. More...
 
class  par::Mpi_datatype< T >
 An abstract class used for communicating messages using user-defined datatypes. The user must implement the static member function value(), which returns the MPI_Datatype corresponding to the user-defined datatype; a sketch of such a specialization is given at the end of this list. More...
 
class  par::Mpi_datatype< _T< double > >
 
class  par::Mpi_datatype< _T< float > >
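For concreteness, here is a minimal sketch of the Mpi_datatype contract described above, specialized for a hypothetical user-defined struct (the type Point3 and the build-once caching pattern are illustrative, not part of this header):

#include "mpi.h"
#include "parUtils.h"

struct Point3 { double x, y, z; };  // hypothetical user-defined type

namespace par {
template <>
class Mpi_datatype<Point3> {
public:
  // Returns the MPI_Datatype describing Point3, building and committing
  // it on first use.
  static MPI_Datatype value() {
    static bool first = true;
    static MPI_Datatype datatype;
    if (first) {
      first = false;
      // Three contiguous doubles can be described as a contiguous datatype.
      MPI_Type_contiguous(3, MPI_DOUBLE, &datatype);
      MPI_Type_commit(&datatype);
    }
    return datatype;
  }
};
} // namespace par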
 

Namespaces

 par
 Collection of Generic Parallel Functions: Sorting, Partitioning, Searching,...
 

Macros

#define KEEP_HIGH   100
 
#define KEEP_LOW   101
 
#define KWAY   128
 
#define PROF_A2AV_WAIT_BEGIN
 
#define PROF_SPLIT_COMM_2WAY_BEGIN
 
#define PROF_SPLIT_COMM_BEGIN
 
#define PROF_SEARCH_BEGIN
 
#define PROF_PAR_SCATTER_BEGIN
 
#define PROF_PAR_SENDRECV_BEGIN
 
#define PROF_PAR_BCAST_BEGIN
 
#define PROF_PAR_GATHER_BEGIN
 
#define PROF_PAR_SCAN_BEGIN
 
#define PROF_PAR_REDUCE_BEGIN
 
#define PROF_PAR_ALLREDUCE_BEGIN
 
#define PROF_PAR_ALL2ALL_BEGIN
 
#define PROF_PAR_ALLGATHERV_BEGIN
 
#define PROF_PAR_ALLGATHER_BEGIN
 
#define PROF_PAR_ALL2ALLV_SPARSE_BEGIN
 
#define PROF_PAR_ALL2ALLV_DENSE_BEGIN
 
#define PROF_PAR_CONCAT_BEGIN
 
#define PROF_SORT_BEGIN
 
#define PROF_REMDUP_BEGIN
 
#define PROF_PARTW_BEGIN
 
#define PROF_A2AV_WAIT_END
 
#define PROF_SPLIT_COMM_2WAY_END   return 1;
 
#define PROF_SPLIT_COMM_END   return 1;
 
#define PROF_SEARCH_END   return 1;
 
#define PROF_PAR_SCATTER_END   return 1;
 
#define PROF_PAR_SENDRECV_END   return 1;
 
#define PROF_PAR_BCAST_END   return 1;
 
#define PROF_PAR_GATHER_END   return 1;
 
#define PROF_PAR_SCAN_END   return 1;
 
#define PROF_PAR_REDUCE_END   return 1;
 
#define PROF_PAR_ALLREDUCE_END   return 1;
 
#define PROF_PAR_ALL2ALL_END   return 1;
 
#define PROF_PAR_ALLGATHERV_END   return 1;
 
#define PROF_PAR_ALLGATHER_END   return 1;
 
#define PROF_PAR_ALL2ALLV_SPARSE_END   return 1;
 
#define PROF_PAR_ALL2ALLV_DENSE_END   return 1;
 
#define PROF_PAR_CONCAT_END   return 1;
 
#define PROF_SORT_END   return 1;
 
#define PROF_REMDUP_END   return 1;
 
#define PROF_PARTW_END   return 1;
 

Functions

template<typename T >
int par::Mpi_Isend (T *buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request *request)
 
template<typename T >
int par::Mpi_Issend (T *buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request *request)
 
template<typename T >
int par::Mpi_Recv (T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Status *status)
 
template<typename T >
int par::Mpi_Irecv (T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Request *request)
 
template<typename T >
int par::Mpi_Gather (T *sendBuffer, T *recvBuffer, int count, int root, MPI_Comm comm)
 
template<typename T , typename S >
int par::Mpi_Sendrecv (T *sendBuf, int sendCount, int dest, int sendTag, S *recvBuf, int recvCount, int source, int recvTag, MPI_Comm comm, MPI_Status *status)
 
template<typename T >
int par::Mpi_Bcast (T *buffer, int count, int root, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Scan (T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Reduce (T *sendbuf, T *recvbuf, int count, MPI_Op op, int root, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Allreduce (T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm)
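As a usage sketch of these typed wrappers (assuming an initialized MPI environment): the MPI_Datatype argument of the underlying MPI call is deduced from the template parameter via par::Mpi_datatype<T>::value(), so no explicit datatype is passed.

#include "mpi.h"
#include "parUtils.h"

double globalSum(double localValue, MPI_Comm comm) {
  double sum = 0.0;
  // Equivalent to MPI_Allreduce(&localValue, &sum, 1, MPI_DOUBLE, MPI_SUM,
  // comm); the datatype is supplied by par::Mpi_datatype<double>::value().
  par::Mpi_Allreduce<double>(&localValue, &sum, 1, MPI_SUM, comm);
  return sum;
}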
 
template<typename T >
int par::Mpi_Alltoall (T *sendbuf, T *recvbuf, int count, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Allgatherv (T *sendbuf, int sendcount, T *recvbuf, int *recvcounts, int *displs, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Allgather (T *sendbuf, T *recvbuf, int count, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Alltoallv_sparse (T *sendbuf, int *sendcnts, int *sdispls, T *recvbuf, int *recvcnts, int *rdispls, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Alltoallv_dense (T *sendbuf, int *sendcnts, int *sdispls, T *recvbuf, int *recvcnts, int *rdispls, MPI_Comm comm)
 
template<typename T >
int par::Mpi_Alltoallv_Kway (T *sbuff_, int *s_cnt_, int *sdisp_, T *rbuff_, int *r_cnt_, int *rdisp_, MPI_Comm c)
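A usage sketch shared by the three Alltoallv variants above (the wiring code is illustrative): the calling convention mirrors MPI_Alltoallv without the datatype arguments, so the receive counts are first obtained by exchanging the send counts.

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void exchangeAll(std::vector<double>& sendBuf, std::vector<int>& sendCnts,
                 std::vector<double>& recvBuf, MPI_Comm comm) {
  int npes;
  MPI_Comm_size(comm, &npes);
  // Assumes sendCnts has one entry per rank and sendBuf is non-empty.
  std::vector<int> recvCnts(npes), sDispls(npes, 0), rDispls(npes, 0);
  // Tell every rank how much it will receive from us.
  par::Mpi_Alltoall<int>(&sendCnts[0], &recvCnts[0], 1, comm);
  for (int i = 1; i < npes; ++i) {
    sDispls[i] = sDispls[i - 1] + sendCnts[i - 1];
    rDispls[i] = rDispls[i - 1] + recvCnts[i - 1];
  }
  recvBuf.resize(rDispls[npes - 1] + recvCnts[npes - 1]);
  // The 'sparse' variant is typically chosen when most counts are zero.
  par::Mpi_Alltoallv_sparse<double>(&sendBuf[0], &sendCnts[0], &sDispls[0],
                                    &recvBuf[0], &recvCnts[0], &rDispls[0],
                                    comm);
}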
 
template<typename T >
int par::scatterValues (std::vector< T > &in, std::vector< T > &out, DendroIntL outSz, MPI_Comm comm)
 Re-distributes a STL vector, preserving the relative ordering of the elements. More...
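A sketch of an even re-partition using scatterValues (assuming par::Mpi_datatype is available for DendroIntL, as it is for the built-in Dendro integer types):

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void repartitionEvenly(std::vector<double>& in, std::vector<double>& out,
                       MPI_Comm comm) {
  int rank, npes;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &npes);
  DendroIntL localSz = (DendroIntL)in.size();
  DendroIntL totalSz = 0;
  par::Mpi_Allreduce<DendroIntL>(&localSz, &totalSz, 1, MPI_SUM, comm);
  // Request an equal share on every rank; the last rank absorbs the
  // remainder. The relative ordering of the elements is preserved.
  DendroIntL outSz = totalSz / npes;
  if (rank == (npes - 1)) { outSz += (totalSz % npes); }
  par::scatterValues<double>(in, out, outSz, comm);
}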
 
template<typename T >
int par::maxLowerBound (const std::vector< T > &keys, const std::vector< T > &searchList, std::vector< T > &results, MPI_Comm comm)
 A parallel search function. More...
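A hedged calling sketch; the exact semantics (for each key, the greatest element of the distributed searchList not exceeding it) are inferred from the function's name and are an assumption here:

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void lookup(const std::vector<double>& keys,
            const std::vector<double>& searchList, MPI_Comm comm) {
  std::vector<double> results;
  par::maxLowerBound<double>(keys, searchList, results, comm);
  // results[i] is assumed to hold the greatest element of the (globally
  // distributed) searchList that does not exceed keys[i].
}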
 
template<typename T >
unsigned int par::defaultWeight (const T *a)
 
template<typename T >
int par::partitionW (std::vector< T > &vec, unsigned int(*getWeight)(const T *), MPI_Comm comm)
A parallel weighted partitioning function. In our implementation, we do not impose any restriction on the input or on the number of processors. This function can be used with an odd number of processors as well. Some processors can pass an empty vector as input. The relative ordering of the elements is preserved. More...
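A sketch of weighted repartitioning under these rules; the cost function below is hypothetical:

#include <vector>
#include "mpi.h"
#include "parUtils.h"

// Hypothetical weight: pretend elements above 0.5 are twice as expensive.
unsigned int elementCost(const double* a) {
  return ((*a) > 0.5) ? 2u : 1u;
}

void rebalance(std::vector<double>& vec, MPI_Comm comm) {
  // par::defaultWeight<double> could be passed instead for uniform weights.
  par::partitionW<double>(vec, elementCost, comm);
}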
 
template<typename T >
int par::concatenate (std::vector< T > &listA, std::vector< T > &listB, MPI_Comm comm)
A parallel concatenation function. listB is appended (globally) to listA and the result is stored in listA. A useful application of this function is when listA and listB are individually sorted (globally) and the smallest element in listB is greater than the largest element in listA, and we want to create a merged list that is sorted. More...
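A sketch of the sorted-merge use case described above:

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void appendSortedRun(std::vector<double>& listA, std::vector<double>& listB,
                     MPI_Comm comm) {
  // If listA and listB are each globally sorted and every element of listB
  // exceeds the largest element of listA, the result in listA is a globally
  // sorted merge of the two.
  par::concatenate<double>(listA, listB, comm);
}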
 
template<typename T >
int par::sampleSort (std::vector< T > &in, std::vector< T > &out, std::vector< double > &stats, MPI_Comm comm)
A parallel sample sort implementation. In our implementation, we do not impose any restriction on the input or on the number of processors. This function can be used with an odd number of processors as well. Some processors can pass an empty vector as input. If the total (global) number of elements in the vector is less than 10*p^2, where p is the number of processors, then we use bitonic sort instead of sample sort. We use a parallel bitonic sort to sort the samples in the sample sort algorithm. Hence, the complexity of the algorithm is O(n/p log n/p) + O(p log p), where n is the global length of the vector and p is the number of processors. More...
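A minimal calling sketch (the contents of 'stats' are whatever statistics the implementation reports):

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void sortGlobally(std::vector<double>& localChunk, MPI_Comm comm) {
  std::vector<double> sorted;
  std::vector<double> stats;
  // Empty local chunks and odd processor counts are both allowed.
  par::sampleSort<double>(localChunk, sorted, stats, comm);
  // 'sorted' now holds this rank's slice of the globally sorted sequence.
}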
 
int par::splitComm2way (bool iAmEmpty, MPI_Comm *new_comm, MPI_Comm orig_comm)
Splits a communication group into two depending on whether the calling processor sets 'iAmEmpty' to true or false. More...
 
int par::splitComm2way (const bool *isEmptyList, MPI_Comm *new_comm, MPI_Comm orig_comm)
Splits a communication group into two depending on the values in isEmptyList. Both groups are sorted in ascending order of their ranks in the old comm. All processors must call this function with the same 'isEmptyList' array. More...
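A sketch of a common use case for the single-flag overload: shrinking the working communicator to the ranks that still hold data.

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void dropEmptyRanks(const std::vector<double>& localData, MPI_Comm comm) {
  MPI_Comm newComm;
  // Every rank passes its own flag; ranks with the same flag end up in the
  // same sub-communicator, ordered by their ranks in 'comm'.
  par::splitComm2way(localData.empty(), &newComm, comm);
}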
 
int par::splitCommUsingSplittingRank (int splittingRank, MPI_Comm *new_comm, MPI_Comm orig_comm)
 
unsigned int par::splitCommBinary (MPI_Comm orig_comm, MPI_Comm *new_comm)
Splits a communication group into two: the first group has a power-of-2 number of processors and the second has the remainder. The first group is sorted in ascending order of rank in the old comm; the second group is sorted in descending order of rank in the old comm. More...
 
unsigned int par::splitCommBinaryNoFlip (MPI_Comm orig_comm, MPI_Comm *new_comm)
Splits a communication group into two: the first group has a power-of-2 number of processors and the second has the remainder. Both groups are sorted in ascending order of rank in the old comm. More...
 
template<typename T >
void par::MergeLists (std::vector< T > &listA, std::vector< T > &listB, int KEEP_WHAT)
Merges lists A and B, retaining either the low or the high elements in list A. More...
 
template<typename T >
void par::MergeSplit (std::vector< T > &local_list, int which_keys, int partner, MPI_Comm comm)
 The main operation in the parallel bitonic sort algorithm. This implements the compare-split operation. More...
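A hedged sketch of MergeLists with the KEEP_* macros defined above (sorted inputs are assumed, as in the bitonic compare-split step that these helpers implement):

#include <vector>
#include "parUtils.h"

void keepSmallest(std::vector<double>& listA, std::vector<double>& listB) {
  // KEEP_LOW retains the low elements in listA; KEEP_HIGH would retain
  // the high ones instead.
  par::MergeLists<double>(listA, listB, KEEP_LOW);
}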
 
template<typename T >
void par::Par_bitonic_sort_incr (std::vector< T > &local_list, int proc_set_size, MPI_Comm comm)
 
template<typename T >
void par::Par_bitonic_sort_decr (std::vector< T > &local_list, int proc_set_size, MPI_Comm comm)
 
template<typename T >
void par::Par_bitonic_merge_incr (std::vector< T > &local_list, int proc_set_size, MPI_Comm comm)
 
template<typename T >
void par::bitonicSort_binary (std::vector< T > &in, MPI_Comm comm)
An implementation of parallel bitonic sort that expects the number of processors to be a power of 2. However, unlike most implementations, we do not require the length of the vector (either locally or globally) to be a power of 2, or even an even number. Moreover, each processor can call this with a different number of elements. However, we do expect 'in' to have at least one element on each processor. More...
 
template<typename T >
void par::bitonicSort (std::vector< T > &in, MPI_Comm comm)
An implementation of parallel bitonic sort that does not expect the number of processors to be a power of 2; in fact, the number of processors can even be odd. We also do not require the length of the vector (either locally or globally) to be a power of 2, or even an even number. Moreover, each processor can call this with a different number of elements. However, we do expect 'in' to have at least one element on each processor. This recursively calls the function bitonicSort_binary, followed by a special parallel merge. More...
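A minimal in-place sketch; note the requirement above that every processor contribute at least one element:

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void sortInPlace(std::vector<double>& in, MPI_Comm comm) {
  // Works for any communicator size (it need not be a power of two);
  // 'in' is sorted in place, globally across the ranks of 'comm'.
  par::bitonicSort<double>(in, comm);
}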
 
template<typename T >
void par::parallel_rank (const T *in, unsigned int sz, DendroIntL *out, MPI_Comm comm)
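A hedged calling sketch; the interpretation of 'out' as each element's rank in the global order is inferred from the helper struct _T ("data structure to compute parallel rank") and is an assumption here:

#include <vector>
#include "mpi.h"
#include "parUtils.h"

void globalRanks(const std::vector<double>& vals, MPI_Comm comm) {
  // Assumes a non-empty local array.
  std::vector<DendroIntL> ranks(vals.size());
  par::parallel_rank<double>(&vals[0], (unsigned int)vals.size(),
                             &ranks[0], comm);
  // ranks[i] is assumed to be the rank of vals[i] in the global sorted order.
}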
 

Detailed Description

A set of parallel utilities.

Author
Rahul S. Sampath, rahul.sampath@gmail.com
Hari Sundar, hsundar@gmail.com
Shravan Veerapaneni, shravan@seas.upenn.edu
Santi Swaroop Adavani, santis@gmail.com