OpenLB 1.7
Loading...
Searching...
No Matches
Public Member Functions | List of all members
olb::HeterogeneousLoadBalancer< T > Class Template Reference final

Load balancer for heterogeneous CPU-GPU systems. More...

#include <heterogeneousLoadBalancer.h>

+ Inheritance diagram for olb::HeterogeneousLoadBalancer< T >:
+ Collaboration diagram for olb::HeterogeneousLoadBalancer< T >:

Public Member Functions

 HeterogeneousLoadBalancer (CuboidGeometry< T, 3 > &cGeometry, T largeBlockFraction=0.9)
 
Platform platform (int loc) const override
 
void setPlatform (int loc, Platform platform)
 
- Public Member Functions inherited from olb::LoadBalancer< T >
 LoadBalancer (int size=1)
 Default empty constructor.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank)
 Constructor accepting existing balancing.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank, std::map< int, Platform > &platform)
 Constructor accepting existing heterogeneous balancing.
 
virtual ~LoadBalancer ()
 Default empty destructor.
 
void swap (LoadBalancer< T > &loadBalancer)
 Swap method.
 
bool isLocal (const int &glob)
 returns whether glob is on this process
 
int loc (const int &glob)
 
int loc (int glob) const
 
int glob (int loc) const
 
int rank (const int &glob)
 
int rank (int glob) const
 
int size () const
 
int getRankSize () const
 
bool operator== (const LoadBalancer< T > &rhs) const
 Equality operator.
 
std::size_t getNblock () const override
 Number of data blocks for the serializable interface.
 
std::size_t getSerializableSize () const override
 Binary size for the serializer.
 
bool * getBlock (std::size_t iBlock, std::size_t &sizeBlock, bool loadingMode) override
 Return a pointer to the memory of the current block and its size for the serializable interface.
 
void print (bool multiOutput=false) const
 
- Public Member Functions inherited from olb::Serializable
virtual ~Serializable ()=default
 
template<bool includeLogOutputDir = true>
bool save (std::string fileName="", const bool enforceUint=false)
 Save Serializable into file fileName
 
template<bool includeLogOutputDir = true>
bool load (std::string fileName="", const bool enforceUint=false)
 Load Serializable from file fileName
 
bool save (std::uint8_t *buffer)
 Save Serializable into buffer of length getSerializableSize
 
bool load (const std::uint8_t *buffer)
 Load Serializable from buffer of length getSerializableSize
 
virtual void postLoad ()
 

Additional Inherited Members

- Protected Member Functions inherited from olb::BufferSerializable
template<typename DataType >
void registerSerializable (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of dynamic size.
 
template<typename DataType >
void registerStdVectorOfVars (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of primitive DataType (int, double, ...)
 
template<typename DataType >
void registerStdVectorOfSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of constant-sized Serializable
 
template<typename DataType >
void registerStdVectorOfSerializables (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of dynamic-sized DataType
 
template<typename DataTypeKey , typename DataTypeValue >
void registerMap (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::map< DataTypeKey, DataTypeValue > &data, const bool loadingMode=false)
 Method for registering a std::map<DataTypeKey, DataTypeValue> of fixed-sized types (i.e. int, double)
 
size_t addSizeToBuffer (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, const size_t data) const
 Add a size_t to the sizeBuffer in the n-th round and return that size_t in all successive rounds.
 
- Protected Member Functions inherited from olb::Serializable
template<typename DataType >
void registerVar (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, const DataType &data, const size_t arrayLength=1) const
 Register primitive data types (int, double, ...) or arrays of those.
 
template<typename DataType >
void registerSerializableOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of constant size.
 
template<typename DataType >
void registerSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType *data, const size_t arrayLength, const bool loadingMode=false)
 Register an array of Serializable objects of constant size.
 
- Protected Attributes inherited from olb::LoadBalancer< T >
int _size
 number of cuboids after shrink -1 in appropriate thread
 
std::map< int, int > _loc
 maps global cuboid to (local) thread cuboid
 
std::vector< int > _glob
 content is 0,1,2,...,_size
 
std::map< int, int > _rank
 maps global cuboid number to the processing thread
 
std::map< int, Platform > _platform
 maps global cuboid number to local platform
 
- Protected Attributes inherited from olb::BufferSerializable
std::vector< bool * > _dataBuffer
 Data buffer for data that has to be buffered between two getBlock() iterations.
 
std::vector< size_t > _sizeBuffer
 std::vector of integer buffers (e.g. for std::vector size) to be buffered for the whole iteration process
 

Detailed Description

template<typename T>
class olb::HeterogeneousLoadBalancer< T >

Load balancer for heterogeneous CPU-GPU systems.

Assigns the largest cuboids to GPUs until the given largeBlockFraction of the total lattice volume is reached. The remaining (small) cuboids are assigned to the CPUs of the ranks holding their GPU-placed neighbors.

This balancer should only be used in conjunction with a heterogeneous cuboid decomposition and an appropriate, system-specific CPU_SIMD+OpenMP configuration.

Definition at line 45 of file heterogeneousLoadBalancer.h.

Constructor & Destructor Documentation

◆ HeterogeneousLoadBalancer()

template<typename T >
olb::HeterogeneousLoadBalancer< T >::HeterogeneousLoadBalancer ( CuboidGeometry< T, 3 > & cGeometry,
T largeBlockFraction = 0.9 )
inline

Definition at line 50 of file heterogeneousLoadBalancer.h.

50 :
51 LoadBalancer<T>(0)
52 {
53 OstreamManager clout(std::cout, "HeterogeneousLoadBalancer");
54
55 std::vector<int> rankBuffer(cGeometry.getNc(), 0);
56 std::vector<int> locBuffer(cGeometry.getNc(), 0);
57 std::vector<std::uint8_t> platformBuffer(cGeometry.getNc(), static_cast<std::uint8_t>(Platform::CPU_SISD));
58
59 const auto nRank = singleton::mpi().getSize();
60 const auto iRank = singleton::mpi().getRank();
61
62 auto& cuboids = cGeometry.cuboids();
63 // Prioritize assignment of largest cuboids
64 std::sort(cuboids.begin(), cuboids.end(),
65 [&](const auto& lhs, const auto& rhs) {
66 return lhs.getLatticeVolume() > rhs.getLatticeVolume();
67 });
68
69 // Distribute cuboids to ranks on rank 0
70 if (iRank == 0) {
71 // Total volume for tracking targeted GPU block fraction
72 std::size_t totalVolume = std::accumulate(cuboids.begin(), cuboids.end(),
73 std::size_t{0},
74 [](const auto& currVolume, const auto& rhs) -> std::size_t {
75 return currVolume + rhs.getLatticeVolume();
76 });
77
78 std::map<int,int> nLoc;
79 int jRank = 0;
80 int iCuboid = 0;
81
82 std::size_t globalAssignedVolume = 0;
83 std::vector<std::size_t> localAssignedVolume(nRank, 0);
84
85 // Assign largest cuboids to GPUs until desired fraction is reached
86 do {
87 jRank = std::distance(localAssignedVolume.begin(),
88 std::min_element(localAssignedVolume.begin(),
89 localAssignedVolume.end()));
90 rankBuffer[iCuboid] = jRank;
91 platformBuffer[iCuboid] = static_cast<std::uint8_t>(Platform::GPU_CUDA);
92 locBuffer[iCuboid] = nLoc[jRank]++;
93 localAssignedVolume[jRank] += cuboids[iCuboid].getLatticeVolume();
94 globalAssignedVolume += cuboids[iCuboid].getLatticeVolume();
95 clout << iCuboid << ", " << jRank << ", assignedVolumeFraction=" << (1.*globalAssignedVolume) / totalVolume << std::endl;
96 iCuboid += 1;
97 } while (globalAssignedVolume < largeBlockFraction*totalVolume);
98
99 // Compute GPU rank affinity of remaining cuboids
100 std::vector<int> preferredRank(cGeometry.getNc(), -1);
101 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
102 std::vector<int> neighbours;
103 cGeometry.getNeighbourhood(jCuboid, neighbours, 1);
104 std::vector<int> neighboursInRank(nRank, 0);
105 int nGpuNeighbors = 0;
106 for (int neighborC : neighbours) {
107 if (platformBuffer[neighborC] == static_cast<std::uint8_t>(Platform::GPU_CUDA)) {
108 neighboursInRank[rankBuffer[neighborC]] += 1;
109 nGpuNeighbors += 1;
110 }
111 }
112 if (nGpuNeighbors > 0) {
113 preferredRank[jCuboid] = std::distance(neighboursInRank.begin(), std::max_element(neighboursInRank.begin(),
114 neighboursInRank.end()));
115 clout << "Preferred rank of " << jCuboid << " is " << preferredRank[jCuboid] << std::endl;
116 }
117 }
118
119 // Distribute remaining GPU-neighboring blocks over ranks using CPU_SIMD platform
120 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
121 if (preferredRank[jCuboid] != -1) {
122 auto iRank = preferredRank[jCuboid];
123
124 rankBuffer[jCuboid] = iRank;
125 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
126 locBuffer[jCuboid] = nLoc[iRank]++;
127 }
128 }
129
130 // Compute fallback CPU rank affinity of remaining cuboids
131 std::vector<int> preferredRankFallback(cGeometry.getNc(), -1);
132 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
133 if (preferredRank[jCuboid] == -1) {
134 std::vector<int> neighbours;
135 cGeometry.getNeighbourhood(jCuboid, neighbours, 1);
136 std::vector<int> neighboursInRank(nRank, 0);
137 int nCpuNeighbors = 0;
138 for (int neighborC : neighbours) {
139 if (platformBuffer[neighborC] != static_cast<std::uint8_t>(Platform::GPU_CUDA)) {
140 neighboursInRank[rankBuffer[neighborC]] += 1;
141 nCpuNeighbors += 1;
142 }
143 }
144 if (nCpuNeighbors > 0) {
145 preferredRankFallback[jCuboid] = std::distance(neighboursInRank.begin(), std::max_element(neighboursInRank.begin(),
146 neighboursInRank.end()));
147 clout << "Preferred rank of " << jCuboid << " is " << preferredRankFallback[jCuboid] << std::endl;
148 }
149 }
150 }
151
152 // Distribute remaining blocks over ranks using CPU_SIMD platform
153 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
154 if (preferredRank[jCuboid] == -1) {
155 auto iRank = preferredRankFallback[jCuboid];
156
157 rankBuffer[jCuboid] = iRank;
158 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
159 locBuffer[jCuboid] = nLoc[iRank]++;
160 }
161 }
162
163 }
164
165 #ifdef PARALLEL_MODE_MPI
166 // Broadcast assignments to all processes
167 singleton::mpi().bCast(rankBuffer.data(), rankBuffer.size());
168 singleton::mpi().bCast(locBuffer.data(), locBuffer.size());
169 singleton::mpi().bCast(platformBuffer.data(), platformBuffer.size());
170 #endif
171
172 // Update internal LoadBalancer structure to match given assignment
173 for (int iCuboid=0; iCuboid < cGeometry.getNc(); ++iCuboid) {
174 this->_rank[iCuboid] = rankBuffer[iCuboid];
175 this->_loc[iCuboid] = locBuffer[iCuboid];
176
177 if (rankBuffer[iCuboid] == iRank) {
178 this->_glob.resize(std::max(int(this->_glob.size()), this->_loc[iCuboid]+1));
179 this->_platform.resize(this->_glob.size());
180
181 this->_glob[this->_loc[iCuboid]] = iCuboid;
182 this->_platform[this->_loc[iCuboid]] = static_cast<Platform>(platformBuffer[iCuboid]);
183
184 this->_size = this->_glob.size();
185 }
186 }
187 }
std::vector< int > _glob
content is 0,1,2,...,_size
int _size
number of cuboids after shrink -1 in appropriate thread
std::map< int, int > _rank
maps global cuboid number to the processing thread
std::map< int, int > _loc
maps global cuboid to (local) thread cuboid
void bCast(T *sendBuf, int sendCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Broadcast data from one processor to multiple processors.
int getSize() const
Returns the number of processes.
int getRank() const
Returns the process ID.
MpiManager & mpi()
Platform
OpenLB execution targets.
Definition platform.h:36
@ CPU_SISD
Basic scalar CPU.
@ CPU_SIMD
Vector CPU (AVX2 / AVX-512 collision)
@ GPU_CUDA
Vector GPU.

References olb::LoadBalancer< T >::_glob, olb::LoadBalancer< T >::_loc, olb::LoadBalancer< T >::_rank, olb::LoadBalancer< T >::_size, olb::singleton::MpiManager::bCast(), olb::CPU_SIMD, olb::CPU_SISD, olb::singleton::MpiManager::getRank(), olb::singleton::MpiManager::getSize(), olb::GPU_CUDA, and olb::singleton::mpi().

+ Here is the call graph for this function:

Member Function Documentation

◆ platform()

template<typename T >
Platform olb::HeterogeneousLoadBalancer< T >::platform ( int loc) const
inline override virtual
Returns
target platform for processing of local cuboid

Reimplemented from olb::LoadBalancer< T >.

Definition at line 189 of file heterogeneousLoadBalancer.h.

189 {
190 return _platform[loc];
191 }
int loc(const int &glob)

References olb::LoadBalancer< T >::loc().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ setPlatform()

template<typename T >
void olb::HeterogeneousLoadBalancer< T >::setPlatform ( int loc,
Platform platform )
inline virtual

Reimplemented from olb::LoadBalancer< T >.

Definition at line 193 of file heterogeneousLoadBalancer.h.

193 {
194 _platform[loc] = platform;
195 }
Platform platform(int loc) const override

References olb::LoadBalancer< T >::loc(), and olb::HeterogeneousLoadBalancer< T >::platform().

+ Here is the call graph for this function:

The documentation for this class was generated from the following file: