OpenLB 1.8.1
Loading...
Searching...
No Matches
olb::OrthogonalHeterogeneousLoadBalancer< T > Class Template Reference final

Load balancer for heterogeneous CPU-GPU systems. More...

#include <heterogeneousLoadBalancer.h>

+ Inheritance diagram for olb::OrthogonalHeterogeneousLoadBalancer< T >:
+ Collaboration diagram for olb::OrthogonalHeterogeneousLoadBalancer< T >:

Public Member Functions

 OrthogonalHeterogeneousLoadBalancer (CuboidDecomposition< T, 3 > &cGeometry, T largeBlockFraction=0.9)
 
Platform platform (int loc) const override
 
void setPlatform (int loc, Platform platform)
 
- Public Member Functions inherited from olb::LoadBalancer< T >
 LoadBalancer (int size=1)
 Default empty constructor.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank)
 Constructor accepting existing balancing.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank, std::map< int, Platform > &platform)
 Constructor accepting existing heterogeneous balancing.
 
virtual ~LoadBalancer ()
 Default empty destructor.
 
void swap (LoadBalancer< T > &loadBalancer)
 Swap method.
 
bool isLocal (const int &glob) const
 returns whether glob is on this process
 
bool isLocal (Platform platform) const
 returns whether there is a block on platform in this process
 
int loc (const int &glob)
 
int loc (int glob) const
 
int glob (int loc) const
 
int rank (int glob) const
 
int size () const
 
int getRankSize () const
 
virtual bool doOutput (int glob) const
 
virtual void setDoOutput (int glob, bool doOutput)
 
bool operator== (const LoadBalancer< T > &rhs) const
 equal operator
 
std::size_t getNblock () const override
 Number of data blocks for the serializable interface.
 
std::size_t getSerializableSize () const override
 Binary size for the serializer.
 
bool * getBlock (std::size_t iBlock, std::size_t &sizeBlock, bool loadingMode) override
 Return a pointer to the memory of the current block and its size for the serializable interface.
 
void print (bool multiOutput=false) const
 
- Public Member Functions inherited from olb::Serializable
virtual ~Serializable ()=default
 
template<bool includeLogOutputDir = true>
bool save (std::string fileName="", const bool enforceUint=false)
 Save Serializable into file fileName
 
template<bool includeLogOutputDir = true>
bool load (std::string fileName="", const bool enforceUint=false)
 Load Serializable from file fileName
 
bool save (std::uint8_t *buffer)
 Save Serializable into buffer of length getSerializableSize
 
bool load (const std::uint8_t *buffer)
 Load Serializable from buffer of length getSerializableSize
 
virtual void postLoad ()
 

Additional Inherited Members

- Protected Member Functions inherited from olb::BufferSerializable
template<typename DataType >
void registerSerializable (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of dynamic size.
 
template<typename DataType >
void registerStdVectorOfVars (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of primitive DataType (int, double, ...)
 
template<typename DataType >
void registerStdVectorOfSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of constant-sized Serializable
 
template<typename DataType >
void registerStdVectorOfSerializables (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of dynamic-sized DataType
 
template<typename DataTypeKey , typename DataTypeValue >
void registerMap (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::map< DataTypeKey, DataTypeValue > &data, const bool loadingMode=false)
 Method for registering a std::map<DataTypeKey, DataTypeValue> of fixed-sized types (i.e. int, double)
 
size_t addSizeToBuffer (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, const size_t data) const
 Add a size_t to the sizeBuffer in the n-th round and return that size_t in all successive rounds.
 
- Protected Member Functions inherited from olb::Serializable
template<typename DataType >
void registerVar (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, const DataType &data, const size_t arrayLength=1) const
 Register primitive data types (int, double, ...) or arrays of those.
 
template<typename DataType >
void registerSerializableOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of constant size.
 
template<typename DataType >
void registerSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType *data, const size_t arrayLength, const bool loadingMode=false)
 Register an array of Serializable objects of constant size.
 
- Protected Attributes inherited from olb::LoadBalancer< T >
int _size
 number of cuboids after shrink -1 in appropriate thread
 
std::map< int, int > _loc
 maps global cuboid to (local) thread cuboid
 
std::vector< int > _glob
 content is 0,1,2,...,_size
 
std::map< int, int > _rank
 maps global cuboid number to the processing thread
 
std::map< int, Platform > _platform
 maps global cuboid number to local platform
 
std::map< int, bool > _doOutput
 defines if global cuboid number has state doOutput
 
- Protected Attributes inherited from olb::BufferSerializable
std::vector< bool * > _dataBuffer
 Data buffer for data that has to be buffered between two getBlock() iterations.
 
std::vector< size_t > _sizeBuffer
 std::vector of integer buffers (e.g. for std::vector size) to be buffered for the whole iteration process
 

Detailed Description

template<typename T>
class olb::OrthogonalHeterogeneousLoadBalancer< T >

Load balancer for heterogeneous CPU-GPU systems.

Assigns largest cuboids to GPUs until given largeBlockFraction is reached. Remaining (small) cuboids are assigned to ranks without GPUs.

This balancer should only be used in conjunction with a heterogeneous cuboid decomposition and an appropriate, system-specific CPU_SIMD+OpenMP configuration.

Definition at line 209 of file heterogeneousLoadBalancer.h.

Constructor & Destructor Documentation

◆ OrthogonalHeterogeneousLoadBalancer()

template<typename T >
olb::OrthogonalHeterogeneousLoadBalancer< T >::OrthogonalHeterogeneousLoadBalancer ( CuboidDecomposition< T, 3 > & cGeometry,
T largeBlockFraction = 0.9 )
inline

Definition at line 214 of file heterogeneousLoadBalancer.h.

214 :
216 {
217 OstreamManager clout(std::cout, "OrthogonalHeterogeneousLoadBalancer");
218
219 std::vector<int> rankBuffer(cGeometry.size(), 0);
220 std::vector<int> locBuffer(cGeometry.size(), 0);
221 std::vector<std::uint8_t> platformBuffer(cGeometry.size(), static_cast<std::uint8_t>(Platform::CPU_SISD));
222
223 const auto nRank = singleton::mpi().getSize();
224 const auto iRank = singleton::mpi().getRank();
225
226 auto& cuboids = cGeometry.cuboids();
227 // Prioritize assignment of largest cuboids
228 std::sort(cuboids.begin(), cuboids.end(),
229 [&](const auto& lhs, const auto& rhs) {
230 return lhs.getLatticeVolume() > rhs.getLatticeVolume();
231 });
232
233 #if defined(PARALLEL_MODE_MPI) || defined(PLATFORM_GPU_CUDA)
234 int localPreferredPlatform = static_cast<int>(Platform::CPU_SIMD);
235 #endif
236 #ifdef PLATFORM_GPU_CUDA
237 if (gpu::cuda::device::getCount() > 0) {
238 localPreferredPlatform = static_cast<int>(Platform::GPU_CUDA);
239 }
240 #endif
241 std::vector<int> preferredPlatform(cuboids.size(), -1);
242 #ifdef PARALLEL_MODE_MPI
243 singleton::mpi().gather(&localPreferredPlatform, 1,
244 preferredPlatform.data(), 1);
245 #endif
246
247 // Distribute cuboids to ranks on rank 0
248 if (iRank == 0) {
249 std::set<int> cpuRanks;
250 std::set<int> gpuRanks;
251 for (int jRank=0; jRank < singleton::mpi().getSize(); ++jRank) {
252 switch (static_cast<Platform>(preferredPlatform[jRank])) {
253 case Platform::CPU_SIMD:
254 cpuRanks.insert(jRank);
255 break;
256 case Platform::GPU_CUDA:
257 gpuRanks.insert(jRank);
258 break;
259 default:
260 break;
261 }
262 }
263
264 // Total volume for tracking targeted GPU block fraction
265 std::size_t totalVolume = std::accumulate(cuboids.begin(), cuboids.end(),
266 std::size_t{0},
267 [](const auto& currVolume, const auto& rhs) -> std::size_t {
268 return currVolume + rhs.getLatticeVolume();
269 });
270
271 std::map<int,int> nLoc;
272 int jRank = 0;
273 int iCuboid = 0;
274
275 std::size_t globalAssignedVolume = 0;
276 std::vector<std::size_t> localAssignedVolume(nRank, 0);
277 // Prevent GPU assignment to CPU ranks
278 for (int jRank : cpuRanks) {
279 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
280 }
281
282 // Assign largest cuboids to GPUs until desired fraction is reached
283 do {
284 jRank = std::distance(localAssignedVolume.begin(),
285 std::min_element(localAssignedVolume.begin(),
286 localAssignedVolume.end()));
287 rankBuffer[iCuboid] = jRank;
288 platformBuffer[iCuboid] = static_cast<std::uint8_t>(Platform::GPU_CUDA);
289 locBuffer[iCuboid] = nLoc[jRank]++;
290 localAssignedVolume[jRank] += cuboids[iCuboid].getLatticeVolume();
291 globalAssignedVolume += cuboids[iCuboid].getLatticeVolume();
292 clout << iCuboid << ", " << jRank << ", assignedVolumeFraction=" << (1.*globalAssignedVolume) / totalVolume << std::endl;
293 iCuboid += 1;
294 } while (globalAssignedVolume < largeBlockFraction*totalVolume);
295
296 for (int iRank : gpuRanks) {
297 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
298 }
299
300 // Prevent CPU assignment to GPU ranks
301 for (int jRank : cpuRanks) {
302 localAssignedVolume[jRank] = 0;
303 }
304 for (int jRank : gpuRanks) {
305 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
306 }
307
308 for (int jCuboid=iCuboid; jCuboid < cGeometry.size(); ++jCuboid) {
309 jRank = std::distance(localAssignedVolume.begin(),
310 std::min_element(localAssignedVolume.begin(),
311 localAssignedVolume.end()));
312 rankBuffer[jCuboid] = jRank;
313 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
314 locBuffer[jCuboid] = nLoc[jRank]++;
315 localAssignedVolume[jRank] += cuboids[jCuboid].getLatticeVolume();
316 clout << jCuboid << ", " << jRank << std::endl;
317 }
318
319 for (int iRank : cpuRanks) {
320 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
321 }
322 }
323
324 #ifdef PARALLEL_MODE_MPI
325 // Broadcast assignments to all processes
326 singleton::mpi().bCast(rankBuffer.data(), rankBuffer.size());
327 singleton::mpi().bCast(locBuffer.data(), locBuffer.size());
328 singleton::mpi().bCast(platformBuffer.data(), platformBuffer.size());
329 #endif
330
331 // Update internal LoadBalancer structure to match given assignment
332 for (int iCuboid=0; iCuboid < cGeometry.size(); ++iCuboid) {
333 this->_rank[iCuboid] = rankBuffer[iCuboid];
334 this->_loc[iCuboid] = locBuffer[iCuboid];
335
336 if (rankBuffer[iCuboid] == iRank) {
337 this->_glob.resize(std::max(int(this->_glob.size()), this->_loc[iCuboid]+1));
338 this->_platform.resize(this->_glob.size());
339
340 this->_glob[this->_loc[iCuboid]] = iCuboid;
341 this->_platform[this->_loc[iCuboid]] = static_cast<Platform>(platformBuffer[iCuboid]);
342
343 this->_size = this->_glob.size();
344 }
345 }
346 }
std::vector< int > _glob
content is 0,1,2,...,_size
int _size
number of cuboids after shrink -1 in appropriate thread
std::map< int, int > _rank
maps global cuboid number to the processing thread
std::map< int, int > _loc
maps global cuboid to (local) thread cuboid
LoadBalancer(int size=1)
Default empty constructor.
void gather(T *sendBuf, int sendCount, T *recvBuf, int recvCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Gather data from multiple processors to one processor.
void bCast(T *sendBuf, int sendCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Broadcast data from one processor to multiple processors.
int getSize() const
Returns the number of processes.
int getRank() const
Returns the process ID.
int getCount()
Return number of available devices.
Definition device.hh:42
MpiManager & mpi()
Platform
OpenLB execution targets.
Definition platform.h:35
@ CPU_SISD
Basic scalar CPU.
@ CPU_SIMD
Vector CPU (AVX2 / AVX-512 collision)
@ GPU_CUDA
GPU (CUDA)

References olb::LoadBalancer< T >::_glob, olb::LoadBalancer< T >::_loc, olb::LoadBalancer< T >::_rank, olb::LoadBalancer< T >::_size, olb::singleton::MpiManager::bCast(), olb::CPU_SIMD, olb::CPU_SISD, olb::CuboidDecomposition< T, D >::cuboids(), olb::singleton::MpiManager::gather(), olb::gpu::cuda::device::getCount(), olb::singleton::MpiManager::getRank(), olb::singleton::MpiManager::getSize(), olb::GPU_CUDA, olb::singleton::mpi(), and olb::CuboidDecomposition< T, D >::size().

+ Here is the call graph for this function:

Member Function Documentation

◆ platform()

template<typename T >
Platform olb::OrthogonalHeterogeneousLoadBalancer< T >::platform ( int loc) const
inline override virtual
Returns
target platform for processing of local cuboid

Reimplemented from olb::LoadBalancer< T >.

Definition at line 348 of file heterogeneousLoadBalancer.h.

348 {
349 return _platform[loc];
350 }
int loc(const int &glob)

References olb::LoadBalancer< T >::loc().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ setPlatform()

template<typename T >
void olb::OrthogonalHeterogeneousLoadBalancer< T >::setPlatform ( int loc,
Platform platform )
inline virtual

Reimplemented from olb::LoadBalancer< T >.

Definition at line 352 of file heterogeneousLoadBalancer.h.

352 {
353 _platform[loc] = platform;
354 }

References olb::LoadBalancer< T >::loc(), and olb::OrthogonalHeterogeneousLoadBalancer< T >::platform().

+ Here is the call graph for this function:

The documentation for this class was generated from the following file: