OpenLB 1.7
Loading...
Searching...
No Matches
Public Member Functions | List of all members
olb::OrthogonalHeterogeneousLoadBalancer< T > Class Template Reference (final)

Load balancer for heterogeneous CPU-GPU systems. More...

#include <heterogeneousLoadBalancer.h>

+ Inheritance diagram for olb::OrthogonalHeterogeneousLoadBalancer< T >:
+ Collaboration diagram for olb::OrthogonalHeterogeneousLoadBalancer< T >:

Public Member Functions

 OrthogonalHeterogeneousLoadBalancer (CuboidGeometry< T, 3 > &cGeometry, T largeBlockFraction=0.9)
 
Platform platform (int loc) const override
 
void setPlatform (int loc, Platform platform)
 
- Public Member Functions inherited from olb::LoadBalancer< T >
 LoadBalancer (int size=1)
 Default empty constructor.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank)
 Constructor accepting existing balancing.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank, std::map< int, Platform > &platform)
 Constructor accepting existing heterogeneous balancing.
 
virtual ~LoadBalancer ()
 Default empty destructor.
 
void swap (LoadBalancer< T > &loadBalancer)
 Swap method.
 
bool isLocal (const int &glob)
 returns whether glob is on this process
 
int loc (const int &glob)
 
int loc (int glob) const
 
int glob (int loc) const
 
int rank (const int &glob)
 
int rank (int glob) const
 
int size () const
 
int getRankSize () const
 
bool operator== (const LoadBalancer< T > &rhs) const
 equal operator
 
std::size_t getNblock () const override
 Number of data blocks for the serializable interface.
 
std::size_t getSerializableSize () const override
 Binary size for the serializer.
 
bool * getBlock (std::size_t iBlock, std::size_t &sizeBlock, bool loadingMode) override
 Return a pointer to the memory of the current block and its size for the serializable interface.
 
void print (bool multiOutput=false) const
 
- Public Member Functions inherited from olb::Serializable
virtual ~Serializable ()=default
 
template<bool includeLogOutputDir = true>
bool save (std::string fileName="", const bool enforceUint=false)
 Save Serializable into file fileName
 
template<bool includeLogOutputDir = true>
bool load (std::string fileName="", const bool enforceUint=false)
 Load Serializable from file fileName
 
bool save (std::uint8_t *buffer)
 Save Serializable into buffer of length getSerializableSize
 
bool load (const std::uint8_t *buffer)
 Load Serializable from buffer of length getSerializableSize
 
virtual void postLoad ()
 

Additional Inherited Members

- Protected Member Functions inherited from olb::BufferSerializable
template<typename DataType >
void registerSerializable (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of dynamic size.
 
template<typename DataType >
void registerStdVectorOfVars (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of primitive DataType (int, double, ...)
 
template<typename DataType >
void registerStdVectorOfSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of constant-sized Serializable
 
template<typename DataType >
void registerStdVectorOfSerializables (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of dynamic-sized DataType
 
template<typename DataTypeKey , typename DataTypeValue >
void registerMap (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::map< DataTypeKey, DataTypeValue > &data, const bool loadingMode=false)
 Method for registering a std::map<DataTypeKey, DataTypeValue> of fixed-sized types (i.e. int, double)
 
size_t addSizeToBuffer (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, const size_t data) const
 Add a size_t to the sizeBuffer in the n-th util::round and return that size_t in all successive rounds.
 
- Protected Member Functions inherited from olb::Serializable
template<typename DataType >
void registerVar (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, const DataType &data, const size_t arrayLength=1) const
 Register primitive data types (int, double, ...) or arrays of those.
 
template<typename DataType >
void registerSerializableOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of constant size.
 
template<typename DataType >
void registerSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType *data, const size_t arrayLength, const bool loadingMode=false)
 Register an array of Serializable objects of constant size.
 
- Protected Attributes inherited from olb::LoadBalancer< T >
int _size
 number of cuboids after shrink -1 in appropriate thread
 
std::map< int, int > _loc
 maps global cuboid to (local) thread cuboid
 
std::vector< int > _glob
 content is 0,1,2,...,_size
 
std::map< int, int > _rank
 maps global cuboid number to the processing thread
 
std::map< int, Platform > _platform
 maps global cuboid number to local platform
 
- Protected Attributes inherited from olb::BufferSerializable
std::vector< bool * > _dataBuffer
 Data buffer for data that has to be buffered between two getBlock() iterations.
 
std::vector< size_t > _sizeBuffer
 std::vector of integer buffers (e.g. for std::vector size) to be buffered for the whole iteration process
 

Detailed Description

template<typename T>
class olb::OrthogonalHeterogeneousLoadBalancer< T >

Load balancer for heterogeneous CPU-GPU systems.

Assigns largest cuboids to GPUs until given largeBlockFraction is reached. Remaining (small) cuboids are assigned to ranks without GPUs.

This balancer should only be used in conjunction with a heterogeneous cuboid decomposition and appropriate, system specific, CPU_SIMD+OpenMP configuration.

Definition at line 209 of file heterogeneousLoadBalancer.h.

Constructor & Destructor Documentation

◆ OrthogonalHeterogeneousLoadBalancer()

template<typename T >
olb::OrthogonalHeterogeneousLoadBalancer< T >::OrthogonalHeterogeneousLoadBalancer ( CuboidGeometry< T, 3 > & cGeometry,
T largeBlockFraction = 0.9 )
inline

Definition at line 214 of file heterogeneousLoadBalancer.h.

214 :
215 LoadBalancer<T>(0)
216 {
217 OstreamManager clout(std::cout, "OrthogonalHeterogeneousLoadBalancer");
218
219 std::vector<int> rankBuffer(cGeometry.getNc(), 0);
220 std::vector<int> locBuffer(cGeometry.getNc(), 0);
221 std::vector<std::uint8_t> platformBuffer(cGeometry.getNc(), static_cast<std::uint8_t>(Platform::CPU_SISD));
222
223 const auto nRank = singleton::mpi().getSize();
224 const auto iRank = singleton::mpi().getRank();
225
226 auto& cuboids = cGeometry.cuboids();
227 // Prioritize assignment of largest cuboids
228 std::sort(cuboids.begin(), cuboids.end(),
229 [&](const auto& lhs, const auto& rhs) {
230 return lhs.getLatticeVolume() > rhs.getLatticeVolume();
231 });
232
233 int localPreferredPlatform = static_cast<int>(Platform::CPU_SIMD);
234 #ifdef PLATFORM_GPU_CUDA
235 if (gpu::cuda::device::getCount() > 0) {
236 localPreferredPlatform = static_cast<int>(Platform::GPU_CUDA);
237 }
238 #endif
239 std::vector<int> preferredPlatform(cuboids.size(), -1);
240 #ifdef PARALLEL_MODE_MPI
241 singleton::mpi().gather(&localPreferredPlatform, 1,
242 preferredPlatform.data(), 1);
243 #endif
244
245 // Distribute cuboids to ranks on rank 0
246 if (iRank == 0) {
247 std::set<int> cpuRanks;
248 std::set<int> gpuRanks;
249 for (int jRank=0; jRank < singleton::mpi().getSize(); ++jRank) {
250 switch (static_cast<Platform>(preferredPlatform[jRank])) {
251 case Platform::CPU_SIMD:
252 cpuRanks.insert(jRank);
253 break;
254 case Platform::GPU_CUDA:
255 gpuRanks.insert(jRank);
256 break;
257 default:
258 break;
259 }
260 }
261
262 // Total volume for tracking targeted GPU block fraction
263 std::size_t totalVolume = std::accumulate(cuboids.begin(), cuboids.end(),
264 std::size_t{0},
265 [](const auto& currVolume, const auto& rhs) -> std::size_t {
266 return currVolume + rhs.getLatticeVolume();
267 });
268
269 std::map<int,int> nLoc;
270 int jRank = 0;
271 int iCuboid = 0;
272
273 std::size_t globalAssignedVolume = 0;
274 std::vector<std::size_t> localAssignedVolume(nRank, 0);
275 // Prevent GPU assignment to CPU ranks
276 for (int jRank : cpuRanks) {
277 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
278 }
279
280 // Assign largest cuboids to GPUs until desired fraction is reached
281 do {
282 jRank = std::distance(localAssignedVolume.begin(),
283 std::min_element(localAssignedVolume.begin(),
284 localAssignedVolume.end()));
285 rankBuffer[iCuboid] = jRank;
286 platformBuffer[iCuboid] = static_cast<std::uint8_t>(Platform::GPU_CUDA);
287 locBuffer[iCuboid] = nLoc[jRank]++;
288 localAssignedVolume[jRank] += cuboids[iCuboid].getLatticeVolume();
289 globalAssignedVolume += cuboids[iCuboid].getLatticeVolume();
290 clout << iCuboid << ", " << jRank << ", assignedVolumeFraction=" << (1.*globalAssignedVolume) / totalVolume << std::endl;
291 iCuboid += 1;
292 } while (globalAssignedVolume < largeBlockFraction*totalVolume);
293
294 for (int iRank : gpuRanks) {
295 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
296 }
297
298 // Prevent CPU assignment to GPU ranks
299 for (int jRank : cpuRanks) {
300 localAssignedVolume[jRank] = 0;
301 }
302 for (int jRank : gpuRanks) {
303 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
304 }
305
306 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
307 jRank = std::distance(localAssignedVolume.begin(),
308 std::min_element(localAssignedVolume.begin(),
309 localAssignedVolume.end()));
310 rankBuffer[jCuboid] = jRank;
311 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
312 locBuffer[jCuboid] = nLoc[jRank]++;
313 localAssignedVolume[jRank] += cuboids[jCuboid].getLatticeVolume();
314 clout << jCuboid << ", " << jRank << std::endl;
315 }
316
317 for (int iRank : cpuRanks) {
318 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
319 }
320 }
321
322 #ifdef PARALLEL_MODE_MPI
323 // Broadcast assignments to all processes
324 singleton::mpi().bCast(rankBuffer.data(), rankBuffer.size());
325 singleton::mpi().bCast(locBuffer.data(), locBuffer.size());
326 singleton::mpi().bCast(platformBuffer.data(), platformBuffer.size());
327 #endif
328
329 // Update internal LoadBalancer structure to match given assignment
330 for (int iCuboid=0; iCuboid < cGeometry.getNc(); ++iCuboid) {
331 this->_rank[iCuboid] = rankBuffer[iCuboid];
332 this->_loc[iCuboid] = locBuffer[iCuboid];
333
334 if (rankBuffer[iCuboid] == iRank) {
335 this->_glob.resize(std::max(int(this->_glob.size()), this->_loc[iCuboid]+1));
336 this->_platform.resize(this->_glob.size());
337
338 this->_glob[this->_loc[iCuboid]] = iCuboid;
339 this->_platform[this->_loc[iCuboid]] = static_cast<Platform>(platformBuffer[iCuboid]);
340
341 this->_size = this->_glob.size();
342 }
343 }
344 }
std::vector< int > _glob
content is 0,1,2,...,_size
int _size
number of cuboids after shrink -1 in appropriate thread
std::map< int, int > _rank
maps global cuboid number to the processing thread
std::map< int, int > _loc
maps global cuboid to (local) thread cuboid
void gather(T *sendBuf, int sendCount, T *recvBuf, int recvCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Gather data from multiple processors to one processor.
void bCast(T *sendBuf, int sendCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Broadcast data from one processor to multiple processors.
int getSize() const
Returns the number of processes.
int getRank() const
Returns the process ID.
int getCount()
Return number of available devices.
Definition device.hh:42
MpiManager & mpi()
Platform
OpenLB execution targets.
Definition platform.h:36
@ CPU_SISD
Basic scalar CPU.
@ CPU_SIMD
Vector CPU (AVX2 / AVX-512 collision)
@ GPU_CUDA
GPU using CUDA.

References olb::LoadBalancer< T >::_glob, olb::LoadBalancer< T >::_loc, olb::LoadBalancer< T >::_rank, olb::LoadBalancer< T >::_size, olb::singleton::MpiManager::bCast(), olb::CPU_SIMD, olb::CPU_SISD, olb::singleton::MpiManager::gather(), olb::gpu::cuda::device::getCount(), olb::singleton::MpiManager::getRank(), olb::singleton::MpiManager::getSize(), olb::GPU_CUDA, and olb::singleton::mpi().

+ Here is the call graph for this function:

Member Function Documentation

◆ platform()

template<typename T >
Platform olb::OrthogonalHeterogeneousLoadBalancer< T >::platform ( int loc) const
inline override virtual
Returns
target platform for processing of local cuboid

Reimplemented from olb::LoadBalancer< T >.

Definition at line 346 of file heterogeneousLoadBalancer.h.

346 {
347 return _platform[loc];
348 }
int loc(const int &glob)

References olb::LoadBalancer< T >::loc().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ setPlatform()

template<typename T >
void olb::OrthogonalHeterogeneousLoadBalancer< T >::setPlatform ( int loc,
Platform platform )
inline virtual

Reimplemented from olb::LoadBalancer< T >.

Definition at line 350 of file heterogeneousLoadBalancer.h.

350 {
351 _platform[loc] = platform;
352 }

References olb::LoadBalancer< T >::loc(), and olb::OrthogonalHeterogeneousLoadBalancer< T >::platform().

+ Here is the call graph for this function:

The documentation for this class was generated from the following file: