OpenLB 1.7
Loading...
Searching...
No Matches
Public Member Functions | List of all members
olb::OrthogonalHeterogeneousLoadBalancer< T > Class Template Reference (final)

Load balancer for heterogeneous CPU-GPU systems. More...

#include <heterogeneousLoadBalancer.h>

+ Inheritance diagram for olb::OrthogonalHeterogeneousLoadBalancer< T >:
+ Collaboration diagram for olb::OrthogonalHeterogeneousLoadBalancer< T >:

Public Member Functions

 OrthogonalHeterogeneousLoadBalancer (CuboidGeometry< T, 3 > &cGeometry, T largeBlockFraction=0.9)
 
Platform platform (int loc) const override
 
void setPlatform (int loc, Platform platform)
 
- Public Member Functions inherited from olb::LoadBalancer< T >
 LoadBalancer (int size=1)
 Default empty constructor.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank)
 Constructor accepting existing balancing.
 
 LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank, std::map< int, Platform > &platform)
 Constructor accepting existing heterogeneous balancing.
 
virtual ~LoadBalancer ()
 Default empty destructor.
 
void swap (LoadBalancer< T > &loadBalancer)
 Swap method.
 
bool isLocal (const int &glob)
 returns whether glob is on this process
 
int loc (const int &glob)
 
int loc (int glob) const
 
int glob (int loc) const
 
int rank (const int &glob)
 
int rank (int glob) const
 
int size () const
 
int getRankSize () const
 
bool operator== (const LoadBalancer< T > &rhs) const
 equal operator
 
std::size_t getNblock () const override
 Number of data blocks for the serializable interface.
 
std::size_t getSerializableSize () const override
 Binary size for the serializer.
 
bool * getBlock (std::size_t iBlock, std::size_t &sizeBlock, bool loadingMode) override
 Return a pointer to the memory of the current block and its size for the serializable interface.
 
void print (bool multiOutput=false) const
 
- Public Member Functions inherited from olb::Serializable
virtual ~Serializable ()=default
 
template<bool includeLogOutputDir = true>
bool save (std::string fileName="", const bool enforceUint=false)
 Save Serializable into file fileName
 
template<bool includeLogOutputDir = true>
bool load (std::string fileName="", const bool enforceUint=false)
 Load Serializable from file fileName
 
bool save (std::uint8_t *buffer)
 Save Serializable into buffer of length getSerializableSize
 
bool load (const std::uint8_t *buffer)
 Load Serializable from buffer of length getSerializableSize
 
virtual void postLoad ()
 

Additional Inherited Members

- Protected Member Functions inherited from olb::BufferSerializable
template<typename DataType >
void registerSerializable (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of dynamic size.
 
template<typename DataType >
void registerStdVectorOfVars (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of primitive DataType (int, double, ...)
 
template<typename DataType >
void registerStdVectorOfSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of constant-sized Serializable
 
template<typename DataType >
void registerStdVectorOfSerializables (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
 Method for registering a std::vector<DataType> of dynamic-sized DataType
 
template<typename DataTypeKey , typename DataTypeValue >
void registerMap (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::map< DataTypeKey, DataTypeValue > &data, const bool loadingMode=false)
 Method for registering a std::map<DataTypeKey, DataTypeValue> of fixed-sized types (i.e. int, double)
 
size_t addSizeToBuffer (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, const size_t data) const
 Add a size_t to the sizeBuffer in the n-th util::round and return that size_t in all successive rounds.
 
- Protected Member Functions inherited from olb::Serializable
template<typename DataType >
void registerVar (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, const DataType &data, const size_t arrayLength=1) const
 Register primitive data types (int, double, ...) or arrays of those.
 
template<typename DataType >
void registerSerializableOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType &data, const bool loadingMode=false)
 Register Serializable object of constant size.
 
template<typename DataType >
void registerSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType *data, const size_t arrayLength, const bool loadingMode=false)
 Register an array of Serializable objects of constant size.
 
- Protected Attributes inherited from olb::LoadBalancer< T >
int _size
 number of cuboids after shrink -1 in appropriate thread
 
std::map< int, int > _loc
 maps global cuboid to (local) thread cuboid
 
std::vector< int > _glob
 content is 0,1,2,...,_size
 
std::map< int, int > _rank
 maps global cuboid number to the processing thread
 
std::map< int, Platform > _platform
 maps global cuboid number to local platform
 
- Protected Attributes inherited from olb::BufferSerializable
std::vector< bool * > _dataBuffer
 Data buffer for data that has to be buffered between two getBlock() iterations.
 
std::vector< size_t > _sizeBuffer
 std::vector of integer buffers (e.g. for std::vector size) to be buffered for the whole iteration process
 

Detailed Description

template<typename T>
class olb::OrthogonalHeterogeneousLoadBalancer< T >

Load balancer for heterogeneous CPU-GPU systems.

Assigns largest cuboids to GPUs until given largeBlockFraction is reached. Remaining (small) cuboids are assigned to ranks without GPUs.

This balancer should only be used in conjunction with a heterogeneous cuboid decomposition and appropriate, system specific, CPU_SIMD+OpenMP configuration.

Definition at line 209 of file heterogeneousLoadBalancer.h.

Constructor & Destructor Documentation

◆ OrthogonalHeterogeneousLoadBalancer()

template<typename T >
olb::OrthogonalHeterogeneousLoadBalancer< T >::OrthogonalHeterogeneousLoadBalancer ( CuboidGeometry< T, 3 > & cGeometry,
T largeBlockFraction = 0.9 )
inline

Definition at line 214 of file heterogeneousLoadBalancer.h.

214 :
215 LoadBalancer<T>(0)
216 {
217 OstreamManager clout(std::cout, "OrthogonalHeterogeneousLoadBalancer");
218
219 std::vector<int> rankBuffer(cGeometry.getNc(), 0);
220 std::vector<int> locBuffer(cGeometry.getNc(), 0);
221 std::vector<std::uint8_t> platformBuffer(cGeometry.getNc(), static_cast<std::uint8_t>(Platform::CPU_SISD));
222
223 const auto nRank = singleton::mpi().getSize();
224 const auto iRank = singleton::mpi().getRank();
225
226 auto& cuboids = cGeometry.cuboids();
227 // Prioritize assignment of largest cuboids
228 std::sort(cuboids.begin(), cuboids.end(),
229 [&](const auto& lhs, const auto& rhs) {
230 return lhs.getLatticeVolume() > rhs.getLatticeVolume();
231 });
232
233 int localPreferredPlatform = static_cast<int>(Platform::CPU_SIMD);
234 #ifdef PLATFORM_GPU_CUDA
235 if (gpu::cuda::device::getCount() > 0) {
236 localPreferredPlatform = static_cast<int>(Platform::GPU_CUDA);
237 }
238 #endif
239 std::vector<int> preferredPlatform(cuboids.size(), -1);
240 #ifdef PARALLEL_MODE_MPI
241 singleton::mpi().gather(&localPreferredPlatform, 1,
242 preferredPlatform.data(), 1);
243 #endif
244
245 // Distribute cuboids to ranks on rank 0
246 if (iRank == 0) {
247 std::set<int> cpuRanks;
248 std::set<int> gpuRanks;
249 for (int jRank=0; jRank < singleton::mpi().getSize(); ++jRank) {
250 switch (static_cast<Platform>(preferredPlatform[jRank])) {
251 case Platform::CPU_SIMD:
252 cpuRanks.insert(jRank);
253 break;
254 case Platform::GPU_CUDA:
255 gpuRanks.insert(jRank);
256 break;
257 default:
258 break;
259 }
260 }
261
262 // Total volume for tracking targeted GPU block fraction
263 std::size_t totalVolume = std::accumulate(cuboids.begin(), cuboids.end(),
264 std::size_t{0},
265 [](const auto& currVolume, const auto& rhs) -> std::size_t {
266 return currVolume + rhs.getLatticeVolume();
267 });
268
269 std::map<int,int> nLoc;
270 int jRank = 0;
271 int iCuboid = 0;
272
273 std::size_t globalAssignedVolume = 0;
274 std::vector<std::size_t> localAssignedVolume(nRank, 0);
275 // Prevent GPU assignment to CPU ranks
276 for (int jRank : cpuRanks) {
277 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
278 }
279
280 // Assign largest cuboids to GPUs until desired fraction is reached
281 do {
282 jRank = std::distance(localAssignedVolume.begin(),
283 std::min_element(localAssignedVolume.begin(),
284 localAssignedVolume.end()));
285 rankBuffer[iCuboid] = jRank;
286 platformBuffer[iCuboid] = static_cast<std::uint8_t>(Platform::GPU_CUDA);
287 locBuffer[iCuboid] = nLoc[jRank]++;
288 localAssignedVolume[jRank] += cuboids[iCuboid].getLatticeVolume();
289 globalAssignedVolume += cuboids[iCuboid].getLatticeVolume();
290 clout << iCuboid << ", " << jRank << ", assignedVolumeFraction=" << (1.*globalAssignedVolume) / totalVolume << std::endl;
291 iCuboid += 1;
292 } while (globalAssignedVolume < largeBlockFraction*totalVolume);
293
294 for (int iRank : gpuRanks) {
295 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
296 }
297
298 // Prevent CPU assignment to GPU ranks
299 for (int jRank : cpuRanks) {
300 localAssignedVolume[jRank] = 0;
301 }
302 for (int jRank : gpuRanks) {
303 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
304 }
305
306 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
307 jRank = std::distance(localAssignedVolume.begin(),
308 std::min_element(localAssignedVolume.begin(),
309 localAssignedVolume.end()));
310 rankBuffer[jCuboid] = jRank;
311 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
312 locBuffer[jCuboid] = nLoc[jRank]++;
313 localAssignedVolume[jRank] += cuboids[jCuboid].getLatticeVolume();
314 clout << jCuboid << ", " << jRank << std::endl;
315 }
316
317 for (int iRank : cpuRanks) {
318 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
319 }
320 }
321
322 #ifdef PARALLEL_MODE_MPI
323 // Broadcast assignments to all processes
324 singleton::mpi().bCast(rankBuffer.data(), rankBuffer.size());
325 singleton::mpi().bCast(locBuffer.data(), locBuffer.size());
326 singleton::mpi().bCast(platformBuffer.data(), platformBuffer.size());
327 #endif
328
329 // Update internal LoadBalancer structure to match given assignment
330 for (int iCuboid=0; iCuboid < cGeometry.getNc(); ++iCuboid) {
331 this->_rank[iCuboid] = rankBuffer[iCuboid];
332 this->_loc[iCuboid] = locBuffer[iCuboid];
333
334 if (rankBuffer[iCuboid] == iRank) {
335 this->_glob.resize(std::max(int(this->_glob.size()), this->_loc[iCuboid]+1));
336 this->_platform.resize(this->_glob.size());
337
338 this->_glob[this->_loc[iCuboid]] = iCuboid;
339 this->_platform[this->_loc[iCuboid]] = static_cast<Platform>(platformBuffer[iCuboid]);
340
341 this->_size = this->_glob.size();
342 }
343 }
344 }
std::vector< int > _glob
content is 0,1,2,...,_size
int _size
number of cuboids after shrink -1 in appropriate thread
std::map< int, int > _rank
maps global cuboid number to the processing thread
std::map< int, int > _loc
maps global cuboid to (local) thread cuboid
void gather(T *sendBuf, int sendCount, T *recvBuf, int recvCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Gather data from multiple processors to one processor.
void bCast(T *sendBuf, int sendCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Broadcast data from one processor to multiple processors.
int getSize() const
Returns the number of processes.
int getRank() const
Returns the process ID.
int getCount()
Return number of available devices.
Definition device.hh:42
MpiManager & mpi()
Platform
OpenLB execution targets.
Definition platform.h:36
@ CPU_SISD
Basic scalar CPU.
@ CPU_SIMD
Vector CPU (AVX2 / AVX-512 collision)
@ GPU_CUDA
GPU using CUDA.

References olb::LoadBalancer< T >::_glob, olb::LoadBalancer< T >::_loc, olb::LoadBalancer< T >::_rank, olb::LoadBalancer< T >::_size, olb::singleton::MpiManager::bCast(), olb::CPU_SIMD, olb::CPU_SISD, olb::singleton::MpiManager::gather(), olb::gpu::cuda::device::getCount(), olb::singleton::MpiManager::getRank(), olb::singleton::MpiManager::getSize(), olb::GPU_CUDA, and olb::singleton::mpi().

+ Here is the call graph for this function:

Member Function Documentation

◆ platform()

template<typename T >
Platform olb::OrthogonalHeterogeneousLoadBalancer< T >::platform ( int loc) const
inline override virtual
Returns
target platform for processing of local cuboid

Reimplemented from olb::LoadBalancer< T >.

Definition at line 346 of file heterogeneousLoadBalancer.h.

346 {
347 return _platform[loc];
348 }
int loc(const int &glob)

References olb::LoadBalancer< T >::loc().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ setPlatform()

template<typename T >
void olb::OrthogonalHeterogeneousLoadBalancer< T >::setPlatform ( int loc,
Platform platform )
inline virtual

Reimplemented from olb::LoadBalancer< T >.

Definition at line 350 of file heterogeneousLoadBalancer.h.

350 {
351 _platform[loc] = platform;
352 }

References olb::LoadBalancer< T >::loc(), and olb::OrthogonalHeterogeneousLoadBalancer< T >::platform().

+ Here is the call graph for this function:

The documentation for this class was generated from the following file: