Load balancer for heterogeneous CPU-GPU systems. More...

#include <heterogeneousLoadBalancer.h>

Inheritance diagram for olb::HeterogeneousLoadBalancer< T >:

Collaboration diagram for olb::HeterogeneousLoadBalancer< T >:

Public Member Functions
	HeterogeneousLoadBalancer (CuboidGeometry< T, 3 > &cGeometry, T largeBlockFraction=0.9)

Platform	platform (int loc) const override

void	setPlatform (int loc, Platform platform)

Public Member Functions inherited from olb::LoadBalancer< T >
	LoadBalancer (int size=1)
	Default empty constructor.

	LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank)
	Constructor accepting existing balancing.

	LoadBalancer (int size, std::map< int, int > &loc, std::vector< int > &glob, std::map< int, int > &rank, std::map< int, Platform > &platform)
	Constructor accepting existing heterogeneous balancing.

virtual	~LoadBalancer ()
	Default empty destructor.

void	swap (LoadBalancer< T > &loadBalancer)
	Swap method.

bool	isLocal (const int &glob)
	returns whether `glob` is on this process

int	loc (const int &glob)

int	loc (int glob) const

int	glob (int loc) const

int	rank (const int &glob)

int	rank (int glob) const

int	size () const

int	getRankSize () const

bool	operator== (const LoadBalancer< T > &rhs) const
	equal operator

std::size_t	getNblock () const override
	Number of data blocks for the serializable interface.

std::size_t	getSerializableSize () const override
	Binary size for the serializer.

bool *	getBlock (std::size_t iBlock, std::size_t &sizeBlock, bool loadingMode) override
	Return a pointer to the memory of the current block and its size for the serializable interface.

void	print (bool multiOutput=false) const

Public Member Functions inherited from olb::Serializable
virtual	~Serializable ()=default

template<bool includeLogOutputDir = true>
bool	save (std::string fileName="", const bool enforceUint=false)
	Save `Serializable` into file `fileName`

template<bool includeLogOutputDir = true>
bool	load (std::string fileName="", const bool enforceUint=false)
	Load `Serializable` from file `fileName`

bool	save (std::uint8_t *buffer)
	Save `Serializable` into buffer of length `getSerializableSize`

bool	load (const std::uint8_t *buffer)
	Load `Serializable` from buffer of length `getSerializableSize`

virtual void	postLoad ()

Additional Inherited Members
Protected Member Functions inherited from olb::BufferSerializable
template<typename DataType >
void	registerSerializable (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, DataType &data, const bool loadingMode=false)
	Register `Serializable` object of dynamic size.

template<typename DataType >
void	registerStdVectorOfVars (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
	Method for registering a `std::vector<DataType>` of primitive `DataType` (`int`, `double`, ...)

template<typename DataType >
void	registerStdVectorOfSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
	Method for registering a `std::vector<DataType>` of constant-sized `Serializable`

template<typename DataType >
void	registerStdVectorOfSerializables (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::vector< DataType > &data, const bool loadingMode=false)
	Method for registering a `std::vector<DataType>` of dynamic-sized `DataType`

template<typename DataTypeKey , typename DataTypeValue >
void	registerMap (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, std::map< DataTypeKey, DataTypeValue > &data, const bool loadingMode=false)
	Method for registering a `std::map<DataTypeKey, DataTypeValue>` of fixed-sized types (i.e. `int`, `double`)

size_t	addSizeToBuffer (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, size_t &sizeBufferIndex, bool *&dataPtr, const size_t data) const
	Add a `size_t` to the `sizeBuffer` in the `n-th` round and return that `size_t` in all successive rounds.

Protected Member Functions inherited from olb::Serializable
template<typename DataType >
void	registerVar (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, const DataType &data, const size_t arrayLength=1) const
	Register primitive data types (`int`, `double`, ...) or arrays of those.

template<typename DataType >
void	registerSerializableOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool *&dataPtr, DataType &data, const bool loadingMode=false)
	Register `Serializable` object of constant size.

template<typename DataType >
void	registerSerializablesOfConstSize (const std::size_t iBlock, std::size_t &sizeBlock, std::size_t &currentBlock, bool &dataPtr, DataType data, const size_t arrayLength, const bool loadingMode=false)
	Register an array of `Serializable` objects of constant size.

Protected Attributes inherited from olb::LoadBalancer< T >
int	_size
	number of cuboids after shrink -1 in appropriate thread

std::map< int, int >	_loc
	maps global cuboid to (local) thread cuboid

std::vector< int >	_glob
	content is 0,1,2,...,_size

std::map< int, int >	_rank
	maps global cuboid number to the processing thread

std::map< int, Platform >	_platform
	maps global cuboid number to local platform

Protected Attributes inherited from olb::BufferSerializable
std::vector< bool * >	_dataBuffer
	Data buffer for data that has to be buffered between two `getBlock()` iterations.

std::vector< size_t >	_sizeBuffer
	`std::vector` of integer buffers (e.g. for `std::vector` size) to be buffered for the whole iteration process

Detailed Description

template<typename T>
class olb::HeterogeneousLoadBalancer< T >

Load balancer for heterogeneous CPU-GPU systems.

Assigns largest cuboids to GPUs until given largeBlockFraction is reached. Remaining (small) cuboids are assigned to CPUs of their GPU-placed neighbors.

This balancer should only be used in conjunction with a heterogeneous cuboid decomposition and appropriate, system specific, CPU_SIMD+OpenMP configuration.

Definition at line 45 of file heterogeneousLoadBalancer.h.

Constructor & Destructor Documentation

◆ HeterogeneousLoadBalancer()

template<typename T >

olb::HeterogeneousLoadBalancer< T >::HeterogeneousLoadBalancer	(	CuboidGeometry< T, 3 > &	cGeometry,
		T	largeBlockFraction = 0.9 )

inline

Definition at line 50 of file heterogeneousLoadBalancer.h.

                                                                                     :
    LoadBalancer<T>(0)
  {
    // Computes a cuboid -> (rank, local index, platform) assignment on rank 0,
    // broadcasts it (MPI builds only) and mirrors it into the balancer members.
    //
    // cGeometry          : cuboid decomposition; its cuboid list is re-sorted
    //                      in place by descending lattice volume.
    // largeBlockFraction : fraction of the total lattice volume that is placed
    //                      on GPUs before the remaining cuboids go to CPUs.
    OstreamManager clout(std::cout, "HeterogeneousLoadBalancer");

    const int nCuboid = cGeometry.getNc();

    std::vector<int> rankBuffer(nCuboid, 0);
    std::vector<int> locBuffer(nCuboid, 0);
    std::vector<std::uint8_t> platformBuffer(nCuboid, static_cast<std::uint8_t>(Platform::CPU_SISD));

    const auto nRank = singleton::mpi().getSize();
    const auto iRank = singleton::mpi().getRank();

    auto& cuboids = cGeometry.cuboids();
    // Prioritize assignment of largest cuboids
    std::sort(cuboids.begin(), cuboids.end(),
              [&](const auto& lhs, const auto& rhs) {
                return lhs.getLatticeVolume() > rhs.getLatticeVolume();
              });

    // Distribute cuboids to ranks on rank 0
    if (iRank == 0) {
      // Total volume for tracking targeted GPU block fraction
      std::size_t totalVolume = std::accumulate(cuboids.begin(), cuboids.end(),
                                                std::size_t{0},
                                                [](const auto& currVolume, const auto& rhs) -> std::size_t {
                                                  return currVolume + rhs.getLatticeVolume();
                                                });

      std::map<int,int> nLoc;
      int jRank = 0;
      int iCuboid = 0;

      std::size_t globalAssignedVolume = 0;
      std::vector<std::size_t> localAssignedVolume(nRank, 0);

      // Assign largest cuboids to GPUs until desired fraction is reached.
      // As before, the first cuboid is always placed on a GPU. The index guard
      // stops the loop at the end of the cuboid list, so an empty geometry or an
      // unreachable fraction (e.g. largeBlockFraction > 1) no longer reads out
      // of bounds.
      while (iCuboid < nCuboid) {
        // Always fill the rank that currently holds the least GPU volume
        jRank = std::distance(localAssignedVolume.begin(),
                              std::min_element(localAssignedVolume.begin(),
                                               localAssignedVolume.end()));
        rankBuffer[iCuboid] = jRank;
        platformBuffer[iCuboid] = static_cast<std::uint8_t>(Platform::GPU_CUDA);
        locBuffer[iCuboid]  = nLoc[jRank]++;
        localAssignedVolume[jRank] += cuboids[iCuboid].getLatticeVolume();
        globalAssignedVolume += cuboids[iCuboid].getLatticeVolume();
        clout << iCuboid << ", " << jRank << ", assignedVolumeFraction=" << (1.*globalAssignedVolume) / totalVolume << std::endl;
        iCuboid += 1;
        // Negated '<' keeps the exact termination condition of the original loop
        if (!(globalAssignedVolume < largeBlockFraction*totalVolume)) {
          break;
        }
      }

      // Compute GPU rank affinity of remaining cuboids
      std::vector<int> preferredRank(nCuboid, -1);
      for (int jCuboid=iCuboid; jCuboid < nCuboid; ++jCuboid) {
        std::vector<int> neighbours;
        cGeometry.getNeighbourhood(jCuboid, neighbours, 1);
        std::vector<int> neighboursInRank(nRank, 0);
        int nGpuNeighbors = 0;
        for (int neighborC : neighbours) {
          if (platformBuffer[neighborC] == static_cast<std::uint8_t>(Platform::GPU_CUDA)) {
            neighboursInRank[rankBuffer[neighborC]] += 1;
            nGpuNeighbors += 1;
          }
        }
        if (nGpuNeighbors > 0) {
          // Prefer the rank that owns most of the GPU-placed neighbours
          preferredRank[jCuboid] = std::distance(neighboursInRank.begin(), std::max_element(neighboursInRank.begin(),
                                                                                            neighboursInRank.end()));
          clout << "Preferred rank of " << jCuboid << " is " << preferredRank[jCuboid] << std::endl;
        }
      }

      // Distribute remaining GPU-neighboring blocks over ranks using CPU_SIMD platform
      for (int jCuboid=iCuboid; jCuboid < nCuboid; ++jCuboid) {
        if (preferredRank[jCuboid] != -1) {
          // Named targetRank to avoid shadowing this process' own iRank
          const int targetRank = preferredRank[jCuboid];

          rankBuffer[jCuboid] = targetRank;
          platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
          locBuffer[jCuboid]  = nLoc[targetRank]++;
        }
      }

      // Compute fallback CPU rank affinity of remaining cuboids
      std::vector<int> preferredRankFallback(nCuboid, -1);
      for (int jCuboid=iCuboid; jCuboid < nCuboid; ++jCuboid) {
        if (preferredRank[jCuboid] == -1) {
          std::vector<int> neighbours;
          cGeometry.getNeighbourhood(jCuboid, neighbours, 1);
          std::vector<int> neighboursInRank(nRank, 0);
          int nCpuNeighbors = 0;
          for (int neighborC : neighbours) {
            if (platformBuffer[neighborC] != static_cast<std::uint8_t>(Platform::GPU_CUDA)) {
              neighboursInRank[rankBuffer[neighborC]] += 1;
              nCpuNeighbors += 1;
            }
          }
          if (nCpuNeighbors > 0) {
            preferredRankFallback[jCuboid] = std::distance(neighboursInRank.begin(), std::max_element(neighboursInRank.begin(),
                                                                                                      neighboursInRank.end()));
            clout << "Preferred rank of " << jCuboid << " is " << preferredRankFallback[jCuboid] << std::endl;
          }
        }
      }

      // Distribute remaining blocks over ranks using CPU_SIMD platform
      for (int jCuboid=iCuboid; jCuboid < nCuboid; ++jCuboid) {
        if (preferredRank[jCuboid] == -1) {
          int targetRank = preferredRankFallback[jCuboid];
          if (targetRank == -1) {
            // No neighbour was counted for this cuboid, so there is no affinity
            // to follow. Previously this wrote rank -1 (a rank that does not
            // exist), leaving the cuboid without an owner. Place it on the rank
            // with the least tracked volume instead and account for it so that
            // several such cuboids are spread over the ranks.
            targetRank = std::distance(localAssignedVolume.begin(),
                                       std::min_element(localAssignedVolume.begin(),
                                                        localAssignedVolume.end()));
            localAssignedVolume[targetRank] += cuboids[jCuboid].getLatticeVolume();
          }

          rankBuffer[jCuboid] = targetRank;
          platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
          locBuffer[jCuboid]  = nLoc[targetRank]++;
        }
      }

    }

    #ifdef PARALLEL_MODE_MPI
    // Broadcast assignments to all processes
    singleton::mpi().bCast(rankBuffer.data(), rankBuffer.size());
    singleton::mpi().bCast(locBuffer.data(), locBuffer.size());
    singleton::mpi().bCast(platformBuffer.data(), platformBuffer.size());
    #endif

    // Update internal LoadBalancer structure to match given assignment
    for (int iCuboid=0; iCuboid < nCuboid; ++iCuboid) {
      this->_rank[iCuboid] = rankBuffer[iCuboid];
      this->_loc[iCuboid] = locBuffer[iCuboid];

      // Only cuboids owned by this process get a local slot
      if (rankBuffer[iCuboid] == iRank) {
        // Grow the local tables so that the local index is addressable
        this->_glob.resize(std::max(int(this->_glob.size()), this->_loc[iCuboid]+1));
        this->_platform.resize(this->_glob.size());

        this->_glob[this->_loc[iCuboid]] = iCuboid;
        this->_platform[this->_loc[iCuboid]] = static_cast<Platform>(platformBuffer[iCuboid]);

        this->_size = this->_glob.size();
      }
    }
  }

References olb::LoadBalancer< T >::_glob, olb::LoadBalancer< T >::_loc, olb::LoadBalancer< T >::_rank, olb::LoadBalancer< T >::_size, olb::singleton::MpiManager::bCast(), olb::CPU_SIMD, olb::CPU_SISD, olb::singleton::MpiManager::getRank(), olb::singleton::MpiManager::getSize(), olb::GPU_CUDA, and olb::singleton::mpi().

Here is the call graph for this function:

Member Function Documentation

◆ platform()

template<typename T >

Platform olb::HeterogeneousLoadBalancer< T >::platform ( int loc ) const

inline override virtual

Returns: target platform for processing of local cuboid

Reimplemented from olb::LoadBalancer< T >.

Definition at line 189 of file heterogeneousLoadBalancer.h.

                                            {
    // Look up the platform stored under the given local cuboid index.
    return this->_platform[loc];
  }

References olb::LoadBalancer< T >::loc().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ setPlatform()

template<typename T >

void olb::HeterogeneousLoadBalancer< T >::setPlatform	(	int	loc,
		Platform	platform )

inline virtual

Reimplemented from olb::LoadBalancer< T >.

Definition at line 193 of file heterogeneousLoadBalancer.h.

                                               {
    // Overwrite the platform stored under the given local cuboid index.
    this->_platform[loc] = platform;
  }

References olb::LoadBalancer< T >::loc(), and olb::HeterogeneousLoadBalancer< T >::platform().

Here is the call graph for this function:

The documentation for this class was generated from the following file:

src/communication/heterogeneousLoadBalancer.h

Public Member Functions

Additional Inherited Members

Detailed Description

Constructor & Destructor Documentation

◆ HeterogeneousLoadBalancer()

Member Function Documentation

◆ platform()

◆ setPlatform()