OpenLB 1.7
Loading...
Searching...
No Matches
heterogeneousLoadBalancer.h
Go to the documentation of this file.
1/* This file is part of the OpenLB library
2 *
3 * Copyright (C) 2022 Adrian Kummerlaender
4 * E-mail contact: info@openlb.net
5 * The most recent release of OpenLB can be downloaded at
6 * <http://www.openlb.net/>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the Free
20 * Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22*/
23
24#ifndef HETEROGENEOUS_LOAD_BALANCER_H
25#define HETEROGENEOUS_LOAD_BALANCER_H
26
28
#include <random>
#include <algorithm>
#include <queue>
#include <cstdint>
#include <limits>
#include <map>
#include <numeric>
#include <set>
#include <vector>
32
33namespace olb {
34
36
44template<typename T>
45class HeterogeneousLoadBalancer final : public LoadBalancer<T> {
46private:
47 std::vector<Platform> _platform;
48
49public:
50 HeterogeneousLoadBalancer(CuboidGeometry<T,3>& cGeometry, T largeBlockFraction=0.9):
51 LoadBalancer<T>(0)
52 {
53 OstreamManager clout(std::cout, "HeterogeneousLoadBalancer");
54
55 std::vector<int> rankBuffer(cGeometry.getNc(), 0);
56 std::vector<int> locBuffer(cGeometry.getNc(), 0);
57 std::vector<std::uint8_t> platformBuffer(cGeometry.getNc(), static_cast<std::uint8_t>(Platform::CPU_SISD));
58
59 const auto nRank = singleton::mpi().getSize();
60 const auto iRank = singleton::mpi().getRank();
61
62 auto& cuboids = cGeometry.cuboids();
63 // Prioritize assignment of largest cuboids
64 std::sort(cuboids.begin(), cuboids.end(),
65 [&](const auto& lhs, const auto& rhs) {
66 return lhs.getLatticeVolume() > rhs.getLatticeVolume();
67 });
68
69 // Distribute cuboids to ranks on rank 0
70 if (iRank == 0) {
71 // Total volume for tracking targeted GPU block fraction
72 std::size_t totalVolume = std::accumulate(cuboids.begin(), cuboids.end(),
73 std::size_t{0},
74 [](const auto& currVolume, const auto& rhs) -> std::size_t {
75 return currVolume + rhs.getLatticeVolume();
76 });
77
78 std::map<int,int> nLoc;
79 int jRank = 0;
80 int iCuboid = 0;
81
82 std::size_t globalAssignedVolume = 0;
83 std::vector<std::size_t> localAssignedVolume(nRank, 0);
84
85 // Assign largest cuboids to GPUs until desired fraction is reached
86 do {
87 jRank = std::distance(localAssignedVolume.begin(),
88 std::min_element(localAssignedVolume.begin(),
89 localAssignedVolume.end()));
90 rankBuffer[iCuboid] = jRank;
91 platformBuffer[iCuboid] = static_cast<std::uint8_t>(Platform::GPU_CUDA);
92 locBuffer[iCuboid] = nLoc[jRank]++;
93 localAssignedVolume[jRank] += cuboids[iCuboid].getLatticeVolume();
94 globalAssignedVolume += cuboids[iCuboid].getLatticeVolume();
95 clout << iCuboid << ", " << jRank << ", assignedVolumeFraction=" << (1.*globalAssignedVolume) / totalVolume << std::endl;
96 iCuboid += 1;
97 } while (globalAssignedVolume < largeBlockFraction*totalVolume);
98
99 // Compute GPU rank affinity of remaining cuboids
100 std::vector<int> preferredRank(cGeometry.getNc(), -1);
101 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
102 std::vector<int> neighbours;
103 cGeometry.getNeighbourhood(jCuboid, neighbours, 1);
104 std::vector<int> neighboursInRank(nRank, 0);
105 int nGpuNeighbors = 0;
106 for (int neighborC : neighbours) {
107 if (platformBuffer[neighborC] == static_cast<std::uint8_t>(Platform::GPU_CUDA)) {
108 neighboursInRank[rankBuffer[neighborC]] += 1;
109 nGpuNeighbors += 1;
110 }
111 }
112 if (nGpuNeighbors > 0) {
113 preferredRank[jCuboid] = std::distance(neighboursInRank.begin(), std::max_element(neighboursInRank.begin(),
114 neighboursInRank.end()));
115 clout << "Preferred rank of " << jCuboid << " is " << preferredRank[jCuboid] << std::endl;
116 }
117 }
118
119 // Distribute remaining GPU-neighboring blocks over ranks using CPU_SIMD platform
120 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
121 if (preferredRank[jCuboid] != -1) {
122 auto iRank = preferredRank[jCuboid];
123
124 rankBuffer[jCuboid] = iRank;
125 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
126 locBuffer[jCuboid] = nLoc[iRank]++;
127 }
128 }
129
130 // Compute fallback CPU rank affinity of remaining cuboids
131 std::vector<int> preferredRankFallback(cGeometry.getNc(), -1);
132 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
133 if (preferredRank[jCuboid] == -1) {
134 std::vector<int> neighbours;
135 cGeometry.getNeighbourhood(jCuboid, neighbours, 1);
136 std::vector<int> neighboursInRank(nRank, 0);
137 int nCpuNeighbors = 0;
138 for (int neighborC : neighbours) {
139 if (platformBuffer[neighborC] != static_cast<std::uint8_t>(Platform::GPU_CUDA)) {
140 neighboursInRank[rankBuffer[neighborC]] += 1;
141 nCpuNeighbors += 1;
142 }
143 }
144 if (nCpuNeighbors > 0) {
145 preferredRankFallback[jCuboid] = std::distance(neighboursInRank.begin(), std::max_element(neighboursInRank.begin(),
146 neighboursInRank.end()));
147 clout << "Preferred rank of " << jCuboid << " is " << preferredRankFallback[jCuboid] << std::endl;
148 }
149 }
150 }
151
152 // Distribute remaining blocks over ranks using CPU_SIMD platform
153 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
154 if (preferredRank[jCuboid] == -1) {
155 auto iRank = preferredRankFallback[jCuboid];
156
157 rankBuffer[jCuboid] = iRank;
158 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
159 locBuffer[jCuboid] = nLoc[iRank]++;
160 }
161 }
162
163 }
164
165 #ifdef PARALLEL_MODE_MPI
166 // Broadcast assignments to all processes
167 singleton::mpi().bCast(rankBuffer.data(), rankBuffer.size());
168 singleton::mpi().bCast(locBuffer.data(), locBuffer.size());
169 singleton::mpi().bCast(platformBuffer.data(), platformBuffer.size());
170 #endif
171
172 // Update internal LoadBalancer structure to match given assignment
173 for (int iCuboid=0; iCuboid < cGeometry.getNc(); ++iCuboid) {
174 this->_rank[iCuboid] = rankBuffer[iCuboid];
175 this->_loc[iCuboid] = locBuffer[iCuboid];
176
177 if (rankBuffer[iCuboid] == iRank) {
178 this->_glob.resize(std::max(int(this->_glob.size()), this->_loc[iCuboid]+1));
179 this->_platform.resize(this->_glob.size());
180
181 this->_glob[this->_loc[iCuboid]] = iCuboid;
182 this->_platform[this->_loc[iCuboid]] = static_cast<Platform>(platformBuffer[iCuboid]);
183
184 this->_size = this->_glob.size();
185 }
186 }
187 }
188
189 Platform platform(int loc) const override {
190 return _platform[loc];
191 }
192
194 _platform[loc] = platform;
195 }
196
197};
198
200
208template<typename T>
210private:
211 std::vector<Platform> _platform;
212
213public:
214 OrthogonalHeterogeneousLoadBalancer(CuboidGeometry<T,3>& cGeometry, T largeBlockFraction=0.9):
215 LoadBalancer<T>(0)
216 {
217 OstreamManager clout(std::cout, "OrthogonalHeterogeneousLoadBalancer");
218
219 std::vector<int> rankBuffer(cGeometry.getNc(), 0);
220 std::vector<int> locBuffer(cGeometry.getNc(), 0);
221 std::vector<std::uint8_t> platformBuffer(cGeometry.getNc(), static_cast<std::uint8_t>(Platform::CPU_SISD));
222
223 const auto nRank = singleton::mpi().getSize();
224 const auto iRank = singleton::mpi().getRank();
225
226 auto& cuboids = cGeometry.cuboids();
227 // Prioritize assignment of largest cuboids
228 std::sort(cuboids.begin(), cuboids.end(),
229 [&](const auto& lhs, const auto& rhs) {
230 return lhs.getLatticeVolume() > rhs.getLatticeVolume();
231 });
232
233 int localPreferredPlatform = static_cast<int>(Platform::CPU_SIMD);
234 #ifdef PLATFORM_GPU_CUDA
235 if (gpu::cuda::device::getCount() > 0) {
236 localPreferredPlatform = static_cast<int>(Platform::GPU_CUDA);
237 }
238 #endif
239 std::vector<int> preferredPlatform(cuboids.size(), -1);
240 #ifdef PARALLEL_MODE_MPI
241 singleton::mpi().gather(&localPreferredPlatform, 1,
242 preferredPlatform.data(), 1);
243 #endif
244
245 // Distribute cuboids to ranks on rank 0
246 if (iRank == 0) {
247 std::set<int> cpuRanks;
248 std::set<int> gpuRanks;
249 for (int jRank=0; jRank < singleton::mpi().getSize(); ++jRank) {
250 switch (static_cast<Platform>(preferredPlatform[jRank])) {
252 cpuRanks.insert(jRank);
253 break;
255 gpuRanks.insert(jRank);
256 break;
257 default:
258 break;
259 }
260 }
261
262 // Total volume for tracking targeted GPU block fraction
263 std::size_t totalVolume = std::accumulate(cuboids.begin(), cuboids.end(),
264 std::size_t{0},
265 [](const auto& currVolume, const auto& rhs) -> std::size_t {
266 return currVolume + rhs.getLatticeVolume();
267 });
268
269 std::map<int,int> nLoc;
270 int jRank = 0;
271 int iCuboid = 0;
272
273 std::size_t globalAssignedVolume = 0;
274 std::vector<std::size_t> localAssignedVolume(nRank, 0);
275 // Prevent GPU assignment to CPU ranks
276 for (int jRank : cpuRanks) {
277 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
278 }
279
280 // Assign largest cuboids to GPUs until desired fraction is reached
281 do {
282 jRank = std::distance(localAssignedVolume.begin(),
283 std::min_element(localAssignedVolume.begin(),
284 localAssignedVolume.end()));
285 rankBuffer[iCuboid] = jRank;
286 platformBuffer[iCuboid] = static_cast<std::uint8_t>(Platform::GPU_CUDA);
287 locBuffer[iCuboid] = nLoc[jRank]++;
288 localAssignedVolume[jRank] += cuboids[iCuboid].getLatticeVolume();
289 globalAssignedVolume += cuboids[iCuboid].getLatticeVolume();
290 clout << iCuboid << ", " << jRank << ", assignedVolumeFraction=" << (1.*globalAssignedVolume) / totalVolume << std::endl;
291 iCuboid += 1;
292 } while (globalAssignedVolume < largeBlockFraction*totalVolume);
293
294 for (int iRank : gpuRanks) {
295 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
296 }
297
298 // Prevent CPU assignment to GPU ranks
299 for (int jRank : cpuRanks) {
300 localAssignedVolume[jRank] = 0;
301 }
302 for (int jRank : gpuRanks) {
303 localAssignedVolume[jRank] = std::numeric_limits<std::size_t>::max();
304 }
305
306 for (int jCuboid=iCuboid; jCuboid < cGeometry.getNc(); ++jCuboid) {
307 jRank = std::distance(localAssignedVolume.begin(),
308 std::min_element(localAssignedVolume.begin(),
309 localAssignedVolume.end()));
310 rankBuffer[jCuboid] = jRank;
311 platformBuffer[jCuboid] = static_cast<std::uint8_t>(Platform::CPU_SIMD);
312 locBuffer[jCuboid] = nLoc[jRank]++;
313 localAssignedVolume[jRank] += cuboids[jCuboid].getLatticeVolume();
314 clout << jCuboid << ", " << jRank << std::endl;
315 }
316
317 for (int iRank : cpuRanks) {
318 clout << "assignedVolumeOfRank[" << iRank << "]=" << localAssignedVolume[iRank] / double(totalVolume) << std::endl;
319 }
320 }
321
322 #ifdef PARALLEL_MODE_MPI
323 // Broadcast assignments to all processes
324 singleton::mpi().bCast(rankBuffer.data(), rankBuffer.size());
325 singleton::mpi().bCast(locBuffer.data(), locBuffer.size());
326 singleton::mpi().bCast(platformBuffer.data(), platformBuffer.size());
327 #endif
328
329 // Update internal LoadBalancer structure to match given assignment
330 for (int iCuboid=0; iCuboid < cGeometry.getNc(); ++iCuboid) {
331 this->_rank[iCuboid] = rankBuffer[iCuboid];
332 this->_loc[iCuboid] = locBuffer[iCuboid];
333
334 if (rankBuffer[iCuboid] == iRank) {
335 this->_glob.resize(std::max(int(this->_glob.size()), this->_loc[iCuboid]+1));
336 this->_platform.resize(this->_glob.size());
337
338 this->_glob[this->_loc[iCuboid]] = iCuboid;
339 this->_platform[this->_loc[iCuboid]] = static_cast<Platform>(platformBuffer[iCuboid]);
340
341 this->_size = this->_glob.size();
342 }
343 }
344 }
345
346 Platform platform(int loc) const override {
347 return _platform[loc];
348 }
349
351 _platform[loc] = platform;
352 }
353
354};
355
356}
357
358#endif
Load balancer for heterogeneous CPU-GPU systems.
HeterogeneousLoadBalancer(CuboidGeometry< T, 3 > &cGeometry, T largeBlockFraction=0.9)
Platform platform(int loc) const override
void setPlatform(int loc, Platform platform)
Base class for all LoadBalancer.
std::vector< int > _glob
content is 0,1,2,...,_size
int _size
number of cuboids after shrink -1 in appropriate thread
std::map< int, int > _rank
maps global cuboid number to the processing thread
std::map< int, int > _loc
maps global cuboid to (local) thread cuboid
int loc(const int &glob)
Load balancer for heterogeneous CPU-GPU systems.
OrthogonalHeterogeneousLoadBalancer(CuboidGeometry< T, 3 > &cGeometry, T largeBlockFraction=0.9)
class for marking output with some text
void gather(T *sendBuf, int sendCount, T *recvBuf, int recvCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Gather data from multiple processors to one processor.
void bCast(T *sendBuf, int sendCount, int root=0, MPI_Comm comm=MPI_COMM_WORLD)
Broadcast data from one processor to multiple processors.
int getSize() const
Returns the number of processes.
int getRank() const
Returns the process ID.
int getCount()
Return number of available devices.
Definition device.hh:42
MpiManager & mpi()
Top level namespace for all of OpenLB.
Platform
OpenLB execution targets.
Definition platform.h:36
@ CPU_SISD
Basic scalar CPU.
@ CPU_SIMD
Vector CPU (AVX2 / AVX-512 collision)
@ GPU_CUDA
Nvidia GPU using CUDA.
std::conditional_t< D==2, CuboidGeometry2D< T >, CuboidGeometry3D< T > > CuboidGeometry
Definition aliases.h:47