OpenLB 1.7
communicator.hh
/* This file is part of the OpenLB library
 *
 * Copyright (C) 2022 Adrian Kummerlaender
 * E-mail contact: info@openlb.net
 * The most recent release of OpenLB can be downloaded at
 * <http://www.openlb.net/>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
*/

#ifndef PLATFORM_GPU_CUDA_COMMUNICATOR_HH
#define PLATFORM_GPU_CUDA_COMMUNICATOR_HH

#include "registry.h"
#include "context.hh"

#include <thrust/device_vector.h>

#ifdef PARALLEL_MODE_MPI
#include "mpi.h"
#if defined(OPEN_MPI) && OPEN_MPI
#include <mpi-ext.h>
#endif
#endif

namespace olb {

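/// Verifies availability of CUDA device and MPI support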
template <>
void checkPlatform<Platform::GPU_CUDA>()
{
  OstreamManager clout(std::cout, "GPU_CUDA");

  int nDevices{};
  cudaGetDeviceCount(&nDevices);

  clout.setMultiOutput(true);
  if (nDevices < 1) {
    clout << "No CUDA device found" << std::endl;
  } else if (nDevices > 1) {
    clout << "Found " << nDevices << " CUDA devices but only one can be used per MPI process." << std::endl;
  }
#ifdef OLB_DEBUG
  for (int deviceI=0; deviceI < nDevices; ++deviceI) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceI);
    clout << deviceProp.name << " visible" << std::endl;
  }
#endif
  clout.setMultiOutput(false);

#ifdef PARALLEL_MODE_MPI
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
  if (!MPIX_Query_cuda_support()) {
    clout << "The used MPI Library is not CUDA-aware. Multi-GPU execution will fail." << std::endl;
  }
#endif
#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
  clout << "The used MPI Library is not CUDA-aware. Multi-GPU execution will fail." << std::endl;
#endif
#if !defined(MPIX_CUDA_AWARE_SUPPORT)
  clout << "Unable to check for CUDA-aware MPI support. Multi-GPU execution may fail." << std::endl;
#endif
#endif // PARALLEL_MODE_MPI
}

namespace gpu {

namespace cuda {

namespace kernel {

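/// CUDA kernel for gathering FIELD data of lattice at indices into buffer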
template <typename CONTEXT, typename FIELD>
void gather_field(CONTEXT lattice,
                  const CellID* indices, std::size_t nIndices,
                  typename FIELD::template value_type<typename CONTEXT::value_t>* buffer) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* field = lattice.template getField<FIELD>();
  for (unsigned iD=0; iD < CONTEXT::descriptor_t::template size<FIELD>(); ++iD) {
    buffer[iD*nIndices+iIndex] = field[iD][indices[iIndex]];
  }
}

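/// CUDA kernel for copying FIELD data of sourceLattice at sourceIndices into targetLattice at targetIndices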
template <typename SOURCE, typename TARGET, typename FIELD>
void copy_field(SOURCE sourceLattice, TARGET targetLattice,
                const CellID* sourceIndices,
                const CellID* targetIndices,
                std::size_t nIndices) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* source = sourceLattice.template getField<FIELD>();
  auto* target = targetLattice.template getField<FIELD>();
  for (unsigned iD=0; iD < SOURCE::descriptor_t::template size<FIELD>(); ++iD) {
    target[iD][targetIndices[iIndex]] = source[iD][sourceIndices[iIndex]];
  }
}

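/// CUDA kernel for gathering fields at indices into buffer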
__global__ void gather_any_fields(AnyDeviceFieldArrayD* fields, std::size_t nFields,
                                  const CellID* indices, std::size_t nIndices,
                                  std::uint8_t* buffer) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& field = fields[iField];
    for (unsigned iD=0; iD < field.column_count; ++iD) {
      memcpy(buffer + (iD*nIndices + iIndex)*field.element_size,
             field[iD] + indices[iIndex]*field.element_size,
             field.element_size);
    }
    buffer += nIndices*field.column_count*field.element_size;
  }
}

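/// CUDA kernel for copying sourceFields at sourceIndices to targetFields at targetIndices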
__global__ void copy_any_fields(AnyDeviceFieldArrayD* sourceFields,
                                AnyDeviceFieldArrayD* targetFields,
                                std::size_t nFields,
                                const CellID* sourceIndices,
                                const CellID* targetIndices,
                                std::size_t nIndices) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& sourceField = sourceFields[iField];
    auto& targetField = targetFields[iField];
    for (unsigned iD=0; iD < sourceField.column_count; ++iD) {
      memcpy(targetField[iD] + targetIndices[iIndex]*sourceField.element_size,
             sourceField[iD] + sourceIndices[iIndex]*sourceField.element_size,
             sourceField.element_size);
    }
  }
}

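/// CUDA kernel for scattering FIELD data in buffer to indices in lattice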
template <typename CONTEXT, typename FIELD>
void scatter_field(CONTEXT lattice,
                   const CellID* indices, std::size_t nIndices,
                   typename FIELD::template value_type<typename CONTEXT::value_t>* buffer) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* field = lattice.template getField<FIELD>();
  for (unsigned iD=0; iD < CONTEXT::descriptor_t::template size<FIELD>(); ++iD) {
    field[iD][indices[iIndex]] = buffer[iD*nIndices+iIndex];
  }
}

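/// CUDA kernel for scattering fields in buffer to indices in lattice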
__global__ void scatter_any_fields(AnyDeviceFieldArrayD* fields, std::size_t nFields,
                                   const CellID* indices, std::size_t nIndices,
                                   std::uint8_t* buffer) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& field = fields[iField];
    for (unsigned iD=0; iD < field.column_count; ++iD) {
      memcpy(field[iD] + indices[iIndex]*field.element_size,
             buffer + (iD*nIndices + iIndex)*field.element_size,
             field.element_size);
    }
    buffer += nIndices*field.column_count*field.element_size;
  }
}

}

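/// Blocking gather of FIELD at given indices into buffer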
template <typename FIELD, typename CONTEXT>
void gather_field(CONTEXT& lattice, const thrust::device_vector<CellID>& indices, std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_field<CONTEXT,FIELD><<<block_count,block_size>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

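/// Non-blocking gather of FIELD at given indices into buffer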
template <typename FIELD, typename CONTEXT>
void async_gather_field(cudaStream_t stream,
                        CONTEXT& lattice,
                        const thrust::device_vector<CellID>& indices,
                        std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_field<CONTEXT,FIELD><<<block_count,block_size,0,stream>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

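/// Non-blocking copy of FIELD at given indices from sourceLattice to targetLattice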
template <typename FIELD, typename SOURCE, typename TARGET>
void async_copy_field(cudaStream_t stream,
                      SOURCE& sourceLattice,
                      TARGET& targetLattice,
                      const thrust::device_vector<CellID>& sourceIndices,
                      const thrust::device_vector<CellID>& targetIndices) {
  const auto block_size = 32;
  const auto block_count = (sourceIndices.size() + block_size - 1) / block_size;
  kernel::copy_field<SOURCE,TARGET,FIELD><<<block_count,block_size,0,stream>>>(
    sourceLattice,
    targetLattice,
    sourceIndices.data().get(),
    targetIndices.data().get(),
    sourceIndices.size());
  device::check();
}

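/// Blocking gather of fields at given indices into buffer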
void gather_any_fields(thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                       const thrust::device_vector<CellID>& indices,
                       std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_any_fields<<<block_count,block_size>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

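/// Non-blocking gather of fields at given indices into buffer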
void async_gather_any_fields(cudaStream_t stream,
                             thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                             const thrust::device_vector<CellID>& indices,
                             std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_any_fields<<<block_count,block_size,0,stream>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

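/// Non-blocking copy of fields at given indices from sourceIndices to targetIndices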
void async_copy_any_fields(cudaStream_t stream,
                           thrust::device_vector<AnyDeviceFieldArrayD>& sourceFields,
                           thrust::device_vector<AnyDeviceFieldArrayD>& targetFields,
                           const thrust::device_vector<CellID>& sourceIndices,
                           const thrust::device_vector<CellID>& targetIndices) {
  const auto block_size = 32;
  const auto block_count = (sourceIndices.size() + block_size - 1) / block_size;
  kernel::copy_any_fields<<<block_count,block_size,0,stream>>>(
    sourceFields.data().get(), targetFields.data().get(), sourceFields.size(),
    sourceIndices.data().get(), targetIndices.data().get(), sourceIndices.size());
  device::check();
}

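/// Blocking scatter of FIELD data in buffer to given indices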
template <typename FIELD, typename CONTEXT>
void scatter_field(CONTEXT& lattice, const thrust::device_vector<CellID>& indices, std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_field<CONTEXT,FIELD><<<block_count,block_size>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

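/// Non-blocking scatter of FIELD data in buffer to given indices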
template <typename FIELD, typename CONTEXT>
void async_scatter_field(cudaStream_t stream,
                         CONTEXT& lattice,
                         const thrust::device_vector<CellID>& indices,
                         std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_field<CONTEXT,FIELD><<<block_count,block_size,0,stream>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

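/// Blocking scatter of fields data in buffer to given indices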
void scatter_any_fields(thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                        const thrust::device_vector<CellID>& indices,
                        std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_any_fields<<<block_count,block_size>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

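/// Non-blocking scatter of fields data in buffer to given indices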
void async_scatter_any_fields(cudaStream_t stream,
                              thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                              const thrust::device_vector<CellID>& indices,
                              std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_any_fields<<<block_count,block_size,0,stream>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

}

}

#ifdef PARALLEL_MODE_MPI

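/// Wrapper for a non-blocking block propagation send request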
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::SendTask {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _fields;
  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _cells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;

  gpu::cuda::device::unique_ptr<std::uint8_t> _buffer;
  std::unique_ptr<MpiSendRequest> _request;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:
  SendTask(MPI_Comm comm, int tag, int rank,
           const std::vector<std::type_index>& fields,
           const std::vector<CellID>& cells,
           ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& block):
    _fields(block.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _cells(cells),
    _source(block),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    std::size_t size = 0;
    for (auto& field : fields) {
      size += _source.getCommunicatable(field).size(cells);
    }
    _buffer = gpu::cuda::device::malloc<std::uint8_t>(size);
    _request = std::make_unique<MpiSendRequest>(
      _buffer.get(), size, rank, tag, comm);
  }

  ~SendTask()
  {
    _stream->synchronize();
    wait();
  }

  void prepare()
  {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_source);
      gpu::cuda::async_gather_field<descriptors::POPULATION>(_stream->get(), lattice, _cells, _buffer.get());
    } else {
      gpu::cuda::async_gather_any_fields(_stream->get(), _fields, _cells, _buffer.get());
    }
  }

  void send()
  {
    _stream->synchronize();
    _request->start();
  }

  void wait()
  {
    _request->wait();
  }
};

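/// Wrapper for a non-blocking block propagation receive request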
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::RecvTask {
private:
  const int _tag;
  const int _rank;

  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _fields;
  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _cells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;

  gpu::cuda::device::unique_ptr<std::uint8_t> _buffer;
  std::unique_ptr<MpiRecvRequest> _request;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:

  class ref {
  private:
    RecvTask& _task;
  public:
    ref(std::unique_ptr<RecvTask>& task): _task(*task) { };

    RecvTask* operator->() const
    {
      return &_task;
    }

    bool operator <(const ref& rhs) const
    {
      return _task < rhs._task;
    }
  };

  RecvTask(MPI_Comm comm, int tag, int rank,
           const std::vector<std::type_index>& fields,
           const std::vector<CellID>& cells,
           ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& block):
    _tag(tag),
    _rank(rank),
    _fields(block.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _cells(cells),
    _target(block),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    std::size_t size = 0;
    for (auto& field : fields) {
      size += _target.getCommunicatable(field).size(cells);
    }
    _buffer = gpu::cuda::device::malloc<std::uint8_t>(size);
    _request = std::make_unique<MpiRecvRequest>(
      _buffer.get(), size, _rank, _tag, comm);
  }

  ~RecvTask()
  {
    wait();
  }

  bool operator<(const RecvTask& rhs) const
  {
    return _rank < rhs._rank
        || (_rank == rhs._rank && _tag < rhs._tag);
  }

  void receive()
  {
    _request->start();
  };

  bool isDone()
  {
    return _request->isDone();
  }

  void unpack()
  {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_target);
      gpu::cuda::async_scatter_field<descriptors::POPULATION>(_stream->get(), lattice, _cells, _buffer.get());
    } else {
      gpu::cuda::async_scatter_any_fields(_stream->get(), _fields, _cells, _buffer.get());
    }
  }

  void wait()
  {
    _stream->synchronize();
  }

};

#endif // PARALLEL_MODE_MPI

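/// Virtual interface for copy tasks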
template <typename T, typename DESCRIPTOR>
struct ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::CopyTask {
  virtual ~CopyTask() { }

  virtual void copy() = 0;
  virtual void wait() = 0;
};

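/// Wrapper for a local GPU-GPU block communication request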
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::HomogeneousCopyTask
  : public ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::CopyTask {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _sourceFields;
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _targetFields;

  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _targetCells;
  const thrust::device_vector<CellID> _sourceCells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:
  HomogeneousCopyTask(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
    _sourceFields(source.getDataRegistry().deviceFieldArrays(fields)),
    _targetFields(target.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target),
    _source(source),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    OLB_ASSERT(_sourceCells.size() == _targetCells.size(),
               "Source cell count must match target cell count");
  }

  ~HomogeneousCopyTask()
  {
    wait();
  }

  void copy() override
  {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> sourceLattice(_source);
      gpu::cuda::DeviceContext<T,DESCRIPTOR> targetLattice(_target);
      gpu::cuda::async_copy_field<descriptors::POPULATION>(_stream->get(), sourceLattice, targetLattice, _sourceCells, _targetCells);
    } else {
      gpu::cuda::async_copy_any_fields(_stream->get(), _sourceFields, _targetFields, _sourceCells, _targetCells);
    }
  }

  void wait() override
  {
    _stream->synchronize();
  }

};

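/// Private implementation of heterogeneous copy task between CPU_* source and GPU_CUDA target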
template <typename T, typename DESCRIPTOR, Platform SOURCE>
class HeterogeneousCopyTaskDataForGpuTarget : public HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::Data {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _targetFields;

  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _targetCells;
  const std::vector<CellID>& _sourceCells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;

  MultiConcreteCommunicatable<ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>> _source;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

  gpu::cuda::Column<std::uint8_t> _buffer;

public:
  HeterogeneousCopyTaskDataForGpuTarget(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>& source):
    _targetFields(target.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target),
    _source(source, fields),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking)),
    _buffer(_source.size(_sourceCells))
  { }

  void copy() override {
    _source.serialize(_sourceCells, _buffer.data());
    _buffer.setProcessingContext(ProcessingContext::Simulation);

    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_target);
      gpu::cuda::async_scatter_field<descriptors::POPULATION>(_stream->get(), lattice, _targetCells, _buffer.deviceData());
    } else {
      gpu::cuda::async_scatter_any_fields(_stream->get(), _targetFields, _targetCells, _buffer.deviceData());
    }
  }

  void wait() override {
    _stream->synchronize();
  }

};

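/// Private implementation of heterogeneous copy task between GPU_CUDA source and CPU_* target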
template <typename T, typename DESCRIPTOR, Platform TARGET>
class HeterogeneousCopyTaskDataForGpuSource : public HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::Data {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _sourceFields;

  const std::vector<CellID>& _targetCells;
  const thrust::device_vector<CellID> _sourceCells;

  MultiConcreteCommunicatable<ConcreteBlockLattice<T,DESCRIPTOR,TARGET>> _target;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;

  const bool _onlyPopulationField;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

  gpu::cuda::Column<std::uint8_t> _buffer;

public:
  HeterogeneousCopyTaskDataForGpuSource(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,TARGET>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
    _sourceFields(source.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target, fields),
    _source(source),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking)),
    _buffer(_target.size(_targetCells))
  { }

  void copy() override {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_source);
      gpu::cuda::async_gather_field<descriptors::POPULATION>(_stream->get(), lattice, _sourceCells, _buffer.deviceData());
    } else {
      gpu::cuda::async_gather_any_fields(_stream->get(), _sourceFields, _sourceCells, _buffer.deviceData());
    }

  }

  void wait() override {
    _stream->synchronize();
    _target.deserialize(_targetCells, _buffer.data());
  }

};

template <typename T, typename DESCRIPTOR, Platform SOURCE>
HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::HeterogeneousCopyTask(
  const std::vector<std::type_index>& fields,
  const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
  const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>& source):
  _impl(new HeterogeneousCopyTaskDataForGpuTarget<T,DESCRIPTOR,SOURCE>(fields,
                                                                       targetCells, target,
                                                                       sourceCells, source))
{
  OLB_ASSERT(sourceCells.size() == targetCells.size(),
             "Source cell count must match target cell count");
}

template <typename T, typename DESCRIPTOR, Platform SOURCE>
void HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::copy()
{
  _impl->copy();
}

template <typename T, typename DESCRIPTOR, Platform SOURCE>
void HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::wait()
{
  _impl->wait();
}

template <typename T, typename DESCRIPTOR, Platform TARGET>
HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::HeterogeneousCopyTask(
  const std::vector<std::type_index>& fields,
  const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,TARGET>& target,
  const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
  _impl(new HeterogeneousCopyTaskDataForGpuSource<T,DESCRIPTOR,TARGET>(fields,
                                                                       targetCells, target,
                                                                       sourceCells, source))
{
  OLB_ASSERT(sourceCells.size() == targetCells.size(),
             "Source cell count must match target cell count");
}

template <typename T, typename DESCRIPTOR, Platform TARGET>
void HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::copy()
{
  _impl->copy();
}

template <typename T, typename DESCRIPTOR, Platform TARGET>
void HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::wait()
{
  _impl->wait();
}

template <typename T, typename DESCRIPTOR>
ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::ConcreteBlockCommunicator(
  SuperLattice<T,DESCRIPTOR>& super,
  LoadBalancer<T>& loadBalancer,
#ifdef PARALLEL_MODE_MPI
  SuperCommunicationTagCoordinator<T>& tagCoordinator,
  MPI_Comm comm,
#endif
  int iC,
  const BlockCommunicationNeighborhood<T,DESCRIPTOR::d>& neighborhood):
  _iC(iC)
#ifdef PARALLEL_MODE_MPI
, _mpiCommunicator(comm)
#endif
{
#ifdef PARALLEL_MODE_MPI
  neighborhood.forNeighbors([&](int remoteC) {
    if (loadBalancer.isLocal(remoteC)) {
      const Platform remotePlatform = loadBalancer.platform(loadBalancer.loc(remoteC));
      if (!neighborhood.getCellsInboundFrom(remoteC).empty()) {
        switch (remotePlatform) {
        case Platform::GPU_CUDA:
          // Use manual copy for local GPU-GPU communication due to better performance
          _copyTasks.emplace_back(new HomogeneousCopyTask(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(loadBalancer.loc(remoteC))));
          break;
        case Platform::CPU_SIMD:
          // Use manual copy for local GPU-CPU communication due to better performance
          _copyTasks.emplace_back(new HeterogeneousCopyTask<T,DESCRIPTOR,Platform::CPU_SIMD,Platform::GPU_CUDA>(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::CPU_SIMD>>(loadBalancer.loc(remoteC))));
          break;
        case Platform::CPU_SISD:
          // Use manual copy for local GPU-CPU communication due to better performance
          _copyTasks.emplace_back(new HeterogeneousCopyTask<T,DESCRIPTOR,Platform::CPU_SISD,Platform::GPU_CUDA>(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::CPU_SISD>>(loadBalancer.loc(remoteC))));
          break;
        default:
          throw std::runtime_error("Invalid remote PLATFORM");
        }
      }
    } else {
      // Handling of non-local GPU-GPU and GPU-CPU communication in general
      if (!neighborhood.getCellsOutboundTo(remoteC).empty()) {
        _sendTasks.emplace_back(std::make_unique<SendTask>(
          _mpiCommunicator, tagCoordinator.get(loadBalancer.glob(_iC), remoteC),
          loadBalancer.rank(remoteC),
          neighborhood.getFieldsCommonWith(remoteC),
          neighborhood.getCellsOutboundTo(remoteC),
          super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC)));
      }
      if (!neighborhood.getCellsInboundFrom(remoteC).empty()) {
        _recvTasks.emplace_back(std::make_unique<RecvTask>(
          _mpiCommunicator, tagCoordinator.get(remoteC, loadBalancer.glob(_iC)),
          loadBalancer.rank(remoteC),
          neighborhood.getFieldsCommonWith(remoteC),
          neighborhood.getCellsInboundFrom(remoteC),
          super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC)));
      }
    }
  });

#else // not using PARALLEL_MODE_MPI
  neighborhood.forNeighbors([&](int localC) {
    if (!neighborhood.getCellsInboundFrom(localC).empty()) {
      _copyTasks.emplace_back(new HomogeneousCopyTask(
        neighborhood.getFieldsCommonWith(localC),
        neighborhood.getCellsInboundFrom(localC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
        neighborhood.getCellsRequestedFrom(localC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(loadBalancer.loc(localC))));
    }
  });
#endif
}

template <typename T, typename DESCRIPTOR>
ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::~ConcreteBlockCommunicator()
{ }

#ifdef PARALLEL_MODE_MPI

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::receive()
{
  for (auto& task : _recvTasks) {
    task->receive();
  }
}

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::send()
{
  for (auto& task : _sendTasks) {
    task->prepare();
  }
  for (auto& task : _sendTasks) {
    task->send();
  }
  for (auto& task : _copyTasks) {
    task->copy();
  }
}

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::unpack()
{
  std::set<typename RecvTask::ref> pending(_recvTasks.begin(), _recvTasks.end());
  while (!pending.empty()) {
    auto task_iterator = pending.begin();
    while (task_iterator != pending.end()) {
      auto& task = *task_iterator;
      if (task->isDone()) {
        task->unpack();
        task_iterator = pending.erase(task_iterator);
      }
      else {
        ++task_iterator;
      }
    }
  }
}

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::wait()
{
  for (auto& task : _copyTasks) {
    task->wait();
  }
  for (auto& task : _recvTasks) {
    task->wait();
  }
  for (auto& task : _sendTasks) {
    task->wait();
  }
}

#else // not using PARALLEL_MODE_MPI

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::copy()
{
  for (auto& task : _copyTasks) {
    task->copy();
  }
}

#endif

}

#endif