#ifndef PLATFORM_GPU_CUDA_COMMUNICATOR_HH
#define PLATFORM_GPU_CUDA_COMMUNICATOR_HH

#include <thrust/device_vector.h>

#ifdef PARALLEL_MODE_MPI
#if defined(OPEN_MPI) && OPEN_MPI
#include <mpi-ext.h> // provides MPIX_CUDA_AWARE_SUPPORT on Open MPI
#endif
#endif

namespace olb {
/// Verifies availability of CUDA devices and CUDA-aware MPI support
template <>
void checkPlatform<Platform::GPU_CUDA>()
{
  OstreamManager clout(std::cout, "GPU_CUDA");

  int nDevices{};
  cudaGetDeviceCount(&nDevices);
  if (nDevices < 1) {
    clout << "No CUDA device found" << std::endl;
  }
  else if (nDevices > 1) {
    clout << "Found " << nDevices << " CUDA devices but only one can be used per MPI process." << std::endl;
  }

  // List the visible devices on every process
  clout.setMultiOutput(true);
  for (int deviceI=0; deviceI < nDevices; ++deviceI) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceI);
    clout << deviceProp.name << " visible" << std::endl;
  }
  clout.setMultiOutput(false);
#ifdef PARALLEL_MODE_MPI
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
  if (!MPIX_Query_cuda_support()) {
    clout << "The used MPI Library is not CUDA-aware. Multi-GPU execution will fail." << std::endl;
  }
#endif
#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
  clout << "The used MPI Library is not CUDA-aware. Multi-GPU execution will fail." << std::endl;
#endif
#if !defined(MPIX_CUDA_AWARE_SUPPORT)
  clout << "Unable to check for CUDA-aware MPI support. Multi-GPU execution may fail." << std::endl;
#endif
#endif // PARALLEL_MODE_MPI
}
namespace gpu {

namespace cuda {

namespace kernel {

/// CUDA kernel for gathering FIELD data of lattice at indices into buffer
template <typename CONTEXT, typename FIELD>
void gather_field(CONTEXT lattice,
                  const CellID* indices, std::size_t nIndices,
                  typename FIELD::template value_type<typename CONTEXT::value_t>* buffer) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* field = lattice.template getField<FIELD>();
  for (unsigned iD=0; iD < CONTEXT::descriptor_t::template size<FIELD>(); ++iD) {
    buffer[iD*nIndices+iIndex] = field[iD][indices[iIndex]];
  }
}
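/* The gather uses a structure-of-arrays buffer layout: component iD of cell
   indices[iIndex] lands at buffer[iD*nIndices + iIndex]. Gathering e.g. a
   D2Q9 POPULATION field (9 components) at 4 cells c0..c3 yields
     [ f0(c0) f0(c1) f0(c2) f0(c3) | f1(c0) .. f1(c3) | ... | f8(c0) .. f8(c3) ],
   so consecutive threads write consecutive buffer elements, keeping the
   stores coalesced. */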
/// CUDA kernel for copying FIELD data of sourceLattice at sourceIndices into targetLattice at targetIndices
template <typename SOURCE, typename TARGET, typename FIELD>
void copy_field(SOURCE sourceLattice, TARGET targetLattice,
                const CellID* sourceIndices,
                const CellID* targetIndices,
                std::size_t nIndices) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* source = sourceLattice.template getField<FIELD>();
  auto* target = targetLattice.template getField<FIELD>();
  for (unsigned iD=0; iD < SOURCE::descriptor_t::template size<FIELD>(); ++iD) {
    target[iD][targetIndices[iIndex]] = source[iD][sourceIndices[iIndex]];
  }
}
/// CUDA kernel for gathering given fields at indices into buffer
__global__ void gather_any_fields(AnyDeviceFieldArrayD* fields, std::size_t nFields,
                                  const CellID* indices, std::size_t nIndices,
                                  std::uint8_t* buffer) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& field = fields[iField];
    for (unsigned iD=0; iD < field.column_count; ++iD) {
      memcpy(buffer + (iD*nIndices + iIndex)*field.element_size,
             field[iD] + indices[iIndex]*field.element_size,
             field.element_size);
    }
    buffer += nIndices*field.column_count*field.element_size;
  }
}
/// CUDA kernel for copying sourceFields at sourceIndices to targetFields at targetIndices
__global__ void copy_any_fields(AnyDeviceFieldArrayD* sourceFields,
                                AnyDeviceFieldArrayD* targetFields,
                                std::size_t nFields,
                                const CellID* sourceIndices,
                                const CellID* targetIndices,
                                std::size_t nIndices) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& sourceField = sourceFields[iField];
    auto& targetField = targetFields[iField];
    for (unsigned iD=0; iD < sourceField.column_count; ++iD) {
      memcpy(targetField[iD] + targetIndices[iIndex]*sourceField.element_size,
             sourceField[iD] + sourceIndices[iIndex]*sourceField.element_size,
             sourceField.element_size);
    }
  }
}
/// CUDA kernel for scattering FIELD data in buffer to indices in lattice
template <typename CONTEXT, typename FIELD>
void scatter_field(CONTEXT lattice,
                   const CellID* indices, std::size_t nIndices,
                   typename FIELD::template value_type<typename CONTEXT::value_t>* buffer) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* field = lattice.template getField<FIELD>();
  for (unsigned iD=0; iD < CONTEXT::descriptor_t::template size<FIELD>(); ++iD) {
    field[iD][indices[iIndex]] = buffer[iD*nIndices+iIndex];
  }
}
/// CUDA kernel for scattering given fields in buffer to indices in lattice
__global__ void scatter_any_fields(AnyDeviceFieldArrayD* fields, std::size_t nFields,
                                   const CellID* indices, std::size_t nIndices,
                                   std::uint8_t* buffer) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& field = fields[iField];
    for (unsigned iD=0; iD < field.column_count; ++iD) {
      memcpy(field[iD] + indices[iIndex]*field.element_size,
             buffer + (iD*nIndices + iIndex)*field.element_size,
             field.element_size);
    }
    buffer += nIndices*field.column_count*field.element_size;
  }
}

} // namespace kernel
/// Blocking gather of FIELD at given indices into buffer
template <typename FIELD, typename CONTEXT>
void gather_field(CONTEXT& lattice, const thrust::device_vector<CellID>& indices, std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_field<CONTEXT,FIELD><<<block_count,block_size>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}
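/* Launch arithmetic example: with block_size = 32, gathering 1000 cells yields
   block_count = (1000 + 31) / 32 = 32 blocks, i.e. 1024 threads; the
   iIndex < nIndices guard in the kernel masks out the 24 surplus threads. */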
/// Non-blocking gather of FIELD at given indices into buffer
template <typename FIELD, typename CONTEXT>
void async_gather_field(cudaStream_t stream, CONTEXT& lattice,
                        const thrust::device_vector<CellID>& indices,
                        std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_field<CONTEXT,FIELD><<<block_count,block_size,0,stream>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}
/// Non-blocking copy of FIELD at given indices from sourceLattice to targetLattice
template <typename FIELD, typename SOURCE, typename TARGET>
void async_copy_field(cudaStream_t stream,
                      SOURCE& sourceLattice,
                      TARGET& targetLattice,
                      const thrust::device_vector<CellID>& sourceIndices,
                      const thrust::device_vector<CellID>& targetIndices) {
  const auto block_size = 32;
  const auto block_count = (sourceIndices.size() + block_size - 1) / block_size;
  kernel::copy_field<SOURCE,TARGET,FIELD><<<block_count,block_size,0,stream>>>(
    sourceLattice, targetLattice,
    sourceIndices.data().get(),
    targetIndices.data().get(),
    sourceIndices.size());
  device::check();
}
/// Blocking gather of given fields at indices into buffer
void gather_any_fields(thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                       const thrust::device_vector<CellID>& indices,
                       std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_any_fields<<<block_count,block_size>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}
/// Non-blocking gather of given fields at indices into buffer
void async_gather_any_fields(cudaStream_t stream,
                             thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                             const thrust::device_vector<CellID>& indices,
                             std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_any_fields<<<block_count,block_size,0,stream>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}
/// Non-blocking copy of given fields at sourceIndices to targetIndices
void async_copy_any_fields(cudaStream_t stream,
                           thrust::device_vector<AnyDeviceFieldArrayD>& sourceFields,
                           thrust::device_vector<AnyDeviceFieldArrayD>& targetFields,
                           const thrust::device_vector<CellID>& sourceIndices,
                           const thrust::device_vector<CellID>& targetIndices) {
  const auto block_size = 32;
  const auto block_count = (sourceIndices.size() + block_size - 1) / block_size;
  kernel::copy_any_fields<<<block_count,block_size,0,stream>>>(
    sourceFields.data().get(), targetFields.data().get(), sourceFields.size(),
    sourceIndices.data().get(), targetIndices.data().get(), sourceIndices.size());
  device::check();
}
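/* Illustrative sketch (not part of the original header): the async_* variants
   are stream-ordered, so independent transfers can overlap when issued on
   distinct streams. `latticeA`/`latticeB` and the index/buffer names below
   are assumed for illustration:

     gpu::cuda::device::Stream streamA(cudaStreamNonBlocking);
     gpu::cuda::device::Stream streamB(cudaStreamNonBlocking);
     gpu::cuda::async_gather_field<descriptors::POPULATION>(
       streamA.get(), latticeA, indicesA, bufferA);
     gpu::cuda::async_gather_field<descriptors::POPULATION>(
       streamB.get(), latticeB, indicesB, bufferB);
     streamA.synchronize(); // only block once the buffers are actually needed
     streamB.synchronize();
*/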
/// Blocking scatter of FIELD data in buffer to given indices
template <typename FIELD, typename CONTEXT>
void scatter_field(CONTEXT& lattice, const thrust::device_vector<CellID>& indices, std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_field<CONTEXT,FIELD><<<block_count,block_size>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}
/// Non-blocking scatter of FIELD data in buffer to given indices
template <typename FIELD, typename CONTEXT>
void async_scatter_field(cudaStream_t stream, CONTEXT& lattice,
                         const thrust::device_vector<CellID>& indices,
                         std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_field<CONTEXT,FIELD><<<block_count,block_size,0,stream>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}
/// Blocking scatter of given fields in buffer to given indices
void scatter_any_fields(thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                        const thrust::device_vector<CellID>& indices,
                        std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_any_fields<<<block_count,block_size>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}
/// Non-blocking scatter of given fields in buffer to given indices
void async_scatter_any_fields(cudaStream_t stream,
                              thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                              const thrust::device_vector<CellID>& indices,
                              std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_any_fields<<<block_count,block_size,0,stream>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

} // namespace cuda

} // namespace gpu
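/* Illustrative round trip (not part of the original header): the blocking
   wrappers combine into a simple copy through a staging buffer. `lattice`,
   `indices`, `targetIndices` and `bufferSize` are assumed names; the buffer
   must be sized to hold all gathered components:

     thrust::device_vector<std::uint8_t> buffer(bufferSize);
     gpu::cuda::gather_field<descriptors::POPULATION>(
       lattice, indices, buffer.data().get());
     gpu::cuda::scatter_field<descriptors::POPULATION>(
       lattice, targetIndices, buffer.data().get());
*/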
#ifdef PARALLEL_MODE_MPI

/// Wrapper for a non-blocking block propagation send request
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::SendTask {
private:
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _block;
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _fields;
  const bool _onlyPopulationField;
  const thrust::device_vector<CellID> _cells;
  gpu::cuda::device::unique_ptr<std::uint8_t> _buffer;
  std::unique_ptr<MpiSendRequest> _request;
  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:
  SendTask(MPI_Comm comm, int tag, int rank,
           const std::vector<std::type_index>& fields,
           const std::vector<CellID>& cells,
           ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& block):
    _block(block),
    _fields(block.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _cells(cells),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    std::size_t size = 0;
    for (auto& field : fields) {
      size += block.getCommunicatable(field).size(cells);
    }
    _buffer = gpu::cuda::device::malloc<std::uint8_t>(size);
    _request = std::make_unique<MpiSendRequest>(
      _buffer.get(), size, rank, tag, comm);
  }

  ~SendTask() {
    _stream->synchronize();
  }

  /// Launch the asynchronous device-side gather into the send buffer
  void prepare() {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_block);
      gpu::cuda::async_gather_field<descriptors::POPULATION>(_stream->get(), lattice, _cells, _buffer.get());
    } else {
      gpu::cuda::async_gather_any_fields(_stream->get(), _fields, _cells, _buffer.get());
    }
  }

  /// Synchronize the gather stream and start the MPI send
  void send() {
    _stream->synchronize();
    _request->start();
  }

  void wait() {
    _request->wait();
  }
};
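/* SendTask lifecycle as driven by ConcreteBlockCommunicator::send() further
   below: prepare() launches the asynchronous device-side gather, send()
   synchronizes the gather stream and starts the MPI request directly on the
   device buffer (hence the CUDA-awareness check in checkPlatform above), and
   wait() blocks until MPI has completed the send. */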
/// Wrapper for a non-blocking block propagation receive request
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::RecvTask {
private:
  const int _tag;
  const int _rank;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _block;
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _fields;
  const bool _onlyPopulationField;
  const thrust::device_vector<CellID> _cells;
  gpu::cuda::device::unique_ptr<std::uint8_t> _buffer;
  std::unique_ptr<MpiRecvRequest> _request;
  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:
  /// Manual incremental construction of a std::set of RecvTask requires a reference wrapper
  class ref {
  private:
    RecvTask& _task;
  public:
    ref(std::unique_ptr<RecvTask>& task): _task(*task) { };

    RecvTask* operator->() const {
      return &_task;
    }

    bool operator <(const ref& rhs) const {
      return _task < rhs._task;
    }
  };

  RecvTask(MPI_Comm comm, int tag, int rank,
           const std::vector<std::type_index>& fields,
           const std::vector<CellID>& cells,
           ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& block):
    _tag(tag),
    _rank(rank),
    _block(block),
    _fields(block.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _cells(cells),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    std::size_t size = 0;
    for (auto& field : fields) {
      size += block.getCommunicatable(field).size(cells);
    }
    _buffer = gpu::cuda::device::malloc<std::uint8_t>(size);
    _request = std::make_unique<MpiRecvRequest>(
      _buffer.get(), size, _rank, _tag, comm);
  }

  bool operator <(const RecvTask& rhs) const {
    return _rank < rhs._rank
        || (_rank == rhs._rank && _tag < rhs._tag);
  }

  void receive() { _request->start(); }
  bool isDone()  { return _request->isDone(); }

  /// Launch the asynchronous device-side scatter of the received buffer
  void unpack() {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_block);
      gpu::cuda::async_scatter_field<descriptors::POPULATION>(_stream->get(), lattice, _cells, _buffer.get());
    } else {
      gpu::cuda::async_scatter_any_fields(_stream->get(), _fields, _cells, _buffer.get());
    }
  }

  void wait() {
    _stream->synchronize();
  }
};
#endif // PARALLEL_MODE_MPI

/// Virtual interface for the local copy tasks scheduled by the communicator
template <typename T, typename DESCRIPTOR>
struct ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::CopyTask {
  virtual ~CopyTask() { }
  virtual void copy() = 0;
  virtual void wait() = 0;
};
/// Wrapper for a local plain copy between two GPU_CUDA blocks
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::HomogeneousCopyTask final : public CopyTask {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _sourceFields;
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _targetFields;
  const bool _onlyPopulationField;
  const thrust::device_vector<CellID> _targetCells;
  const thrust::device_vector<CellID> _sourceCells;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;
  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:
  HomogeneousCopyTask(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
    _sourceFields(source.getDataRegistry().deviceFieldArrays(fields)),
    _targetFields(target.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target),
    _source(source),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    OLB_ASSERT(_sourceCells.size() == _targetCells.size(),
               "Source cell count must match target cell count");
  }

  void copy() override {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> sourceLattice(_source);
      gpu::cuda::DeviceContext<T,DESCRIPTOR> targetLattice(_target);
      gpu::cuda::async_copy_field<descriptors::POPULATION>(_stream->get(), sourceLattice, targetLattice, _sourceCells, _targetCells);
    } else {
      gpu::cuda::async_copy_any_fields(_stream->get(), _sourceFields, _targetFields, _sourceCells, _targetCells);
    }
  }

  void wait() override {
    _stream->synchronize();
  }
};
/// Private implementation of the heterogeneous copy task for CPU_* source and GPU_CUDA target
template <typename T, typename DESCRIPTOR, Platform SOURCE>
class HeterogeneousCopyTaskDataForGpuTarget {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _targetFields;
  const bool _onlyPopulationField;
  const thrust::device_vector<CellID> _targetCells;
  const std::vector<CellID>& _sourceCells;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;
  // Communicatable view of the CPU-side source block restricted to the given fields
  MultiConcreteCommunicatable<ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>> _source;
  std::unique_ptr<gpu::cuda::device::Stream> _stream;
  gpu::cuda::Column<std::uint8_t> _buffer;

public:
  HeterogeneousCopyTaskDataForGpuTarget(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>& source):
    _targetFields(target.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target),
    _source(source, fields),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking)),
    _buffer(_source.size(_sourceCells))
  { }

  /// Serialize on the host, stage to the device and launch the asynchronous scatter
  void copy() {
    _source.serialize(_sourceCells, _buffer.data());
    _buffer.setProcessingContext(ProcessingContext::Simulation);
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_target);
      gpu::cuda::async_scatter_field<descriptors::POPULATION>(_stream->get(), lattice, _targetCells, _buffer.deviceData());
    } else {
      gpu::cuda::async_scatter_any_fields(_stream->get(), _targetFields, _targetCells, _buffer.deviceData());
    }
  }

  void wait() {
    _stream->synchronize();
  }
};
/// Private implementation of the heterogeneous copy task for GPU_CUDA source and CPU_* target
template <typename T, typename DESCRIPTOR, Platform TARGET>
class HeterogeneousCopyTaskDataForGpuSource {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _sourceFields;
  const bool _onlyPopulationField;
  const std::vector<CellID>& _targetCells;
  const thrust::device_vector<CellID> _sourceCells;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;
  // Communicatable view of the CPU-side target block restricted to the given fields
  MultiConcreteCommunicatable<ConcreteBlockLattice<T,DESCRIPTOR,TARGET>> _target;
  std::unique_ptr<gpu::cuda::device::Stream> _stream;
  gpu::cuda::Column<std::uint8_t> _buffer;

public:
  HeterogeneousCopyTaskDataForGpuSource(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,TARGET>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
    _sourceFields(source.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _source(source),
    _target(target, fields),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking)),
    _buffer(_target.size(_targetCells))
  { }

  /// Launch the asynchronous device-side gather into the staging buffer
  void copy() {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_source);
      gpu::cuda::async_gather_field<descriptors::POPULATION>(_stream->get(), lattice, _sourceCells, _buffer.deviceData());
    } else {
      gpu::cuda::async_gather_any_fields(_stream->get(), _sourceFields, _sourceCells, _buffer.deviceData());
    }
  }

  /// Stage the gathered data to the host and deserialize into the target block
  void wait() {
    _stream->synchronize();
    _buffer.setProcessingContext(ProcessingContext::Evaluation);
    _target.deserialize(_targetCells, _buffer.data());
  }
};
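/* Heterogeneous staging summary: with a GPU_CUDA source the data flows
     device gather -> stream sync -> device-to-host staging
     (setProcessingContext(ProcessingContext::Evaluation)) -> host deserialize,
   while with a GPU_CUDA target the same pipeline runs in reverse
   (host serialize -> host-to-device staging -> device scatter). */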
template <typename T, typename DESCRIPTOR, Platform SOURCE>
HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::HeterogeneousCopyTask(
  const std::vector<std::type_index>& fields,
  const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
  const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>& source):
  _data(new HeterogeneousCopyTaskDataForGpuTarget<T,DESCRIPTOR,SOURCE>(
    fields, targetCells, target, sourceCells, source))
{
  OLB_ASSERT(sourceCells.size() == targetCells.size(),
             "Source cell count must match target cell count");
}
template <typename T, typename DESCRIPTOR, Platform SOURCE>
void HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::copy()
{
  _data->copy();
}

template <typename T, typename DESCRIPTOR, Platform SOURCE>
void HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::wait()
{
  _data->wait();
}
template <typename T, typename DESCRIPTOR, Platform TARGET>
HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::HeterogeneousCopyTask(
  const std::vector<std::type_index>& fields,
  const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,TARGET>& target,
  const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
  _data(new HeterogeneousCopyTaskDataForGpuSource<T,DESCRIPTOR,TARGET>(
    fields, targetCells, target, sourceCells, source))
{
  OLB_ASSERT(sourceCells.size() == targetCells.size(),
             "Source cell count must match target cell count");
}
template <typename T, typename DESCRIPTOR, Platform TARGET>
void HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::copy()
{
  _data->copy();
}

template <typename T, typename DESCRIPTOR, Platform TARGET>
void HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::wait()
{
  _data->wait();
}
template <typename T, typename DESCRIPTOR>
ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::ConcreteBlockCommunicator(
  SuperLattice<T,DESCRIPTOR>& super,
  LoadBalancer<T>& loadBalancer,
#ifdef PARALLEL_MODE_MPI
  SuperCommunicationTagCoordinator<T>& tagCoordinator,
  MPI_Comm comm,
#endif
  int iC,
  const BlockCommunicationNeighborhood<T,DESCRIPTOR::d>& neighborhood):
  _iC(iC)
#ifdef PARALLEL_MODE_MPI
, _mpiCommunicator(comm)
#endif
{
#ifdef PARALLEL_MODE_MPI
  neighborhood.forNeighbors([&](int remoteC) {
    if (loadBalancer.isLocal(remoteC)) {
      const Platform remotePlatform = loadBalancer.platform(loadBalancer.loc(remoteC));
      if (!neighborhood.getCellsInboundFrom(remoteC).empty()) {
        switch (remotePlatform) {
        case Platform::GPU_CUDA:
          // Plain device-to-device copy between two local GPU blocks
          _copyTasks.emplace_back(new HomogeneousCopyTask(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(loadBalancer.loc(remoteC))));
          break;
        case Platform::CPU_SIMD:
          // Staged copy from a local CPU_SIMD block into this GPU block
          _copyTasks.emplace_back(new HeterogeneousCopyTask<T,DESCRIPTOR,Platform::CPU_SIMD,Platform::GPU_CUDA>(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::CPU_SIMD>>(loadBalancer.loc(remoteC))));
          break;
        case Platform::CPU_SISD:
          // Staged copy from a local CPU_SISD block into this GPU block
          _copyTasks.emplace_back(new HeterogeneousCopyTask<T,DESCRIPTOR,Platform::CPU_SISD,Platform::GPU_CUDA>(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::CPU_SISD>>(loadBalancer.loc(remoteC))));
          break;
        default:
          throw std::runtime_error("Invalid remote PLATFORM");
        }
      }
    }
    else {
      if (!neighborhood.getCellsOutboundTo(remoteC).empty()) {
        _sendTasks.emplace_back(std::make_unique<SendTask>(
          _mpiCommunicator, tagCoordinator.get(loadBalancer.glob(_iC), remoteC),
          loadBalancer.rank(remoteC),
          neighborhood.getFieldsCommonWith(remoteC),
          neighborhood.getCellsOutboundTo(remoteC),
          super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC)));
      }
      if (!neighborhood.getCellsInboundFrom(remoteC).empty()) {
        _recvTasks.emplace_back(std::make_unique<RecvTask>(
          _mpiCommunicator, tagCoordinator.get(remoteC, loadBalancer.glob(_iC)),
          loadBalancer.rank(remoteC),
          neighborhood.getFieldsCommonWith(remoteC),
          neighborhood.getCellsInboundFrom(remoteC),
          super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC)));
      }
    }
  });
#else // not using MPI
  neighborhood.forNeighbors([&](int localC) {
    if (!neighborhood.getCellsInboundFrom(localC).empty()) {
      _copyTasks.emplace_back(new HomogeneousCopyTask(
        neighborhood.getFieldsCommonWith(localC),
        neighborhood.getCellsInboundFrom(localC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
        neighborhood.getCellsRequestedFrom(localC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(loadBalancer.loc(localC))));
    }
  });
#endif
}
template <typename T, typename DESCRIPTOR>
ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::~ConcreteBlockCommunicator()
{ }

#ifdef PARALLEL_MODE_MPI
template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::receive()
{
  for (auto& task : _recvTasks) {
    task->receive();
  }
}
template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::send()
{
  for (auto& task : _sendTasks) {
    task->prepare();
  }
  for (auto& task : _sendTasks) {
    task->send();
  }
  for (auto& task : _copyTasks) {
    task->copy();
  }
}
template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::unpack()
{
  // Poll all pending receives and scatter each one as soon as it completes
  std::set<typename RecvTask::ref> pending(_recvTasks.begin(), _recvTasks.end());
  while (!pending.empty()) {
    auto task_iterator = pending.begin();
    while (task_iterator != pending.end()) {
      auto& task = *task_iterator;
      if (task->isDone()) {
        task->unpack();
        task_iterator = pending.erase(task_iterator);
      }
      else {
        ++task_iterator;
      }
    }
  }
}
template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::wait()
{
  for (auto& task : _copyTasks) {
    task->wait();
  }
  for (auto& task : _recvTasks) {
    task->wait();
  }
  for (auto& task : _sendTasks) {
    task->wait();
  }
}
#else // not using MPI

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::copy()
{
  for (auto& task : _copyTasks) {
    task->copy();
  }
}

#endif // PARALLEL_MODE_MPI

} // namespace olb

#endif // PLATFORM_GPU_CUDA_COMMUNICATOR_HH