OpenLB 1.7
communicator.hh
/* This file is part of the OpenLB library
 *
 * Copyright (C) 2022 Adrian Kummerlaender
 * E-mail contact: info@openlb.net
 * The most recent release of OpenLB can be downloaded at
 * <http://www.openlb.net/>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
*/

#ifndef PLATFORM_GPU_CUDA_COMMUNICATOR_HH
#define PLATFORM_GPU_CUDA_COMMUNICATOR_HH

#include "registry.h"
#include "context.hh"

#include <thrust/device_vector.h>

#ifdef PARALLEL_MODE_MPI
#include "mpi.h"
#if defined(OPEN_MPI) && OPEN_MPI
#include <mpi-ext.h>
#endif
#endif

namespace olb {

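/// Verifies availability of CUDA device and MPI support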
template <>
void checkPlatform<Platform::GPU_CUDA>()
{
  OstreamManager clout(std::cout, "GPU_CUDA");

  int nDevices{};
  cudaGetDeviceCount(&nDevices);

  clout.setMultiOutput(true);
  if (nDevices < 1) {
    clout << "No CUDA device found" << std::endl;
  } else if (nDevices > 1) {
    clout << "Found " << nDevices << " CUDA devices but only one can be used per MPI process." << std::endl;
  }
#ifdef OLB_DEBUG
  for (int deviceI=0; deviceI < nDevices; ++deviceI) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceI);
    clout << deviceProp.name << " visible" << std::endl;
  }
#endif
  clout.setMultiOutput(false);

#ifdef PARALLEL_MODE_MPI
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
  if (!MPIX_Query_cuda_support()) {
    clout << "The used MPI Library is not CUDA-aware. Multi-GPU execution will fail." << std::endl;
  }
#endif
#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
  clout << "The used MPI Library is not CUDA-aware. Multi-GPU execution will fail." << std::endl;
#endif
#if !defined(MPIX_CUDA_AWARE_SUPPORT)
  clout << "Unable to check for CUDA-aware MPI support. Multi-GPU execution may fail." << std::endl;
#endif
#endif // PARALLEL_MODE_MPI
}

namespace gpu {

namespace cuda {

namespace kernel {

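/// CUDA kernel for gathering FIELD data of lattice at indices into buffer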
template <typename CONTEXT, typename FIELD>
void gather_field(CONTEXT lattice,
                  const CellID* indices, std::size_t nIndices,
                  typename FIELD::template value_type<typename CONTEXT::value_t>* buffer) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* field = lattice.template getField<FIELD>();
  for (unsigned iD=0; iD < CONTEXT::descriptor_t::template size<FIELD>(); ++iD) {
    buffer[iD*nIndices+iIndex] = field[iD][indices[iIndex]];
  }
}

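/// CUDA kernel for copying FIELD data of sourceLattice at sourceIndices into targetLattice at targetIndices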
template <typename SOURCE, typename TARGET, typename FIELD>
void copy_field(SOURCE sourceLattice, TARGET targetLattice,
                const CellID* sourceIndices,
                const CellID* targetIndices,
                std::size_t nIndices) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* source = sourceLattice.template getField<FIELD>();
  auto* target = targetLattice.template getField<FIELD>();
  for (unsigned iD=0; iD < SOURCE::descriptor_t::template size<FIELD>(); ++iD) {
    target[iD][targetIndices[iIndex]] = source[iD][sourceIndices[iIndex]];
  }
}

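/// CUDA kernel for gathering fields at indices into buffer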
__global__ void gather_any_fields(AnyDeviceFieldArrayD* fields, std::size_t nFields,
                                  const CellID* indices, std::size_t nIndices,
                                  std::uint8_t* buffer) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& field = fields[iField];
    for (unsigned iD=0; iD < field.column_count; ++iD) {
      memcpy(buffer + (iD*nIndices + iIndex)*field.element_size,
             field[iD] + indices[iIndex]*field.element_size,
             field.element_size);
    }
    buffer += nIndices*field.column_count*field.element_size;
  }
}

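/// CUDA kernel for copying sourceFields at sourceIndices to targetFields at targetIndices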
__global__ void copy_any_fields(AnyDeviceFieldArrayD* sourceFields,
                                AnyDeviceFieldArrayD* targetFields,
                                std::size_t nFields,
                                const CellID* sourceIndices,
                                const CellID* targetIndices,
                                std::size_t nIndices) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& sourceField = sourceFields[iField];
    auto& targetField = targetFields[iField];
    for (unsigned iD=0; iD < sourceField.column_count; ++iD) {
      memcpy(targetField[iD] + targetIndices[iIndex]*sourceField.element_size,
             sourceField[iD] + sourceIndices[iIndex]*sourceField.element_size,
             sourceField.element_size);
    }
  }
}

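/// CUDA kernel for scattering FIELD data in buffer to indices in lattice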
template <typename CONTEXT, typename FIELD>
void scatter_field(CONTEXT lattice,
                   const CellID* indices, std::size_t nIndices,
                   typename FIELD::template value_type<typename CONTEXT::value_t>* buffer) __global__ {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  auto* field = lattice.template getField<FIELD>();
  for (unsigned iD=0; iD < CONTEXT::descriptor_t::template size<FIELD>(); ++iD) {
    field[iD][indices[iIndex]] = buffer[iD*nIndices+iIndex];
  }
}

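/// CUDA kernel for scattering fields in buffer to indices in lattice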
__global__ void scatter_any_fields(AnyDeviceFieldArrayD* fields, std::size_t nFields,
                                   const CellID* indices, std::size_t nIndices,
                                   std::uint8_t* buffer) {
  const CellID iIndex = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(iIndex < nIndices)) {
    return;
  }
  for (unsigned iField=0; iField < nFields; ++iField) {
    auto& field = fields[iField];
    for (unsigned iD=0; iD < field.column_count; ++iD) {
      memcpy(field[iD] + indices[iIndex]*field.element_size,
             buffer + (iD*nIndices + iIndex)*field.element_size,
             field.element_size);
    }
    buffer += nIndices*field.column_count*field.element_size;
  }
}

}

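/// Blocking gather of FIELD at given indices into buffer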
template <typename FIELD, typename CONTEXT>
void gather_field(CONTEXT& lattice, const thrust::device_vector<CellID>& indices, std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_field<CONTEXT,FIELD><<<block_count,block_size>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

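/// Non-blocking gather of FIELD at given indices into buffer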
template <typename FIELD, typename CONTEXT>
void async_gather_field(cudaStream_t stream,
                        CONTEXT& lattice,
                        const thrust::device_vector<CellID>& indices,
                        std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_field<CONTEXT,FIELD><<<block_count,block_size,0,stream>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

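/// Non-blocking copy of FIELD at given indices from sourceLattice to targetLattice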
template <typename FIELD, typename SOURCE, typename TARGET>
void async_copy_field(cudaStream_t stream,
                      SOURCE& sourceLattice,
                      TARGET& targetLattice,
                      const thrust::device_vector<CellID>& sourceIndices,
                      const thrust::device_vector<CellID>& targetIndices) {
  const auto block_size = 32;
  const auto block_count = (sourceIndices.size() + block_size - 1) / block_size;
  kernel::copy_field<SOURCE,TARGET,FIELD><<<block_count,block_size,0,stream>>>(
    sourceLattice,
    targetLattice,
    sourceIndices.data().get(),
    targetIndices.data().get(),
    sourceIndices.size());
  device::check();
}

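/// Blocking gather of fields at given indices into buffer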
void gather_any_fields(thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                       const thrust::device_vector<CellID>& indices,
                       std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_any_fields<<<block_count,block_size>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

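/// Non-blocking gather of fields at given indices into buffer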
void async_gather_any_fields(cudaStream_t stream,
                             thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                             const thrust::device_vector<CellID>& indices,
                             std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::gather_any_fields<<<block_count,block_size,0,stream>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

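/// Non-blocking copy of fields at given indices from sourceIndices to targetIndices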
void async_copy_any_fields(cudaStream_t stream,
                           thrust::device_vector<AnyDeviceFieldArrayD>& sourceFields,
                           thrust::device_vector<AnyDeviceFieldArrayD>& targetFields,
                           const thrust::device_vector<CellID>& sourceIndices,
                           const thrust::device_vector<CellID>& targetIndices) {
  const auto block_size = 32;
  const auto block_count = (sourceIndices.size() + block_size - 1) / block_size;
  kernel::copy_any_fields<<<block_count,block_size,0,stream>>>(
    sourceFields.data().get(), targetFields.data().get(), sourceFields.size(),
    sourceIndices.data().get(), targetIndices.data().get(), sourceIndices.size());
  device::check();
}

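/// Blocking scatter of FIELD data in buffer to given indices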
template <typename FIELD, typename CONTEXT>
void scatter_field(CONTEXT& lattice, const thrust::device_vector<CellID>& indices, std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_field<CONTEXT,FIELD><<<block_count,block_size>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

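/// Non-blocking scatter of FIELD data in buffer to given indices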
template <typename FIELD, typename CONTEXT>
void async_scatter_field(cudaStream_t stream,
                         CONTEXT& lattice,
                         const thrust::device_vector<CellID>& indices,
                         std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_field<CONTEXT,FIELD><<<block_count,block_size,0,stream>>>(
    lattice,
    indices.data().get(), indices.size(),
    reinterpret_cast<typename FIELD::template value_type<typename CONTEXT::value_t>*>(buffer));
  device::check();
}

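/// Blocking scatter of fields data in buffer to given indices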
void scatter_any_fields(thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                        const thrust::device_vector<CellID>& indices,
                        std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_any_fields<<<block_count,block_size>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

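/// Non-blocking scatter of fields data in buffer to given indices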
void async_scatter_any_fields(cudaStream_t stream,
                              thrust::device_vector<AnyDeviceFieldArrayD>& fields,
                              const thrust::device_vector<CellID>& indices,
                              std::uint8_t* buffer) {
  const auto block_size = 32;
  const auto block_count = (indices.size() + block_size - 1) / block_size;
  kernel::scatter_any_fields<<<block_count,block_size,0,stream>>>(
    fields.data().get(), fields.size(),
    indices.data().get(), indices.size(),
    buffer);
  device::check();
}

}

}

#ifdef PARALLEL_MODE_MPI

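/// Wrapper for a non-blocking block propagation send request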
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::SendTask {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _fields;
  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _cells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;

  gpu::cuda::device::unique_ptr<std::uint8_t> _buffer;
  std::unique_ptr<MpiSendRequest> _request;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:
  SendTask(MPI_Comm comm, int tag, int rank,
           const std::vector<std::type_index>& fields,
           const std::vector<CellID>& cells,
           ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& block):
    _fields(block.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _cells(cells),
    _source(block),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    std::size_t size = 0;
    for (auto& field : fields) {
      size += _source.getCommunicatable(field).size(cells);
    }
    _buffer = gpu::cuda::device::malloc<std::uint8_t>(size);
    _request = std::make_unique<MpiSendRequest>(
      _buffer.get(), size, rank, tag, comm);
  }

  ~SendTask()
  {
    _stream->synchronize();
    wait();
  }

  void prepare()
  {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_source);
      gpu::cuda::async_gather_field<descriptors::POPULATION>(_stream->get(), lattice, _cells, _buffer.get());
    } else {
      gpu::cuda::async_gather_any_fields(_stream->get(), _fields, _cells, _buffer.get());
    }
  }

  void send()
  {
    _stream->synchronize();
    _request->start();
  }

  void wait()
  {
    _request->wait();
  }
};

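/// Wrapper for a non-blocking block propagation receive request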
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::RecvTask {
private:
  const int _tag;
  const int _rank;

  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _fields;
  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _cells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;

  gpu::cuda::device::unique_ptr<std::uint8_t> _buffer;
  std::unique_ptr<MpiRecvRequest> _request;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:

  class ref {
  private:
    RecvTask& _task;
  public:
    ref(std::unique_ptr<RecvTask>& task): _task(*task) { };

    RecvTask* operator->() const
    {
      return &_task;
    }

    bool operator <(const ref& rhs) const
    {
      return _task < rhs._task;
    }
  };

  RecvTask(MPI_Comm comm, int tag, int rank,
           const std::vector<std::type_index>& fields,
           const std::vector<CellID>& cells,
           ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& block):
    _tag(tag),
    _rank(rank),
    _fields(block.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _cells(cells),
    _target(block),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    std::size_t size = 0;
    for (auto& field : fields) {
      size += _target.getCommunicatable(field).size(cells);
    }
    _buffer = gpu::cuda::device::malloc<std::uint8_t>(size);
    _request = std::make_unique<MpiRecvRequest>(
      _buffer.get(), size, _rank, _tag, comm);
  }

  ~RecvTask()
  {
    wait();
  }

  bool operator<(const RecvTask& rhs) const
  {
    return _rank < rhs._rank
        || (_rank == rhs._rank && _tag < rhs._tag);
  }

  void receive()
  {
    _request->start();
  };

  bool isDone()
  {
    return _request->isDone();
  }

  void unpack()
  {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_target);
      gpu::cuda::async_scatter_field<descriptors::POPULATION>(_stream->get(), lattice, _cells, _buffer.get());
    } else {
      gpu::cuda::async_scatter_any_fields(_stream->get(), _fields, _cells, _buffer.get());
    }
  }

  void wait()
  {
    _stream->synchronize();
  }

};

#endif // PARALLEL_MODE_MPI

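/// Virtual interface for copy tasks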
template <typename T, typename DESCRIPTOR>
struct ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::CopyTask {
  virtual ~CopyTask() { }

  virtual void copy() = 0;
  virtual void wait() = 0;
};

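/// Wrapper for a local GPU-GPU block communication request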
template <typename T, typename DESCRIPTOR>
class ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::HomogeneousCopyTask
  : public ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::CopyTask {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _sourceFields;
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _targetFields;

  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _targetCells;
  const thrust::device_vector<CellID> _sourceCells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

public:
  HomogeneousCopyTask(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
    _sourceFields(source.getDataRegistry().deviceFieldArrays(fields)),
    _targetFields(target.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target),
    _source(source),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking))
  {
    OLB_ASSERT(_sourceCells.size() == _targetCells.size(),
               "Source cell count must match target cell count");
  }

  ~HomogeneousCopyTask()
  {
    wait();
  }

  void copy() override
  {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> sourceLattice(_source);
      gpu::cuda::DeviceContext<T,DESCRIPTOR> targetLattice(_target);
      gpu::cuda::async_copy_field<descriptors::POPULATION>(_stream->get(), sourceLattice, targetLattice, _sourceCells, _targetCells);
    } else {
      gpu::cuda::async_copy_any_fields(_stream->get(), _sourceFields, _targetFields, _sourceCells, _targetCells);
    }
  }

  void wait() override
  {
    _stream->synchronize();
  }

};

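/// Private implementation of heterogeneous copy task between CPU_* source and GPU_CUDA target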
template <typename T, typename DESCRIPTOR, Platform SOURCE>
class HeterogeneousCopyTaskDataForGpuTarget : public HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::Data {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _targetFields;

  const bool _onlyPopulationField;

  const thrust::device_vector<CellID> _targetCells;
  const std::vector<CellID>& _sourceCells;

  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _target;

  MultiConcreteCommunicatable<ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>> _source;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

  gpu::cuda::Column<std::uint8_t> _buffer;

public:
  HeterogeneousCopyTaskDataForGpuTarget(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>& source):
    _targetFields(target.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target),
    _source(source, fields),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking)),
    _buffer(_source.size(_sourceCells))
  { }

  void copy() override {
    _source.serialize(_sourceCells, _buffer.data());
    _buffer.setProcessingContext(ProcessingContext::Simulation);

    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_target);
      gpu::cuda::async_scatter_field<descriptors::POPULATION>(_stream->get(), lattice, _targetCells, _buffer.deviceData());
    } else {
      gpu::cuda::async_scatter_any_fields(_stream->get(), _targetFields, _targetCells, _buffer.deviceData());
    }
  }

  void wait() override {
    _stream->synchronize();
  }

};

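/// Private implementation of heterogeneous copy task between GPU_CUDA source and CPU_* target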
template <typename T, typename DESCRIPTOR, Platform TARGET>
class HeterogeneousCopyTaskDataForGpuSource : public HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::Data {
private:
  thrust::device_vector<gpu::cuda::AnyDeviceFieldArrayD> _sourceFields;

  const std::vector<CellID>& _targetCells;
  const thrust::device_vector<CellID> _sourceCells;

  MultiConcreteCommunicatable<ConcreteBlockLattice<T,DESCRIPTOR,TARGET>> _target;
  ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& _source;

  const bool _onlyPopulationField;

  std::unique_ptr<gpu::cuda::device::Stream> _stream;

  gpu::cuda::Column<std::uint8_t> _buffer;

public:
  HeterogeneousCopyTaskDataForGpuSource(
    const std::vector<std::type_index>& fields,
    const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,TARGET>& target,
    const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
    _sourceFields(source.getDataRegistry().deviceFieldArrays(fields)),
    _onlyPopulationField(fields.size() == 1 && fields[0] == typeid(descriptors::POPULATION)),
    _targetCells(targetCells),
    _sourceCells(sourceCells),
    _target(target, fields),
    _source(source),
    _stream(std::make_unique<gpu::cuda::device::Stream>(cudaStreamNonBlocking)),
    _buffer(_target.size(_targetCells))
  { }

  void copy() override {
    if (_onlyPopulationField) {
      gpu::cuda::DeviceContext<T,DESCRIPTOR> lattice(_source);
      gpu::cuda::async_gather_field<descriptors::POPULATION>(_stream->get(), lattice, _sourceCells, _buffer.deviceData());
    } else {
      gpu::cuda::async_gather_any_fields(_stream->get(), _sourceFields, _sourceCells, _buffer.deviceData());
    }

  }

  void wait() override {
    _stream->synchronize();
    _target.deserialize(_targetCells, _buffer.data());
  }

};

template <typename T, typename DESCRIPTOR, Platform SOURCE>
HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::HeterogeneousCopyTask(
  const std::vector<std::type_index>& fields,
  const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& target,
  const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,SOURCE>& source):
  _impl(new HeterogeneousCopyTaskDataForGpuTarget<T,DESCRIPTOR,SOURCE>(fields,
                                                                       targetCells, target,
                                                                       sourceCells, source))
{
  OLB_ASSERT(sourceCells.size() == targetCells.size(),
             "Source cell count must match target cell count");
}

template <typename T, typename DESCRIPTOR, Platform SOURCE>
void HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::copy()
{
  _impl->copy();
}

template <typename T, typename DESCRIPTOR, Platform SOURCE>
void HeterogeneousCopyTask<T,DESCRIPTOR,SOURCE,Platform::GPU_CUDA>::wait()
{
  _impl->wait();
}

template <typename T, typename DESCRIPTOR, Platform TARGET>
HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::HeterogeneousCopyTask(
  const std::vector<std::type_index>& fields,
  const std::vector<CellID>& targetCells, ConcreteBlockLattice<T,DESCRIPTOR,TARGET>& target,
  const std::vector<CellID>& sourceCells, ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>& source):
  _impl(new HeterogeneousCopyTaskDataForGpuSource<T,DESCRIPTOR,TARGET>(fields,
                                                                       targetCells, target,
                                                                       sourceCells, source))
{
  OLB_ASSERT(sourceCells.size() == targetCells.size(),
             "Source cell count must match target cell count");
}

template <typename T, typename DESCRIPTOR, Platform TARGET>
void HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::copy()
{
  _impl->copy();
}

template <typename T, typename DESCRIPTOR, Platform TARGET>
void HeterogeneousCopyTask<T,DESCRIPTOR,Platform::GPU_CUDA,TARGET>::wait()
{
  _impl->wait();
}

template <typename T, typename DESCRIPTOR>
ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::ConcreteBlockCommunicator(
  SuperLattice<T,DESCRIPTOR>& super,
  LoadBalancer<T>& loadBalancer,
#ifdef PARALLEL_MODE_MPI
  SuperCommunicationTagCoordinator<T>& tagCoordinator,
  MPI_Comm comm,
#endif
  int iC,
  const BlockCommunicationNeighborhood<T,DESCRIPTOR::d>& neighborhood):
  _iC(iC)
#ifdef PARALLEL_MODE_MPI
, _mpiCommunicator(comm)
#endif
{
#ifdef PARALLEL_MODE_MPI
  neighborhood.forNeighbors([&](int remoteC) {
    if (loadBalancer.isLocal(remoteC)) {
      const Platform remotePlatform = loadBalancer.platform(loadBalancer.loc(remoteC));
      if (!neighborhood.getCellsInboundFrom(remoteC).empty()) {
        switch (remotePlatform) {
        case Platform::GPU_CUDA:
          // Use manual copy for local GPU-GPU communication due to better performance
          _copyTasks.emplace_back(new HomogeneousCopyTask(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(loadBalancer.loc(remoteC))));
          break;
        case Platform::CPU_SIMD:
          // Use manual copy for local GPU-CPU communication due to better performance
          _copyTasks.emplace_back(new HeterogeneousCopyTask<T,DESCRIPTOR,Platform::CPU_SIMD,Platform::GPU_CUDA>(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::CPU_SIMD>>(loadBalancer.loc(remoteC))));
          break;
        case Platform::CPU_SISD:
          // Use manual copy for local GPU-CPU communication due to better performance
          _copyTasks.emplace_back(new HeterogeneousCopyTask<T,DESCRIPTOR,Platform::CPU_SISD,Platform::GPU_CUDA>(
            neighborhood.getFieldsCommonWith(remoteC),
            neighborhood.getCellsInboundFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
            neighborhood.getCellsRequestedFrom(remoteC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::CPU_SISD>>(loadBalancer.loc(remoteC))));
          break;
        default:
          throw std::runtime_error("Invalid remote PLATFORM");
        }
      }
    } else {
      // Handling of non-local GPU-GPU and GPU-CPU communication in general
      if (!neighborhood.getCellsOutboundTo(remoteC).empty()) {
        _sendTasks.emplace_back(std::make_unique<SendTask>(
          _mpiCommunicator, tagCoordinator.get(loadBalancer.glob(_iC), remoteC),
          loadBalancer.rank(remoteC),
          neighborhood.getFieldsCommonWith(remoteC),
          neighborhood.getCellsOutboundTo(remoteC),
          super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC)));
      }
      if (!neighborhood.getCellsInboundFrom(remoteC).empty()) {
        _recvTasks.emplace_back(std::make_unique<RecvTask>(
          _mpiCommunicator, tagCoordinator.get(remoteC, loadBalancer.glob(_iC)),
          loadBalancer.rank(remoteC),
          neighborhood.getFieldsCommonWith(remoteC),
          neighborhood.getCellsInboundFrom(remoteC),
          super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC)));
      }
    }
  });

#else // not using PARALLEL_MODE_MPI
  neighborhood.forNeighbors([&](int localC) {
    if (!neighborhood.getCellsInboundFrom(localC).empty()) {
      _copyTasks.emplace_back(new HomogeneousCopyTask(
        neighborhood.getFieldsCommonWith(localC),
        neighborhood.getCellsInboundFrom(localC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(_iC),
        neighborhood.getCellsRequestedFrom(localC), super.template getBlock<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>(loadBalancer.loc(localC))));
    }
  });
#endif
}

template <typename T, typename DESCRIPTOR>
ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::~ConcreteBlockCommunicator()
{ }

#ifdef PARALLEL_MODE_MPI

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::receive()
{
  for (auto& task : _recvTasks) {
    task->receive();
  }
}

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::send()
{
  for (auto& task : _sendTasks) {
    task->prepare();
  }
  for (auto& task : _sendTasks) {
    task->send();
  }
  for (auto& task : _copyTasks) {
    task->copy();
  }
}

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::unpack()
{
  std::set<typename RecvTask::ref> pending(_recvTasks.begin(), _recvTasks.end());
  while (!pending.empty()) {
    auto task_iterator = pending.begin();
    while (task_iterator != pending.end()) {
      auto& task = *task_iterator;
      if (task->isDone()) {
        task->unpack();
        task_iterator = pending.erase(task_iterator);
      }
      else {
        ++task_iterator;
      }
    }
  }
}

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::wait()
{
  for (auto& task : _copyTasks) {
    task->wait();
  }
  for (auto& task : _recvTasks) {
    task->wait();
  }
  for (auto& task : _sendTasks) {
    task->wait();
  }
}

#else // not using PARALLEL_MODE_MPI

template <typename T, typename DESCRIPTOR>
void ConcreteBlockCommunicator<ConcreteBlockLattice<T,DESCRIPTOR,Platform::GPU_CUDA>>::copy()
{
  for (auto& task : _copyTasks) {
    task->copy();
  }
}

#endif

}

#endif