walberla::cuda Namespace Reference

Namespaces

 communication
 
 internal
 

Classes

class  EventRAII
 
class  FieldAccessor
 
class  FieldAccessor3D
 
class  FieldAccessorXYZ
 
class  FieldIndexing
 
class  FieldIndexing3D
 
class  FieldIndexing3DBase
 
class  FieldIndexingXYZ
 
class  GeneratedGPUPackInfo
 
class  GPUField
 GhostLayerField stored on a CUDA GPU. More...
 
class  GPUSweepBase
 
class  HostFieldAllocator
 Allocator that allocates a CPU (host) field using cudaHostAlloc without padding. More...
 
class  Kernel
 Wrapper class around a CUDA kernel, to call kernels also from code not compiled with nvcc. More...
 
class  NvtxRange
 
class  ParallelSection
 
class  ParallelStreams
 Helper class to run CUDA operations on parallel streams. More...
 
class  StreamRAII
 

Functions

template<typename GPUField_T >
BlockDataID addGPUFieldToStorage (const shared_ptr< StructuredBlockStorage > &bs, const std::string &identifier, uint_t fSize, const Layout layout=fzyx, uint_t nrOfGhostLayers=1, bool usePitchedMem=true)
 Adds a cuda::GPUField to a StructuredBlockStorage. More...
 
template<typename Field_T >
BlockDataID addGPUFieldToStorage (const shared_ptr< StructuredBlockStorage > &bs, ConstBlockDataID cpuFieldID, const std::string &identifier, bool usePitchedMem=true)
 Adds a cuda::GPUField to a StructuredBlockStorage using data from a CPU field. More...
 
void * allocate_aligned_with_offset (uint_t size, uint_t alignment, uint_t offset)
 
void free_aligned_with_offset (void *ptr)
 
void * allocate_pitched_with_offset (size_t &pitchOut, size_t width, size_t height, size_t alignment, size_t alignmentOffset)
 
void selectDeviceBasedOnMpiRank ()
 Selects active CUDA device based on MPI rank. More...
 
void checkForError (cudaError_t code, const std::string &callerPath, const int line)
 
template<typename DstType , typename SrcType >
void fieldCpy (const shared_ptr< StructuredBlockStorage > &blocks, BlockDataID dstID, ConstBlockDataID srcID)
 
template<typename DstType , typename SrcType >
std::function< void()> fieldCpyFunctor (const shared_ptr< StructuredBlockStorage > &blocks, BlockDataID dstID, ConstBlockDataID srcID)
 
template<typename DstType , typename SrcType >
void fieldCpySweepFunction (BlockDataID dstID, ConstBlockDataID srcID, IBlock *block)
 
template<typename DstType , typename SrcType >
std::function< void(IBlock *)> fieldCpyFunctor (BlockDataID dstID, ConstBlockDataID srcID)
 
template<typename T , uint_t fs>
void fieldCpy (cuda::GPUField< T > &dst, const field::Field< T, fs > &src)
 
template<typename T , uint_t fs>
void fieldCpy (field::Field< T, fs > &dst, const cuda::GPUField< T > &src)
 
template<typename T >
void shiftCoordinatesWhileFastestCoordHasSizeOne (typename FieldAccessor< T >::IndexingScheme &indexing, dim3 &gridDim, dim3 &blockDim)
 
unsigned int iDivUp (unsigned int a, unsigned int b)
 
void copyDevToDevFZYX (const cudaPitchedPtr &dst, const cudaPitchedPtr &src, std::tuple< uint_t, uint_t, uint_t, uint_t > &dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > &srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > &intervalSize, cudaStream_t copyStream)
 Copy a 4D interval of a device buffer to another device buffer with fzyx memory layout. More...
 
void copyDevToDevZYXF (const cudaPitchedPtr &dst, const cudaPitchedPtr &src, std::tuple< uint_t, uint_t, uint_t, uint_t > &dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > &srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > &intervalSize, cudaStream_t copyStream)
 Copy a 4D interval of a device buffer to another device buffer with zyxf memory layout. More...
 
void copyHostToDevFZYX (const cudaPitchedPtr &dst, unsigned char *src, std::tuple< uint_t, uint_t, uint_t, uint_t > &dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > &srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > &intervalSize, cudaStream_t copyStream)
 Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout. More...
 
void copyHostToDevZYXF (const cudaPitchedPtr &dst, unsigned char *src, std::tuple< uint_t, uint_t, uint_t, uint_t > &dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > &srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > &intervalSize, cudaStream_t copyStream)
 Copy a 4D interval of a host buffer to a device buffer with zyxf memory layout. More...
 
void copyDevToHostFZYX (unsigned char *dst, const cudaPitchedPtr &src, std::tuple< uint_t, uint_t, uint_t, uint_t > &dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > &srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > &intervalSize, cudaStream_t copyStream)
 Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout. More...
 
void copyDevToHostZYXF (unsigned char *dst, const cudaPitchedPtr &src, std::tuple< uint_t, uint_t, uint_t, uint_t > &dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > &srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > &intervalSize, cudaStream_t copyStream)
 Copy a 4D interval of a device buffer to a host buffer with zyxf memory layout. More...
 
template<typename FuncPtr >
Kernel< FuncPtr > make_kernel (FuncPtr funcPtr)
 
void nameStream (const cudaStream_t &stream, const std::string &name)
 
void nvtxMarker (const std::string &name, const uint32_t color=0xaaaaaa)
 
template<typename GpuFields , typename CpuFields >
void exportModuleToPython ()
 

Variables

static std::map< void *, void * > freePointers_
 

Function Documentation

template<typename GPUField_T >
BlockDataID walberla::cuda::addGPUFieldToStorage ( const shared_ptr< StructuredBlockStorage > &  bs,
const std::string &  identifier,
uint_t  fSize,
const Layout  layout = fzyx,
uint_t  nrOfGhostLayers = 1,
bool  usePitchedMem = true 
)

Adds a cuda::GPUField to a StructuredBlockStorage.

template<typename Field_T >
BlockDataID walberla::cuda::addGPUFieldToStorage ( const shared_ptr< StructuredBlockStorage > &  bs,
ConstBlockDataID  cpuFieldID,
const std::string &  identifier,
bool  usePitchedMem = true 
)

Adds a cuda::GPUField to a StructuredBlockStorage using data from a CPU field.

  • adds a GPU field to a StructuredBlockStorage using a CPU field
  • sizes, number of ghostlayers and layout are the same as the CPU field
  • GPU field is initialized with the data currently stored in the CPU field
    Template Parameters
    Field_T  type of the CPU field; the created GPUField will be of type cuda::GPUField<Field_T::value_type>
void * walberla::cuda::allocate_aligned_with_offset ( uint_t  size,
uint_t  alignment,
uint_t  offset 
)
void * walberla::cuda::allocate_pitched_with_offset ( size_t &  pitchOut,
size_t  width,
size_t  height,
size_t  alignment,
size_t  alignmentOffset 
)
void walberla::cuda::checkForError ( cudaError_t  code,
const std::string &  callerPath,
const int  line 
)
inline
void walberla::cuda::copyDevToDevFZYX ( const cudaPitchedPtr &  dst,
const cudaPitchedPtr &  src,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  srcOffset,
uint_t  dstAllocSizeZ,
uint_t  srcAllocSizeZ,
uint_t  typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  intervalSize,
cudaStream_t  copyStream 
)

Copy a 4D interval of a device buffer to another device buffer with fzyx memory layout.

Parameters
dst  destination buffer
src  source buffer
dstOffset  (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer
srcOffset  (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer
dstAllocSizeZ  allocation size in z direction of the destination buffer
srcAllocSizeZ  allocation size in z direction of the source buffer
typeSize  size of an f element
copyStream  CUDA stream; if not NULL, copy operations will be performed asynchronously
void walberla::cuda::copyDevToDevZYXF ( const cudaPitchedPtr &  dst,
const cudaPitchedPtr &  src,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  srcOffset,
uint_t  dstAllocSizeY,
uint_t  srcAllocSizeY,
uint_t  typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  intervalSize,
cudaStream_t  copyStream 
)

Copy a 4D interval of a device buffer to another device buffer with zyxf memory layout.

Parameters
dst  destination buffer
src  source buffer
dstOffset  (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer
srcOffset  (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer
dstAllocSizeY  allocation size in y direction of the destination buffer
srcAllocSizeY  allocation size in y direction of the source buffer
typeSize  size of an f element
copyStream  CUDA stream; if not NULL, copy operations will be performed asynchronously
void walberla::cuda::copyDevToHostFZYX ( unsigned char *  dst,
const cudaPitchedPtr &  src,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  srcOffset,
uint_t  dstAllocSizeZ,
uint_t  srcAllocSizeZ,
uint_t  typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  intervalSize,
cudaStream_t  copyStream 
)

Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout.

See copyDevToDevFZYX() for parameter information.

void walberla::cuda::copyDevToHostZYXF ( unsigned char *  dst,
const cudaPitchedPtr &  src,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  srcOffset,
uint_t  dstAllocSizeY,
uint_t  srcAllocSizeY,
uint_t  typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  intervalSize,
cudaStream_t  copyStream 
)

Copy a 4D interval of a device buffer to a host buffer with zyxf memory layout.

See copyDevToDevZYXF() for parameter information.

void walberla::cuda::copyHostToDevFZYX ( const cudaPitchedPtr &  dst,
unsigned char *  src,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  srcOffset,
uint_t  dstAllocSizeZ,
uint_t  srcAllocSizeZ,
uint_t  typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  intervalSize,
cudaStream_t  copyStream 
)

Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout.

See copyDevToDevFZYX() for parameter information.

void walberla::cuda::copyHostToDevZYXF ( const cudaPitchedPtr &  dst,
unsigned char *  src,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  srcOffset,
uint_t  dstAllocSizeY,
uint_t  srcAllocSizeY,
uint_t  typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > &  intervalSize,
cudaStream_t  copyStream 
)

Copy a 4D interval of a host buffer to a device buffer with zyxf memory layout.

See copyDevToDevZYXF() for parameter information.

template<typename GpuFields , typename CpuFields >
void walberla::cuda::exportModuleToPython ( )
template<typename DstType , typename SrcType >
void walberla::cuda::fieldCpy ( const shared_ptr< StructuredBlockStorage > &  blocks,
BlockDataID  dstID,
ConstBlockDataID  srcID 
)
template<typename T , uint_t fs>
void walberla::cuda::fieldCpy ( cuda::GPUField< T > &  dst,
const field::Field< T, fs > &  src 
)
template<typename T , uint_t fs>
void walberla::cuda::fieldCpy ( field::Field< T, fs > &  dst,
const cuda::GPUField< T > &  src 
)
template<typename DstType , typename SrcType >
std::function<void()> walberla::cuda::fieldCpyFunctor ( const shared_ptr< StructuredBlockStorage > &  blocks,
BlockDataID  dstID,
ConstBlockDataID  srcID 
)
template<typename DstType , typename SrcType >
std::function<void(IBlock*)> walberla::cuda::fieldCpyFunctor ( BlockDataID  dstID,
ConstBlockDataID  srcID 
)
template<typename DstType , typename SrcType >
void walberla::cuda::fieldCpySweepFunction ( BlockDataID  dstID,
ConstBlockDataID  srcID,
IBlock *  block 
)
void walberla::cuda::free_aligned_with_offset ( void *  ptr)
unsigned int walberla::cuda::iDivUp ( unsigned int  a,
unsigned int  b 
)
inline
template<typename FuncPtr >
Kernel<FuncPtr> walberla::cuda::make_kernel ( FuncPtr  funcPtr)
void walberla::cuda::nameStream ( const cudaStream_t &  stream,
const std::string &  name 
)
inline
void walberla::cuda::nvtxMarker ( const std::string &  name,
const uint32_t  color = 0xaaaaaa 
)
inline
void walberla::cuda::selectDeviceBasedOnMpiRank ( )

Selects active CUDA device based on MPI rank.

Assumes that on each node as many MPI processes are started as there are GPUs.

  • if there are more GPUs than processes on a node, a warning is printed and not all GPUs are utilized
  • if there are more processes than GPUs, a warning is also printed and multiple processes may access the same GPU; processes are assigned to GPUs in a round-robin fashion
template<typename T >
void walberla::cuda::shiftCoordinatesWhileFastestCoordHasSizeOne ( typename FieldAccessor< T >::IndexingScheme &  indexing,
dim3 &  gridDim,
dim3 &  blockDim 
)

Variable Documentation

std::map<void *, void*> walberla::cuda::freePointers_
static