1#include "../include/helper.cuh"
2#include "../include/cuda_utils/cuda_launcher.cuh"
// NOTE(review): member-initializer-list fragment — the constructor/setter header is
// on a line not visible in this chunk. The assignment order mirrors the parameter
// order of Helper::set() (integer scalars, real scalars, key scalar, then the
// integer / id / real / key buffers) — TODO confirm against the full file.
23 integerVal(integerVal), integerVal1(integerVal1), integerVal2(integerVal2),
24 realVal(realVal), realVal1(realVal1), realVal2(realVal2), keyTypeVal(keyTypeVal),
25 integerBuffer(integerBuffer), integerBuffer1(integerBuffer1), integerBuffer2(integerBuffer2),
26 integerBuffer3(integerBuffer3), integerBuffer4(integerBuffer4),
27 sendCount(sendCount), sendCount1(sendCount1), idIntegerBuffer(idIntegerBuffer),
28 idIntegerBuffer1(idIntegerBuffer1), realBuffer(realBuffer), realBuffer1(realBuffer1),
29 keyTypeBuffer(keyTypeBuffer), keyTypeBuffer1(keyTypeBuffer1), keyTypeBuffer2(keyTypeBuffer2) {
// Forward every device scalar/buffer pointer to the Helper instance in one call;
// the argument order must match Helper::set()'s parameter list exactly.
// NOTE(review): the enclosing __global__ set(...) kernel signature is on lines not
// visible in this chunk — presumably a single-thread setter kernel; confirm.
103 helper->
set(integerVal, integerVal1, integerVal2,
104 realVal, realVal1, realVal2, keyTypeVal,
105 integerBuffer, integerBuffer1, integerBuffer2,
106 integerBuffer3, integerBuffer4,
107 sendCount, sendCount1, idIntegerBuffer,
108 idIntegerBuffer1, realBuffer, realBuffer1,
109 keyTypeBuffer, keyTypeBuffer1, keyTypeBuffer2);
// NOTE(review): argument-list fragment of the host-side launch wrapper for the
// set() kernel — the ExecutionPolicy/launch call at its head is not visible in
// this chunk. Argument order again mirrors Helper::set().
122 realVal, realVal1, realVal2, keyTypeVal,
123 integerBuffer, integerBuffer1, integerBuffer2,
124 integerBuffer3, integerBuffer4,
125 sendCount, sendCount1, idIntegerBuffer,
126 idIntegerBuffer1, realBuffer, realBuffer1,
127 keyTypeBuffer, keyTypeBuffer1, keyTypeBuffer2);
134 template <
typename A>
// CUB two-phase idiom: the first SortKeys call runs with d_temp_storage == NULL
// and only writes the required size into temp_storage_bytes; the actual sort is
// the second call below.
136 void *d_temp_storage = NULL;
137 size_t temp_storage_bytes = 0;
138 gpuErrorcheck(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keysToSort, sortedKeys, n));
// NOTE(review): the cudaMalloc of d_temp_storage (and its later cudaFree) sit on
// lines not visible in this chunk — confirm both exist, otherwise this second
// call would pass a NULL temp buffer.
144 gpuErrorcheck(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keysToSort, sortedKeys, n));
150 template <
typename A,
typename B>
// Sort the (key, value) pairs: keyIn drives the radix sort, arrayToSort is
// permuted into sortedArray alongside keyOut. Same CUB two-phase idiom as
// sortKeys(): first call sizes the temp buffer, second call performs the sort.
153 void *d_temp_storage = NULL;
154 size_t temp_storage_bytes = 0;
155 gpuErrorcheck(cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
156 keyIn, keyOut, arrayToSort, sortedArray, n));
// NOTE(review): the cudaMalloc of d_temp_storage between the two calls is on a
// line not visible in this chunk — confirm it exists.
161 gpuErrorcheck(cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
162 keyIn, keyOut, arrayToSort, sortedArray, n));
183 template <
typename T>
// Device-local reduction of d_sml[0..n) into d_aggregate via CUB, dispatched on
// reductionType (min / max / sum). Each pair of identical calls is the CUB
// two-phase idiom: sizing call (NULL temp storage), then the real reduction.
187 void *d_temp_storage = NULL;
188 size_t temp_storage_bytes = 0;
189 switch (reductionType) {
// NOTE(review): unlike the sort wrappers above, these cub::DeviceReduce calls are
// NOT wrapped in gpuErrorcheck — their cudaError_t return is silently dropped;
// consider wrapping them for consistency. Also, the case labels, temp-storage
// cudaMalloc, and break statements are on lines not visible in this chunk.
191 cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_sml, d_aggregate, n);
195 cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_sml, d_aggregate, n);
198 cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_sml, d_aggregate, n);
202 cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_sml, d_aggregate, n);
205 cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_sml, d_aggregate, n);
209 cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_sml, d_aggregate, n);
// Copy the single reduced value back to the host (blocking cudaMemcpy, so no
// extra synchronization is needed before reading `reduction`).
217 gpuErrorcheck(cudaMemcpy(&reduction, d_aggregate,
sizeof(T), cudaMemcpyDeviceToHost));
// Globalize the node-local result across all MPI ranks: in-place all_reduce with
// the operation matching reductionType (minimum / maximum / sum), so every rank
// ends up holding the same global value in `reduction`.
220 switch (reductionType) {
223 boost::mpi::communicator comm;
224 all_reduce(comm, boost::mpi::inplace_t<T *>(&reduction), 1, boost::mpi::minimum<T>());
228 boost::mpi::communicator comm;
229 all_reduce(comm, boost::mpi::inplace_t<T *>(&reduction), 1, boost::mpi::maximum<T>());
233 boost::mpi::communicator comm;
// NOTE(review): sum uses std::plus<T> where min/max use boost::mpi operations —
// functionally fine (std::plus is a valid MPI op functor in Boost.MPI), but
// inconsistent; the case labels/breaks are on lines not visible in this chunk.
234 all_reduce(comm, boost::mpi::inplace_t<T *>(&reduction), 1, std::plus<T>());
// Log the globally reduced value.
240 Logger(
INFO) <<
"globalized reduction = " << reduction;
253 template <
typename T>
// Copy kernel: each thread handles elements index, index+stride, index+2*stride, ...
// so the copy is correct for any launch configuration.
256 int index = threadIdx.x + blockIdx.x * blockDim.x;
257 int stride = blockDim.x * gridDim.x;
// NOTE(review): the declaration and per-iteration increment of `offset` are on
// lines not visible in this chunk — presumably offset starts at 0 and advances
// by `stride` each iteration; confirm the increment exists (infinite loop risk
// otherwise).
260 while ((index + offset) < n) {
261 targetArray[index + offset] = sourceArray[index + offset];
267 template <
typename T>
// Fill kernel: write `value` into every element of array[0..n), strided across
// the whole grid (same traversal pattern as copyArray above).
270 int index = threadIdx.x + blockIdx.x * blockDim.x;
271 int stride = blockDim.x * gridDim.x;
// NOTE(review): `offset` declaration/increment not visible in this chunk — see
// the matching remark on copyArray; confirm offset += stride inside the loop.
274 while ((index + offset) < n) {
275 array[index + offset] = value;
292 template <
typename T>
Execution policy/instruction for CUDA kernel execution.
CUDA_CALLABLE_MEMBER ~Helper()
CUDA_CALLABLE_MEMBER void set(integer *integerVal, integer *integerVal1, integer *integerVal2, real *realVal, real *realVal1, real *realVal2, keyType *keyTypeVal, integer *integerBuffer, integer *integerBuffer1, integer *integerBuffer2, integer *integerBuffer3, integer *integerBuffer4, integer *sendCount, integer *sendCount1, idInteger *idIntegerBuffer, idInteger *idIntegerBuffer1, real *realBuffer, real *realBuffer1, keyType *keyTypeBuffer, keyType *keyTypeBuffer1, keyType *keyTypeBuffer2)
idInteger * idIntegerBuffer1
idInteger * idIntegerBuffer
CUDA_CALLABLE_MEMBER Helper()
#define gpuErrorcheck(ans)
Check the result of a CUDA API call, aborting with a diagnostic on error.
#define CUDA_CALLABLE_MEMBER
template real resetArray< real >(real *array, real value, integer n)
template real copyArray< integer >(integer *targetArray, integer *sourceArray, integer n)
template real copyArray< real >(real *targetArray, real *sourceArray, integer n)
template real copyArray< keyType >(keyType *targetArray, keyType *sourceArray, integer n)
template real resetArray< integer >(integer *array, integer value, integer n)
template real resetArray< keyType >(keyType *array, keyType value, integer n)
void set(Helper *helper, integer *integerVal, integer *integerVal1, integer *integerVal2, real *realVal, real *realVal1, real *realVal2, keyType *keyTypeVal, integer *integerBuffer, integer *integerBuffer1, integer *integerBuffer2, integer *integerBuffer3, integer *integerBuffer4, integer *sendCount, integer *sendCount1, idInteger *idIntegerBuffer, idInteger *idIntegerBuffer1, real *realBuffer, real *realBuffer1, keyType *keyTypeBuffer, keyType *keyTypeBuffer1, keyType *keyTypeBuffer2)
__global__ void resetArray(T *array, T value, integer n)
__global__ void set(Helper *helper, integer *integerVal, integer *integerVal1, integer *integerVal2, real *realVal, real *realVal1, real *realVal2, keyType *keyTypeVal, integer *integerBuffer, integer *integerBuffer1, integer *integerBuffer2, integer *integerBuffer3, integer *integerBuffer4, integer *sendCount, integer *sendCount1, idInteger *idIntegerBuffer, idInteger *idIntegerBuffer1, real *realBuffer, real *realBuffer1, keyType *keyTypeBuffer, keyType *keyTypeBuffer1, keyType *keyTypeBuffer2)
__global__ void copyArray(T *targetArray, T *sourceArray, integer n)
template real sortArray< real, integer >(real *arrayToSort, real *sortedArray, integer *keyIn, integer *keyOut, integer n)
template real reduceAndGlobalize< real >(real *, real *, integer, Reduction::Type)
template real sortArray< integer, integer >(integer *arrayToSort, integer *sortedArray, integer *keyIn, integer *keyOut, integer n)
template real sortArray< keyType, integer >(keyType *arrayToSort, keyType *sortedArray, integer *keyIn, integer *keyOut, integer n)
template real sortKeys< keyType >(keyType *keysToSort, keyType *sortedKeys, int n)
real sortArray(A *arrayToSort, A *sortedArray, B *keyIn, B *keyOut, integer n)
template real sortArray< real, keyType >(real *arrayToSort, real *sortedArray, keyType *keyIn, keyType *keyOut, integer n)
real sortKeys(A *keysToSort, A *sortedKeys, int n)
template real sortArray< integer, keyType >(integer *arrayToSort, integer *sortedArray, keyType *keyIn, keyType *keyOut, integer n)
T reduceAndGlobalize(T *d_sml, T *d_aggregate, integer n, Reduction::Type reductionType)
template real sortArray< keyType, keyType >(keyType *arrayToSort, keyType *sortedArray, keyType *keyIn, keyType *keyOut, integer n)
void set(T *d_var, T val, std::size_t count=1)
Set device memory to a specific value.
void free(T *d_var)
Free device memory.
void malloc(T *&d_var, std::size_t count)
Allocate device memory.
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.