1#include "../../include/cuda_utils/cuda_utilities.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
7#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
9__device__
double atomicAdd(
double* address,
double val) {
10 unsigned long long int* address_as_ull = (
unsigned long long int*)address;
11 unsigned long long int old = *address_as_ull, assumed;
14 old = atomicCAS(address_as_ull, assumed,
15 __double_as_longlong(val + __longlong_as_double(assumed)));
16 }
while (assumed != old);
17 return __longlong_as_double(old);
21void gpuAssert(cudaError_t code,
const char *file,
int line,
bool abort)
23 if (code != cudaSuccess)
25 fprintf(stderr,
"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
30void checkCudaCall(cudaError_t command,
const char * commandName,
const char * fileName,
int line)
32 if (command != cudaSuccess)
34 fprintf(stderr,
"Error: CUDA result \"%s\" for call \"%s\" in file \"%s\" at line %d. Terminating...\n",
35 cudaGetErrorString(command), commandName, fileName, line);
46 integer index = threadIdx.x + blockIdx.x * blockDim.x;
47 integer stride = blockDim.x * gridDim.x;
50 while ((index + offset) < count) {
51 collector[index + offset] = entries[indices[index + offset]];
59 integer index = threadIdx.x + blockIdx.x * blockDim.x;
60 integer stride = blockDim.x * gridDim.x;
63 while ((index + offset) < count) {
64 for (
int i=0; i<count; i++) {
65 if (i != index + offset) {
66 if (entry1[indices[i]] == entry1[indices[index + offset]] && entry2[indices[i]] == entry2[indices[index + offset]]) {
67 printf(
"Same entry for %i vs %i | %i vs %i!\n", i, index + offset, indices[i], indices[index + offset]);
78 integer index = threadIdx.x + blockIdx.x * blockDim.x;
79 integer stride = blockDim.x * gridDim.x;
82 while ((index + offset) < length) {
83 for (
int i=0; i<length; i++) {
84 if (i != (index + offset)) {
85 if (array[index + offset] == array[i]) {
86 atomicAdd(duplicateCounter, 1);
96 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
97 integer stride = blockDim.x * gridDim.x;
100 while ((bodyIndex + offset) < length) {
102 for (
int i=0; i<length; i++) {
103 if (i != (bodyIndex + offset)) {
104 if (array1[bodyIndex + offset] == array1[i] && array2[bodyIndex + offset] == array2[i]) {
105 atomicAdd(duplicateCounter, 1);
117 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
118 integer stride = blockDim.x * gridDim.x;
121 while ((bodyIndex + offset) < length) {
123 for (
int i=0; i<length; i++) {
124 if (i != (bodyIndex + offset)) {
125 if (array1[bodyIndex + offset] == array1[i] && array2[bodyIndex + offset] == array2[i] &&
126 array3[bodyIndex + offset] == array3[i]) {
128 atomicAdd(duplicateCounter, 1);
139 template<
typename T,
typename U>
141 integer index = threadIdx.x + blockIdx.x * blockDim.x;
142 integer stride = blockDim.x * gridDim.x;
145 while ((index + offset) < length) {
146 if ((index + offset) % 100 == 0) {
149 for (
int i=0; i<length; i++) {
150 if (i != (index + offset)) {
151 if (array[index + offset] == array[i] || (entry1[array[index + offset]] == entry1[array[i]] && entry2[array[index + offset]] == entry2[array[i]])) {
153 printf(
"Found duplicate!\n");
154 atomicAdd(duplicateCounter, 1);
165 integer index = threadIdx.x + blockIdx.x * blockDim.x;
166 integer stride = blockDim.x * gridDim.x;
171 while ((index + offset) < length) {
172 if (array[index + offset] != -1) {
173 for (
integer i = 0; i < length; i++) {
174 if (i != (index + offset)) {
175 if (array[i] != -1 && array[index + offset] == array[i]) {
177 printf(
"DUPLICATE: %i vs %i (%i vs. %i)\n",
178 array[index + offset], array[i], index + offset, i);
180 maxIndex =
max(index + offset, i);
182 array[maxIndex] = -1;
183 atomicAdd(duplicateCounter, 1);
230 template<
typename T,
typename U>
233 integer index = threadIdx.x + blockIdx.x * blockDim.x;
234 integer stride = blockDim.x * gridDim.x;
240 while ((index + offset) < length) {
241 if (array[index + offset] != -1) {
242 for (
integer i = 0; i < length; i++) {
243 if (i != (index + offset)) {
244 if (array[i] != -1 && (array[index + offset] == array[i] || (entry1[array[i]] == entry1[array[index + offset]] &&
245 entry2[array[i]] == entry2[array[index + offset]] &&
246 entry3[array[i]] == entry3[array[index + offset]]))) {
250 printf(
"DUPLICATE: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
251 array[index + offset], array[i],
252 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
253 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
255 for (
int k=0; k<
POW_DIM; k++) {
256 if (child[
POW_DIM*array[index + offset] + k] == array[i]) {
257 printf(
"isChild: index = %i: child %i == i = %i\n", array[index + offset],
262 if (child[8*array[i] + k] == array[index + offset]) {
263 printf(
"isChild: index = %i: child %i == index = %i\n", array[i],
264 k, array[index + offset]);
270 printf(
"isChild: Duplicate NOT a child: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
271 array[index + offset], array[i],
272 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
273 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
286 maxIndex =
max(index + offset, i);
288 array[maxIndex] = -1;
289 atomicAdd(duplicateCounter, 1);
302 int index = threadIdx.x + blockIdx.x * blockDim.x;
303 int stride = blockDim.x * gridDim.x;
308 while ((index + offset) < length) {
310 if (array[index + offset] != -1) {
311 indexToInsert = atomicAdd(duplicateCounter, 1);
312 removedArray[indexToInsert] = array[index + offset];
335 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::findDuplicates<T>, array,
336 duplicateCounter, length);
339 template real Launch::findDuplicates<integer>(
integer *array,
integer *duplicateCounter,
int length);
341 template real Launch::findDuplicates<real>(
real *array,
integer *duplicateCounter,
int length);
346 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::findDuplicateEntries<T>, array1,
347 array2, duplicateCounter, length);
349 template real Launch::findDuplicateEntries<real>(
real *array1,
real *array2,
integer *duplicateCounter,
int length);
354 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::findDuplicateEntries<T>, array1,
355 array2, array3, duplicateCounter, length);
357 template real Launch::findDuplicateEntries<real>(
real *array1,
real *array2,
real *array3,
integer *duplicateCounter,
int length);
359 template<
typename T,
typename U>
362 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::findDuplicates<T, U>, array, entry1, entry2,
363 duplicateCounter, length);
365 template real Launch::findDuplicates<integer, real>(
integer *array,
real *entry1,
real *entry2,
366 integer *duplicateCounter,
int length);
371 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::markDuplicates<T>, array,
372 duplicateCounter, length);
375 template real Launch::markDuplicates<integer>(
integer *array,
integer *duplicateCounter,
int length);
377 template real Launch::markDuplicates<real>(
real *array,
integer *duplicateCounter,
int length);
387 template<
typename T,
typename U>
390 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::markDuplicates<T, U>, array, entry1, entry2,
391 entry3, duplicateCounter, child, length);
399 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::removeDuplicates<T>, array, removedArray,
400 duplicateCounter, length);
403 template real Launch::removeDuplicates<integer>(
integer *array,
integer *removedArray,
404 integer *duplicateCounter,
int length);
406 template real Launch::removeDuplicates<real>(
real *array,
real *removedArray,
407 integer *duplicateCounter,
int length);
425 return fminf(temp, c);
427 return fmin(temp, c);
442 return fmaxf(temp, c);
444 return fmax(temp, c);
Execution policy/instruction for CUDA kernel execution.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort)
Check CUDA error codes.
void checkCudaCall(cudaError_t command, const char *commandName, const char *fileName, int line)
Check CUDA call.
real findDuplicateEntries(T *array1, T *array2, integer *duplicateCounter, int length)
real markDuplicates(T *array, integer *duplicateCounter, int length)
real findDuplicates(T *array, integer *duplicateCounter, int length)
real collectValues(integer *indices, real *entries, real *collector, integer count)
real removeDuplicates(T *array, T *removedArray, integer *duplicateCounter, int length)
real checkValues(integer *indices, real *entry1, real *entry2, real *entry3, integer count)
__global__ void removeDuplicates(T *array, T *removedArray, integer *duplicateCounter, int length)
__global__ void collectValues(integer *indices, real *entries, real *collector, integer count)
__global__ void checkValues(integer *indices, real *entry1, real *entry2, real *entry3, integer count)
__global__ void findDuplicateEntries(T *array1, T *array2, integer *duplicateCounter, int length)
__global__ void findDuplicates(T *array, integer *duplicateCounter, int length)
__global__ void markDuplicates(T *array, integer *duplicateCounter, int length)
__device__ real min(real a, real b)
Minimum value out of two floating point values.
__device__ real rsqrt(real a)
Inverse square root of a floating point value.
__device__ real sqrt(real a)
Square root of a floating point value.
__device__ real abs(real a)
Absolute value of a floating point value.
__device__ real max(real a, real b)
Maximum value out of two floating point values.
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.