1#include "../../include/subdomain_key_tree/tree.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
19 int level = 0, dir = 0;
20 for (
keyType tmp=lebesgue; tmp>1; level++) {
26 for (; level>0; level--) {
27 int cell = (lebesgue >> ((level-1)*
DIM)) & ((1<<
DIM)-1);
29 dir = DirTable[dir][cell];
44 for (
int lvl=maxLevel; lvl>0; lvl--) {
45 int cell = (lebesgue >> ((lvl-1)*
DIM)) & (
keyType)((1<<
DIM)-1);
46 hilbert = hilbert<<
DIM;
47 if (lvl>maxLevel-level) {
50 dir = DirTable[dir][cell];
61 start(start), child(child), sorted(sorted), index(index), toDeleteLeaf(toDeleteLeaf),
62 toDeleteNode(toDeleteNode), minX(minX), maxX(maxX) {
82 real *maxY) : count(count), start(start), child(child), sorted(sorted), index(index),
83 toDeleteLeaf(toDeleteLeaf), toDeleteNode(toDeleteNode), minX(minX), maxX(maxX),
84 minY(minY), maxY(maxY) {
107 start(start), child(child), sorted(sorted), index(index), toDeleteLeaf(toDeleteLeaf),
108 toDeleteNode(toDeleteNode), minX(minX), maxX(maxX), minY(minY), maxY(maxY), minZ(minZ),
179 while (level <= maxLevel) {
182 if (particles->
x[
index] < 0.5 * (min_x + max_x)) {
184 max_x = 0.5 * (min_x + max_x);
186 else { min_x = 0.5 * (min_x + max_x); }
188 if (particles->
y[
index] < 0.5 * (min_y+max_y)) {
190 max_y = 0.5 * (min_y + max_y);
192 else { min_y = 0.5 * (min_y + max_y); }
194 if (particles->
z[
index] < 0.5 * (min_z+max_z)) {
196 max_z = 0.5 * (min_z + max_z);
198 else { min_z = 0.5 * (min_z + max_z); }
201 particleKey = particleKey | ((
keyType)sonBox << (
keyType)(
DIM * (maxLevel-level-1)));
205 if (childIndex ==
index) {
206 particleLevel = particleLevelTemp;
237 printf(
"Curve type not available!\n");
250 for (
integer i=0; i<maxLevel; i++) {
257 for (
integer i=0; i<maxLevel; i++) {
259 if (childIndex ==
index) {
267 printf(
"ATTENTION: level = -1 (index = %i x = (%f, %f, %f) %f) tree index = %i\n",
305 integer index = threadIdx.x + blockDim.x * blockIdx.x;
306 integer stride = blockDim.x * gridDim.x;
308 real x_min = particles->
x[index];
309 real x_max = particles->
x[index];
311 real y_min = particles->
y[index];
312 real y_max = particles->
y[index];
314 real z_min = particles->
z[index];
315 real z_max = particles->
z[index];
319 extern __shared__
real buffer[];
322 real* x_max_buffer = (
real*)&x_min_buffer[blockSize];
324 real* y_min_buffer = (
real*)&x_max_buffer[blockSize];
325 real* y_max_buffer = (
real*)&y_min_buffer[blockSize];
327 real* z_min_buffer = (
real*)&y_max_buffer[blockSize];
328 real* z_max_buffer = (
real*)&z_min_buffer[blockSize];
335 while (index + offset < n) {
352 x_min_buffer[threadIdx.x] = x_min;
353 x_max_buffer[threadIdx.x] = x_max;
355 y_min_buffer[threadIdx.x] = y_min;
356 y_max_buffer[threadIdx.x] = y_max;
358 z_min_buffer[threadIdx.x] = z_min;
359 z_max_buffer[threadIdx.x] = z_max;
370 if (threadIdx.x < i) {
371 x_min_buffer[threadIdx.x] =
cuda::math::min(x_min_buffer[threadIdx.x], x_min_buffer[threadIdx.x + i]);
372 x_max_buffer[threadIdx.x] =
cuda::math::max(x_max_buffer[threadIdx.x], x_max_buffer[threadIdx.x + i]);
374 y_min_buffer[threadIdx.x] =
cuda::math::min(y_min_buffer[threadIdx.x], y_min_buffer[threadIdx.x + i]);
375 y_max_buffer[threadIdx.x] =
cuda::math::max(y_max_buffer[threadIdx.x], y_max_buffer[threadIdx.x + i]);
377 z_min_buffer[threadIdx.x] =
cuda::math::min(z_min_buffer[threadIdx.x], z_min_buffer[threadIdx.x + i]);
378 z_max_buffer[threadIdx.x] =
cuda::math::max(z_max_buffer[threadIdx.x], z_max_buffer[threadIdx.x + i]);
388 if (threadIdx.x == 0) {
389 while (atomicCAS(mutex, 0 ,1) != 0);
435 atomicExch(mutex, 0);
442 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
443 integer stride = blockDim.x * gridDim.x;
446 if (bodyIndex == 0) {
452#define COMPUTE_DIRECTLY 0
770 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
771 integer stride = blockDim.x * gridDim.x;
801 while ((bodyIndex + offset) < n) {
811 x = particles->
x[bodyIndex + offset];
813 y = particles->
y[bodyIndex + offset];
817 z = particles->
z[bodyIndex + offset];
827 if (x < 0.5 * (min_x + max_x)) {
829 max_x = 0.5 * (min_x + max_x);
832 min_x = 0.5 * (min_x + max_x);
836 if (y < 0.5 * (min_y + max_y)) {
838 max_y = 0.5 * (min_y + max_y);
841 min_y = 0.5 * (min_y + max_y);
845 if (z < 0.5 * (min_z + max_z)) {
847 max_z = 0.5 * (min_z + max_z);
850 min_z = 0.5 * (min_z + max_z);
856 register integer childIndex = childList[temp*
POW_DIM + childPath];
859 while (childIndex >= m) {
867 if (x < 0.5 * (min_x + max_x)) {
869 max_x = 0.5 * (min_x + max_x);
872 min_x = 0.5 * (min_x + max_x);
875 if (y < 0.5 * (min_y + max_y)) {
877 max_y = 0.5 * (min_y + max_y);
880 min_y = 0.5 * (min_y + max_y);
883 if (z < 0.5 * (min_z + max_z)) {
885 max_z = 0.5 * (min_z + max_z);
888 min_z = 0.5 * (min_z + max_z);
894 if (particles->
mass[bodyIndex + offset] != 0) {
908 atomicAdd(&particles->
mass[temp], particles->
mass[bodyIndex + offset]);
911 atomicAdd(&
tree->count[temp], 1);
912 childIndex = childList[
POW_DIM * temp + childPath];
918 if (childIndex != -2) {
922 if (atomicCAS((
int *) &childList[locked], childIndex, -2) == childIndex) {
925 if (childIndex == -1) {
927 childList[locked] = bodyIndex + offset;
928 particles->
level[bodyIndex + offset] = level + 1;
932 if (childIndex >= n) {
933 printf(
"ATTENTION!\n");
936 while (childIndex >= 0 && childIndex < n) {
940 patch =
min(patch, cell);
943 childList[
POW_DIM * temp + childPath] = cell;
951 cudaAssert(
"buildTree: level = %i for index %i (%e)", level,
952 bodyIndex + offset, particles->
x[bodyIndex + offset]);
954 cudaAssert(
"buildTree: level = %i for index %i (%e, %e)", level,
955 bodyIndex + offset, particles->
x[bodyIndex + offset],
956 particles->
y[bodyIndex + offset]);
958 cudaAssert(
"buildTree: level = %i for index %i (%e, %e, %e)", level,
959 bodyIndex + offset, particles->
x[bodyIndex + offset],
960 particles->
y[bodyIndex + offset],
961 particles->
z[bodyIndex + offset]);
967 if (particles->
x[childIndex] < 0.5 * (min_x + max_x)) { childPath += 1; }
969 if (particles->
y[childIndex] < 0.5 * (min_y + max_y)) { childPath += 2; }
971 if (particles->
z[childIndex] < 0.5 * (min_z + max_z)) { childPath += 4; }
992 particles->
mass[cell] += particles->
mass[childIndex];
995 particles->
x[cell] = 0.5 * (min_x + max_x);
998 particles->
y[cell] = 0.5 * (min_y + max_y);
1001 particles->
z[cell] = 0.5 * (min_z + max_z);
1007 tree->count[cell] +=
tree->count[childIndex];
1009 childList[
POW_DIM * cell + childPath] = childIndex;
1010 particles->
level[cell] = level;
1011 particles->
level[childIndex] += 1;
1012 tree->start[cell] = -1;
1015 if (particles->
level[cell] >= particles->
level[childIndex]) {
1026 if (x < 0.5 * (min_x + max_x)) {
1028 max_x = 0.5 * (min_x + max_x);
1030 min_x = 0.5 * (min_x + max_x);
1034 if (y < 0.5 * (min_y + max_y)) {
1036 max_y = 0.5 * (min_y + max_y);
1038 min_y = 0.5 * (min_y + max_y);
1042 if (z < 0.5 * (min_z + max_z)) {
1044 max_z = 0.5 * (min_z + max_z);
1046 min_z = 0.5 * (min_z + max_z);
1052 if (particles->
mass[bodyIndex + offset] != 0) {
1063 particles->
mass[cell] += particles->
mass[bodyIndex + offset];
1066 tree->count[cell] +=
tree->count[bodyIndex + offset];
1067 childIndex = childList[
POW_DIM * temp + childPath];
1070 childList[
POW_DIM * temp + childPath] = bodyIndex + offset;
1071 particles->
level[bodyIndex + offset] = level + 1;
1074 childList[locked] = patch;
1405 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1406 int stride = blockDim.x * gridDim.x;
1409 while ((bodyIndex + offset) < n) {
1410 tree->start[bodyIndex + offset] = bodyIndex + offset;
1418 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1419 int stride = blockDim.x * gridDim.x;
1423 register int i, index;
1433 while ((bodyIndex + offset) < *
tree->index) {
1435 i = bodyIndex + offset;
1437 if (particles->
level[i] == level) {
1456 for (
int child = 0; child <
POW_DIM; ++child) {
1458 if (
tree->child[index] != -1) {
1466 mass += particles->
mass[
tree->child[index]];
1484 particles->
mass[i] = mass;
1485 particles->
x[i] = x;
1487 particles->
y[i] = y;
1489 particles->
z[i] = z;
1503 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1504 integer stride = blockDim.x*gridDim.x;
1510 while (bodyIndex + offset < *tree->index) {
1512 if (particles->
mass[bodyIndex + offset] != 0) {
1513 particles->
x[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
1515 particles->
y[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
1517 particles->
z[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
1529 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1530 integer stride = blockDim.x * gridDim.x;
1534 if (threadIdx.x == 0) {
1539 tree->start[node] = s;
1540 s +=
tree->count[node];
1542 else if (node >= 0) {
1543 tree->sorted[s] = node;
1560 while ((cell + offset) < ind ) {
1569 s =
tree->start[cell + offset];
1584 tree->start[node] = s;
1585 s +=
tree->count[node];
1593 else if (node >= 0) {
1594 tree->sorted[s] = node;
1621 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1622 integer stride = blockDim.x * gridDim.x;
1627 while (bodyIndex + offset < n) {
1629 particleKey =
tree->getParticleKey(particles, bodyIndex + offset, maxLevel, curveType);
1632 if (particleKey == 1UL) {
1633 printf(
"particleKey = %lu (%f, %f, %f)\n", particleKey, particles->
x[bodyIndex + offset],
1634 particles->
y[bodyIndex + offset], particles->
z[bodyIndex + offset]);
1638 keys[bodyIndex + offset] = particleKey;
1648 for (
int i=0; i<
DIM; i++) {
1651 for (
int i=0; i<
POW_DIM; i++) {
1652 if (
tree->child[i] != -1) {
1653 mass += particles->
mass[
tree->child[i]];
1681 tree->set(count, start, child, sorted, index, toDeleteLeaf, toDeleteNode, minX, maxX);
1685 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1686 integer stride = blockDim.x * gridDim.x;
1689 int relevantChild = 0;
1690 int childIndex, temp;
1692 while ((bodyIndex + offset) <
POW_DIM) {
1693 childIndex =
tree->child[bodyIndex + offset];
1695 while (childIndex != -1) {
1696 childIndex =
tree->child[
POW_DIM * childIndex + relevantChild];
1698 if (childIndex != -1) {
1699 if (particles->
level[temp] >= particles->
level[childIndex]) {
1700 cudaAssert(
"level[%i]: %i vs. level[%i]: %i\n", temp, particles->
level[temp], childIndex,
1701 particles->
level[childIndex]);
1790 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1791 integer stride = blockDim.x * gridDim.x;
1794 while (bodyIndex + offset <
POW_DIM) {
1796 printf(
"child[POW_DIM * 0 + %i] = %i, x = (%f, %f, %f) m = %f\n", bodyIndex + offset,
1797 tree->child[bodyIndex + offset], particles->
x[
tree->child[bodyIndex + offset]],
1798 particles->
y[
tree->child[bodyIndex + offset]], particles->
z[
tree->child[bodyIndex + offset]],
1799 particles->
mass[
tree->child[bodyIndex + offset]]);
1801 for (
int i=0; i<
POW_DIM; i++) {
1802 printf(
"child[POW_DIM * %i + %i] = %i, x = (%f, %f, %f) m = %f\n",
tree->child[bodyIndex + offset], i,
1817 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1818 integer stride = blockDim.x * gridDim.x;
1824 while (bodyIndex + offset <
POW_DIM) {
1828 for (
int i=0; i<
POW_DIM; i++) {
1830 if (
tree->child[
POW_DIM *
tree->child[bodyIndex + offset] + i] != -1) {
1835 if (mass != particles->
mass[
tree->child[bodyIndex + offset]]) {
1836 printf(
"testTree: index: %i mass %f vs %f (%f, %f, %f, %f, %f, %f, %f, %f)\n", bodyIndex + offset, mass, particles->
mass[
tree->child[bodyIndex + offset]],
1837 masses[0], masses[1], masses[2], masses[3], masses[4], masses[5], masses[6], masses[7]);
1873 index, toDeleteLeaf, toDeleteNode, minX, maxX);
1896 tree->set(count, start, child, sorted, index, toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY);
1904 toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY);
1912 tree->set(count, start, child, sorted, index, toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY,
1921 toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY, minZ, maxZ);
1951 size_t sharedMemory = 2 *
DIM *
sizeof(
real) * blockSize;
1971 maxLevel, n, curveType);
1984namespace UnitTesting {
1987 __global__
void test_localTree(
Tree *
tree,
Particles *particles,
int n,
int m) {
1989 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1990 int stride = blockDim.x * gridDim.x;
2003 while ((bodyIndex + offset) < m) {
2013 for (
int child=0; child<
POW_DIM; child++) {
2014 childIndex =
tree->child[
POW_DIM * (bodyIndex + offset) + child];
2016 if (childIndex != -1) {
2017 childMasses += particles->
mass[childIndex];
2018 positionX += particles->
mass[childIndex] * particles->
x[childIndex];
2020 positionY += particles->
mass[childIndex] * particles->
y[childIndex];
2022 positionZ += particles->
mass[childIndex] * particles->
z[childIndex];
2027 if (childMasses > 0.) {
2028 positionX /= childMasses;
2030 positionY /= childMasses;
2032 positionZ /= childMasses;
2040 if (particles->
mass[bodyIndex + offset] != childMasses) {
2041 if (particles->
nodeType[bodyIndex + offset] != 2) {
2042 printf(
"Masses are not correct [%i]: %e vs %e (type: %i)!\n", bodyIndex + offset, particles->
mass[bodyIndex + offset], childMasses, particles->
nodeType[bodyIndex + offset]);
2048 if (
cuda::math::abs(particles->
x[bodyIndex + offset] - positionX) > 1e-3) {
2049 if (particles->
nodeType[bodyIndex + offset] != 2) {
2050 printf(
"Masses... X position are not correct [%i]: %e vs %e (m = %e vs %e)!\n", bodyIndex + offset,
2051 particles->
x[bodyIndex + offset], positionX, particles->
mass[bodyIndex + offset], childMasses);
2052 for (
int child=0; child<
POW_DIM; child++) {
2053 printf(
"[%i] Masses: index: %i\n", bodyIndex + offset,
tree->child[
POW_DIM * (bodyIndex + offset) + child]);
2058 if (
cuda::math::abs(particles->
y[bodyIndex + offset] - positionY) > 1e-3) {
2059 if (particles->
nodeType[bodyIndex + offset] != 2) {
2060 printf(
"Masses... Y position are not correct: %e vs %e (m = %e vs %e)!\n", particles->
y[bodyIndex + offset], positionY, particles->
mass[bodyIndex + offset], childMasses);
2064 if (
cuda::math::abs(particles->
z[bodyIndex + offset] - positionZ) > 1e-3) {
2065 if (particles->
nodeType[bodyIndex + offset] != 2) {
2066 printf(
"Masses... Z position are not correct: %e vs %e (m = %e vs %e)!\n", particles->
z[bodyIndex + offset], positionZ, particles->
mass[bodyIndex + offset], childMasses);
2078 return cuda::launch(
true, executionPolicy, ::UnitTesting::Kernel::test_localTree,
tree, particles, n, m);
Execution policy/instruction for CUDA kernel execution.
Particle(s) class based on SoA (Structur of Arrays).
CUDA_CALLABLE_MEMBER real weightedEntry(integer index, Entry::Name entry)
real * x
(pointer to) x position (array)
integer * level
(pointer to) level of the (pseudo-)particles
integer * nodeType
(pointer to) node type
real * y
(pointer to) y position (array)
real * mass
(pointer to) mass (array)
real * z
(pointer to) z position (array)
CUDA_CALLABLE_MEMBER void reset(integer index, integer n)
Reset (specific) entries.
integer * index
index used for counting nodes
real * minZ
bounding box minimal z
integer * child
children/child nodes or leaves (beneath)
CUDA_CALLABLE_MEMBER keyType getParticleKey(Particles *particles, integer index, integer maxLevel, Curve::Type curveType=Curve::lebesgue)
Get SFC key of a particle.
real * maxX
bounding box maximal x
integer * sorted
sorted (indices) for better cache performance
integer * count
accumulated nodes/leaves beneath
CUDA_CALLABLE_MEMBER integer sumParticles()
Sum particles in tree.
integer * toDeleteNode
buffer for remembering old indices for rebuilding tree
CUDA_CALLABLE_MEMBER ~Tree()
Destructor.
CUDA_CALLABLE_MEMBER Tree()
Default constructor.
real * minX
bounding box minimal x
integer * start
TODO: describe start.
integer * toDeleteLeaf
buffer for remembering old indices for rebuilding tree
CUDA_CALLABLE_MEMBER integer getTreeLevel(Particles *particles, integer index, integer maxLevel, Curve::Type curveType=Curve::lebesgue)
Get tree level for a (specific) particle.
CUDA_CALLABLE_MEMBER void set(integer *count, integer *start, integer *child, integer *sorted, integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX)
Setter, passing pointer to member variables.
real * maxY
bounding box maximal y
real * minY
bounding box minimal y
real * maxZ
bounding box maximal z
#define CUDA_CALLABLE_MEMBER
const unsigned char HilbertTable[12][8]
Table needed to convert from Lebesgue to Hilbert keys.
CUDA_CALLABLE_MEMBER keyType lebesgue2hilbert(keyType lebesgue, integer maxLevel)
Convert a Lebesgue key to a Hilbert key.
__global__ void info(Material *material)
Debug kernel giving information about material(s).
__global__ void calculateCentersOfMass(Tree *tree, Particles *particles, integer level)
__global__ void getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Kernel to get all particle keys (and additional information for debugging purposes).
real calculateCentersOfMass(Tree *tree, Particles *particles, integer n, integer level, bool time=false)
real prepareSorting(Tree *tree, Particles *particles, integer n, integer m)
real sort(Tree *tree, integer n, integer m, bool time=false)
Wrapper for TreeNS::Kernel::sort()
real centerOfMass(Tree *tree, Particles *particles, integer n, bool time=false)
Wrapper for TreeNS::Kernel::centerOfMass()
real sumParticles(Tree *tree)
Wrapper for TreeNS::Kernel::sumParticles()
real globalCOM(Tree *tree, Particles *particles, real com[DIM])
Wrapper for TreeNS::Kernel::globalCOM()
real testTree(Tree *tree, Particles *particles, integer n, integer m)
real computeBoundingBox(Tree *tree, Particles *particles, integer *mutex, integer n, integer blockSize, bool time=false)
Wrapper for TreeNS::Kernel::computeBoundingBox()
void set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted, integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX)
Wrapper for TreeNS::Kernel::set()
real getParticleKeys(Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue, bool time=false)
Wrapper for TreeNS::Kernel::getParticleKeys()
real info(Tree *tree, Particles *particles, integer n, integer m)
Wrapper for TreeNS::Kernel::Launch::info()
real buildTree(Tree *tree, Particles *particles, integer n, integer m, bool time=false)
Wrapper for TreeNS::Kernel::buildTree()
__global__ void prepareSorting(Tree *tree, Particles *particles, integer n, integer m)
__global__ void set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted, integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX)
Kernel call to setter.
__global__ void testTree(Tree *tree, Particles *particles, integer n, integer m)
__global__ void info(Tree *tree, Particles *particles, integer n, integer m)
Info Kernel for tree class (for debugging purposes)
__global__ void getParticleKeys(Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Kernel to get all particle's keys.
__global__ void computeBoundingBox(Tree *tree, Particles *particles, integer *mutex, integer n, integer blockSize)
Kernel to compute the bounding box/borders of the tree or rather the particles within the tree.
__global__ void calculateCentersOfMass(Tree *tree, Particles *particles, integer n, integer level)
Compute center of masses (level wise).
__global__ void sumParticles(Tree *tree)
Kernel call to sum particles within tree.
__global__ void sort(Tree *tree, integer n, integer m)
Kernel to sort tree/child indices to optimize cache efficiency.
__global__ void buildTree(Tree *tree, Particles *particles, integer n, integer m)
Kernel to construct the tree using the particles within particles
__global__ void globalCOM(Tree *tree, Particles *particles, real com[DIM])
Compute center of mass for all particles.
__global__ void centerOfMass(Tree *tree, Particles *particles, integer n)
__device__ real min(real a, real b)
Minimum value out of two floating point values.
__device__ real abs(real a)
Absolute value of a floating point value.
__device__ real max(real a, real b)
Maximum value out of two floating point values.
void set(T *d_var, T val, std::size_t count=1)
Set device memory to a specific value.
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
#define DIM
Dimension of the problem.