milupHPC documentation
  • src
miluphpc.cpp
Go to the documentation of this file.
1#include "../include/miluphpc.h"
2
3Miluphpc::Miluphpc(SimulationParameters simulationParameters) {
4
5 this->simulationParameters = simulationParameters;
6 subStep = 0;
7
8 // get number of particles from provided input file
9 numParticlesFromFile(simulationParameters.inputFile);
10 // estimate amount of memory for simulation // TODO: generalize/optimize numNodes
11 numNodes = 2 * numParticles + 20000;
12
13 boost::mpi::communicator comm;
14 sumParticles = numParticlesLocal;
15 all_reduce(comm, boost::mpi::inplace_t<integer*>(&sumParticles), 1, std::plus<integer>());
16
17 Logger(DEBUG) << "Initialized: numParticlesLocal: " << numParticlesLocal;
18 Logger(DEBUG) << "Initialized: numParticles: " << numParticles;
19 Logger(DEBUG) << "Initialized: numNodes: " << numNodes;
20 Logger(DEBUG) << "Initialized: sumParticles: " << sumParticles;
21
22 // memory allocations (via instantiation of handlers)
23 cuda::malloc(d_mutex, 1);
24 //helperHandler = new HelperHandler(numNodes);
25 //buffer = new HelperHandler(numNodes);
26 subDomainKeyTreeHandler = new SubDomainKeyTreeHandler();
27 Logger(DEBUG) << "Initialized: numProcesses: " << subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses;
28 buffer = new HelperHandler(subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses, numParticlesLocal, numParticles,
29 sumParticles, numNodes);
30 particleHandler = new ParticleHandler(numParticles, numNodes);
31 treeHandler = new TreeHandler(numParticles, numNodes);
32 domainListHandler = new DomainListHandler(simulationParameters.domainListSize);
33 lowestDomainListHandler = new DomainListHandler(simulationParameters.domainListSize);
34#if SPH_SIM
35 materialHandler = new MaterialHandler(simulationParameters.materialConfigFile.c_str()); //"config/material.cfg"
36#if DEBUGGING
37 for (int i=0; i<materialHandler->numMaterials; i++) {
38 materialHandler->h_materials[i].info();
39 }
40#endif
41 materialHandler->copy(To::device);
42#endif
43 simulationTimeHandler = new SimulationTimeHandler(simulationParameters.timeStep,
44 simulationParameters.timeEnd,
45 simulationParameters.maxTimeStep);
46 simulationTimeHandler->copy(To::device);
47 if (subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses == 1) {
48 curveType = Curve::lebesgue;
49 }
50 else {
51 //curveType = Curve::lebesgue; //curveType = Curve::hilbert;
52 if (simulationParameters.sfcSelection >= 0 && simulationParameters.sfcSelection <= 2) {
53 curveType = Curve::Type(simulationParameters.sfcSelection);
54 if (simulationParameters.sfcSelection == 0) { Logger(DEBUG) << "Selected sfc: Lebesgue"; }
55 else if (simulationParameters.sfcSelection == 1) { Logger(DEBUG) << "Selected sfc: Hilbert"; }
56 else { Logger(DEBUG) << "Selected sfc: not valid!"; }
57 }
58 else {
59 curveType = Curve::Type(0); // selecting lebesgue sfc as default
60 Logger(DEBUG) << "Selected sfc: Lebesgue (Default)";
61 }
62 }
63
64 // TODO: buffer arrays/pointers handling/optimization (and corresponding freeing)
65 //cuda::malloc(d_particles2SendIndices, numNodes); // numParticles
66 //cuda::malloc(d_pseudoParticles2SendIndices, numNodes);
67 //cuda::malloc(d_pseudoParticles2SendLevels, numNodes);
68 //cuda::malloc(d_pseudoParticles2ReceiveLevels, numNodes);
69 //cuda::malloc(d_particles2SendCount, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses);
70 //cuda::malloc(d_pseudoParticles2SendCount, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses);
71 //cuda::malloc(d_particles2removeBuffer, numParticles);
72 //cuda::malloc(d_particles2removeVal, 1);
73 //cuda::malloc(d_idIntegerBuffer, numParticles);
74 //cuda::malloc(d_idIntegerCopyBuffer, numParticles);
75
76#if SPH_SIM
77 try {
78 kernelHandler = SPH::KernelHandler(Smoothing::Kernel(simulationParameters.smoothingKernelSelection));
79 }
80 catch(...) {
81 Logger(WARN) << "Kernel not available! Selecting cubic spline [1] as default!";
82 kernelHandler = SPH::KernelHandler(Smoothing::Kernel::cubic_spline);
83 }
84#else
85 // TODO: instance should not be needed at all if SPH_SIM 0
86 kernelHandler = SPH::KernelHandler(Smoothing::Kernel(0));
87#endif
88
89 // read particle data, load balancing, ...
90 prepareSimulation();
91
92 // check memory allocations
93 getMemoryInfo();
94
95}
96
97Miluphpc::~Miluphpc() {
98
99 //delete helperHandler;
100 delete buffer;
101 delete particleHandler;
102 delete subDomainKeyTreeHandler;
103 delete treeHandler;
104#if SPH_SIM
105 delete materialHandler;
106#endif
107 delete simulationTimeHandler;
108
109 cuda::free(d_mutex);
110 //cuda::free(d_particles2SendIndices);
111 //cuda::free(d_pseudoParticles2SendIndices);
112 //cuda::free(d_pseudoParticles2SendLevels);
113 //cuda::free(d_pseudoParticles2ReceiveLevels);
114 //cuda::free(d_particles2SendCount);
115 //cuda::free(d_pseudoParticles2SendCount);
116 //cuda::free(d_particles2removeBuffer);
117 //cuda::free(d_particles2removeVal);
118 //cuda::free(d_idIntegerBuffer);
119 //cuda::free(d_idIntegerCopyBuffer);
120
121}
122
123void Miluphpc::numParticlesFromFile(const std::string& filename) {
124
125 boost::mpi::communicator comm;
126 HighFive::File file(filename.c_str(), HighFive::File::ReadOnly);
127 std::vector<real> m;
128 std::vector<std::vector<real>> x;
129
130 // read datasets from file
131 HighFive::DataSet mass = file.getDataSet("/m");
132 HighFive::DataSet pos = file.getDataSet("/x");
133
134 mass.read(m);
135 pos.read(x);
136
137 numParticles = (int)(m.size() * simulationParameters.particleMemoryContingent);
138
139 integer ppp = m.size()/comm.size();
140 integer ppp_remnant = m.size() % comm.size();
141
142#if DEBUGGING
143 Logger(INFO) << "ppp = " << ppp;
144 Logger(INFO) << "ppp remnant = " << ppp_remnant;
145#endif
146
147 if (ppp_remnant == 0) {
148 numParticlesLocal = ppp;
149 }
150 else {
151 if (comm.rank() < (comm.size()-1)) {
152 numParticlesLocal = ppp;
153 }
154 else {
155 numParticlesLocal = ppp + ppp_remnant;
156 }
157
158 }
159}
160
161void Miluphpc::distributionFromFile(const std::string& filename) {
162
163 HighFive::File file(filename.c_str(), HighFive::File::ReadOnly);
164
165 // containers to be filled
166 std::vector<real> m, u;
167 std::vector<std::vector<real>> x, v;
168 std::vector<integer> materialId;
169
170 // read datasets from file
171 HighFive::DataSet mass = file.getDataSet("/m");
172 HighFive::DataSet pos = file.getDataSet("/x");
173 HighFive::DataSet vel = file.getDataSet("/v");
174
175#if SPH_SIM
176 HighFive::DataSet matId = file.getDataSet("/materialId");
177#if INTEGRATE_ENERGY
178 HighFive::DataSet h5_u = file.getDataSet("/u");
179#endif
180#endif
181
182 // read data
183 mass.read(m);
184 pos.read(x);
185 vel.read(v);
186#if SPH_SIM
187 matId.read(materialId);
188 //TODO: when do I want to read in the (internal) energy?
189#if INTEGRATE_ENERGY
190 h5_u.read(u);
191#endif
192#endif
193
194 // TODO: read sml if xy?
195
196 integer ppp = m.size()/subDomainKeyTreeHandler->h_numProcesses;
197 integer ppp_remnant = m.size() % subDomainKeyTreeHandler->h_numProcesses;
198
199 int startIndex = subDomainKeyTreeHandler->h_subDomainKeyTree->rank * ppp;
200 int endIndex = (subDomainKeyTreeHandler->h_rank + 1) * ppp;
201 if (subDomainKeyTreeHandler->h_rank == (subDomainKeyTreeHandler->h_numProcesses - 1)) {
202 endIndex += ppp_remnant;
203 }
204
205 for (int j = startIndex; j < endIndex; j++) {
206 int i = j - subDomainKeyTreeHandler->h_rank * ppp;
207
208 particleHandler->h_particles->uid[i] = j;
209 particleHandler->h_particles->mass[i] = m[j];
210
211 particleHandler->h_particles->x[i] = x[j][0];
212 particleHandler->h_particles->vx[i] = v[j][0];
213#if DIM > 1
214 particleHandler->h_particles->y[i] = x[j][1];
215 particleHandler->h_particles->vy[i] = v[j][1];
216#if DIM == 3
217 particleHandler->h_particles->z[i] = x[j][2];
218 particleHandler->h_particles->vz[i] = v[j][2];
219#endif
220#endif
221
222#if SPH_SIM
223#if INTEGRATE_ENERGY
224 particleHandler->h_particles->e[i] = u[j];
225#endif
226 particleHandler->h_particles->materialId[i] = materialId[j];
227 //particleHandler->h_particles->sml[i] = simulationParameters.sml;
228 particleHandler->h_particles->sml[i] = materialHandler->h_materials[materialId[j]].sml;
229#endif
230
231 }
232}
233
234//TODO: block/warp/stack size for computeBoundingBox and computeForces
235void Miluphpc::prepareSimulation() {
236
237 Logger(DEBUG) << "Preparing simulation ...";
238
239 Logger(DEBUG) << "Initialize/Read particle distribution ...";
240 distributionFromFile(simulationParameters.inputFile);
241
242#if SPH_SIM
243 SPH::Kernel::Launch::initializeSoundSpeed(particleHandler->d_particles, materialHandler->d_materials, numParticlesLocal);
244#endif
245
246 particleHandler->copyDistribution(To::device, true, true);
247#if SPH_SIM
248 //TODO: problem: (e.g.) cs should not be copied....
249 //particleHandler->copySPH(To::device);
250 //cuda::copy(particleHandler->h_rho, particleHandler->d_rho, numParticles, To::device);
251 //cuda::copy(particleHandler->h_p, particleHandler->d_p, numParticles, To::device);
252 cuda::copy(particleHandler->h_e, particleHandler->d_e, numParticles, To::device);
253 cuda::copy(particleHandler->h_sml, particleHandler->d_sml, numParticles, To::device);
254 //cuda::copy(particleHandler->h_noi, particleHandler->d_noi, numParticles, To::device);
255 //cuda::copy(particleHandler->h_cs, particleHandler->d_cs, numParticles, To::device);
256#endif
257
258 if (simulationParameters.removeParticles) {
259 removeParticles();
260 }
261
262 Logger(DEBUG) << "Compute bounding box ...";
263 TreeNS::Kernel::Launch::computeBoundingBox(treeHandler->d_tree, particleHandler->d_particles, d_mutex,
264 numParticlesLocal, 256, false);
265
266#if DEBUGGING
267#if DIM == 3
268 treeHandler->copy(To::host);
269 Logger(DEBUG) << "x: " << std::abs(*treeHandler->h_maxX) << ", " << std::abs(*treeHandler->h_minX);
270 Logger(DEBUG) << "y: " << std::abs(*treeHandler->h_maxY) << ", " << std::abs(*treeHandler->h_minY);
271 Logger(DEBUG) << "z: " << std::abs(*treeHandler->h_maxZ) << ", " << std::abs(*treeHandler->h_minZ);
272#endif
273#endif
274
275 treeHandler->globalizeBoundingBox(Execution::device);
276 treeHandler->copy(To::host);
277
278 if (simulationParameters.loadBalancing) {
279 fixedLoadBalancing();
280 dynamicLoadBalancing();
281 }
282 else {
283 fixedLoadBalancing();
284 }
285
286 subDomainKeyTreeHandler->copy(To::device);
287
288 boost::mpi::communicator comm;
289 sumParticles = numParticlesLocal;
290 all_reduce(comm, boost::mpi::inplace_t<integer*>(&sumParticles), 1, std::plus<integer>());
291
292}
293
294real Miluphpc::rhs(int step, bool selfGravity, bool assignParticlesToProcess) {
295
296 // TESTING
297 //Logger(INFO) << "reduction: max:";
298 //HelperNS::reduceAndGlobalize(particleHandler->d_sml, helperHandler->d_realVal, numParticlesLocal, Reduction::max);
299 //Logger(INFO) << "reduction: min:";
300 //HelperNS::reduceAndGlobalize(particleHandler->d_sml, helperHandler->d_realVal, numParticlesLocal, Reduction::min);
301 //Logger(INFO) << "reduction: sum:";
302 //HelperNS::reduceAndGlobalize(particleHandler->d_sml, helperHandler->d_realVal, numParticlesLocal, Reduction::sum);
303 // end: TESTING
304
305 Logger(INFO) << "Miluphpc::rhs()";
306
307 getMemoryInfo();
308
309 Timer timer;
310 real time;
311 real elapsed;
312 real *profilerTime = &elapsed; //&time;
313 real totalTime = 0;
314
315 Logger(INFO) << "rhs::reset()";
316 timer.reset();
317 // -----------------------------------------------------------------------------------------------------------------
318 time = reset();
319 // -----------------------------------------------------------------------------------------------------------------
320 elapsed = timer.elapsed();
321 totalTime += time;
322 Logger(TIME) << "rhs::reset(): " << elapsed << " ms"; //time << " ms";
323 profiler.value2file(ProfilerIds::Time::reset, *profilerTime);
324
325 //Logger(INFO) << "checking for nans before bounding box...";
326 //ParticlesNS::Kernel::Launch::check4nans(particleHandler->d_particles, numParticlesLocal);
327
328 Logger(INFO) << "rhs::boundingBox()";
329 timer.reset();
330 // -----------------------------------------------------------------------------------------------------------------
331 time = boundingBox();
332 // -----------------------------------------------------------------------------------------------------------------
333 elapsed = timer.elapsed();
334 totalTime += time;
335 Logger(TIME) << "rhs::boundingBox(): " << time << " ms";
336 profiler.value2file(ProfilerIds::Time::boundingBox, *profilerTime);
337
338#if DEBUGGING
339 Logger(INFO) << "checking for nans before assigning particles...";
340 ParticlesNS::Kernel::Launch::check4nans(particleHandler->d_particles, numParticlesLocal);
341#endif
342
343//#if DEBUGGING
344 Logger(INFO) << "before: numParticlesLocal: " << numParticlesLocal;
345 Logger(INFO) << "before: numParticles: " << numParticles;
346 Logger(INFO) << "before: numNodes: " << numNodes;
347//#endif
348
349 if (assignParticlesToProcess) {
350 timer.reset();
351 if (subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses > 1) {
352 Logger(INFO) << "rhs::assignParticles()";
353 // ---------------------------------------------------------------------------------------------------------
354 time = assignParticles();
355 // ---------------------------------------------------------------------------------------------------------
356 }
357 elapsed = timer.elapsed();
358 totalTime += time;
359 Logger(TIME) << "rhs::assignParticles(): " << elapsed << " ms"; //time << " ms";
360 profiler.value2file(ProfilerIds::Time::assignParticles, *profilerTime);
361 }
362 //else {
363 // profiler.value2file(ProfilerIds::Time::assignParticles, 0.);
364 //}
365
366 //Logger(INFO) << "checking for nans after assigning particles...";
367 //ParticlesNS::Kernel::Launch::check4nans(particleHandler->d_particles, numParticlesLocal);
368
369#if DEBUGGING
370 Logger(INFO) << "after: numParticlesLocal: " << numParticlesLocal;
371 Logger(INFO) << "after: numParticles: " << numParticles;
372 Logger(INFO) << "after: numNodes: " << numNodes;
373#endif
374
375 boost::mpi::communicator comm;
376 sumParticles = numParticlesLocal;
377 all_reduce(comm, boost::mpi::inplace_t<integer*>(&sumParticles), 1, std::plus<integer>());
378 Logger(DEBUG) << "numParticlesLocal = " << numParticlesLocal;
379
380 subDomainKeyTreeHandler->copy(To::host, true, false);
381 for (int i=0; i<=subDomainKeyTreeHandler->h_numProcesses; i++) {
382 Logger(DEBUG) << "rangeValues[" << i << "] = " << subDomainKeyTreeHandler->h_subDomainKeyTree->range[i];
383 }
384
385 Logger(INFO) << "rhs::tree()";
386 timer.reset();
387 // -----------------------------------------------------------------------------------------------------------------
388 time = tree();
389 // -----------------------------------------------------------------------------------------------------------------
390 elapsed = timer.elapsed();
391 totalTime += time;
392 Logger(TIME) << "rhs::tree(): " << elapsed << " ms"; //time << " ms";
393 profiler.value2file(ProfilerIds::Time::tree, *profilerTime);
394
395#if GRAVITY_SIM
396 if (selfGravity) {
397 Logger(INFO) << "rhs::pseudoParticles()";
398 timer.reset();
399 // -------------------------------------------------------------------------------------------------------------
400 time = pseudoParticles();
401 // -------------------------------------------------------------------------------------------------------------
402 elapsed = timer.elapsed();
403 totalTime += time;
404 Logger(TIME) << "rhs::pseudoParticles(): " << elapsed << " ms"; //time << " ms";
405 profiler.value2file(ProfilerIds::Time::pseudoParticle, *profilerTime);
406 }
407 else {
408 // -------------------------------------------------------------------------------------------------------------
409 DomainListNS::Kernel::Launch::lowestDomainList(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
410 particleHandler->d_particles, domainListHandler->d_domainList,
411 lowestDomainListHandler->d_domainList, numParticles, numNodes);
412 // -------------------------------------------------------------------------------------------------------------
413 }
414#else
415 // -----------------------------------------------------------------------------------------------------------------
416 DomainListNS::Kernel::Launch::lowestDomainList(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
417 particleHandler->d_particles, domainListHandler->d_domainList,
418 lowestDomainListHandler->d_domainList, numParticles, numNodes);
419 // -----------------------------------------------------------------------------------------------------------------
420#endif
421
422#if GRAVITY_SIM
423 if (selfGravity) {
424 timer.reset();
425 Logger(INFO) << "rhs::gravity()";
426 // -------------------------------------------------------------------------------------------------------------
427 time = gravity();
428 // -------------------------------------------------------------------------------------------------------------
429 elapsed = timer.elapsed();
430 totalTime += time;
431 Logger(TIME) << "rhs::gravity(): " << time << " ms";
432 profiler.value2file(ProfilerIds::Time::gravity, *profilerTime);
433 }
434 //else {
435 // int *dummy = new int[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
436 // for (int i_dummy=0; i_dummy<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; ++i_dummy) {
437 // dummy[i_dummy] = 0;
438 // }
439 // profiler.vector2file(ProfilerIds::SendLengths::gravityParticles, dummy);
440 // profiler.vector2file(ProfilerIds::SendLengths::gravityPseudoParticles, dummy);
441 // profiler.vector2file(ProfilerIds::ReceiveLengths::gravityParticles, dummy);
442 // profiler.vector2file(ProfilerIds::ReceiveLengths::gravityPseudoParticles, dummy);
443 // delete [] dummy;
444 //
445 // profiler.value2file(ProfilerIds::Time::Gravity::compTheta, 0.);
446 // profiler.value2file(ProfilerIds::Time::Gravity::symbolicForce, 0.);
447 // profiler.value2file(ProfilerIds::Time::Gravity::sending, 0.);
448 // profiler.value2file(ProfilerIds::Time::Gravity::insertReceivedPseudoParticles, 0.);
449 // profiler.value2file(ProfilerIds::Time::Gravity::insertReceivedParticles, 0.);
450 // profiler.value2file(ProfilerIds::Time::Gravity::force, 0.);
451 // profiler.value2file(ProfilerIds::Time::Gravity::repairTree, 0.);
452 // profiler.value2file(ProfilerIds::Time::gravity, 0.);
453 //}
454#endif
455
456#if SPH_SIM
457 Logger(INFO) << "rhs: sph()";
458 timer.reset();
459 // -----------------------------------------------------------------------------------------------------------------
460 time = sph();
461 // -----------------------------------------------------------------------------------------------------------------
462 elapsed = timer.elapsed();
463 Logger(TIME) << "rhs::sph(): " << elapsed << " ms"; //time << " ms";
464 profiler.value2file(ProfilerIds::Time::sph, *profilerTime);
465 totalTime += time;
466#endif
467
468 return totalTime;
469}
470
471void Miluphpc::afterIntegrationStep() {
472 // things which need/can be done after (full) integration step
473
474 // angular momentum calculation
475 if (simulationParameters.calculateAngularMomentum) {
476 angularMomentum();
477 }
478 // energy calculation
479 if (simulationParameters.calculateEnergy) {
480 energy();
481 }
482}
483
484real Miluphpc::angularMomentum() {
485 real time;
486
487 boost::mpi::communicator comm;
488
489 const unsigned int blockSizeReduction = 256;
490 real *d_outputData;
491 //cuda::malloc(d_outputData, blockSizeReduction * DIM);
492 d_outputData = buffer->d_realBuffer;
493 cuda::set(d_outputData, (real)0., blockSizeReduction * DIM);
494 time = Physics::Kernel::Launch::calculateAngularMomentumBlockwise<blockSizeReduction>(particleHandler->d_particles,
495 d_outputData,
496 numParticlesLocal);
497 real *d_intermediateAngularMomentum;
498 //cuda::malloc(d_intermediateAngularMomentum, DIM);
499 d_intermediateAngularMomentum = buffer->d_realBuffer1; //&d_outputData[blockSizeReduction * DIM];
500 cuda::set(d_intermediateAngularMomentum, (real)0., DIM);
501 time += Physics::Kernel::Launch::sumAngularMomentum<blockSizeReduction>(d_outputData,
502 d_intermediateAngularMomentum);
503
504 real *h_intermediateResult = new real[DIM];
505 //cuda::copy(h_intermediateResult, d_intermediateAngularMomentum, DIM, To::host);
506 //Logger(INFO) << "angular momentum before MPI: (" << h_intermediateResult[0] << ", "
507 // << h_intermediateResult[1] << ", " << h_intermediateResult[2] << ")";
508
509 all_reduce(comm, boost::mpi::inplace_t<real*>(d_intermediateAngularMomentum), DIM, std::plus<real>());
510 cuda::copy(h_intermediateResult, d_intermediateAngularMomentum, DIM, To::host);
511
512 //Logger(INFO) << "angular momentum: (" << h_intermediateResult[0] << ", " << h_intermediateResult[1] << ", " << h_intermediateResult[2] << ")";
513
514 real angularMomentum;
515#if DIM == 1
516 angularMomentum = abs(h_intermediateResult[0]);
517#elif DIM == 2
518 angularMomentum = sqrt(h_intermediateResult[0] * h_intermediateResult[0] + h_intermediateResult[1] *
519 h_intermediateResult[1]);
520#else
521 angularMomentum = sqrt(h_intermediateResult[0] * h_intermediateResult[0] +
522 h_intermediateResult[1] * h_intermediateResult[1] +
523 h_intermediateResult[2] * h_intermediateResult[2]);
524#endif
525
526 totalAngularMomentum = angularMomentum;
527 Logger(DEBUG) << "angular momentum: " << totalAngularMomentum;
528
529 delete [] h_intermediateResult;
530 //cuda::free(d_outputData);
531 //cuda::free(d_intermediateAngularMomentum);
532
533 Logger(TIME) << "angular momentum: " << time << " ms";
534
535 return time;
536}
537
538real Miluphpc::energy() {
539
540 real time = 0;
541
542 time = Physics::Kernel::Launch::kineticEnergy(particleHandler->d_particles, numParticlesLocal);
543
544 boost::mpi::communicator comm;
545
546 const unsigned int blockSizeReduction = 256;
547 real *d_outputData;
548 //cuda::malloc(d_outputData, blockSizeReduction);
549 d_outputData = buffer->d_realBuffer;
550 cuda::set(d_outputData, (real)0., blockSizeReduction);
551 time += CudaUtils::Kernel::Launch::reduceBlockwise<real, blockSizeReduction>(particleHandler->d_u, d_outputData,
552 numParticlesLocal);
553 real *d_intermediateResult;
554 //cuda::malloc(d_intermediateResult, 1);
555 d_intermediateResult = buffer->d_realVal;
556 cuda::set(d_intermediateResult, (real)0., 1);
557 time += CudaUtils::Kernel::Launch::blockReduction<real, blockSizeReduction>(d_outputData, d_intermediateResult);
558
559 real h_intermediateResult;
560 //cuda::copy(&h_intermediateResult, d_intermediateResult, 1, To::host);
561 //Logger(INFO) << "local energy: " << h_intermediateResult;
562
563 all_reduce(comm, boost::mpi::inplace_t<real*>(d_intermediateResult), 1, std::plus<real>());
564
565 cuda::copy(&h_intermediateResult, d_intermediateResult, 1, To::host);
566 totalEnergy = h_intermediateResult;
567
568 //cuda::free(d_outputData);
569 //cuda::free(d_intermediateResult);
570
571 Logger(DEBUG) << "energy: " << totalEnergy;
572 Logger(TIME) << "energy: " << time << " ms";
573
574 //cuda::copy(particleHandler->h_u, particleHandler->d_u, numParticlesLocal, To::host);
575 //cuda::copy(particleHandler->h_mass, particleHandler->d_mass, numParticlesLocal, To::host);
576 //for (int i=0; i<numParticlesLocal; i++) {
577 // if (i % 100 == 0 || particleHandler->h_mass[i] > 0.0001) {
578 // Logger(INFO) << "u[" << i << "] = " << particleHandler->h_u[i] << "( mass = "
579 // << particleHandler->h_mass[i] << ")";
580 // }
581 //}
582
583 return time;
584}
585
586real Miluphpc::reset() {
587 real time;
588 // START: resetting arrays, variables, buffers, ...
589 Logger(DEBUG) << "resetting (device) arrays ...";
590 // -----------------------------------------------------------------------------------------------------------------
591 time = Kernel::Launch::resetArrays(treeHandler->d_tree, particleHandler->d_particles, d_mutex, numParticles,
592 numNodes, true);
593 // -----------------------------------------------------------------------------------------------------------------
594
595#if SPH_SIM
596 cuda::set(particleHandler->d_u, (real)0., numParticlesLocal);
597#endif
598
599 cuda::set(particleHandler->d_ax, (real)0., numParticles);
600#if DIM > 1
601 cuda::set(particleHandler->d_ay, (real)0., numParticles);
602#if DIM == 3
603 cuda::set(particleHandler->d_az, (real)0., numParticles);
604#endif
605#endif
606
607 // TODO: just some testing
608 /*
609 cuda::set(treeHandler->d_child, -1, POW_DIM * numNodes);
610 cuda::set(&particleHandler->d_mass[numParticlesLocal], (real)0., numNodes - numParticlesLocal);
611 cuda::set(&particleHandler->d_x[numParticlesLocal], (real)0., numNodes - numParticlesLocal);
612#if DIM > 1
613 cuda::set(&particleHandler->d_y[numParticlesLocal], (real)0., numNodes - numParticlesLocal);
614#if DIM == 3
615 cuda::set(&particleHandler->d_z[numParticlesLocal], (real)0., numNodes - numParticlesLocal);
616#endif
617#endif
618 */
619 // end: testing
620 cuda::set(particleHandler->d_nodeType, 0, numNodes);
621
622 //cuda::set(&particleHandler->d_x[numParticles], (real)0., numNodes-numParticles);
623 //cuda::set(&particleHandler->d_y[numParticles], (real)0., numNodes-numParticles);
624 //cuda::set(&particleHandler->d_z[numParticles], (real)0., numNodes-numParticles);
625 //cuda::set(&particleHandler->d_mass[numParticles], (real)0., numNodes-numParticles);
626
627 //helperHandler->reset();
628 buffer->reset();
629 domainListHandler->reset();
630 lowestDomainListHandler->reset();
631 subDomainKeyTreeHandler->reset();
632
633#if SPH_SIM
634 cuda::set(particleHandler->d_noi, 0, numParticles);
635 cuda::set(particleHandler->d_nnl, -1, MAX_NUM_INTERACTIONS * numParticles);
636#endif
637
638 //Logger(TIME) << "resetArrays: " << time << " ms";
639 // END: resetting arrays, variables, buffers, ...
640 return time;
641}
642
643real Miluphpc::boundingBox() {
644
645 //ParticlesNS::Kernel::Launch::check4nans(particleHandler->d_particles, numParticlesLocal);
646
647 real time;
648 Logger(DEBUG) << "computing bounding box ...";
649 // ---------------------------------------------------------
650 time = TreeNS::Kernel::Launch::computeBoundingBox(treeHandler->d_tree, particleHandler->d_particles, d_mutex,
651 numParticlesLocal, 256, true);
652 // ---------------------------------------------------------
653
654 // this is/was not working properly (why?)
655 //*treeHandler->h_minX = HelperNS::reduceAndGlobalize(particleHandler->d_x, treeHandler->d_minX, numParticlesLocal, Reduction::min);
656 //*treeHandler->h_maxX = HelperNS::reduceAndGlobalize(particleHandler->d_x, treeHandler->d_maxX, numParticlesLocal, Reduction::max);
657 //*treeHandler->h_minY = HelperNS::reduceAndGlobalize(particleHandler->d_y, treeHandler->d_minY, numParticlesLocal, Reduction::min);
658 //*treeHandler->h_maxY = HelperNS::reduceAndGlobalize(particleHandler->d_y, treeHandler->d_maxY, numParticlesLocal, Reduction::max);
659 //*treeHandler->h_minZ = HelperNS::reduceAndGlobalize(particleHandler->d_z, treeHandler->d_minZ, numParticlesLocal, Reduction::min);
660 //*treeHandler->h_maxZ = HelperNS::reduceAndGlobalize(particleHandler->d_z, treeHandler->d_maxZ, numParticlesLocal, Reduction::max);
661 //treeHandler->copy(To::device);
662
663 //debug
664
665 /*
666 *treeHandler->copy(To::host);
667 *treeHandler->h_minX *= 1.1;
668 *treeHandler->h_maxX *= 1.1;
669 *treeHandler->h_minY *= 1.1;
670 *treeHandler->h_maxY *= 1.1;
671 *treeHandler->h_minZ *= 1.1;
672 *treeHandler->h_maxZ *= 1.1;
673 treeHandler->copy(To::device);
674 */
675
676 //Logger(INFO) << "x: max = " << *treeHandler->h_maxX << ", min = " << *treeHandler->h_minX;
677 //Logger(INFO) << "y: max = " << *treeHandler->h_maxY << ", min = " << *treeHandler->h_minY;
678 //Logger(INFO) << "z: max = " << *treeHandler->h_maxZ << ", min = " << *treeHandler->h_minZ;
679 //end: debug
680
681 treeHandler->globalizeBoundingBox(Execution::device);
682 treeHandler->copy(To::host);
683
684#if DIM == 1
685 Logger(DEBUG) << "Bounding box: x = (" << *treeHandler->h_minX << ", " << *treeHandler->h_maxX << ")";
686#elif DIM == 2
687 Logger(DEBUG) << "Bounding box: x = (" << *treeHandler->h_minX << ", " << *treeHandler->h_maxX << ")" << "y = ("
688 << *treeHandler->h_minY << ", " << *treeHandler->h_maxY << ")";
689#else
690 Logger(DEBUG) << "Bounding box: x = (" << std::setprecision(9) << *treeHandler->h_minX << ", "
691 << *treeHandler->h_maxX << ")" << "y = (" << *treeHandler->h_minY << ", "
692 << *treeHandler->h_maxY << ")" << "z = " << *treeHandler->h_minZ << ", "
693 << *treeHandler->h_maxZ << ")";
694#endif
695
696 Logger(TIME) << "computeBoundingBox: " << time << " ms";
697 return time;
698}
699
700real Miluphpc::assignParticles() {
701
702 real time;
703 // -----------------------------------------------------------------------------------------------------------------
704 time = SubDomainKeyTreeNS::Kernel::Launch::particlesPerProcess(subDomainKeyTreeHandler->d_subDomainKeyTree,
705 treeHandler->d_tree, particleHandler->d_particles,
706 numParticlesLocal, numNodes, curveType);
707 // -----------------------------------------------------------------------------------------------------------------
708
709 int *d_particlesProcess = buffer->d_integerBuffer; //helperHandler->d_integerBuffer;
710 int *d_particlesProcessSorted = buffer->d_integerBuffer1;
711 real *d_tempEntry = buffer->d_realBuffer; //helperHandler->d_realBuffer;
712 real *d_copyBuffer = buffer->d_realBuffer1;
713 idInteger *d_idIntTempEntry = buffer->d_idIntegerBuffer; //d_idIntegerBuffer;
714 idInteger *d_idIntCopyBuffer = buffer->d_idIntegerBuffer1; //d_idIntegerCopyBuffer;
715
716 // arrange velocities and accelerations as well (which is normally not necessary)
717 bool arrangeAll = false;
718
719 // TODO: principally sorting the keys would only be necessary once
720 // - is it possible to implement a own version of radix sort only sorting the keys ones?
721 // ---------------------------------------------------------
722 time += SubDomainKeyTreeNS::Kernel::Launch::markParticlesProcess(subDomainKeyTreeHandler->d_subDomainKeyTree,
723 treeHandler->d_tree, particleHandler->d_particles,
724 numParticlesLocal, numNodes,
725 d_particlesProcess, curveType);
726 // ---------------------------------------------------------
727 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_x, d_tempEntry);
728 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_vx, d_tempEntry);
729 if (arrangeAll) {
730 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_ax,
731 d_tempEntry);
732 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_g_ax,
733 d_tempEntry);
734 }
735#if DIM > 1
736 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_y, d_tempEntry);
737 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_vy, d_tempEntry);
738 if (arrangeAll) {
739 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_ay,
740 d_tempEntry);
741 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_g_ay,
742 d_tempEntry);
743 }
744#if DIM == 3
745 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_z, d_tempEntry);
746 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_vz, d_tempEntry);
747 if (arrangeAll) {
748 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_az,
749 d_tempEntry);
750 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_g_az,
751 d_tempEntry);
752 }
753#endif
754#endif
755
756 if (particleHandler->leapfrog) {
757 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
758 particleHandler->d_ax_old, d_tempEntry);
759 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
760 particleHandler->d_g_ax_old, d_tempEntry);
761#if DIM > 1
762 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
763 particleHandler->d_ay_old, d_tempEntry);
764 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
765 particleHandler->d_g_ay_old, d_tempEntry);
766#if DIM == 3
767 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
768 particleHandler->d_az_old, d_tempEntry);
769 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
770 particleHandler->d_g_az_old, d_tempEntry);
771#endif
772#endif
773 }
774
775 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
776 particleHandler->d_mass, d_tempEntry);
777 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
778 particleHandler->d_uid, d_idIntTempEntry);
779
780#if SPH_SIM
781 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_sml, d_tempEntry);
782 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_e, d_tempEntry);
783 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_rho, d_tempEntry);
784 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_cs, d_tempEntry);
785 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted, particleHandler->d_p, d_tempEntry);
786 time += arrangeParticleEntries(d_particlesProcess, d_particlesProcessSorted,
787 particleHandler->d_materialId, d_idIntTempEntry);
788#endif
789
790 subDomainKeyTreeHandler->copy(To::host, true, true);
791
792 Timer timer;
793 integer *sendLengths;
794 sendLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
795 sendLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
796 integer *receiveLengths;
797 receiveLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
798 receiveLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
799
800 for (int proc=0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
801 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
802 sendLengths[proc] = subDomainKeyTreeHandler->h_procParticleCounter[proc];
803 }
804 }
805 mpi::messageLengths(subDomainKeyTreeHandler->h_subDomainKeyTree, sendLengths, receiveLengths);
806
807 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_x, d_tempEntry, d_copyBuffer);
808 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_vx, d_tempEntry, d_copyBuffer);
809 if (arrangeAll) {
810 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_ax, d_tempEntry, d_copyBuffer);
811 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_g_ax, d_tempEntry, d_copyBuffer);
812 }
813#if DIM > 1
814 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_y, d_tempEntry, d_copyBuffer);
815 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_vy, d_tempEntry, d_copyBuffer);
816 if (arrangeAll) {
817 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_ay, d_tempEntry, d_copyBuffer);
818 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_g_ay, d_tempEntry, d_copyBuffer);
819 }
820#if DIM == 3
821 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_z, d_tempEntry, d_copyBuffer);
822 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_vz, d_tempEntry, d_copyBuffer);
823 if (arrangeAll) {
824 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_az, d_tempEntry, d_copyBuffer);
825 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_g_az, d_tempEntry, d_copyBuffer);
826 }
827#endif
828#endif
829
830 if (particleHandler->leapfrog) {
831 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_ax_old, d_tempEntry, d_copyBuffer);
832 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_g_ax_old, d_tempEntry, d_copyBuffer);
833#if DIM > 1
834 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_ay_old, d_tempEntry, d_copyBuffer);
835 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_g_ay_old, d_tempEntry, d_copyBuffer);
836#if DIM == 3
837 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_az_old, d_tempEntry, d_copyBuffer);
838 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_g_az_old, d_tempEntry, d_copyBuffer);
839#endif
840#endif
841 }
842
843 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_uid, d_idIntTempEntry, d_idIntCopyBuffer);
844
845#if SPH_SIM
846 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_sml, d_tempEntry, d_copyBuffer);
847 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_e, d_tempEntry, d_copyBuffer);
848 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_rho, d_tempEntry, d_copyBuffer);
849 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_cs, d_tempEntry, d_copyBuffer);
850 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_p, d_tempEntry, d_copyBuffer);
851 sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_materialId, d_idIntTempEntry, d_idIntCopyBuffer);
852#endif
853 numParticlesLocal = sendParticlesEntry(sendLengths, receiveLengths, particleHandler->d_mass,
854 d_tempEntry, d_copyBuffer);
855
856 if (numParticlesLocal > numParticles) {
857 MPI_Finalize();
858 Logger(ERROR) << "numParticlesLocal = " << numParticlesLocal << " > "
859 << " numParticles = " << numParticles;
860 // TODO: implement possibility to restart simulation
861 Logger(ERROR) << "Restart simulation with more memory! exiting ...";
862 exit(1);
863 }
864
865 delete [] sendLengths;
866 delete [] receiveLengths;
867
868 time += timer.elapsed();
869
870 int resetLength = numParticles-numParticlesLocal;
871 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_x[numParticlesLocal],
872 (real)0, resetLength);
873 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_vx[numParticlesLocal],
874 (real)0, resetLength);
875 if (arrangeAll) {
876 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_ax[numParticlesLocal],
877 (real) 0, resetLength);
878 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_g_ax[numParticlesLocal],
879 (real) 0, resetLength);
880 }
881#if DIM > 1
882 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_y[numParticlesLocal],
883 (real)0, resetLength);
884 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_vy[numParticlesLocal],
885 (real)0, resetLength);
886 if (arrangeAll) {
887 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_ay[numParticlesLocal],
888 (real) 0, resetLength);
889 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_g_ay[numParticlesLocal],
890 (real) 0, resetLength);
891 }
892#if DIM == 3
893 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_z[numParticlesLocal],
894 (real)0, resetLength);
895 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_vz[numParticlesLocal],
896 (real)0, resetLength);
897 if (arrangeAll) {
898 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_az[numParticlesLocal],
899 (real) 0, resetLength);
900 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_g_az[numParticlesLocal],
901 (real) 0, resetLength);
902 }
903#endif
904#endif
905
906 if (particleHandler->leapfrog) {
907 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_ax_old[numParticlesLocal],
908 (real) 0, resetLength);
909 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_g_ax_old[numParticlesLocal],
910 (real) 0, resetLength);
911#if DIM > 1
912 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_ay_old[numParticlesLocal],
913 (real) 0, resetLength);
914 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_g_ay_old[numParticlesLocal],
915 (real) 0, resetLength);
916#if DIM == 3
917 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_az_old[numParticlesLocal],
918 (real) 0, resetLength);
919 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_g_az_old[numParticlesLocal],
920 (real) 0, resetLength);
921#endif
922#endif
923 }
924
925 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_mass[numParticlesLocal],
926 (real)0, resetLength);
927 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_uid[numParticlesLocal],
928 (idInteger)0, resetLength);
929
930#if SPH_SIM
931 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_sml[numParticlesLocal], (real)0, resetLength);
932 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_e[numParticlesLocal], (real)0, resetLength);
933 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_rho[numParticlesLocal], (real)0, resetLength);
934 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_cs[numParticlesLocal], (real)0, resetLength);
935 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_p[numParticlesLocal], (real)0, resetLength);
936 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_materialId[numParticlesLocal],
937 (integer)0, resetLength);
938#endif
939
940 return time;
941}
942
943template <typename U, typename T>
944real Miluphpc::arrangeParticleEntries(U *sortArray, U *sortedArray, T *entry, T *temp) {
945 real time;
946 time = HelperNS::sortArray(entry, temp, sortArray, sortedArray, numParticlesLocal);
947 time += HelperNS::Kernel::Launch::copyArray(entry, temp, numParticlesLocal);
948 return time;
949}
950
951real Miluphpc::tree() {
952
953 real time = parallel_tree();
954
955 return time;
956}
957
958real Miluphpc::parallel_tree() {
959 real time;
960 real totalTime = 0.;
961
962 // START: creating domain list
963 Logger(DEBUG) << "building domain list ...";
964 // -----------------------------------------------------------------------------------------------------------------
965 if (subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses > 1) {
966 time = DomainListNS::Kernel::Launch::createDomainList(subDomainKeyTreeHandler->d_subDomainKeyTree,
967 domainListHandler->d_domainList, MAX_LEVEL,
968 curveType);
969 }
970 // -----------------------------------------------------------------------------------------------------------------
971 totalTime += time;
972 Logger(DEBUG) << "createDomainList: " << time << " ms";
973 profiler.value2file(ProfilerIds::Time::Tree::createDomain, time);
974
975 integer domainListLength;
976 cuda::copy(&domainListLength, domainListHandler->d_domainListIndex, 1, To::host);
977 Logger(DEBUG) << "domainListLength = " << domainListLength;
978 // END: creating domain list
979
980 // START: tree construction (including common coarse tree)
981#if DEBUGGING
982 integer treeIndexBeforeBuildingTree;
983 cuda::copy(&treeIndexBeforeBuildingTree, treeHandler->d_index, 1, To::host);
984 Logger(DEBUG) << "treeIndexBeforeBuildingTree: " << treeIndexBeforeBuildingTree;
985#endif
986
987 Logger(DEBUG) << "building tree ...";
988 // ---------------------------------------------------------
989 time = TreeNS::Kernel::Launch::buildTree(treeHandler->d_tree, particleHandler->d_particles, numParticlesLocal,
990 numParticles, true);
991 // ---------------------------------------------------------
992 totalTime += time;
993 Logger(TIME) << "buildTree: " << time << " ms";
994 profiler.value2file(ProfilerIds::Time::Tree::tree, time);
995
996 integer treeIndex;
997 cuda::copy(&treeIndex, treeHandler->d_index, 1, To::host);
998
999 Logger(INFO) << "treeIndex: " << treeIndex << " vs. numNodes: " << numNodes << " = "
1000 << (double)treeIndex / (double)numNodes * 100. << " %";
1001
1002#if DEBUGGING
1003 Logger(INFO) << "numParticlesLocal: " << numParticlesLocal;
1004 Logger(INFO) << "numParticles: " << numParticles;
1005 Logger(INFO) << "numNodes: " << numNodes;
1006 Logger(INFO) << "treeIndex: " << treeIndex;
1007 integer numParticlesSum = numParticlesLocal;
1008 boost::mpi::communicator comm;
1009 all_reduce(comm, boost::mpi::inplace_t<integer*>(&numParticlesSum), 1, std::plus<integer>());
1010 Logger(INFO) << "numParticlesSum: " << numParticlesSum;
1011 //ParticlesNS::Kernel::Launch::info(particleHandler->d_particles, numParticlesLocal, numParticles, treeIndex);
1012#endif
1013
1014 Logger(DEBUG) << "building domain tree ...";
1015 cuda::set(domainListHandler->d_domainListCounter, 0, 1);
1016
1017 // serial version
1018 //time = SubDomainKeyTreeNS::Kernel::Launch::buildDomainTree(treeHandler->d_tree, particleHandler->d_particles,
1019 // domainListHandler->d_domainList, numParticlesLocal,
1020 // numNodes);
1021
1022 time = 0;
1023 // parallel version
1024 // serial version (above) working for one process, "parallel" version not working for one process, thus
1025 // if statement introduced
1026 if (subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses > 1) {
1027 for (int level = 0; level <= MAX_LEVEL; level++) {
1028 // ---------------------------------------------------------------------------------------------------------
1029 time += SubDomainKeyTreeNS::Kernel::Launch::buildDomainTree(subDomainKeyTreeHandler->d_subDomainKeyTree,
1030 treeHandler->d_tree,
1031 particleHandler->d_particles,
1032 domainListHandler->d_domainList,
1033 numParticlesLocal,
1034 numNodes, level);
1035 // ---------------------------------------------------------------------------------------------------------
1036 }
1037 }
1038
1039#if DEBUGGING
1040 int domainListCounterAfterwards;
1041 cuda::copy(&domainListCounterAfterwards, domainListHandler->d_domainListCounter, 1, To::host);
1042 Logger(DEBUG) << "domain list counter afterwards : " << domainListCounterAfterwards;
1043#endif
1044
1045 cuda::set(domainListHandler->d_domainListCounter, 0, 1);
1046 // END: tree construction (including common coarse tree)
1047 totalTime += time;
1048 Logger(TIME) << "build(Domain)Tree: " << time << " ms";
1049 profiler.value2file(ProfilerIds::Time::Tree::buildDomain, time);
1050
1051 return totalTime;
1052}
1053
1054real Miluphpc::pseudoParticles() {
1055
1056 real time = parallel_pseudoParticles();
1057
1058 return time;
1059}
1060
1061real Miluphpc::parallel_pseudoParticles() {
1062
1063 real time = 0;
1064 Logger(DEBUG) << "lowestDomainList() ...";
1065 // -----------------------------------------------------------------------------------------------------------------
1066 time += DomainListNS::Kernel::Launch::lowestDomainList(subDomainKeyTreeHandler->d_subDomainKeyTree,
1067 treeHandler->d_tree, particleHandler->d_particles,
1068 domainListHandler->d_domainList,
1069 lowestDomainListHandler->d_domainList,
1070 numParticles, numNodes);
1071 // -----------------------------------------------------------------------------------------------------------------
1072
1073 Logger(DEBUG) << "calculateCentersOfMass() ...";
1074 real timeCOM = 0;
1075 for (int level=MAX_LEVEL; level>0; --level) {
1076 // -------------------------------------------------------------------------------------------------------------
1077 timeCOM += TreeNS::Kernel::Launch::calculateCentersOfMass(treeHandler->d_tree, particleHandler->d_particles,
1078 numParticles, level, true);
1079 // -------------------------------------------------------------------------------------------------------------
1080 }
1081
1082 //timeCOM = TreeNS::Kernel::Launch::centerOfMass(treeHandler->d_tree, particleHandler->d_particles,
1083 // numParticlesLocal, true);
1084
1085 Logger(TIME) << "calculate COM: " << timeCOM << " ms";
1086 time += timeCOM;
1087
1088 // -----------------------------------------------------------------------------------------------------------------
1089 SubDomainKeyTreeNS::Kernel::Launch::zeroDomainListNodes(particleHandler->d_particles,
1090 domainListHandler->d_domainList,
1091 lowestDomainListHandler->d_domainList);
1092 // -----------------------------------------------------------------------------------------------------------------
1093
1094 // old version
1095 //time += SubDomainKeyTreeNS::Kernel::Launch::compLocalPseudoParticles(treeHandler->d_tree,
1096 // particleHandler->d_particles, domainListHandler->d_domainList, numParticles);
1097 //DomainListNS::Kernel::Launch::info(particleHandler->d_particles, domainListHandler->d_domainList);
1098
1099 integer domainListIndex;
1100 integer lowestDomainListIndex;
1101
1102 cuda::copy(&domainListIndex, domainListHandler->d_domainListIndex, 1, To::host);
1103 cuda::copy(&lowestDomainListIndex, lowestDomainListHandler->d_domainListIndex, 1, To::host);
1104
1105 Logger(DEBUG) << "domainListIndex: " << domainListIndex << " | lowestDomainListIndex: "
1106 << lowestDomainListIndex;
1107 Logger(DEBUG) << "communicating/exchanging and updating domain list nodes ...";
1108
1109 boost::mpi::communicator comm;
1110
1111 //TODO: current approach reasonable?
1112 // or template functions and explicitly hand over buffer(s) (and not instance of buffer class)
1113
1114 // x ---------------------------------------------------------------------------------------------------------------
1115
1116 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1117 time += SubDomainKeyTreeNS::Kernel::Launch::prepareLowestDomainExchange(particleHandler->d_particles,
1118 lowestDomainListHandler->d_domainList,
1119 buffer->d_realBuffer, Entry::x);
1120
1121 time += HelperNS::sortArray(buffer->d_realBuffer,
1122 &buffer->d_realBuffer[simulationParameters.domainListSize],
1123 lowestDomainListHandler->d_domainListKeys,
1124 lowestDomainListHandler->d_sortedDomainListKeys,
1125 domainListIndex);
1126
1127 // share among processes
1128 //TODO: domainListIndex or lowestDomainListIndex?
1129 all_reduce(comm, boost::mpi::inplace_t<real*>(&buffer->d_realBuffer[simulationParameters.domainListSize]),
1130 domainListIndex, std::plus<real>());
1131
1132 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1133
1134 time += SubDomainKeyTreeNS::Kernel::Launch::updateLowestDomainListNodes(particleHandler->d_particles,
1135 lowestDomainListHandler->d_domainList,
1136 &buffer->d_realBuffer[simulationParameters.domainListSize],
1137 Entry::x);
1138
1139#if DIM > 1
1140 // y ---------------------------------------------------------------------------------------------------------------
1141
1142 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1143 time += SubDomainKeyTreeNS::Kernel::Launch::prepareLowestDomainExchange(particleHandler->d_particles,
1144 lowestDomainListHandler->d_domainList,
1145 buffer->d_realBuffer, Entry::y);
1146
1147 time += HelperNS::sortArray(buffer->d_realBuffer,
1148 &buffer->d_realBuffer[simulationParameters.domainListSize],
1149 lowestDomainListHandler->d_domainListKeys,
1150 lowestDomainListHandler->d_sortedDomainListKeys,
1151 domainListIndex);
1152
1153 all_reduce(comm, boost::mpi::inplace_t<real*>(&buffer->d_realBuffer[simulationParameters.domainListSize]),
1154 domainListIndex, std::plus<real>());
1155
1156 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1157
1158 time += SubDomainKeyTreeNS::Kernel::Launch::updateLowestDomainListNodes(particleHandler->d_particles,
1159 lowestDomainListHandler->d_domainList,
1160 &buffer->d_realBuffer[simulationParameters.domainListSize],
1161 Entry::y);
1162
1163#if DIM == 3
1164 // z ---------------------------------------------------------------------------------------------------------------
1165
1166 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1167 time += SubDomainKeyTreeNS::Kernel::Launch::prepareLowestDomainExchange(particleHandler->d_particles,
1168 lowestDomainListHandler->d_domainList,
1169 buffer->d_realBuffer, Entry::z);
1170
1171 time += HelperNS::sortArray(buffer->d_realBuffer,
1172 &buffer->d_realBuffer[simulationParameters.domainListSize],
1173 lowestDomainListHandler->d_domainListKeys,
1174 lowestDomainListHandler->d_sortedDomainListKeys,
1175 domainListIndex);
1176
1177 all_reduce(comm, boost::mpi::inplace_t<real*>(&buffer->d_realBuffer[simulationParameters.domainListSize]),
1178 domainListIndex, std::plus<real>());
1179
1180 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1181
1182 time += SubDomainKeyTreeNS::Kernel::Launch::updateLowestDomainListNodes(particleHandler->d_particles,
1183 lowestDomainListHandler->d_domainList,
1184 &buffer->d_realBuffer[simulationParameters.domainListSize],
1185 Entry::z);
1186
1187#endif
1188#endif
1189
1190 // m ---------------------------------------------------------------------------------------------------------------
1191
1192 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1193 time += SubDomainKeyTreeNS::Kernel::Launch::prepareLowestDomainExchange(particleHandler->d_particles,
1194 lowestDomainListHandler->d_domainList,
1195 buffer->d_realBuffer, Entry::mass);
1196
1197 time += HelperNS::sortArray(buffer->d_realBuffer,
1198 &buffer->d_realBuffer[simulationParameters.domainListSize],
1199 lowestDomainListHandler->d_domainListKeys,
1200 lowestDomainListHandler->d_sortedDomainListKeys,
1201 domainListIndex);
1202
1203 all_reduce(comm, boost::mpi::inplace_t<real*>(&buffer->d_realBuffer[simulationParameters.domainListSize]),
1204 domainListIndex, std::plus<real>());
1205
1206 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1207
1208 time += SubDomainKeyTreeNS::Kernel::Launch::updateLowestDomainListNodes(particleHandler->d_particles,
1209 lowestDomainListHandler->d_domainList,
1210 &buffer->d_realBuffer[simulationParameters.domainListSize],
1211 Entry::mass);
1212
1213 // -----------------------------------------------------------------------------------------------------------------
1214
1215 Logger(DEBUG) << "finish computation of lowest domain list nodes ...";
1216 // -----------------------------------------------------------------------------------------------------------------
1217 time += SubDomainKeyTreeNS::Kernel::Launch::compLowestDomainListNodes(treeHandler->d_tree,
1218 particleHandler->d_particles,
1219 lowestDomainListHandler->d_domainList);
1220 // -----------------------------------------------------------------------------------------------------------------
1221 //end: for all entries!
1222
1223 Logger(DEBUG) << "finish computation of (all) domain list nodes ...";
1224 // per level computation of domain list pseudo-particles to ensure the correct order (avoid race condition)
1225 for (int domainLevel = MAX_LEVEL; domainLevel>= 0; domainLevel--) {
1226 // -------------------------------------------------------------------------------------------------------------
1227 time += SubDomainKeyTreeNS::Kernel::Launch::compDomainListPseudoParticlesPerLevel(treeHandler->d_tree,
1228 particleHandler->d_particles,
1229 domainListHandler->d_domainList,
1230 lowestDomainListHandler->d_domainList,
1231 numParticles, domainLevel);
1232 // -------------------------------------------------------------------------------------------------------------
1233 }
1234
1235 //timeCOM = TreeNS::Kernel::Launch::centerOfMass(treeHandler->d_tree, particleHandler->d_particles, numParticlesLocal,
1236 // true);
1237
1238 return time;
1239}
1240
1241real Miluphpc::gravity() {
1242
1243 real time = parallel_gravity();
1244
1245 return time;
1246}
1247
1248real Miluphpc::parallel_gravity() {
1249
1250 real time;
1251 real totalTime = 0;
1252
1253 /*
1254#if UNIT_TESTING
1255 //if (subStep == 10) {
1256 int actualTreeIndex;
1257 cuda::copy(&actualTreeIndex, treeHandler->d_index, 1, To::host);
1258 Logger(TRACE) << "[" << subStep << "] Checking Masses ...";
1259 UnitTesting::Kernel::Launch::test_localTree(treeHandler->d_tree, particleHandler->d_particles, numParticles, actualTreeIndex); //numNodes);
1260 //}
1261#endif
1262 */
1263
1264 totalTime += HelperNS::Kernel::Launch::resetArray(buffer->d_realBuffer, (real)0, numParticles);
1265
1266 cuda::set(domainListHandler->d_domainListCounter, 0);
1267
1268 Logger(DEBUG) << "compTheta()";
1269 // -----------------------------------------------------------------------------------------------------------------
1270 time = Gravity::Kernel::Launch::compTheta(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
1271 particleHandler->d_particles, lowestDomainListHandler->d_domainList, //domainListHandler->d_domainList,
1272 curveType);
1273 // -----------------------------------------------------------------------------------------------------------------
1274 totalTime += time;
1275 Logger(TIME) << "compTheta(): " << time << " ms";
1276 profiler.value2file(ProfilerIds::Time::Gravity::compTheta, time);
1277
1278 integer relevantIndicesCounter;
1279 // changed from domainListHandler to lowestDomainListHandler
1280 cuda::copy(&relevantIndicesCounter, lowestDomainListHandler->d_domainListCounter, 1, To::host);
1281 //cuda::copy(&relevantIndicesCounter, domainListHandler->d_domainListCounter, 1, To::host);
1282
1283 //Logger(DEBUG) << "relevantIndicesCounter: " << relevantIndicesCounter;
1284
1285 integer *h_relevantDomainListProcess;
1286 h_relevantDomainListProcess = new integer[relevantIndicesCounter];
1287 //cuda::copy(h_relevantDomainListProcess, domainListHandler->d_relevantDomainListProcess,
1288 // relevantIndicesCounter, To::host);
1289 // changed from domainListHandler to lowestDomainListHandler
1290 cuda::copy(h_relevantDomainListProcess, lowestDomainListHandler->d_relevantDomainListProcess,
1291 relevantIndicesCounter, To::host);
1292 //cuda::copy(h_relevantDomainListProcess, domainListHandler->d_relevantDomainListProcess,
1293 // relevantIndicesCounter, To::host);
1294
1295 //for (int i=0; i<relevantIndicesCounter; i++) {
1296 // Logger(DEBUG) << "relevantDomainListProcess[" << i << "] = " << h_relevantDomainListProcess[i];
1297 //}
1298
1299 treeHandler->copy(To::host);
1300#if CUBIC_DOMAINS
1301 real diam = std::abs(*treeHandler->h_maxX) + std::abs(*treeHandler->h_minX);
1302 //Logger(INFO) << "diam: " << diam;
1303#else // !CUBIC DOMAINS
1304 real diam_x = std::abs(*treeHandler->h_maxX) + std::abs(*treeHandler->h_minX);
1305#if DIM > 1
1306 real diam_y = std::abs(*treeHandler->h_maxY) + std::abs(*treeHandler->h_minY);
1307#if DIM == 3
1308 real diam_z = std::abs(*treeHandler->h_maxZ) + std::abs(*treeHandler->h_minZ);
1309#endif
1310#endif
1311#if DIM == 1
1312 real diam = diam_x;
1313 //Logger(INFO) << "diam: " << diam << " (x = " << diam_x << ")";
1314#elif DIM == 2
1315 real diam = std::max({diam_x, diam_y});
1316 //Logger(INFO) << "diam: " << diam << " (x = " << diam_x << ", y = " << diam_y << ")";
1317#else
1318 real diam = std::max({diam_x, diam_y, diam_z});
1319 //Logger(INFO) << "diam: " << diam << " (x = " << diam_x << ", y = " << diam_y << ", z = " << diam_z << ")";
1320#endif
1321#endif // CUBIC_DOMAINS
1322
1323 // TODO: create buffer concept and (re)use buffer for gravity internode communicaton
1324 // changed from domainListHandler to lowestDomainListHandler
1325 cuda::set(lowestDomainListHandler->d_domainListCounter, 0);
1326 //cuda::set(domainListHandler->d_domainListCounter, 0);
1327
1328 integer *d_markedSendIndices = buffer->d_integerBuffer;
1329 real *d_collectedEntries = buffer->d_realBuffer;
1330
1331 integer *d_particles2SendIndices = buffer->d_integerBuffer1;
1332 cuda::set(d_particles2SendIndices, -1, numParticles);
1333 integer *d_pseudoParticles2SendIndices = buffer->d_integerBuffer2;
1334 cuda::set(d_pseudoParticles2SendIndices, -1, numParticles);
1335 integer *d_pseudoParticles2SendLevels = buffer->d_integerBuffer3;
1336 cuda::set(d_pseudoParticles2SendLevels, -1, numParticles);
1337 integer *d_pseudoParticles2ReceiveLevels = buffer->d_integerBuffer4;
1338 cuda::set(d_pseudoParticles2ReceiveLevels, -1, numParticles);
1339
1340 integer *d_particles2SendCount = buffer->d_sendCount;
1341 cuda::set(d_particles2SendCount, 0, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses);
1342 integer *d_pseudoParticles2SendCount = buffer->d_sendCount1;
1343 cuda::set(d_pseudoParticles2SendCount, 0, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses);
1344
1345 integer particlesOffset = 0;
1346 integer pseudoParticlesOffset = 0;
1347
1348 integer particlesOffsetBuffer;
1349 integer pseudoParticlesOffsetBuffer;
1350
1351 integer *h_particles2SendCount = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
1352 integer *h_pseudoParticles2SendCount = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
1353
1354 int symbolicForceVersion = 4;
1355
1356 time = 0;
1357 if (symbolicForceVersion == 0) {
1358 for (int proc = 0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1359 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1360 cuda::set(d_markedSendIndices, -1, numNodes);
1361 for (int level = 0; level < MAX_LEVEL; level++) {
1362 // ---------------------------------------------------------
1363 // changed from domainListHandler to lowestDomainListHandler
1364 time += Gravity::Kernel::Launch::intermediateSymbolicForce(
1365 subDomainKeyTreeHandler->d_subDomainKeyTree,
1366 treeHandler->d_tree, particleHandler->d_particles, lowestDomainListHandler->d_domainList,
1367 d_markedSendIndices, diam, simulationParameters.theta, numParticlesLocal, numParticles, 0,
1368 level,
1369 curveType);
1370 //time += Gravity::Kernel::Launch::intermediateSymbolicForce(subDomainKeyTreeHandler->d_subDomainKeyTree,
1371 // treeHandler->d_tree, particleHandler->d_particles, domainListHandler->d_domainList,
1372 // d_markedSendIndices, diam, simulationParameters.theta, numParticlesLocal, numParticles,0, level,
1373 // curveType);
1374 // ---------------------------------------------------------
1375 for (int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1376 if (h_relevantDomainListProcess[relevantIndex] == proc) {
1377 //Logger(INFO) << "h_relevantDomainListProcess[" << relevantIndex << "] = "
1378 // << h_relevantDomainListProcess[relevantIndex];
1379 // ---------------------------------------------------------
1380 // changed from domainListHandler to lowestDomainListHandler
1381 time += Gravity::Kernel::Launch::symbolicForce(subDomainKeyTreeHandler->d_subDomainKeyTree,
1382 treeHandler->d_tree,
1383 particleHandler->d_particles,
1384 lowestDomainListHandler->d_domainList,
1385 d_markedSendIndices, diam,
1386 simulationParameters.theta,
1387 numParticlesLocal, numParticles,
1388 relevantIndex, level, curveType);
1389 //time += Gravity::Kernel::Launch::symbolicForce(subDomainKeyTreeHandler->d_subDomainKeyTree,
1390 // treeHandler->d_tree, particleHandler->d_particles,
1391 // domainListHandler->d_domainList,
1392 // d_markedSendIndices, diam,
1393 // simulationParameters.theta,
1394 // numParticlesLocal, numParticles,
1395 // relevantIndex, level, curveType);
1396 // ---------------------------------------------------------
1397 }
1398 }
1399 }
1400 // changed from domainListHandler to lowestDomainListHandler
1401 time += Gravity::Kernel::Launch::intermediateSymbolicForce(subDomainKeyTreeHandler->d_subDomainKeyTree,
1402 treeHandler->d_tree,
1403 particleHandler->d_particles,
1404 lowestDomainListHandler->d_domainList,
1405 d_markedSendIndices, diam,
1406 simulationParameters.theta,
1407 numParticlesLocal, numParticles, 0, 0,
1408 curveType);
1409 //time += Gravity::Kernel::Launch::intermediateSymbolicForce(subDomainKeyTreeHandler->d_subDomainKeyTree,
1410 // treeHandler->d_tree, particleHandler->d_particles, domainListHandler->d_domainList,
1411 // d_markedSendIndices, diam, simulationParameters.theta, numParticlesLocal, numParticles,0, 0,
1412 // curveType);
1413 // ---------------------------------------------------------
1414 time += Gravity::Kernel::Launch::collectSendIndices(treeHandler->d_tree, particleHandler->d_particles,
1415 d_markedSendIndices,
1416 &d_particles2SendIndices[particlesOffset],
1417 &d_pseudoParticles2SendIndices[pseudoParticlesOffset],
1418 &d_pseudoParticles2SendLevels[pseudoParticlesOffset],
1419 &d_particles2SendCount[proc],
1420 &d_pseudoParticles2SendCount[proc],
1421 numParticles, numNodes, curveType);
1422 // ---------------------------------------------------------
1423
1424 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
1425 cuda::copy(&pseudoParticlesOffsetBuffer, &d_pseudoParticles2SendCount[proc], 1, To::host);
1426
1427 Logger(DEBUG) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
1428 Logger(DEBUG) << "pseudoParticles2SendCount[" << proc << "] = " << pseudoParticlesOffsetBuffer;
1429
1430 particlesOffset += particlesOffsetBuffer;
1431 pseudoParticlesOffset += pseudoParticlesOffsetBuffer;
1432 }
1433 }
1434 }
1435 else if (symbolicForceVersion == 1) {
1436 // symbolic force testing
1437 time = 0;
1438 for (int proc = 0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1439 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1440 cuda::set(d_markedSendIndices, -1, numNodes);
1441 for (int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1442 if (h_relevantDomainListProcess[relevantIndex] == proc) {
1443 time += Gravity::Kernel::Launch::symbolicForce_test(subDomainKeyTreeHandler->d_subDomainKeyTree,
1444 treeHandler->d_tree,
1445 particleHandler->d_particles,
1446 lowestDomainListHandler->d_domainList,
1447 d_markedSendIndices, diam,
1448 simulationParameters.theta,
1449 numParticlesLocal, numParticles,
1450 relevantIndex, 0, curveType);
1451 }
1452 }
1453 time += Gravity::Kernel::Launch::collectSendIndices(treeHandler->d_tree, particleHandler->d_particles,
1454 d_markedSendIndices,
1455 &d_particles2SendIndices[particlesOffset],
1456 &d_pseudoParticles2SendIndices[pseudoParticlesOffset],
1457 &d_pseudoParticles2SendLevels[pseudoParticlesOffset],
1458 &d_particles2SendCount[proc],
1459 &d_pseudoParticles2SendCount[proc],
1460 numParticles, numNodes, curveType);
1461
1462 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
1463 cuda::copy(&pseudoParticlesOffsetBuffer, &d_pseudoParticles2SendCount[proc], 1, To::host);
1464
1465 Logger(DEBUG) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
1466 Logger(DEBUG) << "pseudoParticles2SendCount[" << proc << "] = " << pseudoParticlesOffsetBuffer;
1467
1468 particlesOffset += particlesOffsetBuffer;
1469 pseudoParticlesOffset += pseudoParticlesOffsetBuffer;
1470 }
1471 }
1472 // end: symbolic force testing
1473 }
1474 else if (symbolicForceVersion == 2) {
1475 // symbolic force testing 2
1476 time = 0;
1477 for (int proc = 0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1478 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1479 cuda::set(d_markedSendIndices, -1, numNodes);
1480
1481 time += Gravity::Kernel::Launch::symbolicForce_test2(subDomainKeyTreeHandler->d_subDomainKeyTree,
1482 treeHandler->d_tree, particleHandler->d_particles,
1483 lowestDomainListHandler->d_domainList,
1484 d_markedSendIndices, diam,
1485 simulationParameters.theta,
1486 numParticles, numParticles,
1487 proc, relevantIndicesCounter, curveType);
1488
1489 time += Gravity::Kernel::Launch::collectSendIndices(treeHandler->d_tree, particleHandler->d_particles,
1490 d_markedSendIndices,
1491 &d_particles2SendIndices[particlesOffset],
1492 &d_pseudoParticles2SendIndices[pseudoParticlesOffset],
1493 &d_pseudoParticles2SendLevels[pseudoParticlesOffset],
1494 &d_particles2SendCount[proc],
1495 &d_pseudoParticles2SendCount[proc],
1496 numParticles, numNodes, curveType);
1497
1498 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
1499 cuda::copy(&pseudoParticlesOffsetBuffer, &d_pseudoParticles2SendCount[proc], 1, To::host);
1500
1501 Logger(DEBUG) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
1502 Logger(DEBUG) << "pseudoParticles2SendCount[" << proc << "] = " << pseudoParticlesOffsetBuffer;
1503
1504 particlesOffset += particlesOffsetBuffer;
1505 pseudoParticlesOffset += pseudoParticlesOffsetBuffer;
1506 }
1507 }
1508 // end: symbolic force testing 2
1509 }
1510 else if (symbolicForceVersion == 3) {
1511 // symbolic force testing 3
1512 time = 0;
1513 real time_symbolic = 0.;
1514 real time_collect = 0.;
1515 for (int proc = 0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1516 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1517 cuda::set(d_markedSendIndices, -1, numNodes);
1518
1519 time_symbolic += Gravity::Kernel::Launch::symbolicForce_test3(subDomainKeyTreeHandler->d_subDomainKeyTree,
1520 treeHandler->d_tree, particleHandler->d_particles,
1521 lowestDomainListHandler->d_domainList,
1522 d_markedSendIndices, diam,
1523 simulationParameters.theta,
1524 numParticles, numParticles,
1525 proc, relevantIndicesCounter, curveType);
1526 time += time_symbolic;
1527 time_collect += Gravity::Kernel::Launch::collectSendIndices(treeHandler->d_tree, particleHandler->d_particles,
1528 d_markedSendIndices,
1529 &d_particles2SendIndices[particlesOffset],
1530 &d_pseudoParticles2SendIndices[pseudoParticlesOffset],
1531 &d_pseudoParticles2SendLevels[pseudoParticlesOffset],
1532 &d_particles2SendCount[proc],
1533 &d_pseudoParticles2SendCount[proc],
1534 numParticles, numNodes, curveType);
1535 time += time_collect;
1536
1537 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
1538 cuda::copy(&pseudoParticlesOffsetBuffer, &d_pseudoParticles2SendCount[proc], 1, To::host);
1539
1540 Logger(DEBUG) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
1541 Logger(DEBUG) << "pseudoParticles2SendCount[" << proc << "] = " << pseudoParticlesOffsetBuffer;
1542
1543 particlesOffset += particlesOffsetBuffer;
1544 pseudoParticlesOffset += pseudoParticlesOffsetBuffer;
1545 }
1546 }
1547 Logger(TRACE) << "time symbolic: " << time_symbolic << " vs. collect: " << time_collect << " ms ...";
1548 // end: symbolic force testing 2
1549 }
1550 else if (symbolicForceVersion == 4) {
1551
1552 time = 0;
1553 real time_symbolic = 0.;
1554 real time_collect = 0.;
1555
1556 integer treeIndex;
1557 cuda::copy(&treeIndex, treeHandler->d_index, 1, To::host);
1558
1559 cuda::set(d_markedSendIndices, 0, numNodes);
1560
1561 time_symbolic += Gravity::Kernel::Launch::symbolicForce_test4(subDomainKeyTreeHandler->d_subDomainKeyTree,
1562 treeHandler->d_tree, particleHandler->d_particles,
1563 lowestDomainListHandler->d_domainList,
1564 d_markedSendIndices, diam,
1565 simulationParameters.theta,
1566 numParticles, numParticles,
1567 0, relevantIndicesCounter, curveType);
1568
1569 time += time_symbolic;
1570
1571 Logger(INFO) << "finished symbolic force ...";
1572
1573 for (int proc = 0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1574 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1575
1576 time_collect += Gravity::Kernel::Launch::collectSendIndices_test4(treeHandler->d_tree, particleHandler->d_particles,
1577 d_markedSendIndices,
1578 &d_particles2SendIndices[particlesOffset],
1579 &d_pseudoParticles2SendIndices[pseudoParticlesOffset],
1580 &d_pseudoParticles2SendLevels[pseudoParticlesOffset],
1581 &d_particles2SendCount[proc],
1582 &d_pseudoParticles2SendCount[proc],
1583 numParticlesLocal, numParticles, treeIndex,
1584 proc, curveType);
1585 time += time_collect;
1586
1587 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
1588 cuda::copy(&pseudoParticlesOffsetBuffer, &d_pseudoParticles2SendCount[proc], 1, To::host);
1589
1590 Logger(DEBUG) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
1591 Logger(DEBUG) << "pseudoParticles2SendCount[" << proc << "] = " << pseudoParticlesOffsetBuffer;
1592
1593 particlesOffset += particlesOffsetBuffer;
1594 pseudoParticlesOffset += pseudoParticlesOffsetBuffer;
1595 }
1596 }
1597 Logger(TRACE) << "time symbolic: " << time_symbolic << " vs. collect: " << time_collect << " ms ...";
1598 }
1599 else {
1600 MPI_Finalize();
1601 Logger(ERROR) << "symbolicForceVersion: " << symbolicForceVersion << " not available!";
1602 exit(1);
1603 }
1604
1605
1606//#if DEBUGGING
1607// Gravity::Kernel::Launch::testSendIndices(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
1608// particleHandler->d_particles, d_pseudoParticles2SendIndices,
1609// d_markedSendIndices,
1610// d_pseudoParticles2SendLevels, curveType, pseudoParticlesOffset);
1611//#endif
1612
1613 totalTime += time;
1614 Logger(TIME) << "symbolicForce: " << time << " ms";
1615 profiler.value2file(ProfilerIds::Time::Gravity::symbolicForce, time);
1616
1617 Timer timer;
1618
1619 cuda::copy(h_particles2SendCount, d_particles2SendCount, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses,
1620 To::host);
1621 cuda::copy(h_pseudoParticles2SendCount, d_pseudoParticles2SendCount,
1622 subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses, To::host);
1623
1624 integer *particleSendLengths;
1625 particleSendLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
1626 particleSendLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
1627 integer *particleReceiveLengths;
1628 particleReceiveLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
1629 particleReceiveLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
1630
1631 integer *pseudoParticleSendLengths;
1632 pseudoParticleSendLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
1633 pseudoParticleSendLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
1634 integer *pseudoParticleReceiveLengths;
1635 pseudoParticleReceiveLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
1636 pseudoParticleReceiveLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
1637
1638 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1639 //Logger(INFO) << "h_particles2SendCount[" << proc << "] = " << h_particles2SendCount[proc];
1640 //Logger(INFO) << "h_pseudoParticles2SendCount[" << proc << "] = " << h_pseudoParticles2SendCount[proc];
1641 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1642 particleSendLengths[proc] = h_particles2SendCount[proc];
1643 pseudoParticleSendLengths[proc] = h_pseudoParticles2SendCount[proc];
1644 Logger(INFO) << "particleSendLengths[" << proc << "] = " << particleSendLengths[proc];
1645 Logger(INFO) << "pseudoParticleSendLengths[" << proc << "] = " << pseudoParticleSendLengths[proc];
1646 }
1647 }
1648
1649 profiler.vector2file(ProfilerIds::SendLengths::gravityParticles, particleSendLengths);
1650 profiler.vector2file(ProfilerIds::SendLengths::gravityPseudoParticles, pseudoParticleSendLengths);
1651
1652 mpi::messageLengths(subDomainKeyTreeHandler->h_subDomainKeyTree, particleSendLengths, particleReceiveLengths);
1653
1654 integer particleTotalReceiveLength = 0;
1655 integer particleTotalSendLength = 0;
1656 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1657 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1658 particleTotalReceiveLength += particleReceiveLengths[proc];
1659 particleTotalSendLength += particleSendLengths[proc];
1660 }
1661 }
1662
1663 Logger(INFO) << "gravity: particleTotalReceiveLength: " << particleTotalReceiveLength;
1664 Logger(INFO) << "gravity: particleTotalSendLength: " << particleTotalSendLength;
1665
1666 Logger(INFO) << "temporary numParticles: " << numParticlesLocal + particleTotalReceiveLength;
1667
1668 mpi::messageLengths(subDomainKeyTreeHandler->h_subDomainKeyTree, pseudoParticleSendLengths,
1669 pseudoParticleReceiveLengths);
1670
1671 profiler.vector2file(ProfilerIds::ReceiveLengths::gravityParticles, particleReceiveLengths);
1672 profiler.vector2file(ProfilerIds::ReceiveLengths::gravityPseudoParticles, pseudoParticleReceiveLengths);
1673
1674 integer pseudoParticleTotalReceiveLength = 0;
1675 integer pseudoParticleTotalSendLength = 0;
1676 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1677 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1678 pseudoParticleTotalReceiveLength += pseudoParticleReceiveLengths[proc];
1679 pseudoParticleTotalSendLength += pseudoParticleSendLengths[proc];
1680 //Logger(INFO) << "particleReceiveLengths[" << proc << "] = " << particleReceiveLengths[proc];
1681 //Logger(INFO) << "pseudoParticleReceiveLengths[" << proc << "] = " << pseudoParticleReceiveLengths[proc];
1682 }
1683 }
1684
1685#if DEBUGGING
1686 Gravity::Kernel::Launch::testSendIndices(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
1687 particleHandler->d_particles, d_pseudoParticles2SendIndices,
1688 d_markedSendIndices,
1689 d_pseudoParticles2SendLevels, curveType, pseudoParticleTotalSendLength);
1690#endif
1691
1692 // debug
1693 //particleHandler->copyDistribution(To::host, false, false, true);
1694 //int *h_pseudoParticlesSendIndices = new int[pseudoParticleTotalSendLength];
1695 //cuda::copy(h_pseudoParticlesSendIndices, d_pseudoParticles2SendIndices, pseudoParticleTotalSendLength, To::host);
1696 //int debug_offset = 0;
1697 //for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
1698 // if (proc != proc < subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
1699 // for (int i = 0; i < pseudoParticleSendLengths[proc]; i++) {
1700 // for (int j = 0; j < pseudoParticleSendLengths[proc]; j++) {
1701 // if (i != j) {
1702 // if (h_pseudoParticlesSendIndices[i + debug_offset] == h_pseudoParticlesSendIndices[j + debug_offset] ||
1703 // (particleHandler->h_x[h_pseudoParticlesSendIndices[i + debug_offset]] == particleHandler->h_x[h_pseudoParticlesSendIndices[j + debug_offset]] &&
1704 // particleHandler->h_y[h_pseudoParticlesSendIndices[i + debug_offset]] == particleHandler->h_y[h_pseudoParticlesSendIndices[j + debug_offset]])) {
1705 // Logger(INFO) << "found duplicate regarding proc " << proc << ": index: i: " << i << " = "
1706 // << h_pseudoParticlesSendIndices[i + debug_offset]
1707 // << ", j: " << j << " = " << h_pseudoParticlesSendIndices[j + debug_offset] <<
1708 // " (" << particleHandler->h_x[h_pseudoParticlesSendIndices[i + debug_offset]]
1709 // << ", " << particleHandler->h_x[h_pseudoParticlesSendIndices[j + debug_offset]] << ")";
1710 // }
1711 // }
1712 // }
1713 // }
1714 // debug_offset += pseudoParticleSendLengths[proc];
1715 // }
1716 //}
1717 //delete [] h_pseudoParticlesSendIndices;
1718 // end: debug
1719
1720 Logger(INFO) << "gravity: pseudoParticleTotalReceiveLength: " << pseudoParticleTotalReceiveLength;
1721 Logger(INFO) << "gravity: pseudoParticleTotalSendLength: " << pseudoParticleTotalSendLength;
1722
1723 integer treeIndex;
1724 cuda::copy(&treeIndex, treeHandler->d_index, 1, To::host);
1725
1726 // TODO: this is not working properly
1727 if (false/*simulationParameters.particlesSent2H5 && subStep == 500*/) {
1728
1729 // WRITING PARTICLES TO SEND TO H5 FILE
1730
1731 // writing particles
1732 int *h_particles2SendIndices = new int[particleTotalSendLength];
1733 cuda::copy(h_particles2SendIndices, d_particles2SendIndices, particleTotalSendLength, To::host);
1734 particles2file(std::string{"Gravity2SendParticles"}, h_particles2SendIndices, particleTotalSendLength);
1735 delete [] h_particles2SendIndices;
1736 // end: writing particles
1737
1738 // writing pseudo-particles
1739 int *h_pseudoParticles2SendIndices = new int[pseudoParticleTotalSendLength];
1740 cuda::copy(h_pseudoParticles2SendIndices, d_pseudoParticles2SendIndices, pseudoParticleTotalSendLength, To::host);
1741 particles2file(std::string{"Gravity2SendPseudoParticles"}, h_pseudoParticles2SendIndices, pseudoParticleTotalSendLength);
1742 delete [] h_pseudoParticles2SendIndices;
1743 // end: writing pseudo-particles
1744
1745 // writing both: particles and pseudo-particles
1746 int *h_sendIndices = new int[particleTotalSendLength + pseudoParticleTotalSendLength];
1747 cuda::copy(&h_sendIndices[0], d_particles2SendIndices, particleTotalSendLength, To::host);
1748 cuda::copy(&h_sendIndices[particleTotalSendLength], d_pseudoParticles2SendIndices, pseudoParticleTotalSendLength, To::host);
1749 particles2file(std::string{"Gravity2SendBoth"}, h_sendIndices, particleTotalSendLength + pseudoParticleTotalSendLength);
1750 delete [] h_sendIndices;
1751 // end: writing both: particles and pseudo-particles
1752
1753 // END: WRITING PARTICLES TO SEND TO H5 FILE
1754 }
1755
1756 //#if DEBUGGING
1757 //TreeNS::Kernel::Launch::info(treeHandler->d_tree, particleHandler->d_particles, treeIndex,
1758 // treeIndex + pseudoParticleTotalReceiveLength);
1759 //TreeNS::Kernel::Launch::info(treeHandler->d_tree, particleHandler->d_particles, numParticlesLocal,
1760 // numParticlesLocal + particleTotalReceiveLength);
1761 //#endif
1762
1763 // x-entry pseudo-particle exchange
1764 CudaUtils::Kernel::Launch::collectValues(d_pseudoParticles2SendIndices, particleHandler->d_x, d_collectedEntries,
1765 pseudoParticleTotalSendLength);
1766 sendParticles(d_collectedEntries, &particleHandler->d_x[treeIndex], pseudoParticleSendLengths,
1767 pseudoParticleReceiveLengths);
1768 // x-entry particle exchange
1769 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_x, d_collectedEntries,
1770 particleTotalSendLength);
1771 sendParticles(d_collectedEntries, &particleHandler->d_x[numParticlesLocal], particleSendLengths,
1772 particleReceiveLengths);
1773
1774 //Particle velocity and acceleration:
1775 // vx-entry particle exchange
1776 //CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_vx, d_collectedEntries,
1777 // particleTotalSendLength);
1778 //sendParticles(d_collectedEntries, &particleHandler->d_vx[numParticlesLocal], particleSendLengths,
1779 // particleReceiveLengths);
1780 // ax-entry particle exchange
1781 //CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_ax, d_collectedEntries,
1782 // particleTotalSendLength);
1783 //sendParticles(d_collectedEntries, &particleHandler->d_ax[numParticlesLocal], particleSendLengths,
1784 // particleReceiveLengths);
1785#if DIM > 1
1786 // y-entry pseudo-particle exchange
1787 CudaUtils::Kernel::Launch::collectValues(d_pseudoParticles2SendIndices, particleHandler->d_y, d_collectedEntries,
1788 pseudoParticleTotalSendLength);
1789 sendParticles(d_collectedEntries, &particleHandler->d_y[treeIndex], pseudoParticleSendLengths,
1790 pseudoParticleReceiveLengths);
1791 // y-entry particle exchange
1792 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_y, d_collectedEntries,
1793 particleTotalSendLength);
1794
1795 sendParticles(d_collectedEntries, &particleHandler->d_y[numParticlesLocal], particleSendLengths,
1796 particleReceiveLengths);
1797
1798 //Particle velocity and acceleration:
1799 // vy-entry particle exchange
1800 //CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_vy, d_collectedEntries,
1801 // particleTotalSendLength);
1802 //sendParticles(d_collectedEntries, &particleHandler->d_vy[numParticlesLocal], particleSendLengths,
1803 // particleReceiveLengths);
1804 // ay-entry particle exchange
1805 //CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_ay, d_collectedEntries,
1806 // particleTotalSendLength);
1807 //sendParticles(d_collectedEntries, &particleHandler->d_ay[numParticlesLocal], particleSendLengths,
1808 // particleReceiveLengths);
1809#if DIM == 3
1810 // z-entry pseudo-particle exchange
1811 CudaUtils::Kernel::Launch::collectValues(d_pseudoParticles2SendIndices, particleHandler->d_z, d_collectedEntries,
1812 pseudoParticleTotalSendLength);
1813 sendParticles(d_collectedEntries, &particleHandler->d_z[treeIndex], pseudoParticleSendLengths,
1814 pseudoParticleReceiveLengths);
1815 // z-entry particle exchange
1816 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_z, d_collectedEntries,
1817 particleTotalSendLength);
1818 sendParticles(d_collectedEntries, &particleHandler->d_z[numParticlesLocal], particleSendLengths,
1819 particleReceiveLengths);
1820
1821 //Particle velocity and acceleration:
1822 // vz-entry particle exchange
1823 //CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_vz, d_collectedEntries,
1824 // particleTotalSendLength);
1825 //sendParticles(d_collectedEntries, &particleHandler->d_vz[numParticlesLocal], particleSendLengths,
1826 // particleReceiveLengths);
1827 // az-entry particle exchange
1828 //CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_az, d_collectedEntries,
1829 // particleTotalSendLength);
1830 //sendParticles(d_collectedEntries, &particleHandler->d_az[numParticlesLocal], particleSendLengths,
1831 // particleReceiveLengths);
1832
1833#endif
1834#endif
1835
1836 // mass-entry pseudo-particle exchange
1837 CudaUtils::Kernel::Launch::collectValues(d_pseudoParticles2SendIndices, particleHandler->d_mass, d_collectedEntries,
1838 pseudoParticleTotalSendLength);
1839 sendParticles(d_collectedEntries, &particleHandler->d_mass[treeIndex], pseudoParticleSendLengths,
1840 pseudoParticleReceiveLengths);
1841 // mass-entry particle exchange
1842 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_mass, d_collectedEntries,
1843 particleTotalSendLength);
1844 sendParticles(d_collectedEntries, &particleHandler->d_mass[numParticlesLocal], particleSendLengths,
1845 particleReceiveLengths);
1846
1847 // PSEUDO-PARTICLE level exchange
1848 sendParticles(d_pseudoParticles2SendLevels, d_pseudoParticles2ReceiveLevels, pseudoParticleSendLengths,
1849 pseudoParticleReceiveLengths);
1850
1851
1852 time = timer.elapsed();
1853 totalTime += time;
1854 Logger(TIME) << "parallel_force(): sending particles: " << time << " ms";
1855 profiler.value2file(ProfilerIds::Time::Gravity::sending, time);
1856
1857 //#if DEBUGGING
1858 //Logger(INFO) << "exchanged particle entry: x";
1859 //if (subDomainKeyTreeHandler->h_subDomainKeyTree->rank == 0) {
1860 //TreeNS::Kernel::Launch::info(treeHandler->d_tree, particleHandler->d_particles, treeIndex,
1861 // treeIndex + pseudoParticleTotalReceiveLength);
1862 //TreeNS::Kernel::Launch::info(treeHandler->d_tree, particleHandler->d_particles, numParticlesLocal,
1863 // numParticlesLocal + particleTotalReceiveLength);
1864 //}
1865 //#endif
1866
1867 treeHandler->h_toDeleteLeaf[0] = numParticlesLocal;
1868 treeHandler->h_toDeleteLeaf[1] = numParticlesLocal + particleTotalReceiveLength;
1869 cuda::copy(treeHandler->h_toDeleteLeaf, treeHandler->d_toDeleteLeaf, 2, To::device);
1870
1871 if (treeHandler->h_toDeleteLeaf[1] > numParticles) {
1872 MPI_Finalize();
1873 Logger(ERROR) << "numParticlesLocal + receiveLength = " << treeHandler->h_toDeleteLeaf[1] << " > " << " numParticles = " << numParticles;
1874 Logger(ERROR) << "Restart simulation with more memory! exiting ...";
1875 exit(1);
1876 }
1877
1878 //int debugOffset = 0;
1879 //for (int proc=0; proc<subDomainKeyTreeHandler->h_numProcesses; proc++) {
1880 // gpuErrorcheck(cudaMemset(buffer->d_integerVal, 0, sizeof(integer)));
1881 // CudaUtils::Kernel::Launch::findDuplicateEntries(&particleHandler->d_x[treeIndex + debugOffset],
1882 // &particleHandler->d_y[treeIndex + debugOffset],
1883 // buffer->d_integerVal,
1884 // particleReceiveLengths[proc]);
1885 // integer duplicates;
1886 // gpuErrorcheck(cudaMemcpy(&duplicates, buffer->d_integerVal, sizeof(integer), cudaMemcpyDeviceToHost));
1887 // Logger(INFO) << "duplicates: " << duplicates << " between: " << treeIndex + debugOffset << " and " << treeIndex + particleReceiveLengths[proc] + debugOffset;
1888 //
1889 // debugOffset += particleReceiveLengths[proc];
1890 //}
1891
1892 //#if DEBUGGING
1893 //gpuErrorcheck(cudaMemset(buffer->d_integerVal, 0, sizeof(integer)));
1894 //CudaUtils::Kernel::Launch::findDuplicateEntries(&particleHandler->d_x[treeHandler->h_toDeleteLeaf[0]],
1895 // &particleHandler->d_y[treeHandler->h_toDeleteLeaf[0]],
1896 // buffer->d_integerVal,
1897 // particleTotalReceiveLength);
1898 //integer duplicates;
1899 //gpuErrorcheck(cudaMemcpy(&duplicates, buffer->d_integerVal, sizeof(integer), cudaMemcpyDeviceToHost));
1900 //Logger(INFO) << "duplicates: " << duplicates << " between: " << treeHandler->h_toDeleteLeaf[0] << " and " << treeHandler->h_toDeleteLeaf[0] + particleTotalReceiveLength;
1901 //#endif
1902
1903 //#if DEBUGGING
1904 // debugging
1905 //gpuErrorcheck(cudaMemset(buffer->d_integerVal, 0, sizeof(integer)));
1906 //CudaUtils::Kernel::Launch::findDuplicateEntries(&particleHandler->d_x[0],
1907 // &particleHandler->d_y[0],
1908 // buffer->d_integerVal,
1909 // numParticlesLocal + particleTotalReceiveLength);
1910 //integer duplicates;
1911 //gpuErrorcheck(cudaMemcpy(&duplicates, buffer->d_integerVal, sizeof(integer), cudaMemcpyDeviceToHost));
1912 //Logger(INFO) << "duplicates: " << duplicates << " between: " << 0 << " and " << numParticlesLocal + particleTotalReceiveLength;
1913 // end: debugging
1914 //#endif
1915
1916 treeHandler->h_toDeleteNode[0] = treeIndex;
1917 treeHandler->h_toDeleteNode[1] = treeIndex + pseudoParticleTotalReceiveLength;
1918
1919 if (treeHandler->h_toDeleteNode[1] > numNodes) {
1920 MPI_Finalize();
1921 Logger(ERROR) << "needed numNodes = " << treeHandler->h_toDeleteNode[1] << " > "
1922 << " numNodes = " << numNodes;
1923 Logger(ERROR) << "Restart simulation with more memory! exiting ...";
1924 exit(1);
1925 }
1926
1927 cuda::copy(treeHandler->h_toDeleteNode, treeHandler->d_toDeleteNode, 2, To::device);
1928
1929#if DEBUGGING
1930 Logger(INFO) << "toDeleteLeaf: " << treeHandler->h_toDeleteLeaf[0] << " : " << treeHandler->h_toDeleteLeaf[1];
1931 Logger(INFO) << "toDeleteNode: " << treeHandler->h_toDeleteNode[0] << " : " << treeHandler->h_toDeleteNode[1];
1932#endif
1933
1934 time = 0;
1935 // insert received pseudo-particles per level in order to ensure correct order (avoid race condition)
1936 for (int level=0; level<MAX_LEVEL; level++) {
1937 // -------------------------------------------------------------------------------------------------------------
1938 time += Gravity::Kernel::Launch::insertReceivedPseudoParticles(subDomainKeyTreeHandler->d_subDomainKeyTree,
1939 treeHandler->d_tree,
1940 particleHandler->d_particles,
1941 d_pseudoParticles2ReceiveLevels, level,
1942 numParticles, numParticles);
1943 // -------------------------------------------------------------------------------------------------------------
1944 }
1945
1946 totalTime += time;
1947 Logger(TIME) << "parallel_gravity: inserting received pseudo-particles: " << time << " ms";
1948 profiler.value2file(ProfilerIds::Time::Gravity::insertReceivedPseudoParticles, time);
1949
1950 time = 0;
1951 //if (treeHandler->h_toDeleteLeaf[0] < treeHandler->h_toDeleteLeaf[1]) {
1952 // ---------------------------------------------------------
1953 time += Gravity::Kernel::Launch::insertReceivedParticles(subDomainKeyTreeHandler->d_subDomainKeyTree,
1954 treeHandler->d_tree, particleHandler->d_particles,
1955 domainListHandler->d_domainList,
1956 lowestDomainListHandler->d_domainList,
1957 numParticles, numParticles);
1958 // ---------------------------------------------------------
1959 //}
1960 totalTime += time;
1961 Logger(TIME) << "parallel_gravity: inserting received particles: " << time << " ms";
1962 profiler.value2file(ProfilerIds::Time::Gravity::insertReceivedParticles, time);
1963
1964 Logger(DEBUG) << "Finished inserting received particles!";
1965
1966#if UNIT_TESTING
1967 //if (subStep == 10) {
1968 int actualTreeIndex;
1969 cuda::copy(&actualTreeIndex, treeHandler->d_index, 1, To::host);
1970 Logger(TRACE) << "[" << subStep << "] Checking Masses ...";
1971 UnitTesting::Kernel::Launch::test_localTree(treeHandler->d_tree, particleHandler->d_particles,
1972 numParticles, actualTreeIndex); //numNodes);
1973 //}
1974#endif
1975
1976 time = 0;
1977
1978 //DomainListNS::Kernel::Launch::info(particleHandler->d_particles, domainListHandler->d_domainList);
1979 //TreeNS::Kernel::Launch::testTree(treeHandler->d_tree, particleHandler->d_particles, numParticlesLocal, numParticles);
1980
1981 // SELECT compute forces version:
1982 // 0: similiar to burtscher with presorting according to the space-filling curves
1983 // 1: similiar to burtscher
1984 // 2: miluphcuda version with additional presorting according to the space-filling curves
1985 // 3: miluphcuda version
1986 // TODO: need to be verified (possible to use this additional shared memory?)
1987 // 4: miluphcuda version with additional presorting according to the space-filling curves and shared memory
1988 int computeForcesVersion = simulationParameters.gravityForceVersion;
1989
1990 Logger(DEBUG) << "computeForcesVersion: " << computeForcesVersion;
1991
1992 // preparations for computing forces
1993 treeHandler->copy(To::host);
1994 real x_radius = 0.5 * (*treeHandler->h_maxX - (*treeHandler->h_minX));
1995#if DIM > 1
1996 real y_radius = 0.5 * (*treeHandler->h_maxY - (*treeHandler->h_minY));
1997#if DIM == 3
1998 real z_radius = 0.5 * (*treeHandler->h_maxZ - (*treeHandler->h_minZ));
1999#endif
2000#endif
2001
2002#if DIM == 1
2003 real radius = x_radius;
2004#elif DIM == 2
2005 real radius = std::max(x_radius, y_radius);
2006#else
2007 real radius_max = std::max(x_radius, y_radius);
2008 real radius = std::max(radius_max, z_radius);
2009#endif
2010 //radius *= 0.5; //0.5; // TODO: was 1.0
2011 Logger(INFO) << "radius: " << radius;
2012
2013 // end: preparations for computing forces
2014
2015 // needed for version 0 and 1
2016 int warp = 32;
2017 int stackSize = 128; //128; //64;
2018 int blockSize = 256;
2019 // end: needed for version 0 and 1
2020 if (computeForcesVersion == 0) {
2021
2022 //int sortVersion = 0;
2023 //if (sortVersion == 0) {
2024 //int treeIndexBeforeSorting;
2025 //cuda::copy(&treeIndexBeforeSorting, treeHandler->d_index, 1, To::host);
2026 //Logger(INFO) << "treeIndexBeforeSorting: " << treeIndexBeforeSorting;
2027 //time = TreeNS::Kernel::Launch::sort(treeHandler->d_tree, numParticlesLocal, numParticles, true);
2028 //Logger(TIME) << "sorting: " << time << " ms";
2029 //}
2030 //else if (sortVersion == 1) {
2031
2032 // THUS, instead:
2033 // presorting using keys...
2034 real timeSorting = 0.;
2035 timeSorting += TreeNS::Kernel::Launch::prepareSorting(treeHandler->d_tree, particleHandler->d_particles,
2036 numParticlesLocal, numParticles);
2037
2038 keyType *d_keys;
2039 //cuda::malloc(d_keys, numParticlesLocal);
2040 d_keys = buffer->d_keyTypeBuffer;
2041 timeSorting += SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(
2042 subDomainKeyTreeHandler->d_subDomainKeyTree,
2043 treeHandler->d_tree, particleHandler->d_particles,
2044 d_keys, 21, numParticlesLocal, curveType);
2045
2046 timeSorting += HelperNS::sortArray(treeHandler->d_start, treeHandler->d_sorted, d_keys,
2047 buffer->d_keyTypeBuffer1, //helperHandler->d_keyTypeBuffer,
2048 numParticlesLocal);
2049 //cuda::free(d_keys);
2050
2051 Logger(TIME) << "gravity: presorting: " << timeSorting << " ms";
2052 //end: presorting using keys...
2053
2054 //}
2055
2056 //actual (local) force
2057 // -------------------------------------------------------------------------------------------------------------
2058 time = Gravity::Kernel::Launch::computeForces_v2(treeHandler->d_tree, particleHandler->d_particles, radius,
2059 numParticlesLocal, numParticles, blockSize, warp,
2060 stackSize, subDomainKeyTreeHandler->d_subDomainKeyTree,
2061 simulationParameters.theta, simulationParameters.smoothing,
2062 simulationParameters.calculateEnergy);
2063 // -------------------------------------------------------------------------------------------------------------
2064
2065 }
2066 else if (computeForcesVersion == 1) {
2067 // -------------------------------------------------------------------------------------------------------------
2068 time = Gravity::Kernel::Launch::computeForces_v2_1(treeHandler->d_tree, particleHandler->d_particles,
2069 numParticlesLocal, numParticles, blockSize, warp, stackSize,
2070 subDomainKeyTreeHandler->d_subDomainKeyTree,
2071 simulationParameters.theta, simulationParameters.smoothing,
2072 simulationParameters.calculateEnergy);
2073 // -------------------------------------------------------------------------------------------------------------
2074 }
2075 else if (computeForcesVersion == 2) {
2076
2077 //int treeIndexBeforeSorting;
2078 //cuda::copy(&treeIndexBeforeSorting, treeHandler->d_index, 1, To::host);
2079 //Logger(INFO) << "treeIndexBeforeSorting: " << treeIndexBeforeSorting;
2080 //time = TreeNS::Kernel::Launch::sort(treeHandler->d_tree, numParticlesLocal, numParticles, true);
2081 //Logger(TIME) << "sorting: " << time << " ms";
2082
2083 // presorting using keys...
2084 real timeSorting = 0.;
2085 timeSorting += TreeNS::Kernel::Launch::prepareSorting(treeHandler->d_tree, particleHandler->d_particles,
2086 numParticlesLocal, numParticles);
2087
2088 keyType *d_keys;
2089 //cuda::malloc(d_keys, numParticlesLocal + particleTotalReceiveLength);
2090 d_keys = buffer->d_keyTypeBuffer;
2091 timeSorting += SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
2092 treeHandler->d_tree,
2093 particleHandler->d_particles,
2094 d_keys, 21,numParticlesLocal /*+
2095 particleTotalReceiveLength*/, curveType);
2096
2097 timeSorting += HelperNS::sortArray(treeHandler->d_start, treeHandler->d_sorted, d_keys,
2098 buffer->d_keyTypeBuffer1, //helperHandler->d_keyTypeBuffer,
2099 numParticlesLocal); // + particleTotalReceiveLength);
2100 //cuda::free(d_keys);
2101
2102 Logger(TIME) << "gravity: presorting: " << timeSorting << " ms";
2103 //end: presorting using keys...
2104
2105 // -------------------------------------------------------------------------------------------------------------
2106 time = Gravity::Kernel::Launch::computeForces_v1(treeHandler->d_tree, particleHandler->d_particles,
2107 radius, numParticles, numParticles,
2108 subDomainKeyTreeHandler->d_subDomainKeyTree,
2109 simulationParameters.theta, simulationParameters.smoothing,
2110 simulationParameters.calculateEnergy);
2111 // -------------------------------------------------------------------------------------------------------------
2112 }
2113 else if (computeForcesVersion == 3) {
2114 // -------------------------------------------------------------------------------------------------------------
2115 time = Gravity::Kernel::Launch::computeForces_v1_1(treeHandler->d_tree, particleHandler->d_particles,
2116 radius, numParticles, numParticles,
2117 subDomainKeyTreeHandler->d_subDomainKeyTree,
2118 simulationParameters.theta, simulationParameters.smoothing,
2119 simulationParameters.calculateEnergy);
2120 // -------------------------------------------------------------------------------------------------------------
2121 }
2122 else if (computeForcesVersion == 4) {
2123 // presorting using keys...
2124 real timeSorting = 0.;
2125 timeSorting += TreeNS::Kernel::Launch::prepareSorting(treeHandler->d_tree, particleHandler->d_particles,
2126 numParticlesLocal, numParticles);
2127
2128 keyType *d_keys;
2129 //cuda::malloc(d_keys, numParticlesLocal + particleTotalReceiveLength);
2130 d_keys = buffer->d_keyTypeBuffer;
2131 timeSorting += SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
2132 treeHandler->d_tree, particleHandler->d_particles,
2133 d_keys, 21, numParticlesLocal /*+
2134 particleTotalReceiveLength*/, curveType);
2135
2136 timeSorting += HelperNS::sortArray(treeHandler->d_start, treeHandler->d_sorted, d_keys,
2137 buffer->d_keyTypeBuffer1, //helperHandler->d_keyTypeBuffer,
2138 numParticlesLocal); // + particleTotalReceiveLength);
2139 //cuda::free(d_keys);
2140
2141 Logger(TIME) << "gravity: presorting: " << timeSorting << " ms";
2142 //end: presorting using keys...
2143
2144 // -------------------------------------------------------------------------------------------------------------
2145 time = Gravity::Kernel::Launch::computeForces_v1_2(treeHandler->d_tree, particleHandler->d_particles,
2146 radius, numParticles, numParticles,
2147 subDomainKeyTreeHandler->d_subDomainKeyTree,
2148 simulationParameters.theta, simulationParameters.smoothing,
2149 simulationParameters.calculateEnergy);
2150 // -------------------------------------------------------------------------------------------------------------
2151 }
2152 else {
2153 MPI_Finalize();
2154 Logger(ERROR) << "select proper compute forces version!";
2155 exit(0);
2156 }
2157
2158 //NOTE: time(computeForces) < time(computeForceMiluphcuda) for kepler disk, but
2159 // time(computeForces) >> time(computeForceMiluphcuda) for plummer!!!
2160
2161 totalTime += time;
2162 Logger(TIME) << "computeForces: " << time << " ms";
2163 profiler.value2file(ProfilerIds::Time::Gravity::force, time);
2164
2165 // repairTree
2166 // necessary? Tree is build for every iteration: ONLY necessary if subsequent SPH?
2167 int debug_lowestDomainListIndex;
2168 cuda::copy(&debug_lowestDomainListIndex, lowestDomainListHandler->d_domainListIndex, 1, To::host);
2169 Logger(DEBUG) << "lowest Domain list index: " << debug_lowestDomainListIndex;
2170
2171 integer toDeleteLeaf0;
2172 integer toDeleteLeaf1;
2173 cuda::copy(&toDeleteLeaf0, &treeHandler->d_toDeleteLeaf[0], 1, To::host);
2174 cuda::copy(&toDeleteLeaf1, &treeHandler->d_toDeleteLeaf[1], 1, To::host);
2175 Logger(INFO) << "toDeleteLeaf: " << toDeleteLeaf0 << ", " << toDeleteLeaf1;
2176
2177 integer toDeleteNode0;
2178 integer toDeleteNode1;
2179 cuda::copy(&toDeleteNode0, &treeHandler->d_toDeleteNode[0], 1, To::host);
2180 cuda::copy(&toDeleteNode1, &treeHandler->d_toDeleteNode[1], 1, To::host);
2181 Logger(INFO) << "toDeleteNode: " << toDeleteNode0 << ", " << toDeleteNode1;
2182
2183 Logger(DEBUG) << "repairing tree...";
2184 // -----------------------------------------------------------------------------------------------------------------
2185 time = SubDomainKeyTreeNS::Kernel::Launch::repairTree(subDomainKeyTreeHandler->d_subDomainKeyTree,
2186 treeHandler->d_tree, particleHandler->d_particles,
2187 domainListHandler->d_domainList,
2188 lowestDomainListHandler->d_domainList,
2189 numParticles, numNodes, curveType);
2190
2191 // -----------------------------------------------------------------------------------------------------------------
2192
2193 profiler.value2file(ProfilerIds::Time::Gravity::repairTree, time);
2194
2195 //TreeNS::Kernel::Launch::info(treeHandler->d_tree, particleHandler->d_particles,
2196 // treeIndex, treeIndex + pseudoParticleTotalReceiveLength);
2197 //TreeNS::Kernel::Launch::info(treeHandler->d_tree, particleHandler->d_particles,
2198 // numParticlesLocal, numParticlesLocal + particleTotalReceiveLength);
2199
2200 delete [] h_relevantDomainListProcess;
2201 delete [] h_particles2SendCount;
2202 delete [] h_pseudoParticles2SendCount;
2203
2204 return totalTime;
2205}
2206
2207real Miluphpc::sph() {
2208
2209 real time = 0;
2210 time = parallel_sph();
2211
2212 return time;
2213}
2214
2215// IN PRINCIPLE it should be possible to reuse already sent particles from (parallel) gravity
2216real Miluphpc::parallel_sph() {
2217
2218 real time;
2219 real totalTime = 0;
2220
2221#if SPH_SIM
2222
2223 Timer timer;
2224 real timeElapsed;
2225
2226 integer *d_particles2SendIndices = buffer->d_integerBuffer1;
2227 cuda::set(d_particles2SendIndices, -1, numParticles);
2228 integer *d_particles2SendCount = buffer->d_sendCount;
2229 cuda::set(d_particles2SendCount, 0, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses);
2230
2231 cuda::set(lowestDomainListHandler->d_domainListCounter, 0, 1);
2232
2233 // -----------------------------------------------------------------------------------------------------------------
2234 time = SPH::Kernel::Launch::compTheta(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
2235 particleHandler->d_particles, lowestDomainListHandler->d_domainList,
2236 curveType);
2237 // -----------------------------------------------------------------------------------------------------------------
2238
2239 totalTime += time;
2240 Logger(TIME) << "sph: compTheta: " << time << " ms";
2241 profiler.value2file(ProfilerIds::Time::SPH::compTheta, time);
2242
2243 integer relevantIndicesCounter;
2244 cuda::copy(&relevantIndicesCounter, lowestDomainListHandler->d_domainListCounter, 1, To::host);
2245 //Logger(DEBUG) << "relevantIndicesCounter: " << relevantIndicesCounter;
2246
2247 integer particlesOffset = 0;
2248 integer particlesOffsetBuffer;
2249
2250 integer *h_relevantDomainListProcess;
2251 h_relevantDomainListProcess = new integer[relevantIndicesCounter];
2252 cuda::copy(h_relevantDomainListProcess, lowestDomainListHandler->d_relevantDomainListProcess,
2253 relevantIndicesCounter, To::host);
2254
2255 integer *d_markedSendIndices = buffer->d_integerBuffer;
2256 real *d_collectedEntries = buffer->d_realBuffer;
2257 integer *h_particles2SendCount = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
2258
2259 timer.reset();
2260 // determine search radius
2261 boost::mpi::communicator comm;
2262
2263 // DETERMINE search radius
2264 // [1] either: use max(sml):
2265 //const unsigned int blockSizeReduction = 256;
2266 //real *d_searchRadii;
2267 //cuda::malloc(d_searchRadii, blockSizeReduction);
2268 //cuda::set(d_searchRadii, (real)0., blockSizeReduction);
2269 //time += CudaUtils::Kernel::Launch::reduceBlockwise<real, blockSizeReduction>(particleHandler->d_sml,
2270 // d_searchRadii, numParticlesLocal);
2271
2272 //real *d_intermediateResult;
2273 //cuda::malloc(d_intermediateResult, 1);
2274 //cuda::set(d_intermediateResult, (real)0., 1);
2275 //time += CudaUtils::Kernel::Launch::blockReduction<real, blockSizeReduction>(d_searchRadii, d_intermediateResult);
2276 //cuda::copy(&h_searchRadius, d_intermediateResult, 1, To::host);
2277 //h_searchRadius /= subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses;
2278 //all_reduce(comm, boost::mpi::inplace_t<real*>(&h_searchRadius), 1, std::plus<real>());
2279 //cuda::free(d_searchRadii);
2280 //cuda::free(d_intermediateResult);
2281
2282 // [2] or: calculate search radii as sml - min(distance to other process) for all particles
2283 // real *d_intermediateResult;
2284 // //cuda::malloc(d_intermediateResult, 1);
2285 // d_intermediateResult = buffer->d_realVal1;
2286 // real *d_potentialSearchRadii;
2287 // //cuda::malloc(d_potentialSearchRadii, numParticlesLocal);
2288 // d_potentialSearchRadii = buffer->d_realBuffer1;
2289 // -----------------------------------------------------------------------------------------------------------------
2290 // time = SPH::Kernel::Launch::determineSearchRadii(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
2291 // particleHandler->d_particles, domainListHandler->d_domainList,
2292 // lowestDomainListHandler->d_domainList, d_potentialSearchRadii,
2293 // numParticlesLocal, 0, curveType);
2294 // -----------------------------------------------------------------------------------------------------------------
2295
2296 // Logger(TIME) << "sph: determineSearchRadii: " << time << " ms";
2297 // h_searchRadius = HelperNS::reduceAndGlobalize(d_potentialSearchRadii, d_intermediateResult,
2298 // numParticlesLocal, Reduction::max);
2299
2300 real *d_intermediateResult;
2301 d_intermediateResult = buffer->d_realVal1;
2302 h_searchRadius = HelperNS::reduceAndGlobalize(particleHandler->d_sml, d_intermediateResult,
2303 numParticlesLocal, Reduction::max);
2304
2305 h_searchRadius *= 2.; // *= 1.;
2306 //h_searchRadius = 2 * 0.080001;
2307
2308 //cuda::free(d_potentialSearchRadii);
2309
2310 //cuda::free(d_intermediateResult);
2311
2312 Logger(INFO) << "search radius: " << h_searchRadius;
2313 // end: determine search radius
2314 timeElapsed = timer.elapsed();
2315 profiler.value2file(ProfilerIds::Time::SPH::determineSearchRadii, timeElapsed);
2316
2317 time = 0;
2318
2319 // original master version
2320 /*
2321 Logger(DEBUG) << "relevant indices counter: " << relevantIndicesCounter;
2322 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
2323 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
2324 cuda::set(d_markedSendIndices, -1, numParticles); //numParticlesLocal should be sufficient
2325 for (int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
2326 if (h_relevantDomainListProcess[relevantIndex] == proc) {
2327 //Logger(TRACE) << "h_relevantDomainListProcess[" << relevantIndex << "] = "
2328 // << h_relevantDomainListProcess[relevantIndex];
2329 // ---------------------------------------------------------
2330 time += SPH::Kernel::Launch::symbolicForce(subDomainKeyTreeHandler->d_subDomainKeyTree,
2331 treeHandler->d_tree, particleHandler->d_particles,
2332 lowestDomainListHandler->d_domainList,
2333 d_markedSendIndices, h_searchRadius, numParticlesLocal,
2334 numParticles,
2335 relevantIndex, curveType);
2336 // ---------------------------------------------------------
2337 }
2338 }
2339 // ---------------------------------------------------------
2340 time += SPH::Kernel::Launch::collectSendIndices(treeHandler->d_tree, particleHandler->d_particles,
2341 d_markedSendIndices, &d_particles2SendIndices[particlesOffset],
2342 &d_particles2SendCount[proc], numParticles, numParticles, curveType);
2343 // ---------------------------------------------------------
2344 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
2345 Logger(INFO) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
2346 particlesOffset += particlesOffsetBuffer;
2347 }
2348 } */
2349 // end: original master version
2350
2351 Logger(INFO) << "relevantIndicesCounter: " << relevantIndicesCounter;
2352
2353 // test version
2354 /*
2355 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
2356 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
2357 cuda::set(d_markedSendIndices, -1, numParticles); //numParticlesLocal should be sufficient
2358 time += SPH::Kernel::Launch::symbolicForce_test(subDomainKeyTreeHandler->d_subDomainKeyTree,
2359 treeHandler->d_tree, particleHandler->d_particles,
2360 lowestDomainListHandler->d_domainList,
2361 d_markedSendIndices, h_searchRadius, numParticlesLocal,
2362 numParticles, proc, relevantIndicesCounter,
2363 0, curveType);
2364
2365 // ---------------------------------------------------------------------------------------------------------
2366 time += SPH::Kernel::Launch::collectSendIndices(treeHandler->d_tree, particleHandler->d_particles,
2367 d_markedSendIndices,
2368 &d_particles2SendIndices[particlesOffset],
2369 &d_particles2SendCount[proc], numParticles,
2370 numParticles, curveType); // numParticlesLocal should be sufficient
2371 // ---------------------------------------------------------------------------------------------------------
2372 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
2373
2374 Logger(INFO) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
2375
2376 particlesOffset += particlesOffsetBuffer;
2377 }
2378 }
2379 */
2380 // end: test version
2381
2382 // test version 2
2383
2384 cuda::set(d_markedSendIndices, 0, numParticles); //numParticlesLocal should be sufficient
2385 time += SPH::Kernel::Launch::symbolicForce_test2(subDomainKeyTreeHandler->d_subDomainKeyTree,
2386 treeHandler->d_tree, particleHandler->d_particles,
2387 lowestDomainListHandler->d_domainList,
2388 d_markedSendIndices, h_searchRadius, numParticlesLocal,
2389 numParticles, 0, relevantIndicesCounter,
2390 curveType);
2391
2392 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
2393
2394 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
2395
2396 // ---------------------------------------------------------------------------------------------------------
2397 time += SPH::Kernel::Launch::collectSendIndices_test2(treeHandler->d_tree, particleHandler->d_particles,
2398 d_markedSendIndices,
2399 &d_particles2SendIndices[particlesOffset],
2400 &d_particles2SendCount[proc], numParticlesLocal,
2401 numParticles, 0, proc, curveType); // numParticlesLocal should be sufficient
2402 // ---------------------------------------------------------------------------------------------------------
2403 cuda::copy(&particlesOffsetBuffer, &d_particles2SendCount[proc], 1, To::host);
2404
2405 Logger(INFO) << "particles2SendCount[" << proc << "] = " << particlesOffsetBuffer;
2406
2407 particlesOffset += particlesOffsetBuffer;
2408 }
2409 }
2410 // end: test version 2
2411
2412
2413 profiler.value2file(ProfilerIds::Time::SPH::symbolicForce, time);
2414 totalTime += time;
2415 Logger(TIME) << "sph: symbolicForce: " << time << " ms";
2416
2417 cuda::copy(h_particles2SendCount, d_particles2SendCount, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses,
2418 To::host);
2419
2420 timer.reset();
2421
2422 integer *particleSendLengths;
2423 particleSendLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
2424 particleSendLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
2425 integer *particleReceiveLengths;
2426 particleReceiveLengths = new integer[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
2427 particleReceiveLengths[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] = 0;
2428
2429
2430 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
2431 //Logger(INFO) << "sph: h_particles2SendCount[" << proc << "] = " << h_particles2SendCount[proc];
2432 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
2433 particleSendLengths[proc] = h_particles2SendCount[proc];
2434 }
2435 }
2436
2437 mpi::messageLengths(subDomainKeyTreeHandler->h_subDomainKeyTree, particleSendLengths, particleReceiveLengths);
2438
2439 profiler.vector2file(ProfilerIds::SendLengths::sph, particleSendLengths);
2440 profiler.vector2file(ProfilerIds::ReceiveLengths::sph, particleReceiveLengths);
2441
2442 integer particleTotalReceiveLength = 0;
2443 integer particleTotalSendLength = 0;
2444 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
2445 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
2446 particleTotalReceiveLength += particleReceiveLengths[proc];
2447 particleTotalSendLength += particleSendLengths[proc];
2448 }
2449 }
2450
2451 if (simulationParameters.particlesSent2H5) {
2452 // writing particles to send to h5 file
2453 int *h_particles2SendIndices = new int[particleTotalSendLength];
2454 cuda::copy(h_particles2SendIndices, d_particles2SendIndices, particleTotalSendLength, To::host);
2455 std::string filename = "SPH2Send";
2456 particles2file(filename, h_particles2SendIndices, particleTotalSendLength);
2457 delete[] h_particles2SendIndices;
2458 // end: writing particles to send to h5 file
2459 }
2460
2461 Logger(INFO) << "sph: particleTotalReceiveLength: " << particleTotalReceiveLength;
2462 Logger(INFO) << "sph: particleTotalSendLength: " << particleTotalSendLength;
2463
2464 delete [] h_relevantDomainListProcess;
2465 delete [] h_particles2SendCount;
2466 //delete [] h_pseudoParticles2SendCount;
2467
2468 treeHandler->h_toDeleteLeaf[0] = numParticlesLocal;
2469 treeHandler->h_toDeleteLeaf[1] = numParticlesLocal + particleTotalReceiveLength;
2470 cuda::copy(treeHandler->h_toDeleteLeaf, treeHandler->d_toDeleteLeaf, 2, To::device);
2471
2472 if (treeHandler->h_toDeleteLeaf[1] > numParticles) {
2473 MPI_Finalize();
2474 Logger(ERROR) << "numParticlesLocal + receiveLength = " << treeHandler->h_toDeleteLeaf[1] << " > "
2475 << " numParticles = " << numParticles;
2476 Logger(ERROR) << "Restart simulation with more memory! exiting ...";
2477 exit(1);
2478 }
2479
2480 // x-entry particle exchange
2481 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_x, d_collectedEntries,
2482 particleTotalSendLength);
2483 sendParticles(d_collectedEntries, &particleHandler->d_x[numParticlesLocal], particleSendLengths,
2484 particleReceiveLengths);
2485 // x-vel-entry particle exchange
2486 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_vx, d_collectedEntries,
2487 particleTotalSendLength);
2488 sendParticles(d_collectedEntries, &particleHandler->d_vx[numParticlesLocal], particleSendLengths,
2489 particleReceiveLengths);
2490 // x-acc-entry particle exchange
2491 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_ax, d_collectedEntries,
2492 particleTotalSendLength);
2493 sendParticles(d_collectedEntries, &particleHandler->d_ax[numParticlesLocal], particleSendLengths,
2494 particleReceiveLengths);
2495#if DIM > 1
2496 // y-entry particle exchange
2497 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_y, d_collectedEntries,
2498 particleTotalSendLength);
2499 sendParticles(d_collectedEntries, &particleHandler->d_y[numParticlesLocal], particleSendLengths,
2500 particleReceiveLengths);
2501 // y-vel-entry particle exchange
2502 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_vy, d_collectedEntries,
2503 particleTotalSendLength);
2504 sendParticles(d_collectedEntries, &particleHandler->d_vy[numParticlesLocal], particleSendLengths,
2505 particleReceiveLengths);
2506 // y-acc-entry particle exchange
2507 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_ay, d_collectedEntries,
2508 particleTotalSendLength);
2509 sendParticles(d_collectedEntries, &particleHandler->d_ay[numParticlesLocal], particleSendLengths,
2510 particleReceiveLengths);
2511#if DIM == 3
2512 // z-entry particle exchange
2513 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_z, d_collectedEntries,
2514 particleTotalSendLength);
2515 sendParticles(d_collectedEntries, &particleHandler->d_z[numParticlesLocal], particleSendLengths,
2516 particleReceiveLengths);
2517 // z-vel-entry particle exchange
2518 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_vz, d_collectedEntries,
2519 particleTotalSendLength);
2520 sendParticles(d_collectedEntries, &particleHandler->d_vz[numParticlesLocal], particleSendLengths,
2521 particleReceiveLengths);
2522 // z-acc-entry particle exchange
2523 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_az, d_collectedEntries,
2524 particleTotalSendLength);
2525 sendParticles(d_collectedEntries, &particleHandler->d_az[numParticlesLocal], particleSendLengths,
2526 particleReceiveLengths);
2527#endif
2528#endif
2529 // mass-entry particle exchange
2530 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_mass, d_collectedEntries,
2531 particleTotalSendLength);
2532 sendParticles(d_collectedEntries, &particleHandler->d_mass[numParticlesLocal], particleSendLengths,
2533 particleReceiveLengths);
2534
2535 // sml-entry particle exchange
2536 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_sml, d_collectedEntries,
2537 particleTotalSendLength);
2538 sendParticles(d_collectedEntries, &particleHandler->d_sml[numParticlesLocal], particleSendLengths,
2539 particleReceiveLengths);
2540
2541 // rho-entry particle exchange
2542 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_rho, d_collectedEntries,
2543 particleTotalSendLength);
2544 sendParticles(d_collectedEntries, &particleHandler->d_rho[numParticlesLocal], particleSendLengths,
2545 particleReceiveLengths);
2546
2547 // pressure-entry particle exchange
2548 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_p, d_collectedEntries,
2549 particleTotalSendLength);
2550 sendParticles(d_collectedEntries, &particleHandler->d_p[numParticlesLocal], particleSendLengths,
2551 particleReceiveLengths);
2552
2553 // cs-entry particle exchange
2554 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_cs, d_collectedEntries,
2555 particleTotalSendLength);
2556 sendParticles(d_collectedEntries, &particleHandler->d_cs[numParticlesLocal], particleSendLengths,
2557 particleReceiveLengths);
2558
2559 // cs-entry particle exchange
2560 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_e, d_collectedEntries,
2561 particleTotalSendLength);
2562 sendParticles(d_collectedEntries, &particleHandler->d_e[numParticlesLocal], particleSendLengths,
2563 particleReceiveLengths);
2564 // necessary for all entries... (material id, ...) ?
2565 // should not be necessary to send all entries (since they will be resent)
2566
2567 time = timer.elapsed();
2568 totalTime += time;
2569 Logger(TIME) << "sph: sending particles: " << time;
2570 profiler.value2file(ProfilerIds::Time::SPH::sending, time);
2571
2572 //Logger(INFO) << "checking for nans before assigning particles...";
2573 //ParticlesNS::Kernel::Launch::check4nans(particleHandler->d_particles, numParticlesLocal);
2574
2575 cuda::copy(&treeHandler->h_toDeleteNode[0], treeHandler->d_index, 1, To::host);
2576
2577#if DEBUGGING
2578 // debug
2579 gpuErrorcheck(cudaMemset(buffer->d_integerVal, 0, sizeof(integer)));
2580 CudaUtils::Kernel::Launch::findDuplicateEntries(&particleHandler->d_x[0],
2581 &particleHandler->d_y[0],
2582 &particleHandler->d_z[0],
2583 buffer->d_integerVal,
2584 numParticlesLocal + particleTotalReceiveLength);
2585 integer duplicates;
2586 gpuErrorcheck(cudaMemcpy(&duplicates, buffer->d_integerVal, sizeof(integer), cudaMemcpyDeviceToHost));
2587 Logger(DEBUG) << "duplicates: " << duplicates << " between: " << 0 << " and "
2588 << numParticlesLocal + particleTotalReceiveLength;
2589 if (duplicates > 0) {
2590 MPI_Finalize();
2591 exit(0);
2592 }
2593 //end: debug
2594#endif
2595
2596 //TreeNS::Kernel::Launch::info(treeHandler->d_tree, particleHandler->d_particles, numParticles, numNodes);
2597 //DomainListNS::Kernel::Launch::info(particleHandler->d_particles, domainListHandler->d_domainList);
2598
2599 //if (treeHandler->h_toDeleteLeaf[1] > treeHandler->h_toDeleteLeaf[0]) {
2600 // -----------------------------------------------------------------------------------------------------------------
2601 time = SPH::Kernel::Launch::insertReceivedParticles(subDomainKeyTreeHandler->d_subDomainKeyTree,
2602 treeHandler->d_tree,
2603 particleHandler->d_particles, domainListHandler->d_domainList,
2604 lowestDomainListHandler->d_domainList,
2605 numParticles, numParticles);
2606 // -----------------------------------------------------------------------------------------------------------------
2607 //}
2608 profiler.value2file(ProfilerIds::Time::SPH::insertReceivedParticles, time);
2609
2610 time = 0;
2611 //for (int level=MAX_LEVEL; level>0; --level) {
2612 // time += SPH::Kernel::Launch::calculateCentersOfMass(treeHandler->d_tree, particleHandler->d_particles, level);
2613 //}
2614 Logger(TIME) << "sph: calculate centers of mass: " << time << " ms";
2615
2616 totalTime += time;
2617 Logger(TIME) << "sph: inserting received particles: " << time << " ms";
2618
2619 cuda::copy(&treeHandler->h_toDeleteNode[1], treeHandler->d_index, 1, To::host);
2620 cuda::copy(treeHandler->h_toDeleteNode, treeHandler->d_toDeleteNode, 2, To::device);
2621
2622 Logger(DEBUG) << "treeHandler->h_toDeleteNode[0]: " << treeHandler->h_toDeleteNode[0];
2623 Logger(DEBUG) << "treeHandler->h_toDeleteNode[1]: " << treeHandler->h_toDeleteNode[1];
2624
2625 if (treeHandler->h_toDeleteNode[1] > numNodes) {
2626 MPI_Finalize();
2627 Logger(ERROR) << "needed numNodes = " << treeHandler->h_toDeleteNode[1] << " > " << " numNodes = " << numNodes;
2628 Logger(ERROR) << "Restart simulation with more memory! exiting ...";
2629 exit(1);
2630 }
2631
2632 treeHandler->copy(To::host);
2633
2634#if CUBIC_DOMAINS
2635 real diam = std::abs(*treeHandler->h_maxX) + std::abs(*treeHandler->h_minX);
2636 //Logger(INFO) << "diam: " << diam;
2637#else // !CUBIC DOMAINS
2638 real diam_x = std::abs(*treeHandler->h_maxX) + std::abs(*treeHandler->h_minX);
2639#if DIM > 1
2640 real diam_y = std::abs(*treeHandler->h_maxY) + std::abs(*treeHandler->h_minY);
2641#if DIM == 3
2642 real diam_z = std::abs(*treeHandler->h_maxZ) + std::abs(*treeHandler->h_minZ);
2643#endif
2644#endif
2645#if DIM == 1
2646 real diam = diam_x;
2647 //Logger(INFO) << "diam: " << diam << " (x = " << diam_x << ")";
2648#elif DIM == 2
2649 real diam = std::max({diam_x, diam_y});
2650 //Logger(INFO) << "diam: " << diam << " (x = " << diam_x << ", y = " << diam_y << ")";
2651#else
2652 real diam = std::max({diam_x, diam_y, diam_z});
2653 //Logger(INFO) << "diam: " << diam << " (x = " << diam_x << ", y = " << diam_y << ", z = " << diam_z << ")";
2654#endif
2655#endif
2656
2657 time = 0.;
2658#if VARIABLE_SML
2659 // -----------------------------------------------------------------------------------------------------------------
2660 time += SPH::Kernel::Launch::fixedRadiusNN_variableSML(materialHandler->d_materials, treeHandler->d_tree,
2661 particleHandler->d_particles, particleHandler->d_nnl,
2662 numParticlesLocal, numParticles, numNodes);
2663 // -----------------------------------------------------------------------------------------------------------------
2664 // TODO: for variable SML
2665 // überprüfen inwiefern sich die sml geändert hat, sml_new <= sml_global_search // if sml_new > sml_global_search
2666#endif
2667
2668 //Logger(TRACE) << "diam: " << diam; // << " (x = " << diam_x << ", y = " << diam_y << ", z = " << diam_z << ")";
2669
2670 //real timeSorting = 0.;
2671 //timeSorting += TreeNS::Kernel::Launch::prepareSorting(treeHandler->d_tree, particleHandler->d_particles, numParticlesLocal, numParticles);
2672 //keyType *d_keys;
2673 //cuda::malloc(d_keys, numParticlesLocal);
2674 //timeSorting += SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
2675 // treeHandler->d_tree, particleHandler->d_particles,
2676 // d_keys, 21, numParticlesLocal, curveType);
2677 //timeSorting += HelperNS::sortArray(treeHandler->d_start, treeHandler->d_sorted, d_keys,
2678 // helperHandler->d_keyTypeBuffer, numParticlesLocal);
2679
2680 //int *h_sorted = new int[numParticlesLocal];
2681 //cuda::copy(h_sorted, treeHandler->d_sorted, numParticlesLocal, To::host);
2682 //for (int i=0; i<100; i++) {
2683 // Logger(INFO) << "i: " << i << " sorted: " << h_sorted[i];
2684 //}
2685 //delete [] h_sorted;
2686 //cuda::free(d_keys);
2687 //Logger(INFO) << "sph: presorting: " << timeSorting << " ms";
2688
2689 /*real timeSorting = 0.;
2690 timeSorting += TreeNS::Kernel::Launch::prepareSorting(treeHandler->d_tree, particleHandler->d_particles,
2691 numParticlesLocal, numParticles);
2692
2693 keyType *d_keys;
2694 cuda::malloc(d_keys, numParticlesLocal);
2695 timeSorting += SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(
2696 subDomainKeyTreeHandler->d_subDomainKeyTree,
2697 treeHandler->d_tree, particleHandler->d_particles,
2698 d_keys, 21, numParticlesLocal, curveType);
2699
2700 timeSorting += HelperNS::sortArray(treeHandler->d_start, treeHandler->d_sorted, d_keys,
2701 helperHandler->d_keyTypeBuffer,
2702 numParticlesLocal);
2703 cuda::free(d_keys);
2704
2705 Logger(TIME) << "sph: presorting: " << timeSorting << " ms";*/
2706
2707 int fixedRadiusNN_version = simulationParameters.sphFixedRadiusNNVersion;
2708 if (fixedRadiusNN_version == 0) {
2709 // -------------------------------------------------------------------------------------------------------------
2710 time += SPH::Kernel::Launch::fixedRadiusNN(treeHandler->d_tree, particleHandler->d_particles,
2711 particleHandler->d_nnl,
2712 diam, numParticlesLocal, numParticles, numNodes);
2713 // -------------------------------------------------------------------------------------------------------------
2714 }
2715 else if (fixedRadiusNN_version == 1) {
2716 // ATTENTION: brute-force method
2717 time += SPH::Kernel::Launch::fixedRadiusNN_bruteForce(treeHandler->d_tree, particleHandler->d_particles,
2718 particleHandler->d_nnl, numParticlesLocal, numParticles,
2719 numNodes);
2720 }
2721 else if (fixedRadiusNN_version == 2) {
2722 // using shared memory (not beneficial)
2723 time += SPH::Kernel::Launch::fixedRadiusNN_sharedMemory(treeHandler->d_tree, particleHandler->d_particles,
2724 particleHandler->d_nnl, numParticlesLocal, numParticles,
2725 numNodes);
2726 }
2727 else if (fixedRadiusNN_version == 3) {
2728 // test version
2729 time = SPH::Kernel::Launch::fixedRadiusNN_withinBox(treeHandler->d_tree, particleHandler->d_particles,
2730 particleHandler->d_nnl, numParticlesLocal, numParticles,
2731 numNodes);
2732 }
2733 else {
2734 Logger(ERROR) << "fixedRadiusNN version not available! Exiting ...";
2735 MPI_Finalize();
2736 exit(1);
2737 }
2738
2739 profiler.value2file(ProfilerIds::Time::SPH::fixedRadiusNN, time);
2740 totalTime += time;
2741 Logger(TIME) << "sph: fixedRadiusNN: " << time << " ms";
2742
2743 // TODO: investigate presorting/cache efficiency for:
2744 // * calculateDensity
2745 // * calculateSoundSpeed
2746 // * calculatePressure
2747 // * internalForces
2748
2749 Logger(DEBUG) << "calculate density";
2750 // -----------------------------------------------------------------------------------------------------------------
2751 time = SPH::Kernel::Launch::calculateDensity(kernelHandler.kernel, treeHandler->d_tree,
2752 particleHandler->d_particles, particleHandler->d_nnl,
2753 numParticlesLocal); //treeHandler->h_toDeleteLeaf[1]);
2754 // -----------------------------------------------------------------------------------------------------------------
2755 Logger(TIME) << "sph: calculateDensity: " << time << " ms";
2756 profiler.value2file(ProfilerIds::Time::SPH::density, time);
2757
2758 Logger(DEBUG) << "calculate sound speed";
2759 // -----------------------------------------------------------------------------------------------------------------
2760 time = SPH::Kernel::Launch::calculateSoundSpeed(particleHandler->d_particles, materialHandler->d_materials,
2761 numParticlesLocal); // treeHandler->h_toDeleteLeaf[1]);
2762 // -----------------------------------------------------------------------------------------------------------------
2763 Logger(TIME) << "sph: calculateSoundSpeed: " << time << " ms";
2764 profiler.value2file(ProfilerIds::Time::SPH::soundSpeed, time);
2765
2766#if BALSARA_SWITCH
2767 time = SPH::Kernel::Launch::CalcDivvandCurlv(kernelHandler.kernel, particleHandler->d_particles, particleHandler->d_nnl,
2768 numParticlesLocal);
2769#endif
2770
2771 Logger(DEBUG) << "calculate pressure";
2772 // -----------------------------------------------------------------------------------------------------------------
2773 time = SPH::Kernel::Launch::calculatePressure(materialHandler->d_materials, particleHandler->d_particles,
2774 numParticlesLocal); // treeHandler->h_toDeleteLeaf[1]);
2775 // ---------------------------------------------------------
2776 Logger(TIME) << "sph: calculatePressure: " << time << " ms";
2777 profiler.value2file(ProfilerIds::Time::SPH::pressure, time);
2778
2779 timer.reset();
2780 //Timer timerTemp;
2781 // updating necessary particle entries
2782
2783 // not really necessary but beneficial for timing
2784 comm.barrier();
2785
2786 // sml-entry particle exchange
2787 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_sml, d_collectedEntries,
2788 particleTotalSendLength);
2789 sendParticles(d_collectedEntries, &particleHandler->d_sml[numParticlesLocal], particleSendLengths,
2790 particleReceiveLengths);
2791
2792 // pressure-entry particle exchange
2793 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_p, d_collectedEntries,
2794 particleTotalSendLength);
2795 sendParticles(d_collectedEntries, &particleHandler->d_p[numParticlesLocal], particleSendLengths,
2796 particleReceiveLengths);
2797
2798 // rho-entry particle exchange
2799 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_rho, d_collectedEntries,
2800 particleTotalSendLength);
2801 sendParticles(d_collectedEntries, &particleHandler->d_rho[numParticlesLocal], particleSendLengths,
2802 particleReceiveLengths);
2803
2804 // cs-entry particle exchange
2805 CudaUtils::Kernel::Launch::collectValues(d_particles2SendIndices, particleHandler->d_cs, d_collectedEntries,
2806 particleTotalSendLength);
2807 sendParticles(d_collectedEntries, &particleHandler->d_cs[numParticlesLocal], particleSendLengths,
2808 particleReceiveLengths);
2809 // end: updating necessary particle entries
2810
2811
2812 time = timer.elapsed();
2813 Logger(TIME) << "sph: sending particles (again): " << time;
2814 //Logger(TIME) << "sph: sending particles (again) collectingValues: " << sendingParticlesAgain;
2815 profiler.value2file(ProfilerIds::Time::SPH::resend, time);
2816
2817 totalTime += time;
2818 Logger(DEBUG) << "internal forces";
2819
2820 time = SPH::Kernel::Launch::internalForces(kernelHandler.kernel, materialHandler->d_materials, treeHandler->d_tree,
2821 particleHandler->d_particles, particleHandler->d_nnl, numParticlesLocal);
2822 Logger(TIME) << "sph: internalForces: " << time << " ms";
2823 profiler.value2file(ProfilerIds::Time::SPH::internalForces, time);
2824
2825 time = SubDomainKeyTreeNS::Kernel::Launch::repairTree(subDomainKeyTreeHandler->d_subDomainKeyTree, treeHandler->d_tree,
2826 particleHandler->d_particles, domainListHandler->d_domainList,
2827 lowestDomainListHandler->d_domainList,
2828 numParticles, numNodes, curveType);
2829
2830 Logger(TIME) << "sph: totalTime: " << totalTime << " ms";
2831 profiler.value2file(ProfilerIds::Time::SPH::repairTree, time);
2832
2833#endif // SPH_SIM
2834
2835 return totalTime;
2836
2837}
2838
2839void Miluphpc::fixedLoadBalancing() {
2840
2841 Logger(INFO) << "fixedLoadBalancing()";
2842
2843 keyType rangePerProc = (1UL << (21 * DIM))/(subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses);
2844 Logger(INFO) << "rangePerProc: " << rangePerProc;
2845 for (int i=0; i<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; i++) {
2846 subDomainKeyTreeHandler->h_subDomainKeyTree->range[i] = (keyType)i * rangePerProc;
2847 }
2848 subDomainKeyTreeHandler->h_subDomainKeyTree->range[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses] = KEY_MAX;
2849
2850 Logger(INFO) << "keyMax: " << (keyType)KEY_MAX;
2851
2852 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[0] = 0UL;
2853 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[1] = 0UL;
2854 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[1] = (2UL << 60) + (4UL << 57); // + (4UL << 54) + (2UL << 51) + (1UL << 39);
2855 // 3|4|5|0|6|4|5|1|2|0|4|7|5|6|2|3|0|2|2|4|6|
2856 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[1] = (4UL << 60) + (1UL << 57);
2857 // + (4UL << 57); // + (5UL << 54) + (0UL << 51) + (6UL << 48)
2858 //+ (4UL << 45) + (5UL << 42) + (1UL << 39) + (2UL << 36)
2859 //+ (0UL << 33) + (4UL << 30) + (7UL << 27) + (5UL << 24)
2860 //+ (6UL << 21) + (2UL << 18) + (3UL << 15) + (0UL << 12)
2861 //+ (2UL << 9) + (2UL << 6) + (4UL << 3) + (6UL << 0);
2862 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[1] = (4UL << 60) + (4UL << 57) + (3UL << 54) + (2UL << 51) + (3UL << 30);
2863 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[2] = (4UL << 60) + (2UL << 57);
2864 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[3] = (6UL << 60) + (1UL << 57);
2865 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[4] = KEY_MAX;
2866 // FOR TESTING PURPOSES:
2867 // 1, 0, 0, 0, ...: 1152921504606846976
2868 // 2, 0, 0, 0, ...: 2305843009213693952;
2869 // 3, 0, 0, 0, ...: 3458764513820540928;
2870 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[0] = 0UL;
2871 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[1] = 1048576; //2199023255552;//4194304; // + (4UL << 57); // + (3UL << 54) + (1UL << 42) + (2UL << 18) + (1UL << 3) + (4);
2872 // // |4 (60)|0 (57)|0 (54)|4 (51)|0 (48)|6 (45)|1 (42)|1 (39)|1 (36)|5 (33)|6 (30)|4 (27)|5 (24)|7 (21)|0 (18)|6 (15)|5 (12)|1 (9)|1 (6)|4 (3)|3 (0)|
2873 // //subDomainKeyTreeHandler->h_subDomainKeyTree->range[1] = (4UL << 60) + (4UL << 51) + (6UL << 45) + (1UL << 42) + (1UL << 39) + (1UL << 36) + (5UL << 33)
2874 // // + (6UL << 30) + (4UL << 27) + (5UL << 24) + (7UL << 21) + (6UL << 15) + (5UL << 12) + (1UL << 9) + (1UL << 6) + (4UL << 3);
2875 //subDomainKeyTreeHandler->h_subDomainKeyTree->range[2] = KEY_MAX; //9223372036854775808;
2876
2877 for (int i=0; i<=subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; i++) {
2878 //printf("range[%i] = %lu\n", i, subDomainKeyTreeHandler->h_subDomainKeyTree->range[i]);
2879 Logger(DEBUG) << "range[" << i << "] = " << subDomainKeyTreeHandler->h_subDomainKeyTree->range[i];
2880 }
2881
2882 subDomainKeyTreeHandler->copy(To::device, true, true);
2883}
2884
2885void Miluphpc::dynamicLoadBalancing(int bins) {
2886
2887 boost::mpi::communicator comm;
2888
2889 Logger(INFO) << "dynamicLoadBalancing()";
2890
2891 int *processParticleCounts = new int[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
2892
2893 all_gather(comm, &numParticlesLocal, 1, processParticleCounts);
2894
2895 int totalAmountOfParticles = 0;
2896 for (int i=0; i<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; i++) {
2897 //Logger(INFO) << "numParticles on process: " << i << " = " << processParticleCounts[i];
2898 totalAmountOfParticles += processParticleCounts[i];
2899 }
2900
2901 int aimedParticlesPerProcess = totalAmountOfParticles/subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses;
2902//#if DEBUGGING
2903 Logger(INFO) << "aimedParticlesPerProcess = " << aimedParticlesPerProcess;
2904//#endif
2905
2906 //updateRangeApproximately(aimedParticlesPerProcess, bins);
2907 updateRange(aimedParticlesPerProcess);
2908
2909 delete [] processParticleCounts;
2910}
2911
2912void Miluphpc::updateRange(int aimedParticlesPerProcess) {
2913
2914 // update bounding boxes here! // TODO: if bounding boxes are calculated here, they shouldn't be calculated again !?
2915 boundingBox();
2916
2917 boost::mpi::communicator comm;
2918
2919 sumParticles = numParticlesLocal;
2920 all_reduce(comm, boost::mpi::inplace_t<integer*>(&sumParticles), 1, std::plus<integer>());
2921
2922 keyType *d_localKeys;
2923 //cuda::malloc(d_localKeys, numParticlesLocal);
2924 d_localKeys = buffer->d_keyTypeBuffer;
2925 //keyType *h_localKeys = new keyType[sumParticles];
2926
2927 SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
2928 treeHandler->d_tree, particleHandler->d_particles,
2929 d_localKeys, 21, numParticlesLocal, curveType);
2930
2931 int *numParticlesPerProcess = new int[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
2932 all_gather(comm, numParticlesLocal, numParticlesPerProcess);
2933 std::vector<int> sizes;
2934 for (int i = 0; i < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; i++) {
2935 sizes.push_back(numParticlesPerProcess[i]);
2936 }
2937
2938 keyType *d_keys;
2939 //cuda::malloc(d_keys, sumParticles);
2940 d_keys = buffer->d_keyTypeBuffer1;
2941
2942 //for (int i = 0; i < subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; i++) {
2943 // Logger(INFO) << "numParticlesPerProcess[" << i << "] = " << numParticlesPerProcess[i]
2944 // << " (numParticlesLocal: " << sumParticles << ")";
2945 //}
2946
2947 all_gatherv(comm, d_localKeys, d_keys, sizes);
2948
2949 keyType *d_sortedKeys;
2950 //cuda::malloc(d_sortedKeys, sumParticles);
2951 d_sortedKeys = buffer->d_keyTypeBuffer2;
2952
2953 keyType *h_sortedKeys = new keyType[sumParticles];
2954 cuda::copy(h_sortedKeys, d_keys, sumParticles, To::host);
2955
2956 //for (int i=aimedParticlesPerProcess - 10; i<aimedParticlesPerProcess + 10; i++) {
2957 // Logger(INFO) << "sortedKeys[" << i << "] = " << h_sortedKeys[i] << "(" << h_sortedKeys[0]
2958 // << ", " << h_sortedKeys[sumParticles - 1] << ")";
2959 //}
2960
2961 HelperNS::sortKeys(d_keys, d_sortedKeys, sumParticles);
2962
2963 keyType newRange;
2964 for (int proc=1; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
2965 cuda::copy(&subDomainKeyTreeHandler->h_subDomainKeyTree->range[proc],
2966 &d_sortedKeys[proc * aimedParticlesPerProcess], 1, To::host);
2967 Logger(INFO) << "new range: " << newRange;
2968 }
2969
2970 for (int proc=1; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
2971 subDomainKeyTreeHandler->h_subDomainKeyTree->range[proc] = (subDomainKeyTreeHandler->h_subDomainKeyTree->range[proc] >> 30) << 30;
2972 }
2973
2974 subDomainKeyTreeHandler->copy(To::device);
2975
2976 getMemoryInfo();
2977
2978 //cuda::free(d_keys);
2979 //cuda::free(d_sortedKeys);
2980 //cuda::free(d_localKeys);
2981
2982 delete [] numParticlesPerProcess;
2983 delete [] h_sortedKeys;
2984
2985}
2986
2987// deprecated
2988void Miluphpc::updateRangeApproximately(int aimedParticlesPerProcess, int bins) {
2989
2990 // introduce "bin size" regarding keys
2991 // keyHistRanges = [0, 1 * binSize, 2 * binSize, ... ]
2992 // calculate key of particles on the fly and assign to keyHistRanges
2993 // keyHistNumbers = [1023, 50032, ...]
2994 // take corresponding keyHistRange as new range if (sum(keyHistRange[i]) > aimNumberOfParticles ...
2995 // communicate new ranges among processes
2996
2997 boost::mpi::communicator comm;
2998
2999 buffer->reset();
3000
3001 SubDomainKeyTreeNS::Kernel::Launch::createKeyHistRanges(buffer->d_helper, bins);
3002
3003 SubDomainKeyTreeNS::Kernel::Launch::keyHistCounter(treeHandler->d_tree, particleHandler->d_particles,
3004 subDomainKeyTreeHandler->d_subDomainKeyTree, buffer->d_helper,
3005 bins, numParticlesLocal, curveType);
3006
3007 all_reduce(comm, boost::mpi::inplace_t<integer*>(buffer->d_integerBuffer), bins - 1, std::plus<integer>());
3008
3009 // ------------------------------------------------------------------------------
3010 // //if (subDomainKeyTreeHandler->h_subDomainKeyTree->rank == 0) {
3011 // int *histogram = new int[bins];
3012 // cuda::copy(histogram, helperHandler->d_integerBuffer, bins, To::host);
3013 //
3014 // HighFive::File h5file(simulationParameters.logDirectory + "histogram.h5",
3015 // HighFive::File::ReadWrite | HighFive::File::Create | HighFive::File::Truncate,
3016 // HighFive::MPIOFileDriver(MPI_COMM_WORLD, MPI_INFO_NULL));
3017 // Logger(INFO) << "histogram bins: " << bins;
3018 // for (int i = 0; i < bins; i++) {
3019 // Logger(INFO) << "histogram[" << i << "]: " << histogram[i];
3020 // }
3021 // HighFive::DataSet h5_ranges = h5file.createDataSet<keyType>("/bins", HighFive::DataSpace(bins));
3022 // h5_ranges.write(histogram);
3023 // // h5file.close();
3024 // delete[] histogram;
3025 // //}
3026 // ------------------------------------------------------------------------------
3027
3028 SubDomainKeyTreeNS::Kernel::Launch::calculateNewRange(subDomainKeyTreeHandler->d_subDomainKeyTree, buffer->d_helper,
3029 bins, aimedParticlesPerProcess, curveType);
3030 keyType keyMax = (keyType)KEY_MAX;
3031 cuda::set(&subDomainKeyTreeHandler->d_range[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses], keyMax, 1);
3032 subDomainKeyTreeHandler->copy(To::host, true, true);
3033
3034 //Logger(INFO) << "numProcesses: " << subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses;
3035 for (int i=0; i<=subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; i++) {
3036 Logger(INFO) << "range[" << i << "] = " << subDomainKeyTreeHandler->h_subDomainKeyTree->range[i];
3037 }
3038}
3039
3040real Miluphpc::removeParticles() {
3041
3042 int *d_particles2remove = buffer->d_integerBuffer; //d_particles2removeBuffer; //&buffer->d_integerBuffer[0];
3043 int *d_particles2remove_counter = buffer->d_integerVal; //d_particles2removeVal; //buffer->d_integerVal;
3044
3045 real *d_temp = buffer->d_realBuffer;
3046 integer *d_tempInt = buffer->d_integerBuffer1;
3047 //cuda::malloc(d_tempInt, numParticles);
3048
3049 cuda::set(d_particles2remove_counter, 0, 1);
3050
3051 auto time = ParticlesNS::Kernel::Launch::mark2remove(subDomainKeyTreeHandler->d_subDomainKeyTree,
3052 treeHandler->d_tree, particleHandler->d_particles,
3053 d_particles2remove, d_particles2remove_counter,
3054 simulationParameters.removeParticlesCriterion,
3055 simulationParameters.removeParticlesDimension,
3056 numParticlesLocal);
3057
3058 int h_particles2remove_counter;
3059 cuda::copy(&h_particles2remove_counter, d_particles2remove_counter, 1, To::host);
3060 Logger(INFO) << "#particles to be removed: " << h_particles2remove_counter;
3061
3062 time += HelperNS::sortArray(particleHandler->d_x, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3063 numParticlesLocal);
3064 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_x, d_temp, numParticlesLocal);
3065 time += HelperNS::sortArray(particleHandler->d_vx, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3066 numParticlesLocal);
3067 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_vx, d_temp, numParticlesLocal);
3068 time += HelperNS::sortArray(particleHandler->d_ax, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3069 numParticlesLocal);
3070 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_ax, d_temp, numParticlesLocal);
3071#if DIM > 1
3072 time += HelperNS::sortArray(particleHandler->d_y, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3073 numParticlesLocal);
3074 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_y, d_temp, numParticlesLocal);
3075 time += HelperNS::sortArray(particleHandler->d_vy, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3076 numParticlesLocal);
3077 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_vy, d_temp, numParticlesLocal);
3078 time += HelperNS::sortArray(particleHandler->d_ay, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3079 numParticlesLocal);
3080 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_ay, d_temp, numParticlesLocal);
3081#if DIM == 3
3082 time += HelperNS::sortArray(particleHandler->d_z, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3083 numParticlesLocal);
3084 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_z, d_temp, numParticlesLocal);
3085 time += HelperNS::sortArray(particleHandler->d_vz, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3086 numParticlesLocal);
3087 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_vz, d_temp, numParticlesLocal);
3088 time += HelperNS::sortArray(particleHandler->d_az, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3089 numParticlesLocal);
3090 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_az, d_temp, numParticlesLocal);
3091#endif
3092#endif
3093 time += HelperNS::sortArray(particleHandler->d_mass, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3094 numParticlesLocal);
3095 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_mass, d_temp, numParticlesLocal);
3096 time += HelperNS::sortArray(particleHandler->d_uid, d_tempInt, d_particles2remove, buffer->d_integerBuffer2,
3097 numParticlesLocal);
3098 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_uid, d_tempInt, numParticlesLocal);
3099#if SPH_SIM
3100 time += HelperNS::sortArray(particleHandler->d_materialId, d_tempInt, d_particles2remove, buffer->d_integerBuffer2,
3101 numParticlesLocal);
3102 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_materialId, d_tempInt, numParticlesLocal);
3103 time += HelperNS::sortArray(particleHandler->d_sml, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3104 numParticlesLocal);
3105 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_sml, d_temp, numParticlesLocal);
3106 time += HelperNS::sortArray(particleHandler->d_rho, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3107 numParticlesLocal);
3108 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_rho, d_temp, numParticlesLocal);
3109 time += HelperNS::sortArray(particleHandler->d_p, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3110 numParticlesLocal);
3111 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_p, d_temp, numParticlesLocal);
3112 time += HelperNS::sortArray(particleHandler->d_e, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3113 numParticlesLocal);
3114 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_e, d_temp, numParticlesLocal);
3115 time += HelperNS::sortArray(particleHandler->d_cs, d_temp, d_particles2remove, buffer->d_integerBuffer2,
3116 numParticlesLocal);
3117 time += HelperNS::Kernel::Launch::copyArray(particleHandler->d_cs, d_temp, numParticlesLocal);
3118#endif
3119
3120
3121 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_x[numParticlesLocal-h_particles2remove_counter],
3122 (real)0, h_particles2remove_counter);
3123 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_vx[numParticlesLocal-h_particles2remove_counter],
3124 (real)0, h_particles2remove_counter);
3125 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_ax[numParticlesLocal-h_particles2remove_counter],
3126 (real)0, h_particles2remove_counter);
3127#if DIM > 1
3128 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_y[numParticlesLocal-h_particles2remove_counter],
3129 (real)0, h_particles2remove_counter);
3130 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_vy[numParticlesLocal-h_particles2remove_counter],
3131 (real)0, h_particles2remove_counter);
3132 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_ay[numParticlesLocal-h_particles2remove_counter],
3133 (real)0, h_particles2remove_counter);
3134#if DIM == 3
3135 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_z[numParticlesLocal-h_particles2remove_counter],
3136 (real)0, h_particles2remove_counter);
3137 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_vz[numParticlesLocal-h_particles2remove_counter],
3138 (real)0, h_particles2remove_counter);
3139 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_az[numParticlesLocal-h_particles2remove_counter],
3140 (real)0, h_particles2remove_counter);
3141#endif
3142#endif
3143 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_mass[numParticlesLocal-h_particles2remove_counter],
3144 (real)0, h_particles2remove_counter);
3145 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_uid[numParticlesLocal-h_particles2remove_counter],
3146 (integer)0, h_particles2remove_counter);
3147#if SPH_SIM
3148 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_materialId[numParticlesLocal-h_particles2remove_counter],
3149 (integer)0, h_particles2remove_counter);
3150 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_sml[numParticlesLocal-h_particles2remove_counter],
3151 (real)0, h_particles2remove_counter);
3152 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_rho[numParticlesLocal-h_particles2remove_counter],
3153 (real)0, h_particles2remove_counter);
3154 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_p[numParticlesLocal-h_particles2remove_counter],
3155 (real)0, h_particles2remove_counter);
3156 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_e[numParticlesLocal-h_particles2remove_counter],
3157 (real)0, h_particles2remove_counter);
3158 time += HelperNS::Kernel::Launch::resetArray(&particleHandler->d_cs[numParticlesLocal-h_particles2remove_counter],
3159 (real)0, h_particles2remove_counter);
3160#endif
3161
3162 //TODO: all entries (removing particles)
3163
3164 //cuda::free(d_tempInt);
3165
3166 numParticlesLocal -= h_particles2remove_counter;
3167 Logger(INFO) << "removing #" << h_particles2remove_counter << " particles!";
3168
3169 return time;
3170}
3171
3172// used for gravity and sph
3173template <typename T>
3174integer Miluphpc::sendParticles(T *sendBuffer, T *receiveBuffer, integer *sendLengths, integer *receiveLengths) {
3175
3176 //Timer temp;
3177 boost::mpi::communicator comm;
3178
3179 std::vector<boost::mpi::request> reqParticles;
3180 std::vector<boost::mpi::status> statParticles;
3181
3182 integer reqCounter = 0;
3183 integer receiveOffset = 0;
3184 integer sendOffset = 0;
3185
3186 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
3187 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
3188 reqParticles.push_back(comm.isend(proc, 17, &sendBuffer[sendOffset], sendLengths[proc]));
3189 statParticles.push_back(comm.recv(proc, 17, &receiveBuffer[receiveOffset], receiveLengths[proc]));
3190
3191 receiveOffset += receiveLengths[proc];
3192 sendOffset += sendLengths[proc];
3193 }
3194 }
3195
3196 boost::mpi::wait_all(reqParticles.begin(), reqParticles.end());
3197
3198 //real elapsed = temp.elapsed();
3199 //Logger(TIME) << "sph: resending: " << elapsed;
3200 //if (elapsed > 10.) {
3201 //Logger(INFO) << "sph: resending: receiveOffset: " << receiveOffset << ", sendOffset: " << sendOffset;
3202 //}
3203
3204}
3205
3206// used for assigning particles to corresponding process
3207/*
3208template <typename T>
3209integer Miluphpc::sendParticlesEntry(integer *sendLengths, integer *receiveLengths, T *entry, T *entryBuffer, T *copyBuffer) {
3210
3211 boost::mpi::communicator comm;
3212
3213 std::vector<boost::mpi::request> reqParticles;
3214 std::vector<boost::mpi::status> statParticles;
3215
3216 integer reqCounter = 0;
3217 integer receiveOffset = 0;
3218 integer sendOffset = 0;
3219
3220 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
3221 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
3222 if (proc == 0) {
3223 reqParticles.push_back(comm.isend(proc, 17, &entry[0], sendLengths[proc]));
3224 //Logger(INFO) << "Sending from: " << 0 << " to proc: " << proc;
3225 }
3226 else {
3227 reqParticles.push_back(comm.isend(proc, 17,
3228 &entry[subDomainKeyTreeHandler->h_procParticleCounter[proc-1] + sendOffset],
3229 sendLengths[proc]));
3230
3231 //Logger(INFO) << "Sending from: " << subDomainKeyTreeHandler->h_procParticleCounter[proc-1] + sendOffset << " to proc: " << proc;
3232 }
3233 //reqParticles.push_back(comm.isend(proc, 17,&entry[sendOffset], sendLengths[proc]));
3234 statParticles.push_back(comm.recv(proc, 17, &entryBuffer[0] + receiveOffset,
3235 receiveLengths[proc]));
3236
3237 //Logger(INFO) << "Receiving at " << receiveOffset << " from proc: " << proc;
3238
3239 //sendOffset += subDomainKeyTreeHandler->h_procParticleCounter[proc-1]; //sendLengths[proc];
3240 receiveOffset += receiveLengths[proc];
3241 }
3242 sendOffset += subDomainKeyTreeHandler->h_procParticleCounter[proc-1]; //sendLengths[proc];
3243 }
3244
3245 boost::mpi::wait_all(reqParticles.begin(), reqParticles.end());
3246
3247 integer offset = 0;
3248 for (int i=0; i < subDomainKeyTreeHandler->h_subDomainKeyTree->rank; i++) {
3249 offset += subDomainKeyTreeHandler->h_procParticleCounter[i];
3250 }
3251
3252 if (subDomainKeyTreeHandler->h_subDomainKeyTree->rank != 0) {
3253 if (offset > 0 && (subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] - offset) > 0) {
3254 //TODO: following line needed? (probably not)
3255 //HelperNS::Kernel::Launch::copyArray(&entry[0], &entry[subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] - offset], offset);
3256 //KernelHandler.copyArray(&entry[0], &entry[h_procCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] - offset], offset);
3257 }
3258 HelperNS::Kernel::Launch::copyArray(&copyBuffer[0], &entry[offset], subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3259 HelperNS::Kernel::Launch::copyArray(&entry[0], &copyBuffer[0], subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3260 //KernelHandler.copyArray(&d_tempArray_2[0], &entry[offset], subDomainKeyTreeHandler->d_h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3261 //KernelHandler.copyArray(&entry[0], &d_tempArray_2[0], subDomainKeyTreeHandler->d_h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3262 //Logger(INFO) << "moving from offet: " << offset << " length: " << subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank];
3263 }
3264
3265 HelperNS::Kernel::Launch::resetArray(&entry[subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]],
3266 (T)0, numParticles-subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3267 HelperNS::Kernel::Launch::copyArray(&entry[subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]],
3268 entryBuffer, receiveOffset);
3269 //KernelHandler.resetFloatArray(&entry[h_procCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]], 0, numParticles-h_procCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]); //resetFloatArrayKernel(float *array, float value, int n)
3270 //KernelHandler.copyArray(&entry[h_procCounter[h_subDomainHandler->rank]], d_tempArray, receiveOffset);
3271
3272 //Logger(INFO) << "numParticlesLocal = " << receiveOffset << " + " << subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank];
3273 return receiveOffset + subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank];
3274} */
3275
3276/*
3277template <typename T>
3278integer Miluphpc::sendParticlesEntry(integer *sendLengths, integer *receiveLengths, T *entry, T *entryBuffer, T *copyBuffer) {
3279
3280 boost::mpi::communicator comm;
3281
3282 std::vector<boost::mpi::request> reqParticles;
3283 std::vector<boost::mpi::status> statParticles;
3284
3285 int receiveOffset = 0;
3286 int sendOffset = 0;
3287
3288 int rank = subDomainKeyTreeHandler->h_subDomainKeyTree->rank;
3289
3290 if (rank != 0) {
3291 sendOffset = subDomainKeyTreeHandler->h_procParticleCounter[rank - 1];
3292 }
3293
3294 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
3295 if (proc != rank) {
3296 reqParticles.push_back(comm.isend(proc, 17, &entry[sendOffset], sendLengths[proc]));
3297 statParticles.push_back(comm.recv(proc, 17, &entryBuffer[receiveOffset], receiveLengths[proc]));
3298
3299 receiveOffset += receiveLengths[proc];
3300 //sendOffset += sendLengths[proc];
3301 }
3302 sendOffset += subDomainKeyTreeHandler->h_procParticleCounter[proc]; //sendLengths[proc];
3303 }
3304
3305 boost::mpi::wait_all(reqParticles.begin(), reqParticles.end());
3306
3307 int offset = 0;
3308 for (int i=0; i < rank; i++) {
3309 offset += subDomainKeyTreeHandler->h_procParticleCounter[i];
3310 }
3311
3312 int copyLength = subDomainKeyTreeHandler->h_procParticleCounter[rank];
3313
3314 if (subDomainKeyTreeHandler->h_subDomainKeyTree->rank != 0) {
3315 HelperNS::Kernel::Launch::copyArray(&copyBuffer[0], &entry[offset], copyLength);
3316 HelperNS::Kernel::Launch::copyArray(&entry[0], &copyBuffer[0], copyLength);
3317 }
3318
3319 HelperNS::Kernel::Launch::resetArray(&entry[copyLength], (T)0, numParticles-copyLength);
3320 HelperNS::Kernel::Launch::copyArray(&entry[copyLength], &entryBuffer[0], receiveOffset);
3321
3322 return receiveOffset + copyLength;
3323}
3324*/
3325
3326template <typename T>
3327integer Miluphpc::sendParticlesEntry(integer *sendLengths, integer *receiveLengths, T *entry, T *entryBuffer, T *copyBuffer) {
3328
3329 boost::mpi::communicator comm;
3330
3331 std::vector<boost::mpi::request> reqParticles;
3332 std::vector<boost::mpi::status> statParticles;
3333
3334 integer reqCounter = 0;
3335 integer receiveOffset = 0;
3336 integer sendOffset = 0;
3337
3338 T *h_entry = new T[numParticles];
3339 T *h_entryBuffer = new T[numParticles];
3340
3341 for (int i=0; i<numParticles; ++i) {
3342 h_entry[i] = (T)0;
3343 h_entryBuffer[i] = (T)0;
3344 }
3345
3346 cuda::copy(h_entry, entry, numParticles, To::host);
3347
3348 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++) {
3349 if (proc != subDomainKeyTreeHandler->h_subDomainKeyTree->rank) {
3350 if (proc == 0) {
3351 reqParticles.push_back(comm.isend(proc, 17, &h_entry[0], sendLengths[proc]));
3352 //Logger(INFO) << "Sending from: " << 0 << " to proc: " << proc;
3353 }
3354 else {
3355 reqParticles.push_back(comm.isend(proc, 17,
3356 &h_entry[subDomainKeyTreeHandler->h_procParticleCounter[proc-1] + sendOffset],
3357 sendLengths[proc]));
3358
3359 //Logger(INFO) << "Sending from: " << subDomainKeyTreeHandler->h_procParticleCounter[proc-1] + sendOffset << " to proc: " << proc;
3360 }
3361 //reqParticles.push_back(comm.isend(proc, 17,&entry[sendOffset], sendLengths[proc]));
3362 statParticles.push_back(comm.recv(proc, 17, &h_entryBuffer[0] + receiveOffset,
3363 receiveLengths[proc]));
3364
3365 //Logger(INFO) << "Receiving at " << receiveOffset << " from proc: " << proc;
3366
3367 //sendOffset += subDomainKeyTreeHandler->h_procParticleCounter[proc-1]; //sendLengths[proc];
3368 receiveOffset += receiveLengths[proc];
3369 }
3370 sendOffset += subDomainKeyTreeHandler->h_procParticleCounter[proc-1]; //sendLengths[proc];
3371 }
3372
3373 boost::mpi::wait_all(reqParticles.begin(), reqParticles.end());
3374
3375 cuda::copy(h_entryBuffer, entryBuffer, numParticles, To::device);
3376
3377 delete [] h_entry;
3378 delete [] h_entryBuffer;
3379
3380 integer offset = 0;
3381 for (int i=0; i < subDomainKeyTreeHandler->h_subDomainKeyTree->rank; i++) {
3382 offset += subDomainKeyTreeHandler->h_procParticleCounter[i];
3383 }
3384
3385 if (subDomainKeyTreeHandler->h_subDomainKeyTree->rank != 0) {
3386 if (offset > 0 && (subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] - offset) > 0) {
3387 //TODO: following line needed? (probably not)
3388 //HelperNS::Kernel::Launch::copyArray(&entry[0], &entry[subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] - offset], offset);
3389 //KernelHandler.copyArray(&entry[0], &entry[h_procCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank] - offset], offset);
3390 }
3391 HelperNS::Kernel::Launch::copyArray(&copyBuffer[0], &entry[offset], subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3392 HelperNS::Kernel::Launch::copyArray(&entry[0], &copyBuffer[0], subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3393 //KernelHandler.copyArray(&d_tempArray_2[0], &entry[offset], subDomainKeyTreeHandler->d_h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3394 //KernelHandler.copyArray(&entry[0], &d_tempArray_2[0], subDomainKeyTreeHandler->d_h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3395 //Logger(INFO) << "moving from offet: " << offset << " length: " << subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank];
3396 }
3397
3398 HelperNS::Kernel::Launch::resetArray(&entry[subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]],
3399 (T)0, numParticles-subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]);
3400 HelperNS::Kernel::Launch::copyArray(&entry[subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]],
3401 entryBuffer, receiveOffset);
3402 //KernelHandler.resetFloatArray(&entry[h_procCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]], 0, numParticles-h_procCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank]); //resetFloatArrayKernel(float *array, float value, int n)
3403 //KernelHandler.copyArray(&entry[h_procCounter[h_subDomainHandler->rank]], d_tempArray, receiveOffset);
3404
3405 //Logger(INFO) << "numParticlesLocal = " << receiveOffset << " + " << subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank];
3406 return receiveOffset + subDomainKeyTreeHandler->h_procParticleCounter[subDomainKeyTreeHandler->h_subDomainKeyTree->rank];
3407}
3408
3409real Miluphpc::particles2file(int step) {
3410
3411 Timer timer;
3412
3413 boost::mpi::communicator comm;
3414 sumParticles = numParticlesLocal;
3415 all_reduce(comm, boost::mpi::inplace_t<integer*>(&sumParticles), 1, std::plus<integer>());
3416
3417 std::stringstream stepss;
3418 stepss << std::setw(6) << std::setfill('0') << step;
3419
3420 HighFive::File h5file(simulationParameters.directory + "ts" + stepss.str() + ".h5",
3421 HighFive::File::ReadWrite | HighFive::File::Create | HighFive::File::Truncate,
3422 HighFive::MPIOFileDriver(MPI_COMM_WORLD, MPI_INFO_NULL));
3423
3424 configParameters2file(h5file);
3425
3426 std::vector <size_t> dataSpaceDims(2);
3427 dataSpaceDims[0] = std::size_t(sumParticles);
3428 dataSpaceDims[1] = DIM;
3429
3430 HighFive::DataSet ranges = h5file.createDataSet<keyType>("/ranges", HighFive::DataSpace(subDomainKeyTreeHandler->h_numProcesses + 1));
3431
3432 keyType *rangeValues;
3433 rangeValues = new keyType[subDomainKeyTreeHandler->h_numProcesses + 1];
3434
3435 subDomainKeyTreeHandler->copy(To::host, true, false);
3436 for (int i=0; i<=subDomainKeyTreeHandler->h_numProcesses; i++) {
3437 rangeValues[i] = subDomainKeyTreeHandler->h_subDomainKeyTree->range[i];
3438 Logger(INFO) << "rangeValues[" << i << "] = " << rangeValues[i];
3439 }
3440
3441 ranges.write(rangeValues);
3442
3443 delete [] rangeValues;
3444
3445 // TODO: add uid (and other entries?)
3446 std::vector<real> time;
3447 simulationTimeHandler->copy(To::host);
3448 time.push_back(*simulationTimeHandler->h_currentTime); //step*simulationParameters.timestep);
3449 HighFive::DataSet h5_time = h5file.createDataSet<real>("/time", HighFive::DataSpace::From(time));
3450
3451 HighFive::DataSet pos = h5file.createDataSet<real>("/x", HighFive::DataSpace(dataSpaceDims));
3452 HighFive::DataSet vel = h5file.createDataSet<real>("/v", HighFive::DataSpace(dataSpaceDims));
3453 HighFive::DataSet key = h5file.createDataSet<keyType>("/key", HighFive::DataSpace(sumParticles));
3454 HighFive::DataSet h5_mass = h5file.createDataSet<real>("/m", HighFive::DataSpace(sumParticles));
3455 HighFive::DataSet h5_proc = h5file.createDataSet<int>("/proc", HighFive::DataSpace(sumParticles));
3456#if SPH_SIM
3457 HighFive::DataSet h5_rho = h5file.createDataSet<real>("/rho", HighFive::DataSpace(sumParticles));
3458 HighFive::DataSet h5_p = h5file.createDataSet<real>("/p", HighFive::DataSpace(sumParticles));
3459 HighFive::DataSet h5_e = h5file.createDataSet<real>("/e", HighFive::DataSpace(sumParticles));
3460 HighFive::DataSet h5_sml = h5file.createDataSet<real>("/sml", HighFive::DataSpace(sumParticles));
3461 HighFive::DataSet h5_noi = h5file.createDataSet<integer>("/noi", HighFive::DataSpace(sumParticles));
3462 HighFive::DataSet h5_cs = h5file.createDataSet<real>("/cs", HighFive::DataSpace(sumParticles));
3463#endif
3464
3465 HighFive::DataSet h5_totalEnergy;
3466 if (simulationParameters.calculateEnergy) {
3467 h5_totalEnergy = h5file.createDataSet<real>("/totalEnergy",
3468 HighFive::DataSpace(subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses));
3469 }
3470 HighFive::DataSet h5_totalAngularMomentum;
3471 if (simulationParameters.calculateAngularMomentum) {
3472 h5_totalAngularMomentum = h5file.createDataSet<real>("/totalAngularMomentum",
3473 HighFive::DataSpace(subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses));
3474 }
3475
3476 Logger(INFO) << "creating datasets ...";
3477 // ----------
3478
3479 std::vector<std::vector<real>> x, v; // two dimensional vector for 3D vector data
3480 std::vector<keyType> k; // one dimensional vector holding particle keys
3481 std::vector<real> mass;
3482 std::vector<int> particleProc;
3483#if SPH_SIM
3484 std::vector<real> rho, p, e, sml, cs;
3485 std::vector<integer> noi;
3486#endif
3487
3488 Logger(INFO) << "copying particles ...";
3489
3490 particleHandler->copyDistribution(To::host, true, false);
3491#if SPH_SIM
3492 particleHandler->copySPH(To::host);
3493#endif
3494
3495 Logger(INFO) << "getting particle keys ...";
3496 keyType *d_keys = buffer->d_keyTypeBuffer;
3497 //cuda::malloc(d_keys, numParticlesLocal);
3498
3499 //TreeNS::Kernel::Launch::getParticleKeys(treeHandler->d_tree, particleHandler->d_particles,
3500 // d_keys, 21, numParticlesLocal, curveType);
3501 SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
3502 treeHandler->d_tree, particleHandler->d_particles,
3503 d_keys, 21, numParticlesLocal, curveType);
3504 //SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
3505 // treeHandler->d_tree, particleHandler->d_particles,
3506 // d_keys, 21, numParticlesLocal, curveType);
3507
3508
3509 keyType *h_keys;
3510 h_keys = new keyType[numParticlesLocal];
3511 cuda::copy(h_keys, d_keys, numParticlesLocal, To::host);
3512
3513 integer keyProc;
3514
3515 Logger(INFO) << "filling vectors ...";
3516
3517 for (int i=0; i<numParticlesLocal; i++) {
3518#if DIM == 1
3519 x.push_back({particleHandler->h_x[i]});
3520 v.push_back({particleHandler->h_vx[i]});
3521#elif DIM == 2
3522 x.push_back({particleHandler->h_x[i], particleHandler->h_y[i]});
3523 v.push_back({particleHandler->h_vx[i], particleHandler->h_vy[i]});
3524#else
3525 x.push_back({particleHandler->h_x[i], particleHandler->h_y[i], particleHandler->h_z[i]});
3526 v.push_back({particleHandler->h_vx[i], particleHandler->h_vy[i], particleHandler->h_vz[i]});
3527#endif
3528 k.push_back(h_keys[i]);
3529 mass.push_back(particleHandler->h_mass[i]);
3530 particleProc.push_back(subDomainKeyTreeHandler->h_subDomainKeyTree->rank);
3531 //Logger(INFO) << "mass[" << i << "] = " << mass[i];
3532#if SPH_SIM
3533 rho.push_back(particleHandler->h_rho[i]);
3534 p.push_back(particleHandler->h_p[i]);
3535 e.push_back(particleHandler->h_e[i]);
3536 sml.push_back(particleHandler->h_sml[i]);
3537 noi.push_back(particleHandler->h_noi[i]);
3538 cs.push_back(particleHandler->h_cs[i]);
3539#endif
3540 }
3541
3542 //cuda::free(d_keys);
3543 delete [] h_keys;
3544
3545 // receive buffer
3546 int procN[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
3547
3548 // send buffer
3549 int sendProcN[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
3550 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++){
3551 sendProcN[proc] = subDomainKeyTreeHandler->h_subDomainKeyTree->rank == proc ? numParticlesLocal : 0;
3552 }
3553
3554 all_reduce(comm, sendProcN, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses, procN,
3555 boost::mpi::maximum<integer>());
3556
3557 std::size_t nOffset = 0;
3558 // count total particles on other processes
3559 for (int proc = 0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->rank; proc++){
3560 nOffset += procN[proc];
3561 }
3562 Logger(DEBUG) << "Offset to write to datasets: " << std::to_string(nOffset);
3563
3564 Logger(INFO) << "writing to h5 ...";
3565
3566 h5_time.write(time);
3567 // write to associated datasets in h5 file
3568 // only working when load balancing has been completed and even number of particles
3569 pos.select({nOffset, 0},
3570 {std::size_t(numParticlesLocal), std::size_t(DIM)}).write(x);
3571 vel.select({nOffset, 0},
3572 {std::size_t(numParticlesLocal), std::size_t(DIM)}).write(v);
3573 key.select({nOffset}, {std::size_t(numParticlesLocal)}).write(k);
3574 h5_mass.select({nOffset}, {std::size_t(numParticlesLocal)}).write(mass);
3575 h5_proc.select({nOffset}, {std::size_t(numParticlesLocal)}).write(particleProc);
3576#if SPH_SIM
3577 h5_rho.select({nOffset}, {std::size_t(numParticlesLocal)}).write(rho);
3578 h5_p.select({nOffset}, {std::size_t(numParticlesLocal)}).write(p);
3579 h5_e.select({nOffset}, {std::size_t(numParticlesLocal)}).write(e);
3580 h5_sml.select({nOffset}, {std::size_t(numParticlesLocal)}).write(sml);
3581 h5_noi.select({nOffset}, {std::size_t(numParticlesLocal)}).write(noi);
3582 h5_cs.select({nOffset}, {std::size_t(numParticlesLocal)}).write(cs);
3583#endif
3584
3585 if (simulationParameters.calculateEnergy) {
3586 h5_totalEnergy.select({subDomainKeyTreeHandler->h_subDomainKeyTree->rank}, {1}).write(totalEnergy);
3587 }
3588 if (simulationParameters.calculateAngularMomentum) {
3589 h5_totalAngularMomentum.select({subDomainKeyTreeHandler->h_subDomainKeyTree->rank}, {1}).write(totalAngularMomentum);
3590 }
3591
3592 if (simulationParameters.calculateCenterOfMass) {
3593 real *h_com = new real[DIM];
3594 real *d_com = buffer->d_realBuffer;
3595 //cuda::malloc(d_com, DIM);
3596 TreeNS::Kernel::Launch::globalCOM(treeHandler->d_tree, particleHandler->d_particles, d_com);
3597 cuda::copy(h_com, d_com, DIM, To::host);
3598 for (int i=0; i<DIM; i++) {
3599 Logger(DEBUG) << "com[" << i << "] = " << h_com[i];
3600 }
3601 //cuda::free(d_com);
3602
3603 HighFive::DataSet _com = h5file.createDataSet<real>("/COM", HighFive::DataSpace(DIM));
3604 std::vector<real> centerOfMass;
3605 centerOfMass.push_back(h_com[0]);
3606#if DIM > 1
3607 centerOfMass.push_back(h_com[1]);
3608#if DIM == 3
3609 centerOfMass.push_back(h_com[2]);
3610#endif
3611#endif
3612 _com.select({0}, {std::size_t(DIM)}).write(centerOfMass);
3613 delete [] h_com;
3614 }
3615
3616 return timer.elapsed();
3617
3618}
3619
3620real Miluphpc::particles2file(const std::string& filename, int *particleIndices, int length) {
3621
3622 Timer timer;
3623
3624 boost::mpi::communicator comm;
3625 int totalLength = length;
3626 all_reduce(comm, boost::mpi::inplace_t<integer*>(&totalLength), 1, std::plus<integer>());
3627
3628 std::stringstream file;
3629 file << simulationParameters.logDirectory << filename.c_str() << ".h5";
3630 //stepss << std::setw(6) << std::setfill('0') << step;
3631
3632 HighFive::File h5file(file.str(),
3633 HighFive::File::ReadWrite | HighFive::File::Create | HighFive::File::Truncate,
3634 HighFive::MPIOFileDriver(MPI_COMM_WORLD, MPI_INFO_NULL));
3635
3636 std::vector <size_t> dataSpaceDims(2);
3637 dataSpaceDims[0] = std::size_t(totalLength);
3638 dataSpaceDims[1] = DIM;
3639
3640 HighFive::DataSet ranges = h5file.createDataSet<keyType>("/ranges",
3641 HighFive::DataSpace(subDomainKeyTreeHandler->h_numProcesses + 1));
3642
3643 keyType *rangeValues;
3644 rangeValues = new keyType[subDomainKeyTreeHandler->h_numProcesses + 1];
3645
3646 subDomainKeyTreeHandler->copy(To::host, true, false);
3647 for (int i=0; i<=subDomainKeyTreeHandler->h_numProcesses; i++) {
3648 rangeValues[i] = subDomainKeyTreeHandler->h_subDomainKeyTree->range[i];
3649 Logger(INFO) << "rangeValues[" << i << "] = " << rangeValues[i];
3650 }
3651
3652 ranges.write(rangeValues);
3653
3654 delete [] rangeValues;
3655
3656 HighFive::DataSet pos = h5file.createDataSet<real>("/x", HighFive::DataSpace(dataSpaceDims));
3657 //HighFive::DataSet vel = h5file.createDataSet<real>("/v", HighFive::DataSpace(dataSpaceDims));
3658 HighFive::DataSet key = h5file.createDataSet<keyType>("/key", HighFive::DataSpace(totalLength));
3659
3660 // ----------
3661
3662 std::vector<std::vector<real>> x, v; // two dimensional vector for 3D vector data
3663 std::vector<keyType> k; // one dimensional vector holding particle keys
3664
3665 particleHandler->copyDistribution(To::host, true, false, true);
3666
3667 keyType *d_keys = buffer->d_keyTypeBuffer;
3668 //cuda::malloc(d_keys, numNodes);
3669
3670 //TreeNS::Kernel::Launch::getParticleKeys(treeHandler->d_tree, particleHandler->d_particles,
3671 // d_keys, 21, numParticlesLocal, curveType);
3672 SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
3673 treeHandler->d_tree, particleHandler->d_particles,
3674 d_keys, 21, numNodes, curveType);
3675 //SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys(subDomainKeyTreeHandler->d_subDomainKeyTree,
3676 // treeHandler->d_tree, particleHandler->d_particles,
3677 // d_keys, 21, numParticlesLocal, curveType);
3678
3679
3680 keyType *h_keys;
3681 h_keys = new keyType[numNodes];
3682 cuda::copy(h_keys, d_keys, numNodes, To::host);
3683
3684 integer keyProc;
3685
3686 for (int i=0; i<length; i++) {
3687#if DIM == 1
3688 x.push_back({particleHandler->h_x[particleIndices[i]]});
3689 //v.push_back({particleHandler->h_vx[particleIndices[i]]});
3690#elif DIM == 2
3691 x.push_back({particleHandler->h_x[particleIndices[i]], particleHandler->h_y[particleIndices[i]]});
3692 //v.push_back({particleHandler->h_vx[i], particleHandler->h_vy[i]});
3693#else
3694 x.push_back({particleHandler->h_x[particleIndices[i]], particleHandler->h_y[particleIndices[i]],
3695 particleHandler->h_z[particleIndices[i]]});
3696 //v.push_back({particleHandler->h_vx[particleIndices[i]], particleHandler->h_vy[particleIndices[i]],
3697 // particleHandler->h_vz[particleIndices[i]]});
3698#endif
3699 k.push_back(h_keys[particleIndices[i]]);
3700 }
3701
3702 //cuda::free(d_keys);
3703 delete [] h_keys;
3704
3705 // receive buffer
3706 int procN[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
3707
3708 // send buffer
3709 int sendProcN[subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses];
3710 for (int proc=0; proc<subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses; proc++){
3711 sendProcN[proc] = subDomainKeyTreeHandler->h_subDomainKeyTree->rank == proc ? length : 0;
3712 }
3713
3714 all_reduce(comm, sendProcN, subDomainKeyTreeHandler->h_subDomainKeyTree->numProcesses, procN,
3715 boost::mpi::maximum<integer>());
3716
3717 std::size_t nOffset = 0;
3718 // count total particles on other processes
3719 for (int proc = 0; proc < subDomainKeyTreeHandler->h_subDomainKeyTree->rank; proc++){
3720 nOffset += procN[proc];
3721 }
3722 Logger(DEBUG) << "Offset to write to datasets: " << std::to_string(nOffset);
3723
3724 // write to associated datasets in h5 file
3725 // only working when load balancing has been completed and even number of particles
3726 pos.select({nOffset, 0},
3727 {std::size_t(length), std::size_t(DIM)}).write(x);
3728 //vel.select({nOffset, 0},
3729 // {std::size_t(length), std::size_t(DIM)}).write(v);
3730 key.select({nOffset}, {std::size_t(length)}).write(k);
3731
3732 return timer.elapsed();
3733
3734}
3735
3736real Miluphpc::configParameters2file(HighFive::File &h5file) {
3737
3738 // TODO: write config parameters to HDF5 file
3739
3740 // not working if only one rank writes attributes ...
3741 //if (subDomainKeyTreeHandler->h_subDomainKeyTree->rank == 0) {
3742 // HighFive::Group header = h5file.createGroup("/Header");
3743 // //header.createAttribute<int>("test", 1);
3744 //}
3745
3746 // testing
3747 HighFive::Group header = h5file.createGroup("config");
3748 int test_1 = 10;
3749 HighFive::Attribute b_1 = header.createAttribute<int>("test_1", HighFive::DataSpace::From(test_1));
3750 b_1.write(test_1);
3751 double test_2 = 1.5;
3752 HighFive::Attribute b_2 = header.createAttribute<double>("test_2", test_2);
3753 // end: testing
3754
3755 // which information are necessary
3756 // - time (to know from which start time to continue)
3757 // - ...
3758}
3759
3760void Miluphpc::getMemoryInfo() {
3761
3762 // TODO: simulate other memory allocations to include "peak memory usage" ?
3763 // - including updateRange()
3764 // - sorting using CUDA cub
3765 // - ...
3766
3767 size_t free_bytes, total_bytes, used_bytes;
3768 cudaMemGetInfo(&free_bytes, &total_bytes);
3769 used_bytes = total_bytes - free_bytes;
3770
3771 //Logger(INFO) << "free bytes: " << (double)free_bytes << " B = " << 9.31e-10 * free_bytes << " GB ("
3772 // << (double)free_bytes/(double)total_bytes * 100. << " %)";
3773 //Logger(INFO) << "used bytes: " << (double)used_bytes << " B = " << 9.31e-10 * used_bytes << " GB ("
3774 // << (double)used_bytes/(double)total_bytes * 100. << " %)";
3775 //Logger(INFO) << "total bytes: " << (double)total_bytes << " B = " << 9.31e-10 * total_bytes << " GB";
3776
3777 Logger(INFO) << "MEMORY INFO: used: " << std::setprecision(4) << 9.31e-10 * used_bytes << " GB ("
3778 << (double)used_bytes/(double)total_bytes * 100. << " %)"
3779 << " free: " << 9.31e-10 * free_bytes << " GB ("
3780 << (double)free_bytes/(double)total_bytes * 100. << " %)"
3781 << " available: " << 9.31e-10 * total_bytes << " GB";
3782
3783}
DomainListHandler
Definition: subdomain_handler.h:81
DomainListHandler::d_sortedDomainListKeys
keyType * d_sortedDomainListKeys
device sorted domain list keys
Definition: subdomain_handler.h:98
DomainListHandler::d_domainListKeys
keyType * d_domainListKeys
device domain list key
Definition: subdomain_handler.h:96
DomainListHandler::d_relevantDomainListProcess
integer * d_relevantDomainListProcess
Definition: subdomain_handler.h:102
DomainListHandler::reset
void reset()
Resetting entries.
Definition: subdomain_handler.cpp:92
DomainListHandler::d_domainList
DomainList * d_domainList
device instance of DomainList class
Definition: subdomain_handler.h:107
DomainListHandler::d_domainListIndex
integer * d_domainListIndex
device domain list index
Definition: subdomain_handler.h:92
DomainListHandler::d_domainListCounter
integer * d_domainListCounter
device domain list counter
Definition: subdomain_handler.h:94
H5Profiler::vector2file
void vector2file(const std::string &path, std::vector< T > data)
Definition: h5profiler.h:246
H5Profiler::value2file
void value2file(const std::string &path, T value)
Write value to single value data set.
Definition: h5profiler.h:202
HelperHandler
Definition: helper_handler.h:9
HelperHandler::d_sendCount
integer * d_sendCount
Definition: helper_handler.h:36
HelperHandler::d_realVal
real * d_realVal
Definition: helper_handler.h:24
HelperHandler::reset
void reset()
Definition: helper_handler.cpp:86
HelperHandler::d_helper
Helper * d_helper
Definition: helper_handler.h:49
HelperHandler::d_integerBuffer3
integer * d_integerBuffer3
Definition: helper_handler.h:33
HelperHandler::d_idIntegerBuffer
idInteger * d_idIntegerBuffer
Definition: helper_handler.h:39
HelperHandler::d_integerVal
integer * d_integerVal
Definition: helper_handler.h:20
HelperHandler::d_keyTypeBuffer
keyType * d_keyTypeBuffer
Definition: helper_handler.h:45
HelperHandler::d_idIntegerBuffer1
idInteger * d_idIntegerBuffer1
Definition: helper_handler.h:40
HelperHandler::d_integerBuffer1
integer * d_integerBuffer1
Definition: helper_handler.h:31
HelperHandler::d_keyTypeBuffer2
keyType * d_keyTypeBuffer2
Definition: helper_handler.h:47
HelperHandler::d_realBuffer1
real * d_realBuffer1
Definition: helper_handler.h:43
HelperHandler::d_sendCount1
integer * d_sendCount1
Definition: helper_handler.h:37
HelperHandler::d_integerBuffer
integer * d_integerBuffer
Definition: helper_handler.h:30
HelperHandler::d_integerBuffer4
integer * d_integerBuffer4
Definition: helper_handler.h:34
HelperHandler::d_integerBuffer2
integer * d_integerBuffer2
Definition: helper_handler.h:32
HelperHandler::d_realBuffer
real * d_realBuffer
Definition: helper_handler.h:42
HelperHandler::d_realVal1
real * d_realVal1
Definition: helper_handler.h:25
HelperHandler::d_keyTypeBuffer1
keyType * d_keyTypeBuffer1
Definition: helper_handler.h:46
Logger
Logger class.
Definition: logger.h:80
MaterialHandler
Material class handler.
Definition: material_handler.h:53
MaterialHandler::h_materials
Material * h_materials
host instance of material class
Definition: material_handler.h:59
MaterialHandler::copy
void copy(To::Target target, integer index=-1)
Definition: material_handler.cpp:125
MaterialHandler::numMaterials
integer numMaterials
number of materials or rather material instances
Definition: material_handler.h:57
MaterialHandler::d_materials
Material * d_materials
device instance of material class
Definition: material_handler.h:61
Material::sml
real sml
Definition: material.cuh:112
Material::info
CUDA_CALLABLE_MEMBER void info()
Definition: material.cu:12
Miluphpc::d_mutex
integer * d_mutex
Definition: miluphpc.h:307
Miluphpc::domainListHandler
DomainListHandler * domainListHandler
Instance to handle the DomainList instance on device and host.
Definition: miluphpc.h:315
Miluphpc::totalEnergy
real totalEnergy
total energy
Definition: miluphpc.h:218
Miluphpc::afterIntegrationStep
void afterIntegrationStep()
Definition: miluphpc.cpp:471
Miluphpc::parallel_sph
real parallel_sph()
Parallel version regarding computation of SPH-stuff.
Definition: miluphpc.cpp:2216
Miluphpc::parallel_pseudoParticles
real parallel_pseudoParticles()
Parallel version regarding computation of pseudo-particles.
Definition: miluphpc.cpp:1061
Miluphpc::particleHandler
ParticleHandler * particleHandler
Instance to handle the Particles instance on device and host.
Definition: miluphpc.h:309
Miluphpc::tree
real tree()
Wrapper function for building the tree (and domain tree).
Definition: miluphpc.cpp:951
Miluphpc::updateRangeApproximately
void updateRangeApproximately(int aimedParticlesPerProcess, int bins=5000)
Update the ranges (approximately and dynamically).
Definition: miluphpc.cpp:2988
Miluphpc::rhs
real rhs(int step, bool selfGravity=true, bool assignParticlesToProcess=true)
Definition: miluphpc.cpp:294
Miluphpc::reset
real reset()
Reset arrays, values, ...
Definition: miluphpc.cpp:586
Miluphpc::parallel_tree
real parallel_tree()
Parallel version regarding tree-stuff.
Definition: miluphpc.cpp:958
Miluphpc::angularMomentum
real angularMomentum()
Calculate the angular momentum for all particles.
Definition: miluphpc.cpp:484
Miluphpc::curveType
Curve::Type curveType
Space-filling curve type to be used (Lebesgue or Hilbert)
Definition: miluphpc.h:277
Miluphpc::assignParticles
real assignParticles()
Assign particles to correct process in dependence of particle key and ranges.
Definition: miluphpc.cpp:700
Miluphpc::profiler
H5Profiler & profiler
H5 profiler instance.
Definition: miluphpc.h:274
Miluphpc::fixedLoadBalancing
void fixedLoadBalancing()
Load balancing via equidistant ranges.
Definition: miluphpc.cpp:2839
Miluphpc::numNodes
integer numNodes
number of nodes (to be allocated)
Definition: miluphpc.h:296
Miluphpc::dynamicLoadBalancing
void dynamicLoadBalancing(int bins=5000)
Pre-calculations for updateRangeApproximately.
Definition: miluphpc.cpp:2885
Miluphpc::Miluphpc
Miluphpc(SimulationParameters simulationParameters)
Constructor to set up simulation.
Definition: miluphpc.cpp:3
Miluphpc::~Miluphpc
~Miluphpc()
Destructor freeing class instances.
Definition: miluphpc.cpp:97
Miluphpc::materialHandler
MaterialHandler * materialHandler
Instance to handle Materials instances on device and host.
Definition: miluphpc.h:319
Miluphpc::subDomainKeyTreeHandler
SubDomainKeyTreeHandler * subDomainKeyTreeHandler
Instance to handle the SubDomainKeyTree instance on device and host.
Definition: miluphpc.h:311
Miluphpc::numParticles
integer numParticles
number of particles (to be allocated)
Definition: miluphpc.h:285
Miluphpc::energy
real energy()
Calculate the total amount of energy.
Definition: miluphpc.cpp:538
Miluphpc::subStep
int subStep
current sub-step (there are possibly more sub-steps within a step!)
Definition: miluphpc.h:212
Miluphpc::treeHandler
TreeHandler * treeHandler
Instance to handle the Tree instance on device and host.
Definition: miluphpc.h:313
Miluphpc::sendParticles
integer sendParticles(T *sendBuffer, T *receiveBuffer, integer *sendLengths, integer *receiveLengths)
Send particles/Exchange particles among MPI processes.
Definition: miluphpc.cpp:3174
Miluphpc::lowestDomainListHandler
DomainListHandler * lowestDomainListHandler
Instance to handle the (lowest) DomainList instance on device and host.
Definition: miluphpc.h:317
Miluphpc::arrangeParticleEntries
real arrangeParticleEntries(U *sortArray, U *sortedArray, T *entry, T *temp)
Function to sort an array entry in dependence of another array sortArray
Definition: miluphpc.cpp:944
Miluphpc::removeParticles
real removeParticles()
Remove particles in dependence of some criterion.
Definition: miluphpc.cpp:3040
Miluphpc::sph
real sph()
Definition: miluphpc.cpp:2207
Miluphpc::gravity
real gravity()
Wrapper function for Gravity-related stuff.
Definition: miluphpc.cpp:1241
Miluphpc::simulationParameters
SimulationParameters simulationParameters
buffer (need for revising)
Definition: miluphpc.h:351
Miluphpc::configParameters2file
real configParameters2file(HighFive::File &h5file)
Definition: miluphpc.cpp:3736
Miluphpc::prepareSimulation
void prepareSimulation()
Prepare the simulation, including.
Definition: miluphpc.cpp:235
Miluphpc::distributionFromFile
void distributionFromFile(const std::string &filename)
Read initial/particle distribution file (in parallel)
Definition: miluphpc.cpp:161
Miluphpc::sendParticlesEntry
integer sendParticlesEntry(integer *sendLengths, integer *receiveLengths, T *entry, T *entryBuffer, T *copyBuffer)
Send particles/Exchange particles among MPI processes.
Definition: miluphpc.cpp:3327
Miluphpc::numParticlesLocal
integer numParticlesLocal
Definition: miluphpc.h:294
Miluphpc::kernelHandler
SPH::KernelHandler kernelHandler
Instance to handle the SPH Kernel instance on device and host.
Definition: miluphpc.h:280
Miluphpc::pseudoParticles
real pseudoParticles()
Wrapper function for calculating pseudo-particles.
Definition: miluphpc.cpp:1054
Miluphpc::numParticlesFromFile
void numParticlesFromFile(const std::string &filename)
Determine amount of particles (numParticles and numParticlesLocal) from initial file/particle distrib...
Definition: miluphpc.cpp:123
Miluphpc::h_searchRadius
real h_searchRadius
search radius for SPH (MPI-process overarching) neighbor search
Definition: miluphpc.h:215
Miluphpc::updateRange
void updateRange(int aimedParticlesPerProcess)
Update the range in dependence on number of (MPI) processes and aimed particles per process.
Definition: miluphpc.cpp:2912
Miluphpc::parallel_gravity
real parallel_gravity()
Parallel version regarding computation of gravitational stuff.
Definition: miluphpc.cpp:1248
Miluphpc::buffer
HelperHandler * buffer
buffer instance
Definition: miluphpc.h:325
Miluphpc::sumParticles
integer sumParticles
(real) number of particles on all processes
Definition: miluphpc.h:287
Miluphpc::simulationTimeHandler
SimulationTimeHandler * simulationTimeHandler
Instance to handle the SimulationTime instances on device and host.
Definition: miluphpc.h:282
Miluphpc::totalAngularMomentum
real totalAngularMomentum
total angular momentum
Definition: miluphpc.h:221
Miluphpc::getMemoryInfo
void getMemoryInfo()
Definition: miluphpc.cpp:3760
Miluphpc::boundingBox
real boundingBox()
Calculate bounding boxes/simulation domain.
Definition: miluphpc.cpp:643
Miluphpc::particles2file
real particles2file(int step)
Definition: miluphpc.cpp:3409
ParticleHandler
Definition: particle_handler.h:16
ParticleHandler::d_materialId
integer * d_materialId
device material identifier
Definition: particle_handler.h:227
ParticleHandler::h_rho
real * h_rho
host density
Definition: particle_handler.h:80
ParticleHandler::h_noi
integer * h_noi
host number of interactions
Definition: particle_handler.h:70
ParticleHandler::h_e
real * h_e
host internal energy
Definition: particle_handler.h:72
ParticleHandler::leapfrog
bool leapfrog
Definition: particle_handler.h:20
ParticleHandler::d_z
real * d_z
device z position
Definition: particle_handler.h:211
ParticleHandler::d_nnl
integer * d_nnl
device near(est) neighbor list
Definition: particle_handler.h:231
ParticleHandler::d_ay
real * d_ay
device y acceleration
Definition: particle_handler.h:206
ParticleHandler::d_u
real * d_u
energy
Definition: particle_handler.h:239
ParticleHandler::d_az_old
real * d_az_old
Definition: particle_handler.h:217
ParticleHandler::d_mass
real * d_mass
device mass array
Definition: particle_handler.h:191
ParticleHandler::d_g_az
real * d_g_az
Definition: particle_handler.h:216
ParticleHandler::h_p
real * h_p
host pressure
Definition: particle_handler.h:82
ParticleHandler::d_g_ax
real * d_g_ax
Definition: particle_handler.h:198
ParticleHandler::d_ax_old
real * d_ax_old
Definition: particle_handler.h:199
ParticleHandler::h_vx
real * h_vx
host x velocity
Definition: particle_handler.h:32
ParticleHandler::h_y
real * h_y
host y position
Definition: particle_handler.h:39
ParticleHandler::d_ay_old
real * d_ay_old
Definition: particle_handler.h:208
ParticleHandler::d_az
real * d_az
device z acceleration
Definition: particle_handler.h:215
ParticleHandler::copyDistribution
void copyDistribution(To::Target target=To::device, bool velocity=true, bool acceleration=true, bool includePseudoParticles=false)
Definition: particle_handler.cpp:873
ParticleHandler::d_g_ax_old
real * d_g_ax_old
Definition: particle_handler.h:199
ParticleHandler::d_e
real * d_e
device internal energy
Definition: particle_handler.h:235
ParticleHandler::d_nodeType
integer * d_nodeType
Definition: particle_handler.h:221
ParticleHandler::d_vx
real * d_vx
device x velocity
Definition: particle_handler.h:195
ParticleHandler::d_particles
Particles * d_particles
device instance of particles class
Definition: particle_handler.h:350
ParticleHandler::h_particles
Particles * h_particles
host instance of particles class
Definition: particle_handler.h:348
ParticleHandler::h_vz
real * h_vz
host z velocity
Definition: particle_handler.h:50
ParticleHandler::d_uid
idInteger * d_uid
device unique identifier
Definition: particle_handler.h:225
ParticleHandler::h_x
real * h_x
host x position
Definition: particle_handler.h:30
ParticleHandler::d_ax
real * d_ax
device x acceleration
Definition: particle_handler.h:197
ParticleHandler::h_mass
real * h_mass
host mass
Definition: particle_handler.h:28
ParticleHandler::d_y
real * d_y
device y position
Definition: particle_handler.h:202
ParticleHandler::d_sml
real * d_sml
device smoothing length
Definition: particle_handler.h:229
ParticleHandler::d_vy
real * d_vy
device y velocity
Definition: particle_handler.h:204
ParticleHandler::d_g_az_old
real * d_g_az_old
Definition: particle_handler.h:217
ParticleHandler::h_sml
real * h_sml
host smoothing length
Definition: particle_handler.h:66
ParticleHandler::h_cs
real * h_cs
host speed of sound
Definition: particle_handler.h:78
ParticleHandler::h_z
real * h_z
host z position
Definition: particle_handler.h:48
ParticleHandler::d_cs
real * d_cs
device speed of sound
Definition: particle_handler.h:241
ParticleHandler::d_x
real * d_x
device x position
Definition: particle_handler.h:193
ParticleHandler::copySPH
void copySPH(To::Target target)
Definition: particle_handler.cpp:889
ParticleHandler::d_g_ay_old
real * d_g_ay_old
Definition: particle_handler.h:208
ParticleHandler::d_noi
integer * d_noi
device number of interaction
Definition: particle_handler.h:233
ParticleHandler::d_p
real * d_p
device pressure
Definition: particle_handler.h:245
ParticleHandler::h_vy
real * h_vy
host y velocity
Definition: particle_handler.h:41
ParticleHandler::d_g_ay
real * d_g_ay
Definition: particle_handler.h:207
ParticleHandler::d_vz
real * d_vz
device z velocity
Definition: particle_handler.h:213
ParticleHandler::d_rho
real * d_rho
device density
Definition: particle_handler.h:243
Particles::e
real * e
(pointer to) internal energy (array)
Definition: particles.cuh:121
Particles::materialId
integer * materialId
(pointer to) material identifier (array)
Definition: particles.cuh:111
Particles::x
real * x
(pointer to) x position (array)
Definition: particles.cuh:62
Particles::uid
idInteger * uid
(pointer to) unique identifier (array)
Definition: particles.cuh:109
Particles::y
real * y
(pointer to) y position (array)
Definition: particles.cuh:70
Particles::sml
real * sml
(pointer to) smoothing length (array)
Definition: particles.cuh:113
Particles::mass
real * mass
(pointer to) mass (array)
Definition: particles.cuh:60
Particles::z
real * z
(pointer to) z position (array)
Definition: particles.cuh:78
Particles::vz
real * vz
(pointer to) z velocity (array)
Definition: particles.cuh:80
Particles::vx
real * vx
(pointer to) x velocity (array)
Definition: particles.cuh:64
Particles::vy
real * vy
(pointer to) y velocity (array)
Definition: particles.cuh:72
SPH::KernelHandler
SPH smoothing kernel handler.
Definition: kernel_handler.cuh:19
SPH::KernelHandler::kernel
SPH_kernel kernel
SPH smoothing kernel typedef/kind of function pointer.
Definition: kernel_handler.cuh:26
SimulationTimeHandler
Definition: simulation_time_handler.h:15
SimulationTimeHandler::copy
void copy(To::Target target)
Definition: simulation_time_handler.cpp:56
SimulationTimeHandler::h_currentTime
real * h_currentTime
Definition: simulation_time_handler.h:23
SubDomainKeyTreeHandler
Definition: subdomain_handler.h:23
SubDomainKeyTreeHandler::h_rank
integer h_rank
host MPI rank
Definition: subdomain_handler.h:27
SubDomainKeyTreeHandler::d_subDomainKeyTree
SubDomainKeyTree * d_subDomainKeyTree
device instance of class SubDomainKeyTree
Definition: subdomain_handler.h:48
SubDomainKeyTreeHandler::h_subDomainKeyTree
SubDomainKeyTree * h_subDomainKeyTree
host instance of class SubDomainKeyTree
Definition: subdomain_handler.h:46
SubDomainKeyTreeHandler::d_range
keyType * d_range
device range(s)
Definition: subdomain_handler.h:40
SubDomainKeyTreeHandler::reset
void reset()
Resetting member variables.
Definition: subdomain_handler.cpp:36
SubDomainKeyTreeHandler::h_procParticleCounter
integer * h_procParticleCounter
host counter for particles in dependence of MPI process belonging
Definition: subdomain_handler.h:33
SubDomainKeyTreeHandler::h_numProcesses
integer h_numProcesses
host MPI number of processes
Definition: subdomain_handler.h:29
SubDomainKeyTreeHandler::copy
void copy(To::Target target=To::device, bool range=true, bool counter=true)
Copy (parts of the) SubDomainKeyTree instance(s) between host and device.
Definition: subdomain_handler.cpp:43
SubDomainKeyTree::range
keyType * range
Space-filling curve ranges, mapping key ranges/borders to MPI processes.
Definition: subdomain.cuh:70
SubDomainKeyTree::rank
integer rank
MPI rank.
Definition: subdomain.cuh:66
SubDomainKeyTree::numProcesses
integer numProcesses
MPI number of processes.
Definition: subdomain.cuh:68
Timer
Definition: timer.h:15
Timer::elapsed
double elapsed() const
Get elapsed time since instantiation/latest reset.
Definition: timer.cpp:22
Timer::reset
void reset()
Reset timer instance.
Definition: timer.cpp:11
TreeHandler
Definition: tree_handler.h:22
TreeHandler::d_tree
Tree * d_tree
device instance of Class Tree
Definition: tree_handler.h:85
TreeHandler::h_minX
real * h_minX
host (pointer to) bounding box minimal x
Definition: tree_handler.h:60
TreeHandler::h_minY
real * h_minY
host (pointer to) bounding box minimal y
Definition: tree_handler.h:69
TreeHandler::d_sorted
integer * d_sorted
device (pointer to) sorted (array)
Definition: tree_handler.h:36
TreeHandler::h_maxX
real * h_maxX
host (pointer to) bounding box maximal x
Definition: tree_handler.h:62
TreeHandler::h_toDeleteLeaf
integer * h_toDeleteLeaf
host (pointer to) array remembering leaf indices for rebuilding after temporarily inserting particles
Definition: tree_handler.h:47
TreeHandler::h_toDeleteNode
integer * h_toDeleteNode
host (pointer to) array remembering leaf indices for rebuilding after temporarily inserting particles
Definition: tree_handler.h:49
TreeHandler::d_start
integer * d_start
device (pointer to) start (array)
Definition: tree_handler.h:34
TreeHandler::d_toDeleteLeaf
integer * d_toDeleteLeaf
device (pointer to) array remembering leaf indices for rebuilding after temporarily inserting particl...
Definition: tree_handler.h:51
TreeHandler::d_index
integer * d_index
device (pointer to) index
Definition: tree_handler.h:40
TreeHandler::copy
void copy(To::Target target=To::device, bool borders=true, bool index=true, bool toDelete=true)
Copy (parts of the) tree instance(s) between host and device.
Definition: tree_handler.cpp:86
TreeHandler::h_maxY
real * h_maxY
host (pointer to) bounding box maximal y
Definition: tree_handler.h:71
TreeHandler::h_minZ
real * h_minZ
host (pointer to) bounding box minimal z
Definition: tree_handler.h:78
TreeHandler::h_maxZ
real * h_maxZ
host (pointer to) bounding box maximal x
Definition: tree_handler.h:80
TreeHandler::globalizeBoundingBox
void globalizeBoundingBox(Execution::Location exLoc=Execution::device)
All reduce bounding box(es)/borders (among MPI processes)
Definition: tree_handler.cpp:110
TreeHandler::d_toDeleteNode
integer * d_toDeleteNode
device (pointer to) array remembering leaf indices for rebuilding after temporarily inserting particl...
Definition: tree_handler.h:53
gpuErrorcheck
#define gpuErrorcheck(ans)
check CUDA call
Definition: cuda_utilities.cuh:41
DEBUG
@ DEBUG
Definition: logger.h:47
ERROR
@ ERROR
warning log type
Definition: logger.h:51
INFO
@ INFO
debug log type
Definition: logger.h:48
WARN
@ WARN
trace log type
Definition: logger.h:50
TRACE
@ TRACE
info log type
Definition: logger.h:49
TIME
@ TIME
error log type
Definition: logger.h:52
CudaUtils::Kernel::Launch::findDuplicateEntries
real findDuplicateEntries(T *array1, T *array2, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:344
CudaUtils::Kernel::Launch::collectValues
real collectValues(integer *indices, real *entries, real *collector, integer count)
Definition: cuda_utilities.cu:320
DomainListNS::Kernel::Launch::createDomainList
real createDomainList(SubDomainKeyTree *subDomainKeyTree, DomainList *domainList, integer maxLevel, Curve::Type curveType=Curve::lebesgue)
Wrapper for DomainListNS::Kernel::createDomainList().
Definition: subdomain.cu:2141
DomainListNS::Kernel::Launch::lowestDomainList
real lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer n, integer m)
Wrapper for ::DomainListNS::Kernel::lowestDoainList().
Definition: subdomain.cu:2149
Gravity::Kernel::collectSendIndices_test4
__global__ void collectSendIndices_test4(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *pseudoParticles2Send, integer *pseudoParticlesLevel, integer *particlesCount, integer *pseudoParticlesCount, integer numParticlesLocal, integer numParticles, integer treeIndex, int currentProc, Curve::Type curveType)
Definition: gravity.cu:53
Gravity::Kernel::testSendIndices
__global__ void testSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer *sendIndices, integer *markedSendIndices, integer *levels, Curve::Type curveType, integer length)
Test the send indices.
Definition: gravity.cu:103
Gravity::Kernel::intermediateSymbolicForce
__global__ void intermediateSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
Definition: gravity.cu:1374
Gravity::Kernel::computeForces_v1_2
__global__ void computeForces_v1_2(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:550
Gravity::Kernel::symbolicForce_test3
__global__ void symbolicForce_test3(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
Definition: gravity.cu:1916
Gravity::Kernel::computeForces_v1_1
__global__ void computeForces_v1_1(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:398
Gravity::Kernel::collectSendIndices
__global__ void collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *pseudoParticles2Send, integer *pseudoParticlesLevel, integer *particlesCount, integer *pseudoParticlesCount, integer n, integer length, Curve::Type curveType)
Collect the send indices.
Definition: gravity.cu:8
Gravity::Kernel::computeForces_v2
__global__ void computeForces_v2(Tree *tree, Particles *particles, real radius, integer n, integer m, integer blockSize, integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:726
Gravity::Kernel::computeForces_v2_1
__global__ void computeForces_v2_1(Tree *tree, Particles *particles, integer n, integer m, integer blockSize, integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:905
Gravity::Kernel::computeForces_v1
__global__ void computeForces_v1(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:252
Gravity::Kernel::symbolicForce_test4
__global__ void symbolicForce_test4(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
Definition: gravity.cu:2041
HelperNS::Kernel::Launch::copyArray
real copyArray(T *targetArray, T *sourceArray, integer n)
Definition: helper.cu:284
HelperNS::Kernel::Launch::resetArray
real resetArray(T *array, T value, integer n)
Definition: helper.cu:293
HelperNS::sortArray
real sortArray(A *arrayToSort, A *sortedArray, B *keyIn, B *keyOut, integer n)
Definition: helper.cu:151
HelperNS::sortKeys
real sortKeys(A *keysToSort, A *sortedKeys, int n)
Definition: helper.cu:135
HelperNS::reduceAndGlobalize
T reduceAndGlobalize(T *d_sml, T *d_aggregate, integer n, Reduction::Type reductionType)
Definition: helper.cu:184
Kernel::Launch::resetArrays
real resetArrays(Tree *tree, Particles *particles, integer *mutex, integer n, integer m, bool time=false)
Definition: device_rhs.cu:43
ParticlesNS::Kernel::Launch::mark2remove
real mark2remove(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, int *particles2remove, int *counter, int criterion, real d, int numParticles)
Definition: subdomain.cu:2233
ParticlesNS::Kernel::Launch::check4nans
real check4nans(Particles *particles, integer n)
Definition: particles.cu:563
Physics::Kernel::Launch::kineticEnergy
real kineticEnergy(Particles *particles, int n)
Wrapper for: Physics::Kernel::kineticEnergy().
Definition: subdomain.cu:2638
ProfilerIds::ReceiveLengths::gravityParticles
const char *const gravityParticles
Definition: h5profiler.h:42
ProfilerIds::ReceiveLengths::gravityPseudoParticles
const char *const gravityPseudoParticles
Definition: h5profiler.h:43
ProfilerIds::ReceiveLengths::sph
const char *const sph
Definition: h5profiler.h:44
ProfilerIds::SendLengths::gravityPseudoParticles
const char *const gravityPseudoParticles
Definition: h5profiler.h:36
ProfilerIds::SendLengths::gravityParticles
const char *const gravityParticles
Definition: h5profiler.h:35
ProfilerIds::SendLengths::sph
const char *const sph
Definition: h5profiler.h:37
ProfilerIds::Time::Gravity::compTheta
const char *const compTheta
Definition: h5profiler.h:87
ProfilerIds::Time::Gravity::force
const char *const force
Definition: h5profiler.h:92
ProfilerIds::Time::Gravity::repairTree
const char *const repairTree
Definition: h5profiler.h:93
ProfilerIds::Time::Gravity::symbolicForce
const char *const symbolicForce
Definition: h5profiler.h:88
ProfilerIds::Time::Gravity::insertReceivedParticles
const char *const insertReceivedParticles
Definition: h5profiler.h:91
ProfilerIds::Time::Gravity::insertReceivedPseudoParticles
const char *const insertReceivedPseudoParticles
Definition: h5profiler.h:90
ProfilerIds::Time::Gravity::sending
const char *const sending
Definition: h5profiler.h:89
ProfilerIds::Time::SPH::insertReceivedParticles
const char *const insertReceivedParticles
Definition: h5profiler.h:101
ProfilerIds::Time::SPH::internalForces
const char *const internalForces
Definition: h5profiler.h:107
ProfilerIds::Time::SPH::soundSpeed
const char *const soundSpeed
Definition: h5profiler.h:104
ProfilerIds::Time::SPH::symbolicForce
const char *const symbolicForce
Definition: h5profiler.h:99
ProfilerIds::Time::SPH::pressure
const char *const pressure
Definition: h5profiler.h:105
ProfilerIds::Time::SPH::sending
const char *const sending
Definition: h5profiler.h:100
ProfilerIds::Time::SPH::fixedRadiusNN
const char *const fixedRadiusNN
Definition: h5profiler.h:102
ProfilerIds::Time::SPH::determineSearchRadii
const char *const determineSearchRadii
Definition: h5profiler.h:98
ProfilerIds::Time::SPH::repairTree
const char *const repairTree
Definition: h5profiler.h:108
ProfilerIds::Time::SPH::density
const char *const density
Definition: h5profiler.h:103
ProfilerIds::Time::SPH::compTheta
const char *const compTheta
Definition: h5profiler.h:97
ProfilerIds::Time::SPH::resend
const char *const resend
Definition: h5profiler.h:106
ProfilerIds::Time::Tree::tree
const char *const tree
Definition: h5profiler.h:78
ProfilerIds::Time::Tree::buildDomain
const char *const buildDomain
Definition: h5profiler.h:79
ProfilerIds::Time::Tree::createDomain
const char *const createDomain
Definition: h5profiler.h:77
ProfilerIds::Time::pseudoParticle
const char *const pseudoParticle
Definition: h5profiler.h:58
ProfilerIds::Time::gravity
const char *const gravity
Definition: h5profiler.h:59
ProfilerIds::Time::reset
const char *const reset
Definition: h5profiler.h:53
ProfilerIds::Time::tree
const char *const tree
Definition: h5profiler.h:57
ProfilerIds::Time::assignParticles
const char *const assignParticles
Definition: h5profiler.h:56
ProfilerIds::Time::boundingBox
const char *const boundingBox
Definition: h5profiler.h:55
ProfilerIds::Time::sph
const char *const sph
Definition: h5profiler.h:60
ProfilerIds::ranges
const char *const ranges
Definition: h5profiler.h:31
SPH::Kernel::Launch::symbolicForce_test2
real symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
Definition: sph.cu:3163
SPH::Kernel::Launch::calculateDensity
real calculateDensity(::SPH::SPH_kernel kernel, Tree *tree, Particles *particles, int *interactions, int numParticles)
Wrapper for SPH::Kernel::calculateDensity().
Definition: density.cu:74
SPH::Kernel::Launch::fixedRadiusNN_sharedMemory
real fixedRadiusNN_sharedMemory(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_sharedMemory().
Definition: sph.cu:3113
SPH::Kernel::Launch::calculateSoundSpeed
real calculateSoundSpeed(Particles *particles, Material *materials, int numParticles)
Wrapper for SPH::Kernel::calculateSoundSpeed().
Definition: soundspeed.cu:98
SPH::Kernel::Launch::compTheta
real compTheta(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, Curve::Type curveType)
Wrapper for SPH::Kernel::compTheta().
Definition: sph.cu:3137
SPH::Kernel::Launch::initializeSoundSpeed
real initializeSoundSpeed(Particles *particles, Material *materials, int numParticles)
Wrapper for SPH::Kernel::initializeSoundSpeed().
Definition: soundspeed.cu:93
SPH::Kernel::Launch::insertReceivedParticles
real insertReceivedParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m)
Wrapper for SPH::Kernel::insertReceivedParticles().
Definition: sph.cu:3216
SPH::Kernel::Launch::internalForces
real internalForces(::SPH::SPH_kernel kernel, Material *materials, Tree *tree, Particles *particles, int *interactions, int numRealParticles)
Wrapper for SPH::Kernel::internalForces().
Definition: internal_forces.cu:471
SPH::Kernel::Launch::collectSendIndices_test2
real collectSendIndices_test2(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *particlesCount, integer numParticlesLocal, integer numParticles, integer treeIndex, int currentProc, Curve::Type curveType)
Definition: sph.cu:3180
SPH::Kernel::Launch::fixedRadiusNN_bruteForce
real fixedRadiusNN_bruteForce(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_bruteForce().
Definition: sph.cu:3087
SPH::Kernel::Launch::fixedRadiusNN
real fixedRadiusNN(Tree *tree, Particles *particles, integer *interactions, real radius, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN().
Definition: sph.cu:3094
SPH::Kernel::Launch::fixedRadiusNN_withinBox
real fixedRadiusNN_withinBox(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_withinBox().
Definition: sph.cu:3103
SPH::Kernel::Launch::fixedRadiusNN_variableSML
real fixedRadiusNN_variableSML(Material *materials, Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_variableSML().
Definition: sph.cu:3128
SPH::Kernel::Launch::calculatePressure
real calculatePressure(Material *materials, Particles *particles, int numParticles)
Wrapper for SPH::Kernel::calculatePressure().
Definition: pressure.cu:80
SPH::Kernel::symbolicForce_test2
__global__ void symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
Definition: sph.cu:1401
SPH::Kernel::symbolicForce_test
__global__ void symbolicForce_test(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, integer relevantIndexOld, Curve::Type curveType)
Definition: sph.cu:1271
SubDomainKeyTreeNS::Kernel::Launch::particlesPerProcess
real particlesPerProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, Curve::Type curveType=Curve::lebesgue)
Wrapper for SubDomainKeyTreeNS::Kernel::particlesPerProcess().
Definition: subdomain.cu:1572
SubDomainKeyTreeNS::Kernel::Launch::updateLowestDomainListNodes
real updateLowestDomainListNodes(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Wrapper for SubDomainKeyTreeNS::Kernel::updateLowestDomainListNodes().
Definition: subdomain.cu:1605
SubDomainKeyTreeNS::Kernel::Launch::compDomainListPseudoParticlesPerLevel
real compDomainListPseudoParticlesPerLevel(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int level)
Wrapper for SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticlesPerLevel().
Definition: subdomain.cu:1628
SubDomainKeyTreeNS::Kernel::Launch::prepareLowestDomainExchange
real prepareLowestDomainExchange(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Wrapper for SubDomainKeyTreeNS::Kernel::prepareLowestDomainExchange().
Definition: subdomain.cu:1594
SubDomainKeyTreeNS::Kernel::Launch::zeroDomainListNodes
real zeroDomainListNodes(Particles *particles, DomainList *domainList, DomainList *lowestDomainList)
Wrapper for SubDomainKeyTreeNS::Kernel::zeroDomainListNodes().
Definition: subdomain.cu:1587
SubDomainKeyTreeNS::Kernel::Launch::buildDomainTree
real buildDomainTree(Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m)
Wrapper for SubDomainKeyTreeNS::Kernel::buildDomainTree().
Definition: subdomain.cu:1550
SubDomainKeyTreeNS::Kernel::Launch::createKeyHistRanges
real createKeyHistRanges(Helper *helper, integer bins)
Definition: subdomain.cu:1650
SubDomainKeyTreeNS::Kernel::Launch::repairTree
real repairTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m, Curve::Type curveType)
Wrapper for SubDomainKeyTreeNS::Kernel::repairTree().
Definition: subdomain.cu:1642
SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys
real getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Wrapper for SubDomainKeyTreeNS::Kernel::getParticleKeys().
Definition: subdomain.cu:1564
SubDomainKeyTreeNS::Kernel::Launch::keyHistCounter
real keyHistCounter(Tree *tree, Particles *particles, SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
Definition: subdomain.cu:1655
SubDomainKeyTreeNS::Kernel::Launch::markParticlesProcess
real markParticlesProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, integer *sortArray, Curve::Type curveType=Curve::lebesgue)
Wrapper for ::SubDomainKeyTreeNS::Kernel::markParticlesPerProcess().
Definition: subdomain.cu:1579
SubDomainKeyTreeNS::Kernel::Launch::compLowestDomainListNodes
real compLowestDomainListNodes(Tree *tree, Particles *particles, DomainList *lowestDomainList)
Wrapper for SubDomainKeyTreeNS::Kernel::compLowestDomainListNodes().
Definition: subdomain.cu:1616
SubDomainKeyTreeNS::Kernel::Launch::calculateNewRange
real calculateNewRange(SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
Definition: subdomain.cu:1662
TreeNS::Kernel::Launch::calculateCentersOfMass
real calculateCentersOfMass(Tree *tree, Particles *particles, integer n, integer level, bool time=false)
Definition: tree.cu:1944
TreeNS::Kernel::Launch::prepareSorting
real prepareSorting(Tree *tree, Particles *particles, integer n, integer m)
Definition: tree.cu:1939
TreeNS::Kernel::Launch::globalCOM
real globalCOM(Tree *tree, Particles *particles, real com[DIM])
Wrapper for TreeNS::Kernel::globalCOM()
Definition: tree.cu:1974
TreeNS::Kernel::Launch::computeBoundingBox
real computeBoundingBox(Tree *tree, Particles *particles, integer *mutex, integer n, integer blockSize, bool time=false)
Wrapper for TreeNS::Kernel::computeBoundingBox()
Definition: tree.cu:1949
TreeNS::Kernel::Launch::buildTree
real buildTree(Tree *tree, Particles *particles, integer n, integer m, bool time=false)
Wrapper for TreeNS::Kernel::buildTree()
Definition: tree.cu:1934
TreeNS::Kernel::centerOfMass
__global__ void centerOfMass(Tree *tree, Particles *particles, integer n)
Definition: tree.cu:1501
cuda::math::sqrt
__device__ real sqrt(real a)
Square root of a floating point value.
Definition: cuda_utilities.cu:456
cuda::math::abs
__device__ real abs(real a)
Absolute value of a floating point value.
Definition: cuda_utilities.cu:448
cuda::math::max
__device__ real max(real a, real b)
Maximum value out of two floating point values.
Definition: cuda_utilities.cu:431
cuda::copy
void copy(T *h_var, T *d_var, std::size_t count=1, To::Target copyTo=To::device)
Copy between host and device and vice-versa.
Definition: cuda_runtime.h:30
cuda::set
void set(T *d_var, T val, std::size_t count=1)
Set device memory to a specific value.
Definition: cuda_runtime.h:56
cuda::free
void free(T *d_var)
Free device memory.
Definition: cuda_runtime.h:81
cuda::malloc
void malloc(T *&d_var, std::size_t count)
Allocate device memory.
Definition: cuda_runtime.h:70
mpi::messageLengths
void messageLengths(SubDomainKeyTree *subDomainKeyTree, T *toSend, T *toReceive)
Send array with length of number of MPI processes across processes.
Definition: subdomain_handler.h:138
real
double real
Definition: parameter.h:15
MAX_NUM_INTERACTIONS
#define MAX_NUM_INTERACTIONS
Definition: parameter.h:106
keyType
unsigned long keyType
Definition: parameter.h:18
integer
int integer
Definition: parameter.h:17
MAX_LEVEL
#define MAX_LEVEL
Definition: parameter.h:25
KEY_MAX
#define KEY_MAX
Definition: parameter.h:102
DIM
#define DIM
Dimension of the problem.
Definition: parameter.h:38
idInteger
int idInteger
Definition: parameter.h:19
Curve::Type
Type
Definition: parameter.h:206
Curve::lebesgue
@ lebesgue
Definition: parameter.h:207
Entry::mass
@ mass
Definition: parameter.h:263
Entry::y
@ y
Definition: parameter.h:258
Entry::x
@ x
Definition: parameter.h:256
Entry::z
@ z
Definition: parameter.h:260
Execution::device
@ device
Definition: parameter.h:193
Reduction::max
@ max
Definition: helper.cuh:14
SimulationParameters
Definition: parameter.h:122
SimulationParameters::calculateAngularMomentum
bool calculateAngularMomentum
Definition: parameter.h:154
SimulationParameters::loadBalancing
bool loadBalancing
apply load balancing
Definition: parameter.h:131
SimulationParameters::logDirectory
std::string logDirectory
log file(s) directory
Definition: parameter.h:124
SimulationParameters::theta
real theta
clumping parameter/
Definition: parameter.h:142
SimulationParameters::maxTimeStep
real maxTimeStep
max (allowed) time step
Definition: parameter.h:129
SimulationParameters::removeParticlesDimension
real removeParticlesDimension
Definition: parameter.h:152
SimulationParameters::smoothingKernelSelection
int smoothingKernelSelection
Definition: parameter.h:147
SimulationParameters::timeEnd
real timeEnd
end time of simulation
Definition: parameter.h:130
SimulationParameters::materialConfigFile
std::string materialConfigFile
input file containing material configurations/parameters
Definition: parameter.h:135
SimulationParameters::directory
std::string directory
output file(s) directory
Definition: parameter.h:123
SimulationParameters::domainListSize
int domainListSize
Definition: parameter.h:158
SimulationParameters::particlesSent2H5
bool particlesSent2H5
log particles sent to HDF5 file
Definition: parameter.h:138
SimulationParameters::calculateCenterOfMass
bool calculateCenterOfMass
Definition: parameter.h:156
SimulationParameters::inputFile
std::string inputFile
input file containing initial particle distribution
Definition: parameter.h:134
SimulationParameters::smoothing
real smoothing
Definition: parameter.h:143
SimulationParameters::sphFixedRadiusNNVersion
int sphFixedRadiusNNVersion
Definition: parameter.h:148
SimulationParameters::sfcSelection
int sfcSelection
space-filling curve selection
Definition: parameter.h:139
SimulationParameters::timeStep
real timeStep
time step
Definition: parameter.h:128
SimulationParameters::gravityForceVersion
int gravityForceVersion
Definition: parameter.h:144
SimulationParameters::removeParticlesCriterion
int removeParticlesCriterion
Definition: parameter.h:151
SimulationParameters::particleMemoryContingent
real particleMemoryContingent
Definition: parameter.h:157
SimulationParameters::calculateEnergy
bool calculateEnergy
Definition: parameter.h:155
SimulationParameters::removeParticles
bool removeParticles
Definition: parameter.h:150
Smoothing::Kernel
Kernel
Definition: parameter.h:178
To::device
@ device
Definition: parameter.h:165
To::host
@ host
Definition: parameter.h:165

milupHPC - src/miluphpc.cpp Source File
Generated on Wed Aug 31 2022 12:16:53 by Doxygen 1.9.3