/*---------------------------------------------------------------------------
 * GPUE v1.0 — GPU Gross-Pitaevskii Equation numerical solver for
 * Bose-Einstein condensates.
 *
 * operators.cu: host/device routines for building operators and fields:
 * curl utilities, momentum-space grids, gauge fields, trapping potentials,
 * and initial wavefunctions.
 *---------------------------------------------------------------------------*/
1 #include "../include/operators.h"
2 #include "../include/kernels.h"
3 #include "../include/dynamic.h"
4 
// Signum: -1 for negative x, 0 for exactly zero, +1 otherwise
// (NaN therefore maps to +1, matching the original branch order).
double sign(double x){
    return (x < 0) ? -1.0 : ((x == 0) ? 0.0 : 1.0);
}
16 
17 // Function to take the curl of Ax and Ay in 2d
18 // note: This is on the cpu, there should be a GPU version too.
19 double *curl2d(Grid &par, double *Ax, double *Ay){
20  int xDim = par.ival("xDim");
21  int yDim = par.ival("yDim");
22 
23  int size = sizeof(double) * xDim * yDim;
24  double *curl;
25  curl = (double *)malloc(size);
26 
27  int index;
28 
29  // Note: To take the curl, we need a change in x and y to create a dx or dy
30  // For this reason, we have added yDim to y and 1 to x
31  for (int i = 0; i < xDim; i++){
32  for (int j = 0; j < yDim-1; j++){
33  index = j + yDim * i;
34  curl[index] = (Ay[index] - Ay[index+xDim])
35  - (Ax[index] - Ax[index+1]);
36  }
37  }
38 
39  return curl;
40 }
41 
42 double *curl3d_r(Grid &par, double *Bx, double *By){
43  int xDim = par.ival("xDim");
44  int yDim = par.ival("yDim");
45  int zDim = par.ival("zDim");
46 
47  int size = sizeof(double) * xDim * yDim * zDim;
48  double *curl;
49  curl = (double *)malloc(size);
50 
51  for (int i = 0; i < xDim*yDim*zDim; ++i){
52  curl[i] = sqrt(Bx[i]*Bx[i] + By[i] * By[i]);
53  }
54 
55  return curl;
56 }
57 
58 double *curl3d_phi(Grid &par, double *Bx, double *By){
59  int xDim = par.ival("xDim");
60  int yDim = par.ival("yDim");
61  int zDim = par.ival("zDim");
62 
63  int size = sizeof(double) * xDim * yDim * zDim;
64  double *curl;
65  curl = (double *)malloc(size);
66 
67  for (int i = 0; i < xDim*yDim*zDim; ++i){
68  curl[i] = atan2(By[i], Bx[i]);
69  }
70 
71  return curl;
72 }
73 
74 
75 // Function to take the curl of Ax and Ay in 2d
76 // note: This is on the cpu, there should be a GPU version too.
77 // Not complete yet!
78 double *curl3d_x(Grid &par, double *Ax, double *Ay, double *Az){
79  int xDim = par.ival("xDim");
80  int yDim = par.ival("yDim");
81  int zDim = par.ival("zDim");
82 
83  int size = sizeof(double) * xDim * yDim * zDim;
84  double *curl;
85  curl = (double *)malloc(size);
86 
87  int index;
88 
89  // Note: To take the curl, we need a change in x and y to create a dx or dy
90  // For this reason, we have added yDim to y and 1 to x
91  for (int i = 0; i < xDim; i++){
92  for (int j = 0; j < yDim-1; j++){
93  for (int k = 0; k < zDim - 1; k++){
94  index = k + zDim * j + zDim * yDim * i;
95  curl[index] = (Az[index] - Az[index + xDim])
96  -(Ay[index] - Ay[index + xDim*yDim]);
97  }
98  }
99  }
100 
101  return curl;
102 }
103 
104 // Function to take the curl of Ax and Ay in 2d
105 // note: This is on the cpu, there should be a GPU version too.
106 // Not complete yet!
107 double *curl3d_y(Grid &par, double *Ax, double *Ay, double *Az){
108  int xDim = par.ival("xDim");
109  int yDim = par.ival("yDim");
110  int zDim = par.ival("zDim");
111 
112  int size = sizeof(double) * xDim * yDim * zDim;
113  double *curl;
114  curl = (double *)malloc(size);
115 
116  int index;
117 
118  // Note: To take the curl, we need a change in x and y to create a dx or dy
119  // For this reason, we have added yDim to y and 1 to x
120  for (int i = 0; i < xDim-1; i++){
121  for (int j = 0; j < yDim; j++){
122  for (int k = 0; k < zDim - 1; k++){
123  index = k + zDim * j + zDim * yDim * i;
124  curl[index] = -(Az[index] - Az[index + 1])
125  -(Ax[index] - Ax[index + xDim*yDim]);
126  }
127  }
128  }
129 
130  return curl;
131 }
132 
133 // Function to take the curl of Ax and Ay in 2d
134 // note: This is on the cpu, there should be a GPU version too.
135 // Not complete yet!
136 double *curl3d_z(Grid &par, double *Ax, double *Ay, double *Az){
137  int xDim = par.ival("xDim");
138  int yDim = par.ival("yDim");
139  int zDim = par.ival("zDim");
140 
141  int size = sizeof(double) * xDim * yDim * zDim;
142  double *curl;
143  curl = (double *)malloc(size);
144 
145  int index;
146 
147  // Note: To take the curl, we need a change in x and y to create a dx or dy
148  // For this reason, we have added yDim to y and 1 to x
149  for (int i = 0; i < xDim - 1; i++){
150  for (int j = 0; j < yDim-1; j++){
151  for (int k = 0; k < zDim; k++){
152  index = k + zDim * j + zDim * yDim * i;
153  curl[index] = (Ay[index] - Ay[index + 1])
154  -(Ax[index] - Ax[index + xDim]);
155  }
156  }
157  }
158 
159  return curl;
160 }
161 
// Function to check whether a file exists.
// Re-prompts on stdin until the user supplies a path that stat() can see,
// then returns that path.
std::string filecheck(std::string filename){

    struct stat buffer = {0};
    while (stat(filename.c_str(), &buffer) == -1){
        std::cout << "File " << filename << " does not exist!" << '\n';
        std::cout << "Please select a new file:" << '\n';
        std::cin >> filename;
    }

    return filename;
}
175 
// Function to read a gauge field component (Ax/Ay/Az) from file.
// Reads whitespace-separated doubles from `filename` and stores each value,
// scaled by `omega`, into consecutive entries of A. The caller must ensure
// A is large enough for the file's contents.
// Note that this comes with a special method in init...
void file_A(std::string filename, double *A, double omega){
    std::fstream infile(filename, std::ios_base::in);

    double value;
    int idx = 0;
    while (infile >> value){
        A[idx++] = omega * value;
    }
}
188 
189 /*----------------------------------------------------------------------------//
190 * GPU KERNELS
191 *-----------------------------------------------------------------------------*/
192 
// Function to generate momentum grids.
// Builds the host-side position grids (x, y, z) and the matching FFT-ordered
// momentum grids (px, py, pz), stores them in par, then mirrors each array on
// the GPU and stores the device pointers under "<name>_gpu".
// Unused dimensions (y/z in 1D, z in 2D) are filled with zeros.
void generate_p_space(Grid &par){

    int dimnum = par.ival("dimnum");
    int xDim = par.ival("xDim");
    int yDim = par.ival("yDim");
    int zDim = par.ival("zDim");
    double xMax = par.dval("xMax");
    // y/z extents only exist in par for higher-dimensional runs.
    double yMax = 0;
    if (dimnum > 1){
        yMax = par.dval("yMax");
    }
    double zMax = 0;
    if (dimnum == 3){
        zMax = par.dval("zMax");
    }

    double pxMax = par.dval("pxMax");
    double pyMax = 0;
    if (dimnum > 1){
        pyMax = par.dval("pyMax");
    }
    double pzMax = 0;
    if (dimnum == 3){
        pzMax = par.dval("pzMax");
    }

    double dx = par.dval("dx");
    double dy = 0;
    if (dimnum > 1){
        dy = par.dval("dy");
    }
    double dz = 0;
    if (dimnum == 3){
        dz = par.dval("dz");
    }

    double dpx = par.dval("dpx");
    double dpy = 0;
    if (dimnum > 1){
        dpy = par.dval("dpy");
    }
    double dpz = 0;
    if (dimnum == 3){
        dpz = par.dval("dpz");
    }

    double *x, *y, *z, *px, *py, *pz,
           *x_gpu, *y_gpu, *z_gpu,
           *px_gpu, *py_gpu, *pz_gpu;

    x = (double *) malloc(sizeof(double) * xDim);
    y = (double *) malloc(sizeof(double) * yDim);
    z = (double *) malloc(sizeof(double) * zDim);
    px = (double *) malloc(sizeof(double) * xDim);
    py = (double *) malloc(sizeof(double) * yDim);
    pz = (double *) malloc(sizeof(double) * zDim);

    if (dimnum == 2){

        // Position grid covers [-Max, 0) in the first half and [0, Max) in
        // the second; the momentum grid is in FFT order (non-negative
        // frequencies first, then the negative ones).
        for(int i=0; i<xDim/2; ++i){
            x[i] = -xMax + i*dx;
            x[i + (xDim/2)] = i*dx;

            px[i] = i*dpx;
            px[i + (xDim/2)] = -pxMax + i*dpx;

        }
        for(int i=0; i<yDim/2; ++i){
            y[i] = -yMax + i*dy;
            y[i + (yDim/2)] = i*dy;

            py[i] = i*dpy;
            py[i + (yDim/2)] = -pyMax + i*dpy;

        }

        // z is unused in 2D; keep it zeroed so downstream kernels see 0.
        for(int i = 0; i < zDim; ++i){
            z[i] = 0;
            pz[i] = 0;
        }

    }
    else if(dimnum == 3){
        for(int i=0; i<xDim/2; ++i){
            x[i] = -xMax + i*dx;
            x[i + (xDim/2)] = i*dx;

            px[i] = i*dpx;
            px[i + (xDim/2)] = -pxMax + i*dpx;

        }
        for(int i=0; i<yDim/2; ++i){
            y[i] = -yMax + i*dy;
            y[i + (yDim/2)] = i*dy;

            py[i] = i*dpy;
            py[i + (yDim/2)] = -pyMax + i*dpy;

        }
        for(int i=0; i<zDim/2; ++i){
            z[i] = -zMax + i*dz;
            z[i + (zDim/2)] = i*dz;

            pz[i] = i*dpz;
            pz[i + (zDim/2)] = -pzMax + i*dpz;

        }

    }
    else if (dimnum == 1){
        for(int i=0; i<xDim/2; ++i){
            x[i] = -xMax + i*dx;
            x[i + (xDim/2)] = i*dx;

            px[i] = i*dpx;
            px[i + (xDim/2)] = -pxMax + i*dpx;

        }

        // NOTE(review): this loop is bounded by zDim but also writes y/py,
        // which have yDim elements — assumes yDim == zDim (== 1) in 1D;
        // confirm against the 1D initialization path.
        for(int i = 0; i < zDim; ++i){
            z[i] = 0;
            pz[i] = 0;
            y[i] = 0;
            py[i] = 0;
        }

    }
    par.store("x",x);
    par.store("y",y);
    par.store("z",z);
    par.store("px",px);
    par.store("py",py);
    par.store("pz",pz);

    // Now move these items to the gpu
    cudaMalloc((void**) &x_gpu, sizeof(double) * xDim);
    cudaMalloc((void**) &y_gpu, sizeof(double) * yDim);
    cudaMalloc((void**) &z_gpu, sizeof(double) * zDim);
    cudaMalloc((void**) &px_gpu, sizeof(double) * xDim);
    cudaMalloc((void**) &py_gpu, sizeof(double) * yDim);
    cudaMalloc((void**) &pz_gpu, sizeof(double) * zDim);

    cudaMemcpy(x_gpu, x, sizeof(double)*xDim, cudaMemcpyHostToDevice);
    cudaMemcpy(y_gpu, y, sizeof(double)*yDim, cudaMemcpyHostToDevice);
    cudaMemcpy(z_gpu, z, sizeof(double)*zDim, cudaMemcpyHostToDevice);
    cudaMemcpy(px_gpu, px, sizeof(double)*xDim, cudaMemcpyHostToDevice);
    cudaMemcpy(py_gpu, py, sizeof(double)*yDim, cudaMemcpyHostToDevice);
    cudaMemcpy(pz_gpu, pz, sizeof(double)*zDim, cudaMemcpyHostToDevice);

    par.store("x_gpu",x_gpu);
    par.store("y_gpu",y_gpu);
    par.store("z_gpu",z_gpu);
    par.store("px_gpu",px_gpu);
    par.store("py_gpu",py_gpu);
    par.store("pz_gpu",pz_gpu);
}
350 
351 // This function is basically a wrapper to call the appropriate K kernel
352 void generate_K(Grid &par){
353 
354  // For k, we need xp, yp, and zp. These will also be used in generating
355  // pAxyz parameters, so it should already be stored in par.
356  double *px_gpu = par.dsval("px_gpu");
357  double *py_gpu = par.dsval("py_gpu");
358  double *pz_gpu = par.dsval("pz_gpu");
359  double gSize = par.ival("gSize");
360  double mass = par.dval("mass");
361 
362  // Creating K to work with
363  double *K, *K_gpu;
364  K = (double*)malloc(sizeof(double)*gSize);
365  cudaMalloc((void**) &K_gpu, sizeof(double)*gSize);
366 
367  simple_K<<<par.grid, par.threads>>>(px_gpu, py_gpu, pz_gpu, mass, K_gpu);
368 
369  cudaMemcpy(K, K_gpu, sizeof(double)*gSize, cudaMemcpyDeviceToHost);
370  par.store("K",K);
371  par.store("K_gpu",K_gpu);
372 
373 }
374 
// Simple kernel for generating K.
// Computes the free-particle kinetic term (hbar^2/2m)(px^2 + py^2 + pz^2)
// at each grid point. xid/yid/zid index the 1D momentum arrays along each
// axis; gid is the flattened 3D output index from getGid3d3d().
__global__ void simple_K(double *xp, double *yp, double *zp, double mass,
                         double *K){

    unsigned int gid = getGid3d3d();
    unsigned int xid = blockDim.x*blockIdx.x + threadIdx.x;
    unsigned int yid = blockDim.y*blockIdx.y + threadIdx.y;
    unsigned int zid = blockDim.z*blockIdx.z + threadIdx.z;
    K[gid] = (HBAR*HBAR/(2*mass))*(xp[xid]*xp[xid] + yp[yid]*yp[yid]
                                   + zp[zid]*zp[zid]);
}
386 
// Function to generate gauge fields (Ax, Ay, Az).
// Fills the gauge-field arrays from one of three sources: text files
// (par.Afn == "file"), a user-supplied equation AST evaluated on the GPU
// (par.is_ast_gpu), or the built-in A-field kernels held in par
// (par.Ax_fn / Ay_fn / Az_fn). Host and device copies are stored in par
// as "Ax"/"Ax_gpu", "Ay"/"Ay_gpu", "Az"/"Az_gpu".
void generate_gauge(Grid &par){

    int gSize = par.ival("gSize");
    int dimnum = par.ival("dimnum");

    double *Ax, *Ay, *Az, *Ax_gpu, *Ay_gpu, *Az_gpu;
    double *x_gpu = par.dsval("x_gpu");
    double *y_gpu = par.dsval("y_gpu");
    double *z_gpu = par.dsval("z_gpu");

    double xMax = par.dval("xMax");
    double yMax = par.dval("yMax");
    double zMax = 1;
    if (dimnum == 3){
        zMax = par.dval("zMax");
    }
    double omegaX = par.dval("omegaX");
    double omegaY = par.dval("omegaY");
    // NOTE(review): omegaZ stays uninitialized when dimnum != 3 but is still
    // passed to the kernels below — confirm the 1D/2D kernels ignore it.
    double omegaZ;
    if (dimnum == 3){
        omegaZ = par.dval("omegaZ");
    }
    double omega = par.dval("omega");
    double fudge = par.dval("fudge");

    Ax = (double *)malloc(sizeof(double)*gSize);
    Ay = (double *)malloc(sizeof(double)*gSize);
    Az = (double *)malloc(sizeof(double)*gSize);

    cudaMalloc((void**) &Ax_gpu, sizeof(double)*gSize);
    cudaMalloc((void**) &Ay_gpu, sizeof(double)*gSize);
    cudaMalloc((void**) &Az_gpu, sizeof(double)*gSize);

    if (par.Afn == "file"){
        // Read each component from its file (scaled by omega in file_A) and
        // push it to the device. Only the components that exist for this
        // dimensionality are read.
        file_A(par.Axfile, Ax, omega);
        cudaMemcpy(Ax_gpu, Ax, sizeof(double)*gSize, cudaMemcpyHostToDevice);

        if (dimnum > 1){
            file_A(par.Ayfile, Ay, omega);
            cudaMemcpy(Ay_gpu,Ay,sizeof(double)*gSize,cudaMemcpyHostToDevice);
        }

        if (dimnum == 3){
            file_A(par.Azfile, Az, omega);
            cudaMemcpy(Az_gpu,Az,sizeof(double)*gSize,cudaMemcpyHostToDevice);
        }

        std::cout << "finished reading Ax / Ay / Az from file" << '\n';
    }
    else{
        // Ax: either evaluate the user's AST on a regular grid derived from
        // dx/dy/dz and the extents, or launch the built-in kernel.
        if (par.is_ast_gpu("Ax")){
            double dx = par.dval("dx");
            double dy = par.dval("dy");
            double dz = par.dval("dz");
            double xMax = par.dval("xMax");
            double yMax = par.dval("yMax");
            double zMax = 0;
            if (dimnum == 3){
                zMax = par.dval("zMax");
            }

            EqnNode_gpu *eqn = par.astval("Ax");

            find_field<<<par.grid, par.threads>>>(Ax_gpu, dx, dy, dz,
                                                  xMax, yMax, zMax, 0, eqn);
        }
        else{
            par.Ax_fn<<<par.grid, par.threads>>>(x_gpu, y_gpu, z_gpu,
                                                 xMax, yMax, zMax,
                                                 omegaX, omegaY, omegaZ,
                                                 omega, fudge, Ax_gpu);
        }
        // Ay: same AST-or-kernel choice as Ax.
        if (par.is_ast_gpu("Ay")){
            double dx = par.dval("dx");
            double dy = par.dval("dy");
            double dz = par.dval("dz");
            double xMax = par.dval("xMax");
            double yMax = par.dval("yMax");
            double zMax = 0;
            if (dimnum == 3){
                zMax = par.dval("zMax");
            }

            EqnNode_gpu *eqn = par.astval("Ay");

            find_field<<<par.grid, par.threads>>>(Ay_gpu, dx, dy, dz,
                                                  xMax, yMax, zMax , 0, eqn);
        }
        else{
            par.Ay_fn<<<par.grid, par.threads>>>(x_gpu, y_gpu, z_gpu,
                                                 xMax, yMax, zMax,
                                                 omegaX, omegaY, omegaZ,
                                                 omega, fudge, Ay_gpu);
        }
        // Az: only meaningful in 3D; otherwise zero it with kconstant_A.
        if (dimnum == 3){
            if (par.is_ast_gpu("Az")){
                double dx = par.dval("dx");
                double dy = par.dval("dy");
                double dz = par.dval("dz");

                double xMax = par.dval("xMax");
                double yMax = par.dval("yMax");
                double zMax = 0;
                if (dimnum == 3){
                    zMax = par.dval("zMax");
                }

                EqnNode_gpu *eqn = par.astval("Az");

                find_field<<<par.grid, par.threads>>>(Az_gpu, dx, dy, dz,
                                                      xMax, yMax, zMax,
                                                      0, eqn);
            }
            else{
                par.Az_fn<<<par.grid, par.threads>>>(x_gpu, y_gpu, z_gpu,
                                                     xMax, yMax, zMax,
                                                     omegaX, omegaY, omegaZ,
                                                     omega, fudge, Az_gpu);
            }
        }
        else{
            kconstant_A<<<par.grid, par.threads>>>(x_gpu, y_gpu, z_gpu,
                                                   xMax, yMax, zMax,
                                                   omegaX, omegaY, omegaZ,
                                                   omega, fudge, Az_gpu);
        }
    }
    // Mirror the final device fields back to the host and store both copies.
    cudaMemcpy(Ax, Ax_gpu, sizeof(double)*gSize,cudaMemcpyDeviceToHost);
    cudaMemcpy(Ay, Ay_gpu, sizeof(double)*gSize,cudaMemcpyDeviceToHost);
    cudaMemcpy(Az, Az_gpu, sizeof(double)*gSize,cudaMemcpyDeviceToHost);

    par.store("Ax", Ax);
    par.store("Ay", Ay);
    par.store("Az", Az);

    par.store("Ax_gpu", Ax_gpu);
    par.store("Ay_gpu", Ay_gpu);
    par.store("Az_gpu", Az_gpu);

}
528 
// constant Kernel A
// Writes A = 0 everywhere; used to blank unused gauge components (e.g. Az
// in 1D/2D). All parameters except A are ignored — the signature matches
// the other A-field kernels so it can be stored in the same function slot.
__global__ void kconstant_A(double *x, double *y, double *z,
                            double xMax, double yMax, double zMax,
                            double omegaX, double omegaY, double omegaZ,
                            double omega, double fudge, double *A){
    int gid = getGid3d3d();
    A[gid] = 0;
}
537 
// Kernel for simple rotational case, Ax.
// Ax = -y * omega * omegaX, the x component of a rigid-rotation gauge field.
__global__ void krotation_Ax(double *x, double *y, double *z,
                             double xMax, double yMax, double zMax,
                             double omegaX, double omegaY, double omegaZ,
                             double omega, double fudge, double *A){
    int gid = getGid3d3d();
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    A[gid] = -y[yid] * omega * omegaX;
}
547 
// Kernel for simple rotational case, Ay.
// Ay = +x * omega * omegaY, the y component of a rigid-rotation gauge field.
__global__ void krotation_Ay(double *x, double *y, double *z,
                             double xMax, double yMax, double zMax,
                             double omegaX, double omegaY, double omegaZ,
                             double omega, double fudge, double *A){
    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    A[gid] = x[xid] * omega * omegaY;
}
557 
// Kernel for ring rotation case, Ax.
// Ax = (z + zMax) * cos(theta) * omega * omegaX, where theta is the
// azimuthal angle in the xy plane; the (z + zMax) factor shifts z to be
// non-negative over the grid.
__global__ void kring_rotation_Ax(double *x, double *y, double *z,
                                  double xMax, double yMax, double zMax,
                                  double omegaX, double omegaY, double omegaZ,
                                  double omega, double fudge, double *A){
    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    int zid = blockDim.z*blockIdx.z + threadIdx.z;
    double theta = atan2(y[yid],x[xid]);
    A[gid] = (z[zid]+zMax)*cos(theta)*omega*omegaX;
}
570 
// Kernel for ring rotation case, Ay.
// Ay = (z + zMax) * sin(theta) * omega * omegaX.
// NOTE(review): scales by omegaX (not omegaY), the same as the Ax and Az
// ring-rotation kernels — presumably a single rotation rate is intended;
// confirm before changing.
__global__ void kring_rotation_Ay(double *x, double *y, double *z,
                                  double xMax, double yMax, double zMax,
                                  double omegaX, double omegaY, double omegaZ,
                                  double omega, double fudge, double *A){
    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    int zid = blockDim.z*blockIdx.z + threadIdx.z;
    double theta = atan2(y[yid],x[xid]);
    A[gid] = (z[zid]+zMax)*sin(theta)*omega*omegaX;
}
583 
// Kernel for ring rotation case, Az.
// Az = r * omega * omegaX, where r is the cylindrical radius in the
// xy plane. Note z is unused here.
__global__ void kring_rotation_Az(double *x, double *y, double *z,
                                  double xMax, double yMax, double zMax,
                                  double omegaX, double omegaY, double omegaZ,
                                  double omega, double fudge, double *A){
    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    double r = sqrt(x[xid]*x[xid] + y[yid]*y[yid]);
    A[gid] = r*omega*omegaX;
}
595 
596 
// kernel for a simple vortex ring
// Az = 0.01 * omega * exp(-r^2 / (0.0001 * xMax)), a narrow Gaussian bump
// centered on the z axis (r is the cylindrical radius in the xy plane).
// The 0.0001*xMax width and 0.01 amplitude are hard-coded tuning constants.
__global__ void kring_Az(double *x, double *y, double *z,
                         double xMax, double yMax, double zMax,
                         double omegaX, double omegaY, double omegaZ,
                         double omega, double fudge, double *A){
    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;

    double rad = sqrt(x[xid]*x[xid] + y[yid]*y[yid]);

    A[gid] = omega * exp(-rad*rad / (0.0001*xMax)) * 0.01;
}
610 
// testing kernel Ax
// Ax = (sin(100000*y) + 1) * yMax * omega — a rapidly oscillating,
// non-negative test field along y.
__global__ void ktest_Ax(double *x, double *y, double *z,
                         double xMax, double yMax, double zMax,
                         double omegaX, double omegaY, double omegaZ,
                         double omega, double fudge, double *A){
    int gid = getGid3d3d();
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    A[gid] = (sin(y[yid] * 100000)+1) * yMax * omega;
}
620 
// testing kernel Ay
// Ay = 0 everywhere; pairs with ktest_Ax for the test gauge configuration.
__global__ void ktest_Ay(double *x, double *y, double *z,
                         double xMax, double yMax, double zMax,
                         double omegaX, double omegaY, double omegaZ,
                         double omega, double fudge, double *A){
    int gid = getGid3d3d();
    A[gid] = 0;
}
629 
// function to generate V (and the rest of the simulation fields).
// Orchestrates full field setup: builds p-space grids, K, and the gauge
// fields; packs the scalar parameters into the `items` array for the
// device kernels; generates V and the initial wavefunction; then launches
// aux_fields to precompute the split-step propagators (GV/EV, GK/EK,
// GpA*/EpA*). Host copies of everything are stored in par; most device
// buffers are freed at the end (V_gpu is kept when energy_calc is set).
void generate_fields(Grid &par){

    generate_p_space(par);
    generate_K(par);
    generate_gauge(par);

    int gSize = par.ival("gSize");
    int dimnum = par.ival("dimnum");
    // NOTE(review): dval result truncated to int here — confirm "winding"
    // is always stored as an integral value.
    int winding = par.dval("winding");

    bool energy_calc = par.bval("energy_calc");

    double dt = par.dval("dt");
    double gdt = par.dval("gdt");
    double *x_gpu = par.dsval("x_gpu");
    double *y_gpu = par.dsval("y_gpu");
    double *z_gpu = par.dsval("z_gpu");
    double *px_gpu = par.dsval("px_gpu");
    double *py_gpu = par.dsval("py_gpu");
    double *pz_gpu = par.dsval("pz_gpu");
    double *Ax_gpu = par.dsval("Ax_gpu");
    double *Ay_gpu = par.dsval("Ay_gpu");
    double *Az_gpu = par.dsval("Az_gpu");
    double *K_gpu = par.dsval("K_gpu");

    // Creating items list for kernels
    // Layout: [0-2] xMax/yMax/zMax, [3-5] omegaX/Y/Z, [6-8] x0/y0/z0 shifts,
    // [9] mass, [10] gammaY, [11] gammaZ (fixed 1.0), [12] fudge,
    // [13] time (fixed 0.0), [14] Rxy, [15-17] a0x/a0y/a0z.

    double *items, *items_gpu;
    int item_size = 18;
    items = (double*)malloc(sizeof(double)*item_size);
    cudaMalloc((void**) &items_gpu, sizeof(double)*item_size);

    for (int i = 0; i < item_size; ++i){
        items[i] = 0;
    }
    items[0] = par.dval("xMax");
    items[1] = par.dval("yMax");
    if (dimnum == 3){
        items[2] = par.dval("zMax");
    }

    items[3] = par.dval("omegaX");
    items[4] = par.dval("omegaY");
    if (dimnum == 3){
        items[5] = par.dval("omegaZ");
    }

    items[6] = par.dval("x0_shift");
    items[7] = par.dval("y0_shift");
    if (dimnum == 3){
        items[8] = par.dval("z0_shift");
    }
    else{
        items[8] = 0.0;
    }

    items[9] = par.dval("mass");
    items[10] = par.dval("gammaY");
    items[11] = 1.0; // For gammaZ
    items[12] = par.dval("fudge");
    items[13] = 0.0; // For time

    items[14] = par.dval("Rxy");

    items[15] = par.dval("a0x");
    items[16] = par.dval("a0y");
    if (dimnum == 3){
        items[17] = par.dval("a0z");
    }
    else{
        items[17] = 1.0;
    }

    cudaMemcpy(items_gpu, items, sizeof(double)*item_size,
               cudaMemcpyHostToDevice);

    // NOTE(review): `fudge` is never used below (kernels read items[12]);
    // candidate for removal.
    double fudge = par.dval("fudge");

    // Generating V

    double *V, *V_gpu;

    V = (double *)malloc(sizeof(double)*gSize);

    cudaMalloc((void **) &V_gpu, sizeof(double)*gSize);

    // V comes either from a user-supplied AST or the built-in V kernel.
    if (par.is_ast_gpu("V")){
        double dx = par.dval("dx");
        double dy = par.dval("dy");
        double dz = par.dval("dz");

        double xMax = par.dval("xMax");
        double yMax = par.dval("yMax");
        double zMax = 0;
        if (dimnum == 3){
            zMax = par.dval("zMax");
        }

        EqnNode_gpu *eqn = par.astval("V");
        find_field<<<par.grid, par.threads>>>(V_gpu, dx, dy, dz,
                                              xMax, yMax, zMax, 0, eqn);
    }
    else{
        par.V_fn<<<par.grid, par.threads>>>(x_gpu, y_gpu, z_gpu, items_gpu,
                                            Ax_gpu, Ay_gpu, Az_gpu, V_gpu);
    }

    cudaMemcpy(V, V_gpu, sizeof(double)*gSize, cudaMemcpyDeviceToHost);

    // Generating wfc

    double2 *wfc, *wfc_gpu;
    double *phi, *phi_gpu;

    wfc = (double2 *)malloc(sizeof(double2)*gSize);
    phi = (double *)malloc(sizeof(double)*gSize);

    cudaMalloc((void**) &wfc_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &phi_gpu, sizeof(double)*gSize);

    if (par.bval("read_wfc")){
        // NOTE(review): the wfc buffer malloc'd above is overwritten
        // (leaked) by this reassignment — confirm and free first.
        wfc = par.cufftDoubleComplexval("wfc");
        cudaMemcpy(wfc_gpu, wfc, sizeof(double2)*gSize, cudaMemcpyHostToDevice);
    }
    else{
        par.wfc_fn<<<par.grid, par.threads>>>(x_gpu, y_gpu, z_gpu, items_gpu,
                                              winding, phi_gpu, wfc_gpu);
        cudaMemcpy(wfc, wfc_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    }

    cudaMemcpy(phi, phi_gpu, sizeof(double)*gSize, cudaMemcpyDeviceToHost);

    // generating aux fields.
    double2 *GV, *EV, *GK, *EK;
    double2 *GV_gpu, *EV_gpu, *GK_gpu, *EK_gpu;
    double2 *GpAx, *GpAy, *GpAz, *EpAx, *EpAy, *EpAz;
    double2 *GpAx_gpu, *GpAy_gpu, *GpAz_gpu, *EpAx_gpu, *EpAy_gpu, *EpAz_gpu;
    double *pAx, *pAy, *pAz;
    double *pAx_gpu, *pAy_gpu, *pAz_gpu;

    GV = (double2 *)malloc(sizeof(double2)*gSize);
    EV = (double2 *)malloc(sizeof(double2)*gSize);
    GK = (double2 *)malloc(sizeof(double2)*gSize);
    EK = (double2 *)malloc(sizeof(double2)*gSize);

    GpAx = (double2 *)malloc(sizeof(double2)*gSize);
    EpAx = (double2 *)malloc(sizeof(double2)*gSize);
    GpAy = (double2 *)malloc(sizeof(double2)*gSize);
    EpAy = (double2 *)malloc(sizeof(double2)*gSize);
    GpAz = (double2 *)malloc(sizeof(double2)*gSize);
    EpAz = (double2 *)malloc(sizeof(double2)*gSize);

    pAx = (double *)malloc(sizeof(double)*gSize);
    pAy = (double *)malloc(sizeof(double)*gSize);
    pAz = (double *)malloc(sizeof(double)*gSize);

    cudaMalloc((void**) &GV_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &EV_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &GK_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &EK_gpu, sizeof(double2)*gSize);

    cudaMalloc((void**) &GpAx_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &EpAx_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &GpAy_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &EpAy_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &GpAz_gpu, sizeof(double2)*gSize);
    cudaMalloc((void**) &EpAz_gpu, sizeof(double2)*gSize);

    cudaMalloc((void**) &pAx_gpu, sizeof(double)*gSize);
    cudaMalloc((void**) &pAy_gpu, sizeof(double)*gSize);
    cudaMalloc((void**) &pAz_gpu, sizeof(double)*gSize);

    // Precompute imaginary-time (G*) and real-time (E*) propagator factors.
    aux_fields<<<par.grid, par.threads>>>(V_gpu, K_gpu, gdt, dt,
                                          Ax_gpu, Ay_gpu, Az_gpu,
                                          px_gpu, py_gpu, pz_gpu,
                                          pAx_gpu, pAy_gpu, pAz_gpu,
                                          GV_gpu, EV_gpu, GK_gpu, EK_gpu,
                                          GpAx_gpu, GpAy_gpu, GpAz_gpu,
                                          EpAx_gpu, EpAy_gpu, EpAz_gpu);

    cudaMemcpy(GV, GV_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(EV, EV_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(GK, GK_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(EK, EK_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);

    cudaMemcpy(GpAx, GpAx_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(EpAx, EpAx_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(GpAy, GpAy_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(EpAy, EpAy_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(GpAz, GpAz_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(EpAz, EpAz_gpu, sizeof(double2)*gSize, cudaMemcpyDeviceToHost);

    cudaMemcpy(pAx, pAx_gpu, sizeof(double)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(pAy, pAy_gpu, sizeof(double)*gSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(pAz, pAz_gpu, sizeof(double)*gSize, cudaMemcpyDeviceToHost);

    // Storing variables
    // Device scratch buffers are released now that host copies exist.
    cudaFree(items_gpu);
    //cudaFree(phi_gpu);
    cudaFree(GV_gpu);
    cudaFree(EV_gpu);
    cudaFree(GK_gpu);
    cudaFree(EK_gpu);

    cudaFree(pAx_gpu);
    cudaFree(pAy_gpu);
    cudaFree(pAz_gpu);

    cudaFree(GpAx_gpu);
    cudaFree(GpAy_gpu);
    cudaFree(GpAz_gpu);

    cudaFree(EpAx_gpu);
    cudaFree(EpAy_gpu);
    cudaFree(EpAz_gpu);

    cudaFree(Ax_gpu);
    cudaFree(Ay_gpu);
    cudaFree(Az_gpu);

    cudaFree(x_gpu);
    cudaFree(y_gpu);
    cudaFree(z_gpu);

    cudaFree(px_gpu);
    cudaFree(py_gpu);
    cudaFree(pz_gpu);

    // V_gpu (and K_gpu) are kept on-device only if an energy calculation
    // will need them later.
    if (!energy_calc){
        cudaFree(K_gpu);
        cudaFree(V_gpu);
    }
    else{
        par.store("V_gpu",V_gpu);
    }

    par.store("V",V);
    par.store("items", items);
    //par.store("items_gpu", items_gpu);
    par.store("wfc", wfc);
    par.store("wfc_gpu", wfc_gpu);
    par.store("Phi", phi);
    par.store("Phi_gpu", phi_gpu);

    par.store("GV",GV);
    par.store("EV",EV);
    par.store("GK",GK);
    par.store("EK",EK);
    //par.store("GV_gpu",GV_gpu);
    //par.store("EV_gpu",EV_gpu);
    //par.store("GK_gpu",GK_gpu);
    //par.store("EK_gpu",EK_gpu);

    par.store("GpAx",GpAx);
    par.store("EpAx",EpAx);
    par.store("GpAy",GpAy);
    par.store("EpAy",EpAy);
    par.store("GpAz",GpAz);
    par.store("EpAz",EpAz);

    par.store("pAx",pAx);
    par.store("pAy",pAy);
    par.store("pAz",pAz);
    //par.store("pAx_gpu",pAx_gpu);
    //par.store("pAy_gpu",pAy_gpu);
    //par.store("pAz_gpu",pAz_gpu);

}
899 
// Kernel: (an)isotropic harmonic trap plus the gauge-field term:
//   V = (m/2) * [ (wx*(x+x0))^2 + (gy*wy*(y+y0))^2 + (gz*wz*(z+z0))^2
//                 + Ax^2 + Ay^2 + Az^2 ]
// items: [3-5] omegaX/Y/Z, [6-8] x0/y0/z0 shifts, [9] mass, [10] gammaY,
// [11] gammaZ (generate_fields sets it to 1.0).
__global__ void kharmonic_V(double *x, double *y, double *z, double* items,
                            double *Ax, double *Ay, double *Az, double *V){

    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    int zid = blockDim.z*blockIdx.z + threadIdx.z;

    double V_x = items[3]*(x[xid]+items[6]);
    double V_y = items[10]*items[4]*(y[yid]+items[7]);
    double V_z = items[11]*items[5]*(z[zid]+items[8]);

    V[gid] = 0.5*items[9]*((V_x*V_x + V_y*V_y + V_z*V_z)
                           + (Ax[gid]*Ax[gid] + Ay[gid]*Ay[gid]
                              + Az[gid]*Az[gid]));
}
915 
// kernel for simple 3d torus trapping potential
// rad is the signed distance from a ring of radius 0.4*xMax centered at
// (items[6], items[7]) in the xy plane; the trap is harmonic in rad and z
// (with a fudge-scaled rad*z cross term), plus the |A|^2 gauge term.
// items: [0] xMax, [3-5] omegaX/Y/Z, [6-8] shifts, [9] mass, [12] fudge.
__global__ void ktorus_V(double *x, double *y, double *z, double* items,
                         double *Ax, double *Ay, double *Az, double *V){

    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    int zid = blockDim.z*blockIdx.z + threadIdx.z;

    double rad = sqrt((x[xid] - items[6]) * (x[xid] - items[6])
                      + (y[yid] - items[7]) * (y[yid] - items[7]))
                 - 0.4*items[0];
    double omegaR = (items[3]*items[3] + items[4]*items[4]);
    double V_tot = (2*items[5]*items[5]*(z[zid] - items[8])*(z[zid] - items[8])
                    + omegaR*(rad*rad + items[12]*rad*z[zid]));
    V[gid] = 0.5*items[9]*(V_tot
                           + Ax[gid]*Ax[gid]
                           + Ay[gid]*Ay[gid]
                           + Az[gid]*Az[gid]);
}
936 
// Kernel: standard initial wavefunction — a Gaussian envelope with a phase
// winding about the z axis: wfc = exp(-sum(q^2/(Rxy*a0q)^2)) * exp(-i*phi),
// phi = winding * atan2(y, x) (mod 2*PI).
// items: [14] Rxy, [15-17] a0x/a0y/a0z. The phase is also written to `phi`.
__global__ void kstd_wfc(double *x, double *y, double *z, double *items,
                         double winding, double *phi, double2 *wfc){

    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    int zid = blockDim.z*blockIdx.z + threadIdx.z;

    phi[gid] = fmod(winding*atan2(y[yid], x[xid]),2*PI);

    // Real and imaginary parts share the same Gaussian envelope; the
    // -sin(phi) imaginary part makes the phase factor exp(-i*phi).
    wfc[gid].x = exp(-(x[xid]*x[xid]/(items[14]*items[14]*items[15]*items[15])
                       + y[yid]*y[yid]/(items[14]*items[14]*items[16]*items[16])
                       + z[zid]*z[zid]/(items[14]*items[14]*items[17]*items[17])))
                 * cos(phi[gid]);
    wfc[gid].y = -exp(-(x[xid]*x[xid]/(items[14]*items[14]*items[15]*items[15])
                        + y[yid]*y[yid]/(items[14]*items[14]*items[16]*items[16])
                        + z[zid]*z[zid]/(items[14]*items[14]*items[17]*items[17])))
                 * sin(phi[gid]);

}
957 
// Kernel: initial wavefunction for the torus trap — a real Gaussian tube
// centered on a ring of radius 0.5*xMax*fudge in the xy plane (rad is the
// signed distance from that ring), with Gaussian confinement in z.
// items: [0] xMax, [6-7] x0/y0 shifts, [12] fudge, [14] Rxy, [15/17] a0x/a0z.
__global__ void ktorus_wfc(double *x, double *y, double *z, double *items,
                           double winding, double *phi, double2 *wfc){

    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    int zid = blockDim.z*blockIdx.z + threadIdx.z;

    double rad = sqrt((x[xid] - items[6]) * (x[xid] - items[6])
                      + (y[yid] - items[7]) * (y[yid] - items[7]))
                 - 0.5*items[0]*items[12];

    // Purely real initial state (no winding/phase is applied here).
    wfc[gid].x = exp(-( pow((rad)/(items[14]*items[15]*0.5),2) +
                        pow((z[zid])/(items[14]*items[17]*0.5),2) ) );
    wfc[gid].y = 0.0;
}
974 
// Kernel: precomputes the split-step propagator factors.
//   G* : imaginary-time factors, real decaying exponentials in gdt.
//   E* : real-time factors, complex phases exp(-i * op * dt) stored as
//        (cos, sin) pairs.
// V gets a half step (gdt/2, dt/2); K and the pA terms get full steps.
__global__ void aux_fields(double *V, double *K, double gdt, double dt,
                           double* Ax, double *Ay, double* Az,
                           double *px, double *py, double *pz,
                           double* pAx, double* pAy, double* pAz,
                           double2* GV, double2* EV, double2* GK, double2* EK,
                           double2* GpAx, double2* GpAy, double2* GpAz,
                           double2* EpAx, double2* EpAy, double2* EpAz){
    int gid = getGid3d3d();
    int xid = blockDim.x*blockIdx.x + threadIdx.x;
    int yid = blockDim.y*blockIdx.y + threadIdx.y;
    int zid = blockDim.z*blockIdx.z + threadIdx.z;

    GV[gid].x = exp( -V[gid]*(gdt/(2*HBAR)));
    GK[gid].x = exp( -K[gid]*(gdt/HBAR));
    GV[gid].y = 0.0;
    GK[gid].y = 0.0;

    // Ax and Ay will be calculated here but are used only for
    // debugging. They may be needed later for magnetic field calc
    // NOTE(review): pAy is assigned Ax*px and pAx is assigned Ay*py — the
    // names look swapped relative to the products. This may be deliberate
    // (tied to the FFT axis ordering in the evolution code); confirm there
    // before changing.
    pAy[gid] = Ax[gid] * px[xid];
    pAx[gid] = Ay[gid] * py[yid];
    pAz[gid] = Az[gid] * pz[zid];

    // NOTE(review): unlike GV/GK above, no 1/HBAR factor appears in the
    // pA exponents (or the EpA phases below) — confirm the units.
    GpAx[gid].x = exp(-pAx[gid]*gdt);
    GpAx[gid].y = 0;
    GpAy[gid].x = exp(-pAy[gid]*gdt);
    GpAy[gid].y = 0;
    GpAz[gid].x = exp(-pAz[gid]*gdt);
    GpAz[gid].y = 0;

    EV[gid].x=cos( -V[gid]*(dt/(2*HBAR)));
    EV[gid].y=sin( -V[gid]*(dt/(2*HBAR)));
    EK[gid].x=cos( -K[gid]*(dt/HBAR));
    EK[gid].y=sin( -K[gid]*(dt/HBAR));

    EpAz[gid].x=cos(-pAz[gid]*dt);
    EpAz[gid].y=sin(-pAz[gid]*dt);
    EpAy[gid].x=cos(-pAy[gid]*dt);
    EpAy[gid].y=sin(-pAy[gid]*dt);
    EpAx[gid].x=cos(-pAx[gid]*dt);
    EpAx[gid].y=sin(-pAx[gid]*dt);
}
1018 
1019 // Function to generate grids and treads for 2d and 3d cases
1020 void generate_grid(Grid& par){
1021 
1022  int dimnum = par.ival("dimnum");
1023  int xDim = par.ival("xDim");
1024  int yDim = par.ival("yDim");
1025  int zDim = par.ival("zDim");
1026  int xD = 1, yD = 1, zD = 1;
1027  int max_threads = 256;
1028  if (xDim < max_threads){
1029  max_threads = xDim;
1030  }
1031 
1032  if (dimnum == 2){
1033  if (xDim <= max_threads){
1034  par.threads.x = xDim;
1035  par.threads.y = 1;
1036  par.threads.z = 1;
1037 
1038  xD = 1;
1039  yD = yDim;
1040  zD = 1;
1041  }
1042  else{
1043  int count = 0;
1044  int dim_tmp = xDim;
1045  while (dim_tmp > max_threads){
1046  count++;
1047  dim_tmp /= 2;
1048  }
1049 
1050  std::cout << "count is: " << count << '\n';
1051 
1052  par.threads.x = dim_tmp;
1053  par.threads.y = 1;
1054  par.threads.z = 1;
1055  xD = pow(2,count);
1056  yD = yDim;
1057  zD = 1;
1058  }
1059 
1060  }
1061  else if (dimnum == 3){
1062 
1063  if (xDim <= max_threads){
1064  par.threads.x = xDim;
1065  par.threads.y = 1;
1066  par.threads.z = 1;
1067 
1068  xD = 1;
1069  yD = yDim;
1070  zD = zDim;
1071  }
1072  else{
1073  int count = 0;
1074  int dim_tmp = xDim;
1075  while (dim_tmp > max_threads){
1076  count++;
1077  dim_tmp /= 2;
1078  }
1079 
1080  std::cout << "count is: " << count << '\n';
1081 
1082  par.threads.x = dim_tmp;
1083  par.threads.y = 1;
1084  par.threads.z = 1;
1085  xD = pow(2,count);
1086  yD = yDim;
1087  zD = zDim;
1088  }
1089 
1090  }
1091  else if (dimnum == 1){
1092  par.threads.x = xDim;
1093  }
1094  par.grid.x=xD;
1095  par.grid.y=yD;
1096  par.grid.z=zD;
1097 
1098  std::cout << "threads in x are: " << par.threads.x << '\n';
1099  std::cout << "dimensions are: " << par.grid.x << '\t'
1100  << par.grid.y << '\t'
1101  << par.grid.z << '\n';
1102 
1103 }