GPUE v1.0
GPU Gross-Pitaevskii Equation numerical solver for Bose-Einstein condensates
evolution.cu
1 #include "../include/evolution.h"
2 #include "../include/vortex_3d.h"
3 
4 void evolve(Grid &par,
5  int numSteps,
6  unsigned int gstate,
7  std::string buffer){
8 
9  // Re-establishing variables from parsed Grid class
10  std::string data_dir = par.sval("data_dir");
11  int dimnum = par.ival("dimnum");
12  double omega = par.dval("omega");
13  double angle_sweep = par.dval("angle_sweep");
14  double gdt = par.dval("gdt");
15  double dt = par.dval("dt");
16  double omegaX = par.dval("omegaX");
17  double omegaY = par.dval("omegaY");
18  double mass = par.dval("mass");
19  double dx = par.dval("dx");
20  double dy = 1;
21  double dz = 1;
22  double interaction = par.dval("interaction");
23  double laser_power = par.dval("laser_power");
24  double gDenConst = par.dval("gDenConst");
25  double thresh_const = par.dval("thresh_const");
26  double *x = par.dsval("x");
27  double *y;
28  double *V = par.dsval("V");
29  double *Phi = par.dsval("Phi");
30  double2 *gpu1dpAx = par.cufftDoubleComplexval("pAx_gpu");
31  double2 *gpu1dpAy;
32  double2 *gpu1dpAz;
33  double *Phi_gpu = par.dsval("Phi_gpu");
34  bool write_it = par.bval("write_it");
35  bool graph = par.bval("graph");
36  int N = par.ival("atoms");
37  int printSteps = par.ival("printSteps");
38  bool nonlin = par.bval("gpe");
39  bool lz = par.bval("corotating");
40  bool ramp = par.bval("ramp");
41  int ramp_type = par.ival("ramp_type");
42  int xDim = par.ival("xDim");
43  int yDim = 1;
44  int zDim = 1;
45  cufftDoubleComplex *wfc = par.cufftDoubleComplexval("wfc");
46  cufftDoubleComplex *gpuWfc = par.cufftDoubleComplexval("wfc_gpu");
47  cufftDoubleComplex *K_gpu =
48  par.cufftDoubleComplexval("K_gpu");
49  cufftDoubleComplex *V_gpu =
50  par.cufftDoubleComplexval("V_gpu");
51 
52  if (dimnum > 1){
53  dy = par.dval("dy");
54  y = par.dsval("y");
55  gpu1dpAy = par.cufftDoubleComplexval("pAy_gpu");
56  yDim = par.ival("yDim");
57  }
58  if (dimnum > 2){
59  dz = par.dval("dz");
60  gpu1dpAz = par.cufftDoubleComplexval("pAz_gpu");
61  zDim = par.ival("zDim");
62  }
63 
64  int gridSize = xDim * yDim * zDim;
65 
66  // getting data from Cuda class
67  cufftResult result;
68  cufftHandle plan_1d = par.ival("plan_1d");
69  cufftHandle plan_2d = par.ival("plan_2d");
70  cufftHandle plan_other2d = par.ival("plan_other2d");
71  cufftHandle plan_3d = par.ival("plan_3d");
72  cufftHandle plan_dim2 = par.ival("plan_dim2");
73  cufftHandle plan_dim3 = par.ival("plan_dim3");
74  dim3 threads = par.threads;
75  dim3 grid = par.grid;
76 
77  // Because no two operations are created equally:
78  // multiplication is faster than division, so pre-compute the reciprocals.
79  double renorm_factor_nd=1.0/pow(gridSize,0.5);
80  double renorm_factor_1d=1.0/pow(xDim,0.5);
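 // cuFFT transforms are unnormalised; these 1/sqrt(N) factors are applied
 // after every forward and every inverse FFT in the loop below.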
81 
82  clock_t begin, end;
83  double time_spent;
84  double Dt;
85  if(gstate==0){
86  Dt = gdt;
87  printf("Timestep for groundstate solver set as: %E\n",Dt);
88  }
89  else{
90  Dt = dt;
91  printf("Timestep for evolution set as: %E\n",Dt);
92  }
93  begin = clock();
94  double omega_0=omega*omegaX;
95 
96  // ** ############################################################## ** //
97  // ** HERE BE DRAGONS OF THE MOST DANGEROUS KIND! ** //
98  // ** ############################################################## ** //
99 
100  // 2D VORTEX TRACKING
101 
102  double mask_2d = par.dval("mask_2d");
103  int x0_shift = par.dval("x0_shift");
104  int y0_shift = par.dval("y0_shift");
105  int charge = par.ival("charge");
106  int kill_idx = par.ival("kill_idx");
107  cufftDoubleComplex *EV_opt = par.cufftDoubleComplexval("EV_opt");
108  int kick_it = par.ival("kick_it");
109  double *V_opt = par.dsval("V_opt");
110  // Double buffering; the free and calloc operations may later be threaded
111  // to hide their time penalty. Or may not bother.
112  int num_vortices[2] = {0,0};
113 
114  // binary matrix of size xDim*yDim,
115  // 1 for vortex at specified index, 0 otherwise
116  int* vortexLocation;
117  //int* olMaxLocation = (int*) calloc(xDim*yDim,sizeof(int));
118 
119  std::shared_ptr<Vtx::Vortex> central_vortex; //vortex closest to the central position
120  /*
121  central_vortex.coords.x = -1;
122  central_vortex.coords.y = -1;
123  central_vortex.coordsD.x = -1.;
124  central_vortex.coordsD.y = -1.;
125  central_vortex.wind = 0;
126  */
127 
128  // Angle of vortex lattice. Add to optical lattice for alignment.
129  double vort_angle;
130 
131  // array of vortex coordinates from vortexLocation 1's
132  //struct Vtx::Vortex *vortCoords = NULL;
133 
134 
135  std::shared_ptr<Vtx::VtxList> vortCoords = std::make_shared<Vtx::VtxList>(7);
136  //std::vector<std::shared_ptr<Vtx::Vortex> vortCoords;
137 
138  //Previous array of vortex coordinates from vortexLocation 1's
139  //struct Vtx::Vortex *vortCoordsP = NULL;
140  //std::vector<struct Vtx::Vortex> vortCoordsP;
141  std::shared_ptr<Vtx::VtxList> vortCoordsP = std::make_shared<Vtx::VtxList>(7);
142 
143 
144  LatticeGraph::Lattice lattice; //Vortex lattice graph.
145  double* adjMat;
146 
147  // Assuming a triangular lattice at rotation
148 
149 
150  //std::cout << "numSteps is: " << numSteps << '\n';
151  // Iterating through all of the steps in either gsteps or esteps.
152  for(int i=0; i < numSteps; ++i){
153  double time = Dt*i;
154  if (ramp){
155 
156  //Adjusts omega for the appropriate trap frequency.
157  if (ramp_type == 1){
158  if (i == 0){
159  omega_0 = (double)omega;
160  }
161  else{
162  omega_0 = (double)i / (double)(i+1);
163  }
164  }
165  else{
166  if (i == 0){
167  omega_0=(double)omega/(double)(numSteps);
168  }
169  else{
170  omega_0 = (double)(i+1) / (double)i;
171  }
172  }
173  }
174 
175  // Print-out at pre-determined rate.
176  // Vortex & wfc analysis performed here also.
177  if(i % printSteps == 0) {
178  // If the unit_test flag is on, we need a special case
179  printf("Step: %d Omega: %lf\n", i, omega_0);
180  cudaMemcpy(wfc, gpuWfc, sizeof(cufftDoubleComplex)*xDim*yDim*zDim,
181  cudaMemcpyDeviceToHost);
182 
183  // Printing out time of iteration
184  end = clock();
185  time_spent = (double) (end - begin) / CLOCKS_PER_SEC;
186  printf("Time spent: %lf\n", time_spent);
187  std::string fileName = "";
188  //printf("ramp=%d gstate=%d rg=%d \n",
189  // ramp, gstate, ramp | (gstate << 1));
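 // The two flags are packed into two bits: bit 0 = ramp, bit 1 = gstate,
 // giving the four cases below.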
190  switch (ramp | (gstate << 1)) {
191  case 0: //Groundstate solver, constant Omega value.
192  {
193  fileName = "wfc_0_const";
194  break;
195  }
196  case 1: //Groundstate solver, ramped Omega value.
197  {
198  fileName = "wfc_0_ramp";
199  break;
200  }
201  case 2: //Real-time evolution, constant Omega value.
202  {
203  if (dimnum == 3){
204  // Note: In the case of 3d, we need to think about
205  // vortex tracking in a new way.
206  // It may be as simple as splitting the problem
207  // into 2D elements and working from there, but
208  // let's look into it when we need it in the
209  // future.
210  std::cout << "commencing 3d vortex tracking" << '\n';
211 
212  // Creating the necessary double* values
213  double* edges = (double *)malloc(sizeof(double)
214  *gridSize);
215 
216 
217  find_edges(par, wfc, edges);
218  double* edges_gpu = par.dsval("edges_gpu");
219 
220  // Now we need to output everything
221  if (write_it){
222  FileIO::writeOutDouble(buffer, data_dir+"Edges",
223  edges, gridSize, i);
224  }
225  free(edges);
226 
227  }
228  else if (dimnum == 2){
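 // 2D vortex tracking: findVortex marks the grid cells containing a
 // phase winding; the positions are refined below and, if requested,
 // assembled into a lattice graph.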
229  vortexLocation = (int *) calloc(xDim * yDim, sizeof(int));
230  num_vortices[0] = Tracker::findVortex(vortexLocation, wfc,
231  mask_2d, xDim, x, i);
232  // If initial step, locate vortices, least-squares to find
233  // exact centre, calculate lattice angle, generate optical
234  // lattice.
235  if (i == 0) {
236  if(num_vortices[0] > 0){
237  //Reserve enough space for the vortices
238  //reserve(num_vortices[0]);
239  vortCoords = std::make_shared<Vtx::VtxList>
240  (num_vortices[0]);
241  vortCoordsP = std::make_shared<Vtx::VtxList>
242  (num_vortices[0]);
243 
244  //Locate the vortex positions to the nearest grid point, then
245  //perform a least-squares fit to determine the location
246  //to sub-grid resolution.
247  Tracker::vortPos(vortexLocation,
248  vortCoords->getVortices(), xDim, wfc);
249  Tracker::lsFit(vortCoords->getVortices(),wfc,xDim);
250 
251  //Find the centre-most vortex in the lattice
252  central_vortex = Tracker::vortCentre(vortCoords->
253  getVortices(), xDim);
254  //Determine the Angle formed by the lattice relative to
255  //the x-axis
256  vort_angle = Tracker::vortAngle(vortCoords->
257  getVortices(), central_vortex);
258 
259  //Store the vortex angle in the parameter file
260  par.store("Vort_angle", vort_angle);
261 
262  //Determine average lattice spacing.
263  double sepAvg = Tracker::vortSepAvg(vortCoords->
264  getVortices(), central_vortex);
265 
266  par.store("Central_vort_x",
267  (double) central_vortex->getCoords().x);
268  par.store("Central_vort_y",
269  (double) central_vortex->getCoords().y);
270  par.store("Central_vort_winding",
271  (double) central_vortex->getWinding());
272  par.store("Num_vort", (double) vortCoords->
273  getVortices().size());
274 
275  //Setup the optical lattice to match the spacing and
276  // angle+angle_sweep of the vortex lattice.
277  // Amplitude matched by setting laser_power
278  // parameter switch.
279  optLatSetup(central_vortex, V,
280  vortCoords->getVortices(),
281  vort_angle + PI * angle_sweep / 180.0,
282  laser_power * HBAR * sqrt(omegaX * omegaY),
283  V_opt, x, y, par);
284 
285 
286  }
287  // If kick_it param is 2, perform a single kick of the
288  // optical lattice for the first timestep only.
289  // This is performed by loading the
290  // EV_opt exp(V + V_opt) array into GPU memory
291  // for the potential.
292  if (kick_it == 2) {
293  printf("Kicked it 1\n");
294  cudaMemcpy(V_gpu, EV_opt,
295  sizeof(cufftDoubleComplex) * xDim * yDim,
296  cudaMemcpyHostToDevice);
297  }
298  // Write out the newly specified potential
299  // and exp potential to files
300  FileIO::writeOutDouble(buffer, data_dir + "V_opt_1",
301  V_opt, xDim * yDim, 0);
302  FileIO::writeOut(buffer, data_dir + "EV_opt_1", EV_opt,
303  xDim * yDim, 0);
304 
305  //Store necessary parameters to Params.dat file.
306  FileIO::writeOutParam(buffer, par,
307  data_dir + "Params.dat");
308  }
309  // If i != 0, update the tracked vortex positions;
310  // num_vortices[1] < num_vortices[0] indicates fewer vortices than before.
311  else {
312  if (num_vortices[0] > 0){
313  Tracker::vortPos(vortexLocation,
314  vortCoords->getVortices(), xDim, wfc);
315  Tracker::lsFit(vortCoords->getVortices(),
316  wfc, xDim);
317  Tracker::vortArrange(vortCoords->getVortices(),
318  vortCoordsP->getVortices());
319  FileIO::writeOutInt(buffer, data_dir + "vLoc_",
320  vortexLocation, xDim * yDim, i);
321  }
322  }
323 
324  // The following will eventually be modified and moved into
325  // a new library that works closely with GPUE. It is also
326  // used for vortex elimination via graph positions
327  // and UID numbers.
328  if (graph && num_vortices[0] > 0) {
329  for (int ii = 0; ii < vortCoords->getVortices().size();
330  ++ii) {
331  std::shared_ptr<LatticeGraph::Node>
332  n(new LatticeGraph::Node(
333  *vortCoords->getVortices().at(ii).get()));
334  lattice.addVortex(std::move(n));
335  }
336  unsigned int *uids = (unsigned int *) malloc(
337  sizeof(unsigned int) *
338  lattice.getVortices().size());
339  for (size_t a=0; a < lattice.getVortices().size(); ++a){
340  uids[a] = lattice.getVortexIdx(a)->getUid();
341  }
342  if(i==0) {
343  //Lambda for vortex annihilation/creation.
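 // killIt imprints a compensating phase winding at the vortex with UID idx
 // (optionally re-imprinting it at a shifted position), copies the phase to
 // the device, and multiplies it into the wavefunction.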
344  auto killIt=[&](int idx, int winding,
345  double delta_x, double delta_y) {
346  if (abs(delta_x) > 0 || abs(delta_y) > 0){
347  // Killing initial vortex and then
348  // imprinting new one
349  WFC::phaseWinding(Phi, 1, x,y, dx,dy,
350  lattice.getVortexUid(idx)->
351  getData().getCoordsD().x,
352  lattice.getVortexUid(idx)->
353  getData().getCoordsD().y,
354  xDim);
355 
356  cudaMemcpy(Phi_gpu, Phi,
357  sizeof(double) * xDim * yDim,
358  cudaMemcpyHostToDevice);
359  cMultPhi <<<grid, threads>>>(gpuWfc,Phi_gpu,
360  gpuWfc);
361 
362  // Imprinting new one
363  int cval = -winding;
364  WFC::phaseWinding(Phi, cval, x,y, dx,dy,
365  lattice.getVortexUid(idx)->
366  getData().getCoordsD().x + delta_x,
367  lattice.getVortexUid(idx)->
368  getData().getCoordsD().y + delta_y,
369  xDim);
370 
371  // Sending to device for imprinting
372  cudaMemcpy(Phi_gpu, Phi,
373  sizeof(double) * xDim * yDim,
374  cudaMemcpyHostToDevice);
375  cMultPhi <<<grid, threads>>>(gpuWfc,Phi_gpu,
376  gpuWfc);
377  }
378  else{
379  int cval = -(winding-1);
380  WFC::phaseWinding(Phi, cval, x,y,dx,dy,
381  lattice.getVortexUid(idx)->
382  getData().getCoordsD().x,
383  lattice.getVortexUid(idx)->
384  getData().getCoordsD().y,
385  xDim);
386  cudaMemcpy(Phi_gpu, Phi,
387  sizeof(double) * xDim * yDim,
388  cudaMemcpyHostToDevice);
389  cMultPhi <<<grid, threads>>>(gpuWfc,Phi_gpu,
390  gpuWfc);
391  }
392  };
393  if (kill_idx > 0){
394  killIt(kill_idx, charge, x0_shift, y0_shift);
395  }
396  }
397  lattice.createEdges(1.5 * 2e-5 / dx);
398 
399  //Assumes that vortices
400  //only form edges when the distance is up to 1.5*2e-5.
401  //Replace with Delaunay-triangulation-determined edges
402  //for better computational scaling (and sanity).
403 
404  //O(n^2) -> terrible implementation. It works for now.
405  //Generates the adjacency matrix from the graph, and
406  //outputs to a Mathematica compatible format.
407  adjMat = (double *)calloc(lattice.getVortices().size() *
408  lattice.getVortices().size(),
409  sizeof(double));
410  lattice.genAdjMat(adjMat);
411  FileIO::writeOutAdjMat(buffer, data_dir + "graph",
412  adjMat, uids,
413  lattice.getVortices().size(), i);
414 
415  //Free and clear all memory blocks
416  free(adjMat);
417  free(uids);
418  lattice.getVortices().clear();
419  lattice.getEdges().clear();
420  //exit(0);
421  }
422 
423  //Write out the vortex locations
424  FileIO::writeOutVortex(buffer, data_dir + "vort_arr",
425  vortCoords->getVortices(), i);
426  printf("Located %zu vortices\n",
427  vortCoords->getVortices().size());
428 
429  //Free memory block for now.
430  free(vortexLocation);
431 
432  //Current values become previous values.
433  num_vortices[1] = num_vortices[0];
434  vortCoords->getVortices().swap(vortCoordsP->getVortices());
435  vortCoords->getVortices().clear();
436 
437  }
438  fileName = "wfc_ev";
439  break;
440  }
441  case 3:
442  {
443  fileName = "wfc_ev_ramp";
444  break;
445  }
446  default:
447  {
448  break;
449  }
450  }
451 
452  //std::cout << "writing" << '\n';
453  if (write_it) {
454  FileIO::writeOut(buffer, data_dir + fileName,
455  wfc, xDim*yDim*zDim, i);
456  }
457  //std::cout << "written" << '\n';
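 // Optional energy calculation: at each print step the energy is appended
 // to energy.dat (real-time evolution) or energyi.dat (groundstate).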
458  if (par.bval("energy_calc")){
459  double energy = energy_calc(par,gpuWfc);
460  // Now opening and closing file for writing.
461  std::ofstream energy_out;
462  std::string mode = "energyi.dat";
463  if (gstate == 1){
464  mode = "energy.dat";
465  }
466  if (i == 0){
467  energy_out.open(data_dir + mode);
468  }
469  else{
470  energy_out.open(data_dir + mode, std::ios::out |
471  std::ios::app);
472  }
473  energy_out << energy << '\n';
474  energy_out.close();
475  printf("Energy[t@%d]=%E\n",i,energy);
476  par.store("energy", energy);
477  }
478 
479  }
480 
481  // No longer writing out
482 
483  // ** ########################################################## ** //
484  // ** More F'n' Dragons! ** //
485  // ** ########################################################## ** //
486 
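 // One timestep of the second-order (Strang) split-step scheme: apply the
 // position-space half-step U_r(dt/2), Fourier transform, apply the
 // momentum-space step U_p(dt), transform back, then apply U_r(dt/2) again.
 // The nonlinear g|psi|^2 term is folded into U_r via cMultDensity when the
 // GPE flag is set; the gauge-field terms follow afterwards.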
487  // U_r(dt/2)*wfc
488  if(nonlin == 1){
489  if(par.bval("V_time")){
490  EqnNode_gpu* V_eqn = par.astval("V");
491  int e_num = par.ival("V_num");
492  cMultDensity_ast<<<grid,threads>>>(V_eqn,gpuWfc,gpuWfc,
493  dx, dy, dz, time, e_num, 0.5*Dt,
494  mass,gstate,interaction*gDenConst);
495  }
496  else{
497  cMultDensity<<<grid,threads>>>(V_gpu,gpuWfc,gpuWfc,0.5*Dt,
498  mass,gstate,interaction*gDenConst);
499  }
500  }
501  else {
502  if(par.bval("V_time")){
503  EqnNode_gpu* V_eqn = par.astval("V");
504  int e_num = par.ival("V_num");
505  ast_op_mult<<<grid,threads>>>(gpuWfc,gpuWfc, V_eqn,
506  dx, dy, dz, time, e_num, gstate+1, Dt);
507  }
508  else{
509  cMult<<<grid,threads>>>(V_gpu,gpuWfc,gpuWfc);
510  }
511  }
512 
513  // U_p(dt)*fft2(wfc)
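 // Transform to momentum space, apply the kinetic propagator K (or its
 // time-dependent AST form), and transform back; each FFT is followed by a
 // 1/sqrt(gridSize) rescale since cuFFT is unnormalised.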
514  result = cufftExecZ2Z(plan_3d,gpuWfc,gpuWfc,CUFFT_FORWARD);
515 
516  // Normalise
517  scalarMult<<<grid,threads>>>(gpuWfc,renorm_factor_nd,gpuWfc);
518  if (par.bval("K_time")){
519  EqnNode_gpu* k_eqn = par.astval("k");
520  int e_num = par.ival("k_num");
521  ast_op_mult<<<grid,threads>>>(gpuWfc,gpuWfc, k_eqn,
522  dx, dy, dz, time, e_num, gstate+1, Dt);
523  }
524  else{
525  cMult<<<grid,threads>>>(K_gpu,gpuWfc,gpuWfc);
526  }
527  result = cufftExecZ2Z(plan_3d,gpuWfc,gpuWfc,CUFFT_INVERSE);
528 
529  // Normalise
530  scalarMult<<<grid,threads>>>(gpuWfc,renorm_factor_nd,gpuWfc);
531 
532  // U_r(dt/2)*wfc
533  if(nonlin == 1){
534  if(par.bval("V_time")){
535  EqnNode_gpu* V_eqn = par.astval("V");
536  int e_num = par.ival("V_num");
537  cMultDensity_ast<<<grid,threads>>>(V_eqn,gpuWfc,gpuWfc,
538  dx, dy, dz, time, e_num, 0.5*Dt,
539  mass,gstate,interaction*gDenConst);
540  }
541  else{
542  cMultDensity<<<grid,threads>>>(V_gpu,gpuWfc,gpuWfc,0.5*Dt,
543  mass,gstate,interaction*gDenConst);
544  }
545  }
546  else {
547  if(par.bval("V_time")){
548  EqnNode_gpu* V_eqn = par.astval("V");
549  int e_num = par.ival("V_num");
550  ast_op_mult<<<grid,threads>>>(gpuWfc,gpuWfc, V_eqn,
551  dx, dy, dz, time, e_num, gstate+1, Dt);
552  }
553  else{
554  cMult<<<grid,threads>>>(V_gpu,gpuWfc,gpuWfc);
555  }
556 
557  }
558 
559  // Angular momentum pAy-pAx (if engaged) //
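 // The rotation (gauge-field) terms are applied axis by axis in the
 // corresponding 1D momentum space; the order alternates between even and
 // odd steps (the i%2 switches below), symmetrising the operator splitting.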
560  if(lz == true){
561  // Multiplying by the ramping factor if necessary.
562  // Note: scalarPow performs the scaling inside the exponential.
563  if (ramp){
564  scalarPow<<<grid,threads>>>((cufftDoubleComplex*) gpu1dpAy,
565  omega_0,
566  (cufftDoubleComplex*) gpu1dpAy);
567  if (dimnum > 1){
568  scalarPow<<<grid,threads>>>((cufftDoubleComplex*) gpu1dpAx,
569  omega_0,
570  (cufftDoubleComplex*) gpu1dpAx);
571  }
572  if (dimnum > 2){
573  scalarPow<<<grid,threads>>>((cufftDoubleComplex*) gpu1dpAz,
574  omega_0,
575  (cufftDoubleComplex*) gpu1dpAz);
576  }
577  }
578  int size = xDim*zDim;
579  if (dimnum == 3){
580  switch(i%2){
581  case 0: // Even step: apply Ay, then Ax, then Az
582 
583  // 1d forward / mult by Ay
584  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
585  CUFFT_FORWARD);
586  scalarMult<<<grid,threads>>>(gpuWfc,
587  renorm_factor_1d,gpuWfc);
588  if(par.bval("Ay_time")){
589  EqnNode_gpu* Ay_eqn = par.astval("Ay");
590  int e_num = par.ival("Ay_num");
591  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
592  Ay_eqn, dx, dy, dz, time, e_num);
593  }
594  else{
595  cMult<<<grid,threads>>>(gpuWfc,
596  (cufftDoubleComplex*) gpu1dpAy, gpuWfc);
597  }
598  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
599  CUFFT_INVERSE);
600  scalarMult<<<grid,threads>>>(gpuWfc,
601  renorm_factor_1d, gpuWfc);
602 
603  // loop to multiply by Ax
604  for (int i = 0; i < yDim; i++){
605  //size = xDim * zDim;
606  result = cufftExecZ2Z(plan_dim2,
607  &gpuWfc[i*size],
608  &gpuWfc[i*size],CUFFT_FORWARD);
609  }
610 
611  scalarMult<<<grid,threads>>>(gpuWfc,
612  renorm_factor_1d,gpuWfc);
613  if(par.bval("Ax_time")){
614  EqnNode_gpu* Ax_eqn = par.astval("Ax");
615  int e_num = par.ival("Ax_num");
616  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
617  Ax_eqn, dx, dy, dz, time, e_num);
618  }
619  else{
620  cMult<<<grid,threads>>>(gpuWfc,
621  (cufftDoubleComplex*) gpu1dpAx, gpuWfc);
622  }
623 
624  for (int i = 0; i < yDim; i++){
625  //size = xDim * zDim;
626  result = cufftExecZ2Z(plan_dim2,
627  &gpuWfc[i*size],
628  &gpuWfc[i*size],CUFFT_INVERSE);
629  }
630  scalarMult<<<grid,threads>>>(gpuWfc,
631  renorm_factor_1d,gpuWfc);
632 
633  // 1D FFT / mult by Az
634  result = cufftExecZ2Z(plan_dim3,gpuWfc,gpuWfc,
635  CUFFT_FORWARD);
636  scalarMult<<<grid,threads>>>(gpuWfc,
637  renorm_factor_1d,gpuWfc);
638  if(par.bval("Az_time")){
639  EqnNode_gpu* Az_eqn = par.astval("Az");
640  int e_num = par.ival("Az_num");
641  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
642  Az_eqn, dx, dy, dz, time, e_num);
643  }
644  else{
645  cMult<<<grid,threads>>>(gpuWfc,
646  (cufftDoubleComplex*) gpu1dpAz, gpuWfc);
647  }
648 
649  result = cufftExecZ2Z(plan_dim3,gpuWfc,gpuWfc,
650  CUFFT_INVERSE);
651  scalarMult<<<grid,threads>>>(gpuWfc,
652  renorm_factor_1d, gpuWfc);
653 
654  break;
655 
656  case 1: // Odd step: apply Az, then Ax, then Ay
657 
658  // 1D FFT / mult by Az
659  result = cufftExecZ2Z(plan_dim3,gpuWfc,gpuWfc,
660  CUFFT_FORWARD);
661  scalarMult<<<grid,threads>>>(gpuWfc,
662  renorm_factor_1d,gpuWfc);
663  if(par.bval("Az_time")){
664  EqnNode_gpu* Az_eqn = par.astval("Az");
665  int e_num = par.ival("Az_num");
666  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
667  Az_eqn, dx, dy, dz, time, e_num);
668  }
669  else{
670  cMult<<<grid,threads>>>(gpuWfc,
671  (cufftDoubleComplex*) gpu1dpAz, gpuWfc);
672  }
673 
674  result = cufftExecZ2Z(plan_dim3,gpuWfc,gpuWfc,
675  CUFFT_INVERSE);
676  scalarMult<<<grid,threads>>>(gpuWfc,
677  renorm_factor_1d, gpuWfc);
678 
679  // loop to multiply by Ax
680  for (int i = 0; i < yDim; i++){
681  //size = xDim * zDim;
682  result = cufftExecZ2Z(plan_dim2,
683  &gpuWfc[i*size],
684  &gpuWfc[i*size],CUFFT_FORWARD);
685  }
686 
687  scalarMult<<<grid,threads>>>(gpuWfc,
688  renorm_factor_1d,gpuWfc);
689  if(par.bval("Ax_time")){
690  EqnNode_gpu* Ax_eqn = par.astval("Ax");
691  int e_num = par.ival("Ax_num");
692  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
693  Ax_eqn, dx, dy, dz, time, e_num);
694  }
695  else{
696  cMult<<<grid,threads>>>(gpuWfc,
697  (cufftDoubleComplex*) gpu1dpAx, gpuWfc);
698  }
699 
700  for (int i = 0; i < yDim; i++){
701  //size = xDim * zDim;
702  result = cufftExecZ2Z(plan_dim2,
703  &gpuWfc[i*size],
704  &gpuWfc[i*size],CUFFT_INVERSE);
705  }
706  scalarMult<<<grid,threads>>>(gpuWfc,
707  renorm_factor_1d,gpuWfc);
708 
709  // 1d forward / mult by Ay
710  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
711  CUFFT_FORWARD);
712  scalarMult<<<grid,threads>>>(gpuWfc,
713  renorm_factor_1d,gpuWfc);
714  if(par.bval("Ay_time")){
715  EqnNode_gpu* Ay_eqn = par.astval("Ay");
716  int e_num = par.ival("Ay_num");
717  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
718  Ay_eqn, dx, dy, dz, time, e_num);
719  }
720  else{
721  cMult<<<grid,threads>>>(gpuWfc,
722  (cufftDoubleComplex*) gpu1dpAy, gpuWfc);
723  }
724 
725  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
726  CUFFT_INVERSE);
727  scalarMult<<<grid,threads>>>(gpuWfc,
728  renorm_factor_1d, gpuWfc);
729 
730  break;
731  }
732  }
733  else if (dimnum == 2){
734  switch(i%2){
735  case 0: // Even step: apply Ay, then Ax
736 
737  // 1d forward / mult by Ay
738  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
739  CUFFT_FORWARD);
740  scalarMult<<<grid,threads>>>(gpuWfc,
741  renorm_factor_1d,gpuWfc);
742  if(par.bval("Ay_time")){
743  EqnNode_gpu* Ay_eqn = par.astval("Ay");
744  int e_num = par.ival("Ay_num");
745  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
746  Ay_eqn, dx, dy, dz, time, e_num);
747  }
748  else{
749  cMult<<<grid,threads>>>(gpuWfc,
750  (cufftDoubleComplex*) gpu1dpAy, gpuWfc);
751  }
752  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
753  CUFFT_INVERSE);
754  scalarMult<<<grid,threads>>>(gpuWfc,
755  renorm_factor_1d, gpuWfc);
756 
757 
758  // 1D FFT to wfc_pAx
759  result = cufftExecZ2Z(plan_other2d,gpuWfc,gpuWfc,
760  CUFFT_FORWARD);
761  scalarMult<<<grid,threads>>>(gpuWfc,
762  renorm_factor_1d,gpuWfc);
763  if(par.bval("Ax_time")){
764  EqnNode_gpu* Ax_eqn = par.astval("Ax");
765  int e_num = par.ival("Ax_num");
766  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
767  Ax_eqn, dx, dy, dz, time, e_num);
768  }
769  else{
770  cMult<<<grid,threads>>>(gpuWfc,
771  (cufftDoubleComplex*) gpu1dpAx, gpuWfc);
772  }
773 
774  result = cufftExecZ2Z(plan_other2d,gpuWfc,gpuWfc,
775  CUFFT_INVERSE);
776  scalarMult<<<grid,threads>>>(gpuWfc,
777  renorm_factor_1d, gpuWfc);
778  break;
779 
780  case 1: // Odd step: apply Ax, then Ay
781  // 1D FFT to wfc_pAx
782  result = cufftExecZ2Z(plan_other2d,gpuWfc,gpuWfc,
783  CUFFT_FORWARD);
784  scalarMult<<<grid,threads>>>(gpuWfc,
785  renorm_factor_1d,gpuWfc);
786  if(par.bval("Ax_time")){
787  EqnNode_gpu* Ax_eqn = par.astval("Ax");
788  int e_num = par.ival("Ax_num");
789  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
790  Ax_eqn, dx, dy, dz, time, e_num);
791  }
792  else{
793  cMult<<<grid,threads>>>(gpuWfc,
794  (cufftDoubleComplex*) gpu1dpAx, gpuWfc);
795  }
796 
797  result = cufftExecZ2Z(plan_other2d,gpuWfc,gpuWfc,
798  CUFFT_INVERSE);
799  scalarMult<<<grid,threads>>>(gpuWfc,
800  renorm_factor_1d, gpuWfc);
801 
802  // wfc_pAy
803  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
804  CUFFT_FORWARD);
805  scalarMult<<<grid,threads>>>(gpuWfc,
806  renorm_factor_1d,gpuWfc);
807  if(par.bval("Ay_time")){
808  EqnNode_gpu* Ay_eqn = par.astval("Ay");
809  int e_num = par.ival("Ay_num");
810  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
811  Ay_eqn, dx, dy, dz, time, e_num);
812  }
813  else{
814  cMult<<<grid,threads>>>(gpuWfc,
815  (cufftDoubleComplex*) gpu1dpAy, gpuWfc);
816  }
817 
818  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,
819  CUFFT_INVERSE);
820  scalarMult<<<grid,threads>>>(gpuWfc,
821  renorm_factor_1d, gpuWfc);
822  break;
823 
824  }
825  }
826  else if (dimnum == 1){
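 // 1D case: only the Ax gauge term is applied.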
827  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc,CUFFT_FORWARD);
828  scalarMult<<<grid,threads>>>(gpuWfc,renorm_factor_1d,gpuWfc);
829  if(par.bval("Ax_time")){
830  EqnNode_gpu* Ax_eqn = par.astval("Ax");
831  int e_num = par.ival("Ax_num");
832  ast_cmult<<<grid,threads>>>(gpuWfc, gpuWfc,
833  Ax_eqn, dx, dy, dz, time, e_num);
834  }
835  else{
836  cMult<<<grid,threads>>>(gpuWfc,
837  (cufftDoubleComplex*) gpu1dpAx, gpuWfc);
838  }
839 
840  result = cufftExecZ2Z(plan_1d,gpuWfc,gpuWfc, CUFFT_INVERSE);
841  scalarMult<<<grid,threads>>>(gpuWfc, renorm_factor_1d, gpuWfc);
842  }
843  }
844 
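 // Imaginary-time (groundstate) propagation does not preserve the norm,
 // so parSum renormalises the wavefunction on the device after each step.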
845  if(gstate==0){
846  parSum(gpuWfc, par);
847  }
848  }
849 
850  par.store("wfc", wfc);
851  par.store("wfc_gpu", gpuWfc);
852 }