GPUE  v1.0
GPU Gross-Pitaevskii Equation numerical solver for Bose-Einstein condensates
kernels.cu
Go to the documentation of this file.
1 #include "../include/constants.h"
2 #include "../include/dynamic.h"
3 #include "../include/ds.h"
4 #include <stdio.h>
5 
6 __device__ double2 subtract(double2 a, double2 b){
7  return {a.x-b.x, a.y-b.y};
8 }
9 __device__ double2 add(double2 a, double2 b){
10  return {a.x+b.x, a.y+b.y};
11 }
12 __device__ double2 pow(double2 a, int b){
13  double r = sqrt(a.x*a.x + a.y*a.y);
14  double theta = atan(a.y / a.x);
15  return{pow(r,b)*cos(b*theta),pow(r,b)*sin(b*theta)};
16 }
17 
18 __device__ double2 mult(double2 a, double b){
19  return {a.x*b, a.y*b};
20 }
21 __device__ double2 mult(double2 a, double2 b){
22  return {a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x};
23 }
24 
25 //Evaluted in MATLAB: N*4*HBAR*HBAR*PI*(4.67e-9/mass)*sqrt(mass*(omegaZ)/(2*PI*HBAR))
26 //__constant__ double gDenConst = 6.6741e-40;
27 
28 __device__ unsigned int getGid3d3d(){
29  int blockId = blockIdx.x + blockIdx.y * gridDim.x
30  + gridDim.x * gridDim.y * blockIdx.z;
31  int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
32  + (threadIdx.y * blockDim.x)
33  + (threadIdx.z * (blockDim.x * blockDim.y)) + threadIdx.x;
34  return threadId;
35 }
36 
37 __global__ void is_eq(bool *a, bool *b, bool *ans){
38  int gid = getGid3d3d();
39  ans[0] = true;
40  if (a[gid] != b[gid]){
41  ans[0] = false;
42  }
43 }
44 
45 
46 // Function to convert a double* to double2*
47 __global__ void make_cufftDoubleComplex(double *in, double2 *out){
48  int gid = getGid3d3d();
49  out[gid].x = in[gid];
50  out[gid].y = 0;
51 }
52 
53 // function to perform a transposition (2d) or permutation (3d)
54 // Note: The 3 ints represent the final placement of that data direction
55 // after transposition
56 inline __device__ unsigned int permuteGid(int d1, int d2, int d3){
57 
58  // I cannot seem to think of any way to write this in a general case...
59 
60  unsigned int x, y, z;
61 
62  // If the three axes are in the original directions.
63  if (d1 == 0 && d2 == 1 && d3 == 2){
64  return getGid3d3d();
65  }
66 
67  else if (d1 == 1 && d2 == 2 && d3 == 0){
68  x = blockIdx.x * blockDim.x + threadIdx.x;
69  z = blockDim.z * (x + blockIdx.z) + threadIdx.z;
70  y = blockDim.y * (z + blockIdx.y) + threadIdx.y;
71  return y;
72  }
73 
74  else if (d1 == 2 && d2 == 0 && d3 == 1){
75  y = blockIdx.y * blockDim.y + threadIdx.y;
76  x = blockDim.x * (y + blockIdx.x) + threadIdx.x;
77  z = blockDim.z * (x + blockIdx.z) + threadIdx.z;
78  return z;
79  }
80 
81  else if (d1 == 0 && d2 == 2 && d3 == 1){
82  y = blockIdx.y * blockDim.y + threadIdx.y;
83  z = blockDim.z * (y + blockIdx.z) + threadIdx.z;
84  x = blockDim.x * (z + blockIdx.x) + threadIdx.x;
85  return x;
86  }
87 
88  else if (d1 == 1 && d2 == 0 && d3 == 2){
89  //z = blockIdx.z * blockDim.z + threadIdx.z;
90  //y = blockDim.y * (z + blockIdx.x) + threadIdx.y;
91  //x = blockDim.x * (y + blockIdx.y) + threadIdx.x;
92  //x = blockDim.x * (z + blockIdx.x) + threadIdx.x;
93  //y = blockDim.y * (x + blockIdx.y) + threadIdx.y;
94  //return y;
95  //return x;
96  z = blockIdx.z*blockDim.z + threadIdx.z;
97  x = blockIdx.x*blockDim.x + threadIdx.x;
98  y = blockIdx.y*blockDim.y + threadIdx.y;
99  return x + blockDim.x*y + blockDim.y*blockDim.x*z;
100  }
101 
102  else if (d1 == 2 && d2 == 1 && d3 == 0){
103  x = blockIdx.x * blockDim.x + threadIdx.x;
104  y = blockDim.y * (x + blockIdx.y) + threadIdx.y;
105  z = blockDim.z * (y + blockIdx.z) + threadIdx.z;
106  return z;
107  }
108  else{
109  return 0;
110  }
111 
112 }
113 
114 __device__ unsigned int getBid3d3d(){
115  return blockIdx.x + gridDim.x*(blockIdx.y + gridDim.y * blockIdx.z);
116 }
117 
118 __device__ unsigned int getTid3d3d(){
119  return blockDim.x * ( blockDim.y * ( blockDim.z + ( threadIdx.z * blockDim.y ) ) + threadIdx.y ) + threadIdx.x;
120 }
121 
122 __device__ double2 make_complex(double in, int evolution_type){
123  double2 result;
124 
125  switch(evolution_type){
126  // No change
127  case 0:
128  result.x = in;
129  result.y = 0;
130  break;
131  // Im. Time evolution
132  case 1:
133  result.x = exp(-in);
134  result.y = 0;
135  break;
136  // Real Time evolution
137  case 2:
138  result.x = cos(-in);
139  result.y = sin(-in);
140  break;
141  }
142 
143  return result;
144 }
145 
146 __device__ double2 conjugate(double2 in){
147  double2 result = in;
148  result.y = -result.y;
149  return result;
150 }
151 
152 __device__ double2 realCompMult(double scalar, double2 comp){
153  double2 result;
154  result.x = scalar * comp.x;
155  result.y = scalar * comp.y;
156  return result;
157 }
158 
159 __device__ double complexMagnitude(double2 in){
160  return sqrt(in.x*in.x + in.y*in.y);
161 }
162 
163 __global__ void complexAbsSum(double2 *in1, double2 *in2, double *out){
164  int gid = getGid3d3d();
165  double2 temp;
166  temp.x = in1[gid].x + in2[gid].x;
167  temp.y = in1[gid].y + in2[gid].y;
168  out[gid] = sqrt(temp.x*temp.x + temp.y*temp.y);
169 }
170 
171 __global__ void complexMagnitude(double2 *in, double *out){
172  int gid = getGid3d3d();
173  out[gid] = sqrt(in[gid].x*in[gid].x + in[gid].y*in[gid].y);
174 }
175 
176 __host__ __device__ double complexMagnitudeSquared(double2 in){
177  return in.x*in.x + in.y*in.y;
178 }
179 
180 __global__ void complexMagnitudeSquared(double2 *in, double *out){
181  int gid = getGid3d3d();
182  out[gid] = in[gid].x*in[gid].x + in[gid].y*in[gid].y;
183 }
184 
185 __global__ void complexMagnitudeSquared(double2 *in, double2 *out){
186  int gid = getGid3d3d();
187  out[gid].x = in[gid].x*in[gid].x + in[gid].y*in[gid].y;
188  out[gid].y = 0;
189 }
190 
191 __host__ __device__ double2 complexMultiply(double2 in1, double2 in2){
192  double2 result;
193  result.x = (in1.x*in2.x - in1.y*in2.y);
194  result.y = (in1.x*in2.y + in1.y*in2.x);
195  return result;
196 }
197 
198 __global__ void complexMultiply(double2 *in1, double2 *in2, double2 *out){
199  int gid = getGid3d3d();
200  out[gid] = complexMultiply(in1[gid], in2[gid]);
201 }
202 
203 
204 /*
205 * Used to perform conj(in1)*in2; == < in1 | in2 >
206 */
207 inline __device__ double2 braKetMult(double2 in1, double2 in2){
208  return complexMultiply(conjugate(in1),in2);
209 }
210 
211 /**
212  * Performs complex multiplication of in1 and in2, giving result as out.
213  */
214 __global__ void cMult(double2* in1, double2* in2, double2* out){
215  unsigned int gid = getGid3d3d();
216  double2 result;
217  double2 tin1 = in1[gid];
218  double2 tin2 = in2[gid];
219  result.x = (tin1.x*tin2.x - tin1.y*tin2.y);
220  result.y = (tin1.x*tin2.y + tin1.y*tin2.x);
221  out[gid] = result;
222 }
223 
224 __global__ void cMultPhi(double2* in1, double* in2, double2* out){
225  double2 result;
226  unsigned int gid = getGid3d3d();
227  result.x = cos(in2[gid])*in1[gid].x - in1[gid].y*sin(in2[gid]);
228  result.y = in1[gid].x*sin(in2[gid]) + in1[gid].y*cos(in2[gid]);
229  out[gid] = result;
230 }
231 
232 /**
233  * Performs multiplication of double* with double2*
234  */
235 __global__ void vecMult(double2 *in, double *factor, double2 *out){
236  double2 result;
237  unsigned int gid = getGid3d3d();
238  result.x = in[gid].x * factor[gid];
239  result.y = in[gid].y * factor[gid];
240  out[gid] = result;
241 }
242 
243 __global__ void l2_norm(double *in1, double *in2, double *in3, double *out){
244 
245  int gid = getGid3d3d();
246  out[gid] = sqrt(in1[gid]*in1[gid] + in2[gid]*in2[gid] + in3[gid]*in3[gid]);
247 }
248 
249 __global__ void l2_norm(double2 *in1, double2 *in2, double2 *in3, double *out){
250 
251  int gid = getGid3d3d();
252  out[gid] = sqrt(in1[gid].x*in1[gid].x + in1[gid].y*in1[gid].y
253  + in2[gid].x*in2[gid].x + in2[gid].y*in2[gid].y
254  + in3[gid].x*in3[gid].x + in3[gid].y*in3[gid].y);
255 }
256 
257 __global__ void l2_norm(double *in1, double *in2, double *out){
258 
259  int gid = getGid3d3d();
260  out[gid] = sqrt(in1[gid]*in1[gid] + in2[gid]*in2[gid]);
261 }
262 
263 __global__ void l2_norm(double2 *in1, double2 *in2, double *out){
264 
265  int gid = getGid3d3d();
266  out[gid] = sqrt(in1[gid].x*in1[gid].x + in1[gid].y*in1[gid].y
267  + in2[gid].x*in2[gid].x + in2[gid].y*in2[gid].y);
268 }
269 
270 /**
271  * Performs the non-linear evolution term of Gross--Pitaevskii equation.
272  */
273 __global__ void cMultDensity(double2* in1, double2* in2, double2* out, double dt, double mass, int gstate, double gDenConst){
274  double2 result;
275  double gDensity;
276 
277  int gid = getGid3d3d();
278  double2 tin1 = in1[gid];
279  double2 tin2 = in2[gid];
280  gDensity = gDenConst*complexMagnitudeSquared(in2[gid])*(dt/HBAR);
281 
282  if(gstate == 0){
283  double tmp = in1[gid].x*exp(-gDensity);
284  result.x = (tmp)*tin2.x - (tin1.y)*tin2.y;
285  result.y = (tmp)*tin2.y + (tin1.y)*tin2.x;
286  }
287  else{
288  double2 tmp;
289  tmp.x = tin1.x*cos(-gDensity) - tin1.y*sin(-gDensity);
290  tmp.y = tin1.y*cos(-gDensity) + tin1.x*sin(-gDensity);
291  //printf("%d\t%f\t%f\t%f\t%f\t%f\n", gid, tin1.x, tin1.y, tmp.x, tmp.y, gDensity);
292 
293  result.x = (tmp.x)*tin2.x - (tmp.y)*tin2.y;
294  result.y = (tmp.x)*tin2.y + (tmp.y)*tin2.x;
295  }
296  out[gid] = result;
297 }
298 
299 //cMultDensity for ast V
300 __global__ void cMultDensity_ast(EqnNode_gpu *eqn, double2* in, double2* out,
301  double dx, double dy, double dz, double time,
302  int e_num, double dt, double mass, int gstate,
303  double gDenConst){
304  double2 result;
305  double gDensity;
306 
307  int gid = getGid3d3d();
308  int xid = blockIdx.x*blockDim.x + threadIdx.x;
309  int yid = blockIdx.y*blockDim.y + threadIdx.y;
310  int zid = blockIdx.z*blockDim.z + threadIdx.z;
311 
312  double2 tin = in[gid];
313  gDensity = gDenConst*complexMagnitudeSquared(in[gid])*(dt/HBAR);
314  double2 val = make_complex(evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
315  time, e_num), gstate+1);
316 
317  if(gstate == 0){
318  double tmp = val.x*exp(-gDensity);
319  result.x = (tmp)*tin.x - (val.y)*tin.y;
320  result.y = (tmp)*tin.y + (val.y)*tin.x;
321  }
322  else{
323  double2 tmp;
324  tmp.x = val.x*cos(-gDensity) - val.y*sin(-gDensity);
325  tmp.y = val.y*cos(-gDensity) + val.x*sin(-gDensity);
326 
327  result.x = (tmp.x)*tin.x - (tmp.y)*tin.y;
328  result.y = (tmp.x)*tin.y + (tmp.y)*tin.x;
329  }
330  out[gid] = result;
331 }
332 
333 /**
334  * Divides both components of vector type "in", by the value "factor".
335  * Results given with "out".
336  */
337 __global__ void scalarDiv(double2* in, double factor, double2* out){
338  double2 result;
339  unsigned int gid = getGid3d3d();
340  result.x = (in[gid].x / factor);
341  result.y = (in[gid].y / factor);
342  out[gid] = result;
343 }
344 
345 /**
346  * Divides both components of vector type "in", by the value "factor".
347  * Results given with "out".
348  */
349 __global__ void scalarDiv(double* in, double factor, double* out){
350  double result;
351  unsigned int gid = getGid3d3d();
352  result = (in[gid] / factor);
353  out[gid] = result;
354 }
355 
356 
357 /**
358  * Multiplies both components of vector type "in", by the value "factor".
359  * Results given with "out".
360  */
361 __global__ void scalarMult(double2* in, double factor, double2* out){
362  double2 result;
363  //extern __shared__ double2 tmp_in[];
364  unsigned int gid = getGid3d3d();
365  result.x = (in[gid].x * factor);
366  result.y = (in[gid].y * factor);
367  out[gid] = result;
368 }
369 
370 /**
371  * As above, but normalises for wfc
372  */
373 __global__ void scalarDiv_wfcNorm(double2* in, double dr, double* pSum, double2* out){
374  unsigned int gid = getGid3d3d();
375  double2 result;
376  double norm = sqrt((pSum[0])*dr);
377  result.x = (in[gid].x/norm);
378  result.y = (in[gid].y/norm);
379  out[gid] = result;
380 }
381 
382 /**
383  * Raises in to the power of param
384  */
385 __global__ void scalarPow(double2* in, double param, double2* out){
386  unsigned int gid = getGid3d3d();
387  double2 result;
388  result.x = pow(in[gid].x, param);
389  result.y = pow(in[gid].y, param);
390  out[gid] = result;
391 }
392 
393 /**
394  * Finds conjugate for double2*
395  */
396 __global__ void vecConjugate(double2 *in, double2 *out){
397  double2 result;
398  unsigned int gid = getGid3d3d();
399  result.x = in[gid].x;
400  result.y = -in[gid].y;
401  out[gid] = result;
402 }
403 
404 __global__ void angularOp(double omega, double dt, double2* wfc, double* xpyypx, double2* out){
405  unsigned int gid = getGid3d3d();
406  double2 result;
407  double op;
408  op = exp( -omega*xpyypx[gid]*dt);
409  result.x=wfc[gid].x*op;
410  result.y=wfc[gid].y*op;
411  out[gid]=result;
412 }
413 
414 /**
415  * Kernel for a quick test of the threads and such for GPU computing
416  */
417 __global__ void thread_test(double *in, double *out){
418 
419  unsigned int Gid = getGid3d3d();
420 
421  // Now we set each element in the
422  out[Gid] = Gid;
423  //in[Gid] = Gid;
424 }
425 
426 /**
427  * Routine for parallel summation. Can be looped over from host.
428  */
429 __global__ void multipass(double2* input, double2* output, int pass){
430  unsigned int tid = threadIdx.x + threadIdx.y*blockDim.x
431  + threadIdx.z * blockDim.x * blockDim.y;
432  unsigned int bid = blockIdx.x + blockIdx.y * gridDim.x
433  + gridDim.x * gridDim.y * blockIdx.z;
434 
435  //unsigned int tid = getTid3d3d();
436  //unsigned int bid = getBid3d3d();
437  // printf("bid0=%d\n",bid);
438 
439  unsigned int gid = getGid3d3d();
440  extern __shared__ double2 sdata[];
441  sdata[tid] = input[gid];
442  __syncthreads();
443  for(int i = blockDim.x>>1; i > 0; i>>=1){
444  if(tid < i){
445  sdata[tid].x += sdata[tid + i].x;
446  sdata[tid].y += sdata[tid + i].y;
447  }
448  __syncthreads();
449  }
450  if(tid==0){
451  output[bid] = sdata[0];
452  }
453 }
454 
455 /**
456  * Routine for parallel summation. Can be looped over from host.
457  */
458 __global__ void multipass(double* input, double* output){
459  unsigned int tid = threadIdx.x + threadIdx.y*blockDim.x
460  + threadIdx.z * blockDim.x * blockDim.y;
461  unsigned int bid = blockIdx.x + blockIdx.y * gridDim.x
462  + gridDim.x * gridDim.y * blockIdx.z;
463 
464  //unsigned int tid = getTid3d3d();
465  //unsigned int bid = getBid3d3d();
466  // printf("bid0=%d\n",bid);
467 
468  unsigned int gid = getGid3d3d();
469  extern __shared__ double sdatad[];
470  sdatad[tid] = input[gid];
471  __syncthreads();
472 
473  for(int i = blockDim.x>>1; i > 0; i>>=1){
474  if(tid < i){
475  sdatad[tid] += sdatad[tid + i];
476  }
477  __syncthreads();
478  }
479  if(tid==0){
480  output[bid] = sdatad[0];
481  }
482 }
483 
484 /*
485 * Calculates all of the energy of the current state. sqrt_omegaz_mass = sqrt(omegaZ/mass), part of the nonlin interaction term
486 */
487 __global__ void energyCalc(double2 *wfc, double2 *op, double dt, double2 *energy, int gnd_state, int op_space, double sqrt_omegaz_mass, double gDenConst){
488  unsigned int gid = getGid3d3d();
489  //double hbar_dt = HBAR/dt;
490  double2 result;
491  if(op_space == 0){
492  double g_local = gDenConst*sqrt_omegaz_mass*complexMagnitudeSquared(wfc[gid]);
493  op[gid].x += g_local;
494  }
495 
496  if (op_space < 2){
497  result = braKetMult(wfc[gid], energy[gid]);
498  energy[gid].x += result.x;
499  energy[gid].y += result.y;
500  }
501  else{
502  result = complexMultiply(op[gid],wfc[gid]);
503  }
504  result = braKetMult(wfc[gid], complexMultiply(op[gid],wfc[gid]));
505  energy[gid].x += result.x;
506  energy[gid].y += result.y;
507 
508 }
509 
510 // Function to multiply a double* with an astval
511 __global__ void ast_mult(double *array, double *array_out, EqnNode_gpu *eqn,
512  double dx, double dy, double dz, double time,
513  int element_num){
514  int gid = getGid3d3d();
515  int xid = blockIdx.x*blockDim.x + threadIdx.x;
516  int yid = blockIdx.y*blockDim.y + threadIdx.y;
517  int zid = blockIdx.z*blockDim.z + threadIdx.z;
518 
519  double val = evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
520  time, element_num);
521 
522  array_out[gid] = array[gid] * val;
523 }
524 
525 // Function to multiply a double* with an astval
526 __global__ void ast_cmult(double2 *array, double2 *array_out, EqnNode_gpu *eqn,
527  double dx, double dy, double dz, double time,
528  int element_num){
529  int gid = getGid3d3d();
530  int xid = blockIdx.x*blockDim.x + threadIdx.x;
531  int yid = blockIdx.y*blockDim.y + threadIdx.y;
532  int zid = blockIdx.z*blockDim.z + threadIdx.z;
533 
534  double val = evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
535  time, element_num);
536 
537  array_out[gid].x = array[gid].x * val;
538  array_out[gid].y = array[gid].y * val;
539 }
540 
541 // Function to multiply an AST V in real or imaginary time evolution
542 __global__ void ast_op_mult(double2 *array, double2 *array_out,
543  EqnNode_gpu *eqn,
544  double dx, double dy, double dz, double time,
545  int element_num, int evolution_type, double dt){
546  int gid = getGid3d3d();
547  int xid = blockIdx.x*blockDim.x + threadIdx.x;
548  int yid = blockIdx.y*blockDim.y + threadIdx.y;
549  int zid = blockIdx.z*blockDim.z + threadIdx.z;
550 
551  double val = evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
552  time, element_num);
553  double2 complex_val = make_complex(val*dt, evolution_type);
554 
555  array_out[gid].x = array[gid].x * complex_val.x
556  - array[gid].y * complex_val.y;
557  array_out[gid].y = array[gid].x * complex_val.y
558  + array[gid].y * complex_val.x;
559 }
560 
561 
562 // Function to find the ast in real-time dynamics
563 __device__ double2 real_ast(double val, double dt){
564 
565  return {cos(-val*dt), sin(-val*dt)};
566 }
567 
568 // Function to find the ast in real-time dynamics
569 __device__ double2 im_ast(double val, double dt){
570 
571  return {exp(-val*dt), 0};
572 }
573 
574 __global__ void zeros(bool *in, bool *out){
575  int gid = getGid3d3d();
576  out[gid] = 0;
577 }
578 
579 __global__ void set_eq(double *in1, double *in2){
580  int gid = getGid3d3d();
581  in2[gid] = in1[gid];
582 }
583 
584 //##############################################################################
585 //##############################################################################
586 
587 /**
588  * Routine for parallel summation. Can be looped over from host.
589  */
590 template<typename T> __global__ void pSumT(T* in1, T* output, int pass){
591  unsigned int tid = threadIdx.x;
592  unsigned int bid = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x;// printf("bid0=%d\n",bid);
593  unsigned int gid = getGid3d3d();
594  extern __shared__ T sdata[];
595  for(int i = blockDim.x>>1; i > 0; i>>=1){
596  if(tid < blockDim.x>>1){
597  sdata[tid] += sdata[tid + i];
598  }
599  __syncthreads();
600  }
601  if(tid==0){
602  output[bid] = sdata[0];
603  }
604 }
605 
606 /**
607  * Routine for parallel summation. Can be looped over from host. BETA
608  */
609 __global__ void pSum(double* in1, double* output, int pass){
610  unsigned int tid = threadIdx.x;
611  unsigned int bid = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x;// printf("bid0=%d\n",bid);
612  unsigned int gid = getGid3d3d();
613  extern __shared__ double sdata2[];
614  for(int i = blockDim.x>>1; i > 0; i>>=1){
615  if(tid < blockDim.x>>1){
616  sdata2[tid] += sdata2[tid + i];
617  }
618  __syncthreads();
619  }
620  if(tid==0){
621  output[bid] = sdata2[0];
622  }
623 }
624 
625 //##############################################################################
626 //##############################################################################