#include "../include/constants.h"
#include "../include/dynamic.h"
#include "../include/ds.h"
__device__ double2 subtract(double2 a, double2 b){
    return {a.x-b.x, a.y-b.y};
}

__device__ double2 add(double2 a, double2 b){
    return {a.x+b.x, a.y+b.y};
}
__device__ double2 pow(double2 a, int b){
    double r = sqrt(a.x*a.x + a.y*a.y);
    // atan2 keeps the argument correct in all four quadrants.
    double theta = atan2(a.y, a.x);
    return {pow(r,b)*cos(b*theta), pow(r,b)*sin(b*theta)};
}
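// Note (added for clarity, not from the original source): this is De Moivre's
// formula, a^b = r^b * (cos(b*theta) + i*sin(b*theta)) with r = |a| and
// theta = arg(a); e.g. pow({0.0, 1.0}, 2) evaluates to roughly {-1.0, 0.0}.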
__device__ double2 mult(double2 a, double b){
    return {a.x*b, a.y*b};
}

__device__ double2 mult(double2 a, double2 b){
    return {a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x};
}
//Evaluated in MATLAB: N*4*HBAR*HBAR*PI*(4.67e-9/mass)*sqrt(mass*(omegaZ)/(2*PI*HBAR))
//__constant__ double gDenConst = 6.6741e-40;
__device__ unsigned int getGid3d3d(){
    int blockId = blockIdx.x + blockIdx.y * gridDim.x
                  + gridDim.x * gridDim.y * blockIdx.z;
    int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
                   + (threadIdx.y * blockDim.x)
                   + (threadIdx.z * (blockDim.x * blockDim.y)) + threadIdx.x;
    return threadId;
}
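// Example (illustrative only, not from the original source): getGid3d3d()
// assumes a fully 3d launch, so a grid covering an xDim x yDim x zDim array
// would typically be configured on the host along the lines of
//     dim3 threads(8, 8, 8);
//     dim3 grid((xDim+7)/8, (yDim+7)/8, (zDim+7)/8);
// after which every kernel below derives its flat array index from this mapping.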
__global__ void is_eq(bool *a, bool *b, bool *ans){
    int gid = getGid3d3d();
    ans[0] = true;
    if (a[gid] != b[gid]){
        ans[0] = false;
    }
}
// Function to convert a double* to double2*
__global__ void make_cufftDoubleComplex(double *in, double2 *out){
    int gid = getGid3d3d();
    out[gid].x = in[gid];
    out[gid].y = 0;
}
// function to perform a transposition (2d) or permutation (3d)
// Note: The 3 ints represent the final placement of that data direction
//       after transposition
inline __device__ unsigned int permuteGid(int d1, int d2, int d3){

    // I cannot seem to think of any way to write this in a general case...
    unsigned int x, y, z;

    // If the three axes are in the original directions.
    if (d1 == 0 && d2 == 1 && d3 == 2){
        // No permutation: the standard 3d global index (assumed here).
        return getGid3d3d();
    }
    else if (d1 == 1 && d2 == 2 && d3 == 0){
        x = blockIdx.x * blockDim.x + threadIdx.x;
        z = blockDim.z * (x + blockIdx.z) + threadIdx.z;
        y = blockDim.y * (z + blockIdx.y) + threadIdx.y;
        // The last coordinate computed nests the other two, so it is assumed
        // to carry the full permuted index.
        return y;
    }
    else if (d1 == 2 && d2 == 0 && d3 == 1){
        y = blockIdx.y * blockDim.y + threadIdx.y;
        x = blockDim.x * (y + blockIdx.x) + threadIdx.x;
        z = blockDim.z * (x + blockIdx.z) + threadIdx.z;
        return z;
    }
    else if (d1 == 0 && d2 == 2 && d3 == 1){
        y = blockIdx.y * blockDim.y + threadIdx.y;
        z = blockDim.z * (y + blockIdx.z) + threadIdx.z;
        x = blockDim.x * (z + blockIdx.x) + threadIdx.x;
        return x;
    }
    else if (d1 == 1 && d2 == 0 && d3 == 2){
        //z = blockIdx.z * blockDim.z + threadIdx.z;
        //y = blockDim.y * (z + blockIdx.x) + threadIdx.y;
        //x = blockDim.x * (y + blockIdx.y) + threadIdx.x;

        //x = blockDim.x * (z + blockIdx.x) + threadIdx.x;
        //y = blockDim.y * (x + blockIdx.y) + threadIdx.y;

        z = blockIdx.z*blockDim.z + threadIdx.z;
        x = blockIdx.x*blockDim.x + threadIdx.x;
        y = blockIdx.y*blockDim.y + threadIdx.y;
        return x + blockDim.x*y + blockDim.y*blockDim.x*z;
    }
    else if (d1 == 2 && d2 == 1 && d3 == 0){
        x = blockIdx.x * blockDim.x + threadIdx.x;
        y = blockDim.y * (x + blockIdx.y) + threadIdx.y;
        z = blockDim.z * (y + blockIdx.z) + threadIdx.z;
        return z;
    }
    // Fallback for an unrecognized permutation.
    return getGid3d3d();
}
__device__ unsigned int getBid3d3d(){
    return blockIdx.x + gridDim.x*(blockIdx.y + gridDim.y * blockIdx.z);
}
// Thread index within a 3d block.
__device__ unsigned int getTid3d3d(){
    return blockDim.x * ( blockDim.y * threadIdx.z + threadIdx.y ) + threadIdx.x;
}
// evolution_type: 0 = none, 1 = imaginary time, 2 = real time
// (cf. im_ast / real_ast below)
__device__ double2 make_complex(double in, int evolution_type){

    double2 result;

    switch(evolution_type){
        // No evolution
        case 0:
            result.x = in;
            result.y = 0;
            break;
        // Im. Time evolution
        case 1:
            result.x = exp(-in);
            result.y = 0;
            break;
        // Real Time evolution
        case 2:
            result.x = cos(-in);
            result.y = sin(-in);
            break;
    }

    return result;
}
__device__ double2 conjugate(double2 in){
    double2 result = in;
    result.y = -result.y;
    return result;
}
__device__ double2 realCompMult(double scalar, double2 comp){
    double2 result;
    result.x = scalar * comp.x;
    result.y = scalar * comp.y;
    return result;
}
__device__ double complexMagnitude(double2 in){
    return sqrt(in.x*in.x + in.y*in.y);
}
__global__ void complexAbsSum(double2 *in1, double2 *in2, double *out){
    int gid = getGid3d3d();
    double2 temp;
    temp.x = in1[gid].x + in2[gid].x;
    temp.y = in1[gid].y + in2[gid].y;
    out[gid] = sqrt(temp.x*temp.x + temp.y*temp.y);
}
__global__ void complexMagnitude(double2 *in, double *out){
    int gid = getGid3d3d();
    out[gid] = sqrt(in[gid].x*in[gid].x + in[gid].y*in[gid].y);
}
__host__ __device__ double complexMagnitudeSquared(double2 in){
    return in.x*in.x + in.y*in.y;
}
__global__ void complexMagnitudeSquared(double2 *in, double *out){
    int gid = getGid3d3d();
    out[gid] = in[gid].x*in[gid].x + in[gid].y*in[gid].y;
}
__global__ void complexMagnitudeSquared(double2 *in, double2 *out){
    int gid = getGid3d3d();
    out[gid].x = in[gid].x*in[gid].x + in[gid].y*in[gid].y;
    out[gid].y = 0;
}
__host__ __device__ double2 complexMultiply(double2 in1, double2 in2){
    double2 result;
    result.x = (in1.x*in2.x - in1.y*in2.y);
    result.y = (in1.x*in2.y + in1.y*in2.x);
    return result;
}
__global__ void complexMultiply(double2 *in1, double2 *in2, double2 *out){
    int gid = getGid3d3d();
    out[gid] = complexMultiply(in1[gid], in2[gid]);
}
/*
 * Used to perform conj(in1)*in2; == < in1 | in2 >
 */
inline __device__ double2 braKetMult(double2 in1, double2 in2){
    return complexMultiply(conjugate(in1), in2);
}
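// Note (added, not from the original source): braKetMult gives the
// per-grid-point integrand conj(in1)*in2; the full inner product < in1 | in2 >
// is obtained by summing its output over the grid (e.g. with the multipass
// reduction further down) and multiplying by the volume element.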
/*
 * Performs complex multiplication of in1 and in2, giving result as out.
 */
__global__ void cMult(double2* in1, double2* in2, double2* out){
    unsigned int gid = getGid3d3d();
    double2 result;
    double2 tin1 = in1[gid];
    double2 tin2 = in2[gid];
    result.x = (tin1.x*tin2.x - tin1.y*tin2.y);
    result.y = (tin1.x*tin2.y + tin1.y*tin2.x);
    out[gid] = result;
}
__global__ void cMultPhi(double2* in1, double* in2, double2* out){
    double2 result;
    unsigned int gid = getGid3d3d();
    result.x = cos(in2[gid])*in1[gid].x - in1[gid].y*sin(in2[gid]);
    result.y = in1[gid].x*sin(in2[gid]) + in1[gid].y*cos(in2[gid]);
    out[gid] = result;
}
/*
 * Performs multiplication of double* with double2*
 */
__global__ void vecMult(double2 *in, double *factor, double2 *out){
    double2 result;
    unsigned int gid = getGid3d3d();
    result.x = in[gid].x * factor[gid];
    result.y = in[gid].y * factor[gid];
    out[gid] = result;
}
__global__ void l2_norm(double *in1, double *in2, double *in3, double *out){
    int gid = getGid3d3d();
    out[gid] = sqrt(in1[gid]*in1[gid] + in2[gid]*in2[gid] + in3[gid]*in3[gid]);
}
__global__ void l2_norm(double2 *in1, double2 *in2, double2 *in3, double *out){
    int gid = getGid3d3d();
    out[gid] = sqrt(in1[gid].x*in1[gid].x + in1[gid].y*in1[gid].y
                    + in2[gid].x*in2[gid].x + in2[gid].y*in2[gid].y
                    + in3[gid].x*in3[gid].x + in3[gid].y*in3[gid].y);
}
__global__ void l2_norm(double *in1, double *in2, double *out){
    int gid = getGid3d3d();
    out[gid] = sqrt(in1[gid]*in1[gid] + in2[gid]*in2[gid]);
}
__global__ void l2_norm(double2 *in1, double2 *in2, double *out){
    int gid = getGid3d3d();
    out[gid] = sqrt(in1[gid].x*in1[gid].x + in1[gid].y*in1[gid].y
                    + in2[gid].x*in2[gid].x + in2[gid].y*in2[gid].y);
}
/*
 * Performs the non-linear evolution term of the Gross--Pitaevskii equation.
 */
__global__ void cMultDensity(double2* in1, double2* in2, double2* out, double dt, double mass, int gstate, double gDenConst){
    double2 result;
    double gDensity;

    int gid = getGid3d3d();
    double2 tin1 = in1[gid];
    double2 tin2 = in2[gid];
    gDensity = gDenConst*complexMagnitudeSquared(in2[gid])*(dt/HBAR);

    if (gstate == 0){
        double tmp = in1[gid].x*exp(-gDensity);
        result.x = (tmp)*tin2.x - (tin1.y)*tin2.y;
        result.y = (tmp)*tin2.y + (tin1.y)*tin2.x;
    }
    else{
        double2 tmp;
        tmp.x = tin1.x*cos(-gDensity) - tin1.y*sin(-gDensity);
        tmp.y = tin1.y*cos(-gDensity) + tin1.x*sin(-gDensity);
        //printf("%d\t%f\t%f\t%f\t%f\t%f\n", gid, tin1.x, tin1.y, tmp.x, tmp.y, gDensity);

        result.x = (tmp.x)*tin2.x - (tmp.y)*tin2.y;
        result.y = (tmp.x)*tin2.y + (tmp.y)*tin2.x;
    }
    out[gid] = result;
}
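// Summary of the operation above (added for clarity, not from the original
// source): with n = |in2|^2 the local density, the kernel computes
//     gstate == 0 (imaginary time): out = (Re(in1)*exp(-gDenConst*n*dt/HBAR) + i*Im(in1)) * in2
//     gstate != 0 (real time):      out = in1 * exp(-i*gDenConst*n*dt/HBAR) * in2
// i.e. the nonlinear part of the split-step Gross--Pitaevskii evolution.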
//cMultDensity for ast V
__global__ void cMultDensity_ast(EqnNode_gpu *eqn, double2* in, double2* out,
                                 double dx, double dy, double dz, double time,
                                 int e_num, double dt, double mass, int gstate,
                                 double gDenConst){
    double2 result;
    double gDensity;

    int gid = getGid3d3d();
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    int zid = blockIdx.z*blockDim.z + threadIdx.z;

    double2 tin = in[gid];
    gDensity = gDenConst*complexMagnitudeSquared(in[gid])*(dt/HBAR);
    double2 val = make_complex(evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
                                                time, e_num), gstate+1);

    if (gstate == 0){
        double tmp = val.x*exp(-gDensity);
        result.x = (tmp)*tin.x - (val.y)*tin.y;
        result.y = (tmp)*tin.y + (val.y)*tin.x;
    }
    else{
        double2 tmp;
        tmp.x = val.x*cos(-gDensity) - val.y*sin(-gDensity);
        tmp.y = val.y*cos(-gDensity) + val.x*sin(-gDensity);

        result.x = (tmp.x)*tin.x - (tmp.y)*tin.y;
        result.y = (tmp.x)*tin.y + (tmp.y)*tin.x;
    }
    out[gid] = result;
}
/*
 * Divides both components of vector type "in", by the value "factor".
 * Results given with "out".
 */
__global__ void scalarDiv(double2* in, double factor, double2* out){
    double2 result;
    unsigned int gid = getGid3d3d();
    result.x = (in[gid].x / factor);
    result.y = (in[gid].y / factor);
    out[gid] = result;
}
/*
 * Divides each element of "in" by the value "factor".
 * Results given with "out".
 */
__global__ void scalarDiv(double* in, double factor, double* out){
    double result;
    unsigned int gid = getGid3d3d();
    result = (in[gid] / factor);
    out[gid] = result;
}
/*
 * Multiplies both components of vector type "in", by the value "factor".
 * Results given with "out".
 */
__global__ void scalarMult(double2* in, double factor, double2* out){
    double2 result;
    //extern __shared__ double2 tmp_in[];
    unsigned int gid = getGid3d3d();
    result.x = (in[gid].x * factor);
    result.y = (in[gid].y * factor);
    out[gid] = result;
}
/*
 * As above, but normalises for wfc
 */
__global__ void scalarDiv_wfcNorm(double2* in, double dr, double* pSum, double2* out){
    unsigned int gid = getGid3d3d();
    double2 result;
    double norm = sqrt((pSum[0])*dr);
    result.x = (in[gid].x/norm);
    result.y = (in[gid].y/norm);
    out[gid] = result;
}
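// Note (added, not from the original source): pSum[0] is assumed to already
// hold the summed |wfc|^2 over the grid (e.g. from the multipass reduction
// below), so norm = sqrt(sum(|wfc|^2) * dr) and the output wavefunction is
// normalised to unit L2 norm.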
/*
 * Raises in to the power of param
 */
__global__ void scalarPow(double2* in, double param, double2* out){
    unsigned int gid = getGid3d3d();
    double2 result;
    // Note: the power is applied to each component separately,
    // not to the complex value as a whole.
    result.x = pow(in[gid].x, param);
    result.y = pow(in[gid].y, param);
    out[gid] = result;
}
/*
 * Finds conjugate for double2*
 */
__global__ void vecConjugate(double2 *in, double2 *out){
    double2 result;
    unsigned int gid = getGid3d3d();
    result.x = in[gid].x;
    result.y = -in[gid].y;
    out[gid] = result;
}
__global__ void angularOp(double omega, double dt, double2* wfc, double* xpyypx, double2* out){
    unsigned int gid = getGid3d3d();
    double2 result;
    double op;
    op = exp( -omega*xpyypx[gid]*dt);
    result.x = wfc[gid].x*op;
    result.y = wfc[gid].y*op;
    out[gid] = result;
}
/*
 * Kernel for a quick test of the threads and such for GPU computing
 */
__global__ void thread_test(double *in, double *out){
    unsigned int Gid = getGid3d3d();
    // Now we set each element in the output to its corresponding input.
    out[Gid] = in[Gid];
}
/*
 * Routine for parallel summation. Can be looped over from host.
 */
__global__ void multipass(double2* input, double2* output, int pass){
    unsigned int tid = threadIdx.x + threadIdx.y*blockDim.x
                       + threadIdx.z * blockDim.x * blockDim.y;
    unsigned int bid = blockIdx.x + blockIdx.y * gridDim.x
                       + gridDim.x * gridDim.y * blockIdx.z;
    //unsigned int tid = getTid3d3d();
    //unsigned int bid = getBid3d3d();
    // printf("bid0=%d\n",bid);
    unsigned int gid = getGid3d3d();
    extern __shared__ double2 sdata[];
    sdata[tid] = input[gid];
    __syncthreads();

    // Tree reduction over the x dimension of the block.
    for(int i = blockDim.x>>1; i > 0; i>>=1){
        if(tid < i){
            sdata[tid].x += sdata[tid + i].x;
            sdata[tid].y += sdata[tid + i].y;
        }
        __syncthreads();
    }
    if(tid == 0){
        output[bid] = sdata[0];
    }
}
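// A minimal host-side sketch (not part of the original source) of how the
// multipass kernel above can be looped over to reduce an array to a single
// sum. Buffer names and the divisibility assumption are illustrative only;
// the two buffers are ping-ponged so no block reads data that another block
// has already overwritten.
static void reduce_sum_sketch(double2 *buf_a, double2 *buf_b, int length,
                              int threads){
    int pass = 0;
    while (length > 1){
        int blocks = length / threads;            // length assumed divisible
        multipass<<<blocks, threads, threads*sizeof(double2)>>>(buf_a, buf_b,
                                                                pass);
        double2 *tmp = buf_a; buf_a = buf_b; buf_b = tmp;  // swap buffers
        length = blocks;
        ++pass;
    }
    // The final sum now sits in element 0 of the buffer last written to.
}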
/*
 * Routine for parallel summation. Can be looped over from host.
 */
__global__ void multipass(double* input, double* output){
    unsigned int tid = threadIdx.x + threadIdx.y*blockDim.x
                       + threadIdx.z * blockDim.x * blockDim.y;
    unsigned int bid = blockIdx.x + blockIdx.y * gridDim.x
                       + gridDim.x * gridDim.y * blockIdx.z;
    //unsigned int tid = getTid3d3d();
    //unsigned int bid = getBid3d3d();
    // printf("bid0=%d\n",bid);
    unsigned int gid = getGid3d3d();
    extern __shared__ double sdatad[];
    sdatad[tid] = input[gid];
    __syncthreads();

    for(int i = blockDim.x>>1; i > 0; i>>=1){
        if(tid < i){
            sdatad[tid] += sdatad[tid + i];
        }
        __syncthreads();
    }
    if(tid == 0){
        output[bid] = sdatad[0];
    }
}
/*
 * Calculates the energy of the current state.
 * sqrt_omegaz_mass = sqrt(omegaZ/mass), part of the nonlinear interaction term.
 */
__global__ void energyCalc(double2 *wfc, double2 *op, double dt, double2 *energy, int gnd_state, int op_space, double sqrt_omegaz_mass, double gDenConst){
    unsigned int gid = getGid3d3d();
    //double hbar_dt = HBAR/dt;
    double2 result;

    // Nonlinear (interaction) contribution added to the operator.
    double g_local = gDenConst*sqrt_omegaz_mass*complexMagnitudeSquared(wfc[gid]);
    op[gid].x += g_local;

    // Accumulate the previously stored energy density...
    result = braKetMult(wfc[gid], energy[gid]);
    energy[gid].x += result.x;
    energy[gid].y += result.y;

    // ...and the contribution of the current operator, <wfc| op |wfc>.
    //result = complexMultiply(op[gid],wfc[gid]);
    result = braKetMult(wfc[gid], complexMultiply(op[gid],wfc[gid]));
    energy[gid].x += result.x;
    energy[gid].y += result.y;
}
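// Note (added, not from the original source): energy[] holds a per-grid-point
// energy density; the total energy is presumably obtained on the host by
// summing energy[] over the grid (e.g. with the multipass reduction above)
// and multiplying by the volume element.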
// Function to multiply a double* with an astval
__global__ void ast_mult(double *array, double *array_out, EqnNode_gpu *eqn,
                         double dx, double dy, double dz, double time,
                         int element_num){
    int gid = getGid3d3d();
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    int zid = blockIdx.z*blockDim.z + threadIdx.z;

    double val = evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
                                  time, element_num);

    array_out[gid] = array[gid] * val;
}
// Function to multiply a double2* with an astval
__global__ void ast_cmult(double2 *array, double2 *array_out, EqnNode_gpu *eqn,
                          double dx, double dy, double dz, double time,
                          int element_num){
    int gid = getGid3d3d();
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    int zid = blockIdx.z*blockDim.z + threadIdx.z;

    double val = evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
                                  time, element_num);

    array_out[gid].x = array[gid].x * val;
    array_out[gid].y = array[gid].y * val;
}
// Function to multiply an AST V in real or imaginary time evolution
__global__ void ast_op_mult(double2 *array, double2 *array_out,
                            EqnNode_gpu *eqn,
                            double dx, double dy, double dz, double time,
                            int element_num, int evolution_type, double dt){
    int gid = getGid3d3d();
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    int zid = blockIdx.z*blockDim.z + threadIdx.z;

    double val = evaluate_eqn_gpu(eqn, xid*dx, yid*dy, zid*dz,
                                  time, element_num);
    double2 complex_val = make_complex(val*dt, evolution_type);

    array_out[gid].x = array[gid].x * complex_val.x
                       - array[gid].y * complex_val.y;
    array_out[gid].y = array[gid].x * complex_val.y
                       + array[gid].y * complex_val.x;
}
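// In effect (added for clarity, not from the original source): with V the
// value of the AST potential at this grid point, the wavefunction is
// multiplied by exp(-i*V*dt) for real-time evolution or exp(-V*dt) for
// imaginary-time evolution, matching real_ast/im_ast below.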
// Function to find the ast in real-time dynamics
__device__ double2 real_ast(double val, double dt){
    return {cos(-val*dt), sin(-val*dt)};
}
// Function to find the ast in imaginary-time dynamics
__device__ double2 im_ast(double val, double dt){
    return {exp(-val*dt), 0};
}
__global__ void zeros(bool *in, bool *out){
    int gid = getGid3d3d();
    out[gid] = 0;
}
__global__ void set_eq(double *in1, double *in2){
    int gid = getGid3d3d();
    in2[gid] = in1[gid];
}
//##############################################################################
//##############################################################################
/*
 * Routine for parallel summation. Can be looped over from host.
 */
template<typename T> __global__ void pSumT(T* in1, T* output, int pass){
    unsigned int tid = threadIdx.x;
    unsigned int bid = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x;// printf("bid0=%d\n",bid);
    unsigned int gid = getGid3d3d();
    // Dynamic shared memory, cast to T to avoid clashing with the other
    // extern __shared__ declarations in this file.
    extern __shared__ unsigned char pSumT_smem[];
    T *sdata = reinterpret_cast<T*>(pSumT_smem);
    sdata[tid] = in1[gid];
    __syncthreads();
    for(int i = blockDim.x>>1; i > 0; i>>=1){
        if(tid < i){
            sdata[tid] += sdata[tid + i];
        }
        __syncthreads();
    }
    if(tid == 0){
        output[bid] = sdata[0];
    }
}
/*
 * Routine for parallel summation. Can be looped over from host. BETA
 */
__global__ void pSum(double* in1, double* output, int pass){
    unsigned int tid = threadIdx.x;
    unsigned int bid = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x;// printf("bid0=%d\n",bid);
    unsigned int gid = getGid3d3d();
    extern __shared__ double sdata2[];
    // Load one element per thread before reducing.
    sdata2[tid] = in1[gid];
    __syncthreads();
    for(int i = blockDim.x>>1; i > 0; i>>=1){
        if(tid < i){
            sdata2[tid] += sdata2[tid + i];
        }
        __syncthreads();
    }
    if(tid == 0){
        output[bid] = sdata2[0];
    }
}
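// Usage note (added, not from the original source): pSum and pSumT read one
// element per thread into dynamically allocated shared memory, so a launch
// needs the third <<<...>>> parameter set to blockDim.x * sizeof(element),
// e.g. pSum<<<blocks, threads, threads*sizeof(double)>>>(in, out, 0);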
//##############################################################################
//##############################################################################