GPUE  v1.0
GPU Gross-Pitaevskii Equation numerical solver for Bose-Einstein condensates
unit_test.cu
Go to the documentation of this file.
1 #include "../include/ds.h"
2 #include "../include/unit_test.h"
3 #include "../include/parser.h"
4 #include "../include/evolution.h"
5 #include "../include/init.h"
6 #include "../include/dynamic.h"
7 #include "../include/vortex_3d.h"
8 #include <string.h>
9 #include <assert.h>
10 #include <cufft.h>
11 #include <vector>
12 #include <fstream>
13 
14 // Adding tests for mathematical operator kernels
15 void math_operator_test();
16 __global__ void add_test(double2 *a, double2 *b, double2 *c);
17 __global__ void subtract_test(double2 *a, double2 *b, double2 *c);
18 __global__ void mult_test(double2 *a, double2 *b, double2 *c);
19 __global__ void mult_test(double2 *a, double b, double2 *c);
20 __global__ void pow_test(double2 *a, int b, double2 *c);
21 
22 // Tests for cufftDoubleComplex functions
23 void cufftDoubleComplex_functions_test();
24 __global__ void complexMag_test(double2 *in, double *out);
25 __global__ void complexMag2_test(double2 *in, double *out);
26 void realCompMult_test();
27 void cMult_test();
28 
29 // Tests for quantum operations
30 __global__ void make_complex_kernel(double *in, int *evolution_type,
31  double2 *out);
32 void make_complex_test();
33 void cMultPhi_test();
34 
35 // Tests for complex mathematical operations
36 void vecMult_test();
37 void scalarDiv_test();
38 void vecConj_test();
39 
40 // AST tests
41 void ast_mult_test();
42 void ast_cmult_test();
43 void ast_op_mult_test();
44 
45 // Other
46 void energyCalc_test();
47 void braKetMult_test();
48 
49 // Test for the Grid structure with parameters in it
50 void parameter_test();
51 
52 // Test for the parsing function
53 void parser_test();
54 
55 // Testing the evolve_2d function in evolution.cu
56 void evolve_test();
57 
58 // Testing the parSum function
59 void parSum_test();
60 
61 // Simple test of grid / cuda stuff
62 void grid_test2d();
63 void grid_test3d();
64 
65 // Test of 1D fft's along all 3d grids
66 void fft_test();
67 
68 // Test to check the equation parser for dynamic fields
69 void dynamic_test();
70 
71 // Test to make sure the kernel for the polynomial approx. of Bessel fxns works
72 void bessel_test();
73 
74 // Test for the vortex tracking functions in vortex_3d
75 void vortex3d_test();
76 
77 // Kernel testing will be added later
78 __device__ bool close(double a, double b, double threshold){
79  return (abs(a-b) < threshold);
80 }
81 
82 
83 /*----------------------------------------------------------------------------//
84 * MAIN
85 *-----------------------------------------------------------------------------*/
86 
87 void test_all(){
88  std::cout << "Starting unit tests..." << '\n';
89  parameter_test();
90 
91  std::cout
92  << "Beginning testing of standard mathematical operation kernels...\n";
93  math_operator_test();
94 
95  std::cout << "Beginning testing of cufftDoubleComplex kernels...\n";
96  cufftDoubleComplex_functions_test();
97 
98  // Do not uncomment these 2
99  //parser_test();
100 
101  grid_test2d();
102  grid_test3d();
103  parSum_test();
104  fft_test();
105  dynamic_test();
106  bessel_test();
107  //vortex3d_test();
108  make_complex_test();
109  cMultPhi_test();
110  evolve_test();
111 
112  std::cout << "All tests completed. GPUE passed." << '\n';
113 }
114 
115 void math_operator_test(){
116 
117  // First, we need to create a set of grids and threads to read into the
118  // kernels for testing
119  dim3 grid = {1,1,1};
120  dim3 threads = {1,1,1};
121 
122  double2 *ha, *hb, *hc;
123  double2 *da, *db, *dc;
124 
125  // Allocating single-element arrays to test kernels with.
126  ha = (double2*)malloc(sizeof(double2));
127  hb = (double2*)malloc(sizeof(double2));
128  hc = (double2*)malloc(sizeof(double2));
129 
130  ha[0].x = 0.01;
131  ha[0].y = 0.1;
132  hb[0].x = 0.02;
133  hb[0].y = 0.2;
134 
135  cudaMalloc((void**) &da, sizeof(double2));
136  cudaMalloc((void**) &db, sizeof(double2));
137  cudaMalloc((void**) &dc, sizeof(double2));
138 
139  cudaMemcpy(da, ha, sizeof(double2), cudaMemcpyHostToDevice);
140  cudaMemcpy(db, hb, sizeof(double2), cudaMemcpyHostToDevice);
141 
142  add_test<<<grid, threads>>>(da, db, dc);
143  cudaMemcpy(hc, dc, sizeof(double2), cudaMemcpyDeviceToHost);
144 
145  if (abs(hc[0].x - 0.03) > 1e-16 || abs(hc[0].y - 0.3) > 1e-16){
146  std::cout << "Complex addition test failed!\n";
147  exit(1);
148  }
149 
150  subtract_test<<<grid, threads>>>(da, db, dc);
151  cudaMemcpy(hc, dc, sizeof(double2), cudaMemcpyDeviceToHost);
152 
153  if (hc[0].x != -0.01 || hc[0].y != -0.1){
154  std::cout << "Complex subtraction test failed!\n";
155  exit(1);
156  }
157 
158  pow_test<<<grid, threads>>>(da, 3, dc);
159  cudaMemcpy(hc, dc, sizeof(double2), cudaMemcpyDeviceToHost);
160 
161  if (abs(hc[0].x + 0.000299) > 1e-16 || abs(hc[0].y + 0.00097) > 1e-16){
162  std::cout << "Complex power test failed!\n";
163  exit(1);
164  }
165 
166  mult_test<<<grid, threads>>>(da, db, dc);
167  cudaMemcpy(hc, dc, sizeof(double2), cudaMemcpyDeviceToHost);
168 
169  if (abs(hc[0].x + 0.0198) > 1e-16 || abs(hc[0].y - 0.004) > 1e-16){
170  std::cout << "Complex multiplication test failed!\n";
171  exit(1);
172  }
173 
174  mult_test<<<grid, threads>>>(da, 3.0, dc);
175  cudaMemcpy(hc, dc, sizeof(double2), cudaMemcpyDeviceToHost);
176 
177  if (abs(hc[0].x - 0.03) > 1e-16 || abs(hc[0].y - 0.3) > 1e-16){
178  std::cout << "Complex multiplication test with real number failed!\n";
179  exit(1);
180  }
181 
182  std::cout << "Complex addition, subtraction, multiplication, and powers have been tested\n";
183 }
184 
185 __global__ void add_test(double2 *a, double2 *b, double2 *c){
186  c[0] = add(a[0],b[0]);
187 }
188 
189 __global__ void subtract_test(double2 *a, double2 *b, double2 *c){
190  c[0] = subtract(a[0],b[0]);
191 }
192 
193 __global__ void pow_test(double2 *a, int b, double2 *c){
194  c[0] = pow(a[0],b);
195 }
196 
197 __global__ void mult_test(double2 *a, double2 *b, double2 *c){
198  c[0] = mult(a[0],b[0]);
199 }
200 
201 __global__ void mult_test(double2 *a, double b, double2 *c){
202  c[0] = mult(a[0],b);
203 }
204 
205 void cufftDoubleComplex_functions_test(){
206 
207  // first creating the grid and threads
208  dim3 grid = {1,1,1};
209  dim3 threads = {1,1,1};
210 
211  double *hval_double, *dval_double, *dout, *hout;
212  double2 *hval_double2, *dval_double2;
213 
214  double2 *hin, *din;
215 
216  hval_double = (double*)malloc(sizeof(double));
217  hval_double2 = (double2*)malloc(sizeof(double2));
218  hout = (double*)malloc(sizeof(double));
219  hin = (double2*)malloc(sizeof(double2));
220 
221  hval_double[0] = 3.0;
222  hval_double2[0].x = 0.3;
223  hval_double2[0].y = 0.4;
224 
225  hin[0].x = 3.0;
226  hin[0].y = 4.0;
227 
228  cudaMalloc((void**)&dval_double, sizeof(double));
229  cudaMalloc((void**)&dval_double2, sizeof(double2));
230  cudaMalloc((void**)&dout, sizeof(double));
231  cudaMalloc((void**)&din, sizeof(double2));
232 
233 
234  // Testing make_cufftDoubleComplex function
235  cudaMemcpy(dval_double, hval_double, sizeof(double),
236  cudaMemcpyHostToDevice);
237 
238  make_cufftDoubleComplex<<<grid, threads>>>(dval_double, dval_double2);
239 
240  cudaMemcpy(hval_double2, dval_double2, sizeof(double2),
241  cudaMemcpyDeviceToHost);
242 
243  if (hval_double2[0].x != 3.0 || hval_double2[0].y != 0){
244  std::cout << "Test of make_cufftDoubleComplex failed!\n";
245  exit(1);
246  }
247 
248  // testing device complexMagnitude function
249  cudaMemcpy(din, hin, sizeof(double2), cudaMemcpyHostToDevice);
250  complexMag_test<<<grid, threads>>>(din, dout);
251 
252  cudaMemcpy(hout, dout, sizeof(double), cudaMemcpyDeviceToHost);
253 
254  if (hout[0] != 5.0){
255  std::cout << hout[0] << '\n';
256  std::cout << "Test of device complexMagnitude failed!\n";
257  exit(1);
258  }
259 
260  // Testing global complexMagnitude function
261  complexMagnitude<<<grid, threads>>>(din, dout);
262  cudaMemcpy(hout, dout, sizeof(double), cudaMemcpyDeviceToHost);
263 
264  if (hout[0] != 5.0){
265  std::cout << hout[0] << '\n';
266  std::cout << "Test of global complexMagnitude failed!\n";
267  exit(1);
268  }
269 
270  complexMag2_test<<<grid, threads>>>(din, dout);
271 
272  cudaMemcpy(hout, dout, sizeof(double), cudaMemcpyDeviceToHost);
273 
274  if (hout[0] != 25.0){
275  std::cout << hout[0] << '\n';
276  std::cout << "Test of device complexMagnitudeSquared failed!\n";
277  exit(1);
278  }
279 
280  // Testing global complexMagnitude function
281  complexMagnitudeSquared<<<grid, threads>>>(din, dout);
282  cudaMemcpy(hout, dout, sizeof(double), cudaMemcpyDeviceToHost);
283 
284  if (hout[0] != 25.0){
285  std::cout << hout[0] << '\n';
286  std::cout << "Test of global complexMagnitudeSquared failed!\n";
287  exit(1);
288  }
289 
290 
291  std::cout << "make_cufftDoubleComplex, and complexMagnitude[Squared] have been tested\n";
292 
293 }
294 
295 __global__ void complexMag_test(double2 *in, double *out){
296  out[0] = complexMagnitude(in[0]);
297 }
298 
299 __global__ void complexMag2_test(double2 *in, double *out){
300  out[0] = complexMagnitudeSquared(in[0]);
301 }
302 
303 // Test to check the equation parser for dynamic fields
304 // For this test, we will need a general set of parameters to read in and a
305 // standard equation string to look at.
306 void dynamic_test(){
307 
308  std::cout << "Beginning test of dynamic functions..." <<'\n';
309  std::string eqn_string = "(((3*x)+7)+(5-7)+cos(0)*1)+pow(120,2)";
310 
311  Grid par;
312  par.store("x",5.0);
313  par.store("y",5.0);
314  par.store("z",5.0);
315  par.store("omegaX",5.0);
316  par.store("omegaY",5.0);
317  par.store("omegaZ",5.0);
318  par.store("xMax",5.0);
319  par.store("yMax",5.0);
320  par.store("zMax",5.0);
321  par.store("fudge",5.0);
322  par.store("mass",5.0);
323  std::string val_string = "check_var";
324 
325  EqnNode eqn_tree = parse_eqn(par, eqn_string, val_string);
326 
327  std::cout << "finding the number of elements in abstract syntax tree...\n";
328 
329  int num = 0;
330  find_element_num(eqn_tree, num);
331  int element_num = num;
332 
333  std::cout << "Total number of elements is: " << num << '\n';
334 
335  std::cout << "Now to copy the tree to the GPU..." << '\n';
336 
337  EqnNode_gpu *eqn_gpu, *eqn_cpu;
338  eqn_cpu = (EqnNode_gpu *)malloc(sizeof(EqnNode_gpu)*element_num);
339  num = 0;
340  tree_to_array(eqn_tree, eqn_cpu, num);
341 
342 /*
343  for (int i = 0; i < num; ++i){
344  std::cout << eqn_cpu[i].val << '\n';
345  std::cout << eqn_cpu[i].left << '\n';
346  std::cout << eqn_cpu[i].right << '\n' << '\n';
347  }
348 */
349 
350  cudaMalloc((void**)&eqn_gpu, sizeof(EqnNode_gpu)*element_num);
351  cudaMemcpy(eqn_gpu, eqn_cpu, sizeof(EqnNode_gpu)*element_num,
352  cudaMemcpyHostToDevice);
353 
354  // Now to check some simple evaluation
355  std::cout << "Now to check simple GPU evaluation..." << '\n';
356  int n = 64;
357  double *array, *array_gpu;
358  array = (double *)malloc(sizeof(double)*n);
359  cudaMalloc(&array_gpu, sizeof(double)*n);
360 
361  int threads = 64;
362  int grid = (int)ceil((float)n/threads);
363 
364  //zeros<<<grid, threads>>>(array_gpu, n);
365  find_field<<<grid, threads>>>(array_gpu, 1, 0.0, 0.0, 0.0, 1,1,1,eqn_gpu);
366 
367  cudaMemcpy(array, array_gpu, sizeof(double)*n, cudaMemcpyDeviceToHost);
368 
369  for (int i = 0; i < n; ++i){
370  double eqn_val = (((3*i)+7)+(5-7)+cos(0)*1)+pow(120,2);
371  if (array[i] != eqn_val){
372  std::cout << "GPU evaluation failed in dynamic test!\n";
373  assert(array[i] == eqn_val);
374  }
375  }
376 
377  // Now testing simple parsing of example "example.cfg"
378  std::cout << "Testing simple parameter parsing." << '\n';
379  par.store("param_file", (std::string)"src/example.cfg");
380  parse_param_file(par);
381  EqnNode_gpu *eqn = par.astval("V");
382  find_field<<<grid, threads>>>(array_gpu, 0.1, 0.1, 0.1, 1, 1, 1, 0, eqn);
383  cudaDeviceSynchronize();
384 
385  cudaMemcpy(array, array_gpu, sizeof(double)*n, cudaMemcpyDeviceToHost);
386 
387 /*
388  for (int i = 0; i < n; ++i){
389  std::cout << array[i] << '\n';
390  }
391 */
392 
393  std::cout << "Dynamic tests passed" <<'\n';
394 }
395 
396 __global__ void bessel_test_kernel(double *j, double *j_poly, bool *val){
397  int xid = blockDim.x*blockIdx.x + threadIdx.x;
398  j[xid] = j0(xid * 2.0 / 128);
399  j_poly[xid] = poly_j(0,xid * 2.0 / 128, 40);
400 
401  if (!close(j[xid],j_poly[xid], 0.0001)){
402  val[0] = false;
403  printf("Error at element %u in Bessel test!\tValues: %f, %f\n",
404  xid, j[xid], j_poly[xid]);
405  }
406 }
407 
408 // Test for bessel functions
409 void bessel_test(){
410 
411  std::cout << "Testing Bessel Functions..." << '\n';
412 
413  double *j_gpu, *j_poly_gpu;
414  bool *val, *val_gpu;
415  int n = 128;
416  cudaMalloc((void **)&j_gpu, sizeof(double)*n);
417  cudaMalloc((void **)&j_poly_gpu, sizeof(double)*n);
418 
419  cudaMalloc((void **)&val_gpu, sizeof(bool));
420  val = (bool *)malloc(sizeof(bool));
421  val[0] = true;
422  cudaMemcpy(val_gpu, val, sizeof(bool), cudaMemcpyHostToDevice);
423 
424  bessel_test_kernel<<<64,2>>>(j_gpu, j_poly_gpu, val_gpu);
425  cudaMemcpy(val, val_gpu, sizeof(bool), cudaMemcpyDeviceToHost);
426 
427  if(val[0]){
428  std::cout << "Bessel Test Passed!" << '\n';
429  }
430  else{
431  std::cout << "Bessel Test Failed!" << '\n';
432  exit(1);
433  }
434 
435 }
436 
437 // Test of 1D fft's along all 3d grids
438 // In particular, we need to test the generate_plan_other3d function
439 // These will be checked against 1d
440 void fft_test(){
441 
442  std::cout << "Beginning cufft test.\n";
443 
444  // For these tests, we are assuming that the x, y and z dimensions are
445  // All the same (2x2x2)
446  // Note that yDim needs to be singled out differently, but z/x need no loops
447 
448  // now we need to create the necessary parameters and store everything
449  int xDim = 2;
450  int yDim = 2;
451  int zDim = 2;
452  int gsize = xDim * yDim * zDim;
453 
454  Grid par;
455  par.store("xDim", xDim);
456  par.store("yDim", yDim);
457  par.store("zDim", zDim);
458 
459  cufftHandle plan_x, plan_y, plan_z;
460  // Now creating the plans
461  generate_plan_other3d(&plan_x, par, 0);
462  generate_plan_other3d(&plan_y, par, 1);
463  generate_plan_other3d(&plan_z, par, 2);
464 
465  // And the result / error
466  cudaError_t err;
467  cufftResult result;
468 
469  // Creating the initial array for the x dimension fft
470  double2 *array, *gpu_array;
471  array = (double2 *) malloc(sizeof(double2)*gsize);
472  cudaMalloc((void**) &gpu_array, sizeof(double2)*gsize);
473  for (int i = 0; i < gsize; i++){
474  array[i].x = 1;
475  array[i].y = 0;
476  }
477 
478  // transferring to gpu
479  err = cudaMemcpy(gpu_array, array, sizeof(double2)*gsize,
480  cudaMemcpyHostToDevice);
481  if (err != cudaSuccess){
482  std::cout << "Could not coppy array to device!" << '\n';
483  std::cout << "error code: " << err << '\n';
484  exit(1);
485  }
486 
487  // Performing the x transformation
488  for (int i = 0; i < yDim; i++){
489  result = cufftExecZ2Z(plan_y, &gpu_array[i*xDim*yDim],
490  &gpu_array[i*xDim*yDim], CUFFT_FORWARD);
491  }
492  //result = cufftExecZ2Z(plan_z, gpu_array, gpu_array, CUFFT_FORWARD);
493 
494  // transferring back to host to check output
495  err = cudaMemcpy(array, gpu_array, sizeof(double2)*gsize,
496  cudaMemcpyDeviceToHost);
497  if (err != cudaSuccess){
498  std::cout << "Could not coppy gpu_array to host!" << '\n';
499  std::cout << "error code: " << err << '\n';
500  exit(1);
501  }
502 
503 /*
504  for (int i = 0; i < gsize; i++){
505  std::cout << array[i].x << '\t' << array[i].y << '\n';
506  }
507 */
508 
509  // Now to try the inverse direction
510 
511  for (int i = 0; i < yDim; i++){
512  result = cufftExecZ2Z(plan_y, &gpu_array[i*xDim*yDim],
513  &gpu_array[i*xDim*yDim], CUFFT_INVERSE);
514  }
515  //result = cufftExecZ2Z(plan_z, gpu_array, gpu_array, CUFFT_INVERSE);
516 
517  // copying back
518  err = cudaMemcpy(array, gpu_array, sizeof(double2)*gsize,
519  cudaMemcpyDeviceToHost);
520  if (err != cudaSuccess){
521  std::cout << "Could not coppy gpu_array to host!" << '\n';
522  std::cout << "error code: " << err << '\n';
523  exit(1);
524  }
525 
526 /*
527  for (int i = 0; i < gsize; i++){
528  std::cout << array[i].x << '\t' << array[i].y << '\n';
529  }
530 */
531 
532  std::cout << "cufft test passed!\n";
533 
534 
535 
536 }
537 
538 // Simple test of CUDA grid stuff
539 void grid_test2d(){
540 
541  std::cout << "testing grid / threads and stuff" << '\n';
542 
543  int max_threads = 256;
544 
545  int xDim = 1024;
546  int yDim = 1024;
547  int zDim = 1;
548 
549  int xD = 1, yD = 1, zD = 1;
550 
551  int gsize = xDim * yDim;
552 
553  // Now to set up the CUDA grid / threads
554  dim3 block;
555  dim3 grid;
556 
557  Grid par;
558  par.store("xDim",xDim);
559  par.store("yDim",yDim);
560  par.store("zDim",zDim);
561  par.store("dimnum", 2);
562 
563  generate_grid(par);
564 
565  if (xDim <= max_threads){
566  block.x = xDim;
567  block.y = 1;
568  block.z = 1;
569 
570  xD = 1;
571  yD = yDim;
572  zD = 1;
573  }
574  else{
575  int count = 0;
576  int dim_tmp = xDim;
577  while (dim_tmp > max_threads){
578  count++;
579  dim_tmp /= 2;
580  }
581 
582  std::cout << "count is: " << count << '\n';
583 
584  block.x = dim_tmp;
585  block.y = 1;
586  block.z = 1;
587  xD = pow(2,count);
588  yD = yDim;
589  zD = 1;
590  }
591 
592  std::cout << "threads in x are: " << block.x << '\n';
593  std::cout << "dimensions are: " << xD << '\t' << yD << '\t' << zD << '\n';
594 
595  grid.x=xD;
596  grid.y=yD;
597  grid.z=zD;
598 
599  if (grid.x != par.grid.x || block.x != par.threads.x ||
600  grid.y != par.grid.y || block.y != par.threads.y ||
601  grid.z != par.grid.z || block.z != par.threads.z){
602  std::cout << "Gridding test 2D failed! Improper generation of threads!"
603  << '\n';
604  assert(grid.x == par.grid.x);
605  assert(grid.y == par.grid.y);
606  assert(grid.z == par.grid.z);
607  assert(block.x == par.threads.x);
608  assert(block.y == par.threads.y);
609  assert(block.z == par.threads.z);
610  }
611 
612  int total_threads = block.x * block.y * block.z;
613 
614  // Now we need to initialize our double * and send it to the gpu
615  double *host_array, *device_array;
616  host_array = (double *) malloc(sizeof(double)*gsize);
617  cudaMalloc((void**) &device_array, sizeof(double)*gsize);
618 
619  // initializing 2d array
620  for (int i = 0; i < gsize; i++){
621  host_array[i] = -1;
622  }
623 
624  // Now to copy to device
625  cudaMemcpy(device_array, host_array,
626  sizeof(double)*gsize,
627  cudaMemcpyHostToDevice);
628 
629  // Test
630  thread_test<<<grid,block>>>(device_array,device_array);
631 
632  // Now to copy back and print
633  cudaMemcpy(host_array, device_array,
634  sizeof(double)*gsize,
635  cudaMemcpyDeviceToHost);
636 
637 
638  for (int i = 0; i < xDim; ++i){
639  for (int j = 0; j < yDim; ++j){
640  int index = i*yDim + j;
641  if (host_array[index] != index){
642  std::cout << "Threadding values improperly set!\n";
643  assert(host_array[index] == index);
644  }
645  }
646  }
647 
648  std::cout << "2d grid tests completed. Now for 3d cases" << '\n';
649 
650 }
651 
652 // Simple test of CUDA grid stuff
653 void grid_test3d(){
654 
655  std::cout << "testing grid / threads and stuff for 3d" << '\n';
656 
657  int max_threads = 256;
658 
659  int xDim = 256;
660  int yDim = 256;
661  int zDim = 256;
662 
663  int xD = 1, yD = 1, zD = 1;
664 
665  int gsize = xDim * yDim * zDim;
666 
667  // Now to set up the CUDA grid / threads
668  dim3 block;
669  dim3 grid;
670 
671  Grid par;
672  par.store("xDim",xDim);
673  par.store("yDim",yDim);
674  par.store("zDim",zDim);
675  par.store("dimnum", 3);
676 
677  generate_grid(par);
678 
679 
680  if (xDim <= max_threads){
681  block.x = xDim;
682  block.y = 1;
683  block.z = 1;
684 
685  xD = 1;
686  yD = yDim;
687  zD = zDim;
688  }
689  else{
690  int count = 0;
691  int dim_tmp = xDim;
692  while (dim_tmp > max_threads){
693  count++;
694  dim_tmp /= 2;
695  }
696 
697  std::cout << "count is: " << count << '\n';
698 
699  block.x = dim_tmp;
700  block.y = 1;
701  block.z = 1;
702  xD = pow(2,count);
703  yD = yDim;
704  zD = zDim;
705  }
706 
707  std::cout << "threads in x are: " << block.x << '\n';
708  std::cout << "dimensions are: " << xD << '\t' << yD << '\t' << zD << '\n';
709 
710  grid.x=xD;
711  grid.y=yD;
712  grid.z=zD;
713 
714  if (grid.x != par.grid.x || block.x != par.threads.x ||
715  grid.y != par.grid.y || block.y != par.threads.y ||
716  grid.z != par.grid.z || block.z != par.threads.z){
717  std::cout << "Gridding test 3D failed! Improper generation of threads!"
718  << '\n';
719  assert(grid.x == par.grid.x);
720  assert(grid.y == par.grid.y);
721  assert(grid.z == par.grid.z);
722  assert(block.x == par.threads.x);
723  assert(block.y == par.threads.y);
724  assert(block.z == par.threads.z);
725  }
726 
727 
728  int total_threads = block.x * block.y * block.z;
729 
730  // Now we need to initialize our double * and send it to the gpu
731  double *host_array, *device_array;
732  host_array = (double *) malloc(sizeof(double)*gsize);
733  cudaMalloc((void**) &device_array, sizeof(double)*gsize);
734 
735  // initializing 2d array
736  for (int i = 0; i < gsize; i++){
737  host_array[i] = -1;
738  }
739 
740  // Now to copy to device
741  cudaMemcpy(device_array, host_array,
742  sizeof(double)*gsize,
743  cudaMemcpyHostToDevice);
744 
745  // Test
746  thread_test<<<grid,block>>>(device_array,device_array);
747 
748  // Now to copy back and print
749  cudaMemcpy(host_array, device_array,
750  sizeof(double)*gsize,
751  cudaMemcpyDeviceToHost);
752 
753 
754  for (int i = 0; i < xDim; ++i){
755  for (int j = 0; j < yDim; ++j){
756  for (int k = 0; k < zDim; ++k){
757  int index = i*yDim*zDim + j*yDim + k;
758  if (host_array[index] != index){
759  std::cout << "Threadding values improperly set!\n";
760  assert(host_array[index] == index);
761  }
762  }
763  }
764  }
765 
766  std::cout << "3d grid tests completed. now for 3d cases" << '\n';
767 
768 }
769 
770 // Test of the parSum function in 3d
771 void parSum_test(){
772 
773  std::cout << "Beginning test of parallel summation.\n";
774 
775  // Setting error
776  cudaError_t err;
777 
778  // first, we need to initialize the Grid and Cuda classes
779  Grid par;
780 
781  // 2D test first
782 
783  // For now, we will assume an 64x64 array for summing
784  dim3 threads(16, 1, 1);
785  int total_threads = threads.x*threads.y*threads.z;
786 
787  double dx = 0.1;
788  double dy = 0.1;
789  double dz = 0.1;
790 
791  par.store("dimnum", 2);
792  par.store("xDim", 64);
793  par.store("yDim", 64);
794  par.store("zDim", 1);
795  par.store("dx",dx);
796  par.store("dy",dy);
797  par.store("dz",dz);
798  par.threads = threads;
799 
800  // Now we need to initialize the grid for the getGid3d3d kernel
801  int gsize = 64*64;
802  dim3 grid;
803  grid.x = 4;
804  grid.y = 64;
805 
806  par.grid = grid;
807 
808  // now we need to initialize the wfc to all 1's;
809  double2 *wfc, *host_sum;
810  wfc = (cufftDoubleComplex *) malloc(sizeof(cufftDoubleComplex) * gsize);
811  host_sum = (cufftDoubleComplex *)
812  malloc(sizeof(cufftDoubleComplex) * gsize / total_threads);
813 
814  // init wfc
815  for (int i = 0; i < gsize; i++){
816  wfc[i].x = 2;
817  wfc[i].y = 2;
818  }
819 
820  double2 *gpu_wfc;
821  cudaMalloc((void**) &gpu_wfc, sizeof(cufftDoubleComplex)*gsize);
822 
823  // copying wfc to device
824  err = cudaMemcpy(gpu_wfc, wfc, sizeof(cufftDoubleComplex)*gsize,
825  cudaMemcpyHostToDevice);
826 
827  if (err!=cudaSuccess){
828  std::cout << "ERROR: Could not copy wfc to device!" << '\n';
829  }
830 
831  // Creating parsum on device
832 
833  parSum(gpu_wfc, par);
834 
835  // copying parsum back
836  err = cudaMemcpy(wfc, gpu_wfc,
837  sizeof(cufftDoubleComplex)*gsize,
838  cudaMemcpyDeviceToHost);
839  if (err!=cudaSuccess){
840  std::cout << err << '\n';
841  std::cout << "ERROR: Could not copy wfc to the host!" << '\n';
842  exit(1);
843  }
844 
845  for (int i = 0; i < gsize; ++i){
846  if (wfc[i].x != 2/sqrt(32768.0*dx*dy) ||
847  wfc[i].y != 2/sqrt(32768.0*dx*dy)){
848  std::cout << "Wavefunction not normalized!" << '\n';
849  std::cout << i << '\t' << wfc[i].x << '\t' << wfc[i].y << '\n';
850  assert(wfc[i].x == 2/sqrt(32768.0*dx*dy));
851  assert(wfc[i].y == 2/sqrt(32768.0*dx*dy));
852  }
853  }
854 
855  // Now for the 3d case
856  // For now, we will assume a 16x16x16 array for summing
857  par.store("dimnum", 3);
858  par.store("xDim", 16);
859  par.store("yDim", 16);
860  par.store("zDim", 16);
861  par.store("dx",dx);
862  par.store("dy",dy);
863  par.store("dz",dz);
864 
865  // Now we need to initialize the grid for the getGid3d3d kernel
866  grid.x = 16;
867  grid.y = 16;
868  grid.z = 16;
869 
870  par.grid = grid;
871 
872  // copying host wfc back to device
873  err = cudaMemcpy(gpu_wfc, wfc, sizeof(cufftDoubleComplex)*gsize,
874  cudaMemcpyHostToDevice);
875 
876  parSum(gpu_wfc, par);
877 
878  // copying parsum back
879  err = cudaMemcpy(wfc, gpu_wfc,
880  sizeof(cufftDoubleComplex)*gsize,
881  cudaMemcpyDeviceToHost);
882  if (err!=cudaSuccess){
883  std::cout << "ERROR: Could not copy wfc to the host!" << '\n';
884  exit(1);
885  }
886 
887  for (int i = 0; i < gsize; ++i){
888  if (wfc[i].x != 2/sqrt(32768.0*dx*dy*dz) ||
889  wfc[i].y != 2/sqrt(32768.0*dx*dy*dz)){
890  std::cout << "Wavefunction not normalized!" << '\n';
891  std::cout << wfc[i].x << '\t' << wfc[i].y << '\n';
892  assert(wfc[i].x == 2/sqrt(32768.0*dx*dy*dz));
893  assert(wfc[i].y == 2/sqrt(32768.0*dx*dy*dz));
894  }
895  }
896 
897  std::cout << "Parallel summation test passed in 2 and 3D!\n";
898 
899 }
900 
901 // Test for the Grid structure with paramters in it
902 // Initialize all necessary variables and read them back out
903 void parameter_test(){
904  // For this test, we simply need to read in and out stuff from each
905  // class and structure in ds.h / ds.cc
906 
907  // Certain variables will be used multiple times.
908  double *dstar_var;
909  dstar_var = (double *)malloc(sizeof(double) * 5);
910  cufftDoubleComplex *cdc_var;
911  cdc_var = (cufftDoubleComplex *)malloc(sizeof(cufftDoubleComplex) * 5);
912  for (int i = 0; i < 5; ++i){
913  dstar_var[i] = (double)i * 0.5;
914  cdc_var[i].x = (double)i * 0.5;
915  cdc_var[i].y = (double)i * 0.5;
916  }
917 
918  double dvar = 1.05;
919  int ivar = 5;
920  bool bvar = true;
921 
922  // Now testing the Grid class
923  Grid grid_test;
924  grid_test.store("dstar_var",dstar_var);
925  grid_test.store("dvar", dvar);
926  grid_test.store("ivar", ivar);
927  grid_test.store("bvar", bvar);
928 
929  assert(dstar_var == grid_test.dsval("dstar_var"));
930  assert(dvar == grid_test.dval("dvar"));
931  assert(ivar == grid_test.ival("ivar"));
932  assert(bvar == grid_test.bval("bvar"));
933 
934  std::cout << "Grid class checked, now checking the Cuda class..." << '\n';
935 
936  std::cout << "All data structures checked" << '\n';
937 
938 }
939 
940 // Test for the parsing function
941 void parser_test(){
942 
943  // Testing the command-line parser with defaults and with modifications
944  std::cout << "Testing command-line parser with no arguments..." << '\n';
945 
946  // First testing default values in and out of the parser function
947  char *fake_noargv[] = {NULL};
948  Grid noarg_grid;
949  noarg_grid = parseArgs(0,fake_noargv);
950 
951  // Checking contents of noarg_grid:
952  assert(noarg_grid.ival("xDim") == 256);
953  assert(noarg_grid.ival("yDim") == 256);
954  assert(noarg_grid.ival("zDim") == 256);
955  assert(noarg_grid.dval("omega") == 0);
956  assert(noarg_grid.dval("gammaY") == 1.0);
957  assert(noarg_grid.ival("gsteps") == 1);
958  assert(noarg_grid.ival("esteps") == 1);
959  assert(noarg_grid.dval("gdt") == 1e-4);
960  assert(noarg_grid.dval("dt") == 1e-4);
961  assert(noarg_grid.ival("device") == 0);
962  assert(noarg_grid.ival("atoms") == 1);
963  assert(noarg_grid.bval("read_wfc") == false);
964  assert(noarg_grid.ival("printSteps") == 100);
965  assert(noarg_grid.dval("winding") == 0);
966  assert(noarg_grid.bval("corotating") == false);
967  assert(noarg_grid.bval("gpe") == false);
968  assert(noarg_grid.dval("omegaZ") == 6.283);
969  assert(noarg_grid.dval("interaction") == 1);
970  assert(noarg_grid.dval("laser_power") == 0);
971  assert(noarg_grid.dval("angle_sweep") == 0);
972  assert(noarg_grid.ival("kick_it") == 0);
973  assert(noarg_grid.bval("write_it") == false);
974  assert(noarg_grid.dval("x0_shift") == 0);
975  assert(noarg_grid.dval("y0_shift") == 0);
976  assert(noarg_grid.dval("z0_shift") == 0);
977  assert(noarg_grid.dval("sepMinEpsilon") == 0);
978  assert(noarg_grid.bval("graph") == false);
979  assert(noarg_grid.bval("unit_test") == false);
980  assert(noarg_grid.dval("omegaX") == 6.283);
981  assert(noarg_grid.dval("omegaY") == 6.283);
982  assert(noarg_grid.sval("data_dir") == "data/");
983  assert(noarg_grid.bval("ramp") == false);
984  assert(noarg_grid.ival("ramp_type") == 1);
985  assert(noarg_grid.ival("dimnum") == 2);
986  assert(noarg_grid.bval("write_file") == true);
987  assert(noarg_grid.dval("fudge") == 1.0);
988  assert(noarg_grid.ival("kill_idx") == -1);
989  assert(noarg_grid.dval("mask_2d") == 1.5e-4);
990  assert(noarg_grid.dval("box_size") == 2.5e-5);
991  assert(noarg_grid.bval("found_sobel") == false);
992  assert(noarg_grid.Afn == "rotation");
993  assert(noarg_grid.Kfn == "rotation_K");
994  assert(noarg_grid.Vfn == "2d");
995  assert(noarg_grid.Wfcfn == "2d");
996  assert(noarg_grid.sval("conv_type") == "FFT");
997  assert(noarg_grid.ival("charge") == 0);
998  assert(noarg_grid.bval("flip") == false);
999 
1000  // Now testing all values specified by command-line arguments
1001  std::cout << "Testing command-line parser with all arguments..." << '\n';
1002  std::vector<std::string> argarray(10);
1003 
1004  // I apologize for the mess... If you have a better way of creating the
1005  // char ** for this without running into memory issues, let me know!
1006  char *fake_fullargv[] = {strdup("./gpue"),
1007  strdup("-A"), strdup("rotation"),
1008  strdup("-a"),
1009  strdup("-b"), strdup("2.5e-5"),
1010  strdup("-C"), strdup("0"),
1011  strdup("-c"), strdup("3"),
1012  strdup("-D"), strdup("data"),
1013  strdup("-E"),
1014  strdup("-e"), strdup("1"),
1015  strdup("-f"),
1016  strdup("-G"), strdup("1"),
1017  strdup("-g"), strdup("1"),
1018  strdup("-i"), strdup("1"),
1019  strdup("-K"), strdup("0"),
1020  strdup("-k"), strdup("0"),
1021  strdup("-L"), strdup("0"),
1022  strdup("-l"),
1023  strdup("-n"), strdup("1"),
1024  strdup("-O"), strdup("0"),
1025  strdup("-P"), strdup("0"),
1026  strdup("-p"), strdup("100"),
1027  strdup("-Q"), strdup("0"),
1028  strdup("-q"), strdup("0"),
1029  strdup("-R"), strdup("1"),
1030  //strdup("-r"),
1031  strdup("-S"), strdup("0"),
1032  strdup("-s"),
1033  strdup("-T"), strdup("1e-4"),
1034  strdup("-t"), strdup("1e-4"),
1035  strdup("-U"), strdup("0"),
1036  strdup("-V"), strdup("0"),
1037  strdup("-W"),
1038  strdup("-w"), strdup("0"),
1039  strdup("-X"), strdup("1.0"),
1040  strdup("-x"), strdup("256"),
1041  strdup("-Y"), strdup("1.0"),
1042  strdup("-y"), strdup("256"),
1043  strdup("-Z"), strdup("6.283"),
1044  strdup("-z"), strdup("256"),
1045  NULL};
1046  int fake_argc = sizeof(fake_fullargv) / sizeof(char *) - 1;
1047 
1048  // Now to read into gpue and see what happens
1049  Grid fullarg_grid;
1050  fullarg_grid = parseArgs(fake_argc, fake_fullargv);
1051 
1052  // Checking contents of fullarg_grid:
1053  assert(fullarg_grid.ival("xDim") == 256);
1054  assert(fullarg_grid.ival("yDim") == 256);
1055  assert(fullarg_grid.ival("zDim") == 256);
1056  assert(fullarg_grid.dval("omega") == 0);
1057  assert(fullarg_grid.dval("gammaY") == 1.0);
1058  assert(fullarg_grid.ival("gsteps") == 1);
1059  assert(fullarg_grid.ival("esteps") == 1);
1060  assert(fullarg_grid.dval("gdt") == 1e-4);
1061  assert(fullarg_grid.dval("dt") == 1e-4);
1062  assert(fullarg_grid.ival("device") == 0);
1063  assert(fullarg_grid.ival("atoms") == 1);
1064  assert(fullarg_grid.bval("read_wfc") == false);
1065  assert(fullarg_grid.ival("printSteps") == 100);
1066  assert(fullarg_grid.dval("winding") == 0);
1067  assert(fullarg_grid.bval("corotating") == true);
1068  assert(fullarg_grid.bval("gpe") == true);
1069  assert(fullarg_grid.dval("omegaZ") == 6.283);
1070  assert(fullarg_grid.dval("interaction") == 1);
1071  assert(fullarg_grid.dval("laser_power") == 0);
1072  assert(fullarg_grid.dval("angle_sweep") == 0);
1073  assert(fullarg_grid.ival("kick_it") == 0);
1074  assert(fullarg_grid.bval("write_it") == true);
1075  assert(fullarg_grid.dval("x0_shift") == 0);
1076  assert(fullarg_grid.dval("y0_shift") == 0);
1077  assert(fullarg_grid.dval("z0_shift") == 0);
1078  assert(fullarg_grid.dval("sepMinEpsilon") == 0);
1079  assert(fullarg_grid.bval("graph") == true);
1080  assert(fullarg_grid.bval("unit_test") == false);
1081  assert(fullarg_grid.dval("omegaX") == 1.0);
1082  assert(fullarg_grid.dval("omegaY") == 1.0);
1083  assert(fullarg_grid.sval("data_dir") == "data/");
1084  assert(fullarg_grid.bval("ramp") == true);
1085  assert(fullarg_grid.ival("ramp_type") == 1);
1086  assert(fullarg_grid.ival("dimnum") == 3);
1087  assert(fullarg_grid.bval("write_file") == false);
1088  assert(fullarg_grid.dval("fudge") == 1.0);
1089  assert(fullarg_grid.ival("kill_idx") == 0);
1090  assert(fullarg_grid.dval("mask_2d") == 1.5e-4);
1091  assert(fullarg_grid.dval("box_size") == 2.5e-5);
1092  assert(fullarg_grid.bval("found_sobel") == false);
1093  assert(fullarg_grid.Afn == "rotation");
1094  assert(fullarg_grid.Kfn == "rotation_K3d");
1095  assert(fullarg_grid.Vfn == "3d");
1096  assert(fullarg_grid.Wfcfn == "3d");
1097  assert(fullarg_grid.sval("conv_type") == "FFT");
1098  assert(fullarg_grid.ival("charge") == 0);
1099  assert(fullarg_grid.bval("flip") == true);
1100 
1101 }
1102 
1103 // Testing the evolve function in evolution.cu
1104 // This test will also test the energy function
1105 // Run the simulation in imaginary time for a simple harmonic oscillator and
1106 // check energies in 1, 2, and 3D.
1107 // Run the simulation in real time to make sure the energy remains the same
1108 void evolve_test(){
1109 
1110  std::cout << "Starting test of evolution function in nD...\n";
1111 
1112  // Setting default values
1113  Grid par;
1114 
1115  par.store("omega", 0.0);
1116  par.store("gammaY", 1.0);
1117  par.store("device", 0);
1118  par.store("read_wfc", false);
1119  par.store("winding", 0.0);
1120  par.store("corotating", false);
1121  par.store("gpe", false);
1122  par.store("interaction",1.0);
1123  par.store("laser_power",0.0);
1124  par.store("angle_sweep",0.0);
1125  par.store("kick_it", 0);
1126  par.store("x0_shift",0.0);
1127  par.store("y0_shift",0.0);
1128  par.store("z0_shift",0.0);
1129  par.store("sepMinEpsilon",0.0);
1130  par.store("graph", false);
1131  par.store("unit_test",false);
1132  par.store("data_dir", (std::string)"data/");
1133  par.store("ramp", false);
1134  par.store("ramp_type", 1);
1135  par.store("dimnum", 2);
1136  par.store("fudge", 0.0);
1137  par.store("kill_idx", -1);
1138  par.store("mask_2d", 1.5e-4);
1139  par.store("found_sobel", false);
1140  par.store("use_param_file", false);
1141  par.store("param_file","param.cfg");
1142  par.store("cyl_coord",false);
1143  par.Afn = "rotation";
1144  par.Kfn = "rotation_K";
1145  par.Vfn = "2d";
1146  par.Wfcfn = "2d";
1147  par.store("conv_type", (std::string)"FFT");
1148  par.store("charge", 0);
1149  par.store("flip", false);
1150  par.store("thresh_const", 1.0);
1151 
1152 
1153  double thresh = 0.01;
1154  std::string buffer;
1155  int gsteps = 30001;
1156  int esteps = 30001;
1157 
1158  par.store("gdt", 1e-4);
1159  par.store("dt", 1e-4);
1160  par.store("atoms", 1);
1161  par.store("omegaZ", 1.0);
1162  par.store("omegaX", 1.0);
1163  par.store("omegaY", 1.0);
1164  par.store("esteps", esteps);
1165  par.store("gsteps", gsteps);
1166  par.store("printSteps", 30000);
1167  par.store("write_file", true);
1168  par.store("write_it", true);
1169  par.store("energy_calc", true);
1170  par.store("box_size", 0.00007);
1171  par.store("xDim", 64);
1172  par.store("yDim", 1);
1173  par.store("zDim", 1);
1174 
1175 
1176  // Running through all the dimensions to check the energy
1177  for (int i = 1; i <= 3; ++i){
1178  if (i == 2){
1179  par.store("yDim", 64);
1180  }
1181  if (i == 3){
1182  par.store("zDim", 64);
1183  }
1184  par.store("dimnum",i);
1185  init(par);
1186 
1187  if (par.bval("write_file")){
1188  FileIO::writeOutParam(buffer, par, "data/Params.dat");
1189  }
1190 
1191  double omegaX = par.dval("omegaX");
1192  set_variables(par, 0);
1193 
1194  // Evolve and find the energy
1195  evolve(par, gsteps, 0, buffer);
1196 
1197  // Check that the energy is correct
1198  double energy = par.dval("energy");
1199  double energy_check = 0;
1200  energy_check = (double)i * 0.5 * HBAR * omegaX;
1201 
1202  if (abs(energy - energy_check) > thresh*energy_check){
1203  std::cout << "Energy is not correct in imaginary-time for "
1204  << i << "D!\n";
1205  assert(energy == energy_check);
1206  }
1207 
1208  // Run in real time to make sure that the energy is constant
1209  set_variables(par, 1);
1210  evolve(par, esteps, 1, buffer);
1211  double energy_ev = par.dval("energy");
1212 
1213  if (abs(energy - energy_ev) > thresh*energy_check){
1214  std::cout << "Energy is not constant in real-time for "
1215  << i << "D!\n";
1216  assert(energy == energy_ev);
1217  }
1218  }
1219 
1220 }
1221 
1222 __global__ void make_complex_kernel(double *in, int *evolution_type,
1223  double2 *out){
1224 
1225  //int id = threadIdx.x + blockIdx.x*blockDim.x;
1226  //out[id] = make_complex(in[id], evolution_type[id]);
1227  for (int i = 0; i < 3; ++i){
1228  out[i] = make_complex(in[i], evolution_type[i]);
1229  }
1230 }
1231 
1232 void make_complex_test(){
1233 
1234  // Creating a simple array to hold the 3 possible make_complex options
1235  double *input_array, *dinput_array;
1236  double2 *output_array, *doutput_array;
1237  int *evolution_type, *devolution_type;
1238 
1239  input_array = (double *)malloc(sizeof(double)*3);
1240  output_array = (double2 *)malloc(sizeof(double2)*3);
1241  evolution_type = (int *)malloc(sizeof(int)*3);
1242 
1243  input_array[0] = 10;
1244  input_array[1] = 10;
1245  input_array[2] = 10;
1246 
1247  evolution_type[0] = 0;
1248  evolution_type[1] = 1;
1249  evolution_type[2] = 2;
1250 
1251  cudaMalloc((void **)&dinput_array, sizeof(double)*3);
1252  cudaMalloc((void **)&doutput_array, sizeof(double2)*3);
1253  cudaMalloc((void **)&devolution_type, sizeof(int)*3);
1254 
1255  cudaMemcpy(dinput_array, input_array, sizeof(double)*3,
1256  cudaMemcpyHostToDevice);
1257  cudaMemcpy(devolution_type, evolution_type, sizeof(int)*3,
1258  cudaMemcpyHostToDevice);
1259 
1260  dim3 threads = {1,1,1};
1261  dim3 grid = {1,1,1};
1262 
1263  make_complex_kernel<<<1,1>>>(dinput_array, devolution_type,
1264  doutput_array);
1265  cudaDeviceSynchronize();
1266 
1267  cudaMemcpy(output_array, doutput_array, sizeof(double2)*3,
1268  cudaMemcpyDeviceToHost);
1269 
1270  bool pass = true;
1271  double thresh = 0.000001;
1272 
1273  if (abs(output_array[0].x - input_array[0]) > thresh ||
1274  (output_array[0].y) > thresh){
1275  std::cout << "failed 1\n";
1276  pass = false;
1277  }
1278  if (abs(output_array[1].x - exp(-input_array[1])) > thresh ||
1279  abs(output_array[1].y) > thresh){
1280  std::cout << "failed 2\n";
1281  pass = false;
1282  }
1283  if (abs(output_array[2].x - cos(-input_array[2])) > thresh ||
1284  abs(output_array[2].y - sin(-input_array[2])) > thresh){
1285  std::cout << "failed 3\n";
1286  pass = false;
1287  }
1288 
1289  if(pass){
1290  std::cout << "make_complex test passed!\n";
1291  }
1292  else{
1293  std::cout << "make_complex test failed!\n";
1294  exit(1);
1295  }
1296 
1297 }
1298 
1299 void cMultPhi_test(){
1300  // first, we are creating a double2 array to work with
1301  int n = 32;
1302  double2 *in1, *out;
1303  double *in2;
1304  double2 *din1, *dout;
1305  double *din2;
1306 
1307  in1 = (double2 *)malloc(sizeof(double2)*n);
1308  in2 = (double *)malloc(sizeof(double)*n);
1309  out = (double2 *)malloc(sizeof(double2)*n);
1310 
1311  cudaMalloc((void **)&din1, sizeof(double2)*n);
1312  cudaMalloc((void **)&din2, sizeof(double)*n);
1313  cudaMalloc((void **)&dout, sizeof(double2)*n);
1314 
1315  for (int i = 0; i < n; ++i){
1316  in1[i].x = i;
1317  in1[i].y = n-i;
1318  in2[i] = n-i;
1319  }
1320 
1321  cudaMemcpy(din1, in1, sizeof(double2)*n, cudaMemcpyHostToDevice);
1322  cudaMemcpy(din2, in2, sizeof(double)*n, cudaMemcpyHostToDevice);
1323 
1324  cMultPhi<<<1,n>>>(din1, din2, dout);
1325  cudaDeviceSynchronize();
1326 
1327  cudaMemcpy(out, dout, sizeof(double2)*n, cudaMemcpyDeviceToHost);
1328 
1329  double thresh = 0.000001;
1330  bool result = true;
1331  for (int i = 0; i < n; ++i){
1332  if (abs(out[i].x-cos(in2[i])*in1[i].x-in1[i].y*sin(in2[i])) < thresh ||
1333  abs(out[i].y-in1[i].x*sin(in2[i])+in1[i].y*cos(in2[i])) < thresh){
1334  result = false;
1335  }
1336  }
1337 
1338  if (result){
1339  std::cout << "cMultPhi test passed!\n";
1340  }
1341  else{
1342  std::cout << "cMultPhi test failed!\n";
1343  exit(1);
1344  }
1345 
1346 }
1347