cudapi.cu 1.83 KB
Newer Older
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
1
2
3
4
5
6
7
8
//
// cudapi.cu
// Neil Gershenfeld 3/1/20
// calculation of pi by a CUDA sum
// pi = 3.14159265358979323846 
//
#include <iostream>
#include <chrono>
Neil Gershenfeld's avatar
wip    
Neil Gershenfeld committed
9
#include <string>
Neil Gershenfeld's avatar
Neil Gershenfeld committed
10
using namespace std;
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
11
12
13
14
uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t nloop = 1000000;
uint64_t npts = blocks*threads;
Neil Gershenfeld's avatar
wip    
Neil Gershenfeld committed
15
void cudaCheck(string msg) {
Neil Gershenfeld's avatar
wip    
Neil Gershenfeld committed
16
17
18
19
20
   cudaError err;
   err = cudaGetLastError();
   if (cudaSuccess != err)
   cerr << msg << ": " << cudaGetErrorString(err) << endl;
   }
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
21
22
23
24
25
26
27
28
29
30
31
32
33
__global__ void init(double *arr,uint64_t nloop) {
   uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
   uint64_t start = nloop*i+1;
   uint64_t end = nloop*(i+1)+1;
   arr[i] = 0;
   for (uint64_t j = start; j < end; ++j)
      arr[i] += 0.5/((j-0.75)*(j-0.25));
   }
__global__ void reduce_sum(double *arr,uint64_t len) {
   uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
   if (i < len)
      arr[i] += arr[i+len];
   }
Neil Gershenfeld's avatar
Neil Gershenfeld committed
34
35
36
37
void reduce(double *arr) {
   uint64_t len = npts >> 1;
   while (1) {
      reduce_sum<<<blocks,threads>>>(arr,len);
Neil Gershenfeld's avatar
wip    
Neil Gershenfeld committed
38
      cudaCheck("reduce_sum");
Neil Gershenfeld's avatar
Neil Gershenfeld committed
39
40
41
42
43
      len = len >> 1;
      if (len == 0)
         return;
      }
   }
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
44
45
46
int main(void) {
   double harr[1],*darr;
   cudaMalloc(&darr,npts*sizeof(double));
Neil Gershenfeld's avatar
Neil Gershenfeld committed
47
   cudaCheck("cudaMalloc");
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
48
49
   auto tstart = std::chrono::high_resolution_clock::now();        
   init<<<blocks,threads>>>(darr,nloop);
Neil Gershenfeld's avatar
Neil Gershenfeld committed
50
   cudaCheck("init");
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
51
52
   reduce(darr);
   cudaDeviceSynchronize();
Neil Gershenfeld's avatar
wip    
Neil Gershenfeld committed
53
   cudaCheck("cudaDeviceSynchronize");
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
54
55
56
57
   auto tend = std::chrono::high_resolution_clock::now();        
	auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
   auto mflops = npts*nloop*5.0/dt;
   cudaMemcpy(harr,darr,8,cudaMemcpyDeviceToHost);
Neil Gershenfeld's avatar
wip    
Neil Gershenfeld committed
58
   cudaCheck("cudaMemcpy");
Neil Gershenfeld's avatar
wip  
Neil Gershenfeld committed
59
60
61
62
63
   printf("npts = %ld, nloop = %ld, pi = %lf\n",npts,nloop,harr[0]);
   printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
   cudaFree(darr);
   return 0;
   }