CPU reduce

9867b41d · Neil Gershenfeld · 818dcf9c · 9867b41d
Commit 9867b41d authored 4 years ago by Neil Gershenfeld
--- a/CUDA/cudapic.cu
+++ b/CUDA/cudapic.cu
+//
+// cudapic.cu
+// Neil Gershenfeld 7/14/20
+// calculation of pi by a CUDA sum with CPU reduction
+// pi = 3.14159265358979323846 
+//
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+// launch configuration and problem size (file-scope, read by main)
+uint64_t blocks{1024};          // thread blocks in the grid
+uint64_t threads{1024};         // threads per block
+uint64_t nloop{1000000};        // series terms summed by each thread
+uint64_t npts{blocks*threads};  // total threads = number of partial sums
+//
+// init: per-thread partial sums of the series used for the pi calculation
+//    term(j) = 0.5/((j-0.75)*(j-0.25)), j = 1,2,3,...
+// thread i (global 1D index) sums the nloop consecutive terms
+//    j = nloop*i+1 ... nloop*(i+1)
+// and writes the result to arr[i]
+//
+// arr:   output array; there is no bounds check, so it must hold at least
+//        gridDim.x*blockDim.x doubles
+// nloop: number of series terms summed by each thread
+//
+__global__ void init(double *arr,uint64_t nloop) {
+   // widen before multiplying: blockIdx.x*blockDim.x is otherwise a 32-bit
+   // product that wraps silently for large grids
+   uint64_t i = (uint64_t)blockIdx.x*blockDim.x+threadIdx.x;
+   uint64_t start = nloop*i+1;
+   uint64_t end = nloop*(i+1)+1;
+   // accumulate in a local and store once, rather than updating arr[i]
+   // on every iteration; the order of additions is unchanged
+   double sum = 0;
+   for (uint64_t j = start; j < end; ++j)
+      sum += 0.5/((j-0.75)*(j-0.25));
+   arr[i] = sum;
+   }
+// checkCuda: print a diagnostic and exit if a CUDA runtime call failed
+static void checkCuda(cudaError_t status,const char *what) {
+   if (status != cudaSuccess) {
+      std::fprintf(stderr,"CUDA error (%s): %s\n",what,cudaGetErrorString(status));
+      std::exit(EXIT_FAILURE);
+      }
+   }
+//
+// main: launch init to fill npts partial sums on the device, copy them
+// to the host, add them up on the CPU, and print the result
+// the timed interval covers the kernel, the device-to-host copy,
+// and the host-side sum
+//
+int main(void) {
+   double *arr,*darr;
+   arr = new double[npts];
+   checkCuda(cudaMalloc(&darr,npts*sizeof(double)),"cudaMalloc");
+   auto tstart = std::chrono::high_resolution_clock::now();
+   init<<<blocks,threads>>>(darr,nloop);
+   // a bad launch configuration is only reported through cudaGetLastError;
+   // faults during execution surface at the synchronize
+   checkCuda(cudaGetLastError(),"init launch");
+   checkCuda(cudaDeviceSynchronize(),"init execution");
+   checkCuda(cudaMemcpy(arr,darr,npts*sizeof(double),cudaMemcpyDeviceToHost),"cudaMemcpy");
+   // accumulate in double: the partial sums are doubles, and a float
+   // accumulator discards most of their precision
+   double pi = 0;
+   for (uint64_t i = 0; i < npts; ++i)
+      pi += arr[i];
+   auto tend = std::chrono::high_resolution_clock::now();
+   auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
+   // estimate counts 5 floating-point operations per series term;
+   // operations per microsecond = MFlops
+   auto mflops = npts*nloop*5.0/dt;
+   // cast so the format matches the argument type on every platform
+   printf("npts = %llu, nloop = %llu, pi = %lf\n",
+      (unsigned long long)npts,(unsigned long long)nloop,pi);
+   printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
+   checkCuda(cudaFree(darr),"cudaFree");
+   delete[] arr;
+   return 0;
+   }