From 9867b41d7718404764afb23792d2ea34bf6989b4 Mon Sep 17 00:00:00 2001
From: Neil Gershenfeld <gersh@cba.mit.edu>
Date: Tue, 14 Jul 2020 19:19:58 +0000
Subject: [PATCH] CPU reduce

---
 CUDA/cudapic.cu | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100755 CUDA/cudapic.cu

diff --git a/CUDA/cudapic.cu b/CUDA/cudapic.cu
new file mode 100755
index 0000000..5ce51f6
--- /dev/null
+++ b/CUDA/cudapic.cu
@@ -0,0 +1,40 @@
+//
+// cudapic.cu
+// Neil Gershenfeld 7/14/20
+// calculation of pi by a CUDA sum with CPU reduction
+// pi = 3.14159265358979323846 
+//
+#include <iostream>
+#include <chrono>
+#include <cstdint>
+uint64_t blocks{1024}; // number of thread blocks in the kernel launch
+uint64_t threads{1024}; // number of threads per block in the kernel launch
+uint64_t nloop{1000000}; // number of series terms summed by each GPU thread
+uint64_t npts{blocks*threads}; // total number of threads = number of partial sums
+__global__ void init(double *arr,uint64_t nloop) { // thread i stores the sum of its nloop series terms in arr[i] (no bounds check)
+   uint64_t i = (uint64_t)blockIdx.x*blockDim.x+threadIdx.x; // widen first: the 32-bit product can wrap on very large grids
+   uint64_t start = nloop*i+1,end = start+nloop; // this thread's terms are j = start .. end-1 (terms are numbered from 1)
+   double sum = 0; // local accumulator: arr[i] is written once instead of once per term
+   for (uint64_t j = start; j < end; ++j)
+      sum += 0.5/((j-0.75)*(j-0.25));
+   arr[i] = sum;
+   }
+int main(void) { // run the partial-sum kernel, add the partial sums on the CPU, print pi and timing; returns 1 on a CUDA error
+   double *arr = new double[npts],*darr = nullptr; // host copy and device buffer, one partial sum per GPU thread
+   cudaError_t err = cudaMalloc(&darr,npts*sizeof(double));
+   auto tstart = std::chrono::high_resolution_clock::now();
+   if (err == cudaSuccess) { init<<<blocks,threads>>>(darr,nloop); err = cudaGetLastError(); } // a launch returns no status itself
+   if (err == cudaSuccess) err = cudaDeviceSynchronize(); // wait for the kernel so the timing covers it
+   if (err == cudaSuccess) err = cudaMemcpy(arr,darr,npts*sizeof(double),cudaMemcpyDeviceToHost);
+   if (err != cudaSuccess) { std::cerr << "CUDA error: " << cudaGetErrorString(err) << '\n'; cudaFree(darr); delete[] arr; return 1; }
+   double pi = 0; // double, not float: a float accumulator throws away the precision of the double partial sums
+   for (uint64_t i = 0; i < npts; ++i) // index type matches npts (no signed/unsigned comparison)
+      pi += arr[i];
+   auto tend = std::chrono::high_resolution_clock::now();
+   auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count(); // elapsed microseconds
+   std::cout << std::fixed << "npts = " << npts << ", nloop = " << nloop << ", pi = " << pi << '\n'; // fixed = 6 decimals, as %f
+   std::cout << "time = " << 1e-6*dt << ", estimated MFlops = " << npts*nloop*5.0/dt << '\n'; // 5.0 = flops counted per series term
+   cudaFree(darr);
+   delete[] arr; // the host buffer was never released before
+   return 0;
+   }
-- 
GitLab