//
// cudapic.cu
// Neil Gershenfeld 7/14/20
// calculation of pi by a CUDA sum with CPU reduction
// pi = 3.14159265358979323846
//
// Provenance: added as CUDA/cudapic.cu by commit
// 9867b41d7718404764afb23792d2ea34bf6989b4 ("CPU reduce"),
// Neil Gershenfeld <gersh@cba.mit.edu>, Tue, 14 Jul 2020 19:19:58 +0000.
//
// Method: pi = sum_{j>=1} 0.5/((j-0.75)*(j-0.25)). Each GPU thread sums
// `nloop` consecutive terms into one slot of a device array; the host then
// copies the array back and adds the slots together.
//
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// Evaluate a CUDA runtime call and terminate with a readable diagnostic if it
// did not return cudaSuccess. Without this, a failed allocation or launch
// would go unnoticed and the program would print a meaningless result.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckErr_ = (call);                                   \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                         __LINE__, cudaGetErrorString(cudaCheckErr_));        \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

uint64_t blocks = 1024;           // number of thread blocks in the launch
uint64_t threads = 1024;          // threads per block
uint64_t nloop = 1000000;         // series terms summed by each thread
uint64_t npts = blocks*threads;   // total threads == number of partial sums

// Sum the series terms 0.5/((j-0.75)*(j-0.25)) for j in [start, end).
// Returns 0 when the range is empty. Usable from both host and device.
__host__ __device__ inline double seriesPartialSum(uint64_t start, uint64_t end) {
    double sum = 0.0;
    for (uint64_t j = start; j < end; ++j)
        sum += 0.5/((j-0.75)*(j-0.25));
    return sum;
}

// Kernel: thread i writes the sum of terms nloop*i+1 .. nloop*(i+1) to arr[i].
//
// arr   - device array with one element per launched thread
// nloop - number of consecutive series terms each thread sums
//
// Expects a 1D launch. There is no bounds guard, so `arr` must hold at least
// gridDim.x*blockDim.x doubles.
__global__ void init(double *arr, uint64_t nloop) {
    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    const uint64_t i = static_cast<uint64_t>(blockIdx.x)*blockDim.x + threadIdx.x;
    // Accumulate locally and store once instead of updating arr[i] on every
    // iteration of the inner loop.
    arr[i] = seriesPartialSum(nloop*i+1, nloop*(i+1)+1);
}

// Add up `count` partial sums on the host in double precision.
// Returns 0 when `count` is 0.
inline double reducePartials(const double *partials, uint64_t count) {
    double total = 0.0;
    for (uint64_t k = 0; k < count; ++k)
        total += partials[k];
    return total;
}

// Runs the kernel, reduces on the CPU, and prints the estimate of pi together
// with the elapsed time and an estimated MFlops figure. The timed region
// covers the kernel, the device-to-host copy and the CPU reduction.
int main(void) {
    std::vector<double> arr(npts);   // host copy of the partial sums (freed automatically)
    double *darr = nullptr;          // device partial sums
    CUDA_CHECK(cudaMalloc(&darr, npts*sizeof(double)));

    auto tstart = std::chrono::high_resolution_clock::now();
    init<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(darr, nloop);
    CUDA_CHECK(cudaGetLastError());          // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());     // errors raised while the kernel ran
    CUDA_CHECK(cudaMemcpy(arr.data(), darr, npts*sizeof(double), cudaMemcpyDeviceToHost));
    // Reduce in double: a float accumulator would throw away the precision
    // of the partial sums computed on the device.
    const double pi = reducePartials(arr.data(), npts);
    auto tend = std::chrono::high_resolution_clock::now();

    auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
    // 5 floating-point operations are counted per term; operations per
    // microsecond equals MFlops.
    auto mflops = npts*nloop*5.0/dt;
    // uint64_t is printed through unsigned long long so the format specifier
    // matches the argument type on every platform.
    printf("npts = %llu, nloop = %llu, pi = %lf\n",
           static_cast<unsigned long long>(npts),
           static_cast<unsigned long long>(nloop), pi);
    printf("time = %f, estimated MFlops = %f\n", 1e-6*dt, mflops);

    CUDA_CHECK(cudaFree(darr));
    return 0;
}