Commit 9867b41d authored by Neil Gershenfeld

CPU reduce

parent 818dcf9c
Pipeline #5658 passed with stage
in 1 second
//
// cudapic.cu
// Neil Gershenfeld 7/14/20
// calculation of pi by a CUDA sum with CPU reduction
// pi = 3.14159265358979323846
//
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
// Host-side run parameters.
// Number of thread blocks used for the kernel launch.
uint64_t blocks = 1024;
// Number of threads per block used for the kernel launch.
uint64_t threads = 1024;
// Number of series terms summed by each GPU thread.
uint64_t nloop = 1000000;
// Total number of GPU threads, which is also the number of partial sums
// (one array element per thread) copied back and reduced on the CPU.
uint64_t npts = blocks*threads;
//
// init: each thread computes one partial sum of the series
//    pi = sum over j >= 1 of 0.5/((j-0.75)*(j-0.25))
// and writes it to arr[i], where i is the thread's flat global index.
// Thread i covers the nloop consecutive terms j = nloop*i+1 ... nloop*(i+1).
//
// arr   - device buffer receiving one double per launched thread
// nloop - number of series terms summed by each thread
//
// Launch layout: 1-D grid of 1-D blocks. The kernel has no bounds check,
// so arr must hold at least gridDim.x*blockDim.x doubles.
//
__global__ void init(double *arr,uint64_t nloop) {
    // widen before multiplying: blockIdx.x*blockDim.x would otherwise be
    // evaluated in 32-bit unsigned arithmetic and could wrap on large grids
    const uint64_t i = static_cast<uint64_t>(blockIdx.x)*blockDim.x+threadIdx.x;
    const uint64_t start = nloop*i+1;
    const uint64_t end = start+nloop;
    // accumulate locally and store once, instead of a global-memory
    // read-modify-write on every iteration
    double sum = 0;
    for (uint64_t j = start; j < end; ++j)
        sum += 0.5/((j-0.75)*(j-0.25));
    arr[i] = sum;
}
//
// checkCuda: print a diagnostic and exit if a CUDA runtime call failed.
// status - return code of the call being checked
// what   - short label for the call, included in the error message
//
static void checkCuda(cudaError_t status,const char *what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr,"CUDA error (%s): %s\n",what,cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}
//
// main: launch the init kernel to fill npts partial sums on the GPU, copy
// them to the host, add them up on the CPU, and print the result together
// with the elapsed time and an estimated MFlops figure. The timed region
// covers the kernel, the device-to-host copy, and the CPU reduction.
//
int main(void) {
    double *arr = new double[npts];
    double *darr = nullptr;
    checkCuda(cudaMalloc(&darr,npts*sizeof(double)),"cudaMalloc");
    auto tstart = std::chrono::high_resolution_clock::now();
    init<<<blocks,threads>>>(darr,nloop);
    // a bad launch configuration is only reported through cudaGetLastError();
    // faults during execution surface at the synchronizing call
    checkCuda(cudaGetLastError(),"init launch");
    checkCuda(cudaDeviceSynchronize(),"init execution");
    checkCuda(cudaMemcpy(arr,darr,npts*sizeof(double),cudaMemcpyDeviceToHost),"cudaMemcpy");
    // reduce in double: the partial sums are doubles, and a float
    // accumulator discards most of their precision
    double pi = 0;
    for (uint64_t i = 0; i < npts; ++i)
        pi += arr[i];
    auto tend = std::chrono::high_resolution_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
    // 5.0 is the per-term operation count assumed by the estimate
    auto mflops = npts*nloop*5.0/dt;
    // uint64_t is unsigned and not necessarily long: cast and print with %llu
    printf("npts = %llu, nloop = %llu, pi = %lf\n",
        static_cast<unsigned long long>(npts),
        static_cast<unsigned long long>(nloop),pi);
    printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
    checkCuda(cudaFree(darr),"cudaFree");
    delete[] arr;
    return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment