Skip to content
Snippets Groups Projects
Commit 9867b41d authored by Neil Gershenfeld
Browse files

CPU reduce

parent 818dcf9c
No related branches found
No related tags found
No related merge requests found
Pipeline #5658 passed
//
// cudapic.cu
// Neil Gershenfeld 7/14/20
// calculation of pi by a CUDA sum with CPU reduction
// pi = 3.14159265358979323846
//
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
// Problem size for the pi summation; main() passes blocks/threads as the
// launch configuration and nloop as the kernel argument.
uint64_t blocks{1024};            // number of thread blocks in the launch
uint64_t threads{1024};           // number of threads per block
uint64_t nloop{1000000};          // series terms summed by each thread
uint64_t npts{blocks * threads};  // total threads == number of partial sums
//
// init: each thread sums its own contiguous slice of the series
//    sum_j 0.5/((j-0.75)*(j-0.25))
// and stores the partial sum in arr[i], where i is the thread's flat
// 1D global index (only the .x launch dimensions are used).
//
// arr   - device array receiving one partial sum per thread; there is no
//         bounds check, so it must hold at least gridDim.x*blockDim.x doubles
// nloop - number of consecutive terms summed by each thread; thread i
//         covers j in [nloop*i+1, nloop*(i+1)+1)
//
__global__ void init(double *arr,uint64_t nloop) {
  // Widen before multiplying: blockIdx.x*blockDim.x is a 32-bit unsigned
  // product and would silently wrap on very large grids.
  const uint64_t i = static_cast<uint64_t>(blockIdx.x)*blockDim.x+threadIdx.x;
  const uint64_t start = nloop*i+1;
  const uint64_t end = nloop*(i+1)+1;
  // Accumulate in a local and store once, rather than doing a
  // read-modify-write on global memory every iteration; the order of the
  // additions is unchanged, so the result is the same.
  double sum = 0;
  for (uint64_t j = start; j < end; ++j)
    sum += 0.5/((j-0.75)*(j-0.25));
  arr[i] = sum;
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err,const char *what) {
  if (err == cudaSuccess)
    return true;
  fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(err));
  return false;
}

//
// main: launches init over blocks x threads, copies the npts partial sums
// back to the host, reduces them on the CPU, and prints the result together
// with the elapsed time and an MFlops estimate (5 flops per series term).
// Returns 0 on success, 1 if any CUDA call fails.
//
int main(void) {
  double *arr = new double[npts];
  double *darr = nullptr;
  if (!cudaOk(cudaMalloc(&darr,npts*sizeof(double)),"cudaMalloc")) {
    delete[] arr;
    return 1;
  }
  // The timed region covers the kernel, the device-to-host copy and the
  // CPU reduction.
  auto tstart = std::chrono::high_resolution_clock::now();
  init<<<blocks,threads>>>(darr,nloop);
  // Launch-configuration errors only surface through cudaGetLastError();
  // in-kernel faults surface at the following synchronization.
  bool ok = cudaOk(cudaGetLastError(),"init launch")
    && cudaOk(cudaDeviceSynchronize(),"init execution")
    && cudaOk(cudaMemcpy(arr,darr,npts*sizeof(double),cudaMemcpyDeviceToHost),"cudaMemcpy");
  if (ok) {
    // Reduce in double: a float accumulator would discard the precision of
    // the double partial sums.
    double pi = 0;
    for (uint64_t i = 0; i < npts; ++i)
      pi += arr[i];
    auto tend = std::chrono::high_resolution_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
    auto mflops = npts*nloop*5.0/dt;
    // uint64_t is not necessarily long; cast to match the %llu specifier.
    printf("npts = %llu, nloop = %llu, pi = %lf\n",
      static_cast<unsigned long long>(npts),
      static_cast<unsigned long long>(nloop),pi);
    printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
  }
  cudaFree(darr);
  delete[] arr;
  return ok ? 0 : 1;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment