From ad3a74d770bbfaae59794aeeb595d40c2e317f1e Mon Sep 17 00:00:00 2001 From: Thrasyvoulos Karydis <thrasyvoulos.karydis@cba.mit.edu> Date: Mon, 26 Nov 2018 13:40:29 -0500 Subject: [PATCH] Add CUDA pi implementation from NMM class --- CUDA/pi.cu | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 CUDA/pi.cu diff --git a/CUDA/pi.cu b/CUDA/pi.cu new file mode 100644 index 0000000..a30a3d6 --- /dev/null +++ b/CUDA/pi.cu @@ -0,0 +1,68 @@ +/* +* pi.cu +* Thras Karydis 2/4/2017 +* use CUDA to evaluate pi by summation +*/ + +#include <stdio.h> +#include <cuda.h> +#include <sys/time.h> + +#define NLOOP 1 +#define NPTS 1000000000 // Number of points (one billion) +#define NUM_THREAD 256 // Number of threads per block (max is 1024) +#define NUM_BLOCK (NPTS + NUM_THREAD - 1) / NUM_THREAD // Number of blocks per grid + +int tid; +double pi = 0; + +// Kernel that executes on the GPU +__global__ void compute_pi(float *sum) { + + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; // Sequential thread index across the blocks + if (idx < NPTS) + sum[idx] = 0.5/((idx-0.75)*(idx-0.25)); +} + +// Main routine that executes on the host +int main(void) { + + unsigned long int start_time,end_time; + struct timeval start,end; + double flops; + + int blocksPerGrid = (NPTS + NUM_THREAD - 1) / NUM_THREAD; // Compute number of blocks needed + float *sumHost, *sumDev; // Pointer to host & device arrays + + size_t size = blocksPerGrid*NUM_THREAD*sizeof(float); //Array memory size + sumHost = (float *)malloc(size); // Allocate array on host + cudaMalloc((void **) &sumDev, size); // Allocate array on device + // Initialize array in device to 0 + cudaMemset(sumDev, 0, size); + + gettimeofday(&start, NULL); + + // Do calculation on device + compute_pi <<<blocksPerGrid, NUM_THREAD>>> (sumDev); // call CUDA kernel + // Retrieve result from device and store it in host array + cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost); + // Reduce + for(tid=1; tid<NPTS; tid++) + pi += sumHost[tid]; + + gettimeofday(&end, NULL); + + // Print results + start_time = start.tv_sec * 1e6 + start.tv_usec; + end_time = end.tv_sec * 1e6 + end.tv_usec; + flops = NLOOP*(NPTS*(5.0/(end_time-start_time))); + printf("processes = %d: %d blocks and %d threads per block\n", blocksPerGrid*NUM_THREAD, blocksPerGrid, NUM_THREAD); + printf("NPTS = %d, NLOOP = %d, pi = %f\n", NPTS, NLOOP, pi/NLOOP); + printf("time = %f, estimated GFlops = %f\n",(end_time-start_time)/1.0e9,flops); + + // Cleanup + free(sumHost); + cudaFree(sumDev); + + return 0; +} \ No newline at end of file -- GitLab