From ad3a74d770bbfaae59794aeeb595d40c2e317f1e Mon Sep 17 00:00:00 2001
From: Thrasyvoulos Karydis <thrasyvoulos.karydis@cba.mit.edu>
Date: Mon, 26 Nov 2018 13:40:29 -0500
Subject: [PATCH] Add CUDA pi implementation from NMM class

---
 CUDA/pi.cu | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 CUDA/pi.cu

diff --git a/CUDA/pi.cu b/CUDA/pi.cu
new file mode 100644
index 0000000..a30a3d6
--- /dev/null
+++ b/CUDA/pi.cu
@@ -0,0 +1,73 @@
+/*
+ * pi.cu
+ * Thras Karydis  2/4/2017
+ * Use CUDA to evaluate pi by summation
+ */
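+
+// The series evaluated here is sum_{i=1..NPTS} 0.5/((i-0.75)*(i-0.25)); each term
+// equals 4*(1/(4i-3) - 1/(4i-1)), one pair of the Leibniz series
+// 4*(1 - 1/3 + 1/5 - ...), so the sum converges to pi.
+// Build, e.g.: nvcc pi.cu -o pi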
+
+#include <stdio.h>
+#include <cuda.h>
+#include <sys/time.h>
+
+#define NLOOP 1
+#define NPTS 1000000000  // Number of points (one billion)
+#define NUM_THREAD  256  // Number of threads per block (max is 1024)
+#define NUM_BLOCK  ((NPTS + NUM_THREAD - 1) / NUM_THREAD)  // Number of blocks per grid (rounded up)
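+// NOTE: the term array holds one float per point, about 4 GB on both host and device at NPTS = 1e9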
+
+int tid;
+double pi = 0;
+
+// Kernel that executes on the GPU: each thread evaluates one term of the series
+__global__ void compute_pi(float *sum) {
+        unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;  // Sequential thread index across the blocks
+        if (idx < NPTS)
+                sum[idx] = 0.5/((idx-0.75)*(idx-0.25));
+}
+
+// Main routine that executes on the host
+int main(void) {
+
+        unsigned long int start_time, end_time;  // timestamps in microseconds
+        struct timeval start, end;
+        double flops;
+
+        int blocksPerGrid = NUM_BLOCK;  // Number of blocks needed to cover NPTS points
+        float *sumHost, *sumDev;  // Pointers to host & device arrays
+
+        size_t size = blocksPerGrid*NUM_THREAD*sizeof(float);  // Array memory size, padded to a whole number of blocks
+        sumHost = (float *)malloc(size);  // Allocate array on host
+        cudaMalloc((void **) &sumDev, size);  // Allocate array on device
+        // Zero the device array so the padded entries (idx >= NPTS) contribute nothing
+        cudaMemset(sumDev, 0, size);
+
+        gettimeofday(&start, NULL);
+
+        // Do calculation on device
+        compute_pi<<<blocksPerGrid, NUM_THREAD>>>(sumDev);  // Launch the CUDA kernel
+        // Retrieve result from device (cudaMemcpy waits for the kernel to finish)
+        cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost);
+        // Reduce on the host; start at 1 because the idx = 0 entry is not a term of the series
+        for (tid = 1; tid < NPTS; tid++)
+                pi += sumHost[tid];
+
+        gettimeofday(&end, NULL);
+
+        // Print results
+        start_time = start.tv_sec * 1e6 + start.tv_usec;
+        end_time = end.tv_sec * 1e6 + end.tv_usec;
+        flops = NLOOP*5.0*NPTS/(end_time-start_time)/1.0e3;  // 5 flops per term; flops/usec divided by 1e3 gives GFlops
+        printf("threads = %d: %d blocks and %d threads per block\n", blocksPerGrid*NUM_THREAD, blocksPerGrid, NUM_THREAD);
+        printf("NPTS = %d, NLOOP = %d, pi = %f\n", NPTS, NLOOP, pi/NLOOP);
+        printf("time = %f, estimated GFlops = %f\n", (end_time-start_time)/1.0e6, flops);
+
+        // Cleanup
+        free(sumHost);
+        cudaFree(sumDev);
+
+        return 0;
+}
\ No newline at end of file
-- 
GitLab