diff --git a/CUDA/cudapi.cu b/CUDA/cudapi.cu
new file mode 100755
index 0000000000000000000000000000000000000000..dd7ac6aa1fe118796836a6ae8302a1e79a694431
--- /dev/null
+++ b/CUDA/cudapi.cu
@@ -0,0 +1,51 @@
+//
+// cudapi.cu
+// Neil Gershenfeld 3/1/20
+// calculation of pi by a CUDA sum
+// pi = 3.14159265358979323846
+//
+#include <iostream>
+#include <chrono>
+#include <cstdint>
+uint64_t blocks = 1024;
+uint64_t threads = 1024;
+uint64_t nloop = 1000000;
+uint64_t npts = blocks*threads;
+__global__ void init(double *arr,uint64_t nloop) {
+   uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
+   uint64_t start = nloop*i+1;
+   uint64_t end = nloop*(i+1)+1;
+   arr[i] = 0;
+   for (uint64_t j = start; j < end; ++j)
+      arr[i] += 0.5/((j-0.75)*(j-0.25));
+   }
+__global__ void reduce_sum(double *arr,uint64_t len) {
+   uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
+   if (i < len)
+      arr[i] += arr[i+len];
+   }
+void reduce(double *arr) {
+   uint64_t len = npts >> 1;
+   while (1) {
+      reduce_sum<<<blocks,threads>>>(arr,len);
+      len = len >> 1;
+      if (len == 0)
+         return;
+      }
+   }
+int main(void) {
+   double harr[1],*darr;
+   cudaMalloc(&darr,npts*sizeof(double));
+   auto tstart = std::chrono::high_resolution_clock::now();
+   init<<<blocks,threads>>>(darr,nloop);
+   reduce(darr);
+   cudaDeviceSynchronize();
+   auto tend = std::chrono::high_resolution_clock::now();
+   auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
+   auto mflops = npts*nloop*5.0/dt;
+   cudaMemcpy(harr,darr,8,cudaMemcpyDeviceToHost);
+   printf("npts = %ld, nloop = %ld, pi = %lf\n",npts,nloop,harr[0]);
+   printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
+   cudaFree(darr);
+   return 0;
+   }
diff --git a/Python/numbapig.py b/Python/numbapig.py
index 71c8eb8a00e394e92b8bae309fadb6d222edfd49..433c50795c6713b9c83a25067bce492d0f27b452 100644
--- a/Python/numbapig.py
+++ b/Python/numbapig.py
@@ -1,133 +1,46 @@
 #
 # numbapig.py
-# Neil Gershenfeld 2/9/20
-# calculation of pi by a Numba GPU sum
+# Neil Gershenfeld 3/1/20
+# calculation of pi by a Numba CUDA sum
 # pi = 3.14159265358979323846
 #
 from numba import cuda
 import numpy as np
 import time
-#
-# problem size
-#
-block_size = 2**10
-grid_size = 2**21
-NPTS = grid_size*block_size
-#
-# kernels and functions
-#
+block_size = 1024
+grid_size = 1024
+nloop = 1000000
+npts = grid_size*block_size
 @cuda.jit
-def init(arr):
-    i = 1+cuda.grid(1)
-    arr[i-1] = 0.5/((i-0.75)*(i-0.25))
-    #arr[i-1] = i # for testing reduction
-#
-@cuda.reduce
-def Numba_reduce(a,b):
-    return a+b
-#
+def init(arr,nloop):
+    i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x;
+    start = nloop*i+1;
+    end = nloop*(i+1)+1;
+    arr[i] = 0;
+    for j in range(start,end):
+        arr[i] += 0.5/((j-0.75)*(j-0.25));
 @cuda.jit
-def CUDA_sum(arr,len):
-    i = cuda.grid(1)
-    if (i < len):
-        arr[i] += arr[i+len]
-#
-def CUDA_reduce(arr,NPTS):
-    len = NPTS >> 1
+def reduce_sum(arr,len):
+    i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x;
+    if (i < len):
+        arr[i] += arr[i+len]
+def reduce(arr,npts):
+    len = npts >> 1
     while (1):
-        CUDA_sum[grid_size,block_size](arr,len)
-        cuda.synchronize()
+        reduce_sum[grid_size,block_size](arr,len)
         len = len >> 1
         if (len == 0):
             return
-#
-@cuda.jit
-def CUDA_result(arr,result):
-    i = cuda.grid(1)
-    if (i == 0):
-        result[0] = arr[0]
-#
-# device array
-#
-arr = cuda.device_array(NPTS,np.float32)
-result = cuda.device_array(1,np.float32)
-#arr = cuda.device_array(NPTS,np.int64) # for testing reduction
-#result = cuda.device_array(1,np.int64) # for testing reduction
-#
-# compile kernels by calling them
-#
-init[grid_size,block_size](arr)
-pi = Numba_reduce(arr)
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-#
-# CUDA kernel array calculation
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-cuda.synchronize()
-end_time = time.time()
-mflops = NPTS*4.0/(1.0e6*(end_time-start_time))
-print("CUDA kernel array calculation:")
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# Numba reduce
-#
-init[grid_size,block_size](arr)
-start_time = time.time()
-pi = Numba_reduce(arr)
-end_time = time.time()
-mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
-print("Numba reduce:")
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with Numba reduce
-#
+darr = cuda.device_array(npts,np.float64)
+init[grid_size,block_size](darr,nloop) # compile kernel
+reduce(darr,npts) # compile kernel
 start_time = time.time()
-init[grid_size,block_size](arr)
+init[grid_size,block_size](darr,nloop)
+reduce(darr,npts)
 cuda.synchronize()
-pi = Numba_reduce(arr)
-end_time = time.time()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with Numba reduce:")
-print(" NPTS = %d, pi = %f"%(NPTS,pi))
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# CUDA kernel reduction
-#
-init[grid_size,block_size](arr)
-start_time = time.time()
-CUDA_reduce(arr,NPTS)
-end_time = time.time()
-mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
-print("CUDA kernel reduction:")
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with CUDA kernel reduction
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-cuda.synchronize()
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-cuda.synchronize()
-end_time = time.time()
-pi = result.copy_to_host()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with CUDA kernel reduction:")
-print(" NPTS = %d, pi = %f"%(NPTS,pi[0]))
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with CUDA kernel reduction and transfer
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-pi = result.copy_to_host()
 end_time = time.time()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with CUDA kernel reduction and transfer:")
-print(" NPTS = %d, pi = %f"%(NPTS,pi[0]))
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
+mflops = npts*nloop*5.0/(1.0e6*(end_time-start_time))
+harr = darr.copy_to_host()
+print("npts = %d, nloop = %d, pi = %f"%(npts,nloop,harr[0]))
+print("time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
diff --git a/README.md b/README.md
index f44456e47fca9c224bf90556d985787378f7a9f9..3bead95b930285626b824da0d68b892b5109afbe 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@
 |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
 |2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
 |2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
+|1,635|[cudapi.cu](CUDA/cudapi.cu)|C++, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
+|1,090|[numbapig.py](Python/numbapig.py)|Python, Numba, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
 |315|[numbapip.py](Python/numbapip.py)|Python, Numba, parallel, fastmath<br>96 cores|Intel 2x Xeon Platinum 8175M|Feb 7, 2020|
 |272|[threadpi.c](C/threadpi.c)|C, 96 threads<br>gcc threadpi.c -o threadpi -O3 -ffast-math -pthread|Intel 2x Xeon Platinum 8175M|Jun 3, 2019|
 |211|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 1 node, 96 cores<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|