From e300c31016b9f077ec73d0b828e696be85dd8380 Mon Sep 17 00:00:00 2001 From: Neil Gershenfeld <gersh@cba.mit.edu> Date: Sun, 9 Feb 2020 20:00:01 -0500 Subject: [PATCH] wip --- Python/numbapig.py | 21 +++++++++++++++++---- README.md | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/Python/numbapig.py b/Python/numbapig.py index a89425c..7f68a25 100644 --- a/Python/numbapig.py +++ b/Python/numbapig.py @@ -1,7 +1,7 @@ # # numbapig.py # Neil Gershenfeld 2/9/20 -# calculation of pi by a Numba CUDA sum +# calculation of pi by a Numba GPU sum # pi = 3.14159265358979323846 # from numba import cuda @@ -20,6 +20,7 @@ NPTS = grid_size*block_size def init(arr): i = 1+cuda.grid(1) arr[i-1] = 0.5/((i-0.75)*(i-0.25)) + #arr[i-1] = i # for testing reduction # @cuda.reduce def Numba_reduce(a,b): @@ -39,15 +40,25 @@ def CUDA_reduce(arr,NPTS): if (len == 0): return # +@cuda.jit +def CUDA_result(arr,result): + i = cuda.grid(1) + if (i == 0): + result[0] = arr[0] +# # device array # arr = cuda.device_array(NPTS,np.float32) +result = cuda.device_array(1,np.float32) +#arr = cuda.device_array(NPTS,np.int64) # for testing reduction +#result = cuda.device_array(1,np.int64) # for testing reduction # -# compile kernels +# compile kernels by calling them # init[grid_size,block_size](arr) pi = Numba_reduce(arr) CUDA_reduce(arr,NPTS) +CUDA_result(arr,result) # # CUDA kernel array calculation # @@ -94,9 +105,11 @@ print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops)) start_time = time.time() init[grid_size,block_size](arr) CUDA_reduce(arr,NPTS) +CUDA_result(arr,result) end_time = time.time() -darr = arr.copy_to_host() +pi = result.copy_to_host() mflops = NPTS*5.0/(1.0e6*(end_time-start_time)) print("both with CUDA kernel reduction:") -print(" NPTS = %d, pi = %f"%(NPTS,darr[0])) +print(" NPTS = %d, pi = %f"%(NPTS,pi[0])) print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops)) + diff --git a/README.md b/README.md index f44456e..c2400e3 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019| |2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019| |2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020| +|1,919|[numbapig.py](Python/numbapig.py)|Python, Numba, GPU<br>5120 cores|NVIDIA V100|Feb 9, 2020| |315|[numbapip.py](Python/numbapip.py)|Python, Numba, parallel, fastmath<br>96 cores|Intel 2x Xeon Platinum 8175M|Feb 7, 2020| |272|[threadpi.c](C/threadpi.c)|C, 96 threads<br>gcc threadpi.c -o threadpi -O3 -ffast-math -pthread|Intel 2x Xeon Platinum 8175M|Jun 3, 2019| |211|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 1 node, 96 cores<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019| -- GitLab