diff --git a/Python/numbapig.py b/Python/numbapig.py
index a89425c9760ad1bcef24f8665aab09390a4c0118..7f68a257951173dbe807385795a0fd1bfcc94eb1 100644
--- a/Python/numbapig.py
+++ b/Python/numbapig.py
@@ -1,7 +1,7 @@
 #
 # numbapig.py
 # Neil Gershenfeld 2/9/20
-# calculation of pi by a Numba CUDA sum
+# calculation of pi by a Numba GPU sum
 # pi = 3.14159265358979323846 
 #
 from numba import cuda
@@ -20,6 +20,7 @@ NPTS = grid_size*block_size
 def init(arr):
     i = 1+cuda.grid(1)
     arr[i-1] = 0.5/((i-0.75)*(i-0.25))
+    #arr[i-1] = i # for testing reduction: fills arr with 1..NPTS instead of the series terms
 #
 @cuda.reduce
 def Numba_reduce(a,b):
@@ -39,15 +40,25 @@ def CUDA_reduce(arr,NPTS):
       if (len == 0):
          return
 #
+@cuda.jit
+def CUDA_result(arr,result):
+    # thread 0 alone stores arr[0] in result, so the host copies back one value
+    if cuda.grid(1) == 0:
+        result[0] = arr[0]
+#
 # device array
 #
 arr = cuda.device_array(NPTS,np.float32)
+result = cuda.device_array(1,np.float32) # one-element buffer for the reduced value
+#arr = cuda.device_array(NPTS,np.int64) # for testing reduction
+#result = cuda.device_array(1,np.int64) # for testing reduction
 #
-# compile kernels
+# compile kernels by calling them
 #
 init[grid_size,block_size](arr)
 pi = Numba_reduce(arr)
 CUDA_reduce(arr,NPTS)
+CUDA_result[1,1](arr,result) # explicit 1x1 launch; only thread 0 writes
 #
 # CUDA kernel array calculation
 #
@@ -94,9 +105,11 @@ print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
 start_time = time.time()
 init[grid_size,block_size](arr)
 CUDA_reduce(arr,NPTS)
+CUDA_result[1,1](arr,result) # explicit 1x1 launch configuration
 end_time = time.time()
-darr = arr.copy_to_host()
+pi = result.copy_to_host() # copy back only the one-element result
 mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
 print("both with CUDA kernel reduction:")
-print("   NPTS = %d, pi = %f"%(NPTS,darr[0]))
+print("   NPTS = %d, pi = %f"%(NPTS,pi[0]))
 print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
+
diff --git a/README.md b/README.md
index f44456e47fca9c224bf90556d985787378f7a9f9..c2400e3b4fe8998ca4d011a12f548090d4b82019 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@
 |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
 |2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
 |2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
+|1,919|[numbapig.py](Python/numbapig.py)|Python, Numba, GPU<br>5120 cores|NVIDIA V100|Feb 9, 2020|
 |315|[numbapip.py](Python/numbapip.py)|Python, Numba, parallel, fastmath<br>96 cores|Intel 2x Xeon Platinum 8175M|Feb 7, 2020|
 |272|[threadpi.c](C/threadpi.c)|C, 96 threads<br>gcc threadpi.c -o threadpi -O3 -ffast-math -pthread|Intel 2x Xeon Platinum 8175M|Jun 3, 2019|
 |211|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 1 node, 96 cores<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|