diff --git a/CUDA/cudapi.cu b/CUDA/cudapi.cu
new file mode 100755
index 0000000000000000000000000000000000000000..dd7ac6aa1fe118796836a6ae8302a1e79a694431
--- /dev/null
+++ b/CUDA/cudapi.cu
@@ -0,0 +1,51 @@
+//
+// cudapi.cu
+// Neil Gershenfeld 3/1/20
+// calculation of pi by a CUDA sum
+// pi = 3.14159265358979323846 
+//
+#include <iostream>
+#include <chrono>
+#include <cstdint>
+uint64_t blocks = 1024;
+uint64_t threads = 1024;
+uint64_t nloop = 1000000;
+uint64_t npts = blocks*threads;
+__global__ void init(double *arr,uint64_t nloop) {
+   uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
+   uint64_t start = nloop*i+1;
+   uint64_t end = nloop*(i+1)+1;
+   arr[i] = 0;
+   for (uint64_t j = start; j < end; ++j)
+      arr[i] += 0.5/((j-0.75)*(j-0.25));
+   }
+__global__ void reduce_sum(double *arr,uint64_t len) {
+   uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
+   if (i < len)
+      arr[i] += arr[i+len];
+   }
+void reduce(double *arr) {
+   uint64_t len = npts >> 1;
+   while (1) {
+      reduce_sum<<<blocks,threads>>>(arr,len);
+      len = len >> 1;
+      if (len == 0)
+         return;
+      }
+   }
+int main(void) {
+   double harr[1],*darr;
+   cudaMalloc(&darr,npts*sizeof(double));
+   auto tstart = std::chrono::high_resolution_clock::now();        
+   init<<<blocks,threads>>>(darr,nloop);
+   reduce(darr);
+   cudaDeviceSynchronize();
+   auto tend = std::chrono::high_resolution_clock::now();        
+	auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
+   auto mflops = npts*nloop*5.0/dt;
+   cudaMemcpy(harr,darr,8,cudaMemcpyDeviceToHost);
+   printf("npts = %ld, nloop = %ld, pi = %lf\n",npts,nloop,harr[0]);
+   printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
+   cudaFree(darr);
+   return 0;
+   }
diff --git a/Python/numbapig.py b/Python/numbapig.py
index 71c8eb8a00e394e92b8bae309fadb6d222edfd49..433c50795c6713b9c83a25067bce492d0f27b452 100644
--- a/Python/numbapig.py
+++ b/Python/numbapig.py
@@ -1,133 +1,46 @@
 #
 # numbapig.py
-# Neil Gershenfeld 2/9/20
-# calculation of pi by a Numba GPU sum
+# Neil Gershenfeld 3/1/20
+# calculation of pi by a Numba CUDA sum
 # pi = 3.14159265358979323846 
 #
 from numba import cuda
 import numpy as np
 import time
-#
-# problem size
-#
-block_size = 2**10
-grid_size = 2**21
-NPTS = grid_size*block_size
-#
-# kernels and functions
-#
+block_size = 1024
+grid_size = 1024
+nloop = 1000000
+npts = grid_size*block_size
 @cuda.jit
-def init(arr):
-    i = 1+cuda.grid(1)
-    arr[i-1] = 0.5/((i-0.75)*(i-0.25))
-    #arr[i-1] = i # for testing reduction
-#
-@cuda.reduce
-def Numba_reduce(a,b):
-    return a+b
-#
+def init(arr,nloop):
+   i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x;
+   start = nloop*i+1;
+   end = nloop*(i+1)+1;
+   arr[i] = 0;
+   for j in range(start,end):
+      arr[i] += 0.5/((j-0.75)*(j-0.25));
 @cuda.jit
-def CUDA_sum(arr,len):
-    i = cuda.grid(1)
-    if (i < len):
-       arr[i] += arr[i+len]
-#
-def CUDA_reduce(arr,NPTS):
-   len = NPTS >> 1
+def reduce_sum(arr,len):
+   i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x;
+   if (i < len):
+      arr[i] += arr[i+len]
+def reduce(arr,npts):
+   len = npts >> 1
    while (1):
-      CUDA_sum[grid_size,block_size](arr,len)
-      cuda.synchronize()
+      reduce_sum[grid_size,block_size](arr,len)
       len = len >> 1
       if (len == 0):
          return
-#
-@cuda.jit
-def CUDA_result(arr,result):
-    i = cuda.grid(1)
-    if (i == 0):
-      result[0] = arr[0]
-#
-# device array
-#
-arr = cuda.device_array(NPTS,np.float32)
-result = cuda.device_array(1,np.float32)
-#arr = cuda.device_array(NPTS,np.int64) # for testing reduction
-#result = cuda.device_array(1,np.int64) # for testing reduction
-#
-# compile kernels by calling them
-#
-init[grid_size,block_size](arr)
-pi = Numba_reduce(arr)
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-#
-# CUDA kernel array calculation
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-cuda.synchronize()
-end_time = time.time()
-mflops = NPTS*4.0/(1.0e6*(end_time-start_time))
-print("CUDA kernel array calculation:")
-print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# Numba reduce
-#
-init[grid_size,block_size](arr)
-start_time = time.time()
-pi = Numba_reduce(arr)
-end_time = time.time()
-mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
-print("Numba reduce:")
-print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with Numba reduce
-#
+darr = cuda.device_array(npts,np.float64)
+init[grid_size,block_size](darr,nloop) # compile kernel
+reduce(darr,npts) # compile kernel
 start_time = time.time()
-init[grid_size,block_size](arr)
+init[grid_size,block_size](darr,nloop)
+reduce(darr,npts)
 cuda.synchronize()
-pi = Numba_reduce(arr)
-end_time = time.time()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with Numba reduce:")
-print("   NPTS = %d, pi = %f"%(NPTS,pi))
-print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# CUDA kernel reduction
-#
-init[grid_size,block_size](arr)
-start_time = time.time()
-CUDA_reduce(arr,NPTS)
-end_time = time.time()
-mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
-print("CUDA kernel reduction:")
-print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with CUDA kernel reduction
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-cuda.synchronize()
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-cuda.synchronize()
-end_time = time.time()
-pi = result.copy_to_host()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with CUDA kernel reduction:")
-print("   NPTS = %d, pi = %f"%(NPTS,pi[0]))
-print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with CUDA kernel reduction and transfer
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-pi = result.copy_to_host()
 end_time = time.time()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with CUDA kernel reduction and transfer:")
-print("   NPTS = %d, pi = %f"%(NPTS,pi[0]))
-print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
+mflops = npts*nloop*5.0/(1.0e6*(end_time-start_time))
+harr = darr.copy_to_host()
+print("npts = %d, nloop = %d, pi = %f"%(npts,nloop,harr[0]))
+print("time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
 
diff --git a/README.md b/README.md
index f44456e47fca9c224bf90556d985787378f7a9f9..3bead95b930285626b824da0d68b892b5109afbe 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@
 |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
 |2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
 |2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
+|1,635|[cudapi.cu](CUDA/cudapi.cu)|C++, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
+|1,090|[numbapig.py](Python/numbapig.py)|Python, Numba, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
 |315|[numbapip.py](Python/numbapip.py)|Python, Numba, parallel, fastmath<br>96 cores|Intel 2x Xeon Platinum 8175M|Feb 7, 2020|
 |272|[threadpi.c](C/threadpi.c)|C, 96 threads<br>gcc threadpi.c -o threadpi -O3 -ffast-math -pthread|Intel 2x Xeon Platinum 8175M|Jun 3, 2019|
 |211|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 1 node, 96 cores<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|