#
# numbapig.py
# Neil Gershenfeld 3/1/20
# calculation of pi by a Numba CUDA sum
# pi = 3.14159265358979323846 
#
from numba import cuda
import numpy as np
import time
block_size = 1024
grid_size = 1024
nloop = 1000000
npts = grid_size*block_size
@cuda.jit
def init(arr,nloop):
   i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x;
   start = nloop*i+1;
   end = nloop*(i+1)+1;
   arr[i] = 0;
   for j in range(start,end):
      arr[i] += 0.5/((j-0.75)*(j-0.25));
@cuda.jit
def reduce_sum(arr,len):
   i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x;
   if (i < len):
      arr[i] += arr[i+len]
def reduce(arr,npts):
   len = npts >> 1
   while (1):
      reduce_sum[grid_size,block_size](arr,len)
      len = len >> 1
      if (len == 0):
         return
darr = cuda.device_array(npts,np.float64)
init[grid_size,block_size](darr,nloop) # compile kernel
reduce(darr,npts) # compile kernel
start_time = time.time()
init[grid_size,block_size](darr,nloop)
reduce(darr,npts)
cuda.synchronize()
end_time = time.time()
mflops = npts*nloop*5.0/(1.0e6*(end_time-start_time))
harr = darr.copy_to_host()
print("npts = %d, nloop = %d, pi = %f"%(npts,nloop,harr[0]))
print("time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))