#
# NOTE(review): this chunk is the post-patch content of Python/numbapig.py from
# file line 11 onward, reconstructed from a whitespace-mangled unified diff.
# The unseen file header (lines 1-10) is assumed to provide:
#     import numpy as np
#     from numba import cuda
#     import time
# -- TODO confirm against the full file.
#
# problem size (NPTS must be a power of two: CUDA_reduce halves it each pass)
#
block_size = 2**10
grid_size = 2**21
NPTS = grid_size*block_size
#
# kernels and functions
#
@cuda.jit
def init(arr):
    """Fill arr with the terms of a telescoping series whose sum is pi.

    Thread i (0-based grid index) writes arr[i] = 0.5/((i+0.25)*(i+0.75)),
    expressed below with the 1-based index i = grid+1.
    """
    i = 1+cuda.grid(1)
    arr[i-1] = 0.5/((i-0.75)*(i-0.25))
#
@cuda.reduce
def Numba_reduce(a,b):
    """Pairwise sum combiner for Numba's built-in device reduction."""
    return a+b
#
@cuda.jit
def CUDA_sum(arr,n):
    """One folding pass: add the upper half of arr[0:2n] onto the lower half.

    (parameter renamed from 'len', which shadowed the builtin)
    """
    i = cuda.grid(1)
    if (i < n):
        arr[i] += arr[i+n]
#
def CUDA_reduce(arr,NPTS):
    """Tree reduction by repeated folding kernels; the sum lands in arr[0].

    Assumes NPTS is a power of two; each pass halves the active length.
    All launches are asynchronous -- callers must cuda.synchronize()
    before reading arr (or time the reduction).
    """
    n = NPTS >> 1
    while n:
        CUDA_sum[grid_size,block_size](arr,n)
        n >>= 1
#
# device array
#
arr = cuda.device_array(NPTS,np.float32)
#
# compile kernels (warm-up launches so the timings exclude JIT compilation)
#
init[grid_size,block_size](arr)
pi = Numba_reduce(arr)
CUDA_reduce(arr,NPTS)
cuda.synchronize()
#
# CUDA kernel array calculation
#
start_time = time.time()
init[grid_size,block_size](arr)
# kernel launches are asynchronous: synchronize so end_time covers the
# device work, not just the launch overhead (the original timed only the launch)
cuda.synchronize()
end_time = time.time()
mflops = NPTS*4.0/(1.0e6*(end_time-start_time))
print("CUDA kernel array calculation:")
print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
# Numba reduce
#
init[grid_size,block_size](arr)
cuda.synchronize()
start_time = time.time()
pi = Numba_reduce(arr)   # returns the scalar to the host, so it synchronizes
end_time = time.time()
mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
print("Numba reduce:")
print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
# both with Numba reduce
#
start_time = time.time()
init[grid_size,block_size](arr)
pi = Numba_reduce(arr)
end_time = time.time()
mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
print("both with Numba reduce:")
print("   NPTS = %d, pi = %f"%(NPTS,pi))
print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
# CUDA kernel reduction
#
init[grid_size,block_size](arr)
cuda.synchronize()
start_time = time.time()
CUDA_reduce(arr,NPTS)
cuda.synchronize()   # folding kernels are async; include them in the timing
end_time = time.time()
mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
print("CUDA kernel reduction:")
print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
# both with CUDA kernel reduction
#
start_time = time.time()
init[grid_size,block_size](arr)
CUDA_reduce(arr,NPTS)
cuda.synchronize()   # original synced only via copy_to_host AFTER end_time
end_time = time.time()
darr = arr.copy_to_host()
mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
print("both with CUDA kernel reduction:")
print("   NPTS = %d, pi = %f"%(NPTS,darr[0]))
print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))