Commit 174c2a1b authored by Neil Gershenfeld

wip

parent 73d4808e
Pipeline #5090 passed with stage in 1 second
//
// cudapi.cu
// Neil Gershenfeld 3/1/20
// calculation of pi by a CUDA sum
// pi = 3.14159265358979323846
//
#include <cstdio>
#include <cstdint>
#include <cinttypes>
#include <chrono>

uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t nloop = 1000000;
uint64_t npts = blocks*threads;

// each thread accumulates nloop terms of the series into its own slot of arr
__global__ void init(double *arr,uint64_t nloop) {
    uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
    uint64_t start = nloop*i+1;
    uint64_t end = nloop*(i+1)+1;
    arr[i] = 0;
    for (uint64_t j = start; j < end; ++j)
        arr[i] += 0.5/((j-0.75)*(j-0.25));
}

// one reduction pass: fold the upper half of the active range onto the lower half
__global__ void reduce_sum(double *arr,uint64_t len) {
    uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
    if (i < len)
        arr[i] += arr[i+len];
}

// pairwise tree reduction: halve the active length until the total is in arr[0]
void reduce(double *arr) {
    uint64_t len = npts >> 1;
    while (len > 0) {
        reduce_sum<<<blocks,threads>>>(arr,len);
        len = len >> 1;
    }
}

int main(void) {
    double harr[1],*darr;
    cudaMalloc(&darr,npts*sizeof(double));
    auto tstart = std::chrono::high_resolution_clock::now();
    init<<<blocks,threads>>>(darr,nloop);
    reduce(darr);
    cudaDeviceSynchronize();
    auto tend = std::chrono::high_resolution_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
    auto mflops = npts*nloop*5.0/dt; // 5 flops per term; dt is in microseconds
    cudaMemcpy(harr,darr,sizeof(double),cudaMemcpyDeviceToHost);
    printf("npts = %" PRIu64 ", nloop = %" PRIu64 ", pi = %lf\n",npts,nloop,harr[0]);
    printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
    cudaFree(darr);
    return 0;
}
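For reference, the summand that init() accumulates is a partial-fraction form of the Leibniz series: 0.5/((j-0.75)*(j-0.25)) = 4/(4j-3) - 4/(4j-1), so summing over j = 1, 2, 3, ... gives 4*(1 - 1/3 + 1/5 - 1/7 + ...) = pi. A quick CPU-only check in plain Python (illustrative sketch only, not part of this commit; the file name is hypothetical):

# check_series.py -- partial sum of the same series on the CPU
n = 10**7
s = 0.0
for j in range(1,n+1):
    s += 0.5/((j-0.75)*(j-0.25))
print(s) # approaches 3.14159265358979323846 as n grows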
#
# numbapig.py
# Neil Gershenfeld 3/1/20
# calculation of pi by a Numba CUDA sum
# pi = 3.14159265358979323846
#
from numba import cuda
import numpy as np
import time
#
# problem size
#
block_size = 1024
grid_size = 1024
nloop = 1000000
npts = grid_size*block_size
#
# kernels and functions
#
@cuda.jit
def init(arr,nloop):
    # each thread accumulates nloop terms of the series into its own slot of arr
    i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x
    start = nloop*i+1
    end = nloop*(i+1)+1
    arr[i] = 0
    for j in range(start,end):
        arr[i] += 0.5/((j-0.75)*(j-0.25))
#
@cuda.jit
def reduce_sum(arr,len):
    # one reduction pass: fold the upper half of the active range onto the lower half
    i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x
    if (i < len):
        arr[i] += arr[i+len]
#
def reduce(arr,npts):
    # pairwise tree reduction: halve the active length until the total is in arr[0]
    len = npts >> 1
    while (len > 0):
        reduce_sum[grid_size,block_size](arr,len)
        len = len >> 1
#
# device array
#
darr = cuda.device_array(npts,np.float64)
#
# compile kernels by calling them
#
init[grid_size,block_size](darr,nloop)
reduce(darr,npts)
#
# calculation and timing
#
start_time = time.time()
init[grid_size,block_size](darr,nloop)
reduce(darr,npts)
cuda.synchronize()
end_time = time.time()
mflops = npts*nloop*5.0/(1.0e6*(end_time-start_time))
harr = darr.copy_to_host()
print("npts = %d, nloop = %d, pi = %f"%(npts,nloop,harr[0]))
print("time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
@@ -6,6 +6,8 @@
|88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
|2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
|2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
|1,635|[cudapi.cu](CUDA/cudapi.cu)|C++, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
|1,090|[numbapig.py](Python/numbapig.py)|Python, Numba, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
|315|[numbapip.py](Python/numbapip.py)|Python, Numba, parallel, fastmath<br>96 cores|Intel 2x Xeon Platinum 8175M|Feb 7, 2020|
|272|[threadpi.c](C/threadpi.c)|C, 96 threads<br>gcc threadpi.c -o threadpi -O3 -ffast-math -pthread|Intel 2x Xeon Platinum 8175M|Jun 3, 2019|
|211|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 1 node, 96 cores<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|