Commit d935c24d authored by Neil Gershenfeld

wip

parent 29d3f0dd
Pipeline #8246 passed in 1 second
//
// cudampipi.cu
// Neil Gershenfeld 10/14/20
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
// pi = 3.14159265358979323846
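// (partial fractions: 0.5/((j-0.75)*(j-0.25)) = 4/(4j-3) - 4/(4j-1),
//  so the sum over j = 1,2,... is 4*(1 - 1/3 + 1/5 - ...) = pi)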
//
#include <iostream>
#include <cstdio>
#include <cstdint>
#include <mpi.h>
//
using namespace std;
//
uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t npts = 1000000;
uint64_t nthreads = blocks*threads;
int nloop = 10;
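// each rank sums npts*nthreads series terms per pass;
// nthreads must be a power of two for the pairwise reduction below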
//
__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
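// each thread accumulates its own run of npts consecutive series terms,
// offset by rank so no term is counted twice across GPUs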
uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
uint64_t start = npts*index+npts*nthreads*rank+1;
uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
arr[index] = 0;
for (uint64_t j = start; j < end; ++j)
arr[index] += 0.5/((j-0.75)*(j-0.25));
}
//
__global__ void reduce_sum(double *arr,uint64_t len) {
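// fold the upper half of the active range into the lower half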
uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
if (index < len)
arr[index] += arr[index+len];
}
//
void reduce(double *arr) {
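// pairwise tree reduction: halve the active length each pass
// until the total lands in arr[0]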
uint64_t len = nthreads >> 1;
while (1) {
reduce_sum<<<blocks,threads>>>(arr,len);
len = len >> 1;
if (len == 0)
return;
}
}
//
int main(int argc, char** argv) {
int rank,nranks;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Comm_size(MPI_COMM_WORLD,&nranks);
double *arr,result,pi;
cudaMalloc(&arr,nthreads*sizeof(double));
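// rank 0 times each pass and reports; the other ranks only compute
// and contribute their partial sums to the MPI_Reduce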
if (rank == 0) {
for (int i = 0; i < nloop; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
double tstart = MPI_Wtime();
init<<<blocks,threads>>>(arr,npts,nthreads,rank);
reduce(arr);
cudaDeviceSynchronize();
cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost); // reduced sum is in arr[0]
MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
double tend = MPI_Wtime();
double dt = tend-tstart;
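// ~5 floating-point operations per series term: two subtractions, a multiply, a divide, an add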
double gflops = npts*nthreads*nranks*5.0/dt/1e9;
printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
printf("time = %f, estimated GFlops = %f\n",dt,gflops);
}
}
else {
for (int i = 0; i < nloop; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
init<<<blocks,threads>>>(arr,npts,nthreads,rank);
reduce(arr);
cudaDeviceSynchronize();
cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost); // reduced sum is in arr[0]
MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
}
}
cudaFree(arr);
MPI_Finalize();
return 0;
}
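//
// For reference: a minimal serial C++ sketch of the same partial-fraction sum,
// without CUDA or MPI, handy for sanity-checking the parallel result;
// the term count N below is arbitrary and not taken from the code above.
//
#include <cstdio>
#include <cstdint>
int main(void) {
   uint64_t N = 1000000000;
   double pi = 0;
   // same series as the GPU kernel: 0.5/((j-0.75)*(j-0.25)), summed over j
   for (uint64_t j = 1; j <= N; ++j)
      pi += 0.5/((j-0.75)*(j-0.25));
   printf("pi = %lf\n",pi);
   return 0;
}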
@@ -3,6 +3,7 @@
|estimated GFlops|code|description|system|date|
|---|---|---|---|---|
|2,493,988|[cudampipi.cu](CUDA/cudampipi.cu)|C++, CUDA+MPI, 256 nodes, 1536 ranks<br>nvcc -arch=sm_70 -std=c++11|Oak Ridge OLCF Summit<br>IBM POWER9+NVIDIA V100|October 15, 2020|
|88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
|12,589|[cudapit.cu](CUDA/cudapit.cu)|C++, CUDA, 8 GPUs, 5120 cores/GPU|NVIDIA V100|March 3, 2020|
|11,083|[mpithreadpi.cpp](hybrid/mpithreadpi.cpp)|C++, MPI+threads, 128 nodes, 64 cores/node, 4 threads/core<br>CC mpithreadpi.cpp -o mpithreadpi -O3 -ffast-math -std=c++11|Argonne ALCF Theta<br>Cray XC40|Mar 8, 2020|