Skip to content
Snippets Groups Projects
Commit 2e704462 authored by Erik Strand's avatar Erik Strand
Browse files

Fix indexing errors and clean up code

parent 995d1a3a
No related branches found
No related tags found
No related merge requests found
kernels.o
mpi_pi_gpu
output
mpi_pi_gpu: mpi_pi_gpu.cpp
mpic++ $< -lcudart -o $@
mpi_pi_gpu: mpi_pi_gpu.cpp kernels.o
mpic++ $^ -lcudart -O3 -o $@
kernels.o: kernels.cu kernels.h constants.h
nvcc -c $< -O3 -use_fast_math -o $@
#ifndef MPI_PI_GPU_CONSTANTS_H
#define MPI_PI_GPU_CONSTANTS_H
#include <cstdint>
// currently init_kernel assumes n_terms_per_thread is a multiple of 10
uint64_t const n_terms_per_thread = 1000000;
uint64_t const n_threads_per_gpu = 1024 * 1024;
uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
uint64_t const n_threads_per_block = 1024;
uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
#endif
......@@ -2,20 +2,30 @@
//--------------------------------------------------------------------------------------------------
__global__
void init(double *arr, int gpu_idx) {
void init_kernel(double *arr, int gpu_idx) {
uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx >= n_threads_per_gpu) {
return;
}
uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1;
uint64_t const end = start + n_terms_per_thread;
double sum = 0.0;
for (uint64_t i = start; i < end; ++i) {
uint64_t i = start;
while (i < end) {
#pragma unroll
for (int j = 0; j < 10; ++j) {
sum += 0.5 / ((i - 0.75) * (i - 0.25));
++i;
}
}
arr[thread_idx] = sum;
}
//--------------------------------------------------------------------------------------------------
__global__
void reduce_sum(double *arr, uint64_t stride) {
void reduce_sum_kernel(double *arr, uint64_t stride) {
uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < stride) {
arr[thread_idx] += arr[thread_idx + stride];
......@@ -23,13 +33,15 @@ void reduce_sum(double *arr, uint64_t stride) {
}
//..................................................................................................
void init(double *arr, int gpu_idx);
void init(double *arr, int gpu_idx) {
init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
}
//..................................................................................................
void reduce(double *arr) {
uint64_t stride = n_threads_per_gpu >> 1;
while (stride > 0) {
reduce_sum<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
reduce_sum_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
stride = stride >> 1;
}
}
#ifndef MPI_PI_GPU_KERNELS_H
#define MPI_PI_GPU_KERNELS_H
#include "constants.h"
//--------------------------------------------------------------------------------------------------
void init(double *arr, int gpu_idx);
......
......@@ -4,90 +4,106 @@
// assumes one GPU per MPI rank
#include <chrono>
#include <cstdint>
#include <cuda_runtime.h>
#include <iostream>
#include <mpi.h>
//#include "kernels.h"
#include "constants.h"
#include "kernels.h"
using namespace std;
uint64_t const n_terms_per_thread = 100000;
uint64_t const n_threads_per_gpu = 1024;
uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
uint64_t const n_threads_per_block = 512;
uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
int const n_loops = 8;
int main(int argc, char** argv) {
char* local_rank_str = NULL;
// Determine our index within this node.
char* local_rank_str = nullptr;
int local_rank = 0;
local_rank_str = getenv("SLURM_LOCALID");
if (local_rank_str != NULL) {
if (local_rank_str != nullptr) {
local_rank = atoi(local_rank_str);
std::cout << "slurm local rank = " << local_rank << '\n';
} else {
std::cout << "slurm local rank not defined\n";
std::cerr << "slurm local rank not defined\n";
}
// Determine how many GPUs this node has.
int n_gpus;
cudaGetDeviceCount(&n_gpus);
// relative to this node
// local meaning relative to this node
int local_gpu_idx = local_rank % n_gpus;
//cudaSetDevice(local_gpu_idx);
cudaSetDevice(local_gpu_idx);
// Initialize MPI.
int n_tasks, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &n_tasks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// We assume that all nodes have the same number of GPUs.
int gpu_idx = n_gpus * rank + local_gpu_idx;
// We assume one GPU per rank.
int gpu_idx = rank;
/*
// Check what node we're on.
int host_name_length;
char host_name[MPI_MAX_PROCESSOR_NAME];
MPI_Get_processor_name(host_name, &host_name_length);
std::cout << "task: " << rank << " of " << n_tasks << '\n';
std::cout << "node: " << host_name << '\n';
std::cout << "local gpu idx: " << local_gpu_idx << '\n';
std::cout << "slurm local id: " << local_rank << '\n';
std::cout << "local gpu idx: " << local_gpu_idx << " of " << n_gpus << '\n';
std::cout << "global gpu idx: " << gpu_idx << '\n';
std::cout << '\n';
*/
// allocate device data
double *d_arr;
cudaMalloc(&d_arr, n_threads_per_gpu * sizeof(double));
// host data
double result, pi;
// rank 0 timing data
decltype(std::chrono::high_resolution_clock::now()) global_start;
decltype(std::chrono::high_resolution_clock::now()) global_stop;
decltype(std::chrono::high_resolution_clock::now()) start;
decltype(std::chrono::high_resolution_clock::now()) stop;
/*
int rank,nranks;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Comm_size(MPI_COMM_WORLD,&nranks);
double *arr,result,pi;
cudaMalloc(&arr,nthreads*sizeof(double));
if (rank == 0) {
for (int i = 0; i < nloop; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
double tstart = MPI_Wtime();
init<<<blocks,threads>>>(arr,npts,nthreads,rank);
reduce(arr);
cudaDeviceSynchronize();
cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
double tend = MPI_Wtime();
double dt = tend-tstart;
double gflops = npts*nthreads*nranks*5.0/dt/1e9;
printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
printf("time = %f, estimated GFlops = %f\n",dt,gflops);
global_start = std::chrono::high_resolution_clock::now();
}
for (int i = 0; i < n_loops; ++i) {
if (rank == 0) {
start = std::chrono::high_resolution_clock::now();
}
else {
for (int i = 0; i < nloop; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
init<<<blocks,threads>>>(arr,npts,nthreads,rank);
reduce(arr);
init(d_arr, gpu_idx);
reduce(d_arr);
cudaDeviceSynchronize();
cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
MPI_Reduce(&result, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0) {
stop = std::chrono::high_resolution_clock::now();
auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
auto const millis = duration.count();
auto const n_terms_total = n_tasks * n_terms_per_gpu;
auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
std::cout << "loop " << i << '\n';
std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu
<< ", total terms = " << n_terms_total << '\n';
std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
std::cout << "pi ~ " << pi << '\n';
std::cout << '\n';
}
}
cudaFree(arr);
*/
if (rank == 0) {
global_stop = std::chrono::high_resolution_clock::now();
auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
auto const millis = duration.count();
std::cout << "total time = " << millis * 1e-3 << "s\n";
}
cudaFree(d_arr);
MPI_Finalize();
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment