Commit e93ae6b4 authored by Erik Strand

Write an MPI pi calculation for GPUs

parents fdf08d82 fafbebf0
kernels.o
mpi_pi_gpu
output
mpi_pi_gpu: mpi_pi_gpu.cpp kernels.o
mpic++ $^ -lcudart -O3 -o $@
kernels.o: kernels.cu kernels.h constants.h
nvcc -c $< -O3 -use_fast_math -o $@
#ifndef MPI_PI_GPU_CONSTANTS_H
#define MPI_PI_GPU_CONSTANTS_H
#include <cstdint>
// currently init_kernel assumes n_terms_per_thread is a multiple of 10
uint64_t const n_terms_per_thread = 1000000;
uint64_t const n_threads_per_gpu = 1024 * 1024;
uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
uint64_t const n_threads_per_block = 1024;
uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
#endif
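Two of the assumptions baked into these constants are easy to verify at compile time. The snippet below is a minimal sketch, not part of the commit; it could live in any translation unit that includes constants.h (the 1024-thread figure is the usual CUDA per-block limit).

#include "constants.h"
static_assert(n_terms_per_thread % 10 == 0,
              "init_kernel unrolls its inner loop by 10");
static_assert((n_threads_per_gpu & (n_threads_per_gpu - 1)) == 0,
              "reduce() halves the thread count down to 1, so it must be a power of two");
static_assert(n_threads_per_block <= 1024,
              "typical CUDA limit on threads per block");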
#include "kernels.h"
//--------------------------------------------------------------------------------------------------
// Each thread sums n_terms_per_thread consecutive terms of the series, starting at an
// offset determined by its GPU and thread indices. The inner loop is unrolled by 10,
// which is why n_terms_per_thread must be a multiple of 10.
__global__
void init_kernel(double *arr, int gpu_idx) {
uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx >= n_threads_per_gpu) {
return;
}
uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
uint64_t const end = start + n_terms_per_thread;
double sum = 0.0;
uint64_t i = start;
while (i < end) {
#pragma unroll
for (int j = 0; j < 10; ++j) {
sum += 0.5 / ((i - 0.75) * (i - 0.25));
++i;
}
}
arr[thread_idx] = sum;
}
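For reference (this note is not in the original source): the expression inside the loop is the partial-fraction form of the Leibniz series, so summing it over i = 1, 2, ... converges to pi; each GPU sums a finite block of consecutive terms.

\[
\frac{0.5}{(i - 0.75)(i - 0.25)} = \frac{1}{i - 0.75} - \frac{1}{i - 0.25} = \frac{4}{4i - 3} - \frac{4}{4i - 1},
\qquad
\sum_{i = 1}^{\infty} \left( \frac{4}{4i - 3} - \frac{4}{4i - 1} \right) = 4 \left( 1 - \tfrac{1}{3} + \tfrac{1}{5} - \cdots \right) = \pi .
\]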
//--------------------------------------------------------------------------------------------------
// One step of a pairwise tree reduction: each of the first `stride` threads adds the
// element `stride` slots ahead into its own slot.
__global__
void reduce_sum_kernel(double *arr, uint64_t stride) {
uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < stride) {
arr[thread_idx] += arr[thread_idx + stride];
}
}
//..................................................................................................
// Launch one thread per partial sum.
void init(double *arr, int gpu_idx) {
init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
}
//..................................................................................................
// Pairwise tree reduction: halve the stride until the full sum lands in arr[0]. All the
// launches go to the default stream, so they run in order without explicit synchronization.
void reduce(double *arr) {
uint64_t stride = n_threads_per_gpu >> 1;
while (stride > 0) {
reduce_sum_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
stride = stride >> 1;
}
}
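A possible refinement, sketched here and not part of the commit: reduce() launches the full grid at every step even though only `stride` threads do any work, so the launch could shrink along with the stride. The name reduce_shrinking is illustrative only.

void reduce_shrinking(double *arr) {
    uint64_t stride = n_threads_per_gpu >> 1;
    while (stride > 0) {
        // Only threads with thread_idx < stride participate, so cover exactly that many.
        uint64_t const blocks = (stride + n_threads_per_block - 1) / n_threads_per_block;
        reduce_sum_kernel<<<blocks, n_threads_per_block>>>(arr, stride);
        stride >>= 1;
    }
}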
#ifndef MPI_PI_GPU_KERNELS_H
#define MPI_PI_GPU_KERNELS_H
#include "constants.h"
//--------------------------------------------------------------------------------------------------
void init(double *arr, int gpu_idx);
//--------------------------------------------------------------------------------------------------
void reduce(double *arr);
#endif
#!/bin/bash
module purge
module load spack git gcc/7.3.0 cuda/10.1 openmpi/3.1.4-pmi-cuda-ucx
// Neil Gershenfeld 10/14/20
// Erik Strand 3/6/2021
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
#include <chrono>
#include <cstdlib> // getenv, atoi
#include <cuda_runtime.h>
#include <iostream>
#include <mpi.h>
#include "constants.h"
#include "kernels.h"
using namespace std;
int const n_loops = 8;
int main(int argc, char** argv) {
// Determine our index within this node.
char* local_rank_str = nullptr;
int local_rank = 0;
local_rank_str = getenv("SLURM_LOCALID");
if (local_rank_str != nullptr) {
local_rank = atoi(local_rank_str);
} else {
std::cerr << "SLURM_LOCALID not set; defaulting to local rank 0\n";
}
// Determine how many GPUs this node has.
int n_gpus;
cudaGetDeviceCount(&n_gpus);
// "local" here means relative to this node, not across the whole job
int local_gpu_idx = local_rank % n_gpus;
cudaSetDevice(local_gpu_idx);
// Initialize MPI.
int n_tasks, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &n_tasks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// We assume one GPU per rank.
int gpu_idx = rank;
/*
// Check what node we're on.
int host_name_length;
char host_name[MPI_MAX_PROCESSOR_NAME];
MPI_Get_processor_name(host_name, &host_name_length);
std::cout << "task: " << rank << " of " << n_tasks << '\n';
std::cout << "node: " << host_name << '\n';
std::cout << "slurm local id: " << local_rank << '\n';
std::cout << "local gpu idx: " << local_gpu_idx << " of " << n_gpus << '\n';
std::cout << "global gpu idx: " << gpu_idx << '\n';
std::cout << '\n';
*/
// allocate device data
double *d_arr;
cudaMalloc(&d_arr, n_threads_per_gpu * sizeof(double));
// host data
double result, pi;
// rank 0 timing data
using hr_clock = std::chrono::high_resolution_clock;
hr_clock::time_point global_start, global_stop, start, stop;
if (rank == 0) {
global_start = std::chrono::high_resolution_clock::now();
}
for (int i = 0; i < n_loops; ++i) {
if (rank == 0) {
start = std::chrono::high_resolution_clock::now();
}
MPI_Barrier(MPI_COMM_WORLD);
init(d_arr, gpu_idx);
reduce(d_arr);
cudaDeviceSynchronize();
cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
MPI_Reduce(&result, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0) {
stop = std::chrono::high_resolution_clock::now();
auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
auto const millis = duration.count();
auto const n_terms_total = n_tasks * n_terms_per_gpu;
// Each term costs roughly 5 floating point operations: two subtractions, a multiply, a
// divide, and an add.
auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
std::cout << "loop " << i << '\n';
std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu
<< ", total terms = " << n_terms_total << '\n';
std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
std::cout << "pi ~ " << pi << '\n';
std::cout << '\n';
}
}
if (rank == 0) {
global_stop = std::chrono::high_resolution_clock::now();
auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
auto const millis = duration.count();
std::cout << "total time = " << millis * 1e-3 << "s\n";
}
cudaFree(d_arr);
MPI_Finalize();
return 0;
}
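One caveat worth noting (a sketch, not part of the commit): the CUDA calls above, cudaGetDeviceCount, cudaSetDevice, cudaMalloc, cudaDeviceSynchronize, and cudaMemcpy, all discard their cudaError_t return values. A small wrapper like the one below could surface failures; the macro name CUDA_CHECK is illustrative only.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t const err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                            \
            std::fprintf(stderr, "CUDA error: %s at %s:%d\n",                 \
                         cudaGetErrorString(err_), __FILE__, __LINE__);       \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Example usage: CUDA_CHECK(cudaMalloc(&d_arr, n_threads_per_gpu * sizeof(double)));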
#!/bin/bash
#SBATCH -J mpi_pi_gpu
#SBATCH -o output/%j.out
#SBATCH -e output/%j.err
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --exclusive
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-core=1
#SBATCH --threads-per-core=1
#SBATCH --mem=10G
#SBATCH --time 00:05:00
source ./load_modules.sh
srun ./mpi_pi_gpu
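A quick sanity check on the job size (assuming, as the source does, one GPU per rank): --nodes=2 with --ntasks-per-node=4 gives 8 ranks, so

\[
N_{\text{terms}} = 8 \times \underbrace{10^{6} \times 2^{20}}_{\text{n\_terms\_per\_gpu}} \approx 8.4 \times 10^{12},
\qquad
\text{GFlops} \approx \frac{5 \, N_{\text{terms}}}{t \,[\text{s}]} \times 10^{-9},
\]

which matches the 5-flops-per-term estimate printed by mpi_pi_gpu.cpp.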