diff --git a/mpi_pi_gpu/.gitignore b/mpi_pi_gpu/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d3109f4afc0ddc627b1af2a06988e91fae3405ef
--- /dev/null
+++ b/mpi_pi_gpu/.gitignore
@@ -0,0 +1,3 @@
+kernels.o
+mpi_pi_gpu
+output
diff --git a/mpi_pi_gpu/Makefile b/mpi_pi_gpu/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..a879a7db4327833ef3e89a445d6a4f3684839089
--- /dev/null
+++ b/mpi_pi_gpu/Makefile
@@ -0,0 +1,5 @@
+mpi_pi_gpu: mpi_pi_gpu.cpp kernels.o kernels.h constants.h
+	mpic++ mpi_pi_gpu.cpp kernels.o -lcudart -O3 -o $@
+
+kernels.o: kernels.cu kernels.h constants.h
+	nvcc -c $< -O3 -use_fast_math -o $@
diff --git a/mpi_pi_gpu/constants.h b/mpi_pi_gpu/constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8c307fc9ee1edf16cd969f7404eaec15d8e29d5
--- /dev/null
+++ b/mpi_pi_gpu/constants.h
@@ -0,0 +1,13 @@
+#ifndef MPI_PI_GPU_CONSTANTS_H
+#define MPI_PI_GPU_CONSTANTS_H
+
+#include <cstdint>
+
+// Currently init_kernel assumes n_terms_per_thread is a multiple of 10.
+uint64_t const n_terms_per_thread = 1000000;
+uint64_t const n_threads_per_gpu = 1024 * 1024;
+uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
+uint64_t const n_threads_per_block = 1024;
+uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
+
+#endif
diff --git a/mpi_pi_gpu/kernels.cu b/mpi_pi_gpu/kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a4254e712f9ed7a891ee8db9948e2162a3d9b8a8
--- /dev/null
+++ b/mpi_pi_gpu/kernels.cu
@@ -0,0 +1,54 @@
+#include "kernels.h"
+
+//--------------------------------------------------------------------------------------------------
+// Each thread sums n_terms_per_thread consecutive terms of
+// sum_{i>=1} 0.5/((i - 0.75)(i - 0.25)) = pi; by partial fractions the terms
+// regroup into the Leibniz series 4(1 - 1/3 + 1/5 - ...).
+__global__
+void init_kernel(double *arr, int gpu_idx) {
+    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx >= n_threads_per_gpu) {
+        return;
+    }
+
+    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
+    uint64_t const end = start + n_terms_per_thread;
+
+    double sum = 0.0;
+    uint64_t i = start;
+    while (i < end) {
+        #pragma unroll
+        for (int j = 0; j < 10; ++j) {
+            sum += 0.5 / ((i - 0.75) * (i - 0.25));
+            ++i;
+        }
+    }
+    arr[thread_idx] = sum;
+}
+
+//--------------------------------------------------------------------------------------------------
+__global__
+void reduce_sum_kernel(double *arr, uint64_t stride) {
+    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx < stride) {
+        arr[thread_idx] += arr[thread_idx + stride];
+    }
+}
+
+//..................................................................................................
+void init(double *arr, int gpu_idx) {
+    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
+}
+
+//..................................................................................................
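+// Tree reduction: each pass folds the upper half of the active range into the
+// lower half, so after log2(n_threads_per_gpu) passes arr[0] holds the full sum.
+// Back-to-back launches on the default stream run in order, giving a grid-wide
+// barrier between passes; this assumes n_threads_per_gpu is a power of two.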
+void reduce(double *arr) {
+    uint64_t stride = n_threads_per_gpu >> 1;
+    while (stride > 0) {
+        reduce_sum_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
+        stride = stride >> 1;
+    }
+}
diff --git a/mpi_pi_gpu/kernels.h b/mpi_pi_gpu/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..984b36b17c1b413be0ed4441ae54696d1797ced0
--- /dev/null
+++ b/mpi_pi_gpu/kernels.h
@@ -0,0 +1,12 @@
+#ifndef MPI_PI_GPU_KERNELS_H
+#define MPI_PI_GPU_KERNELS_H
+
+#include "constants.h"
+
+//--------------------------------------------------------------------------------------------------
+void init(double *arr, int gpu_idx);
+
+//--------------------------------------------------------------------------------------------------
+void reduce(double *arr);
+
+#endif
diff --git a/mpi_pi_gpu/load_modules.sh b/mpi_pi_gpu/load_modules.sh
new file mode 100755
index 0000000000000000000000000000000000000000..450477e7571b88a7dc52e6a251bcc1c112e4bd35
--- /dev/null
+++ b/mpi_pi_gpu/load_modules.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+module purge
+module load spack git gcc/7.3.0 cuda/10.1 openmpi/3.1.4-pmi-cuda-ucx
diff --git a/mpi_pi_gpu/mpi_pi_gpu.cpp b/mpi_pi_gpu/mpi_pi_gpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8d7b10c9c52d0aa17c46e871cccbf9b7efeb9543
--- /dev/null
+++ b/mpi_pi_gpu/mpi_pi_gpu.cpp
@@ -0,0 +1,121 @@
+// Neil Gershenfeld 10/14/20
+// Erik Strand 3/6/2021
+// calculation of pi by a CUDA+MPI sum
+// assumes one GPU per MPI rank
+
+#include <chrono>
+#include <cstdlib>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <mpi.h>
+#include "constants.h"
+#include "kernels.h"
+
+using namespace std;
+
+int const n_loops = 8;
+
+int main(int argc, char** argv) {
+    // Determine our index within this node.
+    char* local_rank_str = nullptr;
+    int local_rank = 0;
+    local_rank_str = getenv("SLURM_LOCALID");
+    if (local_rank_str != nullptr) {
+        local_rank = atoi(local_rank_str);
+    } else {
+        std::cerr << "slurm local rank not defined\n";
+    }
+
+    // Determine how many GPUs this node has.
+    int n_gpus;
+    cudaGetDeviceCount(&n_gpus);
+    if (n_gpus < 1) {
+        std::cerr << "no CUDA devices found\n";
+        return 1;
+    }
+    // local meaning relative to this node
+    int local_gpu_idx = local_rank % n_gpus;
+    cudaSetDevice(local_gpu_idx);
+
+    // Initialize MPI.
+    int n_tasks, rank;
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &n_tasks);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    // We assume one GPU per rank.
+    int gpu_idx = rank;
+
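+    // Note: local_gpu_idx (set above) chooses which device this rank drives,
+    // while gpu_idx chooses which slice of the series it sums; both rely on the
+    // one-GPU-per-rank assumption noted at the top of this file.
+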
+    /*
+    // Check what node we're on.
+    int host_name_length;
+    char host_name[MPI_MAX_PROCESSOR_NAME];
+    MPI_Get_processor_name(host_name, &host_name_length);
+    std::cout << "task: " << rank << " of " << n_tasks << '\n';
+    std::cout << "node: " << host_name << '\n';
+    std::cout << "slurm local id: " << local_rank << '\n';
+    std::cout << "local gpu idx: " << local_gpu_idx << " of " << n_gpus << '\n';
+    std::cout << "global gpu idx: " << gpu_idx << '\n';
+    std::cout << '\n';
+    */
+
+    // allocate device data
+    double *d_arr;
+    cudaMalloc(&d_arr, n_threads_per_gpu * sizeof(double));
+
+    // host data
+    double result, pi;
+
+    // rank 0 timing data
+    decltype(std::chrono::high_resolution_clock::now()) global_start;
+    decltype(std::chrono::high_resolution_clock::now()) global_stop;
+    decltype(std::chrono::high_resolution_clock::now()) start;
+    decltype(std::chrono::high_resolution_clock::now()) stop;
+
+    if (rank == 0) {
+        global_start = std::chrono::high_resolution_clock::now();
+    }
+    for (int i = 0; i < n_loops; ++i) {
+        if (rank == 0) {
+            start = std::chrono::high_resolution_clock::now();
+        }
+
+        // Each rank sums its slice on its own GPU, tree-reduces it to d_arr[0],
+        // copies that single double back, and MPI-reduces across ranks into pi.
+        MPI_Barrier(MPI_COMM_WORLD);
+        init(d_arr, gpu_idx);
+        reduce(d_arr);
+        cudaDeviceSynchronize();
+        cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
+        MPI_Reduce(&result, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+        if (rank == 0) {
+            stop = std::chrono::high_resolution_clock::now();
+            auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
+            auto const millis = duration.count();
+            auto const n_terms_total = n_tasks * n_terms_per_gpu;
+            // 5 flops per term: two subtractions, a multiply, a divide, and an add.
+            auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
+
+            std::cout << "loop " << i << '\n';
+            std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu
+                << ", total terms = " << n_terms_total << '\n';
+            std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
+            std::cout << "pi ~ " << pi << '\n';
+            std::cout << '\n';
+        }
+    }
+    if (rank == 0) {
+        global_stop = std::chrono::high_resolution_clock::now();
+        auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
+        auto const millis = duration.count();
+        std::cout << "total time = " << millis * 1e-3 << "s\n";
+    }
+
+    cudaFree(d_arr);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/mpi_pi_gpu/mpi_pi_gpu.slurm b/mpi_pi_gpu/mpi_pi_gpu.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..ff487ba2d5f88cf2358cd9c43a1dbc6a123d43f6
--- /dev/null
+++ b/mpi_pi_gpu/mpi_pi_gpu.slurm
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH -J mpi_pi_gpu
+#SBATCH -o output/%j.out
+#SBATCH -e output/%j.err
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --exclusive
+#SBATCH --cpus-per-task=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --threads-per-core=1
+#SBATCH --mem=10G
+#SBATCH --time 00:05:00
+
+source ./load_modules.sh
+srun ./mpi_pi_gpu
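+
+# Note: ntasks-per-node matches gres=gpu:4, so each rank gets its own GPU,
+# which is the one-GPU-per-rank assumption mpi_pi_gpu.cpp relies on.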