diff --git a/mpi_pi_gpu/.gitignore b/mpi_pi_gpu/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d3109f4afc0ddc627b1af2a06988e91fae3405ef
--- /dev/null
+++ b/mpi_pi_gpu/.gitignore
@@ -0,0 +1,3 @@
+kernels.o
+mpi_pi_gpu
+output
diff --git a/mpi_pi_gpu/Makefile b/mpi_pi_gpu/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..a879a7db4327833ef3e89a445d6a4f3684839089
--- /dev/null
+++ b/mpi_pi_gpu/Makefile
@@ -0,0 +1,5 @@
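+# Host code is linked with the MPI compiler wrapper; the device code in kernels.cu is
+# compiled separately by nvcc and linked in as kernels.o together with the CUDA runtime.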
+mpi_pi_gpu: mpi_pi_gpu.cpp kernels.o
+	mpic++ $^ -lcudart -O3 -o $@
+
+kernels.o: kernels.cu kernels.h constants.h
+	nvcc -c $< -O3 -use_fast_math -o $@
diff --git a/mpi_pi_gpu/constants.h b/mpi_pi_gpu/constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8c307fc9ee1edf16cd969f7404eaec15d8e29d5
--- /dev/null
+++ b/mpi_pi_gpu/constants.h
@@ -0,0 +1,13 @@
+#ifndef MPI_PI_GPU_CONSTANTS_H
+#define MPI_PI_GPU_CONSTANTS_H
+
+#include <cstdint>
+
+// currently init_kernel assumes n_terms_per_thread is a multiple of 10
+uint64_t const n_terms_per_thread = 1000000;
+uint64_t const n_threads_per_gpu = 1024 * 1024;
+uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
+uint64_t const n_threads_per_block = 1024;
+uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
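+
+// With these values each kernel launch sums n_terms_per_gpu (about 1.05e12) terms on one GPU.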
+
+#endif
diff --git a/mpi_pi_gpu/kernels.cu b/mpi_pi_gpu/kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a4254e712f9ed7a891ee8db9948e2162a3d9b8a8
--- /dev/null
+++ b/mpi_pi_gpu/kernels.cu
@@ -0,0 +1,47 @@
+#include "kernels.h"
+
+//--------------------------------------------------------------------------------------------------
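+// Each thread sums n_terms_per_thread consecutive terms of the series
+//   sum_{i >= 1} 0.5 / ((i - 0.75) * (i - 0.25)) = 4 * (1 - 1/3 + 1/5 - 1/7 + ...) = pi,
+// i.e. the Leibniz series with pairs of terms combined via partial fractions. The inner
+// loop is unrolled by hand in groups of 10, which is why n_terms_per_thread must be a
+// multiple of 10.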
+__global__
+void init_kernel(double *arr, int gpu_idx) {
+    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx >= n_threads_per_gpu) {
+        return;
+    }
+
+    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
+    uint64_t const end = start + n_terms_per_thread;
+
+    double sum = 0.0;
+    uint64_t i = start;
+    while (i < end) {
+        #pragma unroll
+        for (int j = 0; j < 10; ++j) {
+            sum += 0.5 / ((i - 0.75) * (i - 0.25));
+            ++i;
+        }
+    }
+    arr[thread_idx] = sum;
+}
+
+//--------------------------------------------------------------------------------------------------
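+// One pass of a pairwise reduction: folds arr[stride .. 2*stride) onto arr[0 .. stride).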
+__global__
+void reduce_sum_kernel(double *arr, uint64_t stride) {
+    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx < stride) {
+        arr[thread_idx] += arr[thread_idx + stride];
+    }
+}
+
+//..................................................................................................
+void init(double *arr, int gpu_idx) {
+    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
+}
+
+//..................................................................................................
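+// Tree-reduces the n_threads_per_gpu partial sums in global memory so that arr[0] ends up
+// holding the per-GPU total. Successive launches on the default stream execute in order,
+// so no explicit synchronization is needed between passes. Requires n_threads_per_gpu to
+// be a power of two.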
+void reduce(double *arr) {
+    uint64_t stride = n_threads_per_gpu >> 1;
+    while (stride > 0) {
+        reduce_sum_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
+        stride = stride >> 1;
+    }
+}
diff --git a/mpi_pi_gpu/kernels.h b/mpi_pi_gpu/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..984b36b17c1b413be0ed4441ae54696d1797ced0
--- /dev/null
+++ b/mpi_pi_gpu/kernels.h
@@ -0,0 +1,12 @@
+#ifndef MPI_PI_GPU_KERNELS_H
+#define MPI_PI_GPU_KERNELS_H
+
+#include "constants.h"
+
+//--------------------------------------------------------------------------------------------------
+void init(double *arr, int gpu_idx);
+
+//--------------------------------------------------------------------------------------------------
+void reduce(double *arr);
+
+#endif
diff --git a/mpi_pi_gpu/load_modules.sh b/mpi_pi_gpu/load_modules.sh
new file mode 100755
index 0000000000000000000000000000000000000000..450477e7571b88a7dc52e6a251bcc1c112e4bd35
--- /dev/null
+++ b/mpi_pi_gpu/load_modules.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+module purge
+module load spack git gcc/7.3.0 cuda/10.1 openmpi/3.1.4-pmi-cuda-ucx
diff --git a/mpi_pi_gpu/mpi_pi_gpu.cpp b/mpi_pi_gpu/mpi_pi_gpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8d7b10c9c52d0aa17c46e871cccbf9b7efeb9543
--- /dev/null
+++ b/mpi_pi_gpu/mpi_pi_gpu.cpp
@@ -0,0 +1,109 @@
+// Neil Gershenfeld 10/14/20
+// Erik Strand 3/6/2021
+// calculation of pi by a CUDA+MPI sum
+// assumes one GPU per MPI rank
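+//
+// Each rank binds to one GPU (chosen from SLURM_LOCALID before MPI_Init), sums its slice
+// of the series on the device, tree-reduces those partial sums on the GPU, copies the
+// single result back to the host, and MPI_Reduce adds the per-rank results into pi on rank 0.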
+
+#include <chrono>
+#include <cstdlib>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <mpi.h>
+#include "constants.h"
+#include "kernels.h"
+
+using namespace std;
+
+int const n_loops = 8;
+
+int main(int argc, char** argv) {
+    // Determine our index within this node.
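+    // SLURM_LOCALID is this task's index among the tasks on its node, which is what we
+    // need so that ranks sharing a node pick distinct GPUs.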
+    char* local_rank_str = nullptr;
+    int local_rank = 0;
+    local_rank_str = getenv("SLURM_LOCALID");
+    if (local_rank_str != nullptr) {
+        local_rank = atoi(local_rank_str);
+    } else {
+        std::cerr << "SLURM_LOCALID not set; defaulting to local rank 0\n";
+    }
+
+    // Determine how many GPUs this node has.
+    int n_gpus;
+    cudaGetDeviceCount(&n_gpus);
+    // local meaning relative to this node
+    int local_gpu_idx = local_rank % n_gpus;
+    cudaSetDevice(local_gpu_idx);
+
+    // Initialize MPI.
+    int n_tasks, rank;
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &n_tasks);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    // We assume one GPU per rank.
+    int gpu_idx = rank;
+
+    /*
+    // Check what node we're on.
+    int host_name_length;
+    char host_name[MPI_MAX_PROCESSOR_NAME];
+    MPI_Get_processor_name(host_name, &host_name_length);
+    std::cout << "task: " << rank << " of " << n_tasks << '\n';
+    std::cout << "node: " << host_name << '\n';
+    std::cout << "slurm local id: " << local_rank << '\n';
+    std::cout << "local gpu idx: " << local_gpu_idx << " of " << n_gpus << '\n';
+    std::cout << "global gpu idx: " << gpu_idx << '\n';
+    std::cout << '\n';
+    */
+
+    // Allocate device data.
+    double *d_arr;
+    cudaMalloc(&d_arr, n_threads_per_gpu * sizeof(double));
+
+    // Host data.
+    double result, pi;
+
+    // Rank 0 timing data.
+    std::chrono::high_resolution_clock::time_point global_start;
+    std::chrono::high_resolution_clock::time_point global_stop;
+    std::chrono::high_resolution_clock::time_point start;
+    std::chrono::high_resolution_clock::time_point stop;
+
+    if (rank == 0) {
+        global_start = std::chrono::high_resolution_clock::now();
+    }
+    for (int i = 0; i < n_loops; ++i) {
+        if (rank == 0) {
+            start = std::chrono::high_resolution_clock::now();
+        }
+
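+        // Line up all ranks, then compute this rank's partial sum entirely on its GPU
+        // before combining the per-rank results with a single MPI_Reduce.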
+        MPI_Barrier(MPI_COMM_WORLD);
+        init(d_arr, gpu_idx);
+        reduce(d_arr);
+        cudaDeviceSynchronize();
+        cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
+        MPI_Reduce(&result, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+        if (rank == 0) {
+            stop = std::chrono::high_resolution_clock::now();
+            auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
+            auto const millis = duration.count();
+            auto const n_terms_total = n_tasks * n_terms_per_gpu;
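+            // Roughly 5 flops per term: two subtractions, a multiply, a divide, and an add.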
+            auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
+
+            std::cout << "loop " << i << '\n';
+            std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu
+                << ", total terms = " << n_terms_total << '\n';
+            std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
+            std::cout << "pi ~ " << pi << '\n';
+            std::cout << '\n';
+        }
+    }
+    if (rank == 0) {
+        global_stop = std::chrono::high_resolution_clock::now();
+        auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
+        auto const millis = duration.count();
+        std::cout << "total time = " << millis * 1e-3 << "s\n";
+    }
+
+    cudaFree(d_arr);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/mpi_pi_gpu/mpi_pi_gpu.slurm b/mpi_pi_gpu/mpi_pi_gpu.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..ff487ba2d5f88cf2358cd9c43a1dbc6a123d43f6
--- /dev/null
+++ b/mpi_pi_gpu/mpi_pi_gpu.slurm
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH -J mpi_pi_gpu
+#SBATCH -o output/%j.out
+#SBATCH -e output/%j.err
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --exclusive
+#SBATCH --cpus-per-task=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --threads-per-core=1
+#SBATCH --mem=10G
+#SBATCH --time=00:05:00
+
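+# 2 nodes x 4 tasks per node = 8 MPI ranks, one per GPU (--gres=gpu:4 on each node).
+# The output/ directory must exist before submitting; Slurm does not create it for the -o/-e paths.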
+source ./load_modules.sh
+srun ./mpi_pi_gpu