Skip to content
Snippets Groups Projects
Commit 995d1a3a authored by Erik Strand
Browse files

Separate kernels

I think they need to live in their own file, since the main application
has to be compiled with mpicc, not nvcc.
parent e1334a58
Branches
No related tags found
No related merge requests found
mpi_pi_gpu: mpi_pi_gpu.c
# Build the host program from its C++ source with the MPI C++ compiler wrapper,
# linking against the CUDA runtime library (-lcudart).
mpi_pi_gpu: mpi_pi_gpu.cpp
mpic++ $< -lcudart -o $@
#include "kernels.h"
//--------------------------------------------------------------------------------------------------
// Kernel: each thread accumulates its own run of n_terms_per_thread consecutive
// terms of the series  sum_{i>=1} 0.5 / ((i - 0.75) * (i - 0.25))  (which
// converges to pi) and stores the partial sum in arr[thread_idx].
//
// Parameters:
//   arr     - output array, one slot per launched thread. There is no bounds
//             check, so it must hold at least gridDim.x * blockDim.x doubles.
//   gpu_idx - index of this GPU; offsets the term range by n_terms_per_gpu terms
//             per GPU so different GPUs cover different parts of the series.
//
// Expects a 1D launch (only the .x components of the index variables are used).
__global__
void init(double *arr, int gpu_idx) {
    // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit arithmetic
    // and would wrap before being stored in the 64-bit index.
    uint64_t const thread_idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Series terms are numbered from 1. This thread owns the half-open range
    // [start, end) of exactly n_terms_per_thread terms.
    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
    // The range ends one thread-chunk after it starts. The previous expression
    // added a whole GPU's worth of terms here, so neighbouring threads (and the
    // next GPU) re-summed overlapping terms.
    uint64_t const end = start + n_terms_per_thread;

    // Accumulate locally so arr is written only once.
    double sum = 0.0;
    for (uint64_t i = start; i < end; ++i) {
        sum += 0.5 / ((i - 0.75) * (i - 0.25));
    }
    arr[thread_idx] = sum;
}
//--------------------------------------------------------------------------------------------------
// Kernel: one step of a pairwise in-place sum. Every thread whose flat index is
// below `stride` adds the element `stride` slots above it onto its own element:
//     arr[i] += arr[i + stride]   for i in [0, stride)
//
// Parameters:
//   arr    - array being reduced; elements [0, 2 * stride) are accessed.
//   stride - distance between the two elements combined by each thread; also
//            the number of threads that do any work in this step.
//
// Within a single launch the elements read ([stride, 2 * stride)) and the
// elements written ([0, stride)) do not overlap.
// Expects a 1D launch (only the .x components of the index variables are used).
__global__
void reduce_sum(double *arr, uint64_t stride) {
    // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit arithmetic
    // and would wrap before being stored in the 64-bit index.
    uint64_t const thread_idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (thread_idx < stride) {
        arr[thread_idx] += arr[thread_idx + stride];
    }
}
//..................................................................................................
void init(double *arr, int gpu_idx);
//..................................................................................................
// Host-side driver for the in-place sum of arr.
//
// Launches reduce_sum once per halving step, starting with a stride of half of
// n_threads_per_gpu and stopping when the stride reaches zero. Each step folds
// elements [stride, 2 * stride) onto elements [0, stride), so the running total
// accumulates toward arr[0].
//
// arr is handed straight to the kernel, so it must point to memory the device
// can access. All launches use the same grid/block configuration and are issued
// without an explicit stream; this function neither synchronizes nor checks for
// launch errors.
//
// NOTE(review): halving only folds in every element when n_threads_per_gpu is a
// power of two; for other sizes some elements are never added. Confirm the
// constant's value.
void reduce(double *arr) {
    for (uint64_t stride = n_threads_per_gpu / 2; stride != 0; stride /= 2) {
        reduce_sum<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
    }
}
#ifndef MPI_PI_GPU_KERNELS_H
#define MPI_PI_GPU_KERNELS_H
// Declarations of the GPU routines used by the MPI pi program.
//--------------------------------------------------------------------------------------------------
// Fills arr with one partial series sum per thread for the GPU numbered gpu_idx.
// NOTE(review): the definition shown in this commit is qualified __global__,
// while this declaration is not — confirm the two are meant to match and that a
// caller outside a CUDA translation unit can actually invoke it.
void init(double *arr, int gpu_idx);
//--------------------------------------------------------------------------------------------------
// Sums arr in place by repeatedly folding its upper half onto its lower half;
// the total accumulates toward arr[0].
void reduce(double *arr);
#endif
......@@ -8,6 +8,7 @@
#include <cuda_runtime.h>
#include <iostream>
#include <mpi.h>
//#include "kernels.h"
using namespace std;
......@@ -17,36 +18,6 @@ uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
// Number of threads in each block of a kernel launch.
uint64_t const n_threads_per_block = 512;
// Number of blocks per launch: n_threads_per_gpu / n_threads_per_block, rounded
// up, so the grid may contain more threads than n_threads_per_gpu when the
// division is not exact.
uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
// NOTE(review): nloop is not used in the visible part of this file — confirm
// what it controls.
int nloop = 10;
// Kernel: each thread accumulates its own run of n_terms_per_thread consecutive
// terms of the series  sum_{i>=1} 0.5 / ((i - 0.75) * (i - 0.25))  (which
// converges to pi) and stores the partial sum in arr[thread_idx].
//
// Parameters:
//   arr     - output array, one slot per launched thread. There is no bounds
//             check, so it must hold at least gridDim.x * blockDim.x doubles.
//   gpu_idx - index of this GPU; offsets the term range by n_terms_per_gpu terms
//             per GPU so different GPUs cover different parts of the series.
//
// Expects a 1D launch (only the .x components of the index variables are used).
__global__
void init(double *arr, int gpu_idx) {
    // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit arithmetic
    // and would wrap before being stored in the 64-bit index.
    uint64_t const thread_idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Series terms are numbered from 1. This thread owns the half-open range
    // [start, end) of exactly n_terms_per_thread terms.
    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
    // The range ends one thread-chunk after it starts. The previous expression
    // added a whole GPU's worth of terms here, so neighbouring threads (and the
    // next GPU) re-summed overlapping terms.
    uint64_t const end = start + n_terms_per_thread;

    // Accumulate locally so arr is written only once.
    double sum = 0.0;
    for (uint64_t i = start; i < end; ++i) {
        sum += 0.5 / ((i - 0.75) * (i - 0.25));
    }
    arr[thread_idx] = sum;
}
// Kernel: one step of a pairwise in-place sum. Every thread whose flat index is
// below `stride` adds the element `stride` slots above it onto its own element:
//     arr[i] += arr[i + stride]   for i in [0, stride)
//
// Parameters:
//   arr    - array being reduced; elements [0, 2 * stride) are accessed.
//   stride - distance between the two elements combined by each thread; also
//            the number of threads that do any work in this step.
//
// Within a single launch the elements read ([stride, 2 * stride)) and the
// elements written ([0, stride)) do not overlap.
// Expects a 1D launch (only the .x components of the index variables are used).
__global__
void reduce_sum(double *arr, uint64_t stride) {
    // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit arithmetic
    // and would wrap before being stored in the 64-bit index.
    uint64_t const thread_idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (thread_idx < stride) {
        arr[thread_idx] += arr[thread_idx + stride];
    }
}
// Host-side driver for the in-place sum of arr.
//
// Launches reduce_sum once per halving step, starting with a stride of half of
// n_threads_per_gpu and stopping when the stride reaches zero. Each step folds
// elements [stride, 2 * stride) onto elements [0, stride), so the running total
// accumulates toward arr[0].
//
// arr is handed straight to the kernel, so it must point to memory the device
// can access. All launches use the same grid/block configuration and are issued
// without an explicit stream; this function neither synchronizes nor checks for
// launch errors.
//
// NOTE(review): halving only folds in every element when n_threads_per_gpu is a
// power of two; for other sizes some elements are never added. Confirm the
// constant's value.
void reduce(double *arr) {
    for (uint64_t stride = n_threads_per_gpu / 2; stride != 0; stride /= 2) {
        reduce_sum<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
    }
}
int main(int argc, char** argv) {
char* local_rank_str = NULL;
int local_rank = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment