Commit a05e3b3e authored by Erik Strand

Compare GPU SAXPY to CPU

parent 747c016d
.PHONY: all
all: get_gpu_info saxpy
get_gpu_info: get_gpu_info.cu
	nvcc get_gpu_info.cu -o get_gpu_info
saxpy: saxpy.cu
	nvcc saxpy.cu -o saxpy
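# After building, running ./saxpy prints the CPU time, the GPU time, and the max error between the two results.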
// This code performs a single precision a*X plus Y operation on the GPU and compares it to the CPU.
// Adapted from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
#include <stdio.h>
#include <iostream>
#include <chrono>
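
// Plain CPU implementation, used both as a timing baseline and to verify the GPU result.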
void saxpy_cpu(int n, float a, float *x, float *y) {
    for (int i = 0; i < n; ++i) {
        y[i] = a * x[i] + y[i];
    }
}
// To turn a function into a GPU kernel, mark it __global__.
// All CUDA kernels return void. To get data back to the CPU you have to copy it explicitly.
__global__
void saxpy_gpu(int n, float a, float *x, float *y) {
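    // Each thread handles one element: combine the block index, block size, and thread index
    // to get a unique global index into the arrays.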
    int const i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = a * x[i] + y[i];
    }
}
int main() {
    // We'll put 2^20 numbers in each vector.
    int N = 1048576;
    std::cout << "Performing SAXPY on vectors of dim " << N << '\n';

    // Allocate host (CPU) memory.
    float *h_x, *h_y, *h_z;
    h_x = (float*)malloc(N * sizeof(float));
    h_y = (float*)malloc(N * sizeof(float));
    h_z = (float*)malloc(N * sizeof(float));

    // Allocate device (GPU) memory.
    float *d_x, *d_y;
    cudaMalloc(&d_x, N * sizeof(float));
    cudaMalloc(&d_y, N * sizeof(float));

    // Initialize the host arrays.
    for (int i = 0; i < N; ++i) {
        h_x[i] = 1.0f;
        h_y[i] = 2.0f;
        h_z[i] = 2.0f;
    }

    // Perform SAXPY on the CPU.
    auto start = std::chrono::high_resolution_clock::now();
    saxpy_cpu(N, 2.0f, h_x, h_z);
    auto stop = std::chrono::high_resolution_clock::now();
    float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    std::cout << "CPU time: " << cpu_time << " microseconds\n";

    // Copy data to the GPU.
    cudaMemcpy(d_x, h_x, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice);
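    // Note: these copies happen before the GPU timer starts, so the time measured below covers
    // only the kernel, not the host <-> device transfers.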

    // Perform SAXPY on the GPU.
    int threads_per_block = 256;
    int blocks = (N + threads_per_block - 1) / threads_per_block;
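    // Rounding the division up ensures there are enough blocks to cover all N elements even when
    // N isn't a multiple of threads_per_block; the bounds check in the kernel handles the extra threads.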
    start = std::chrono::high_resolution_clock::now();
    // Kernel launches are specified with a number of blocks and a number of threads per block,
    // enclosed in triple angle brackets.
    saxpy_gpu<<<blocks, threads_per_block>>>(N, 2.0f, d_x, d_y);
    // This function waits until the GPU has finished all assigned work. If we didn't call it, the
    // CPU would just keep going. This is really convenient when there's something else the CPU
    // could be doing, but here we just want to know how long the GPU takes.
    cudaDeviceSynchronize();
    stop = std::chrono::high_resolution_clock::now();
    float gpu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    std::cout << "GPU time: " << gpu_time << " microseconds\n";

    // Copy the result back to the host and print the error (if any).
    cudaMemcpy(h_y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
    float max_error = 0.0f;
    for (int i = 0; i < N; i++) {
        max_error = max(max_error, abs(h_y[i] - h_z[i]));
    }
    std::cout << "Max error: " << max_error << '\n';

    // Free memory.
    cudaFree(d_x);
    cudaFree(d_y);
    free(h_x);
    free(h_y);
    free(h_z);

    return 0;
}