Commit 747c016d authored by Erik Strand's avatar Erik Strand

Add a basic saxpy example

parent dc832c05
......@@ -4,3 +4,4 @@
# binaries
get_gpu_info
saxpy
get_gpu_info: get_gpu_info.cu
nvcc get_gpu_info.cu -o get_gpu_info
nvcc get_gpu_info.cu -o get_gpu_info
saxpy: saxpy.cu
nvcc saxpy.cu -o saxpy
// This code performs a single precision a*X plus Y operation on the GPU.
// Adapted from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// All CUDA kernels return void. To get data back to the CPU you have to copy it explicitly.
// SAXPY kernel: computes y[i] = a * x[i] + y[i] for every i in [0, n).
// Expects a 1D launch with gridDim.x * blockDim.x >= n; surplus threads
// exit through the bounds check. x and y must not alias (they are
// distinct device buffers in this program), which lets us mark the
// input read-only and __restrict__ so the compiler can use the
// read-only data cache.
__global__
void saxpy(int n, float a, const float *__restrict__ x, float *__restrict__ y) {
// Flat global thread index across the 1D grid.
int const i = blockIdx.x * blockDim.x + threadIdx.x;
// Guard: the grid is rounded up, so the last block may overshoot n.
if (i < n) {
y[i] = a * x[i] + y[i];
}
}
// Abort with a readable file:line message if a CUDA runtime call fails.
// Kernel launches do not return errors directly, so launch-config errors
// are caught by passing cudaGetLastError() through this macro.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Fills two vectors on the host, runs SAXPY on the GPU, and verifies
// that every element of the result equals 2 * 1 + 2 = 4.
int main() {
// We'll put 2^20 numbers in each vector.
int const N = 1 << 20;
size_t const bytes = (size_t)N * sizeof(float);

// Allocate host (CPU) memory and check for failure.
float *h_x = (float*)malloc(bytes);
float *h_y = (float*)malloc(bytes);
if (h_x == NULL || h_y == NULL) {
fprintf(stderr, "Host allocation failed\n");
return EXIT_FAILURE;
}

// Allocate device (GPU) memory, checking every CUDA API call.
float *d_x, *d_y;
CUDA_CHECK(cudaMalloc(&d_x, bytes));
CUDA_CHECK(cudaMalloc(&d_y, bytes));

// Initialize data.
for (int i = 0; i < N; ++i) {
h_x[i] = 1.0f;
h_y[i] = 2.0f;
}

// Copy data to the GPU.
CUDA_CHECK(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice));

// Perform SAXPY on the data. Round the grid up (ceil-div) so every
// element is covered; the kernel bounds-checks the tail.
int const threadsPerBlock = 256;
int const blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
saxpy<<<blocks, threadsPerBlock>>>(N, 2.0f, d_x, d_y);
// Launch-configuration errors only surface via cudaGetLastError().
CUDA_CHECK(cudaGetLastError());

// Copy the result back to the host. cudaMemcpy on the default stream
// blocks until the kernel has finished, so no extra sync is needed.
CUDA_CHECK(cudaMemcpy(h_y, d_y, bytes, cudaMemcpyDeviceToHost));

// Verify: use the float overloads fmaxf/fabsf explicitly — a bare
// abs() here could resolve to the integer overload and hide errors.
float maxError = 0.0f;
for (int i = 0; i < N; ++i) {
maxError = fmaxf(maxError, fabsf(h_y[i] - 4.0f));
}
printf("Max error: %f\n", maxError);

// Release device and host memory.
CUDA_CHECK(cudaFree(d_x));
CUDA_CHECK(cudaFree(d_y));
free(h_x);
free(h_y);
// Report failure through the exit code if the result was wrong.
return maxError == 0.0f ? EXIT_SUCCESS : EXIT_FAILURE;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment