CUDA Programming

Suppose I have this very simple task to accomplish, written in CUDA code — adding two integers on the GPU:

#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

// Single-thread device kernel: reads one int through each input pointer and
// writes their sum through the output pointer. Intended to be launched as
// addKernel<<<1, 1>>>(...) with all three pointers referring to device memory.
__global__ void addKernel(int* a, int* b, int* c) {
    c[0] = a[0] + b[0];
}

int main() {
    // Abort with a diagnostic on any CUDA runtime failure. Note that kernel
    // launches return no status directly: launch-configuration errors are
    // fetched via cudaGetLastError(), and in-kernel faults only surface at
    // the next synchronizing call.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Host-side operands and result
    int h_a = 5, h_b = 7; // Example integers
    int h_c = 0;          // Result on host (initialized in case of early exit paths)

    // Allocate device memory for one int each
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    check(cudaMalloc((void**)&d_a, sizeof(int)), "cudaMalloc d_a");
    check(cudaMalloc((void**)&d_b, sizeof(int)), "cudaMalloc d_b");
    check(cudaMalloc((void**)&d_c, sizeof(int)), "cudaMalloc d_c");

    // Copy the operands host -> device
    check(cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy h_a");
    check(cudaMemcpy(d_b, &h_b, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy h_b");

    // A single addition needs no parallelism: launch one block of one thread.
    addKernel<<<1, 1>>>(d_a, d_b, d_c);
    check(cudaGetLastError(), "addKernel launch");        // catches bad launch config
    check(cudaDeviceSynchronize(), "addKernel execution"); // catches in-kernel faults

    // Copy the result back to host memory (cudaMemcpy blocks, so h_c is
    // valid immediately after this call returns)
    check(cudaMemcpy(&h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy h_c");

    // Print out the result the GPU computed
    std::cout << "Result: " << h_c << std::endl;

    // Free the device's memory we allocated
    check(cudaFree(d_a), "cudaFree d_a");
    check(cudaFree(d_b), "cudaFree d_b");
    check(cudaFree(d_c), "cudaFree d_c");

    return 0;
}
nvcc -o addIntegers addIntegers.cu
./addIntegers

Note that in the kernel function, marked by the __global__ qualifier, the inputs a and b and the output c are all pointers — they hold the device addresses of the three integers. We first use cudaMalloc to allocate device memory for each, sized by sizeof(int). Allocating memory on the device is not enough, though: we also need to copy the host values into that device memory.
cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice);

Call the kernel to add them together
addKernel<<<1, 1>>>(d_a, d_b, d_c);

Copy the result back to host memory
cudaMemcpy(&h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost);

Lastly, Free the device’s memory we allocated
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

Leave a comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.