Suppose I have this very simple task to accomplish, written in CUDA code:

#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
// Device kernel: reads the integers that a and b point to and stores
// their sum through c. Written for a single-thread launch (<<<1, 1>>>);
// if more threads were launched, every thread would perform the same write.
__global__ void addKernel(int* a, int* b, int* c) {
    c[0] = a[0] + b[0];
}
// Abort with a diagnostic if a CUDA runtime call failed. `what` names
// the call so the failing step is obvious in the output. Kernel launches
// themselves return no status — pair a launch with cudaGetLastError().
static void cudaCheck(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Adds two integers on the GPU: copies the operands into device memory,
// launches a single-thread kernel, copies the sum back, and prints it.
int main() {
    // Host-side operands and result.
    int h_a = 5, h_b = 7; // Example integers
    int h_c = 0;          // Result on host; initialized so a failure path never prints garbage

    // Allocate device memory for one int each.
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    cudaCheck(cudaMalloc((void**)&d_a, sizeof(int)), "cudaMalloc d_a");
    cudaCheck(cudaMalloc((void**)&d_b, sizeof(int)), "cudaMalloc d_b");
    cudaCheck(cudaMalloc((void**)&d_c, sizeof(int)), "cudaMalloc d_c");

    // Copy the operands to the device's memory.
    cudaCheck(cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy h_a -> d_a");
    cudaCheck(cudaMemcpy(d_b, &h_b, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy h_b -> d_b");

    // Launch the kernel with one block of one thread.
    addKernel<<<1, 1>>>(d_a, d_b, d_c);
    // A launch returns no error code; query the runtime explicitly to
    // catch bad launch configurations.
    cudaCheck(cudaGetLastError(), "addKernel launch");

    // This blocking copy on the default stream also synchronizes with the
    // kernel, so h_c is valid afterwards; any asynchronous execution error
    // from the kernel surfaces here.
    cudaCheck(cudaMemcpy(&h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy d_c -> h_c");

    // Print out the result the GPU computed.
    std::cout << "Result: " << h_c << std::endl;

    // Free the device memory we allocated.
    cudaCheck(cudaFree(d_a), "cudaFree d_a");
    cudaCheck(cudaFree(d_b), "cudaFree d_b");
    cudaCheck(cudaFree(d_c), "cudaFree d_c");
    return 0;
}
nvcc -o addIntegers addIntegers.cu
./addIntegers
Note that in the kernel function, marked by the __global__ qualifier, the inputs a and b and the output c are all pointers — they hold the device addresses of the three integers. cudaMalloc is then used to allocate device memory for each, sized by sizeof(int). Allocating device memory alone is not enough: the host values must also be copied over to the device:
cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice);
Call the kernel to add them together
addKernel<<<1, 1>>>(d_a, d_b, d_c);
Copy the result back to host memory
cudaMemcpy(&h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost);
Lastly, free the device's memory we allocated:
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);