What NVIDIA provides is the cuFFT library one can directly use,
#include <iostream>
#include <cufft.h>
int main() {
const int N = 8;
// Host input (complex numbers: float2)
cufftComplex h_signal[N];
for (int i = 0; i < N; ++i) {
h_signal[i].x = i; // real part
h_signal[i].y = 0.0f; // imaginary part
}
// Device memory
cufftComplex* d_signal;
cudaMalloc((void**)&d_signal, sizeof(cufftComplex) * N);
// Copy data to device
cudaMemcpy(d_signal, h_signal, sizeof(cufftComplex) * N, cudaMemcpyHostToDevice);
// Create FFT plan
cufftHandle plan;
cufftPlan1d(&plan, N, CUFFT_C2C, 1);
// Execute FFT (forward)
cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD);
// Copy result back to host
cudaMemcpy(h_signal, d_signal, sizeof(cufftComplex) * N, cudaMemcpyDeviceToHost);
// Print results
std::cout << "FFT output:\n";
for (int i = 0; i < N; ++i) {
std::cout << "(" << h_signal[i].x << ", " << h_signal[i].y << ")\n";
}
// Clean up
cufftDestroy(plan);
cudaFree(d_signal);
return 0;
}
Computers are good at processing problem in numerical method, not analytical, for Fourier Transform problem, it’s to find the frequency:

Well, how does cuFFT do the numerical computation? certainly it’s not open-sourced, but we can have simple mimicking:
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>
#include <cuComplex.h> // CUDA complex number support
// Complex multiply (CUDA provides cuComplex helpers, but let's write explicitly)
__device__ cuFloatComplex complexMul(cuFloatComplex a, cuFloatComplex b) {
return make_cuFloatComplex(
a.x * b.x - a.y * b.y,
a.x * b.y + a.y * b.x
);
}
// Kernel performing one stage of radix-2 FFT butterflies
__global__ void fftStage(cuFloatComplex* data, int n, int stage) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int halfSize = 1 << (stage - 1);
int blockSize = halfSize << 1;
if (tid < n / 2) {
int blockStart = (tid / halfSize) * blockSize;
int i = blockStart + (tid % halfSize);
int j = i + halfSize;
// Compute twiddle factor W_N^k
float angle = -2.0f * 3.14159265358979323846f * (tid % halfSize) / blockSize;
cuFloatComplex w = make_cuFloatComplex(cosf(angle), sinf(angle));
// Butterfly computation
cuFloatComplex t = complexMul(w, data[j]);
cuFloatComplex u = data[i];
data[i] = make_cuFloatComplex(u.x + t.x, u.y + t.y);
data[j] = make_cuFloatComplex(u.x - t.x, u.y - t.y);
}
}
int main() {
const int N = 8;
cuFloatComplex h_data[N];
// Initialize input: example real data with imaginary = 0
for (int i = 0; i < N; i++) {
h_data[i] = make_cuFloatComplex((float)i, 0.0f);
}
cuFloatComplex* d_data;
cudaMalloc((void**)&d_data, sizeof(cuFloatComplex) * N);
cudaMemcpy(d_data, h_data, sizeof(cuFloatComplex) * N, cudaMemcpyHostToDevice);
int threads = N / 2;
int blocks = 1;
// Run log2(N) stages
for (int stage = 1; stage <= (int)(log2f(N)); stage++) {
fftStage<<<blocks, threads>>>(d_data, N, stage);
cudaDeviceSynchronize();
}
cudaMemcpy(h_data, d_data, sizeof(cuFloatComplex) * N, cudaMemcpyDeviceToHost);
printf("FFT output:\n");
for (int i = 0; i < N; i++) {
printf("(%f, %f)\n", h_data[i].x, h_data[i].y);
}
cudaFree(d_data);
return 0;
}