Fast Fourier Transform (FFT) is a useful tool used in many areas, and we understand how demanding its computation can be. GPUs help manage this heavy processing, making a big difference for those working on these tasks daily.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include <cuda_runtime.h>
// Error checking macro for CUDA calls
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while(0)
// Error checking macro for cuFFT calls
#define CUFFT_CHECK(call) \
do { \
cufftResult err = call; \
if (err != CUFFT_SUCCESS) { \
fprintf(stderr, "cuFFT error at %s:%d: %d\n", __FILE__, __LINE__, err); \
exit(EXIT_FAILURE); \
} \
} while(0)
int main() {
const int N = 1024; // Signal size
const float frequency = 10.0f; // Hz
printf("=== cuFFT 1D Real-to-Complex FFT Example ===\n");
printf("Signal size: %d\n", N);
printf("Input frequency: %.1f Hz\n\n", frequency);
// 1. Create cuFFT plan
cufftHandle plan;
CUFFT_CHECK(cufftPlan1d(&plan, N, CUFFT_R2C, 1));
// 2. Allocate host memory
float *h_input = (float*)malloc(N * sizeof(float));
cufftComplex *h_output = (cufftComplex*)malloc((N/2 + 1) * sizeof(cufftComplex));
if (!h_input || !h_output) {
fprintf(stderr, "Failed to allocate host memory\n");
exit(EXIT_FAILURE);
}
// 3. Allocate device memory
float *d_input;
cufftComplex *d_output;
CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_output, (N/2 + 1) * sizeof(cufftComplex)));
// 4. Generate input signal (sine wave)
printf("Generating sine wave...\n");
for (int i = 0; i < N; i++) {
h_input[i] = sin(2.0 * M_PI * frequency * i / N);
}
// 5. Copy data from host to device
CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(float),
cudaMemcpyHostToDevice));
// 6. Execute FFT on GPU
printf("Executing FFT on GPU...\n");
CUFFT_CHECK(cufftExecR2C(plan, d_input, d_output));
// 7. Copy results from device to host
CUDA_CHECK(cudaMemcpy(h_output, d_output, (N/2 + 1) * sizeof(cufftComplex),
cudaMemcpyDeviceToHost));
// 8. Display results
printf("\nFFT Results:\n");
printf("Frequency bins with highest magnitude:\n");
printf("%-10s %-15s %-15s %-15s\n", "Bin", "Frequency (Hz)", "Magnitude", "Phase");
printf("-----------------------------------------------------------\n");
// Find and display top 5 frequency components
for (int i = 0; i < 20; i++) {
float magnitude = sqrt(h_output[i].x * h_output[i].x +
h_output[i].y * h_output[i].y);
float phase = atan2(h_output[i].y, h_output[i].x);
float freq_hz = (float)i * N / N; // Frequency in Hz
if (magnitude > 1.0) { // Only show significant components
printf("%-10d %-15.2f %-15.2f %-15.2f\n",
i, freq_hz, magnitude, phase);
}
}
// Verify the peak is at the expected frequency bin
int expected_bin = (int)frequency;
float peak_magnitude = sqrt(h_output[expected_bin].x * h_output[expected_bin].x +
h_output[expected_bin].y * h_output[expected_bin].y);
printf("\nPeak at bin %d (%.1f Hz): magnitude = %.2f\n",
expected_bin, frequency, peak_magnitude);
printf("Expected bin: %d ā\n", expected_bin);
// 9. Cleanup
printf("\nCleaning up...\n");
CUFFT_CHECK(cufftDestroy(plan));
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_output));
free(h_input);
free(h_output);
printf("Done!\n");
return 0;
}
If in python, looks easier:
#!/usr/bin/env python3
"""
cuFFT Example - Python version using CuPy
Same functionality as the CUDA C++ version, but much simpler!
"""
import numpy as np
import cupy as cp
import matplotlib.pyplot as plt
def main():
print("=== cuFFT 1D Real-to-Complex FFT Example (Python/CuPy) ===")
# Parameters
N = 1024
frequency = 10.0 # Hz
print(f"Signal size: {N}")
print(f"Input frequency: {frequency} Hz\n")
# Generate sine wave on CPU
print("Generating sine wave...")
t = np.arange(N)
h_input = np.sin(2.0 * np.pi * frequency * t / N).astype(np.float32)
# Transfer to GPU (this happens automatically with CuPy!)
print("Transferring to GPU...")
d_input = cp.array(h_input)
# Execute FFT on GPU (cuFFT is used automatically!)
print("Executing FFT on GPU...")
d_output = cp.fft.rfft(d_input)
# Transfer back to CPU
h_output = cp.asnumpy(d_output)
# Calculate magnitude and phase
magnitude = np.abs(h_output)
phase = np.angle(h_output)
# Display results
print("\nFFT Results:")
print("Frequency bins with highest magnitude:")
print(f"{'Bin':<10} {'Frequency (Hz)':<15} {'Magnitude':<15} {'Phase':<15}")
print("-" * 60)
# Show top frequency components
for i in range(20):
freq_hz = i * N / N
if magnitude[i] > 1.0:
print(f"{i:<10} {freq_hz:<15.2f} {magnitude[i]:<15.2f} {phase[i]:<15.2f}")
# Verify peak
expected_bin = int(frequency)
peak_magnitude = magnitude[expected_bin]
print(f"\nPeak at bin {expected_bin} ({frequency} Hz): magnitude = {peak_magnitude:.2f}")
print(f"Expected bin: {expected_bin} ā")
# Optional: Plot results
plot_results(h_input, magnitude, frequency, N)
print("\nDone!")
def plot_results(signal, magnitude, frequency, N):
"""Plot the input signal and FFT magnitude spectrum"""
try:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
# Plot time domain signal
ax1.plot(signal[:200], linewidth=2) # Show first 200 samples
ax1.set_title(f'Input Signal: {frequency} Hz Sine Wave', fontsize=14, fontweight='bold')
ax1.set_xlabel('Sample')
ax1.set_ylabel('Amplitude')
ax1.grid(True, alpha=0.3)
# Plot frequency domain
freq_bins = np.arange(len(magnitude))
ax2.stem(freq_bins[:50], magnitude[:50], basefmt=' ') # Show first 50 bins
ax2.set_title('FFT Magnitude Spectrum', fontsize=14, fontweight='bold')
ax2.set_xlabel('Frequency Bin')
ax2.set_ylabel('Magnitude')
ax2.grid(True, alpha=0.3)
ax2.axvline(x=frequency, color='r', linestyle='--', label=f'Expected: {frequency} Hz')
ax2.legend()
plt.tight_layout()
plt.savefig('fft_result.png', dpi=150, bbox_inches='tight')
print("\nš Plot saved as 'fft_result.png'")
except Exception as e:
print(f"\nā ļø Could not create plot: {e}")
print("(matplotlib not available or display not configured)")
def compare_cpu_gpu_performance():
"""Compare CPU vs GPU FFT performance"""
print("\n" + "="*60)
print("Performance Comparison: CPU vs GPU")
print("="*60)
sizes = [2**10, 2**15, 2**20] # 1K, 32K, 1M
for N in sizes:
# Generate test signal
signal_cpu = np.random.random(N).astype(np.float32)
signal_gpu = cp.array(signal_cpu)
# CPU FFT (NumPy)
import time
start = time.time()
_ = np.fft.rfft(signal_cpu)
cpu_time = time.time() - start
# GPU FFT (CuPy with cuFFT)
start = time.time()
_ = cp.fft.rfft(signal_gpu)
cp.cuda.Stream.null.synchronize() # Wait for GPU
gpu_time = time.time() - start
speedup = cpu_time / gpu_time
print(f"N = {N:>7} | CPU: {cpu_time*1000:>7.2f}ms | GPU: {gpu_time*1000:>7.2f}ms | Speedup: {speedup:>5.1f}x")
if __name__ == "__main__":
main()
# Uncomment to see CPU vs GPU performance comparison
# compare_cpu_gpu_performance()