cuFFT Library Guide

November 3, 2025 / Naixian Zhang

Fast Fourier Transform (FFT) is a useful tool used in many areas, and we understand how demanding its computation can be. GPUs help manage this heavy processing, making a big difference for those working on these tasks daily.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include <cuda_runtime.h>

// Error checking macro for CUDA calls
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Error checking macro for cuFFT calls
#define CUFFT_CHECK(call) \
    do { \
        cufftResult err = call; \
        if (err != CUFFT_SUCCESS) { \
            fprintf(stderr, "cuFFT error at %s:%d: %d\n", __FILE__, __LINE__, err); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

int main() {
    const int N = 1024;  // Signal size
    const float frequency = 10.0f;  // Hz
    
    printf("=== cuFFT 1D Real-to-Complex FFT Example ===\n");
    printf("Signal size: %d\n", N);
    printf("Input frequency: %.1f Hz\n\n", frequency);
    
    // 1. Create cuFFT plan
    cufftHandle plan;
    CUFFT_CHECK(cufftPlan1d(&plan, N, CUFFT_R2C, 1));
    
    // 2. Allocate host memory
    float *h_input = (float*)malloc(N * sizeof(float));
    cufftComplex *h_output = (cufftComplex*)malloc((N/2 + 1) * sizeof(cufftComplex));
    
    if (!h_input || !h_output) {
        fprintf(stderr, "Failed to allocate host memory\n");
        exit(EXIT_FAILURE);
    }
    
    // 3. Allocate device memory
    float *d_input;
    cufftComplex *d_output;
    CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_output, (N/2 + 1) * sizeof(cufftComplex)));
    
    // 4. Generate input signal (sine wave)
    printf("Generating sine wave...\n");
    for (int i = 0; i < N; i++) {
        h_input[i] = sin(2.0 * M_PI * frequency * i / N);
    }
    
    // 5. Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(float), 
                          cudaMemcpyHostToDevice));
    
    // 6. Execute FFT on GPU
    printf("Executing FFT on GPU...\n");
    CUFFT_CHECK(cufftExecR2C(plan, d_input, d_output));
    
    // 7. Copy results from device to host
    CUDA_CHECK(cudaMemcpy(h_output, d_output, (N/2 + 1) * sizeof(cufftComplex),
                          cudaMemcpyDeviceToHost));
    
    // 8. Display results
    printf("\nFFT Results:\n");
    printf("Frequency bins with highest magnitude:\n");
    printf("%-10s %-15s %-15s %-15s\n", "Bin", "Frequency (Hz)", "Magnitude", "Phase");
    printf("-----------------------------------------------------------\n");
    
    // Find and display top 5 frequency components
    for (int i = 0; i < 20; i++) {
        float magnitude = sqrt(h_output[i].x * h_output[i].x + 
                              h_output[i].y * h_output[i].y);
        float phase = atan2(h_output[i].y, h_output[i].x);
        float freq_hz = (float)i * N / N;  // Frequency in Hz
        
        if (magnitude > 1.0) {  // Only show significant components
            printf("%-10d %-15.2f %-15.2f %-15.2f\n", 
                   i, freq_hz, magnitude, phase);
        }
    }
    
    // Verify the peak is at the expected frequency bin
    int expected_bin = (int)frequency;
    float peak_magnitude = sqrt(h_output[expected_bin].x * h_output[expected_bin].x + 
                                h_output[expected_bin].y * h_output[expected_bin].y);
    printf("\nPeak at bin %d (%.1f Hz): magnitude = %.2f\n", 
           expected_bin, frequency, peak_magnitude);
    printf("Expected bin: %d ✓\n", expected_bin);
    
    // 9. Cleanup
    printf("\nCleaning up...\n");
    CUFFT_CHECK(cufftDestroy(plan));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);
    
    printf("Done!\n");
    
    return 0;
}

If in python, looks easier:

#!/usr/bin/env python3
"""
cuFFT Example - Python version using CuPy
Same functionality as the CUDA C++ version, but much simpler!
"""

import numpy as np
import cupy as cp
import matplotlib.pyplot as plt

def main():
    print("=== cuFFT 1D Real-to-Complex FFT Example (Python/CuPy) ===")
    
    # Parameters
    N = 1024
    frequency = 10.0  # Hz
    
    print(f"Signal size: {N}")
    print(f"Input frequency: {frequency} Hz\n")
    
    # Generate sine wave on CPU
    print("Generating sine wave...")
    t = np.arange(N)
    h_input = np.sin(2.0 * np.pi * frequency * t / N).astype(np.float32)
    
    # Transfer to GPU (this happens automatically with CuPy!)
    print("Transferring to GPU...")
    d_input = cp.array(h_input)
    
    # Execute FFT on GPU (cuFFT is used automatically!)
    print("Executing FFT on GPU...")
    d_output = cp.fft.rfft(d_input)
    
    # Transfer back to CPU
    h_output = cp.asnumpy(d_output)
    
    # Calculate magnitude and phase
    magnitude = np.abs(h_output)
    phase = np.angle(h_output)
    
    # Display results
    print("\nFFT Results:")
    print("Frequency bins with highest magnitude:")
    print(f"{'Bin':<10} {'Frequency (Hz)':<15} {'Magnitude':<15} {'Phase':<15}")
    print("-" * 60)
    
    # Show top frequency components
    for i in range(20):
        freq_hz = i * N / N
        if magnitude[i] > 1.0:
            print(f"{i:<10} {freq_hz:<15.2f} {magnitude[i]:<15.2f} {phase[i]:<15.2f}")
    
    # Verify peak
    expected_bin = int(frequency)
    peak_magnitude = magnitude[expected_bin]
    print(f"\nPeak at bin {expected_bin} ({frequency} Hz): magnitude = {peak_magnitude:.2f}")
    print(f"Expected bin: {expected_bin} ✓")
    
    # Optional: Plot results
    plot_results(h_input, magnitude, frequency, N)
    
    print("\nDone!")


def plot_results(signal, magnitude, frequency, N):
    """Plot the input signal and FFT magnitude spectrum"""
    try:
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
        
        # Plot time domain signal
        ax1.plot(signal[:200], linewidth=2)  # Show first 200 samples
        ax1.set_title(f'Input Signal: {frequency} Hz Sine Wave', fontsize=14, fontweight='bold')
        ax1.set_xlabel('Sample')
        ax1.set_ylabel('Amplitude')
        ax1.grid(True, alpha=0.3)
        
        # Plot frequency domain
        freq_bins = np.arange(len(magnitude))
        ax2.stem(freq_bins[:50], magnitude[:50], basefmt=' ')  # Show first 50 bins
        ax2.set_title('FFT Magnitude Spectrum', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Frequency Bin')
        ax2.set_ylabel('Magnitude')
        ax2.grid(True, alpha=0.3)
        ax2.axvline(x=frequency, color='r', linestyle='--', label=f'Expected: {frequency} Hz')
        ax2.legend()
        
        plt.tight_layout()
        plt.savefig('fft_result.png', dpi=150, bbox_inches='tight')
        print("\n📊 Plot saved as 'fft_result.png'")
        
    except Exception as e:
        print(f"\n⚠️  Could not create plot: {e}")
        print("(matplotlib not available or display not configured)")


def compare_cpu_gpu_performance():
    """Compare CPU vs GPU FFT performance"""
    print("\n" + "="*60)
    print("Performance Comparison: CPU vs GPU")
    print("="*60)
    
    sizes = [2**10, 2**15, 2**20]  # 1K, 32K, 1M
    
    for N in sizes:
        # Generate test signal
        signal_cpu = np.random.random(N).astype(np.float32)
        signal_gpu = cp.array(signal_cpu)
        
        # CPU FFT (NumPy)
        import time
        start = time.time()
        _ = np.fft.rfft(signal_cpu)
        cpu_time = time.time() - start
        
        # GPU FFT (CuPy with cuFFT)
        start = time.time()
        _ = cp.fft.rfft(signal_gpu)
        cp.cuda.Stream.null.synchronize()  # Wait for GPU
        gpu_time = time.time() - start
        
        speedup = cpu_time / gpu_time
        print(f"N = {N:>7} | CPU: {cpu_time*1000:>7.2f}ms | GPU: {gpu_time*1000:>7.2f}ms | Speedup: {speedup:>5.1f}x")


if __name__ == "__main__":
    main()
    
    # Uncomment to see CPU vs GPU performance comparison
    # compare_cpu_gpu_performance()

Leave a comment Cancel reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.