CUDA exception capture

Technical background

In CUDA programming, you may encounter some relatively hidden errors, but it is not revealed if you directly compile and run the cu file. Then you can monitor possible errors during the operation of the CUDA program by adding a macro for inspection.

We implement this macro in the CUDA header file:

#pragma once
#include <>

#define CHECK(call) do{const cudaError_t error_code = call; if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf("    File:   %s\n", __FILE__); printf("    Line:   %d\n", __LINE__); printf("    Error code: %d\n", error_code); printf("    Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0)

Then when calling CUDA related functions or kernel functions, you can use CHECK operations to monitor whether there are any related exceptions.

Call test

First, use a simple test case, which is the scenario of video memory allocation. If it is a normal video memory allocation:

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && ./test_error
#include ""
#include <>

int main(void){
    const int N = 100000000;
    const int M = sizeof(double) * N;
    double *d_x;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaFree(d_x));
    printf("Success!\n");
}

There is no error in the run result:

Success!

But if we increase the value of N so that it exceeds the memory size:

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && ./test_error
#include ""
#include <>

int main(void){
    const int N = 1000000000;
    const int M = sizeof(double) * N;
    double *d_x;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaFree(d_x));
    printf("Success!\n");
}

If you run it again, you will report an OOM error:

./test_error.cu(7): warning #69-D: integer conversion resulted in truncation
      const int M = sizeof(double) * N;
                    ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

./test_error.cu(9): warning #68-D: integer conversion resulted in a change of sign
      do{const cudaError_t error_code = cudaMalloc((void **)&d_x, M); if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf("    File:   %s\n", "./test_error.cu"); printf("    Line:   %d\n", 9); printf("    Error code: %d\n", error_code); printf("    Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0);
                                                                  ^

./test_error.cu(7): warning #69-D: integer conversion resulted in truncation
      const int M = sizeof(double) * N;
                    ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

./test_error.cu(9): warning #68-D: integer conversion resulted in a change of sign
      do{const cudaError_t error_code = cudaMalloc((void **)&d_x, M); if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf("    File:   %s\n", "./test_error.cu"); printf("    Line:   %d\n", 9); printf("    Error code: %d\n", error_code); printf("    Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0);
                                                                  ^

./test_error.cu: In function 'int main()':
./test_error.cu:7:31: warning: overflow in conversion from 'long unsigned int' to 'int' changes value from '8000000000' to '-589934592' [-Woverflow]
    7 |     const int M = sizeof(double) * N;
      |               ~~~~~~~~~~~~~~~~^~~~
CUDA Error:
    File:   ./test_error.cu
    Line:   9
    Error code: 2
    Error text: out of memory

Of course, there are some other warning information in the middle due to plastic surgery, but the main thing to show is the OOM error problem.

Kernel function detection

The above exception detection is for the CUDA operation of cudaMalloc. In fact, for kernel functions, its exceptions can also be detected. Let's first demonstrate a normal example:

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
#include ""
#include <>
#include <>

void __global__ add(const double *x, const double *y, double *z, const int N){
    int idx =  *  + ;
    if (idx < N){
        z[idx] = x[idx] + y[idx];
    }
}

int main(void){
    const int N = 10;
    const int M = sizeof(double) * N;
    const double a = 1.23;
    double *h_x = (double*) malloc(M);
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
    }
    double *d_x, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    const int block_size = 1024;
    const int grid_size = (N + block_size - 1) / block_size;
    add<<<grid_size, block_size>>>(d_x, d_x, d_z, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_z));
    free(h_x);
    printf("Success!\n");
    return 0;
}

This CUDA program runs an array addition. Running results:

$ nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
Success!

Adjust the block_size parameter:

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
#include ""
#include <>
#include <>

void __global__ add(const double *x, const double *y, double *z, const int N){
    int idx =  *  + ;
    if (idx < N){
        z[idx] = x[idx] + y[idx];
    }
}

int main(void){
    const int N = 10;
    const int M = sizeof(double) * N;
    const double a = 1.23;
    double *h_x = (double*) malloc(M);
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
    }
    double *d_x, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    const int block_size = 1025;
    const int grid_size = (N + block_size - 1) / block_size;
    add<<<grid_size, block_size>>>(d_x, d_x, d_z, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_z));
    free(h_x);
    printf("Success!\n");
    return 0;
}

Since the maximum block size in CUDA programs can only be 1024, an exception will appear if this number is exceeded. However, if there is no exception detection function, the program can be executed normally, so that the exception will remain in the program. Running results:

$ nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
CUDA Error:
    File:   ./test_error.cu
    Line:   29
    Error code: 9
    Error text: invalid configuration argument

Because it's addedcudaGetLastError()function and use exception-catching macros, so here will prompt the parameter configuration exception.

Summary

This article mainly introduces the addition of an exception-catching macro module in the practice of CUDA programming to ensure the accuracy of CUDA project results. The main code content refers to "The Basics and Practice of CUDA Programming" written by Fan Zheyong, which is a good introductory book for CUDA programming.

The first link to this article is:/dechinphy/p/cuda_error.html

Author ID: DechinPhy