Ultimate Guide to Nvidia CUDA nvrtc cu11 for Developers

Ultimate Guide to Nvidia CUDA nvrtc cu11 for Developers

The nvidia-cuda-nvrtc-cu11 package distributes NVRTC, NVIDIA's runtime compilation library for CUDA 11, which lets developers compile CUDA C++ for Nvidia GPUs while an application is running. This guide introduces the NVRTC library, walks through several of its core APIs, and provides sample code snippets to help you get started. By the end of this guide, you will be well-equipped to build high-performance applications using CUDA runtime compilation.

Introduction to nvidia-cuda-nvrtc-cu11

The NVRTC (NVIDIA Runtime Compilation) library allows developers to compile CUDA C++ kernels at runtime, enabling dynamic applications and simplifying the development of software pipelines that benefit from GPU acceleration. NVRTC provides a highly flexible API that supports features such as CUDA kernel compilation, error checking, and direct integration with CUDA driver and runtime APIs.

Basic NVRTC API Usage

Below are some essential NVRTC APIs with explanations and sample code snippets for better understanding:

1. nvrtcCreateProgram

Creates a program from the source code.

nvrtcProgram prog;
nvrtcCreateProgram(&prog, cuda_src, "example.cu", 0, NULL, NULL);

2. nvrtcCompileProgram

Compiles the CUDA program.

// Compile the program with no extra compiler options.
nvrtcResult res = nvrtcCompileProgram(prog, 0, nullptr);
if (res != NVRTC_SUCCESS) {
  // nvrtcGetProgramLog copies the log into a caller-provided buffer, so the
  // size must be queried and the storage allocated before fetching it.
  size_t logSize = 0;
  nvrtcGetProgramLogSize(prog, &logSize);
  // One extra zeroed byte keeps the buffer NUL-terminated even if the size
  // query failed and left logSize at 0.
  char* log = new char[logSize + 1]();
  nvrtcGetProgramLog(prog, log);
  printf("Compile Error: %s\n", log);
  delete[] log;
}

3. nvrtcGetPTX

Obtains the generated PTX (Parallel Thread Execution) code from the compiled program.

size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
char* ptx = new char[ptxSize];
nvrtcGetPTX(prog, ptx);

Additional NVRTC APIs

Here are more NVRTC functions with brief descriptions:

4. nvrtcAddNameExpression

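Registers a name expression (for example, a kernel or a template instantiation) with the program so that its lowered (mangled) name can be looked up after compilation with nvrtcGetLoweredName. It must be called before nvrtcCompileProgram.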

nvrtcAddNameExpression(prog, "myKernel");

5. nvrtcDestroyProgram

Frees the program resources.

// Release the NVRTC program. The call takes the address of the handle,
// not the handle itself.
nvrtcDestroyProgram(&prog);

6. nvrtcGetProgramLog

Retrieves the compilation log.

size_t logSize;
nvrtcGetProgramLogSize(prog, &logSize);
char* log = new char[logSize];
nvrtcGetProgramLog(prog, log);

Building an Application with NVRTC APIs

Let’s create a simple application that demonstrates the use of NVRTC APIs to compile and run a CUDA kernel at runtime:

#include <cstdio>
#include <vector>

#include <cuda.h>
#include <nvrtc.h>

// CUDA C++ source for the kernel that is compiled at runtime.
// addKernel writes c[i] = a[i] + b[i], where i is the thread's x index
// within its block. There is no bounds check, so the launch must not use
// more threads than the arrays have elements. The kernel is declared
// extern "C" and is later looked up by the plain name "addKernel".
const char* cuda_src = R"(
  extern "C" __global__ void addKernel(int* c, const int* a, const int* b) {
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
  })";

int main() {
  // NVRTC: Create and compile the program
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, cuda_src, "addKernel.cu", 0, NULL, NULL);
  
  nvrtcResult compileResult = nvrtcCompileProgram(prog, 0, NULL);
  if (compileResult != NVRTC_SUCCESS) {
    size_t logSize;
    nvrtcGetProgramLogSize(prog, &logSize);
    char* log = new char[logSize];
    nvrtcGetProgramLog(prog, log);
    printf("Compile Error: %s\n", log);
    nvrtcDestroyProgram(&prog);
    return -1;
  }

  // Get PTX code
  size_t ptxSize;
  nvrtcGetPTXSize(prog, &ptxSize);
  char* ptx = new char[ptxSize];
  nvrtcGetPTX(prog, ptx);

  // CUDA: Load PTX code
  CUdevice cuDevice;
  CUcontext context;
  CUmodule module;
  CUfunction kernel;
  cuInit(0);
  cuDeviceGet(&cuDevice, 0);
  cuCtxCreate(&context, 0, cuDevice);
  cuModuleLoadDataEx(&module, ptx, 0, 0, 0);
  cuModuleGetFunction(&kernel, module, "addKernel");

  // CUDA: Prepare and execute kernel
  int N = 10;
  int h_a[N], h_b[N], h_c[N];
  for (int i = 0; i < N; ++i) {
    h_a[i] = i;
    h_b[i] = N - i;
  }

  int *d_a, *d_b, *d_c;
  cuMemAlloc((CUdeviceptr*)&d_a, N * sizeof(int));
  cuMemAlloc((CUdeviceptr*)&d_b, N * sizeof(int));
  cuMemAlloc((CUdeviceptr*)&d_c, N * sizeof(int));
  cuMemcpyHtoD((CUdeviceptr)d_a, h_a, N * sizeof(int));
  cuMemcpyHtoD((CUdeviceptr)d_b, h_b, N * sizeof(int));
  
  void* args[] = { &d_c, &d_a, &d_b };
  cuLaunchKernel(kernel, 1, 1, 1, N, 1, 1, 0, NULL, args, 0);
  cuMemcpyDtoH(h_c, (CUdeviceptr)d_c, N * sizeof(int));

  // Verify the result
  for (int i = 0; i < N; i++) {
    printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
  }

  // Cleanup
  cuMemFree((CUdeviceptr)d_a);
  cuMemFree((CUdeviceptr)d_b);
  cuMemFree((CUdeviceptr)d_c);
  cuModuleUnload(module);
  cuCtxDestroy(context);
  nvrtcDestroyProgram(&prog);
  delete[] ptx;
  
  return 0;
}

This sample application demonstrates runtime CUDA code compilation and execution using the NVRTC APIs. It dynamically compiles a kernel that adds two vectors and runs the kernel on the GPU.


Hash: 046b388caa84388eafa955c92f1533fdc32f547e484cf7e821699eae75b977ce

Leave a Reply

Your email address will not be published. Required fields are marked *