Ultimate Guide to Nvidia CUDA nvrtc cu11 for Developers
The nvidia-cuda-nvrtc-cu11 library provides a powerful toolkit for developers looking to leverage the performance of Nvidia GPUs. This guide introduces the nvrtc library, walks through many of its APIs, and provides sample code snippets to help you get started. By the end of this guide, you will be well-equipped to build high-performance applications using CUDA runtime compilation.
Introduction to nvidia-cuda-nvrtc-cu11
The NVRTC (NVIDIA Runtime Compilation) library allows developers to compile CUDA C++ kernels at runtime, enabling dynamic applications and simplifying the development of software pipelines that benefit from GPU acceleration. NVRTC provides a highly flexible API that supports features such as CUDA kernel compilation, error checking, and direct integration with CUDA driver and runtime APIs.
Basic NVRTC API Usage
Below are some essential NVRTC APIs with explanations and sample code snippets for better understanding:
1. nvrtcCreateProgram
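Creates an NVRTC program object from a string containing CUDA C++ source code. The program name (here "example.cu") is used to identify the source in compiler diagnostics.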
nvrtcProgram prog; nvrtcCreateProgram(&prog, cuda_src, "example.cu", 0, NULL, NULL);
2. nvrtcCompileProgram
Compiles the CUDA program.
// Compile the program with no extra compiler options.
nvrtcResult res = nvrtcCompileProgram(prog, 0, NULL);
if (res != NVRTC_SUCCESS) {
    // nvrtcGetProgramLog copies the log into a caller-provided buffer, so
    // the required size has to be queried and the buffer allocated first.
    size_t logSize;
    nvrtcGetProgramLogSize(prog, &logSize);
    char* log = new char[logSize];
    nvrtcGetProgramLog(prog, log);
    printf("Compile Error: %s\n", log);
    delete[] log;
}
3. nvrtcGetPTX
Obtains the generated PTX (Parallel Thread Execution) code from the compiled program.
// Ask NVRTC for the size of the generated PTX, allocate a buffer of that
// size, then copy the PTX into it. The buffer is owned by the caller and
// must eventually be released with delete[].
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
char* ptx = new char[ptxSize];
nvrtcGetPTX(prog, ptx);
Additional NVRTC APIs
Here are more NVRTC functions with brief descriptions:
4. nvrtcAddNameExpression
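Registers a name expression (for example, the name of a __global__ function or a template instantiation) with the program so that its lowered (mangled) name can be looked up after compilation with nvrtcGetLoweredName. Name expressions must be added before the program is compiled.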
// Register the name expression "myKernel" with the program.
// NOTE(review): the NVRTC documentation requires name expressions to be
// added before nvrtcCompileProgram is called — confirm call order at the
// point of use.
nvrtcAddNameExpression(prog, "myKernel");
5. nvrtcDestroyProgram
Frees the program resources.
// Free the program's resources. The handle is passed by address; do not
// use prog with other NVRTC calls after this point.
nvrtcDestroyProgram(&prog);
6. nvrtcGetProgramLog
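Retrieves the compilation log. The log is copied into a buffer supplied by the caller, so query the required size with nvrtcGetProgramLogSize first.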
// Query the log size, allocate a buffer of that size, then copy the
// compilation log into it. The buffer is owned by the caller and must
// eventually be released with delete[].
size_t logSize;
nvrtcGetProgramLogSize(prog, &logSize);
char* log = new char[logSize];
nvrtcGetProgramLog(prog, log);
Building an Application with NVRTC APIs
Let’s create a simple application that demonstrates the use of NVRTC APIs to compile and run a CUDA kernel at runtime:
#include#include #include const char* cuda_src = R"( extern "C" __global__ void addKernel(int* c, const int* a, const int* b) { int i = threadIdx.x; c[i] = a[i] + b[i]; })"; int main() { // NVRTC: Create and compile the program nvrtcProgram prog; nvrtcCreateProgram(&prog, cuda_src, "addKernel.cu", 0, NULL, NULL); nvrtcResult compileResult = nvrtcCompileProgram(prog, 0, NULL); if (compileResult != NVRTC_SUCCESS) { size_t logSize; nvrtcGetProgramLogSize(prog, &logSize); char* log = new char[logSize]; nvrtcGetProgramLog(prog, log); printf("Compile Error: %s\n", log); nvrtcDestroyProgram(&prog); return -1; } // Get PTX code size_t ptxSize; nvrtcGetPTXSize(prog, &ptxSize); char* ptx = new char[ptxSize]; nvrtcGetPTX(prog, ptx); // CUDA: Load PTX code CUdevice cuDevice; CUcontext context; CUmodule module; CUfunction kernel; cuInit(0); cuDeviceGet(&cuDevice, 0); cuCtxCreate(&context, 0, cuDevice); cuModuleLoadDataEx(&module, ptx, 0, 0, 0); cuModuleGetFunction(&kernel, module, "addKernel"); // CUDA: Prepare and execute kernel int N = 10; int h_a[N], h_b[N], h_c[N]; for (int i = 0; i < N; ++i) { h_a[i] = i; h_b[i] = N - i; } int *d_a, *d_b, *d_c; cuMemAlloc((CUdeviceptr*)&d_a, N * sizeof(int)); cuMemAlloc((CUdeviceptr*)&d_b, N * sizeof(int)); cuMemAlloc((CUdeviceptr*)&d_c, N * sizeof(int)); cuMemcpyHtoD((CUdeviceptr)d_a, h_a, N * sizeof(int)); cuMemcpyHtoD((CUdeviceptr)d_b, h_b, N * sizeof(int)); void* args[] = { &d_c, &d_a, &d_b }; cuLaunchKernel(kernel, 1, 1, 1, N, 1, 1, 0, NULL, args, 0); cuMemcpyDtoH(h_c, (CUdeviceptr)d_c, N * sizeof(int)); // Verify the result for (int i = 0; i < N; i++) { printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]); } // Cleanup cuMemFree((CUdeviceptr)d_a); cuMemFree((CUdeviceptr)d_b); cuMemFree((CUdeviceptr)d_c); cuModuleUnload(module); cuCtxDestroy(context); nvrtcDestroyProgram(&prog); delete[] ptx; return 0; }
This sample application demonstrates runtime CUDA code compilation and execution using the NVRTC APIs. It dynamically compiles a kernel that adds two vectors and runs the kernel on the GPU.
Hash: 046b388caa84388eafa955c92f1533fdc32f547e484cf7e821699eae75b977ce