Introduction to NVIDIA cuDNN-cu11
The NVIDIA CUDA Deep Neural Network library (cuDNN) is a GPU-accelerated library for deep neural networks. It provides highly optimized implementations for standard routines such as forward and backward convolution, pooling, normalization, and activation layers. cuDNN is widely used in deep learning applications to enhance the performance and efficiency of training and inference on NVIDIA GPUs.
Getting Started with NVIDIA cuDNN-cu11
To leverage the full power of NVIDIA GPUs, it’s imperative to understand and implement the various APIs provided by cuDNN. Below, we illustrate several APIs with code snippets and explanations.
1. Convolution Forward
cudnnHandle_t cudnn; cudnnCreate(&cudnn); cudnnTensorDescriptor_t input_descriptor; cudnnCreateTensorDescriptor(&input_descriptor); cudnnSetTensor4dDescriptor(input_descriptor, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, batch_size, in_channels, height, width); cudnnFilterDescriptor_t filter_descriptor; cudnnCreateFilterDescriptor(&filter_descriptor); cudnnSetFilter4dDescriptor(filter_descriptor, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_channels, in_channels, kernel_h, kernel_w); cudnnConvolutionDescriptor_t convolution_descriptor; cudnnCreateConvolutionDescriptor(&convolution_descriptor); cudnnSetConvolution2dDescriptor(convolution_descriptor, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); cudnnGetConvolutionForwardAlgorithm(cudnn, input_descriptor, filter_descriptor, convolution_descriptor, output_descriptor, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &conv_algorithm); cudnnGetConvolutionForwardWorkspaceSize(cudnn, input_descriptor, filter_descriptor, convolution_descriptor, output_descriptor, conv_algorithm, &workspace_size); cudaMalloc(&d_workspace, workspace_size); cudnnConvolutionForward(cudnn, &alpha, input_descriptor, d_input, filter_descriptor, d_kernel, convolution_descriptor, conv_algorithm, d_workspace, workspace_size, &beta, output_descriptor, d_output);
2. Pooling Forward
// Max-pooling forward pass.
cudnnPoolingDescriptor_t pooling_descriptor;
cudnnCreatePoolingDescriptor(&pooling_descriptor);

// Window (window_h x window_w), padding (pad_h, pad_w), stride (stride_h, stride_w).
// CUDNN_PROPAGATE_NAN lets NaN inputs propagate to the output instead of being
// suppressed.
cudnnSetPooling2dDescriptor(pooling_descriptor,
                            CUDNN_POOLING_MAX,   // keep the maximum of each window
                            CUDNN_PROPAGATE_NAN,
                            window_h, window_w,
                            pad_h, pad_w,
                            stride_h, stride_w);

// y = alpha * maxpool(x) + beta * y (per the cuDNN blending convention).
cudnnPoolingForward(cudnn, pooling_descriptor,
                    &alpha, input_descriptor, d_input,
                    &beta, output_descriptor, d_output);
3. Activation Forward
// Element-wise ReLU forward pass.
cudnnActivationDescriptor_t activation_descriptor;
cudnnCreateActivationDescriptor(&activation_descriptor);

// The trailing coefficient (0.0) is only meaningful for clipped-ReLU/ELU modes;
// it is ignored for plain CUDNN_ACTIVATION_RELU.
cudnnSetActivationDescriptor(activation_descriptor,
                             CUDNN_ACTIVATION_RELU,
                             CUDNN_PROPAGATE_NAN,
                             0.0);

// y = alpha * relu(x) + beta * y (per the cuDNN blending convention).
cudnnActivationForward(cudnn, activation_descriptor,
                       &alpha, input_descriptor, d_input,
                       &beta, output_descriptor, d_output);
Application Example: CNN using cuDNN
Let’s implement a simple convolutional neural network (CNN) using the APIs described above. The CNN will have one convolution layer followed by a pooling layer and a ReLU activation.
void simple_cnn(int batch_size, int channels, int height, int width) { cudnnHandle_t cudnn; cudnnCreate(&cudnn); // Input layer cudnnTensorDescriptor_t input_descriptor; cudnnCreateTensorDescriptor(&input_descriptor); cudnnSetTensor4dDescriptor(input_descriptor, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, batch_size, channels, height, width); // Convolution layer cudnnFilterDescriptor_t filter_descriptor; cudnnCreateFilterDescriptor(&filter_descriptor); cudnnSetFilter4dDescriptor(filter_descriptor, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 32, channels, 3, 3); cudnnConvolutionDescriptor_t conv_descriptor; cudnnCreateConvolutionDescriptor(&conv_descriptor); cudnnSetConvolution2dDescriptor(conv_descriptor, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); int out_height, out_width; cudnnGetConvolution2dForwardOutputDim(conv_descriptor, input_descriptor, filter_descriptor, &batch_size, &channels, &out_height, &out_width); cudnnTensorDescriptor_t output_descriptor; cudnnCreateTensorDescriptor(&output_descriptor); cudnnSetTensor4dDescriptor(output_descriptor, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, batch_size, 32, out_height, out_width); cudnnConvolutionFwdAlgo_t conv_algo; cudnnGetConvolutionForwardAlgorithm(cudnn, input_descriptor, filter_descriptor, conv_descriptor, output_descriptor, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &conv_algo); size_t workspace_bytes; cudnnGetConvolutionForwardWorkspaceSize(cudnn, input_descriptor, filter_descriptor, conv_descriptor, output_descriptor, conv_algo, &workspace_bytes); void* d_workspace; cudaMalloc(&d_workspace, workspace_bytes); float alpha = 1.0f, beta = 0.0f; cudnnConvolutionForward(cudnn, &alpha, input_descriptor, d_input, filter_descriptor, d_kernel, conv_descriptor, conv_algo, d_workspace, workspace_bytes, &beta, output_descriptor, d_output); // Pooling layer cudnnPoolingDescriptor_t pooling_descriptor; cudnnCreatePoolingDescriptor(&pooling_descriptor); cudnnSetPooling2dDescriptor(pooling_descriptor, CUDNN_POOLING_MAX, 
CUDNN_PROPAGATE_NAN, 2, 2, 0, 0, 2, 2); cudnnTensorDescriptor_t pool_output_descriptor; cudnnCreateTensorDescriptor(&pool_output_descriptor); cudnnSetTensor4dDescriptor(pool_output_descriptor, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, batch_size, 32, out_height/2, out_width/2); cudnnPoolingForward(cudnn, pooling_descriptor, &alpha, output_descriptor, d_output, &beta, pool_output_descriptor, d_pool_output); // ReLU activation cudnnActivationDescriptor_t activation_descriptor; cudnnCreateActivationDescriptor(&activation_descriptor); cudnnSetActivationDescriptor(activation_descriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0); cudnnActivationForward(cudnn, activation_descriptor, &alpha, pool_output_descriptor, d_pool_output, &beta, pool_output_descriptor, d_activation_output); // Clean up cudnnDestroy(cudnn); cudnnDestroyTensorDescriptor(input_descriptor); cudnnDestroyFilterDescriptor(filter_descriptor); cudnnDestroyConvolutionDescriptor(conv_descriptor); cudnnDestroyTensorDescriptor(output_descriptor); cudnnDestroyPoolingDescriptor(pooling_descriptor); cudnnDestroyActivationDescriptor(activation_descriptor); }
This example illustrates the creation of a simple CNN using cuDNN APIs for convolution, pooling, and activation operations. With these APIs, you can build more complex deep learning models that take full advantage of NVIDIA GPU acceleration.
Hash: 77067092e6683c2d9f00b88bffac7e057f576a41fa989c12f38b2749efa94c6e