Cuda硬件基础

 
Category: C_C++

写在前面

nvcc编译常见选项

# Build-and-run helper: compiles the files/arguments passed to this script
# ("$@") with nvcc, then runs the resulting ./a.out only if the build succeeds.
# Debug info is enabled for host (-g) and device (-G) code, the host compiler
# is pinned via -ccbin, and -lcudadevrt together with
# --relocatable-device-code true links the CUDA device runtime.
/home/opt/cuda-10.1/bin/nvcc -g -G -arch=sm_35 \
    -I../../include/ \
    -Xcompiler "-std=gnu++14 -D_GLIBCXX_USE_CXX11_ABI=1" \
    -ccbin /opt/compiler/gcc-8.2/bin/gcc \
    -Xlinker "-rpath,/opt/compiler/gcc-8.2/lib:/opt/lib64" \
    "$@" \
    -lcudadevrt \
    --relocatable-device-code true && \
    ./a.out
  • -g: 为主机代码(CPU 代码)生成调试信息
  • -G: 为设备代码(GPU 代码)生成调试信息(同时会关闭设备代码的优化,仅用于调试,不要用于性能测试)
    • 两者结合可以在 GPU 内核中设置断点和调试
  • -Xcompiler:将后续参数传递给主机 C++ 编译器
  • -std=gnu++14
    • 使用 C++14 标准,包含 GNU 扩展
    • 支持 C++14 特性如泛型 lambda、变量模板等
  • -D_GLIBCXX_USE_CXX11_ABI=1
    • 定义预处理器宏
    • 强制使用 C++11 ABI(应用程序二进制接口)
    • 用于解决 GCC 5+ 的 ABI 兼容性问题:该宏的取值必须与所链接的库一致(GCC 5+ 默认为 1;若链接的是用旧 ABI 编译的库,则需设为 0),否则会出现 std::string / std::list 相关的链接错误
  • -lcudadevrt

    • 链接 CUDA 设备运行时库

    • 支持动态并行(GPU 内核启动其他 GPU 内核)

    • 需要与 --relocatable-device-code 配合使用

  • --relocatable-device-code true

    • 生成可重定位的设备代码

    • 允许分离编译(单独编译多个 CUDA 文件后链接)

    • 支持更复杂的项目结构和大规模代码库

查看硬件属性

Tesla T4

#include <stdio.h>
#include <cuda_runtime.h>

/*
 * Fetches basic information on the first device in the current CUDA platform,
 * including number of SMs, bytes of constant memory, bytes of shared memory per
 * block, etc.
 */
 
/*
 * CUDA_CHECK(call): evaluates `call` exactly once and compares the resulting
 * cudaError_t against cudaSuccess. On failure it prints the source file,
 * line number, and the cudaGetErrorString() text to stderr, then terminates
 * the process with exit status 1.
 *
 * The argument is parenthesized and the status is stored in a macro-private
 * name, so an expression that mentions a caller variable called `err` is
 * evaluated against the caller's variable rather than the macro's own local.
 * The do { ... } while (0) wrapper makes the macro behave as one statement.
 */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t cuda_check_status_ = (call);                          \
        if (cuda_check_status_ != cudaSuccess) {                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_status_));              \
            exit(1);                                                      \
        }                                                                 \
    } while (0)


/*
 * Queries the properties of CUDA device 0 and prints a short summary:
 * device name, multiprocessor count, constant memory, shared memory per
 * block, registers per block, warp size, and the thread/warp limits per
 * block and per multiprocessor.
 *
 * Command-line arguments are accepted but not used.
 * Returns EXIT_SUCCESS; a failed property query terminates the process
 * through CUDA_CHECK.
 */
int main(int argc, char *argv[])
{
    (void)argc;  /* unused */
    (void)argv;  /* unused */

    int iDev = 0;
    cudaDeviceProp iProp;
    CUDA_CHECK(cudaGetDeviceProperties(&iProp, iDev));

    /* Derive the warp limit from the queried warp size instead of a
     * hard-coded 32, so it always agrees with the "Warp size" line. */
    int maxWarpsPerSM = 0;
    if (iProp.warpSize > 0) {
        maxWarpsPerSM = iProp.maxThreadsPerMultiProcessor / iProp.warpSize;
    }

    printf("Device %d: %s\n", iDev, iProp.name);
    printf("  Number of multiprocessors:                     %d\n",
           iProp.multiProcessorCount);
    /* Byte counts are divided by 1024.0 so they are reported in KB. */
    printf("  Total amount of constant memory:               %4.2f KB\n",
           iProp.totalConstMem / 1024.0);
    printf("  Total amount of shared memory per block:       %4.2f KB\n",
           iProp.sharedMemPerBlock / 1024.0);
    printf("  Total number of registers available per block: %d\n",
           iProp.regsPerBlock);
    printf("  Warp size:                                     %d\n",
           iProp.warpSize);
    printf("  Maximum number of threads per block:           %d\n",
           iProp.maxThreadsPerBlock);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           iProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of warps per multiprocessor:    %d\n",
           maxWarpsPerSM);
    return EXIT_SUCCESS;
}

基本信息

Device 0: Tesla T4
  Number of multiprocessors:                     40
  Total amount of constant memory:               64.00 KB
  Total amount of shared memory per block:       48.00 KB
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per block:           1024
  Maximum number of threads per multiprocessor:  1024
  Maximum number of warps per multiprocessor:    32
Device 0: NVIDIA A10
  Number of multiprocessors:                     72
  Total amount of constant memory:               64.00 KB
  Total amount of shared memory per block:       48.00 KB
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per block:           1024
  Maximum number of threads per multiprocessor:  1536
  Maximum number of warps per multiprocessor:    48
Device 0: NVIDIA L20
  Number of multiprocessors:                     92
  Total amount of constant memory:               64.00 KB
  Total amount of shared memory per block:       48.00 KB
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per block:           1024
  Maximum number of threads per multiprocessor:  1536
  Maximum number of warps per multiprocessor:    48