写在前面
nvcc编译常见选项
# Build the CUDA sources passed as arguments ("$@") with host (-g) and device (-G) debug info and relocatable device code, link cudadevrt, then run ./a.out only if the build succeeded.
/home/opt/cuda-10.1/bin/nvcc -g -G -arch=sm_35 \
-I../../include/ \
-Xcompiler "-std=gnu++14 -D_GLIBCXX_USE_CXX11_ABI=1" \
-ccbin /opt/compiler/gcc-8.2/bin/gcc \
-Xlinker "-rpath,/opt/compiler/gcc-8.2/lib:/opt/lib64" \
"$@" \
-lcudadevrt \
--relocatable-device-code true && \
./a.out
- -g: 为主机代码(CPU 代码)生成调试信息
- -G: 为设备代码(GPU 代码)生成调试信息
- 两者结合可以在 GPU 内核中设置断点和调试
- -Xcompiler: 将后续参数传递给主机 C++ 编译器
  - -std=gnu++14:
    - 使用 C++14 标准,包含 GNU 扩展
    - 支持 C++14 特性如泛型 lambda、变量模板等
  - -D_GLIBCXX_USE_CXX11_ABI=1:
    - 定义预处理器宏
    - 强制 libstdc++ 使用 C++11 ABI(应用程序二进制接口)
    - 解决 GCC 5+ 与旧版本库的兼容性问题
- -lcudadevrt
  - 链接 CUDA 设备运行时库
  - 支持动态并行(GPU 内核启动其他 GPU 内核)
  - 需要与 --relocatable-device-code 配合使用
- --relocatable-device-code true
  - 生成可重定位的设备代码
  - 允许分离编译(单独编译多个 CUDA 文件后链接)
  - 支持更复杂的项目结构和大规模代码库
查看硬件属性
Tesla T4
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/*
* Fetches basic information on the first device in the current CUDA platform,
* including number of SMs, bytes of constant memory, bytes of shared memory per
* block, etc.
*/
/*
 * CUDA_CHECK(call)
 *
 * Evaluates `call` (an expression yielding a cudaError_t) exactly once. If the
 * result is not cudaSuccess, prints the source file, line number and the
 * runtime's error string to stderr and terminates the process with
 * EXIT_FAILURE.
 *
 * The argument is parenthesized so that any expression is evaluated as one
 * unit, and the temporary is named `err_` so that a caller expression that
 * mentions its own variable `err` is not captured by the macro's local.
 * The do { ... } while (0) wrapper makes the macro behave as a single
 * statement (safe inside un-braced if/else).
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
/*
 * Queries the CUDA runtime for the properties of device 0 and prints a short
 * summary: device name, multiprocessor count, constant memory, shared memory
 * per block, registers per block, warp size, and the per-block /
 * per-multiprocessor thread limits.
 *
 * Command-line arguments are accepted but not used.
 *
 * Returns EXIT_SUCCESS after printing. If the property query fails,
 * CUDA_CHECK reports the error and terminates the process.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int iDev = 0; /* always inspect the first device */
    cudaDeviceProp iProp;
    CUDA_CHECK(cudaGetDeviceProperties(&iProp, iDev));

    printf("Device %d: %s\n", iDev, iProp.name);
    printf(" Number of multiprocessors: %d\n",
           iProp.multiProcessorCount);
    /* Memory sizes are reported by the runtime in bytes; show them in KB. */
    printf(" Total amount of constant memory: %4.2f KB\n",
           iProp.totalConstMem / 1024.0);
    printf(" Total amount of shared memory per block: %4.2f KB\n",
           iProp.sharedMemPerBlock / 1024.0);
    printf(" Total number of registers available per block: %d\n",
           iProp.regsPerBlock);
    printf(" Warp size: %d\n",
           iProp.warpSize);
    printf(" Maximum number of threads per block: %d\n",
           iProp.maxThreadsPerBlock);
    printf(" Maximum number of threads per multiprocessor: %d\n",
           iProp.maxThreadsPerMultiProcessor);
    /* Derive the warp limit from the queried warp size rather than a
     * hard-coded 32, so it stays consistent with the "Warp size" line. */
    printf(" Maximum number of warps per multiprocessor: %d\n",
           iProp.maxThreadsPerMultiProcessor / iProp.warpSize);

    return EXIT_SUCCESS;
}
基本信息
Device 0: Tesla T4
Number of multiprocessors: 40
Total amount of constant memory: 64.00 KB
Total amount of shared memory per block: 48.00 KB
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per block: 1024
Maximum number of threads per multiprocessor: 1024
Maximum number of warps per multiprocessor: 32
Device 0: NVIDIA A10
Number of multiprocessors: 72
Total amount of constant memory: 64.00 KB
Total amount of shared memory per block: 48.00 KB
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per block: 1024
Maximum number of threads per multiprocessor: 1536
Maximum number of warps per multiprocessor: 48
Device 0: NVIDIA L20
Number of multiprocessors: 92
Total amount of constant memory: 64.00 KB
Total amount of shared memory per block: 48.00 KB
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per block: 1024
Maximum number of threads per multiprocessor: 1536
Maximum number of warps per multiprocessor: 48