Hello World CUDA！

Original deepin OS 深度操作系统 2022-06-15

引言

这是很久以前跟师弟了解CUDA时写的一个文章，还挺有意思的，涉及内容比较简单，一些皮毛。抛砖引玉，和大家分享一下。

我们经常开玩笑说自己精通十种语言的Hello，World的写法。这周因为要阅读研究YOLO，在其中有一定的CUDA代码，所以写下了这篇报告“Hello，World的CUDA写法”。什么是CUDA呢，来自百度百科的解释是CUDA（Compute Unified Device Architecture），是显卡厂商NVIDIA推出的运算平台。 CUDA™是一种由NVIDIA推出的通用并行计算架构，该架构使GPU能够解决复杂的计算问题。

开始

安装过程就不在赘述，因为就是点击下一步。本机上使用到的版本是VS2017&CUDA 9.0正式版。

在开始学习CUDA的Hello，World写法之前我们先写一个C的Hello，World。这个应该非常简单吧。

#include

int main()

{

printf("Hello,World");

return 0;

}

然后要将这个程序转化成一个CUDA程序需要加上一些语句，加上之后的程序段就是这个样子的。

#include

#include "cuda_runtime.h"

__global__ void helloKernel()

{

}

int main()

{

helloKernel <<>> ();

printf("Hello,World");

return 0;

}

这个程序段和之前的c语言的Hello，World相比多了一个空函数helloKernel并且这个函数带有global修饰符，对于这个空函数的调用，需要用<<>>来修饰。

这个函数就是我们在CUDA编程中会经常使用的核函数。这个核函数在官方的手册中的说明是这样的，

A kernel is defined using the globaldeclaration specifier and the number of CUDA threads that execute that kernel for a given kernel call is specified using a new <<>>execution configuration syntax (see C Language Extensions). Each thread that executes the kernel is given a unique thread ID that is accessible within the kernel through the built-in threadIdx variable.

使用__global__声明说明符定义内核函数，并使用新的<<>>执行配置语法指定为给定内核调用执行该内核的CUDA线程数。执行内核的每个线程都被赋予一个唯一的线程ID，该内核可以通过内置的threadIdx变量在内核中访问。

对于一个CUDA初学者来说，官方的例子肯定是最好的学习手段。我们下面就从一个官方给出的矩阵相加的例子来初步的学习CUDA在GPU上的编程是怎么实现的。

sample学习

在开始学习之前我先将整个代码贴出来，这个代码是用vs2017创建CUDA项目之后自动生成的代码。

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b)

{

int i = threadIdx.x;

c[i] = a[i] + b[i];

}

__global__ void helloKernel(void)

{

printf("hello");

}

int main()

{

const int arraySize = 5;

const int a[arraySize] = { 1, 2, 3, 4, 5 };

const int b[arraySize] = { 10, 20, 30, 40, 50 };

int c[arraySize] = { 0 };

// Add vectors in parallel.

cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "addWithCuda failed!");

return 1;

}

printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",

c[0], c[1], c[2], c[3], c[4]);

// cudaDeviceReset must be called before exiting in order for profiling and

// tracing tools such as Nsight and Visual Profiler to show complete traces.

cudaStatus = cudaDeviceReset();

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaDeviceReset failed!");

return 1;

}

return 0;

}

// Helper function for using CUDA to add vectors in parallel.

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)

{

int *dev_a = 0;

int *dev_b = 0;

int *dev_c = 0;

cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.

cudaStatus = cudaSetDevice(0);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");

goto Error;

}

// Allocate GPU buffers for three vectors (two input, one output) .

cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMalloc failed!");

goto Error;

}

cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMalloc failed!");

goto Error;

}

cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMalloc failed!");

goto Error;

}

// Copy input vectors from host memory to GPU buffers.

cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMemcpy failed!");

goto Error;

}

cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMemcpy failed!");

goto Error;

}

// Launch a kernel on the GPU with one thread for each element.

addKernel<<>>(dev_c, dev_a, dev_b);

// Check for any errors launching the kernel

cudaStatus = cudaGetLastError();

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));

goto Error;

}

// cudaDeviceSynchronize waits for the kernel to finish, and returns

// any errors encountered during the launch.

cudaStatus = cudaDeviceSynchronize();

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);

goto Error;

}

// Copy output vector from GPU buffer to host memory.

cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMemcpy failed!");

goto Error;

}

Error:

cudaFree(dev_c);

cudaFree(dev_a);

cudaFree(dev_b);

return cudaStatus;

}

这个代码不算太长，但是实现了调用GPU进行矩阵运算的所有功能。

在进行CUDA编程之前，我们需要包含CUDA_RUNTIME版本的头文件，然后我们就可以定义我们的核函数了。在CUDA中定义核函数很简单，只需要将一个函数前面加上global修饰就可以将这个函数定义为一个核函数。CUDA程序的编译是由nvidia公司提供的nvcc编译器进行编译的，所以我们在安装的时候一定要确定自己的nvcc编译器运行没有错误。

我们从main函数开始阅读这段代码，这样可以很清晰的看出代码的运行过程。

int main()

{

const int arraySize = 5;

const int a[arraySize] = { 1, 2, 3, 4, 5 };

const int b[arraySize] = { 10, 20, 30, 40, 50 };

int c[arraySize] = { 0 };

// Add vectors in parallel.

cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "addWithCuda failed!");

return 1;

}

printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",

c[0], c[1], c[2], c[3], c[4]);

// cudaDeviceReset must be called before exiting in order for profiling and

// tracing tools such as Nsight and Visual Profiler to show complete traces.

cudaStatus = cudaDeviceReset();

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaDeviceReset failed!");

return 1;

}

return 0;

}

首先main函数中定义了我们要相加的两个数组的大小和里面的数据。然后调用了一个addWithCuda的函数，这个函数会返回一个状态，这个状态是CUDA使用的错误状态码，在调用完相加程序之后，我们可以用这个状态码来判断程序是否正确的运行了，并且得到了正确的结果。

cudaStatus = cudaDeviceReset();

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaDeviceReset failed!");

return 1;

}

这个函数是CUDA编程里面比较重要的一个函数，这个函数主要的作用就是在当前流程中销毁所有分配并重置当前设备上的所有状态。

在官方手册中的解释是这样的：

When a host thread calls cudaDeviceReset(), this destroys the primary context of the device the host thread currently operates on (i.e., the current device as defined in Device Selection). The next runtime function call made by any host thread that has this device as current will create a new primary context for this device.

当主机线程调用cudaDeviceReset()时，这销毁了主机线程操作的设备的主要上下文。任何以这个设备为当前设备的主机线程调用的运行时函数将为设备重新建立一个主要上下文。

CUDA 上下文类似于CPU的进程。所有资源和在驱动程序API 中执行的操作都封装在CUDA 上下文内，在销毁上下文时，系统将自动清理这些资源。

然后就到了我们的相加函数中，在这个函数中，有几个调用函数比较重要，分别是：

cudaStatus = cudaSetDevice(0);

cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));

cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);

addKernel<<>>(dev_c, dev_a, dev_b);

cudaStatus = cudaGetLastError();

cudaStatus = cudaDeviceSynchronize();

cudaFree(dev_c);

cudaStatus = cudaSetDevice(0);

在CUDA编程中，我们需要选择设备上的某一块显卡来执行我们的计算操作，这个代码就是选择设备商的第0块显卡来执行我们的程序。这个语句对于多GPU的设备非常重要，因为CUDA可以分别让不同的GPU执行不同的任务。

cudaMalloc、cudaMemcpy函数和我们平时在C语言下使用的Malloc和Memcpy函数非常相似。为什么我们需要这两个函数呢。因为大家都知道的是，我们使用GPU进行运算，这个运算的位置是位于GPU的显存上，那么又有人会去问什么是显存呢？显存全称显示内存，亦称帧缓存，它是用来存储显示芯片处理过或者即将读取的渲染数据。如同计算机的内存一样，显存是用来存储图形数据的硬件。显存在GPU中相当于电脑中的内存，GPU进行运算的时候是由GPU处理核心从显存中提取数据然后进行运算，然后将运算的结果再存放到显存中。那么为什么我们需要将内存中的内容拷贝到显存中呢？因为，显卡和内存之间的带宽并不是想象中的那么大，在高速运算的情况下，这种带宽的限制可能就成为了我们计算速度的瓶颈，所以，我们在使用GPU加速运算的时候，是将数据先拷贝到显存中，然后再利用GPU的运算特性来进行运算加速。

下面就要讲到编程中最重要的核函数了。在讲解核函数之前我们要先了解一下关于CUDA 的一些基本的架构。

CUDA中的线程（thread）是设备中并行运算结构中的最小单位，类似于主机中的线程的概念，thread可以以一维、二维、三维的形式组织在一起，threadIdx.x表示的是thread在x方向的索引号，还可能存在thread在y和z方向的索引号threadIdx.y和threadIdx.z。

一维、二维或三维的thread组成一个线程块（Block），一维、二维或三维的线程块（Block）组合成一个线程块网格（Grid），线程块网格（Grid）可以是一维或二维的。通过网格块（Grid）->线程块（Block）->线程（thread）的顺序可以定位到每一个并且唯一的线程。

回到程序中的addKernel函数上来，这个函数会被GPU上的多个线程同时执行一次，线程间彼此没有通信，相互独立。到底会有多少个线程来分别执行核函数，是在“<<>>”符号里定义的。“<<>>”表示运行时配置符号，在本程序中的定义是<<>>，表示分配了一个线程块（Block），每个线程块有分配了size个线程，“<<>>”中的参数并不是传递给设备代码的参数，而是定义主机代码运行时如何启动设备代码。以上定义的这些线程都是一个维度上的，可以通过thredaIdx.x来获取执行当前计算任务的线程的ID号。

并且在CUDA中是使用了host 和kernel.

在CUDA 的架构下，一个程序分为两个部份：host 端和device 端。Host 端是指在CPU 上执行的部份，而device 端则是在显示芯片上执行的部份。Device 端的程序又称为“kernel”。通常host 端程序会将数据准备好后，复制到显卡的内存中，再由显示芯片执行device 端程序，完成后再由host 端程序将结果从显卡的内存中取回。

核函数的语法是这样的函数名称 <<>>(参数...);

在核函数计算完成之后我们要将计算结果复制回主内存上并且使用cudaStatus = cudaDeviceSynchronize();将设备进行重置。

最后在主函数中将结果进行打印并且再次将设备进行重置。这样我们的第一个CUDA程序就完成了。

参考资料

1 《GPU高性能编程CUDA实战》

2 http://blog.csdn.net/dcrmg/article/details/54446393

3 http://blog.csdn.net/sunmc1204953974/article/details/51016816

4 http://blog.csdn.net/sunmc1204953974/article/details/51000970

5 http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-c-runtime

6 http://docs.nvidia.com/cuda/cuda-runtime-api/index.html

7 《CUDA编程指南5.0中文版》

【相关链接】

法明传[2024]173号：1月1日起，未用示范文本提交起诉状，部分法院将不予立案

法明传[2024]173号：关于加快推进起诉状、答辩状示范文本全面应用工作的通知(附下载链接)

2025.1.1起，全国法院全面推进应用民事起诉状、答辩状示范文本(附下载链接)

法明传[2024]173号：1月1日起，未用示范文本提交起诉状，部分法院将不予立案

突然意外坠落！2人身亡！

Hello World CUDA！

引言

您可能也对以下帖子感兴趣

法明传[2024]173号：1月1日起，未用示范文本提交起诉状，部分法院将不予立案

法明传[2024]173号：关于加快推进起诉状、答辩状示范文本全面应用工作的通知(附下载链接)

2025.1.1起，全国法院全面推进应用民事起诉状、答辩状示范文本(附下载链接)

法明传[2024]173号：1月1日起，未用示范文本提交起诉状，部分法院将不予立案

突然意外坠落！2人身亡！

生成图片，分享到微信朋友圈

Hello World CUDA！

引言

您可能也对以下帖子感兴趣