安裝
# Download the CUDA 9.0.176 local repository package for Ubuntu 16.04 (amd64).
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# Stop the LightDM display manager service before installing.
sudo service lightdm stop
# Install the downloaded package, which registers the local CUDA repository.
sudo dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# Add the repository's public signing key to apt's trusted keys.
sudo apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# Refresh the package index, then install the `cuda` package without prompting.
sudo apt-get update && sudo apt-get install cuda -y
# Reboot the machine; the remaining command is run after logging back in.
sudo reboot
# Make nvcc reachable as /usr/bin/nvcc through a symbolic link.
sudo ln -s /usr/local/cuda/bin/nvcc /usr/bin/nvcc
設定環境變數
編輯.bashrc
# Open the current user's ~/.bashrc in vim so the export lines can be appended.
vim ~/.bashrc
加入
# Root of the CUDA toolkit; the other two variables are derived from it.
export CUDA_HOME=/usr/local/cuda
# Shared-library search path: CUDA runtime libs, CUPTI, and the NVIDIA driver libs.
# - ${VAR:+...} avoids a leading ':' when LD_LIBRARY_PATH is unset; an empty entry
#   would make the dynamic loader search the current directory.
# - CUDA 9.0.176 needs (and its repository installs) the 384 driver branch, so the
#   driver directory is nvidia-384, not nvidia-367.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:/usr/lib/nvidia-384
# Make nvcc and the other CUDA command-line tools available on PATH.
export PATH=$PATH:$CUDA_HOME/bin
測試編譯
方法1
# The samples under /usr/local/cuda are root-owned, so building there as a normal
# user fails. Copy them to a writable location with NVIDIA's helper script first.
/usr/local/cuda-9.0/bin/cuda-install-samples-9.0.sh ~
# Build and run the vectorAdd sample from the writable copy.
cd ~/NVIDIA_CUDA-9.0_Samples/0_Simple/vectorAdd
make
./vectorAdd
方法2
建立測試檔案vectorAdd.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Element-wise vector addition kernel: C[i] = A[i] + B[i] for 0 <= i < numElements.
//
// Launch layout: 1D grid of 1D blocks. Each thread derives one flat index from
// blockIdx.x, blockDim.x and threadIdx.x and handles at most that one element;
// threads whose index falls at or beyond numElements do nothing.
//
// A, B : input arrays, read at [idx]
// C    : output array, written at [idx]
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    // Guard the tail of the grid, which may extend past the data.
    if (idx >= numElements) return;
    C[idx] = A[idx] + B[idx];
}
// Terminates the program with a diagnostic when a CUDA runtime call did not succeed.
// status : return code of the CUDA call being checked
// what   : short label for the call, printed in the error message
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the vectorAdd kernel.
// Fills two host arrays with random values, copies them to the GPU, launches
// vectorAdd, copies the result back and checks it against a CPU-computed sum.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error or mismatch.
int main(void){
    int numElements = 50000;

    // Initialise the host-side test data.
    float *h_A=new float[numElements];
    float *h_B=new float[numElements];
    float *h_C=new float[numElements];
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate GPU memory and copy the inputs from host memory to the GPU.
    size_t size = numElements * sizeof(float);
    float *d_A = NULL; checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
    float *d_B = NULL; checkCuda(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
    float *d_C = NULL; checkCuda(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

    // Compute: round the block count up so every element gets a thread.
    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // A kernel launch returns no status; launch errors are reported here.
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // Fetch the result; an error raised while the kernel ran is reported by this copy.
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");

    // Release the GPU memory.
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    // Verify the GPU result against the sum computed on the host.
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Release the host arrays. They came from new[], so delete[] is required;
    // the device pointers were already released with cudaFree above.
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    printf("Test PASSED\n");
    return 0;
}
接著手動編譯~
# Compiler driver from the CUDA 9.0 toolkit.
NVCC=/usr/local/cuda-9.0/bin/nvcc
# Target architectures, shared by the compile and link steps: one sm_XX entry per
# listed architecture, plus compute_70 as the final entry.
GENCODE_FLAGS=(
  -gencode arch=compute_30,code=sm_30
  -gencode arch=compute_35,code=sm_35
  -gencode arch=compute_37,code=sm_37
  -gencode arch=compute_50,code=sm_50
  -gencode arch=compute_52,code=sm_52
  -gencode arch=compute_60,code=sm_60
  -gencode arch=compute_70,code=sm_70
  -gencode arch=compute_70,code=compute_70
)
# Step 1: compile vectorAdd.cu into an object file.
"$NVCC" -ccbin g++ -m64 "${GENCODE_FLAGS[@]}" -c vectorAdd.cu -o vectorAdd.o
# Step 2: link the object file into the vectorAdd executable.
"$NVCC" -ccbin g++ -m64 "${GENCODE_FLAGS[@]}" vectorAdd.o -o vectorAdd
文章短網址: https://slanla.com/__hep83y