Install & test CUDA 9.0 on Ubuntu 16.04

安裝

# Download the CUDA 9.0.176 local-repository package for Ubuntu 16.04 (amd64).
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# Stop the lightdm display manager before installing
# (presumably so no graphical session is using the GPU driver - confirm for your setup).
sudo service lightdm stop
# Install the downloaded package, which registers a local apt repository.
sudo dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# Trust the signing key that ships with the local repository.
sudo apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# Refresh the package index, then install the `cuda` package without prompting.
sudo apt-get update && sudo apt-get install cuda -y
# Reboot the machine; run the remaining command after it comes back up.
sudo reboot
# Expose nvcc under /usr/bin via a symbolic link.
sudo ln -s /usr/local/cuda/bin/nvcc /usr/bin/nvcc

設定環境變數

編輯.bashrc

vim ~/.bashrc

加入

# Root of the CUDA toolkit installation; the paths below are derived from it.
export CUDA_HOME=/usr/local/cuda
# Append the toolkit's executables (nvcc, ...) to the command search path.
export PATH="${PATH}:${CUDA_HOME}/bin"
# Append the CUDA runtime, CUPTI and NVIDIA driver library directories.
# NOTE(review): /usr/lib/nvidia-367 names the 367 driver series; CUDA 9.0 is
# documented to need a newer driver - confirm this matches the installed driver directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:/usr/lib/nvidia-367"

測試編譯

方法1

# Method 1: build and run the vectorAdd sample from the toolkit's samples directory.
# NOTE(review): make presumably writes its outputs into this directory, so the
# current user may need write permission there - confirm.
cd /usr/local/cuda/samples/0_Simple/vectorAdd
make
./vectorAdd

方法2

建立測試檔案vectorAdd.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Element-wise vector addition kernel: C[k] = A[k] + B[k].
//
// Each thread handles at most one element, selected by its flat 1D global
// index (block index * block size + thread index within the block). Threads
// whose index falls at or beyond numElements do nothing, so the launch may
// cover more threads than there are elements.
//
// A, B        - input arrays, read only
// C           - output array, written at the thread's index
// numElements - number of valid elements in A, B and C
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements){
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= numElements){
    return;  // surplus thread in the last block
  }
  C[idx] = A[idx] + B[idx];
}

// Terminates the program with a readable message when a CUDA runtime call
// reports anything other than cudaSuccess.
//   status - return code of the CUDA call being checked
//   what   - short description of the operation, used in the error message
static void checkCuda(cudaError_t status, const char *what){
  if (status != cudaSuccess){
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Test driver: fills two host arrays with random values, adds them on the GPU
// with the vectorAdd kernel, copies the result back and compares it with the
// sum computed on the host. Prints "Test PASSED" and returns 0 on success;
// exits with EXIT_FAILURE on any CUDA error or on the first mismatching element.
int main(void){
  int numElements = 50000;

  // Initialise the host-side test data.
  float *h_A = new float[numElements];
  float *h_B = new float[numElements];
  float *h_C = new float[numElements];
  for (int i = 0; i < numElements; ++i) {
    h_A[i] = rand()/(float)RAND_MAX;
    h_B[i] = rand()/(float)RAND_MAX;
  }

  // Allocate the GPU buffers and copy the inputs from host memory to the GPU.
  size_t size = numElements * sizeof(float);
  float *d_A = NULL; checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc of d_A");
  float *d_B = NULL; checkCuda(cudaMalloc((void **)&d_B, size), "cudaMalloc of d_B");
  float *d_C = NULL; checkCuda(cudaMalloc((void **)&d_C, size), "cudaMalloc of d_C");
  checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "host-to-device copy of A");
  checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "host-to-device copy of B");

  // Run the computation. The block count is rounded up so that every element
  // is covered even when numElements is not a multiple of the block size.
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
  // A kernel launch returns no status itself; launch errors must be queried.
  checkCuda(cudaGetLastError(), "vectorAdd kernel launch");

  // Fetch the result from the GPU.
  checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "device-to-host copy of C");

  // Release the GPU buffers.
  checkCuda(cudaFree(d_A), "cudaFree of d_A");
  checkCuda(cudaFree(d_B), "cudaFree of d_B");
  checkCuda(cudaFree(d_C), "cudaFree of d_C");

  // Verify the GPU result against the sum computed on the host.
  for (int i = 0; i < numElements; ++i) {
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
      fprintf(stderr, "Result verification failed at element %d!\n", i);
      exit(EXIT_FAILURE);
    }
  }

  // Release the host arrays. They were allocated with new[], so delete[] is
  // required; the device pointers were already released with cudaFree above.
  delete[] h_A;
  delete[] h_B;
  delete[] h_C;

  printf("Test PASSED\n");
  return 0;
}

接著手動編譯~

# Step 1 - compile only (-c): turn vectorAdd.cu into the object file vectorAdd.o.
# g++ is used as the host compiler (-ccbin) and a 64-bit build is requested (-m64).
# Each "-gencode arch=compute_XX,code=sm_XX" pair requests device code for that
# GPU architecture; the final pair with code=compute_70 requests compute_70 PTX
# (see the nvcc documentation for the exact semantics of -gencode).
/usr/local/cuda-9.0/bin/nvcc \
  -ccbin g++  \
  -m64 \
  -gencode arch=compute_30,code=sm_30 \
  -gencode arch=compute_35,code=sm_35 \
  -gencode arch=compute_37,code=sm_37 \
  -gencode arch=compute_50,code=sm_50 \
  -gencode arch=compute_52,code=sm_52 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_70,code=compute_70 \
  -c vectorAdd.cu -o vectorAdd.o

# Step 2 - link: produce the executable vectorAdd from vectorAdd.o, passing the
# same host compiler, word size and -gencode list as the compile step.
/usr/local/cuda-9.0/bin/nvcc \
  -ccbin g++ \
  -m64 \
  -gencode arch=compute_30,code=sm_30 \
  -gencode arch=compute_35,code=sm_35 \
  -gencode arch=compute_37,code=sm_37 \
  -gencode arch=compute_50,code=sm_50 \
  -gencode arch=compute_52,code=sm_52 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_70,code=compute_70 \
  vectorAdd.o -o vectorAdd