install & test cuda 9.0 on ubuntu 16.04

安裝

# Download the CUDA 9.0.176 local-repository package for Ubuntu 16.04 (amd64).
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# Stop the lightdm display manager before installing
# (presumably so the GPU driver can be replaced while no X session is running).
sudo service lightdm stop
# Install the downloaded package, which sets up the local CUDA apt repository.
sudo dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# Trust the signing key that the package placed under /var/cuda-repo-9-0-local.
sudo apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# Refresh the package index, then install the "cuda" package without prompting.
sudo apt-get update && sudo apt-get install cuda -y
sudo reboot
# After the reboot: make nvcc reachable from /usr/bin via a symlink.
sudo ln -s /usr/local/cuda/bin/nvcc /usr/bin/nvcc

設定環境變數

編輯.bashrc

vim ~/.bashrc

加入

# Root of the CUDA toolkit; the other two variables are derived from it.
export CUDA_HOME=/usr/local/cuda
# Append the CUDA, CUPTI and driver library directories.
# ${VAR:+$VAR:} avoids a leading ':' (which would put the current directory on
# the search path) when LD_LIBRARY_PATH was previously empty.
# CUDA 9.0 requires the 384-series driver, hence nvidia-384 rather than nvidia-367.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:/usr/lib/nvidia-384
# Make nvcc and the other toolkit binaries available on PATH.
export PATH=$PATH:$CUDA_HOME/bin

測試編譯

方法1

# Build and run the vectorAdd sample located under the toolkit's samples tree.
# NOTE(review): the samples directory is presumably root-owned after a package
# install, so `make` may need write access there — confirm.
cd /usr/local/cuda/samples/0_Simple/vectorAdd
make
./vectorAdd

方法2

建立測試檔案vectorAdd.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * Element-wise vector addition: C[k] = A[k] + B[k] for every k < numElements.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
 * global index falls at or beyond numElements exit without touching memory,
 * so the grid may be rounded up to a whole number of blocks.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements){
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= numElements){
    return;  // tail thread of the last block: nothing to do
  }
  C[idx] = A[idx] + B[idx];
}

/* Abort with a readable message when a CUDA runtime call reports an error. */
static void checkCuda(cudaError_t status, const char *what){
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/*
 * Host driver for the vectorAdd kernel.
 *
 * Fills two host arrays with random values, adds them on the GPU and checks
 * the result against a CPU-side sum. Prints "Test PASSED" and returns 0 on
 * success; exits with EXIT_FAILURE on a CUDA error or a mismatching element.
 */
int main(void){
  const int numElements = 50000;
  const size_t size = numElements * sizeof(float);

  // Initialise the host-side test data.
  float *h_A = new float[numElements];
  float *h_B = new float[numElements];
  float *h_C = new float[numElements];
  for (int i = 0; i < numElements; ++i) {
    h_A[i] = rand()/(float)RAND_MAX;
    h_B[i] = rand()/(float)RAND_MAX;
  }

  // Allocate device buffers and copy the inputs from host memory to the GPU.
  float *d_A = NULL; checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
  float *d_B = NULL; checkCuda(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
  float *d_C = NULL; checkCuda(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");
  checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
  checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

  // Compute: round the grid up so every element gets a thread.
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
  // A kernel launch returns nothing; launch errors must be fetched explicitly.
  checkCuda(cudaGetLastError(), "vectorAdd launch");

  // Fetch the result; this blocking copy also reports kernel execution errors.
  checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");

  // Release the GPU buffers.
  checkCuda(cudaFree(d_A), "cudaFree(d_A)");
  checkCuda(cudaFree(d_B), "cudaFree(d_B)");
  checkCuda(cudaFree(d_C), "cudaFree(d_C)");

  // Verify against the CPU sum; remember the first mismatching element, if any.
  int failedAt = -1;
  for (int i = 0; i < numElements; ++i) {
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
      failedAt = i;
      break;
    }
  }

  // Release the host arrays (allocated with new[], so delete[] is required).
  delete[] h_A;
  delete[] h_B;
  delete[] h_C;

  if (failedAt >= 0) {
    fprintf(stderr, "Result verification failed at element %d!\n", failedAt);
    exit(EXIT_FAILURE);
  }

  printf("Test PASSED\n");
  return 0;
}

接著手動編譯~

# Step 1 - compile vectorAdd.cu into an object file (-c) as 64-bit code, using
# g++ as the host compiler. Each -gencode pair with code=sm_XX targets that GPU
# architecture; the final pair (code=compute_70) requests compute_70 PTX.
/usr/local/cuda-9.0/bin/nvcc \
  -ccbin g++  \
  -m64 \
  -gencode arch=compute_30,code=sm_30 \
  -gencode arch=compute_35,code=sm_35 \
  -gencode arch=compute_37,code=sm_37 \
  -gencode arch=compute_50,code=sm_50 \
  -gencode arch=compute_52,code=sm_52 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_70,code=compute_70 \
  -c vectorAdd.cu -o vectorAdd.o

# Step 2 - link the object file into the vectorAdd executable, passing the same
# architecture list as in the compile step.
/usr/local/cuda-9.0/bin/nvcc \
  -ccbin g++ \
  -m64 \
  -gencode arch=compute_30,code=sm_30 \
  -gencode arch=compute_35,code=sm_35 \
  -gencode arch=compute_37,code=sm_37 \
  -gencode arch=compute_50,code=sm_50 \
  -gencode arch=compute_52,code=sm_52 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_70,code=compute_70 \
  vectorAdd.o -o vectorAdd

NTP client for node js / particle

NTP 封包內容

0~3 Bytes:
   0                   1                   2                   3
   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  |LI | VN  |Mode |    Stratum     |     Poll      |  Precision   |
  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
4~7 Bytes:
  Estimated Error
8~11 Bytes:
  Estimated Drift Rate
12~15 Bytes:
  Reference Clock Identifier
16~23 Bytes:
  Reference Timestamp
24~31 Bytes:
  Originate Timestamp
32~39 Bytes:
  Receive Timestamp
40~47 Bytes:
  Transmit Timestamp

for node js (參考網路程式碼)

//refer: https://github.com/moonpyk/node-ntp-client/blob/master/lib/ntp-client.js

/**
 * Minimal NTP client.
 *
 * Sends a 48-byte client request to `server:port` over UDP, reads the server's
 * timestamp at byte offset 40 of the reply, and afterwards extrapolates the
 * server time from the local clock.
 *
 * Public members:
 *   server, port - the values passed to the constructor
 *   t1           - local time (seconds) of the last successful sync
 *   last_time    - server time (seconds) read at the last successful sync
 *   now()        - local time in seconds since the Unix epoch (fractional)
 *   time()       - estimated server time in seconds; starts a re-sync when the
 *                  last sync is more than 6 seconds old
 *
 * Until the first reply arrives, time() is based on the local clock.
 */
var NTPClient=function(server,port){
  var self=this;

  // (70 * 365 + 17) * 24 * 60 * 60: seconds between 1900 and 1970.
  var NTP_TO_UNIX_SECONDS=2208988800;
  var NTP_PACKET_SIZE=48;
  var REQUEST_TIMEOUT_MS=3000;
  var RESYNC_AFTER_SECONDS=6;

  // True while a request is in flight, so that frequent time() calls do not
  // open a new socket each time.
  var pending=false;

  self.now=function(){
    return new Date().getTime()/1000.0;
  };

  // Reads a big-endian unsigned 32-bit integer starting at `offset`.
  var intParser=function(packetBuffer,offset){
    var intpart=0;
    for (var i=0;i<=3;i++)
      intpart=256*intpart+packetBuffer[offset+i];
    return intpart;
  };

  // Decodes an 8-byte timestamp (32-bit seconds + 32-bit fraction) into
  // fractional seconds; values past the 1970 offset are rebased to Unix time.
  var timestampParser=function(packetBuffer,offset){
    var intpart=intParser(packetBuffer,offset);
    var fractpart=intParser(packetBuffer,offset+4);
    if(intpart>NTP_TO_UNIX_SECONDS)
      intpart-=NTP_TO_UNIX_SECONDS;
    return intpart+fractpart/0x100000000;
  };

  // Sends one request and updates t1/last_time when the reply arrives.
  var update=function(){
    if(pending) return;
    pending=true;

    // Zero-filled request; first byte 0x1B = LI 0, VN 3, Mode 3.
    var ntpData=Buffer.alloc(NTP_PACKET_SIZE);
    ntpData[0]=0x1B;

    var udp=require('dgram');
    var client=udp.createSocket("udp4");
    var finished=false;
    var timeout=null;

    // Tears the request down exactly once; returns false if already done.
    var finish=function(){
      if(finished) return false;
      finished=true;
      pending=false;
      clearTimeout(timeout);
      try { client.close(); } catch (e) { /* socket was not running */ }
      return true;
    };

    timeout=setTimeout(function(){
      if(finish()) console.log('[error]','timeout');
    },REQUEST_TIMEOUT_MS);

    client.on('error',function(err){
      if(finish()) console.log('[error]',err && err.message);
    });

    client.once('message',function(msg){
      if(!finish()) return;
      if(msg.length<NTP_PACKET_SIZE){
        console.log('[error]','short NTP reply');
        return;
      }
      self.t1=self.now();
      self.last_time=timestampParser(msg,40);
      console.log('======================',(self.last_time-self.t1));
    });

    client.send(ntpData,0,ntpData.length,port,server,function(err){
      if(err && finish()) console.log('[error]',err.message);
    });
  };

  // Seconds elapsed on the local clock since the last successful sync.
  var dt=function(){
    return self.now()-self.t1;
  };

  self.time=function(){
    if(dt()>RESYNC_AFTER_SECONDS)
      update();
    return self.last_time+dt();
  };

  // Construction: remember the target, seed the clock locally, start a sync.
  self.server=server;
  self.port=port;
  self.t1=self.now();
  self.last_time=self.now();
  update();
};



// Demo: create a client and print the local clock next to the estimated
// server time every 100 ms.
var server = "time.stdtime.gov.tw"; //Stratum=2 (second-tier time server)
//server="time.nc7j.com"; //Stratum=1 (first-tier time server)
server="10.31.160.1"; //Stratum=3
var port = 123;

// Declared with var: assigning to an undeclared name creates an implicit
// global and throws a ReferenceError in strict mode.
var ntp=new NTPClient(server,port);
console.log(ntp.time());
setInterval(function(){
  console.log(ntp.now(),ntp.time());
},100);

for Particle

/**
 * NTP client for Particle firmware.
 *
 * Sends a 48-byte request to the configured server over UDP, decodes the
 * timestamp at byte offset 40 of the reply, and then extrapolates the server
 * time using micros().
 *
 * time() returns the estimated server time in seconds and re-synchronises
 * when the last successful sync is more than 10 minutes old. If no reply has
 * ever been received, the estimate is counted from 0 at construction.
 */
class NTPClient{
  private:
    enum {
      NTP_PACKET_SIZE  = 48,
      POLL_INTERVAL_MS = 10,
      MAX_POLLS        = 300   // 300 polls * 10 ms = 3 s before giving up
    };
    double last_time;     // server time (seconds) decoded at the last sync
    unsigned long t1;     // micros() value captured at the last sync
    byte address[4];
    int port;
    UDP udp;

    // Reads a big-endian unsigned 32-bit integer starting at `offset`.
    unsigned long intParser(byte* packetBuffer, int offset){
      unsigned long value = 0;
      for (int i = 0; i < 4; ++i)
        value = (value << 8) | packetBuffer[offset + i];
      return value;
    }

    // Decodes an 8-byte timestamp (32-bit seconds + 32-bit fraction) into
    // fractional seconds.
    double timestampParser(byte* packetBuffer, int offset){
      unsigned long intpart = intParser(packetBuffer,offset);
      unsigned long fractpart=intParser(packetBuffer,offset+4);
      //2208988800 = (70 * 365 + 17) * 24 * 60 * 60: seconds between 1900 and 1970
      const unsigned long NTP_TO_UNIX_SECONDS = 2208988800UL;
      if(intpart>NTP_TO_UNIX_SECONDS)
        intpart-=NTP_TO_UNIX_SECONDS;
      return (double)intpart+(double)fractpart/4294967296.0;
    }

    // Sends one request and waits up to MAX_POLLS * POLL_INTERVAL_MS for the
    // reply. Returns the decoded server time, or 0 when no usable reply
    // arrived; in that case last_time and t1 keep their previous values.
    double update(){
      byte packetBuffer[NTP_PACKET_SIZE];
      memset(packetBuffer, 0, NTP_PACKET_SIZE);

      packetBuffer[0]=0x1B;

      this->udp.beginPacket(this->address, this->port);
      this->udp.write(packetBuffer, NTP_PACKET_SIZE);
      this->udp.endPacket();

      for(int poll=0; poll<MAX_POLLS; ++poll){
          // assumes parsePacket() returns the packet size in bytes — TODO confirm
          if ( this->udp.parsePacket() >= NTP_PACKET_SIZE ) {
              t1=micros();
              this->udp.read(packetBuffer, NTP_PACKET_SIZE);
              this->last_time=timestampParser(packetBuffer,40);
              return this->last_time;
          }
          delay(POLL_INTERVAL_MS);
      }
      return 0;
    }

    // Seconds elapsed since the last successful sync.
    double dt(){
      return (micros()-t1)/1000000.0;
    }
  public:
    // address: the server's four IPv4 address bytes; port: server UDP port.
    NTPClient(byte address[], int port=123)
      : last_time(0), t1(micros())
    {
      Time.zone(8);
      memcpy(this->address, address, 4);
      this->port=port;
      this->udp.begin(8888);    // arbitrary local port
      this->update();
    }
    ~NTPClient(){

    }
    double time(){
      if(this->dt()>600)  // force a re-sync every 10 minutes
        this->update();
      return this->last_time+this->dt();
    }
};

NTPClient *ntp;
byte address[] = {118,163,81,61}; //time.stdtime.gov.tw
ntp=new NTPClient(address,123);

CUDA rand (use MT19937)

To generate different random numbers on every run, the kernel must be given a seed from the CPU (e.g. time(0)).

// Host side: launch the kernel with the current time, time(0), as its seed argument.
cuda_kernel<<<grid,block>>>(time(0));

Inside the kernel, each parallel thread also adds its own offset to the seed, so that threads running at the same time generate different random numbers.

/*
 * Demo kernel: every thread prints one random float in the range [0, 10].
 *
 * Works with a 2D grid of 2D blocks: the thread's (x, y) position in the
 * whole grid is flattened to index = y * (gridDim.x * blockDim.x) + x.
 * Adding that index to the host-supplied seed gives each thread its own seed.
 *
 * Each thread constructs its own MTRand generator.
 */
__global__ void cuda_kernel(int seed)
{
  int index= (blockIdx.y*blockDim.y+threadIdx.y)*(gridDim.x*blockDim.x)+(blockIdx.x*blockDim.x+threadIdx.x);
  MTRand mtrand(seed+index);  //seed + offset:index
  printf("%5.2f",mtrand.randf(0,10));
}

MTRand Class code:

class MTRand
{
/*
  Mersenne Twister (MT19937) pseudo-random generator, usable from host and
  device code.
  source: http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c

  Each instance carries its own state array of __MTRand_N__ words, seeded in
  the constructor with 19650218 + seed.
*/
  private:
#define __MTRand_N__ 624
#define __MTRand_M__ 397
#define __MTRand_MATRIX_A__ 0x9908b0dfUL
#define __MTRand_UPPER_MASK__ 0x80000000UL
#define __MTRand_LOWER_MASK__ 0x7fffffffUL
    unsigned long mt[__MTRand_N__];   // generator state
    unsigned long mag01[2];           // {0, MATRIX_A}, selected by the low bit
    int mti;                          // index of the next state word to use

    // Fills the state array from a single seed value (low 32 bits are used).
    __host__ __device__
    void init_genrand(unsigned long s)
    {
      mt[0]= s & 0xffffffffUL;
      mti = 1;
      while (mti < __MTRand_N__) {
        const unsigned long prev = mt[mti-1];
        mt[mti] = (1812433253UL * (prev ^ (prev >> 30)) + mti) & 0xffffffffUL;
        ++mti;
      }
    }

    // Combines the top bit of `upper` with the low 31 bits of `lower` and
    // returns the shifted word, XOR-ed with MATRIX_A when its low bit is set.
    __host__ __device__
    unsigned long twist(unsigned long upper, unsigned long lower) const
    {
      const unsigned long y = (upper & __MTRand_UPPER_MASK__) | (lower & __MTRand_LOWER_MASK__);
      return (y >> 1) ^ mag01[y & 0x1UL];
    }

    // Regenerates all __MTRand_N__ state words and rewinds the read index.
    __host__ __device__
    void reload()
    {
      if (mti == __MTRand_N__+1) {
        init_genrand(5489UL);   // default seed when never initialised
      }
      int k = 0;
      while (k < __MTRand_N__-__MTRand_M__) {
        mt[k] = mt[k+__MTRand_M__] ^ twist(mt[k], mt[k+1]);
        ++k;
      }
      while (k < __MTRand_N__-1) {
        mt[k] = mt[k+(__MTRand_M__-__MTRand_N__)] ^ twist(mt[k], mt[k+1]);
        ++k;
      }
      mt[__MTRand_N__-1] = mt[__MTRand_M__-1] ^ twist(mt[__MTRand_N__-1], mt[0]);
      mti = 0;
    }

    // Returns the next 32-bit output word.
    __host__ __device__
    unsigned long genrand_int32(void)
    {
      if (mti >= __MTRand_N__) {
        reload();
      }
      unsigned long word = mt[mti++];
      // Tempering
      word ^= (word >> 11);
      word ^= (word << 7) & 0x9d2c5680UL;
      word ^= (word << 15) & 0xefc60000UL;
      word ^= (word >> 18);
      return word;
    }
  public:
    // Seeds the generator with 19650218 + seed.
    __host__ __device__
    MTRand(int seed)
      :mti(__MTRand_N__+1)
    {
      mag01[0]=0x0UL;
      mag01[1]=__MTRand_MATRIX_A__;
      init_genrand(19650218UL+seed);
    }

    // Next random 32-bit integer.
    __host__ __device__
    unsigned long rand()
    {
      return genrand_int32();
    }

    // Next random value scaled to the interval [0, 1].
    __host__ __device__
    float randf()
    {
      const double scale = 1.0/4294967295.0;
      return genrand_int32()*scale;
    }

    // Next random value scaled to [min, max]; the bounds may be given in
    // either order.
    __host__ __device__
    float randf(float min,float max)
    {
      const bool swapped = max<min;
      const float lo = swapped ? max : min;
      const float hi = swapped ? min : max;
      return randf()*(hi-lo)+lo;
    }
};

php.ini 常見設定

php.ini檔案位置/etc/php5/apache2/php.ini
php-fpm檔案位置/etc/php5/fpm/php.ini
而php cli位置應該是在 /etc/php5/cli/php.ini

sudo vim /etc/php5/apache2/php.ini
# or
sudo vim /etc/php5/fpm/php.ini
# or
sudo vim /etc/php5/cli/php.ini

可以用下列指令確認

php -i |grep php\.ini

我個人常用的設定為(for debug):

; Debug-oriented overrides.
; NOTE(review): post_max_size and upload_max_filesize below are larger than
; memory_limit — confirm the intended sizes.
short_open_tag=On
memory_limit=2000M
error_reporting=E_ALL & ~E_DEPRECATED & ~E_NOTICE & ~E_STRICT
; The directive name is display_errors (plural); "display_error" is not a PHP setting.
display_errors=On
post_max_size=20000M
upload_max_filesize=20000M
max_file_uploads=200

其中,memory_limit > post_max_size > upload_max_filesize

修改之後,得要重新啟動apache2

sudo /etc/init.d/apache2 restart
# or
sudo service apache2 restart

vim 快速修改方式:

#for apache2
sudo vim /etc/php5/apache2/php.ini
:1,$s/short_open_tag\ =\ Off/short_open_tag\ =\ On/g
:1,$s/memory_limit\ =\ 128M/memory_limit\ =\ 2000M/g
:1,$s/display_errors\ =\ Off/display_errors\ =\ On/g
:1,$s/post_max_size\ =\ 8M/post_max_size\ =\ 2000M/g
:1,$s/upload_max_filesize\ =\ 2M/upload_max_filesize\ =\ 2000M/g
:1,$s/max_file_uploads\ =\ 20/max_file_uploads\ =\ 200/g
/error_reporting
/
:1,$s/error_reporting\ =\ E_ALL\ &\ \~E_DEPRECATED\ &\ \~E_STRICT//g
aerror_reporting = E_ALL & ~E_DEPRECATED & ~E_NOTICE & ~E_STRICT
#for cli
sudo vim /etc/php5/cli/php.ini
:1,$s/short_open_tag\ =\ Off/short_open_tag\ =\ On/g
:1,$s/memory_limit\ =\ 128M/memory_limit\ =\ 2000M/g
:1,$s/display_errors\ =\ Off/display_errors\ =\ On/g
:1,$s/post_max_size\ =\ 8M/post_max_size\ =\ 2000M/g
:1,$s/upload_max_filesize\ =\ 2M/upload_max_filesize\ =\ 2000M/g
:1,$s/max_file_uploads\ =\ 20/max_file_uploads\ =\ 200/g
/error_reporting
/
:1,$s/error_reporting\ =\ E_ALL\ &\ \~E_DEPRECATED\ &\ \~E_STRICT//g
aerror_reporting = E_ALL & ~E_DEPRECATED & ~E_NOTICE & ~E_STRICT