| 110 | | [[Image(nvidia_xserver.jpg)]] |
| 111 | | |
| 112 | | |
| 113 | | |
| 114 | | |
| 115 | | |
| 116 | | |
| | 110 | [[Image(query_gpu.jpg)]] |
| | 111 | |
| | 112 | 8.安裝CUDA 工具套件和CUDA 軟體開發套件[[br]] |
| | 113 | 進入終端機[[br]] |
| | 114 | $sudo sh ./cudatoolkit_2.3_linux_64_ubuntu9.04.run[[br]] |
| | 115 | $sudo sh ./cudasdk_2.3_linux.run[[br]] |
| | 116 | 過程中,所有選項按Enter會使用預設的設定。[[br]] |
| | 117 | 修改 etc 目錄下的 profile 檔[[br]] |
| | 118 | $cd /etc/[[br]] |
| | 119 | $sudo emacs profile (可用任意的編輯器)[[br]] |
| | 120 | 依照CUDA所在目錄,在檔案的最後加上[[br]] |
| | 121 | PATH=/usr/local/cuda/bin:$PATH [[br]] |
| | 122 | LD_LIBRARY_PATH=/usr/local/cuda/lib64[[br]] |
| | 123 | export PATH[[br]] |
| | 124 | export LD_LIBRARY_PATH[[br]] |
| | 125 | 這四行。[[br]] |
| | 126 | 如果是32 位元的作業系統,第二行改成LD_LIBRARY_PATH=/usr/local/cuda/lib[[br]] |
| | 127 | 如果是64 位元的作業系統,第二行改成LD_LIBRARY_PATH=/usr/local/cuda/lib64[[br]] |
| | 128 | 登出,再登入, 用CUDA程式測試,如果正確無誤,即完成安裝。[[br]] |
| | 129 | |
| | 130 | 若出現以下錯誤訊息[[br]] |
| | 131 | = ./a.out: error while loading shared libraries: libcudart.so.2: cannot open shared object file: No such file or directory = |
| | 132 | |
| | 133 | 若是64位元 命令列 [[br]] |
| | 134 | $sudo ln -sf /usr/local/cuda/lib64/libcudart.so.2.3 /lib64/libcudart.so.2[[br]] |
| | 135 | 如果是32位元可能需要加此行[[br]] |
| | 136 | $sudo ln -sf /usr/local/cuda/lib/libcudart.so.2.3 /lib/libcudart.so.2[[br]] |
| | 137 | |
| | 138 | 可以檢查(optional)[[br]] |
| | 139 | $emacs /etc/ld.so.conf[[br]] |
| | 140 | include /usr/local/cuda/lib[[br]] |
| | 141 | |
| | 142 | 若32位元 include /usr/local/cuda/lib[[br]] |
| | 143 | 若64位元 include /usr/local/cuda/lib64[[br]] |
| | 144 | |
| | 145 | 可用以下程式測試。[[br]] |
| | 146 | 用nvcc 編譯cuda 程式 number_add_1.cu。[[br]] |
| | 147 | 如果CPU的執行結果和GPU相同,代表GPU成功運作。[[br]] |
| | 148 | |
| | 149 | |
| | 150 | $nvcc number_add_1.cu[[br]] |
| | 151 | $./a.out[[br]] |
| | 152 | |
| | 153 | |
| | 154 | number_add_1.cu程式碼:[[br]] |
| | 155 | |
| | 156 | #include <stdio.h> [[br]] |
| | 157 | #include <stdlib.h>[[br]] |
| | 158 | #include <math.h>[[br]] |
| | 159 | #include <time.h>[[br]] |
| | 160 | #include <sys/time.h>[[br]] |
| | 161 | #include <iostream>[[br]] |
| | 162 | #include <iomanip>[[br]] |
| | 163 | using namespace std;[[br]] |
| | 164 | #define DATA_SIZE 1048576[[br]] |
| | 165 | int data[DATA_SIZE];[[br]] |
| | 166 | |
| | 167 | __global__ static void sumOfSquares(int *num, int* result){[[br]] |
| | 168 | int sum = 0;[[br]] |
| | 169 | int i;[[br]] |
| | 170 | for(i = 0; i < DATA_SIZE; i++) {[[br]] |
| | 171 | sum += num[i] * num[i];[[br]] |
| | 172 | }[[br]] |
| | 173 | |
| | 174 | *result = sum;[[br]] |
| | 175 | }[[br]] |
| | 176 | void GenerateNumbers(int *number ,int size){[[br]] |
| | 177 | int i;[[br]] |
| | 178 | for(i=0;i<size;i++){ [[br]] |
| | 179 | number[i]=rand() % 10; [[br]] //value = 0 to 9 [[br]] |
| | 180 | }[[br]] |
| | 181 | }[[br]] |
| | 182 | |
| | 183 | double wallclock(void){[[br]] |
| | 184 | struct timeval tv;[[br]] |
| | 185 | struct timezone tz;[[br]] |
| | 186 | double t;[[br]] |
| | 187 | |
| | 188 | gettimeofday(&tv, &tz);[[br]] |
| | 189 | t = (double)tv.tv_sec*1000;[[br]] |
| | 190 | t += ((double)tv.tv_usec)/1000.0;[[br]] |
| | 191 | |
| | 192 | return t;[[br]] |
| | 193 | }// millisecond[[br]] |
| | 194 | |
| | 195 | int main(){[[br]] |
| | 196 | cudaSetDevice(0); [[br]] // set device number[[br]] |
| | 197 | GenerateNumbers(data, DATA_SIZE);[[br]] |
| | 198 | |
| | 199 | int* gpudata,*result,sum;[[br]] |
| | 200 | double t1,t2;[[br]] |
| | 201 | |
| | 202 | cudaMalloc((void**) &gpudata, sizeof(int) * DATA_SIZE);[[br]] |
| | 203 | cudaMalloc((void**) &result, sizeof(int));[[br]] |
| | 204 | cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE,cudaMemcpyHostToDevice);[[br]] |
| | 205 | |
| | 206 | t1 = wallclock();[[br]] |
| | 207 | sumOfSquares<<<1, 1, 0>>>(gpudata, result);[[br]] |
| | 208 | cudaMemcpy(&sum, result, sizeof(int), cudaMemcpyDeviceToHost);[[br]] |
| | 209 | |
| | 210 | t2 = wallclock();[[br]] |
| | 211 | |
| | 212 | printf("Elapsed time = %f(ms) in GPU\n",t2-t1);[[br]] |
| | 213 | |
| | 214 | cudaFree(gpudata);[[br]] |
| | 215 | cudaFree(result);[[br]] |
| | 216 | |
| | 217 | printf("sum: %d\n", sum);[[br]] |
| | 218 | |
| | 219 | sum = 0;[[br]] |
| | 220 | for(int i = 0; i < DATA_SIZE; i++) {[[br]] |
| | 221 | sum += data[i] * data[i];[[br]] |
| | 222 | }[[br]] |
| | 223 | printf("sum (CPU): %d\n", sum);[[br]] |
| | 224 | }[[br]] |
| | 225 | |
| | 226 | |
| | 227 | |
| | 228 | |
| | 229 | |
| | 230 | |
| | 231 | |
| | 232 | |