#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128
#include "cuda_code.h"
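// Aborts with a diagnostic if the most recent CUDA call reported an error.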
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        printf("Cuda error: %s : %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/// ACTIVATION
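// Applies the activation function selected by functionType to a single value.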
__device__
float Func(float number, FunctionType functionType)
{
    switch (functionType) {
        case FT_BINARY_STEP:
            if (number > 0) {
                return 1;
            } else {
                return 0;
            }
        case FT_BIPOLAR_STEP:
            if (number > 0) {
                return 1;
            } else {
                return -1;
            }
        case SIGMOID:
            // Standard logistic function (denominator is 1 + e^-x, matching the
            // bipolar sigmoid below).
            return 1.0f / (1.0f + exp(-number));
        case FT_BIPOLAR_SIGMOID:
            return -1.0f + (2.0f / (1.0f + exp(-number)));
        case FT_HYPERBOLIC_TANGENT:
            return tanh(number);
        case FT_IDENTITY:
        default:
            return number;
    }
}
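// Returns the smaller of two unsigned values.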
__device__
unsigned device_min(unsigned a, unsigned b)
{
    if (a < b)
        return a;
    return b;
}
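// One thread per output element: applies the activation function to the
// difference between each result and its threshold.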
__global__
void activation_float_kernel(float* results, float* thresholds, float* output, unsigned output_sz,
                             FunctionType functionType)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < output_sz) {
        output[idx] = Func(results[idx] - thresholds[idx], functionType);
    }
}
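// Each thread packs up to BITS_PER_UNSIGNED activations into one unsigned word,
// most significant bit first: a bit is set when result - threshold is positive.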
__global__
void activation_bit_kernel(float* results, float* thresholds, unsigned* output, unsigned output_sz)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned offset = idx * BITS_PER_UNSIGNED;
    if (output_sz > offset) {
        unsigned toRead = device_min(BITS_PER_UNSIGNED, output_sz - offset);
        unsigned threadOutput = 0;
        unsigned mask = 0x80000000;
        for (unsigned i = 0; i < toRead; i++) {
            unsigned pos = offset + i;
            if (results[pos] - thresholds[pos] > 0) {
                threadOutput |= mask;
            } else {
                threadOutput &= ~mask;
            }
            mask >>= 1;
        }
        output[idx] = threadOutput;
    }
}
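// Host-side dispatcher: selects the kernel matching the output buffer type and
// sizes the grid so each output element (or 32-bit word of packed bits) gets a thread.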
extern "C" void cuda_activation(void* output, unsigned size, BufferType bufferType, float* results,
                                float* thresholds, FunctionType functionType, unsigned block_size)
{
    unsigned grid_size;
    switch (bufferType) {
        case BT_BYTE:
        {
            std::string error = "cuda_activation is not implemented for BufferType BYTE.";
            throw error;
        }
        case BT_FLOAT:
        {
            grid_size = ((size - 1) / block_size) + 1;
            activation_float_kernel<<< grid_size, block_size >>>(results, thresholds, (float*)output, size, functionType);
        }
        break;
        case BT_BIT:
        case BT_SIGN:
        {
            grid_size = ((size - 1) / (block_size * BITS_PER_UNSIGNED)) + 1;
            activation_bit_kernel<<< grid_size, block_size >>>(results, thresholds, (unsigned*)output, size);
        }
        break;
    }
    checkCUDAError("activation");
}

// MEMORY MANAGEMENT
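// Thin wrappers around the CUDA runtime allocation and copy calls, each
// followed by an error check.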
extern "C" void* cuda_malloc(unsigned byteSize)
{
    void* ptr;
    cudaMalloc((void**) &(ptr), byteSize);
    checkCUDAError("malloc");
    return ptr;
}

extern "C" void cuda_free(void* d_ptr)
{
    cudaFree(d_ptr);
    checkCUDAError("free");
}

extern "C" void cuda_copyToDevice(void* d_dest, void* h_src, unsigned count)
{
    cudaMemcpy(d_dest, h_src, count, cudaMemcpyHostToDevice);
    checkCUDAError("copyToDevice");
}

extern "C" void cuda_copyToHost(void* h_dest, void* d_src, unsigned count)
{
    cudaMemcpy(h_dest, d_src, count, cudaMemcpyDeviceToHost);
    checkCUDAError("copyToHost");
}

// INITIALIZATION
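// Fills data[0..size) with a constant value, one element per thread.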
template <class bufferType>
__global__
void SetValueToAnArrayKernel(bufferType* data, unsigned size, bufferType value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)
        data[idx] = value;
}
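// Zeroes a device buffer, choosing the element type (and kernel instantiation)
// from bufferType; packed bit/sign buffers are cleared with cudaMemset.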
extern "C" void cuda_setZero(void* data, unsigned byteSize, BufferType bufferType, unsigned block_size)
{
    unsigned grid_size;
    unsigned size;
    switch (bufferType) {
        case BT_BYTE:
            size = byteSize / sizeof(unsigned char);
            grid_size = ((size - 1) / block_size) + 1;
            SetValueToAnArrayKernel<unsigned char><<< grid_size, block_size >>>((unsigned char*)data, size, (unsigned char)0);
            break;
        case BT_FLOAT:
            size = byteSize / sizeof(float);
            grid_size = ((size - 1) / block_size) + 1;
            SetValueToAnArrayKernel<float><<< grid_size, block_size >>>((float*)data, size, 0);
            break;
        case BT_BIT:
        case BT_SIGN:
            cudaMemset(data, 0, byteSize);
            break;
    }
}

// GENETIC OPERATORS
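// Crossover: each thread loads one 32-bit word of the crossover mask and swaps
// buffer1/buffer2 elements at positions strided blockDim.x apart, consuming one
// mask bit (MSB first) per step.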
template <class type>
__global__
void crossoverKernel(type* buffer1, type* buffer2, unsigned* bitBuffer, unsigned size)
{
    unsigned weighPos = (blockIdx.x * blockDim.x * BITS_PER_UNSIGNED) + threadIdx.x;
    unsigned maxPosForThisBlock = device_min((blockIdx.x + 1) * blockDim.x * BITS_PER_UNSIGNED,
                                             size);
    unsigned bitsForTheThread, mask;
    if (weighPos < maxPosForThisBlock) {
        bitsForTheThread = bitBuffer[(blockIdx.x * blockDim.x) + threadIdx.x];
        mask = 0x80000000;
    }
    __syncthreads();
    while (weighPos < maxPosForThisBlock) {
        if (mask & bitsForTheThread) {
            type aux = buffer1[weighPos];
            buffer1[weighPos] = buffer2[weighPos];
            buffer2[weighPos] = aux;
        }
        weighPos += blockDim.x;
        mask >>= 1;
    }
}
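// Host-side dispatcher for the crossover kernel: one mask word per thread.
// Packed bit/sign buffers are not supported.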
extern "C" void cuda_crossover(void* buffer1, void* buffer2, unsigned* bitBuffer, unsigned size,
                               BufferType bufferType, unsigned block_size)
{
    unsigned grid_size = ((size - 1) / (block_size * BITS_PER_UNSIGNED)) + 1;
    switch (bufferType) {
        case BT_BYTE:
            crossoverKernel<unsigned char><<< grid_size, block_size >>>
                ((unsigned char*)buffer1, (unsigned char*)buffer2, (unsigned*)bitBuffer, size);
            break;
        case BT_FLOAT:
            crossoverKernel<float><<< grid_size, block_size >>>
                ((float*)buffer1, (float*)buffer2, (unsigned*)bitBuffer, size);
            break;
        case BT_BIT:
        case BT_SIGN:
        {
            std::string error = "cuda_crossover is not implemented for BufferType BIT nor SIGN.";
            throw error;
        }
    }
}
//TODO CU is a kernel really necessary for this??
__global__
void resetFloatKernel(float* buffer, unsigned pos)
{
    if (threadIdx.x == 0) {
        buffer[pos] = 0;
    }
}

__global__
void resetByteKernel(unsigned char* buffer, unsigned pos)
{
    if (threadIdx.x == 0) {
        buffer[pos] = 128;
    }
}

__global__
void mutateFloatKernel(float* buffer, unsigned pos, float mutation)
{
    if (threadIdx.x == 0) {
        buffer[pos] += mutation;
    }
}

__global__
void mutateByteKernel(unsigned char* buffer, unsigned pos, int mutation)
{
    if (threadIdx.x == 0) {
        int result = mutation + buffer[pos];
        if (result <= 0) {
            buffer[pos] = 0;
        } else if (result >= 255) {
            buffer[pos] = 255;
        } else {
            buffer[pos] = (unsigned char) result;
        }
    }
}

extern "C" void cuda_mutateWeigh(void* buffer, unsigned pos, float mutation, BufferType bufferType)
{
    switch (bufferType) {
        case BT_BYTE:
            mutateByteKernel<<< 1, 8 >>>((unsigned char*)buffer, pos, (int)mutation);
            break;
        case BT_FLOAT:
            mutateFloatKernel<<< 1, 8 >>>((float*)buffer, pos, mutation);
            break;
        case BT_BIT:
        case BT_SIGN:
        {
            std::string error = "cuda_mutateWeigh is not implemented for BufferType BIT nor SIGN.";
            throw error;
        }
    }
}

extern "C" void cuda_resetWeigh(void* buffer, unsigned pos, BufferType bufferType)
{
    switch (bufferType) {
        case BT_BYTE:
            resetByteKernel<<< 1, 8 >>>((unsigned char*)buffer, pos);
            break;
        case BT_FLOAT:
            resetFloatKernel<<< 1, 8 >>>((float*)buffer, pos);
            break;
        case BT_BIT:
        case BT_SIGN:
        {
            std::string error = "cuda_resetWeigh is not implemented for BufferType BIT nor SIGN.";
            throw error;
        }
    }
}
// CL_LAYER CALCULATION
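// Weighted sum for float inputs: the input vector is staged in shared memory,
// then each thread accumulates the dot product for one output neuron.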
__global__
void SumFloatsConnectionsKernel(float* inputs, unsigned input_size, unsigned output_size, float* weighs,
                                float* results)
{
    extern __shared__ float sdata[];
    unsigned outputNeuron = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned weighsOffset = outputNeuron * input_size;
    float result = 0;
    unsigned pos = threadIdx.x;
    while (pos < input_size) {
        sdata[pos] = inputs[pos];
        pos += blockDim.x;
    }
    __syncthreads();
    if (outputNeuron < output_size) {
        for (unsigned i = 0; i < input_size; i++) {
            result += sdata[i] * weighs[weighsOffset + i];
            //printf(" weigh %f ", weighs[weighsOffset + i]);
        }
        //TODO TR another option:
        /* if (blockDim.x <= input_size){
            unsigned pos = tid;
            while (pos < input_size){
                result += sdata[pos] * weighs[weighsOffset + pos];
                ++pos;
            }
            pos = 0;
            while (pos < tid){
                result += sdata[pos] * weighs[weighsOffset + pos];
                ++pos;
            }
        } else {
            unsigned pos = tid;
            while (pos < input_size){
                result += sdata[pos] * weighs[weighsOffset + pos];
                ++pos;
            }
            unsigned newMax = device_min(tid, input_size);
            pos = 0;
            while (pos < newMax){
                result += sdata[pos] * weighs[weighsOffset + pos];
                ++pos;
            }
        }*/
        results[outputNeuron] += result;
    }
}
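// Weighted sum for packed bit/sign inputs. Byte weighs are stored biased by 128:
// a set input bit adds (weigh - 128); for BT_SIGN a cleared bit subtracts it instead.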
template <BufferType inputType>
__global__
void SumBitsConnectionsKernel(unsigned* inputs, unsigned input_size, unsigned output_size, unsigned char* weighs, float* results)
{
    extern __shared__ unsigned shared_inputs[];
    unsigned tid = threadIdx.x;
    unsigned input_blocks_to_read = ((input_size - 1) / BITS_PER_UNSIGNED) + 1;
    unsigned readingLoops = ((input_blocks_to_read - 1) / blockDim.x) + 1;
    unsigned pos = tid;
    for (unsigned i = 0; i < readingLoops; i++) {
        if (pos < input_blocks_to_read) {
            shared_inputs[pos] = inputs[pos];
        }
        pos += blockDim.x;
    }
    __syncthreads();
    unsigned outputNeuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (outputNeuron < output_size) {
        float result = 0;
        unsigned weighsOffset = (outputNeuron * input_size);
        for (unsigned i = 0; i < input_blocks_to_read; i++) {
            //TODO TCC check performance penalty (this is just for BT_SIGN)
            unsigned maxBits = device_min(BITS_PER_UNSIGNED, input_size - (i * BITS_PER_UNSIGNED));
            unsigned input_block = shared_inputs[i];
            unsigned mask = 0x80000000;
            for (unsigned j = 0; j < maxBits; j++) {
                if (input_block & mask) {
                    result += weighs[weighsOffset] - 128;
                } else {
                    if (inputType == BT_SIGN) {
                        result += 128 - weighs[weighsOffset];
                    }
                }
                ++weighsOffset;
                mask >>= 1;
            }
        }
        results[outputNeuron] += result;
    }
}
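// Same as SumFloatsConnectionsKernel but with the weighs matrix stored transposed:
// the weigh for (input i, output j) lives at weighs[j + i * output_size].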
__global__
void SumFloatsInvertedConnectionsKernel(float* inputs, unsigned input_size, float* weighs, float* results,
                                        unsigned output_size)
{
    extern __shared__ float sdata[];
    unsigned input_pos = threadIdx.x;
    while (input_pos < input_size) {
        sdata[input_pos] = inputs[input_pos];
        input_pos += blockDim.x;
    }
    __syncthreads();
    unsigned output_pos = blockIdx.x * blockDim.x + threadIdx.x;
    float result = 0;
    if (output_pos < output_size) {
        for (unsigned i = 0; i < input_size; i++) {
            result += sdata[i] * weighs[output_pos + (i * output_size)];
        }
        results[output_pos] += result;
    }
}
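// Transposed-matrix variant of SumBitsConnectionsKernel: consecutive bits of an
// input word address weigh rows output_size elements apart.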
template <BufferType inputType>
__global__
void SumBitsInvertedConnectionsKernel(unsigned* inputs, unsigned input_size, unsigned output_size, unsigned char* weighs, float* results)
{
    extern __shared__ unsigned shared_inputs[];
    unsigned tid = threadIdx.x;
    unsigned input_blocks_to_read = ((input_size - 1) / BITS_PER_UNSIGNED) + 1;
    unsigned readingLoops = ((input_blocks_to_read - 1) / blockDim.x) + 1;
    unsigned pos = tid;
    for (unsigned i = 0; i < readingLoops; i++) {
        if (pos < input_blocks_to_read) {
            shared_inputs[pos] = inputs[pos];
        }
        pos += blockDim.x;
    }
    __syncthreads();
    unsigned outputNeuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (outputNeuron < output_size) {
        float result = 0;
        for (unsigned i = 0; i < input_blocks_to_read; i++) {
            //TODO TCC check performance penalty (this is just for BT_SIGN)
            unsigned maxBits = device_min(BITS_PER_UNSIGNED, input_size - (i * BITS_PER_UNSIGNED));
            unsigned weighsOffset = (i * BITS_PER_UNSIGNED * output_size) + outputNeuron;
            unsigned input_block = shared_inputs[i];
            unsigned mask = 0x80000000;
            for (unsigned j = 0; j < maxBits; j++) {
                if (input_block & mask) {
                    result += weighs[weighsOffset] - 128;
                } else {
                    if (inputType == BT_SIGN) {
                        result += 128 - weighs[weighsOffset];
                    }
                }
                weighsOffset += output_size;
                mask >>= 1;
            }
        }
        results[outputNeuron] += result;
    }
}
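// Dispatches the straight-matrix weighted-sum kernels, staging the whole input
// vector in shared memory; the size caps below appear to correspond to the
// 16 KB shared memory of early CUDA devices (4032 floats, or 16128 bytes of
// packed bit words).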
extern "C" void cuda_inputCalculation(void* inputPtr, unsigned input_size, BufferType inputType,
                                      unsigned output_size, void* weighs, float* results, unsigned block_size)
{
    unsigned grid_size = ((output_size - 1) / block_size) + 1;
    unsigned shared_mem_size;
    if (inputType == BT_BYTE) {
        std::string error = "cuda_inputCalculation is not implemented for BufferType BYTE as input.";
        throw error;
    } else if (inputType == BT_FLOAT) {
        if (input_size > 4032) {
            std::string error = "The maximum float input size is 4032.";
            throw error;
        }
        shared_mem_size = input_size * sizeof(float);
        SumFloatsConnectionsKernel<<< grid_size, block_size, shared_mem_size >>>((float*)inputPtr, input_size, output_size, (float*)weighs, results);
    } else {
        shared_mem_size = (((input_size - 1) / BITS_PER_UNSIGNED) + 1) * sizeof(unsigned);
        if (shared_mem_size > 16128) {
            // 16128 * 8
            std::string error = "The maximum bit/sign input size is 129024.";
            throw error;
        }
        if (inputType == BT_BIT) {
            SumBitsConnectionsKernel<BT_BIT><<< grid_size, block_size, shared_mem_size >>>((unsigned*)inputPtr, input_size, output_size, (unsigned char*)weighs, results);
        } else {
            SumBitsConnectionsKernel<BT_SIGN><<< grid_size, block_size, shared_mem_size >>>((unsigned*)inputPtr, input_size, output_size, (unsigned char*)weighs, results);
        }
    }
}
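// Transposed-matrix dispatcher: inputs larger than the shared-memory budget are
// processed in chunks, advancing the input and weigh pointers between launches.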
extern "C" void cuda_inputCalculationInvertedMatrix(void* inputPtr, unsigned input_size, BufferType inputType,
                                                    unsigned output_size, void* weighs, float* results,
                                                    unsigned block_size)
{
    unsigned grid_size = ((output_size - 1) / block_size) + 1;
    unsigned shared_mem_size;
    if (inputType == BT_BYTE) {
        std::string error = "cuda_inputCalculationInvertedMatrix is not implemented for BufferType BYTE as input.";
        throw error;
    } else if (inputType == BT_FLOAT) {
        while (input_size > CUDA_MAX_SHARED_FLOATS) {
            shared_mem_size = CUDA_MAX_SHARED_FLOATS * sizeof(float);
            SumFloatsInvertedConnectionsKernel<<< grid_size, block_size, shared_mem_size >>>((float*)inputPtr, CUDA_MAX_SHARED_FLOATS, (float*)weighs, results, output_size);
            inputPtr = (void*) ((float*) inputPtr + CUDA_MAX_SHARED_FLOATS);
            weighs = (void*) ((float*) weighs + (CUDA_MAX_SHARED_FLOATS * output_size));
            input_size -= CUDA_MAX_SHARED_FLOATS;
        }
        shared_mem_size = input_size * sizeof(float);
        SumFloatsInvertedConnectionsKernel<<< grid_size, block_size, shared_mem_size >>>((float*)inputPtr, input_size, (float*)weighs, results, output_size);
    } else {
        //TODO TCC this part does not work correctly
        while (input_size > CUDA_MAX_SHARED_BITS) {
            // Shared memory must hold CUDA_MAX_SHARED_BITS / BITS_PER_UNSIGNED packed words.
            shared_mem_size = (CUDA_MAX_SHARED_BITS / BITS_PER_UNSIGNED) * sizeof(unsigned);
            // TODO TCC test without emulation
            // printf("grid_size %d, block_size %d, shared_mem_size %d \n", grid_size, block_size, shared_mem_size);
            if (inputType == BT_BIT) {
                SumBitsInvertedConnectionsKernel<BT_BIT><<< grid_size, block_size, shared_mem_size >>>((unsigned*)inputPtr, CUDA_MAX_SHARED_BITS, output_size, (unsigned char*)weighs, results);
            } else {
                SumBitsInvertedConnectionsKernel<BT_SIGN><<< grid_size, block_size, shared_mem_size >>>((unsigned*)inputPtr, CUDA_MAX_SHARED_BITS, output_size, (unsigned char*)weighs, results);
            }
            // Advance by the number of packed words consumed, and by one byte weigh
            // per input bit per output neuron (the weighs are bytes, not floats).
            inputPtr = (void*)((unsigned*)inputPtr + (CUDA_MAX_SHARED_BITS / BITS_PER_UNSIGNED));
            weighs = (void*)((unsigned char*)weighs + (CUDA_MAX_SHARED_BITS * output_size));
            input_size -= CUDA_MAX_SHARED_BITS;
        }
        shared_mem_size = (((input_size - 1) / BITS_PER_UNSIGNED) + 1) * sizeof(unsigned);
        // TODO TCC test without emulation
        //printf("grid_size %d, block_size %d, shared_mem_size %d \n", grid_size, block_size, shared_mem_size);
        if (inputType == BT_BIT) {
            SumBitsInvertedConnectionsKernel<BT_BIT><<< grid_size, block_size, shared_mem_size >>>((unsigned*)inputPtr, input_size, output_size, (unsigned char*)weighs, results);
        } else {
            SumBitsInvertedConnectionsKernel<BT_SIGN><<< grid_size, block_size, shared_mem_size >>>((unsigned*)inputPtr, input_size, output_size, (unsigned char*)weighs, results);
        }
    }
}
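// Reduction variant: one block per output neuron. Each thread accumulates a
// partial dot product, then the block reduces sdata[] in shared memory with the
// loop unrolled over the compile-time blockSize.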
template <unsigned int blockSize, BufferType inputType>
__global__
void SumConnectionsKernel(void* inputPtr, unsigned input_size, unsigned output_size, void* weighs, float* results)
{
    extern __shared__ float sdata[];
    unsigned weighsOffset = (blockIdx.x * input_size);
    float result = 0;
    unsigned i = threadIdx.x;
    if (inputType == BT_FLOAT) {
        while (i < input_size) {
            result += ((float*)inputPtr)[i] * ((float*)weighs)[weighsOffset + i];
            i += blockDim.x;
        }
    } else {
        weighsOffset += threadIdx.x * BITS_PER_UNSIGNED;
        unsigned input_blocks_to_read = ((input_size - 1) / BITS_PER_UNSIGNED) + 1;
        while (i < input_blocks_to_read) {
            //TODO TCC check performance penalty (this is just for BT_SIGN)
            unsigned maxBits = device_min(BITS_PER_UNSIGNED, input_size - (i * BITS_PER_UNSIGNED));
            unsigned mask = 0x80000000;
            unsigned currentInput = ((unsigned*)inputPtr)[i];
            for (unsigned j = 0; j < maxBits; j++) {
                if (currentInput & mask) {
                    result += ((unsigned char*)weighs)[weighsOffset + j] - 128;
                } else {
                    if (inputType == BT_SIGN) {
                        result -= ((unsigned char*)weighs)[weighsOffset + j] - 128;
                    }
                }
                mask >>= 1;
            }
            i += blockSize;
            weighsOffset += blockDim.x * BITS_PER_UNSIGNED;
        }
    }
    unsigned tid = threadIdx.x;
    sdata[tid] = result;
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }
#ifdef __DEVICE_EMULATION__
    if (blockSize >= 64) { if (tid < 32) { sdata[tid] += sdata[tid + 32]; } __syncthreads(); }
    if (blockSize >= 32) { if (tid < 16) { sdata[tid] += sdata[tid + 16]; } __syncthreads(); }
    if (blockSize >= 16) { if (tid <  8) { sdata[tid] += sdata[tid +  8]; } __syncthreads(); }
    if (blockSize >=  8) { if (tid <  4) { sdata[tid] += sdata[tid +  4]; } __syncthreads(); }
    if (blockSize >=  4) { if (tid <  2) { sdata[tid] += sdata[tid +  2]; } __syncthreads(); }
    if (blockSize >=  2) { if (tid <  1) { sdata[tid] += sdata[tid +  1]; } __syncthreads(); }
#else
    if (tid < 32) {
        // Warp-synchronous final steps: volatile keeps the compiler from caching
        // shared-memory values in registers between the implicit warp-wide steps.
        volatile float* vsdata = sdata;
        if (blockSize >= 64) vsdata[tid] += vsdata[tid + 32];
        if (blockSize >= 32) vsdata[tid] += vsdata[tid + 16];
        if (blockSize >= 16) vsdata[tid] += vsdata[tid +  8];
        if (blockSize >=  8) vsdata[tid] += vsdata[tid +  4];
        if (blockSize >=  4) vsdata[tid] += vsdata[tid +  2];
        if (blockSize >=  2) vsdata[tid] += vsdata[tid +  1];
    }
#endif
    if (tid == 0) {
        results[blockIdx.x] += sdata[0];
    }
}
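// Dispatches SumConnectionsKernel with one block per output neuron; blockSize
// must be a compile-time template argument for the unrolled reduction, hence
// the switch over block_size.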
extern "C" void cuda_inputCalculationReduction(void* inputPtr, unsigned input_size, BufferType inputType,
                                               unsigned output_size, void* weighs, float* results,
                                               unsigned block_size)
{
    unsigned grid_size = output_size;
    unsigned shared_mem_size = block_size * sizeof(float);
    if (inputType == BT_BYTE) {
        std::string error = "cuda_inputCalculationReduction is not implemented for BufferType BYTE as input.";
        throw error;
    } else if (inputType == BT_FLOAT) {
        switch (block_size) {
            case 512: SumConnectionsKernel<512, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case 256: SumConnectionsKernel<256, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case 128: SumConnectionsKernel<128, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  64: SumConnectionsKernel< 64, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  32: SumConnectionsKernel< 32, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  16: SumConnectionsKernel< 16, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   8: SumConnectionsKernel<  8, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   4: SumConnectionsKernel<  4, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   2: SumConnectionsKernel<  2, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   1: SumConnectionsKernel<  1, BT_FLOAT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
        }
    } else if (inputType == BT_BIT) {
        switch (block_size) {
            case 512: SumConnectionsKernel<512, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case 256: SumConnectionsKernel<256, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case 128: SumConnectionsKernel<128, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  64: SumConnectionsKernel< 64, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  32: SumConnectionsKernel< 32, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  16: SumConnectionsKernel< 16, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   8: SumConnectionsKernel<  8, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   4: SumConnectionsKernel<  4, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   2: SumConnectionsKernel<  2, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   1: SumConnectionsKernel<  1, BT_BIT><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
        }
    } else {
        switch (block_size) {
            case 512: SumConnectionsKernel<512, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case 256: SumConnectionsKernel<256, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case 128: SumConnectionsKernel<128, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  64: SumConnectionsKernel< 64, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  32: SumConnectionsKernel< 32, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case  16: SumConnectionsKernel< 16, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   8: SumConnectionsKernel<  8, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   4: SumConnectionsKernel<  4, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   2: SumConnectionsKernel<  2, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
            case   1: SumConnectionsKernel<  1, BT_SIGN><<< grid_size, block_size, shared_mem_size >>>(inputPtr, input_size, output_size, weighs, results); break;
        }
    }
    checkCUDAError("cuda_inputCalculationReduction");
}