#include <stdio.h>
#include <float.h>

#include "include/convolution.h"
#include "../include/colors.h"
#include "../include/utils.h"
#include "include/make.h"

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8

float max_flt(float a, float b) {
    // Return the larger of the two floats
    if (a > b) {
        return a;
    }
    return b;
}

/*
 * Average Pooling
 */
#ifdef __CUDACC__
__global__ void make_average_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
    // Respective equivalents of i, j and k in the loop run by the CPU version
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim

    int n = size*size;

    if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
        return;
    }

    // Sum the size*size window and divide by its area
    float sum = 0.f;
    for (int a=0; a < size; a++) {
        for (int b=0; b < size; b++) {
            sum += input[idx][size*idy +a][size*idz +b];
        }
    }
    output[idx][idy][idz] = sum/(float)n;
}

void make_average_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
    // Make computation
    dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    make_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void make_average_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
    // input[output_depth][output_dim*size][output_dim*size]
    // output[output_depth][output_dim][output_dim]
    float sum;
    int n = size*size;

    for (int i=0; i < output_depth; i++) {
        for (int j=0; j < output_dim; j++) {
            for (int k=0; k < output_dim; k++) {
                // Sum the size*size window and divide by its area
                sum = 0.f;
                for (int a=0; a < size; a++) {
                    for (int b=0; b < size; b++) {
                        sum += input[i][size*j +a][size*k +b];
                    }
                }
                output[i][j][k] = sum/(float)n;
            }
        }
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
    #ifndef __CUDACC__
    make_average_pooling_cpu(input, output, size, output_depth, output_dim);
    #else
    make_average_pooling_device(input, output, size, output_depth, output_dim);
    #endif
}
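/*
 * Illustrative usage sketch (an assumption, not part of the original code base):
 * pools a tiny 1x4x4 input down to 1x2x2 through the CPU path. The stack-built
 * pointer tables only mimic the float*** layout that make_average_pooling_cpu
 * indexes; the real project may allocate its tensors differently.
 */
void example_average_pooling_usage() {
    float in_r0[4] = { 0.f,  1.f,  2.f,  3.f};
    float in_r1[4] = { 4.f,  5.f,  6.f,  7.f};
    float in_r2[4] = { 8.f,  9.f, 10.f, 11.f};
    float in_r3[4] = {12.f, 13.f, 14.f, 15.f};
    float* in_rows[4] = {in_r0, in_r1, in_r2, in_r3};
    float** in_planes[1] = {in_rows};

    float out_r0[2];
    float out_r1[2];
    float* out_rows[2] = {out_r0, out_r1};
    float** out_planes[1] = {out_rows};

    // size=2, output_depth=1, output_dim=2: each output cell averages a 2x2 window
    make_average_pooling_cpu(in_planes, out_planes, 2, 1, 2);
    printf("out[0][0][0] = %f\n", out_planes[0][0][0]); // (0+1+4+5)/4 = 2.5
}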
/*
 * Max Pooling
 */
#ifdef __CUDACC__
__global__ void make_max_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
    // Respective equivalents of i, j and k in the loop run by the CPU version
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim

    if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
        return;
    }

    float m = -FLT_MAX; // lowest possible float, so negative inputs are handled correctly
    float temp;

    for (int a=0; a < size; a++) {
        for (int b=0; b < size; b++) {
            temp = input[idx][size*idy +a][size*idz +b];
            m = m > temp ? m : temp; // max(m, temp)
        }
    }
    output[idx][idy][idz] = m;
}

void make_max_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
    // Make computation
    dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    make_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void make_max_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
    // input[output_depth][output_dim*size][output_dim*size]
    // output[output_depth][output_dim][output_dim]
    float m;
    for (int i=0; i < output_depth; i++) {
        for (int j=0; j < output_dim; j++) {
            for (int k=0; k < output_dim; k++) {
                m = -FLT_MAX; // lowest possible float, so negative inputs are handled correctly
                for (int a=0; a < size; a++) {
                    for (int b=0; b < size; b++) {
                        m = max_flt(m, input[i][size*j +a][size*k +b]);
                    }
                }
                output[i][j][k] = m;
            }
        }
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
    #ifndef __CUDACC__
    make_max_pooling_cpu(input, output, size, output_depth, output_dim);
    #else
    make_max_pooling_device(input, output, size, output_depth, output_dim);
    #endif
}

/*
 * Dense
 */
#ifdef __CUDACC__
__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
    // Equivalent of i in the loop run by the CPU version
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output

    if (idx >= size_output) {
        return;
    }
    float f = kernel->bias[idx];

    for (int j=0; j < size_input; j++) {
        f += kernel->weights[j][idx]*input[j];
    }
    output[idx] = f;
}

void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
    // Make computation
    dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
    dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);

    make_dense_kernel<<<gridSize, blockSize>>>(kernel, input, output, size_input, size_output);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

#ifdef __CUDACC__
extern "C"
#endif
void make_dense_cpu(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
    // input[size_input]
    // output[size_output]
    float f;

    for (int i=0; i < size_output; i++) {
        f = kernel->bias[i];
        for (int j=0; j < size_input; j++) {
            f += kernel->weights[j][i]*input[j];
        }
        output[i] = f;
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
    #ifndef __CUDACC__
    make_dense_cpu(kernel, input, output, size_input, size_output);
    #else
    make_dense_device(kernel, input, output, size_input, size_output);
    #endif
}
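/*
 * Illustrative usage sketch (an assumption, not part of the original code base):
 * a 3-input, 2-output dense layer evaluated through the CPU path. It assumes a
 * Kernel_nn (declared in the headers above) can be used with only the bias and
 * weights fields that make_dense_cpu reads; the real struct may carry more state.
 */
void example_dense_usage() {
    // weights[j][i]: weight from input j to output neuron i
    float w0[2] = {1.f, 0.f};
    float w1[2] = {0.f, 1.f};
    float w2[2] = {1.f, 1.f};
    float* weights[3] = {w0, w1, w2};
    float bias[2] = {0.5f, -0.5f};

    Kernel_nn kernel;
    kernel.bias = bias;
    kernel.weights = weights;

    float input[3] = {1.f, 2.f, 3.f};
    float output[2];

    make_dense_cpu(&kernel, input, output, 3, 2);
    // output[0] = 0.5 + 1*1 + 0*2 + 1*3 = 4.5 ; output[1] = -0.5 + 0*1 + 1*2 + 1*3 = 4.5
    printf("output = {%f, %f}\n", output[0], output[1]);
}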
/*
 * Dense linearised
 */
#ifdef __CUDACC__
__global__ void make_dense_linearised_kernel(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
    // Equivalent of l in the loop run by the CPU version
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output

    if (idx >= size_output) {
        return;
    }

    float f = 0;
    for (int i=0; i < depth_input; i++) {
        for (int j=0; j < dim_input; j++) {
            for (int k=0; k < dim_input; k++) {
                // One weight row per flattened input element input[i][j][k]
                f += input[i][j][k]*kernel->weights[k + j*dim_input + i*dim_input*dim_input][idx];
            }
        }
    }
    output[idx] = f;
}

void make_dense_linearised_device(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
    // Make computation
    dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
    dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);

    make_dense_linearised_kernel<<<gridSize, blockSize>>>(kernel, input, output, depth_input, dim_input, size_output);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void make_dense_linearised_cpu(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
    // input[depth_input][dim_input][dim_input]
    // output[size_output]
    float f;

    for (int l=0; l < size_output; l++) {
        f = 0;
        for (int i=0; i < depth_input; i++) {
            for (int j=0; j < dim_input; j++) {
                for (int k=0; k < dim_input; k++) {
                    // One weight row per flattened input element input[i][j][k]
                    f += input[i][j][k]*kernel->weights[k + j*dim_input + i*dim_input*dim_input][l];
                }
            }
        }
        output[l] = f;
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
    #ifndef __CUDACC__
    make_dense_linearised_cpu(kernel, input, output, depth_input, dim_input, size_output);
    #else
    make_dense_linearised_device(kernel, input, output, depth_input, dim_input, size_output);
    #endif
}
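/*
 * Illustrative usage sketch (an assumption, not part of the original code base):
 * flattens a 2x2x2 input into a single output neuron through the CPU path, to
 * make the weight-row convention explicit: input[i][j][k] pairs with
 * weights[k + j*dim_input + i*dim_input*dim_input][l]. As above, only the
 * weights field that make_dense_linearised_cpu reads is populated.
 */
void example_dense_linearised_usage() {
    // input[2][2][2] holding 1..8 in flattening order
    float p0r0[2] = {1.f, 2.f};
    float p0r1[2] = {3.f, 4.f};
    float p1r0[2] = {5.f, 6.f};
    float p1r1[2] = {7.f, 8.f};
    float* p0[2] = {p0r0, p0r1};
    float* p1[2] = {p1r0, p1r1};
    float** planes[2] = {p0, p1};

    // 8 weight rows (one per flattened input), each with a single output column
    float w[8][1] = {{1.f}, {1.f}, {1.f}, {1.f}, {1.f}, {1.f}, {1.f}, {1.f}};
    float* weights[8] = {w[0], w[1], w[2], w[3], w[4], w[5], w[6], w[7]};

    Kernel_nn kernel;
    kernel.weights = weights;

    float output[1];
    make_dense_linearised_cpu(&kernel, planes, output, 2, 2, 1);
    printf("output[0] = %f\n", output[0]); // 1+2+...+8 = 36
}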