diff --git a/Makefile b/Makefile
index 5b4a3b2..430adb6 100644
--- a/Makefile
+++ b/Makefile
@@ -101,7 +101,7 @@ $(BUILDDIR)/cnn-main-cuda: $(BUILDDIR)/cnn_main.cuda.o \
 	$(BUILDDIR)/cnn_free.cuda.o \
 	$(BUILDDIR)/cnn_jpeg.cuda.o \
 	$(BUILDDIR)/cnn_cuda_convolution.o \
-	$(BUILDDIR)/cnn_backpropagation.cuda.o \
+	$(BUILDDIR)/cnn_cuda_backpropagation.o \
 	$(BUILDDIR)/colors.cuda.o \
 	$(BUILDDIR)/cuda_memory_management.o \
 	$(BUILDDIR)/mnist.cuda.o \
@@ -126,7 +126,7 @@ $(BUILDDIR)/cnn_%.cuda.o: $(CNN_SRCDIR)/%.c $(CNN_SRCDIR)/include/%.h
 ifdef NVCC_INSTALLED
 $(BUILDDIR)/cnn_cuda_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -c -dc $< -o $@
 else
 $(BUILDDIR)/cnn_cuda_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
 	@echo "$(NVCC) not found, skipping"
@@ -142,7 +142,7 @@ $(BUILDDIR)/%.cuda.o: $(SRCDIR)/%.c $(SRCDIR)/include/%.h
 ifdef NVCC_INSTALLED
 $(BUILDDIR)/cuda_%.o: $(SRCDIR)/%.cu $(SRCDIR)/include/%.h
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -c -dc $< -o $@
 else
 	@echo "$(NVCC) not found, skipping"
 endif
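The Makefile change does two things: the CUDA object for backpropagation is renamed to follow the cnn_cuda_%.o pattern, and every .cu compilation gains -dc, which makes nvcc emit relocatable device code. That flag is what allows a __device__ function defined in one translation unit (the activation derivatives in function.cu) to be called from kernels in another (backpropagation.cu). A minimal sketch of the situation, with hypothetical file and function names:

// activations.cu: defines a device function
__device__ float my_sigmoid(float x) {
    return 1.0f / (1.0f + expf(-x));
}

// backward.cu: uses it from another translation unit
extern __device__ float my_sigmoid(float x);

__global__ void apply_all(float* v, int n) {
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    if (i < n) {
        v[i] = my_sigmoid(v[i]);
    }
}

// Cross-file device calls only link when both objects are compiled as
// relocatable device code, which is exactly what -dc turns on:
//   nvcc -dc activations.cu -o activations.o
//   nvcc -dc backward.cu -o backward.o
//   nvcc activations.o backward.o -o prog   (the device link happens here)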
diff --git a/src/cnn/backpropagation.c b/src/cnn/backpropagation.c
index 758842c..fa8499f 100644
--- a/src/cnn/backpropagation.c
+++ b/src/cnn/backpropagation.c
@@ -3,8 +3,12 @@
 #include <math.h>
 
 #include "include/backpropagation.h"
+#include "../include/utils.h"
 #include "include/struct.h"
 
+#include "include/config.h"
+
+#ifndef __CUDACC__
 int min(int a, int b) {
     return a < b ? a : b;
 }
 
 int max(int a, int b) {
     return a > b ? a : b;
 }
+#endif
 
-void softmax_backward_mse(float* input, float* output, int size) {
+/*
+* Softmax backward MSE
+*/
+#ifdef __CUDACC__
+__global__ void softmax_backward_mse_kernel(float* input, float* output, int size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (idx >= size) {
+        return;
+    }
+
+    float input_val = input[idx];
+    float output_val = output[idx];
+
+    input[idx] = (output_val-input_val)*input_val*(1-input_val);
+}
+
+void softmax_backward_mse_device(float* input, float* output, int size) {
+    // Make computation
+    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
+    dim3 blockSize(BLOCKSIZE_x);
+
+    softmax_backward_mse_kernel<<<gridSize, blockSize>>>(input, output, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void softmax_backward_mse_cpu(float* input, float* output, int size) {
     /* Input et output ont la même taille */
 
     for (int i=0; i < size; i++){
@@ -21,7 +55,42 @@ void softmax_backward_mse(float* input, float* output, int size) {
     }
 }
 
-void softmax_backward_cross_entropy(float* input, float* output, int size) {
+void softmax_backward_mse(float* input, float* output, int size) {
+    #ifdef __CUDACC__
+    softmax_backward_mse_device(input, output, size);
+    #else
+    softmax_backward_mse_cpu(input, output, size);
+    #endif
+}
+
+
+/*
+* Softmax backward Cross entropy
+*/
+#ifdef __CUDACC__
+__global__ void softmax_backward_cross_entropy_kernel(float* input, float* output, int size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (idx >= size) {
+        return;
+    }
+
+    input[idx] = output[idx] - input[idx];
+}
+
+void softmax_backward_cross_entropy_device(float* input, float* output, int size) {
+    // Make computation
+    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
+    dim3 blockSize(BLOCKSIZE_x);
+
+    softmax_backward_cross_entropy_kernel<<<gridSize, blockSize>>>(input, output, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void softmax_backward_cross_entropy_cpu(float* input, float* output, int size) {
     /* Input et output ont la même taille */
 
     for (int i=0; i < size; i++){
@@ -29,16 +98,60 @@ void softmax_backward_cross_entropy(float* input, float* output, int size) {
     }
 }
 
-void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+void softmax_backward_cross_entropy(float* input, float* output, int size) {
+    #ifdef __CUDACC__
+    softmax_backward_cross_entropy_device(input, output, size);
+    #else
+    softmax_backward_cross_entropy_cpu(input, output, size);
+    #endif
+}
+
+
+/*
+* Backward average pooling
+*/
+#ifdef __CUDACC__
+__global__ void backward_average_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
+    // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width
+
+    if (idx >= depth || idy >= output_width || idz >= output_width) {
+        return;
+    }
+
+    for (int a=0; a < size; a++) {
+        for (int b=0; b < size; b++) {
+            input[idx][size*idy +a][size*idz +b] += output[idx][idy][idz]/n;
+        }
+    }
+}
+
+
+void backward_average_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    int size = input_width/output_width; // Taille du pooling
+
+    reset_3d_array(input, depth, input_width, input_width);
+
+    backward_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_average_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
     /* Input et output ont la même profondeur (depth) */
 
     int size = input_width/output_width; // Taille du pooling
     int n = size*size; // Nombre d'éléments dans le pooling
 
-    for (int a=0; a < depth; a++)
-        for (int b=0; b < input_width; b++)
-            for (int c=0; c < input_width; c++)
-                input[a][b][c] = 0;
+    reset_3d_array(input, depth, input_width, input_width);
 
     for (int i=0; i < depth; i++) {
         for (int j=0; j < output_width; j++) {
@@ -53,7 +166,65 @@ void backward_average_pooling(float*** input, float*** output, int input_width,
         }
     }
 }
 
-void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+    #ifndef __CUDACC__
+    backward_average_pooling_cpu(input, output, input_width, output_width, depth);
+    #else
+    backward_average_pooling_device(input, output, input_width, output_width, depth);
+    #endif
+}
+
+
+/*
+* Backward max pooling
+*/
+#ifdef __CUDACC__
+__global__ void backward_max_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
+    // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width
+
+    if (idx >= depth || idy >= output_width || idz >= output_width) {
+        return;
+    }
+
+    float m = -FLT_MAX;
+    int a_max = -1;
+    int b_max = -1;
+
+    for (int a=0; a < size; a++) {
+        for (int b=0; b < size; b++) {
+            if (input[idx][size*idy +a][size*idz +b] > m) {
+                m = input[idx][size*idy +a][size*idz +b];
+                a_max = a;
+                b_max = b;
+            }
+            input[idx][size*idy +a][size*idz +b] = 0;
+        }
+    }
+    input[idx][size*idy +a_max][size*idz +b_max] = output[idx][idy][idz]/n;
+}
+
+
+void backward_max_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    int size = input_width/output_width; // Taille du pooling
+
+    backward_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_max_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
     int size = input_width/output_width;
 
     float m; // Maximum
@@ -82,7 +253,78 @@ void backward_max_pooling(float*** input, float*** output, int input_width, int
         }
     }
 }
 
-void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_function, int is_first) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+    #ifndef __CUDACC__
+    backward_max_pooling_cpu(input, output, input_width, output_width, depth);
+    #else
+    backward_max_pooling_device(input, output, input_width, output_width, depth);
+    #endif
+}
+
+/*
+* Backward Dense
+*/
+#ifdef __CUDACC__
+__global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* output, int size_input, int size_output) {
+    // Équivalents respectifs de i et j dans la boucle effectuée par le cpu
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < size_output
+
+    if (idx >= size_input || idy >= size_output) {
+        return;
+    }
+
+    if (idx == 0) {
+        ker->d_bias[idy] += output[idy];
+    }
+    ker->d_weights[idx][idy] += input[idx]*output[idy];
+}
+
+__global__ void backward_dense_kernel_2(float** weights, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
+
+    if (idx >= size_input) {
+        return;
+    }
+
+    float tmp=0;
+    for (int j=0; j < size_output; j++) {
+        tmp += output[j]*weights[idx][j];
+    }
+    input[idx] = tmp*( (*d_f)(input_z[idx]) );
+}
+
+void backward_dense_device(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+    // Make computation
+    dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x), i_div_up(size_output, BLOCKSIZE_y));
+    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y);
+
+    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker, input, output, size_input, size_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Second kernel
+    if (is_first != 1) {
+        dim3 gridSize2(i_div_up(size_input, BLOCKSIZE_x));
+        dim3 blockSize2(BLOCKSIZE_x);
+
+        funcPtr d_function = get_activation_function_cuda(activation);
+
+        backward_dense_kernel_2<<<gridSize2, blockSize2>>>(ker->weights, input, input_z, output, size_input, size_output, d_function);
+
+        gpuErrchk( cudaPeekAtLastError() );
+        gpuErrchk( cudaDeviceSynchronize() );
+    }
+}
+#endif
+
+void backward_dense_cpu(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+
+    funcPtr d_function = get_activation_function(activation);
     // Bias
     for (int j=0; j < size_output; j++) {
         ker->d_bias[j] += output[j];
@@ -109,7 +351,85 @@ void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output,
     }
 }
 
-void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_function) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+    #ifndef __CUDACC__
+    backward_dense_cpu(ker, input, input_z, output, size_input, size_output, activation, is_first);
+    #else
+    backward_dense_device(ker, input, input_z, output, size_input, size_output, activation, is_first);
+    #endif
+}
+
+
+
+/*
+* Backward linearisation
+*/
+#ifdef __CUDACC__
+__global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input, float* output, int depth_input, int dim_input, int size_output) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+
+    int id = idx*dim_input*dim_input + idy*dim_input + idz;
+
+    for (int j=0; j < size_output; j++) {
+        ker->d_weights[id][j] += input[idx][idy][idz]*output[j];
+    }
+    if (id == 0) {
+        for (int j=0; j < size_output; j++) {
+            ker->d_bias[j] += output[j];
+        }
+    }
+}
+
+__global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+    int id = idx*dim_input*dim_input + idy*dim_input + idz;
+
+    float tmp=0;
+    for (int j=0; j < size_output; j++) {
+        tmp += output[j]*ker->weights[id][j];
+    }
+    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker, input, output, depth_input, dim_input, size_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Second kernel
+    funcPtr d_function = get_activation_function_cuda(activation);
+
+    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker, input, input_z, output, depth_input, dim_input, size_output, d_function);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_linearisation_cpu(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+
+    funcPtr d_function = get_activation_function(activation);
+
     // Bias
     for (int j=0; j < size_output; j++) {
         ker->d_bias[j] += output[j];
@@ -144,7 +464,119 @@ void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, fl
     }
 }
 
-void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, funcPtr d_function, int is_first) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+    #ifndef __CUDACC__
+    backward_linearisation_cpu(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
+    #else
+    backward_linearisation_device(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
+    #endif
+}
+
+/*
+* Backward convolution
+*/
+#ifdef __CUDACC__
+__global__ void backward_convolution_dbias_kernel(Kernel_cnn* ker, float*** output, int depth_output, int dim_output) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    if (idx >= depth_output || idy >= dim_output || idz >= dim_output) {
+        return;
+    }
+    ker->d_bias[idx][idy][idz] += output[idx][idy][idz];
+}
+
+__global__ void backward_convolution_dweight_kernel(Kernel_cnn* ker, float*** input, float*** output, int depth_input, int depth_output, int dim_output, int k_size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    int idz1 = idz / k_size;
+    int idz2 = idz % k_size;
+
+    if (idx >= depth_input || idy >= depth_output || idz1 >= k_size || idz2 >= k_size) {
+        return;
+    }
+
+    float tmp = 0;
+    for (int l=0; l < dim_output; l++) {
+        for (int m=0; m < dim_output; m++) {
+            tmp += input[idx][l+idz1][m+idz2]*output[idy][l][m];
+        }
+    }
+    ker->d_weights[idx][idy][idz1][idz2] += tmp;
+}
+
+__global__ void backward_convolution_propagate_kernel(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int k_size, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+
+    int min_m, max_m, min_n, max_n;
+    float tmp = 0;
+    for (int l=0; l < depth_output; l++) {
+        min_m = max(0, k_size-1-idy);
+        max_m = min(k_size, dim_input - idy);
+        min_n = max(0, k_size-1-idz);
+        max_n = min(k_size, dim_input-idz);
+        for (int m=min_m; m < max_m; m++) {
+            for (int n=min_n; n < max_n; n++) {
+                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*ker->weights[idx][l][m][n];
+            }
+        }
+    }
+    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+void backward_convolution_device(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+    // Bias Kernel
+    dim3 gridSize1(i_div_up(depth_output, BLOCKSIZE_x), i_div_up(dim_output, BLOCKSIZE_y), i_div_up(dim_output, BLOCKSIZE_z));
+    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_convolution_dbias_kernel<<<gridSize1, blockSize1>>>(ker, output, depth_output, dim_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Weights Kernel
+    int k_size = dim_input - dim_output +1;
+
+    dim3 gridSize2(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(depth_output, BLOCKSIZE_y), i_div_up(k_size*k_size, BLOCKSIZE_z));
+    dim3 blockSize2(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(ker, input, output, depth_input, depth_output, dim_output, k_size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // input propagation Kernel
+    if (is_first != 1) {
+        dim3 gridSize3(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z));
+        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+        funcPtr d_function = get_activation_function_cuda(activation);
+
+        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(ker, input, input_z, output, depth_input, dim_input, depth_output, k_size, d_function);
+
+        gpuErrchk( cudaPeekAtLastError() );
+        gpuErrchk( cudaDeviceSynchronize() );
+    }
+}
+#endif
+
+
+void backward_convolution_cpu(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+
+    funcPtr d_function = get_activation_function(activation);
+
     // Bias
     for (int i=0; i < depth_output; i++) {
         for (int j=0; j < dim_output; j++) {
@@ -197,3 +629,14 @@ void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, flo
         }
     }
 }
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+    #ifndef __CUDACC__
+    backward_convolution_cpu(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
+    #else
+    backward_convolution_device(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
+    #endif
+}
\ No newline at end of file
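backpropagation.c now gives every operation the same three-part structure: a __global__ kernel plus a *_device launcher compiled only under nvcc, a *_cpu fallback, and a thin wrapper that picks one at compile time, declared extern "C" under nvcc so C callers link against an unmangled symbol. Reduced to a skeleton (op and its body are placeholders, the helpers are the project's own):

#ifdef __CUDACC__
__global__ void op_kernel(float* data, int n) {
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    if (i < n) {
        data[i] *= 2; // the actual per-element work
    }
}

void op_device(float* data, int n) {
    dim3 gridSize(i_div_up(n, BLOCKSIZE_x));
    dim3 blockSize(BLOCKSIZE_x);

    op_kernel<<<gridSize, blockSize>>>(data, n);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void op_cpu(float* data, int n) {
    for (int i=0; i < n; i++) {
        data[i] *= 2; // same work, sequentially
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void op(float* data, int n) {
    #ifdef __CUDACC__
    op_device(data, n);
    #else
    op_cpu(data, n);
    #endif
}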
diff --git a/src/cnn/backpropagation.cu b/src/cnn/backpropagation.cu
new file mode 100644
index 0000000..fa8499f
--- /dev/null
+++ b/src/cnn/backpropagation.cu
@@ -0,0 +1,642 @@
+#include <stdio.h>
+#include <float.h>
+#include <math.h>
+
+#include "include/backpropagation.h"
+#include "../include/utils.h"
+#include "include/struct.h"
+
+#include "include/config.h"
+
+#ifndef __CUDACC__
+int min(int a, int b) {
+    return a < b ? a : b;
+}
+
+int max(int a, int b) {
+    return a > b ? a : b;
+}
+#endif
+
+/*
+* Softmax backward MSE
+*/
+#ifdef __CUDACC__
+__global__ void softmax_backward_mse_kernel(float* input, float* output, int size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (idx >= size) {
+        return;
+    }
+
+    float input_val = input[idx];
+    float output_val = output[idx];
+
+    input[idx] = (output_val-input_val)*input_val*(1-input_val);
+}
+
+void softmax_backward_mse_device(float* input, float* output, int size) {
+    // Make computation
+    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
+    dim3 blockSize(BLOCKSIZE_x);
+
+    softmax_backward_mse_kernel<<<gridSize, blockSize>>>(input, output, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void softmax_backward_mse_cpu(float* input, float* output, int size) {
+    /* Input et output ont la même taille */
+
+    for (int i=0; i < size; i++){
+        input[i] = (output[i]-input[i])*input[i]*(1-input[i]);
+    }
+}
+
+void softmax_backward_mse(float* input, float* output, int size) {
+    #ifdef __CUDACC__
+    softmax_backward_mse_device(input, output, size);
+    #else
+    softmax_backward_mse_cpu(input, output, size);
+    #endif
+}
+
+
+/*
+* Softmax backward Cross entropy
+*/
+#ifdef __CUDACC__
+__global__ void softmax_backward_cross_entropy_kernel(float* input, float* output, int size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (idx >= size) {
+        return;
+    }
+
+    input[idx] = output[idx] - input[idx];
+}
+
+void softmax_backward_cross_entropy_device(float* input, float* output, int size) {
+    // Make computation
+    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
+    dim3 blockSize(BLOCKSIZE_x);
+
+    softmax_backward_cross_entropy_kernel<<<gridSize, blockSize>>>(input, output, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void softmax_backward_cross_entropy_cpu(float* input, float* output, int size) {
+    /* Input et output ont la même taille */
+
+    for (int i=0; i < size; i++){
+        input[i] = output[i] - input[i];
+    }
+}
+
+void softmax_backward_cross_entropy(float* input, float* output, int size) {
+    #ifdef __CUDACC__
+    softmax_backward_cross_entropy_device(input, output, size);
+    #else
+    softmax_backward_cross_entropy_cpu(input, output, size);
+    #endif
+}
+
+
+/*
+* Backward average pooling
+*/
+#ifdef __CUDACC__
+__global__ void backward_average_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
+    // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width
+
+    if (idx >= depth || idy >= output_width || idz >= output_width) {
+        return;
+    }
+
+    for (int a=0; a < size; a++) {
+        for (int b=0; b < size; b++) {
+            input[idx][size*idy +a][size*idz +b] += output[idx][idy][idz]/n;
+        }
+    }
+}
+
+
+void backward_average_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    int size = input_width/output_width; // Taille du pooling
+
+    reset_3d_array(input, depth, input_width, input_width);
+
+    backward_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_average_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
+    /* Input et output ont la même profondeur (depth) */
+
+    int size = input_width/output_width; // Taille du pooling
+    int n = size*size; // Nombre d'éléments dans le pooling
+
+    reset_3d_array(input, depth, input_width, input_width);
+
+    for (int i=0; i < depth; i++) {
+        for (int j=0; j < output_width; j++) {
+            for (int k=0; k < output_width; k++) {
+                for (int a=0; a < size; a++) {
+                    for (int b=0; b < size; b++) {
+                        input[i][size*j +a][size*k +b] += output[i][j][k]/n;
+                    }
+                }
+            }
+        }
+    }
+}
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+    #ifndef __CUDACC__
+    backward_average_pooling_cpu(input, output, input_width, output_width, depth);
+    #else
+    backward_average_pooling_device(input, output, input_width, output_width, depth);
+    #endif
+}
+
+
+/*
+* Backward max pooling
+*/
+#ifdef __CUDACC__
+__global__ void backward_max_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
+    // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width
+
+    if (idx >= depth || idy >= output_width || idz >= output_width) {
+        return;
+    }
+
+    float m = -FLT_MAX;
+    int a_max = -1;
+    int b_max = -1;
+
+    for (int a=0; a < size; a++) {
+        for (int b=0; b < size; b++) {
+            if (input[idx][size*idy +a][size*idz +b] > m) {
+                m = input[idx][size*idy +a][size*idz +b];
+                a_max = a;
+                b_max = b;
+            }
+            input[idx][size*idy +a][size*idz +b] = 0;
+        }
+    }
+    input[idx][size*idy +a_max][size*idz +b_max] = output[idx][idy][idz]/n;
+}
+
+
+void backward_max_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    int size = input_width/output_width; // Taille du pooling
+
+    backward_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_max_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
+    int size = input_width/output_width;
+
+    float m; // Maximum
+    int a_max, b_max; // Indices du maximum
+
+    for (int i=0; i < depth; i++) {
+        for (int j=0; j < output_width; j++) {
+            for (int k=0; k < output_width; k++) {
+                m = -FLT_MAX;
+                a_max = -1;
+                b_max = -1;
+
+                for (int a=0; a < size; a++) {
+                    for (int b=0; b < size; b++) {
+                        if (input[i][size*j +a][size*k +b] > m) {
+                            m = input[i][size*j +a][size*k +b];
+                            a_max = a;
+                            b_max = b;
+                        }
+                        input[i][size*j +a][size*k +b] = 0;
+                    }
+                }
+                input[i][size*j +a_max][size*k +b_max] = output[i][j][k]/(size*size);
+            }
+        }
+    }
+}
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+    #ifndef __CUDACC__
+    backward_max_pooling_cpu(input, output, input_width, output_width, depth);
+    #else
+    backward_max_pooling_device(input, output, input_width, output_width, depth);
+    #endif
+}
+
+/*
+* Backward Dense
+*/
+#ifdef __CUDACC__
+__global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* output, int size_input, int size_output) {
+    // Équivalents respectifs de i et j dans la boucle effectuée par le cpu
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < size_output
+
+    if (idx >= size_input || idy >= size_output) {
+        return;
+    }
+
+    if (idx == 0) {
+        ker->d_bias[idy] += output[idy];
+    }
+    ker->d_weights[idx][idy] += input[idx]*output[idy];
+}
+
+__global__ void backward_dense_kernel_2(float** weights, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
+
+    if (idx >= size_input) {
+        return;
+    }
+
+    float tmp=0;
+    for (int j=0; j < size_output; j++) {
+        tmp += output[j]*weights[idx][j];
+    }
+    input[idx] = tmp*( (*d_f)(input_z[idx]) );
+}
+
+void backward_dense_device(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+    // Make computation
+    dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x), i_div_up(size_output, BLOCKSIZE_y));
+    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y);
+
+    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker, input, output, size_input, size_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Second kernel
+    if (is_first != 1) {
+        dim3 gridSize2(i_div_up(size_input, BLOCKSIZE_x));
+        dim3 blockSize2(BLOCKSIZE_x);
+
+        funcPtr d_function = get_activation_function_cuda(activation);
+
+        backward_dense_kernel_2<<<gridSize2, blockSize2>>>(ker->weights, input, input_z, output, size_input, size_output, d_function);
+
+        gpuErrchk( cudaPeekAtLastError() );
+        gpuErrchk( cudaDeviceSynchronize() );
+    }
+}
+#endif
+
+void backward_dense_cpu(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+
+    funcPtr d_function = get_activation_function(activation);
+    // Bias
+    for (int j=0; j < size_output; j++) {
+        ker->d_bias[j] += output[j];
+    }
+
+    // Weights
+    for (int i=0; i < size_input; i++) {
+        for (int j=0; j < size_output; j++) {
+            ker->d_weights[i][j] += input[i]*output[j];
+        }
+    }
+
+    // Input
+    if (is_first==1) {// Pas besoin de backpropager dans l'input
+        return;
+    }
+
+    for (int i=0; i < size_input; i++) {
+        float tmp=0;
+        for (int j=0; j < size_output; j++) {
+            tmp += output[j]*ker->weights[i][j];
+        }
+        input[i] = tmp*d_function(input_z[i]);
+    }
+}
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+    #ifndef __CUDACC__
+    backward_dense_cpu(ker, input, input_z, output, size_input, size_output, activation, is_first);
+    #else
+    backward_dense_device(ker, input, input_z, output, size_input, size_output, activation, is_first);
+    #endif
+}
+
+
+
+/*
+* Backward linearisation
+*/
+#ifdef __CUDACC__
+__global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input, float* output, int depth_input, int dim_input, int size_output) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+
+    int id = idx*dim_input*dim_input + idy*dim_input + idz;
+
+    for (int j=0; j < size_output; j++) {
+        ker->d_weights[id][j] += input[idx][idy][idz]*output[j];
+    }
+    if (id == 0) {
+        for (int j=0; j < size_output; j++) {
+            ker->d_bias[j] += output[j];
+        }
+    }
+}
+
+__global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+    int id = idx*dim_input*dim_input + idy*dim_input + idz;
+
+    float tmp=0;
+    for (int j=0; j < size_output; j++) {
+        tmp += output[j]*ker->weights[id][j];
+    }
+    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker, input, output, depth_input, dim_input, size_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Second kernel
+    funcPtr d_function = get_activation_function_cuda(activation);
+
+    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker, input, input_z, output, depth_input, dim_input, size_output, d_function);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_linearisation_cpu(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+
+    funcPtr d_function = get_activation_function(activation);
+
+    // Bias
+    for (int j=0; j < size_output; j++) {
+        ker->d_bias[j] += output[j];
+    }
+
+    // Weights
+    int cpt = 0;
+    for (int i=0; i < depth_input; i++) {
+        for (int k=0; k < dim_input; k++) {
+            for (int l=0; l < dim_input; l++) {
+                for (int j=0; j < size_output; j++) {
+                    ker->d_weights[cpt][j] += input[i][k][l]*output[j];
+                }
+                cpt++;
+            }
+        }
+    }
+
+    // Input
+    cpt = 0;
+    for (int i=0; i < depth_input; i++) {
+        for (int k=0; k < dim_input; k++) {
+            for (int l=0; l < dim_input; l++) {
+                float tmp=0;
+                for (int j=0; j < size_output; j++) {
+                    tmp += output[j]*ker->weights[cpt][j];
+                }
+                input[i][k][l] = tmp*d_function(input_z[i][k][l]);
+                cpt++;
+            }
+        }
+    }
+}
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+    #ifndef __CUDACC__
+    backward_linearisation_cpu(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
+    #else
+    backward_linearisation_device(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
+    #endif
+}
+
+/*
+* Backward convolution
+*/
+#ifdef __CUDACC__
+__global__ void backward_convolution_dbias_kernel(Kernel_cnn* ker, float*** output, int depth_output, int dim_output) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    if (idx >= depth_output || idy >= dim_output || idz >= dim_output) {
+        return;
+    }
+    ker->d_bias[idx][idy][idz] += output[idx][idy][idz];
+}
+
+__global__ void backward_convolution_dweight_kernel(Kernel_cnn* ker, float*** input, float*** output, int depth_input, int depth_output, int dim_output, int k_size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    int idz1 = idz / k_size;
+    int idz2 = idz % k_size;
+
+    if (idx >= depth_input || idy >= depth_output || idz1 >= k_size || idz2 >= k_size) {
+        return;
+    }
+
+    float tmp = 0;
+    for (int l=0; l < dim_output; l++) {
+        for (int m=0; m < dim_output; m++) {
+            tmp += input[idx][l+idz1][m+idz2]*output[idy][l][m];
+        }
+    }
+    ker->d_weights[idx][idy][idz1][idz2] += tmp;
+}
+
+__global__ void backward_convolution_propagate_kernel(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int k_size, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+
+    int min_m, max_m, min_n, max_n;
+    float tmp = 0;
+    for (int l=0; l < depth_output; l++) {
+        min_m = max(0, k_size-1-idy);
+        max_m = min(k_size, dim_input - idy);
+        min_n = max(0, k_size-1-idz);
+        max_n = min(k_size, dim_input-idz);
+        for (int m=min_m; m < max_m; m++) {
+            for (int n=min_n; n < max_n; n++) {
+                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*ker->weights[idx][l][m][n];
+            }
+        }
+    }
+    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+void backward_convolution_device(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+    // Bias Kernel
+    dim3 gridSize1(i_div_up(depth_output, BLOCKSIZE_x), i_div_up(dim_output, BLOCKSIZE_y), i_div_up(dim_output, BLOCKSIZE_z));
+    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_convolution_dbias_kernel<<<gridSize1, blockSize1>>>(ker, output, depth_output, dim_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Weights Kernel
+    int k_size = dim_input - dim_output +1;
+
+    dim3 gridSize2(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(depth_output, BLOCKSIZE_y), i_div_up(k_size*k_size, BLOCKSIZE_z));
+    dim3 blockSize2(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(ker, input, output, depth_input, depth_output, dim_output, k_size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // input propagation Kernel
+    if (is_first != 1) {
+        dim3 gridSize3(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z));
+        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+        funcPtr d_function = get_activation_function_cuda(activation);
+
+        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(ker, input, input_z, output, depth_input, dim_input, depth_output, k_size, d_function);
+
+        gpuErrchk( cudaPeekAtLastError() );
+        gpuErrchk( cudaDeviceSynchronize() );
+    }
+}
+#endif
+
+
+void backward_convolution_cpu(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+
+    funcPtr d_function = get_activation_function(activation);
+
+    // Bias
+    for (int i=0; i < depth_output; i++) {
+        for (int j=0; j < dim_output; j++) {
+            for (int k=0; k < dim_output; k++) {
+                ker->d_bias[i][j][k] += output[i][j][k];
+            }
+        }
+    }
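Note that backpropagation.cu is registered with the same post-image blob as backpropagation.c (both index lines end in fa8499f), so the two files are byte-identical, following the existing .c/.cu pairing of utils and memory_management; the Makefile decides which copy feeds gcc and which feeds nvcc. Inside the kernels, the recurring move is that the CPU's outer loops become one thread per cell: the "// < depth" style comments mark the bound each index must respect, and the early return discards the overhang threads created by rounding the grid up. Schematically (process stands in for the per-cell work):

// CPU: every (i, j, k) visited sequentially
for (int i=0; i < depth; i++)
    for (int j=0; j < output_width; j++)
        for (int k=0; k < output_width; k++)
            process(i, j, k);

// GPU: one thread per (i, j, k); the grid is rounded up, so a thread
// past the bounds must return before touching any memory
__global__ void process_kernel(int depth, int output_width) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // i
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // j
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // k

    if (idx >= depth || idy >= output_width || idz >= output_width) {
        return;
    }
    // process(idx, idy, idz);
}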
+
+    // Weights
+    int k_size = dim_input - dim_output +1;
+
+    for (int h=0; h < depth_input; h++) {
+        for (int i=0; i < depth_output; i++) {
+            for (int j=0; j < k_size; j++) {
+                for (int k=0; k < k_size; k++) {
+                    float tmp = 0;
+                    for (int l=0; l < dim_output; l++) {
+                        for (int m=0; m < dim_output; m++) {
+                            tmp += input[h][l+j][m+k]*output[i][l][m];
+                        }
+                    }
+                    ker->d_weights[h][i][j][k] += tmp;
+                }
+            }
+        }
+    }
+
+    // Input
+    if (is_first==1) // Pas besoin de backpropager dans l'input
+        return;
+    int min_m, max_m, min_n, max_n;
+    for (int i=0; i < depth_input; i++) {
+        for (int j=0; j < dim_input; j++) {
+            for (int k=0; k < dim_input; k++) {
+                float tmp = 0;
+                for (int l=0; l < depth_output; l++) {
+                    min_m = max(0, k_size-1-j);
+                    max_m = min(k_size, dim_input - j);
+                    min_n = max(0, k_size-1-k);
+                    max_n = min(k_size, dim_input-k);
+                    for (int m=min_m; m < max_m; m++) {
+                        for (int n=min_n; n < max_n; n++) {
+                            tmp += output[l][j-k_size+m+1][k-k_size+n+1]*ker->weights[i][l][m][n];
+                        }
+                    }
+                }
+                input[i][j][k] = tmp*d_function(input_z[i][j][k]);
+            }
+        }
+    }
+}
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+    #ifndef __CUDACC__
+    backward_convolution_cpu(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
+    #else
+    backward_convolution_device(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
+    #endif
+}
\ No newline at end of file
diff --git a/src/cnn/cnn.c b/src/cnn/cnn.c
index 9dd2dcb..c05eb10 100644
--- a/src/cnn/cnn.c
+++ b/src/cnn/cnn.c
@@ -4,6 +4,7 @@
 #include // Is it used ?
 #include
 
+#include "../include/memory_management.h"
 #include "include/backpropagation.h"
 #include "include/initialisation.h"
 #include "include/function.h"
@@ -226,7 +227,7 @@ void backward_propagation(Network* network, int wanted_number) {
     // Backward sur la dernière couche qui utilise toujours SOFTMAX
     float* wanted_output = generate_wanted_output(wanted_number, network->width[network->size -1]); // Sortie désirée, permet d'initialiser une erreur
     softmax_backward_cross_entropy(network->input[n-1][0][0], wanted_output, network->width[n-1]);
-    free(wanted_output);
+    gree(wanted_output);
 
     /*
     * On propage à chaque étape:
@@ -252,14 +253,12 @@ void backward_propagation(Network* network, int wanted_number) {
 
         if (k_i->cnn) { // Convolution
-            funcPtr d_f = get_activation_function(-activation);
-            backward_convolution(k_i->cnn, input, input_z, output, input_depth, input_width, output_depth, output_width, d_f, i==0);
+            backward_convolution(k_i->cnn, input, input_z, output, input_depth, input_width, output_depth, output_width, -activation, i==0);
         } else if (k_i->nn) { // Full connection
-            funcPtr d_f = get_activation_function(-activation);
             if (k_i->linearisation == DOESNT_LINEARISE) { // Vecteur -> Vecteur
-                backward_dense(k_i->nn, input[0][0], input_z[0][0], output[0][0], input_width, output_width, d_f, i==0);
+                backward_dense(k_i->nn, input[0][0], input_z[0][0], output[0][0], input_width, output_width, -activation, i==0);
             } else { // Matrice -> vecteur
-                backward_linearisation(k_i->nn, input, input_z, output[0][0], input_depth, input_width, output_width, d_f);
+                backward_linearisation(k_i->nn, input, input_z, output[0][0], input_depth, input_width, output_width, -activation);
             }
         } else { // Pooling
             if (k_i->pooling == AVG_POOLING) {
@@ -313,7 +312,7 @@ float compute_cross_entropy_loss(float* output, float* wanted_output, int len) {
 }
 
 float* generate_wanted_output(int wanted_number, int size_output) {
-    float* wanted_output = (float*)malloc(sizeof(float)*size_output);
+    float* wanted_output = (float*)nalloc(size_output, sizeof(float));
     for (int i=0; i < size_output; i++) {
         if (i==wanted_number) {
             wanted_output[i]=1;
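cnn.c swaps free for gree wherever the buffer came from generate_wanted_output, because that function now allocates with nalloc. The point of the pair: nalloc serves memory from the project's Memory blocks, which are backed by cudaMallocManaged under nvcc, so the same pointer is valid in host code and in kernels; gree is the matching release. The usage contract, as far as this diff shows it:

// Allocate 10 floats from the managed pool: visible to host code and,
// under CUDA, to kernels as well
float* wanted_output = (float*)nalloc(10, sizeof(float));

for (int i=0; i < 10; i++) {
    wanted_output[i] = 0.0f;
}

gree(wanted_output); // release into the pool; never mix with free()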
diff --git a/src/cnn/function.c b/src/cnn/function.c
index 4ac6ecb..71af124 100644
--- a/src/cnn/function.c
+++ b/src/cnn/function.c
@@ -107,20 +107,16 @@ float leaky_relu_derivative(float x) {
 
 //* Tanh
 #ifdef __CUDACC__
-__device__
-#endif
-float device_tanh_(float x) {
+__device__ float device_tanh_(float x) {
     return tanh(x);
 }
 
-#ifdef __CUDACC__
-__device__
-#endif
-float device_tanh_derivative(float x) {
+__device__ float device_tanh_derivative(float x) {
     float a = tanh(x);
 
     return 1 - a*a;
 }
+#endif
 
 float tanh_(float x) {
     return tanh(x);
 }
@@ -303,6 +299,7 @@ funcPtr get_activation_function(int activation) {
 
 
 #ifdef __CUDACC__
+extern "C"
 funcPtr get_activation_function_cuda(int activation) {
     funcPtr host_function;
diff --git a/src/cnn/function.cu b/src/cnn/function.cu
index 4ac6ecb..22bfae6 100644
--- a/src/cnn/function.cu
+++ b/src/cnn/function.cu
@@ -107,19 +107,15 @@ float leaky_relu_derivative(float x) {
 
 //* Tanh
 #ifdef __CUDACC__
-__device__
-#endif
-float device_tanh_(float x) {
+__device__ float device_tanh_(float x) {
     return tanh(x);
 }
 
-#ifdef __CUDACC__
-__device__
-#endif
-float device_tanh_derivative(float x) {
+__device__ float device_tanh_derivative(float x) {
     float a = tanh(x);
 
     return 1 - a*a;
 }
+#endif
 
 float tanh_(float x) {
     return tanh(x);
 }
@@ -303,6 +299,7 @@ funcPtr get_activation_function(int activation) {
 
 
 #ifdef __CUDACC__
+extern "C"
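The extern "C" on get_activation_function_cuda matters because this is the function that carries a device function pointer across the C/C++ boundary. Host code cannot take the address of a __device__ function directly; per the header comment, the function fetches a device-side pointer and hands it to the host, valid only as a kernel argument. The usual idiom for that, shown here as a sketch rather than the repository's exact code, is a __device__ variable holding the pointer, read back with cudaMemcpyFromSymbol:

typedef float (*funcPtr)(float);

__device__ float device_tanh_(float x) { return tanh(x); }

// Device-side variable holding the device address of the function
__device__ funcPtr ptr_tanh = device_tanh_;

funcPtr get_tanh_cuda(void) {
    funcPtr host_function;
    // Copy the device-side pointer value into host memory; the result
    // can be passed to a kernel but must never be called on the host
    cudaMemcpyFromSymbol(&host_function, ptr_tanh, sizeof(funcPtr));
    return host_function;
}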
 funcPtr get_activation_function_cuda(int activation) {
     funcPtr host_function;
diff --git a/src/cnn/include/backpropagation.h b/src/cnn/include/backpropagation.h
index d44af8e..4656e18 100644
--- a/src/cnn/include/backpropagation.h
+++ b/src/cnn/include/backpropagation.h
@@ -14,42 +14,70 @@ int min(int a, int b);
 */
 int max(int a, int b);
 
+
+#ifdef __CUDACC__
+extern "C"
+#endif
 /*
 * Transfert les informations d'erreur de la sortie voulue à la sortie réelle
 * en considérant MSE (Mean Squared Error) comme fonction d'erreur
 */
 void softmax_backward_mse(float* input, float* output, int size);
 
+
+#ifdef __CUDACC__
+extern "C"
+#endif
 /*
 * Transfert les informations d'erreur de la sortie voulue à la sortie réelle
 * en considérant la cross entropy comme fonction d'erreur
 */
 void softmax_backward_cross_entropy(float* input, float* output, int size);
 
+
+#ifdef __CUDACC__
+extern "C"
+#endif
 /*
 * Transfert les informations d'erreur à travers une couche d'average pooling
 */
 void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth);
 
+
+#ifdef __CUDACC__
+extern "C"
+#endif
 /*
 * Transfert les informations d'erreur à travers une couche de max pooling
 */
 void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth);
 
+
+#ifdef __CUDACC__
+extern "C"
+#endif
 /*
 * Transfert les informations d'erreur à travers une couche fully connected
 */
-void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_function, int is_first);
+void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first);
 
+
+#ifdef __CUDACC__
+extern "C"
+#endif
 /*
 * Transfert les informations d'erreur à travers une couche de linéarisation
 */
-void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_function);
+void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation);
 
+
+#ifdef __CUDACC__
+extern "C"
+#endif
 /*
 * Transfert les informations d'erreur à travers une couche de convolution
 */
-void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, funcPtr d_function, int is_first);
+void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first);
 
 #endif
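All the #ifdef __CUDACC__ / extern "C" guards added to this header serve linking, not semantics: nvcc compiles the .cu copies as C++ and would mangle the symbol names, while gcc compiling cnn.c expects plain C symbols. The guard keeps one header valid for both compilers. In miniature (backward_step is a made-up name):

/* shared_header.h: one header, two compilers (hypothetical example) */
#ifdef __CUDACC__
extern "C"            /* nvcc (C++): force the unmangled C symbol */
#endif
void backward_step(float* input, float* output, int size);

/* gcc already produces the C symbol "backward_step"; nvcc without the
 * guard would emit "_Z13backward_stepPfS_i" and the link would fail. */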
diff --git a/src/cnn/include/config.h b/src/cnn/include/config.h
index c5dbc99..313192b 100644
--- a/src/cnn/include/config.h
+++ b/src/cnn/include/config.h
@@ -39,6 +39,8 @@
 #define NETWORK_CLIP_VALUE 300
 
 //* Paramètres CUDA
+// Le produit des 3 dimensions doit être au maximum 1024 (atteignable avec 8*8*16)
+// Le réduire permet d'éviter des erreurs "Out of memory" au lancement des Kernel
 #define BLOCKSIZE_x 10
 #define BLOCKSIZE_y 10
 #define BLOCKSIZE_z 10
diff --git a/src/cnn/include/function.h b/src/cnn/include/function.h
index 4234009..6ff17b4 100644
--- a/src/cnn/include/function.h
+++ b/src/cnn/include/function.h
@@ -142,6 +142,9 @@ funcPtr get_activation_function(int activation);
 /*
 * Récupère un pointeur sur le device vers la fonction d'activation demandée puis le transforme en pointeur sur l'host
 */
+#ifdef __CUDACC__
+extern "C"
 funcPtr get_activation_function_cuda(int activation);
+#endif
 
 #endif
\ No newline at end of file
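The new comment in config.h reflects a hard CUDA constraint: a thread block holds at most 1024 threads, so BLOCKSIZE_x*BLOCKSIZE_y*BLOCKSIZE_z must stay at or below 1024. The current 10*10*10 = 1000 fits, and 8*8*16 = 1024 is the exact ceiling the comment mentions; exceeding it makes every 3D launch in this patch fail at runtime. One way to make the constraint self-enforcing would be a compile-time check next to the defines (a C11 sketch; the project does not currently include it):

#define BLOCKSIZE_x 10
#define BLOCKSIZE_y 10
#define BLOCKSIZE_z 10

/* 10*10*10 = 1000 <= 1024: accepted. Bumping any axis past the budget
 * turns a runtime launch failure into a compile error. */
_Static_assert(BLOCKSIZE_x * BLOCKSIZE_y * BLOCKSIZE_z <= 1024,
               "BLOCKSIZE_x*BLOCKSIZE_y*BLOCKSIZE_z must not exceed 1024");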
diff --git a/src/cnn/test_network.c b/src/cnn/test_network.c
index 6b83c40..6c33117 100644
--- a/src/cnn/test_network.c
+++ b/src/cnn/test_network.c
@@ -51,7 +51,7 @@ float* test_network_mnist(Network* network, char* images_file, char* labels_file
         // Compute loss
         wanted_output = generate_wanted_output(labels[i], 10);
         loss += compute_mean_squared_error(network->input[network->size-1][0][0], wanted_output, 10);
-        free(wanted_output);
+        gree(wanted_output);
 
         for (int j=0; j < height; j++) {
             free(images[i][j]);
@@ -60,7 +60,7 @@ float* test_network_mnist(Network* network, char* images_file, char* labels_file
     }
     free(images);
 
-    float* results = malloc(sizeof(float)*2);
+    float* results = (float*)malloc(sizeof(float)*2);
     results[0] = 100*accuracy/(float)nb_elem;
     results[1] = loss/(float)nb_elem;
     return results;
@@ -90,7 +90,7 @@ float* test_network_jpg(Network* network, char* data_dir, bool preview_fails, bo
         free(dataset->images[i]);
     }
 
-    float* results = malloc(sizeof(float)*2);
+    float* results = (float*)malloc(sizeof(float)*2);
     results[0] = 100*accuracy/(float)dataset->numImages;
     results[1] = 0;
diff --git a/src/cnn/train.c b/src/cnn/train.c
index 42033ff..3ccc891 100644
--- a/src/cnn/train.c
+++ b/src/cnn/train.c
@@ -62,7 +62,7 @@ void* train_thread(void* parameters) {
 
         wanted_output = generate_wanted_output(labels[index[i]], 10);
         loss += compute_mean_squared_error(network->input[network->size-1][0][0], wanted_output, 10);
-        free(wanted_output);
+        gree(wanted_output);
 
         backward_propagation(network, labels[index[i]]);
diff --git a/src/include/utils.h b/src/include/utils.h
index 3305c44..b18d59c 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -45,4 +45,12 @@ extern "C"
 * Copier des valeurs d'un tableau de dimension 3 de mémoire partagée
 */
 void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension2, int dimension3);
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+/*
+* Remplir un tableau de 0.
+*/
+void reset_3d_array(float*** source, int dimension1, int dimension2, int dimension3);
 #endif
\ No newline at end of file
diff --git a/src/memory_management.c b/src/memory_management.c
index b07b3ca..c895694 100644
--- a/src/memory_management.c
+++ b/src/memory_management.c
@@ -5,6 +5,7 @@
 
 #include "include/memory_management.h"
 #include "include/colors.h"
+#include "include/utils.h"
 
 Memory* memory = NULL;
 
@@ -56,6 +57,9 @@ Memory* create_memory_block(size_t size) {
     Memory* mem = (Memory*)malloc(sizeof(Memory));
     #ifdef __CUDACC__
     cudaMallocManaged(&(mem->start), size, cudaMemAttachHost);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
     #else
     mem->start = malloc(size);
     #endif
@@ -93,6 +97,7 @@ void* allocate_memory(int nb_elements, size_t size, Memory* mem) {
         //printf("Mémoire disponible: %ld. Nécessaire: %ld\n", mem->size - ((intptr_t)mem->cursor - (intptr_t)mem->start), nb_elements*size);
         // Sinon on continue sur l'élément suivant de la liste
         if (!mem->next) {
+            //! WARNING: May cause infinite allocations when trying to allocate more than MEMORY_BLOCK size at once that is not naturally aligned (CUDA only)
             mem->next = create_memory_block(MEMORY_BLOCK < nb_elements*size ? nb_elements*size : MEMORY_BLOCK);
         }
         return allocate_memory(nb_elements, size, mem->next);
diff --git a/src/memory_management.cu b/src/memory_management.cu
index 6d2791e..c895694 100644
--- a/src/memory_management.cu
+++ b/src/memory_management.cu
@@ -5,6 +5,7 @@
 
 #include "include/memory_management.h"
 #include "include/colors.h"
+#include "include/utils.h"
 
 Memory* memory = NULL;
 
@@ -56,6 +57,9 @@ Memory* create_memory_block(size_t size) {
     Memory* mem = (Memory*)malloc(sizeof(Memory));
     #ifdef __CUDACC__
     cudaMallocManaged(&(mem->start), size, cudaMemAttachHost);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
     #else
     mem->start = malloc(size);
     #endif
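The WARNING added in allocate_memory describes a real failure mode: a request larger than MEMORY_BLOCK creates a block of exactly nb_elements*size bytes, but under CUDA the allocator can lose part of that block to alignment. The request then does not fit, the recursion reaches the end of the list again, and another identical too-small block is created, without end. Condensed view of the loop (fits and carve are stand-ins for the real bookkeeping, not functions from the project):

void* allocate_memory(int nb_elements, size_t size, Memory* mem) {
    if (fits(mem, nb_elements * size)) {     // alignment can shave off usable bytes
        return carve(mem, nb_elements * size);
    }
    if (!mem->next) {
        // Block sized exactly to the request: if alignment losses make it
        // too small, the recursion below re-enters this branch indefinitely
        mem->next = create_memory_block(MEMORY_BLOCK < nb_elements*size ? nb_elements*size : MEMORY_BLOCK);
    }
    return allocate_memory(nb_elements, size, mem->next);
}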
diff --git a/src/utils.c b/src/utils.c
index 5cee8ee..f8073eb 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -92,4 +92,39 @@ void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension
         }
     }
 }
+#endif
+
+#ifdef __CUDACC__
+__global__ void reset_3d_array_kernel(float*** dest, int dimension1, int dimension2, int dimension3) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < dimension1
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dimension2
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dimension3
+
+    if (idx >= dimension1 || idy >= dimension2 || idz >= dimension3) {
+        return;
+    }
+
+    dest[idx][idy][idz] = 0.;
+}
+
+extern "C"
+void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
+    dim3 gridSize(i_div_up(dimension1, BLOCKSIZE_x), i_div_up(dimension2, BLOCKSIZE_y), i_div_up(dimension3, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    reset_3d_array_kernel<<<gridSize, blockSize>>>(dest, dimension1, dimension2, dimension3);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#else
+void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
+    for (int i=0; i < dimension1; i++) {
+        for (int j=0; j < dimension2; j++) {
+            for (int k=0; k < dimension3; k++) {
+                dest[i][j][k] = 0.;
+            }
+        }
+    }
+}
 #endif
\ No newline at end of file
diff --git a/src/utils.cu b/src/utils.cu
index bc60c5f..f8073eb 100644
--- a/src/utils.cu
+++ b/src/utils.cu
@@ -73,7 +73,6 @@ __global__ void copy_3d_array_kernel(float*** source, float*** dest, int dimensi
     dest[idx][idy][idz] = source[idx][idy][idz];
 }
 
-extern "C"
 void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension2, int dimension3) {
     dim3 gridSize(i_div_up(dimension1, BLOCKSIZE_x), i_div_up(dimension2, BLOCKSIZE_y), i_div_up(dimension3, BLOCKSIZE_z));
     dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
@@ -93,4 +92,39 @@ void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension
         }
     }
 }
+#endif
+
+#ifdef __CUDACC__
+__global__ void reset_3d_array_kernel(float*** dest, int dimension1, int dimension2, int dimension3) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < dimension1
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dimension2
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dimension3
+
+    if (idx >= dimension1 || idy >= dimension2 || idz >= dimension3) {
+        return;
+    }
+
+    dest[idx][idy][idz] = 0.;
+}
+
+extern "C"
+void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
+    dim3 gridSize(i_div_up(dimension1, BLOCKSIZE_x), i_div_up(dimension2, BLOCKSIZE_y), i_div_up(dimension3, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    reset_3d_array_kernel<<<gridSize, blockSize>>>(dest, dimension1, dimension2, dimension3);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#else
+void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
+    for (int i=0; i < dimension1; i++) {
+        for (int j=0; j < dimension2; j++) {
+            for (int k=0; k < dimension3; k++) {
+                dest[i][j][k] = 0.;
+            }
+        }
+    }
+}
 #endif
\ No newline at end of file
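Every kernel launch in this patch sizes its grid with i_div_up, which is also why each kernel begins with a bounds check: the grid is the ceiling of size/BLOCKSIZE, so the last block usually contains threads past the end of the data. Assuming the helper in src/utils.c is the usual ceiling division, it amounts to:

// Ceiling division: smallest g with g*b >= a (for positive a and b)
int i_div_up(int a, int b) {
    return (a + b - 1) / b;
}

// Example: 28 cells with BLOCKSIZE_x = 10 gives i_div_up(28, 10) = 3
// blocks of 10 threads; the 2 surplus threads hit the idx >= size guard.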