Mirror of https://github.com/augustin64/projet-tipe (synced 2025-02-02 19:39:39 +01:00)

Compare commits: 9 commits, eeff720ae4 ... 710306a286
SHA1:
710306a286
d6d03162b2
05315a3567
a3a803ba40
7511856621
5088c415d6
dd16e34cce
953c92ac61
2ee1bc4079
Makefile (6 lines changed)
@@ -101,7 +101,7 @@ $(BUILDDIR)/cnn-main-cuda: $(BUILDDIR)/cnn_main.cuda.o \
 		$(BUILDDIR)/cnn_free.cuda.o \
 		$(BUILDDIR)/cnn_jpeg.cuda.o \
 		$(BUILDDIR)/cnn_cuda_convolution.o \
-		$(BUILDDIR)/cnn_backpropagation.cuda.o \
+		$(BUILDDIR)/cnn_cuda_backpropagation.o \
 		$(BUILDDIR)/colors.cuda.o \
 		$(BUILDDIR)/cuda_memory_management.o \
 		$(BUILDDIR)/mnist.cuda.o \
@@ -126,7 +126,7 @@ $(BUILDDIR)/cnn_%.cuda.o: $(CNN_SRCDIR)/%.c $(CNN_SRCDIR)/include/%.h
 
 ifdef NVCC_INSTALLED
 $(BUILDDIR)/cnn_cuda_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -c -dc $< -o $@
 else
 $(BUILDDIR)/cnn_cuda_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
 	@echo "$(NVCC) not found, skipping"
@@ -142,7 +142,7 @@ $(BUILDDIR)/%.cuda.o: $(SRCDIR)/%.c $(SRCDIR)/include/%.h
 
 ifdef NVCC_INSTALLED
 $(BUILDDIR)/cuda_%.o: $(SRCDIR)/%.cu $(SRCDIR)/include/%.h
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -c -dc $< -o $@
 else
 	@echo "$(NVCC) not found, skipping"
 endif
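The only functional change in these rules is the added `-dc` flag. `nvcc -dc` (short for `--device-c`) emits relocatable device code, which is required once device code in one object must be linked against device code in another; here, presumably, because the kernels added below call activation functions through `__device__` function pointers defined in a separate translation unit. A minimal sketch of the situation that needs the flag (hypothetical files, not part of this repository):

```cuda
/* a.cu (hypothetical): defines a device function. */
__device__ float square(float x) { return x * x; }

/* b.cu (hypothetical): calls the device function defined in a.cu.
   This cross-file device call only links if both files were compiled
   with relocatable device code and then device-linked, e.g.:
     nvcc -dc a.cu -o a.o
     nvcc -dc b.cu -o b.o
     nvcc a.o b.o -o prog    (nvcc performs the device-link step) */
extern __device__ float square(float x);

__global__ void square_all(float* v, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n) {
        v[i] = square(v[i]);
    }
}
```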
src/cnn/backpropagation.c

@@ -3,8 +4,12 @@
 #include <math.h>
 
 #include "include/backpropagation.h"
+#include "../include/utils.h"
 #include "include/struct.h"
 
+#include "include/config.h"
+
 #ifndef __CUDACC__
 int min(int a, int b) {
     return a<b?a:b;
 }
@@ -12,8 +16,38 @@ int min(int a, int b) {
 int max(int a, int b) {
     return a > b ? a : b;
 }
 #endif
 
-void softmax_backward_mse(float* input, float* output, int size) {
+/*
+* Softmax backward MSE
+*/
+#ifdef __CUDACC__
+__global__ void softmax_backward_mse_kernel(float* input, float* output, int size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (idx >= size) {
+        return;
+    }
+
+    float input_val = input[idx]; // float, not int: an int would truncate the activation and zero the gradient
+    float output_val = output[idx];
+
+    input[idx] = (output_val-input_val)*input_val*(1-input_val);
+}
+
+void softmax_backward_mse_device(float* input, float* output, int size) {
+    // Make computation
+    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
+    dim3 blockSize(BLOCKSIZE_x);
+
+    softmax_backward_mse_kernel<<<gridSize, blockSize>>>(input, output, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void softmax_backward_mse_cpu(float* input, float* output, int size) {
     /* Input and output have the same size */
 
     for (int i=0; i < size; i++){
@@ -21,7 +55,42 @@ void softmax_backward_mse(float* input, float* output, int size) {
     }
 }
 
-void softmax_backward_cross_entropy(float* input, float* output, int size) {
+void softmax_backward_mse(float* input, float* output, int size) {
+    #ifdef __CUDACC__
+    softmax_backward_mse_device(input, output, size);
+    #else
+    softmax_backward_mse_cpu(input, output, size);
+    #endif
+}
+
+
+/*
+* Softmax backward Cross entropy
+*/
+#ifdef __CUDACC__
+__global__ void softmax_backward_cross_entropy_kernel(float* input, float* output, int size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (idx >= size) {
+        return;
+    }
+
+    input[idx] = output[idx] - input[idx];
+}
+
+void softmax_backward_cross_entropy_device(float* input, float* output, int size) {
+    // Make computation
+    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
+    dim3 blockSize(BLOCKSIZE_x);
+
+    softmax_backward_cross_entropy_kernel<<<gridSize, blockSize>>>(input, output, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void softmax_backward_cross_entropy_cpu(float* input, float* output, int size) {
     /* Input and output have the same size */
 
     for (int i=0; i < size; i++){
@@ -29,16 +98,60 @@ void softmax_backward_cross_entropy(float* input, float* output, int size) {
     }
 }
 
-void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+void softmax_backward_cross_entropy(float* input, float* output, int size) {
+    #ifdef __CUDACC__
+    softmax_backward_cross_entropy_device(input, output, size);
+    #else
+    softmax_backward_cross_entropy_cpu(input, output, size);
+    #endif
+}
+
+
+/*
+* Backward average pooling
+*/
+#ifdef __CUDACC__
+__global__ void backward_average_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
+    // Respective equivalents of i, j and k in the loop run by the CPU version
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width
+
+    if (idx >= depth || idy >= output_width || idz >= output_width) {
+        return;
+    }
+
+    for (int a=0; a < size; a++) {
+        for (int b=0; b < size; b++) {
+            input[idx][size*idy +a][size*idz +b] += output[idx][idy][idz]/n;
+        }
+    }
+}
+
+
+void backward_average_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    int size = input_width/output_width; // Pooling size
+
+    reset_3d_array(input, depth, input_width, input_width);
+
+    backward_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_average_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
     /* Input and output have the same depth */
 
     int size = input_width/output_width; // Pooling size
     int n = size*size; // Number of elements in the pooling window
 
-    for (int a=0; a < depth; a++)
-        for (int b=0; b < input_width; b++)
-            for (int c=0; c < input_width; c++)
-                input[a][b][c] = 0;
+    reset_3d_array(input, depth, input_width, input_width);
 
     for (int i=0; i < depth; i++) {
         for (int j=0; j < output_width; j++) {
@@ -53,7 +166,65 @@ void backward_average_pooling(float*** input, float*** output, int input_width,
     }
 }
 
-void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+    #ifndef __CUDACC__
+    backward_average_pooling_cpu(input, output, input_width, output_width, depth);
+    #else
+    backward_average_pooling_device(input, output, input_width, output_width, depth);
+    #endif
+}
+
+
+/*
+* Backward max pooling
+*/
+#ifdef __CUDACC__
+__global__ void backward_max_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
+    // Respective equivalents of i, j and k in the loop run by the CPU version
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width
+
+    if (idx >= depth || idy >= output_width || idz >= output_width) {
+        return;
+    }
+
+    float m = -FLT_MAX;
+    int a_max = -1;
+    int b_max = -1;
+
+    for (int a=0; a < size; a++) {
+        for (int b=0; b < size; b++) {
+            if (input[idx][size*idy +a][size*idz +b] > m) {
+                m = input[idx][size*idy +a][size*idz +b];
+                a_max = a;
+                b_max = b;
+            }
+            input[idx][size*idy +a][size*idz +b] = 0;
+        }
+    }
+    input[idx][size*idy +a_max][size*idz +b_max] = output[idx][idy][idz]/n;
+}
+
+
+void backward_max_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    int size = input_width/output_width; // Pooling size
+
+    backward_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_max_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
     int size = input_width/output_width;
 
     float m; // Maximum
@@ -82,7 +253,78 @@ void backward_max_pooling(float*** input, float*** output, int input_width, int
     }
 }
 
-void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_function, int is_first) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
+    #ifndef __CUDACC__
+    backward_max_pooling_cpu(input, output, input_width, output_width, depth);
+    #else
+    backward_max_pooling_device(input, output, input_width, output_width, depth);
+    #endif
+}
+
+/*
+* Backward Dense
+*/
+#ifdef __CUDACC__
+__global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* output, int size_input, int size_output) {
+    // Respective equivalents of i and j in the loop run by the CPU version
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < size_output
+
+    if (idx >= size_input || idy >= size_output) {
+        return;
+    }
+
+    if (idx == 0) {
+        ker->d_bias[idy] += output[idy];
+    }
+    ker->d_weights[idx][idy] += input[idx]*output[idy];
+}
+
+__global__ void backward_dense_kernel_2(float** weights, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
+
+    if (idx >= size_input) {
+        return;
+    }
+
+    float tmp=0;
+    for (int j=0; j < size_output; j++) {
+        tmp += output[j]*weights[idx][j];
+    }
+    input[idx] = tmp*( (*d_f)(input_z[idx]) );
+}
+
+void backward_dense_device(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+    // Make computation
+    dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x), i_div_up(size_output, BLOCKSIZE_y));
+    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y);
+
+    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker, input, output, size_input, size_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Second kernel
+    if (is_first != 1) {
+        dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x));
+        dim3 blockSize1(BLOCKSIZE_x);
+
+        funcPtr d_function = get_activation_function_cuda(activation);
+
+        backward_dense_kernel_2<<<gridSize1, blockSize1>>>(ker->weights, input, input_z, output, size_input, size_output, d_function);
+
+        gpuErrchk( cudaPeekAtLastError() );
+        gpuErrchk( cudaDeviceSynchronize() );
+    }
+}
+#endif
+
+void backward_dense_cpu(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+
+    funcPtr d_function = get_activation_function(activation);
     // Bias
     for (int j=0; j < size_output; j++) {
         ker->d_bias[j] += output[j];
@@ -109,7 +351,85 @@ void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output,
     }
 }
 
-void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_function) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
+    #ifndef __CUDACC__
+    backward_dense_cpu(ker, input, input_z, output, size_input, size_output, activation, is_first);
+    #else
+    backward_dense_device(ker, input, input_z, output, size_input, size_output, activation, is_first);
+    #endif
+}
+
+
+
+/*
+* Backward linearisation
+*/
+#ifdef __CUDACC__
+__global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input, float* output, int depth_input, int dim_input, int size_output) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+
+    int id = idx*dim_input*dim_input + idy*dim_input + idz;
+
+    for (int j=0; j < size_output; j++) {
+        ker->d_weights[id][j] += input[idx][idy][idz]*output[j];
+    }
+    if (id == 0) {
+        for (int j=0; j < size_output; j++) {
+            ker->d_bias[j] += output[j];
+        }
+    }
+}
+
+__global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+    int id = idx*dim_input*dim_input + idy*dim_input + idz;
+
+    float tmp=0;
+    for (int j=0; j < size_output; j++) {
+        tmp += output[j]*ker->weights[id][j];
+    }
+    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+    // Make computation
+    dim3 gridSize(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker, input, output, depth_input, dim_input, size_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Second kernel
+    funcPtr d_function = get_activation_function_cuda(activation);
+
+    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker, input, input_z, output, depth_input, dim_input, size_output, d_function);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+void backward_linearisation_cpu(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+
+    funcPtr d_function = get_activation_function(activation);
+
     // Bias
     for (int j=0; j < size_output; j++) {
         ker->d_bias[j] += output[j];
@@ -144,7 +464,119 @@ void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, fl
     }
 }
 
-void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, funcPtr d_function, int is_first) {
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
+    #ifndef __CUDACC__
+    backward_linearisation_cpu(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
+    #else
+    backward_linearisation_device(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
+    #endif
+}
+
+/*
+* Backward convolution
+*/
+#ifdef __CUDACC__
+__global__ void backward_convolution_dbias_kernel(Kernel_cnn* ker, float*** output, int depth_output, int dim_output) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    if (idx >= depth_output || idy >= dim_output || idz >= dim_output) {
+        return;
+    }
+    ker->d_bias[idx][idy][idz] += output[idx][idy][idz];
+}
+
+__global__ void backward_convolution_dweight_kernel(Kernel_cnn* ker, float*** input, float*** output, int depth_input, int depth_output, int dim_output, int k_size) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    int idz1 = idz / k_size;
+    int idz2 = idz % k_size;
+
+    if (idx >= depth_input || idy >= depth_output || idz1 >= k_size || idz2 >= k_size) {
+        return;
+    }
+
+    float tmp = 0;
+    for (int l=0; l < dim_output; l++) {
+        for (int m=0; m < dim_output; m++) {
+            tmp += input[idx][l+idz1][m+idz2]*output[idy][l][m];
+        }
+    }
+    ker->d_weights[idx][idy][idz1][idz2] += tmp;
+}
+
+__global__ void backward_convolution_propagate_kernel(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int k_size, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+
+    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
+        return;
+    }
+
+    int min_m, max_m, min_n, max_n;
+    float tmp = 0;
+    for (int l=0; l < depth_output; l++) {
+        min_m = max(0, k_size-1-idy);
+        max_m = min(k_size, dim_input - idy);
+        min_n = max(0, k_size-1-idz);
+        max_n = min(k_size, dim_input-idz);
+        for (int m=min_m; m < max_m; m++) {
+            for (int n=min_n; n < max_n; n++) {
+                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*ker->weights[idx][l][m][n];
+            }
+        }
+    }
+    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+void backward_convolution_device(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+    // Bias Kernel
+    dim3 gridSize1(i_div_up(depth_output, BLOCKSIZE_x), i_div_up(dim_output, BLOCKSIZE_y), i_div_up(dim_output, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize1
+    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_convolution_dbias_kernel<<<gridSize1, blockSize1>>>(ker, output, depth_output, dim_output);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Weights Kernel
+    int k_size = dim_input - dim_output +1;
+
+    dim3 gridSize2(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(depth_output, BLOCKSIZE_y), i_div_up(k_size*k_size, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize2
+    dim3 blockSize2(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(ker, input, output, depth_input, depth_output, dim_output, k_size);
+
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // input propagation Kernel
+    if (is_first != 1) {
+        dim3 gridSize3(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize3
+        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+        funcPtr d_function = get_activation_function_cuda(activation);
+
+        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(ker, input, input_z, output, depth_input, dim_input, depth_output, k_size, d_function);
+
+        gpuErrchk( cudaPeekAtLastError() );
+        gpuErrchk( cudaDeviceSynchronize() );
+    }
+}
+#endif
+
+
+void backward_convolution_cpu(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+
+    funcPtr d_function = get_activation_function(activation);
+
     // Bias
     for (int i=0; i < depth_output; i++) {
         for (int j=0; j < dim_output; j++) {
@@ -197,3 +629,14 @@ void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, flo
         }
     }
 }
+
+#ifdef __CUDACC__
+extern "C"
+#endif
+void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
+    #ifndef __CUDACC__
+    backward_convolution_cpu(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
+    #else
+    backward_convolution_device(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
+    #endif
+}
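`i_div_up` and `gpuErrchk` come from `../include/utils.h` and are not shown in this diff. From the way they are used above, they presumably match the conventional definitions sketched here (an assumption, not the repository's actual code):

```cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Assumed: ceiling division, so that gridSize * blockSize covers every element
static int i_div_up(int a, int b) {
    return (a + b - 1) / b;
}

// Assumed: the classic CUDA error-check wrapper
static void gpuAssert(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
```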
src/cnn/backpropagation.cu (Normal file, 642 lines)
@@ -0,0 +1,642 @@
#include <stdio.h>
#include <float.h>
#include <math.h>

#include "include/backpropagation.h"
#include "../include/utils.h"
#include "include/struct.h"

#include "include/config.h"

#ifndef __CUDACC__
int min(int a, int b) {
    return a<b?a:b;
}

int max(int a, int b) {
    return a > b ? a : b;
}
#endif

/*
* Softmax backward MSE
*/
#ifdef __CUDACC__
__global__ void softmax_backward_mse_kernel(float* input, float* output, int size) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x;

    if (idx >= size) {
        return;
    }

    float input_val = input[idx]; // float, not int: an int would truncate the activation and zero the gradient
    float output_val = output[idx];

    input[idx] = (output_val-input_val)*input_val*(1-input_val);
}

void softmax_backward_mse_device(float* input, float* output, int size) {
    // Make computation
    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
    dim3 blockSize(BLOCKSIZE_x);

    softmax_backward_mse_kernel<<<gridSize, blockSize>>>(input, output, size);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void softmax_backward_mse_cpu(float* input, float* output, int size) {
    /* Input and output have the same size */

    for (int i=0; i < size; i++){
        input[i] = (output[i]-input[i])*input[i]*(1-input[i]);
    }
}

void softmax_backward_mse(float* input, float* output, int size) {
    #ifdef __CUDACC__
    softmax_backward_mse_device(input, output, size);
    #else
    softmax_backward_mse_cpu(input, output, size);
    #endif
}


/*
* Softmax backward Cross entropy
*/
#ifdef __CUDACC__
__global__ void softmax_backward_cross_entropy_kernel(float* input, float* output, int size) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x;

    if (idx >= size) {
        return;
    }

    input[idx] = output[idx] - input[idx];
}

void softmax_backward_cross_entropy_device(float* input, float* output, int size) {
    // Make computation
    dim3 gridSize(i_div_up(size, BLOCKSIZE_x));
    dim3 blockSize(BLOCKSIZE_x);

    softmax_backward_cross_entropy_kernel<<<gridSize, blockSize>>>(input, output, size);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void softmax_backward_cross_entropy_cpu(float* input, float* output, int size) {
    /* Input and output have the same size */

    for (int i=0; i < size; i++){
        input[i] = output[i] - input[i];
    }
}

void softmax_backward_cross_entropy(float* input, float* output, int size) {
    #ifdef __CUDACC__
    softmax_backward_cross_entropy_device(input, output, size);
    #else
    softmax_backward_cross_entropy_cpu(input, output, size);
    #endif
}


/*
* Backward average pooling
*/
#ifdef __CUDACC__
__global__ void backward_average_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
    // Respective equivalents of i, j and k in the loop run by the CPU version
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width

    if (idx >= depth || idy >= output_width || idz >= output_width) {
        return;
    }

    for (int a=0; a < size; a++) {
        for (int b=0; b < size; b++) {
            input[idx][size*idy +a][size*idz +b] += output[idx][idy][idz]/n;
        }
    }
}


void backward_average_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
    // Make computation
    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    int size = input_width/output_width; // Pooling size

    reset_3d_array(input, depth, input_width, input_width);

    backward_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void backward_average_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
    /* Input and output have the same depth */

    int size = input_width/output_width; // Pooling size
    int n = size*size; // Number of elements in the pooling window

    reset_3d_array(input, depth, input_width, input_width);

    for (int i=0; i < depth; i++) {
        for (int j=0; j < output_width; j++) {
            for (int k=0; k < output_width; k++) {
                for (int a=0; a < size; a++) {
                    for (int b=0; b < size; b++) {
                        input[i][size*j +a][size*k +b] += output[i][j][k]/n;
                    }
                }
            }
        }
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
    #ifndef __CUDACC__
    backward_average_pooling_cpu(input, output, input_width, output_width, depth);
    #else
    backward_average_pooling_device(input, output, input_width, output_width, depth);
    #endif
}


/*
* Backward max pooling
*/
#ifdef __CUDACC__
__global__ void backward_max_pooling_kernel(float*** input, float*** output, int input_width, int output_width, int depth, int n, int size) {
    // Respective equivalents of i, j and k in the loop run by the CPU version
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_width
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_width

    if (idx >= depth || idy >= output_width || idz >= output_width) {
        return;
    }

    float m = -FLT_MAX;
    int a_max = -1;
    int b_max = -1;

    for (int a=0; a < size; a++) {
        for (int b=0; b < size; b++) {
            if (input[idx][size*idy +a][size*idz +b] > m) {
                m = input[idx][size*idy +a][size*idz +b];
                a_max = a;
                b_max = b;
            }
            input[idx][size*idy +a][size*idz +b] = 0;
        }
    }
    input[idx][size*idy +a_max][size*idz +b_max] = output[idx][idy][idz]/n;
}


void backward_max_pooling_device(float*** input, float*** output, int input_width, int output_width, int depth) {
    // Make computation
    dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    int size = input_width/output_width; // Pooling size

    backward_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, input_width, output_width, depth, size*size, size);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void backward_max_pooling_cpu(float*** input, float*** output, int input_width, int output_width, int depth) {
    int size = input_width/output_width;

    float m; // Maximum
    int a_max, b_max; // Indices of the maximum

    for (int i=0; i < depth; i++) {
        for (int j=0; j < output_width; j++) {
            for (int k=0; k < output_width; k++) {
                m = -FLT_MAX;
                a_max = -1;
                b_max = -1;

                for (int a=0; a < size; a++) {
                    for (int b=0; b < size; b++) {
                        if (input[i][size*j +a][size*k +b] > m) {
                            m = input[i][size*j +a][size*k +b];
                            a_max = a;
                            b_max = b;
                        }
                        input[i][size*j +a][size*k +b] = 0;
                    }
                }
                input[i][size*j +a_max][size*k +b_max] = output[i][j][k]/(size*size);
            }
        }
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth) {
    #ifndef __CUDACC__
    backward_max_pooling_cpu(input, output, input_width, output_width, depth);
    #else
    backward_max_pooling_device(input, output, input_width, output_width, depth);
    #endif
}

/*
* Backward Dense
*/
#ifdef __CUDACC__
__global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* output, int size_input, int size_output) {
    // Respective equivalents of i and j in the loop run by the CPU version
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < size_output

    if (idx >= size_input || idy >= size_output) {
        return;
    }

    if (idx == 0) {
        ker->d_bias[idy] += output[idy];
    }
    ker->d_weights[idx][idy] += input[idx]*output[idy];
}

__global__ void backward_dense_kernel_2(float** weights, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_f) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input

    if (idx >= size_input) {
        return;
    }

    float tmp=0;
    for (int j=0; j < size_output; j++) {
        tmp += output[j]*weights[idx][j];
    }
    input[idx] = tmp*( (*d_f)(input_z[idx]) );
}

void backward_dense_device(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
    // Make computation
    dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x), i_div_up(size_output, BLOCKSIZE_y));
    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y);

    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker, input, output, size_input, size_output);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // Second kernel
    if (is_first != 1) {
        dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x));
        dim3 blockSize1(BLOCKSIZE_x);

        funcPtr d_function = get_activation_function_cuda(activation);

        backward_dense_kernel_2<<<gridSize1, blockSize1>>>(ker->weights, input, input_z, output, size_input, size_output, d_function);

        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
    }
}
#endif

void backward_dense_cpu(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {

    funcPtr d_function = get_activation_function(activation);
    // Bias
    for (int j=0; j < size_output; j++) {
        ker->d_bias[j] += output[j];
    }

    // Weights
    for (int i=0; i < size_input; i++) {
        for (int j=0; j < size_output; j++) {
            ker->d_weights[i][j] += input[i]*output[j];
        }
    }

    // Input
    if (is_first==1) { // No need to backpropagate into the input
        return;
    }

    for (int i=0; i < size_input; i++) {
        float tmp=0;
        for (int j=0; j < size_output; j++) {
            tmp += output[j]*ker->weights[i][j];
        }
        input[i] = tmp*d_function(input_z[i]);
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first) {
    #ifndef __CUDACC__
    backward_dense_cpu(ker, input, input_z, output, size_input, size_output, activation, is_first);
    #else
    backward_dense_device(ker, input, input_z, output, size_input, size_output, activation, is_first);
    #endif
}



/*
* Backward linearisation
*/
#ifdef __CUDACC__
__global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input, float* output, int depth_input, int dim_input, int size_output) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input

    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
        return;
    }

    int id = idx*dim_input*dim_input + idy*dim_input + idz;

    for (int j=0; j < size_output; j++) {
        ker->d_weights[id][j] += input[idx][idy][idz]*output[j];
    }
    if (id == 0) {
        for (int j=0; j < size_output; j++) {
            ker->d_bias[j] += output[j];
        }
    }
}

__global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_f) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth_input
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dim_input
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dim_input

    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
        return;
    }
    int id = idx*dim_input*dim_input + idy*dim_input + idz;

    float tmp=0;
    for (int j=0; j < size_output; j++) {
        tmp += output[j]*ker->weights[id][j];
    }
    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
}

void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
    // Make computation
    dim3 gridSize(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker, input, output, depth_input, dim_input, size_output);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // Second kernel
    funcPtr d_function = get_activation_function_cuda(activation);

    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker, input, input_z, output, depth_input, dim_input, size_output, d_function);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif

void backward_linearisation_cpu(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {

    funcPtr d_function = get_activation_function(activation);

    // Bias
    for (int j=0; j < size_output; j++) {
        ker->d_bias[j] += output[j];
    }

    // Weights
    int cpt = 0;
    for (int i=0; i < depth_input; i++) {
        for (int k=0; k < dim_input; k++) {
            for (int l=0; l < dim_input; l++) {
                for (int j=0; j < size_output; j++) {
                    ker->d_weights[cpt][j] += input[i][k][l]*output[j];
                }
                cpt++;
            }
        }
    }

    // Input
    cpt = 0;
    for (int i=0; i < depth_input; i++) {
        for (int k=0; k < dim_input; k++) {
            for (int l=0; l < dim_input; l++) {
                float tmp=0;
                for (int j=0; j < size_output; j++) {
                    tmp += output[j]*ker->weights[cpt][j];
                }
                input[i][k][l] = tmp*d_function(input_z[i][k][l]);
                cpt++;
            }
        }
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation) {
    #ifndef __CUDACC__
    backward_linearisation_cpu(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
    #else
    backward_linearisation_device(ker, input, input_z, output, depth_input, dim_input, size_output, activation);
    #endif
}

/*
* Backward convolution
*/
#ifdef __CUDACC__
__global__ void backward_convolution_dbias_kernel(Kernel_cnn* ker, float*** output, int depth_output, int dim_output) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    int idy = threadIdx.y + blockDim.y*blockIdx.y;
    int idz = threadIdx.z + blockDim.z*blockIdx.z;

    if (idx >= depth_output || idy >= dim_output || idz >= dim_output) {
        return;
    }
    ker->d_bias[idx][idy][idz] += output[idx][idy][idz];
}

__global__ void backward_convolution_dweight_kernel(Kernel_cnn* ker, float*** input, float*** output, int depth_input, int depth_output, int dim_output, int k_size) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    int idy = threadIdx.y + blockDim.y*blockIdx.y;
    int idz = threadIdx.z + blockDim.z*blockIdx.z;

    int idz1 = idz / k_size;
    int idz2 = idz % k_size;

    if (idx >= depth_input || idy >= depth_output || idz1 >= k_size || idz2 >= k_size) {
        return;
    }

    float tmp = 0;
    for (int l=0; l < dim_output; l++) {
        for (int m=0; m < dim_output; m++) {
            tmp += input[idx][l+idz1][m+idz2]*output[idy][l][m];
        }
    }
    ker->d_weights[idx][idy][idz1][idz2] += tmp;
}

__global__ void backward_convolution_propagate_kernel(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int k_size, funcPtr d_f) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    int idy = threadIdx.y + blockDim.y*blockIdx.y;
    int idz = threadIdx.z + blockDim.z*blockIdx.z;

    if (idx >= depth_input || idy >= dim_input || idz >= dim_input) {
        return;
    }

    int min_m, max_m, min_n, max_n;
    float tmp = 0;
    for (int l=0; l < depth_output; l++) {
        min_m = max(0, k_size-1-idy);
        max_m = min(k_size, dim_input - idy);
        min_n = max(0, k_size-1-idz);
        max_n = min(k_size, dim_input-idz);
        for (int m=min_m; m < max_m; m++) {
            for (int n=min_n; n < max_n; n++) {
                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*ker->weights[idx][l][m][n];
            }
        }
    }
    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
}

void backward_convolution_device(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
    // Bias Kernel
    dim3 gridSize1(i_div_up(depth_output, BLOCKSIZE_x), i_div_up(dim_output, BLOCKSIZE_y), i_div_up(dim_output, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize1
    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    backward_convolution_dbias_kernel<<<gridSize1, blockSize1>>>(ker, output, depth_output, dim_output);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // Weights Kernel
    int k_size = dim_input - dim_output +1;

    dim3 gridSize2(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(depth_output, BLOCKSIZE_y), i_div_up(k_size*k_size, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize2
    dim3 blockSize2(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(ker, input, output, depth_input, depth_output, dim_output, k_size);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // input propagation Kernel
    if (is_first != 1) {
        dim3 gridSize3(i_div_up(depth_input, BLOCKSIZE_x), i_div_up(dim_input, BLOCKSIZE_y), i_div_up(dim_input, BLOCKSIZE_z)); // third dimension rounded with BLOCKSIZE_z to match blockSize3
        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

        funcPtr d_function = get_activation_function_cuda(activation);

        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(ker, input, input_z, output, depth_input, dim_input, depth_output, k_size, d_function);

        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
    }
}
#endif


void backward_convolution_cpu(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {

    funcPtr d_function = get_activation_function(activation);

    // Bias
    for (int i=0; i < depth_output; i++) {
        for (int j=0; j < dim_output; j++) {
            for (int k=0; k < dim_output; k++) {
                ker->d_bias[i][j][k] += output[i][j][k];
            }
        }
    }

    // Weights
    int k_size = dim_input - dim_output +1;

    for (int h=0; h < depth_input; h++) {
        for (int i=0; i < depth_output; i++) {
            for (int j=0; j < k_size; j++) {
                for (int k=0; k < k_size; k++) {
                    float tmp = 0;
                    for (int l=0; l < dim_output; l++) {
                        for (int m=0; m < dim_output; m++) {
                            tmp += input[h][l+j][m+k]*output[i][l][m];
                        }
                    }
                    ker->d_weights[h][i][j][k] += tmp;
                }
            }
        }
    }

    // Input
    if (is_first==1) // No need to backpropagate into the input
        return;
    int min_m, max_m, min_n, max_n;
    for (int i=0; i < depth_input; i++) {
        for (int j=0; j < dim_input; j++) {
            for (int k=0; k < dim_input; k++) {
                float tmp = 0;
                for (int l=0; l < depth_output; l++) {
                    min_m = max(0, k_size-1-j);
                    max_m = min(k_size, dim_input - j);
                    min_n = max(0, k_size-1-k);
                    max_n = min(k_size, dim_input-k);
                    for (int m=min_m; m < max_m; m++) {
                        for (int n=min_n; n < max_n; n++) {
                            tmp += output[l][j-k_size+m+1][k-k_size+n+1]*ker->weights[i][l][m][n];
                        }
                    }
                }
                input[i][j][k] = tmp*d_function(input_z[i][j][k]);
            }
        }
    }
}

#ifdef __CUDACC__
extern "C"
#endif
void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first) {
    #ifndef __CUDACC__
    backward_convolution_cpu(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
    #else
    backward_convolution_device(ker, input, input_z, output, depth_input, dim_input, depth_output, dim_output, activation, is_first);
    #endif
}
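Every launch in this file follows the same sizing rule: round the element count up to a whole number of blocks with `i_div_up`, then let the early-return bounds check in the kernel discard the excess threads. A quick sanity check of that arithmetic (BLOCKSIZE_x = 16 is an assumption for illustration; the real values live in include/config.h):

```c
#include <assert.h>

#define BLOCKSIZE_x 16 // assumed value for illustration, see include/config.h

static int i_div_up(int a, int b) { return (a + b - 1) / b; }

int main(void) {
    int size = 1000;                           // e.g. a dense layer of 1000 neurons
    int blocks = i_div_up(size, BLOCKSIZE_x);  // 63 blocks
    assert(blocks * BLOCKSIZE_x >= size);      // 1008 threads launched, 8 of them idle
    assert((blocks - 1) * BLOCKSIZE_x < size); // but no entirely idle block
    return 0;
}
```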
@@ -4,6 +4,7 @@
 #include <float.h> // Is it used ?
 #include <math.h>
 
+#include "../include/memory_management.h"
 #include "include/backpropagation.h"
 #include "include/initialisation.h"
 #include "include/function.h"
@@ -226,7 +227,7 @@ void backward_propagation(Network* network, int wanted_number) {
     // Backward pass on the last layer, which always uses SOFTMAX
     float* wanted_output = generate_wanted_output(wanted_number, network->width[network->size -1]); // Desired output, used to initialise an error
     softmax_backward_cross_entropy(network->input[n-1][0][0], wanted_output, network->width[n-1]);
-    free(wanted_output);
+    gree(wanted_output);
 
     /*
     * At each step, we propagate:
@@ -252,14 +253,12 @@ void backward_propagation(Network* network, int wanted_number) {
 
 
         if (k_i->cnn) { // Convolution
-            funcPtr d_f = get_activation_function(-activation);
-            backward_convolution(k_i->cnn, input, input_z, output, input_depth, input_width, output_depth, output_width, d_f, i==0);
+            backward_convolution(k_i->cnn, input, input_z, output, input_depth, input_width, output_depth, output_width, -activation, i==0);
         } else if (k_i->nn) { // Full connection
-            funcPtr d_f = get_activation_function(-activation);
             if (k_i->linearisation == DOESNT_LINEARISE) { // Vector -> Vector
-                backward_dense(k_i->nn, input[0][0], input_z[0][0], output[0][0], input_width, output_width, d_f, i==0);
+                backward_dense(k_i->nn, input[0][0], input_z[0][0], output[0][0], input_width, output_width, -activation, i==0);
             } else { // Matrix -> vector
-                backward_linearisation(k_i->nn, input, input_z, output[0][0], input_depth, input_width, output_width, d_f);
+                backward_linearisation(k_i->nn, input, input_z, output[0][0], input_depth, input_width, output_width, -activation);
             }
         } else { // Pooling
             if (k_i->pooling == AVG_POOLING) {
@@ -313,7 +312,7 @@ float compute_cross_entropy_loss(float* output, float* wanted_output, int len) {
 }
 
 float* generate_wanted_output(int wanted_number, int size_output) {
-    float* wanted_output = (float*)malloc(sizeof(float)*size_output);
+    float* wanted_output = (float*)nalloc(size_output, sizeof(float));
     for (int i=0; i < size_output; i++) {
         if (i==wanted_number) {
            wanted_output[i]=1;
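The `malloc`/`free` to `nalloc`/`gree` swap above matters because `wanted_output` is handed to `softmax_backward_cross_entropy`, which on the CUDA build runs a kernel over it: the buffer must come from the project's allocator in `../include/memory_management.h` so the GPU can address it, and a buffer must be released by the same allocator that produced it. The real allocator is not shown in this diff; a plausible minimal sketch of such a pair, assuming CUDA managed memory, is:

```cuda
#include <stdlib.h>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#endif

// Hypothetical sketch only: the repository's nalloc/gree live in
// src/include/memory_management.h and may be implemented differently.
void* nalloc(size_t nb_elements, size_t element_size) {
#ifdef __CUDACC__
    void* ptr = NULL;
    cudaMallocManaged(&ptr, nb_elements * element_size, cudaMemAttachGlobal);
    return ptr; // addressable from both host and device
#else
    return malloc(nb_elements * element_size);
#endif
}

void gree(void* ptr) {
#ifdef __CUDACC__
    cudaFree(ptr);
#else
    free(ptr);
#endif
}
```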
@@ -5,26 +5,21 @@
 #include "../include/colors.h"
 #include "../include/utils.h"
 
-#include "include/function.h"
-
 #include "include/config.h"
 
+#include "include/function.h"
+
 //* Identity
 #ifdef __CUDACC__
-__device__ float device_identity(float x) {
-    return x;
-}
-
-__device__ float device_identity_derivative(float x) {
-    (void)x;
-    return 1;
-}
+__host__ __device__
 #endif
 
 float identity(float x) {
     return x;
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 float identity_derivative(float x) {
     (void)x;
     return 1;
@@ -33,20 +28,15 @@ float identity_derivative(float x) {
 
 //* Sigmoid
 #ifdef __CUDACC__
-__device__ float device_sigmoid(float x) {
-    return 1/(1 + exp(-x));
-}
-
-__device__ float device_sigmoid_derivative(float x) {
-    float tmp = exp(-x);
-    return tmp/((1+tmp)*(1+tmp));
-}
+__host__ __device__
 #endif
 
 float sigmoid(float x) {
     return 1/(1 + exp(-x));
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 float sigmoid_derivative(float x) {
     float tmp = exp(-x);
     return tmp/((1+tmp)*(1+tmp));
@@ -55,21 +45,15 @@ float sigmoid_derivative(float x) {
 
 //* RELU
 #ifdef __CUDACC__
-__device__ float device_relu(float x) {
-    return fmaxf(0, fminf(x, RELU_CLIP_VALUE));
-}
-
-__device__ float device_relu_derivative(float x) {
-    if (x > 0)
-        return 1;
-    return 0;
-}
+__host__ __device__
 #endif
 
 float relu(float x) {
     return fmaxf(0, fminf(x, RELU_CLIP_VALUE));
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 float relu_derivative(float x) {
     if (x > 0)
         return 1;
@@ -79,25 +63,17 @@ float relu_derivative(float x) {
 
 //* Leaky RELU
 #ifdef __CUDACC__
-__device__ float device_leaky_relu(float x) {
-    if (x>0)
-        return fminf(x, RELU_CLIP_VALUE);
-    return x*LEAKER;
-}
-
-__device__ float device_leaky_relu_derivative(float x) {
-    if (x > 0)
-        return 1;
-    return LEAKER;
-}
+__host__ __device__
 #endif
 
 float leaky_relu(float x) {
     if (x>0)
         return fminf(x, RELU_CLIP_VALUE);
     return x*LEAKER;
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 float leaky_relu_derivative(float x) {
     if (x > 0)
         return 1;
@@ -107,24 +83,15 @@ float leaky_relu_derivative(float x) {
 
 //* Tanh
 #ifdef __CUDACC__
-__device__
+__host__ __device__
 #endif
-float device_tanh_(float x) {
-    return tanh(x);
-}
-
-#ifdef __CUDACC__
-__device__
-#endif
-float device_tanh_derivative(float x) {
-    float a = tanh(x);
-    return 1 - a*a;
-}
-
 float tanh_(float x) {
     return tanh(x);
 }
 
 #ifdef __CUDACC__
+__host__ __device__
 #endif
 float tanh_derivative(float x) {
     float a = tanh(x);
     return 1 - a*a;
@@ -138,17 +105,17 @@ float tanh_derivative(float x) {
 * Definition of the function pointers for CUDA
 * see https://stackoverflow.com/a/15646771
 */
-__device__ funcPtr ptr_sigmoid = device_sigmoid;
-__device__ funcPtr ptr_relu = device_relu;
-__device__ funcPtr ptr_leaky_relu = device_leaky_relu;
-__device__ funcPtr ptr_tanh = device_tanh_;
-__device__ funcPtr ptr_identity = device_identity;
+__device__ funcPtr ptr_sigmoid = sigmoid;
+__device__ funcPtr ptr_relu = relu;
+__device__ funcPtr ptr_leaky_relu = leaky_relu;
+__device__ funcPtr ptr_tanh = tanh_;
+__device__ funcPtr ptr_identity = identity;
 
-__device__ funcPtr ptr_identity_derivative = device_identity_derivative;
-__device__ funcPtr ptr_sigmoid_derivative = device_sigmoid_derivative;
-__device__ funcPtr ptr_relu_derivative = device_relu_derivative;
-__device__ funcPtr ptr_leaky_relu_derivative = device_leaky_relu_derivative;
-__device__ funcPtr ptr_tanh_derivative = device_tanh_derivative;
+__device__ funcPtr ptr_identity_derivative = identity_derivative;
+__device__ funcPtr ptr_sigmoid_derivative = sigmoid_derivative;
+__device__ funcPtr ptr_relu_derivative = relu_derivative;
+__device__ funcPtr ptr_leaky_relu_derivative = leaky_relu_derivative;
+__device__ funcPtr ptr_tanh_derivative = tanh_derivative;
 #endif
 
 
@@ -303,6 +270,7 @@ funcPtr get_activation_function(int activation) {
 
 
 #ifdef __CUDACC__
+extern "C"
 funcPtr get_activation_function_cuda(int activation) {
     funcPtr host_function;
 
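The switch from separate `device_*` wrappers to single `__host__ __device__` functions lets one symbol serve both builds, and the `__device__ funcPtr ptr_*` globals exist because a device function's address cannot be taken directly from host code. Following the stackoverflow answer cited above, `get_activation_function_cuda` (whose body lies outside this hunk) presumably copies the pointer out of device memory with `cudaMemcpyFromSymbol`; a sketch under that assumption:

```cuda
// Sketch of the pattern from https://stackoverflow.com/a/15646771.
// SIGMOID and TANH are assumed constant names; the actual body of
// get_activation_function_cuda is not shown in this diff.
funcPtr get_activation_function_cuda(int activation) {
    funcPtr host_function = NULL;
    switch (activation) {
        case SIGMOID:
            gpuErrchk( cudaMemcpyFromSymbol(&host_function, ptr_sigmoid, sizeof(funcPtr)) );
            break;
        case TANH:
            gpuErrchk( cudaMemcpyFromSymbol(&host_function, ptr_tanh, sizeof(funcPtr)) );
            break;
        default:
            break;
    }
    return host_function; // a device-side address, valid as a kernel argument
}
```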
@ -5,26 +5,21 @@
#include "../include/colors.h"
#include "../include/utils.h"

#include "include/function.h"

#include "include/config.h"

#include "include/function.h"

//* Identity
#ifdef __CUDACC__
__device__ float device_identity(float x) {
    return x;
}

__device__ float device_identity_derivative(float x) {
    (void)x;
    return 1;
}
__host__ __device__
#endif

float identity(float x) {
    return x;
}

#ifdef __CUDACC__
__host__ __device__
#endif
float identity_derivative(float x) {
    (void)x;
    return 1;
@ -33,20 +28,15 @@ float identity_derivative(float x) {

//* Sigmoid
#ifdef __CUDACC__
__device__ float device_sigmoid(float x) {
    return 1/(1 + exp(-x));
}

__device__ float device_sigmoid_derivative(float x) {
    float tmp = exp(-x);
    return tmp/((1+tmp)*(1+tmp));
}
__host__ __device__
#endif

float sigmoid(float x) {
    return 1/(1 + exp(-x));
}

#ifdef __CUDACC__
__host__ __device__
#endif
float sigmoid_derivative(float x) {
    float tmp = exp(-x);
    return tmp/((1+tmp)*(1+tmp));
@ -55,21 +45,15 @@ float sigmoid_derivative(float x) {

//* RELU
#ifdef __CUDACC__
__device__ float device_relu(float x) {
    return fmaxf(0, fminf(x, RELU_CLIP_VALUE));
}

__device__ float device_relu_derivative(float x) {
    if (x > 0)
        return 1;
    return 0;
}
__host__ __device__
#endif

float relu(float x) {
    return fmaxf(0, fminf(x, RELU_CLIP_VALUE));
}

#ifdef __CUDACC__
__host__ __device__
#endif
float relu_derivative(float x) {
    if (x > 0)
        return 1;
@ -79,25 +63,17 @@ float relu_derivative(float x) {

//* Leaky RELU
#ifdef __CUDACC__
__device__ float device_leaky_relu(float x) {
    if (x>0)
        return fminf(x, RELU_CLIP_VALUE);
    return x*LEAKER;
}

__device__ float device_leaky_relu_derivative(float x) {
    if (x > 0)
        return 1;
    return LEAKER;
}
__host__ __device__
#endif

float leaky_relu(float x) {
    if (x>0)
        return fminf(x, RELU_CLIP_VALUE);
    return x*LEAKER;
}

#ifdef __CUDACC__
__host__ __device__
#endif
float leaky_relu_derivative(float x) {
    if (x > 0)
        return 1;
@ -107,24 +83,15 @@ float leaky_relu_derivative(float x) {

//* Tanh
#ifdef __CUDACC__
__device__
__host__ __device__
#endif
float device_tanh_(float x) {
    return tanh(x);
}

#ifdef __CUDACC__
__device__
#endif
float device_tanh_derivative(float x) {
    float a = tanh(x);
    return 1 - a*a;
}

float tanh_(float x) {
    return tanh(x);
}

#ifdef __CUDACC__
__host__ __device__
#endif
float tanh_derivative(float x) {
    float a = tanh(x);
    return 1 - a*a;
@ -138,17 +105,17 @@ float tanh_derivative(float x) {
* Definition of the function pointers for CUDA
* see https://stackoverflow.com/a/15646771
*/
__device__ funcPtr ptr_sigmoid = device_sigmoid;
__device__ funcPtr ptr_relu = device_relu;
__device__ funcPtr ptr_leaky_relu = device_leaky_relu;
__device__ funcPtr ptr_tanh = device_tanh_;
__device__ funcPtr ptr_identity = device_identity;
__device__ funcPtr ptr_sigmoid = sigmoid;
__device__ funcPtr ptr_relu = relu;
__device__ funcPtr ptr_leaky_relu = leaky_relu;
__device__ funcPtr ptr_tanh = tanh_;
__device__ funcPtr ptr_identity = identity;

__device__ funcPtr ptr_identity_derivative = device_identity_derivative;
__device__ funcPtr ptr_sigmoid_derivative = device_sigmoid_derivative;
__device__ funcPtr ptr_relu_derivative = device_relu_derivative;
__device__ funcPtr ptr_leaky_relu_derivative = device_leaky_relu_derivative;
__device__ funcPtr ptr_tanh_derivative = device_tanh_derivative;
__device__ funcPtr ptr_identity_derivative = identity_derivative;
__device__ funcPtr ptr_sigmoid_derivative = sigmoid_derivative;
__device__ funcPtr ptr_relu_derivative = relu_derivative;
__device__ funcPtr ptr_leaky_relu_derivative = leaky_relu_derivative;
__device__ funcPtr ptr_tanh_derivative = tanh_derivative;
#endif


@ -303,6 +270,7 @@ funcPtr get_activation_function(int activation) {

#ifdef __CUDACC__
extern "C"
funcPtr get_activation_function_cuda(int activation) {
    funcPtr host_function;
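The pointer tables above implement the trick from the linked Stack Overflow answer: host code cannot take the address of a `__device__` function directly, so the address is stored in a `__device__` variable and fetched with cudaMemcpyFromSymbol. A minimal standalone sketch of the pattern (hypothetical names, not the repository's exact code):

    #include <cstdio>

    typedef float (*funcPtr)(float);

    __host__ __device__ float square(float x) {
        return x * x;
    }

    // The device-side address is stored in a __device__ variable,
    // because the host cannot name it directly.
    __device__ funcPtr ptr_square = square;

    __global__ void apply(funcPtr f, float* x) {
        *x = (*f)(*x);
    }

    int main() {
        // Copy the pointer *value* out of the device symbol; the result is a
        // device-side address that can be passed back in as a kernel argument.
        funcPtr host_copy;
        cudaMemcpyFromSymbol(&host_copy, ptr_square, sizeof(funcPtr));

        float* x;
        cudaMallocManaged(&x, sizeof(float));
        *x = 3.0f;
        apply<<<1, 1>>>(host_copy, x);
        cudaDeviceSynchronize();
        printf("%f\n", *x);  // prints 9.000000
        cudaFree(x);
        return 0;
    }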
@ -14,42 +14,70 @@ int min(int a, int b);
*/
int max(int a, int b);


#ifdef __CUDACC__
extern "C"
#endif
/*
* Propagates the error information from the desired output back to the actual output,
* using MSE (Mean Squared Error) as the error function
*/
void softmax_backward_mse(float* input, float* output, int size);


#ifdef __CUDACC__
extern "C"
#endif
/*
* Propagates the error information from the desired output back to the actual output,
* using cross-entropy as the error function
*/
void softmax_backward_cross_entropy(float* input, float* output, int size);


#ifdef __CUDACC__
extern "C"
#endif
/*
* Propagates the error information through an average pooling layer
*/
void backward_average_pooling(float*** input, float*** output, int input_width, int output_width, int depth);


#ifdef __CUDACC__
extern "C"
#endif
/*
* Propagates the error information through a max pooling layer
*/
void backward_max_pooling(float*** input, float*** output, int input_width, int output_width, int depth);


#ifdef __CUDACC__
extern "C"
#endif
/*
* Propagates the error information through a fully connected layer
*/
void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_function, int is_first);
void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output, int size_input, int size_output, int activation, int is_first);


#ifdef __CUDACC__
extern "C"
#endif
/*
* Propagates the error information through a linearisation layer
*/
void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, funcPtr d_function);
void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, float* output, int depth_input, int dim_input, int size_output, int activation);


#ifdef __CUDACC__
extern "C"
#endif
/*
* Propagates the error information through a convolution layer
*/
void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, funcPtr d_function, int is_first);
void backward_convolution(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int depth_input, int dim_input, int depth_output, int dim_output, int activation, int is_first);

#endif
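For intuition, the backward pass of average pooling simply spreads each output gradient uniformly over its pooling window. A CPU-side sketch under the assumption of a non-overlapping square window (an illustration only, not the repository's implementation):

    // Distributes every output gradient equally over the input cells of its
    // window; with non-overlapping windows each input cell gets one share.
    void backward_average_pooling_sketch(float*** input, float*** output,
                                         int input_width, int output_width, int depth) {
        int size = input_width / output_width;   // assumed integer pooling factor
        float inv_area = 1.f / (size * size);
        for (int d = 0; d < depth; d++)
            for (int i = 0; i < output_width; i++)
                for (int j = 0; j < output_width; j++)
                    for (int a = 0; a < size; a++)
                        for (int b = 0; b < size; b++)
                            input[d][size*i + a][size*j + b] = output[d][i][j] * inv_area;
    }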
@ -39,6 +39,8 @@
#define NETWORK_CLIP_VALUE 300

//* CUDA parameters
// The product of the 3 dimensions must be at most 1024 (reachable with 8*8*16)
// Lowering it helps avoid "Out of memory" errors when launching kernels
#define BLOCKSIZE_x 10
#define BLOCKSIZE_y 10
#define BLOCKSIZE_z 10
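As a worked example of that constraint: with the values above a block holds 10*10*10 = 1000 threads, just under the 1024-threads-per-block hardware limit. A standalone sketch of how the grid is then derived (i_div_up is assumed to be the ceiling-division helper from src/utils):

    #include <cstdio>

    #define BLOCKSIZE_x 10
    #define BLOCKSIZE_y 10
    #define BLOCKSIZE_z 10

    // Assumed equivalent of the project's i_div_up helper: ceiling division.
    static int i_div_up(int a, int b) {
        return (a + b - 1) / b;
    }

    __global__ void noop() {}

    int main() {
        int depth = 25, rows = 32, columns = 7;
        dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);  // 1000 threads
        dim3 gridSize(i_div_up(depth, BLOCKSIZE_x),
                      i_div_up(rows, BLOCKSIZE_y),
                      i_div_up(columns, BLOCKSIZE_z));          // (3, 4, 1)
        noop<<<gridSize, blockSize>>>();
        printf("grid = (%d, %d, %d)\n", gridSize.x, gridSize.y, gridSize.z);
        return cudaDeviceSynchronize() != cudaSuccess;
    }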
@ -19,82 +19,67 @@
typedef float (*funcPtr)(float);

//* Identity
#ifdef __CUDACC__
__device__ float device_identity(float x);
__device__ float device_identity_derivative(float x);
#endif

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float identity(float x);

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float identity_derivative(float x);

//* Sigmoid
#ifdef __CUDACC__
__device__ float device_sigmoid(float x);
__device__ float device_sigmoid_derivative(float x);
#endif

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float sigmoid(float x);

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float sigmoid_derivative(float x);

//* RELU
#ifdef __CUDACC__
__device__ float device_relu(float x);
__device__ float device_relu_derivative(float x);
#endif

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float relu(float x);

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float relu_derivative(float x);

//* Leaky RELU
#ifdef __CUDACC__
__device__ float device_leaky_relu(float x);
__device__ float device_leaky_relu_derivative(float x);
#endif

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float leaky_relu(float x);

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float leaky_relu_derivative(float x);

//* Tanh
#ifdef __CUDACC__
__device__ float device_tanh_(float x);
__device__ float device_tanh_derivative(float x);
#endif

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float tanh_(float x);

#ifdef __CUDACC__
extern "C"
__host__ __device__
#endif
float tanh_derivative(float x);

@ -142,6 +127,9 @@ funcPtr get_activation_function(int activation);
/*
* Gets a pointer on the device to the requested activation function, then converts it into a pointer on the host
*/
#ifdef __CUDACC__
extern "C"
funcPtr get_activation_function_cuda(int activation);
#endif

#endif
@ -51,7 +51,7 @@ float* test_network_mnist(Network* network, char* images_file, char* labels_file
        // Compute loss
        wanted_output = generate_wanted_output(labels[i], 10);
        loss += compute_mean_squared_error(network->input[network->size-1][0][0], wanted_output, 10);
        free(wanted_output);
        gree(wanted_output);

        for (int j=0; j < height; j++) {
            free(images[i][j]);
@ -60,7 +60,7 @@ float* test_network_mnist(Network* network, char* images_file, char* labels_file
    }
    free(images);

    float* results = malloc(sizeof(float)*2);
    float* results = (float*)malloc(sizeof(float)*2);
    results[0] = 100*accuracy/(float)nb_elem;
    results[1] = loss/(float)nb_elem;
    return results;
@ -90,7 +90,7 @@ float* test_network_jpg(Network* network, char* data_dir, bool preview_fails, bo
        free(dataset->images[i]);
    }

    float* results = malloc(sizeof(float)*2);
    float* results = (float*)malloc(sizeof(float)*2);
    results[0] = 100*accuracy/(float)dataset->numImages;
    results[1] = 0;
@ -62,7 +62,7 @@ void* train_thread(void* parameters) {

        wanted_output = generate_wanted_output(labels[index[i]], 10);
        loss += compute_mean_squared_error(network->input[network->size-1][0][0], wanted_output, 10);
        free(wanted_output);
        gree(wanted_output);

        backward_propagation(network, labels[index[i]]);
@ -45,4 +45,12 @@ extern "C"
* Copy the values of a 3-dimensional array in shared memory
*/
void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension2, int dimension3);

#ifdef __CUDACC__
extern "C"
#endif
/*
* Fill an array with 0s.
*/
void reset_3d_array(float*** source, int dimension1, int dimension2, int dimension3);
#endif
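A minimal usage sketch for these helpers, assuming a tensor allocated with the project's nalloc allocator and illustrative dimensions:

    int depth = 4, rows = 8, columns = 8;
    float*** tensor = (float***)nalloc(depth, sizeof(float**));
    for (int i = 0; i < depth; i++) {
        tensor[i] = (float**)nalloc(rows, sizeof(float*));
        for (int j = 0; j < rows; j++) {
            tensor[i][j] = (float*)nalloc(columns, sizeof(float));
        }
    }
    // Zero every element; dispatches to the CUDA kernel or the CPU loop
    // depending on how the translation unit was compiled.
    reset_3d_array(tensor, depth, rows, columns);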
@ -5,6 +5,7 @@

#include "include/memory_management.h"
#include "include/colors.h"
#include "include/utils.h"


Memory* memory = NULL;
@ -56,6 +57,9 @@ Memory* create_memory_block(size_t size) {
    Memory* mem = (Memory*)malloc(sizeof(Memory));
#ifdef __CUDACC__
    cudaMallocManaged(&(mem->start), size, cudaMemAttachHost);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
#else
    mem->start = malloc(size);
#endif
@ -93,6 +97,7 @@ void* allocate_memory(int nb_elements, size_t size, Memory* mem) {
    //printf("Available memory: %ld. Required: %ld\n", mem->size - ((intptr_t)mem->cursor - (intptr_t)mem->start), nb_elements*size);
    // Otherwise, continue with the next element of the list
    if (!mem->next) {
        //! WARNING: May cause infinite allocations when trying to allocate more than MEMORY_BLOCK size at once that is not naturally aligned (CUDA only)
        mem->next = create_memory_block(MEMORY_BLOCK < nb_elements*size ? nb_elements*size : MEMORY_BLOCK);
    }
    return allocate_memory(nb_elements, size, mem->next);
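Roughly, the cudaMemAttachHost flag makes a fresh managed block initially accessible from the host only, so the CPU can initialise it without synchronising against kernels running in other streams. A standalone sketch of this allocation path (illustrative, without the repository's block bookkeeping):

    #include <cstdio>

    int main() {
        float* start = NULL;
        // Same call as in create_memory_block: managed memory, initially
        // attached to the host.
        cudaMallocManaged((void**)&start, 1024 * sizeof(float), cudaMemAttachHost);

        for (int i = 0; i < 1024; i++) {
            start[i] = 0.f;  // direct CPU writes, no explicit cudaMemcpy
        }

        printf("block at %p\n", (void*)start);
        cudaFree(start);
        return 0;
    }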
35
src/utils.c
@ -92,4 +92,39 @@ void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension
        }
    }
}
#endif

#ifdef __CUDACC__
__global__ void reset_3d_array_kernel(float*** dest, int dimension1, int dimension2, int dimension3) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < dimension1
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dimension2
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dimension3

    if (idx >= dimension1 || idy >= dimension2 || idz >= dimension3) {
        return;
    }

    dest[idx][idy][idz] = 0.;
}

extern "C"
void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
    dim3 gridSize(i_div_up(dimension1, BLOCKSIZE_x), i_div_up(dimension2, BLOCKSIZE_y), i_div_up(dimension3, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    reset_3d_array_kernel<<<gridSize, blockSize>>>(dest, dimension1, dimension2, dimension3);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#else
void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
    for (int i=0; i < dimension1; i++) {
        for (int j=0; j < dimension2; j++) {
            for (int k=0; k < dimension3; k++) {
                dest[i][j][k] = 0.;
            }
        }
    }
}
#endif
36
src/utils.cu
@ -73,7 +73,6 @@ __global__ void copy_3d_array_kernel(float*** source, float*** dest, int dimensi
    dest[idx][idy][idz] = source[idx][idy][idz];
}

extern "C"
void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension2, int dimension3) {
    dim3 gridSize(i_div_up(dimension1, BLOCKSIZE_x), i_div_up(dimension2, BLOCKSIZE_y), i_div_up(dimension3, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
@ -93,4 +92,39 @@ void copy_3d_array(float*** source, float*** dest, int dimension1, int dimension
        }
    }
}
#endif

#ifdef __CUDACC__
__global__ void reset_3d_array_kernel(float*** dest, int dimension1, int dimension2, int dimension3) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < dimension1
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < dimension2
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < dimension3

    if (idx >= dimension1 || idy >= dimension2 || idz >= dimension3) {
        return;
    }

    dest[idx][idy][idz] = 0.;
}

extern "C"
void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
    dim3 gridSize(i_div_up(dimension1, BLOCKSIZE_x), i_div_up(dimension2, BLOCKSIZE_y), i_div_up(dimension3, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    reset_3d_array_kernel<<<gridSize, blockSize>>>(dest, dimension1, dimension2, dimension3);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#else
void reset_3d_array(float*** dest, int dimension1, int dimension2, int dimension3) {
    for (int i=0; i < dimension1; i++) {
        for (int j=0; j < dimension2; j++) {
            for (int k=0; k < dimension3; k++) {
                dest[i][j][k] = 0.;
            }
        }
    }
}
#endif
@ -7,17 +7,25 @@
#include "../src/include/colors.h"
#include "../src/include/utils.h"

#include "../src/cnn/include/config.h"

int main() {
    printf("Checking CUDA compatibility.\n");
    bool cuda_compatible = check_cuda_compatibility();
    if (!cuda_compatible) {
        printf(RED "CUDA not compatible, skipping tests.\n" RESET);
        return 0;
__global__ void local_kernel(funcPtr f, float*** input, int depth, int rows, int columns) {
    // Respective equivalents of i, j and k in the loop performed by the CPU
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < rows
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < columns

    if (idx >= depth || idy >= rows || idz >= columns) {
        return;
    }
    printf(GREEN "OK\n" RESET);

    printf("Initialisation OK\n");
    input[idx][idy][idz] = (*f)(input[idx][idy][idz]);
}


void test1(int activation, bool use_local_kernel) {
    printf("Test on function %d\n", activation);
    printf("\tInitialisation OK\n");
    // Initialise values
    int depth = 10;
    int rows = 10;
@ -32,27 +40,40 @@ int main() {
            input[i][j] = (float*)nalloc(columns, sizeof(float));
            input_initial[i][j] = (float*)malloc(columns*sizeof(float));
            for (int k=0; k < columns; k++) {
                input[i][j][k] = rand()/RAND_MAX;
                input[i][j][k] = rand()/(float)RAND_MAX;
                input_initial[i][j][k] = input[i][j][k];
            }
        }
    }
    printf(GREEN "OK\n" RESET);
    printf("\t" GREEN "OK\n" RESET);

    funcPtr func = get_activation_function(TANH);
    funcPtr func_cpu = get_activation_function(activation);

    printf("Computing with CUDA\n");
    apply_function_input(TANH, input, depth, rows, columns);
    printf(GREEN "OK\n" RESET);
    if (!use_local_kernel) {
        printf("\tComputing with CUDA\n");
        apply_function_input(activation, input, depth, rows, columns);
    } else {
        printf("\tComputing with CUDA using the local kernel\n");
        dim3 gridSize(i_div_up(depth, BLOCKSIZE_x), i_div_up(rows, BLOCKSIZE_y), i_div_up(columns, BLOCKSIZE_z));
        dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    printf("Checking the results\n");
        funcPtr function_cuda = get_activation_function_cuda(activation);

        local_kernel<<<gridSize, blockSize>>>(function_cuda, input, depth, rows, columns);

        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
    }
    printf("\t" GREEN "OK\n" RESET);

    printf("\tChecking the results\n");
    for (int i=0; i < depth; i++) {
        for (int j=0; j < rows; j++) {
            for (int k=0; k < columns; k++) {
                if (fabs((*func)(input_initial[i][j][k]) - input[i][j][k]) > 1e-6) {
                if (fabs((*func_cpu)(input_initial[i][j][k]) - input[i][j][k]) > 1e-6) {
                    printf_error((char*)"The results do not match\n");
                    printf("Difference %e\n", fabs((*func)(input_initial[i][j][k]) - input[i][j][k]));
                    //exit(1);
                    printf("Difference %e\n", fabs((*func_cpu)(input_initial[i][j][k]) - input[i][j][k]));
                    exit(1);
                }
            }
            gree(input[i][j]);
@ -64,6 +85,26 @@ int main() {
    gree(input);
    free(input_initial);

    printf("\t" GREEN "OK\n" RESET);
    printf(GREEN "OK\n" RESET);
}

int main() {
    printf("Checking CUDA compatibility.\n");
    bool cuda_compatible = check_cuda_compatibility();
    if (!cuda_compatible) {
        printf(RED "CUDA not compatible, skipping tests.\n" RESET);
        return 0;
    }
    printf(GREEN "OK\n" RESET);

    for (int i=1; i < 7; i++) {
        if (i != 5) { // Exclude SOFTMAX
            test1(i, false); // use function i
            test1(-i, false); // use function i'
            test1(i, true); // use function i in the kernel declared in this file
            test1(-i, true); // use function i' in the kernel declared in this file
        }
    }
    return 0;
}