cnn/config: add FREE_ALL_OPT

backpropagation: fix misaligned addresses
backpropagation: Fix padding issues
2025-04-23 05:23:54 +02:00 · 2023-05-20 22:45:40 +02:00 · 2023-05-20 20:15:36 +02:00 · 2023-05-20 20:04:22 +02:00
6 changed files with 175 additions and 121 deletions
--- a/src/cnn/backpropagation.c
+++ b/src/cnn/backpropagation.c
@ -287,7 +287,7 @@ void backward_max_pooling(float*** input, float*** output, int input_width, int
 * Backward Dense
 */
 #ifdef __CUDACC__
-__global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* output, int size_input, int size_output) {
+__global__ void backward_dense_kernel_1(float** d_weights, float* d_bias, float* input, float* output, int size_input, int size_output) {
    // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < size_output
@ -297,9 +297,9 @@ __global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* out
    }

    if (idx == 0) {
-        ker->d_bias[idy] += output[idy];
+        d_bias[idy] += output[idy];
    }
-    ker->d_weights[idx][idy] += input[idx]*output[idy];
+    d_weights[idx][idy] += input[idx]*output[idy];
 }

 __global__ void backward_dense_kernel_2(float** weights, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_f) {
@ -321,7 +321,7 @@ void backward_dense_device(Kernel_nn* ker, float* input, float* input_z, float*
    dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x), i_div_up(size_output, BLOCKSIZE_y));
    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y);

-    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker, input, output, size_input, size_output);
+    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker->d_weights, ker->d_bias, input, output, size_input, size_output);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
@ -387,7 +387,7 @@ void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output,
 * Backward linearisation
 */
 #ifdef __CUDACC__
-__global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input, float* output, int input_depth, int input_width, int size_output) {
+__global__ void backward_linearisation_kernel_1(float** d_weights, float* d_bias, float*** input, float* output, int input_depth, int input_width, int size_output) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < input_depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < input_width
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < input_width
@ -399,16 +399,16 @@ __global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input,
    int id = idx*input_width*input_width + idy*input_width + idz;
    
    for (int j=0; j < size_output; j++) {
-        ker->d_weights[id][j] += input[idx][idy][idz]*output[j];
+        d_weights[id][j] += input[idx][idy][idz]*output[j];
    }
    if (id == 0) {
        for (int j=0; j < size_output;  j++) {
-            ker->d_bias[j] += output[j];
+            d_bias[j] += output[j];
        }
    }
 }

-__global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input, float*** input_z, float* output, int input_depth, int input_width, int size_output, funcPtr d_f) {
+__global__ void backward_linearisation_kernel_2(float** weights, float*** input, float*** input_z, float* output, int input_depth, int input_width, int size_output, funcPtr d_f) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < input_depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < input_width
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < input_width
@ -420,7 +420,7 @@ __global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input,

    float tmp=0;
    for (int j=0; j < size_output; j++) {
-        tmp += output[j]*ker->weights[id][j];
+        tmp += output[j]*weights[id][j];
    }
    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
 }
@ -430,7 +430,7 @@ void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** inpu
    dim3 gridSize(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(input_width, BLOCKSIZE_y), i_div_up(input_width, BLOCKSIZE_y));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

-    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker, input, output, input_depth, input_width, size_output);
+    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker->d_weights, ker->d_bias, input, output, input_depth, input_width, size_output);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
@ -438,7 +438,7 @@ void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** inpu
    // Second kernel
    funcPtr d_function = get_activation_function_cuda(activation);

-    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker, input, input_z, output, input_depth, input_width, size_output, d_function);
+    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker->weights, input, input_z, output, input_depth, input_width, size_output, d_function);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
@ -509,50 +509,64 @@ __global__ void backward_convolution_dbias_kernel(float*** d_bias, float*** outp
    d_bias[idx][idy][idz] += output[idx][idy][idz];
 }

-__global__ void backward_convolution_dweight_kernel(float**** d_weights, float*** input, float*** output, int input_depth, int output_depth, int output_width, int kernel_size) {
-    int idx = threadIdx.x + blockDim.x*blockIdx.x;
-    int idy = threadIdx.y + blockDim.y*blockIdx.y;
-    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+__global__ void backward_convolution_dweight_kernel(float**** d_weights, float*** input, float*** output, int input_depth, int output_depth, int input_width, int output_width, int k_size, int stride, int padding) {
+    /*
+    * L'ordre des boucles a été changé par rapport à l'implémentation sur CPU
+    * afin d'utiliser possiblement plus de coeurs à la fois (car en général, depth << width)
+    * En gardant les indices des boucles sur CPU notées h,i,j,k,l,m; on fait donc l,m,i,h,j,k 
+    */
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // l
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // m
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // i

-    int idz1 = idz / kernel_size;
-    int idz2 = idz % kernel_size;
-    
-    if (idx >= input_depth || idy >= output_depth || idz1 >= kernel_size || idz2 >= kernel_size) {
-        return;
-    }
-    
-    float tmp = 0;
-    for (int l=0; l < output_width; l++) {
-        for (int m=0; m < output_width; m++) {
-            tmp += input[idx][l+idz1][m+idz2]*output[idy][l][m];
-        }
-    }
-    d_weights[idx][idy][idz1][idz2] += tmp;
-}
-
-__global__ void backward_convolution_propagate_kernel(float**** weights, float*** input, float*** input_z, float*** output, int input_depth, int input_width, int output_depth, int k_size, funcPtr d_f) {
-    int idx = threadIdx.x + blockDim.x*blockIdx.x;
-    int idy = threadIdx.y + blockDim.y*blockIdx.y;
-    int idz = threadIdx.z + blockDim.z*blockIdx.z;
-
-    if (idx >= input_depth || idy >= input_width || idz >= input_width) {
+    if (idx >= output_width || idy >= output_width || idz >= output_depth) {
        return;
    }

-    int min_m, max_m, min_n, max_n;
-    float tmp = 0;
-    for (int l=0; l < output_depth; l++) {
-        min_m = max(0, k_size-1-idy);
-        max_m = min(k_size, input_width - idy);
-        min_n = max(0, k_size-1-idz);
-        max_n = min(k_size, input_width-idz);
-        for (int m=min_m; m < max_m; m++) {
-            for (int n=min_n; n < max_n; n++) {
-                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*weights[idx][l][m][n];
+    int max_move = k_size - padding;
+    for (int h=0; h < input_depth; h++) {
+        for (int j=-padding; j < max_move; j++) {
+            for (int k=-padding; k < max_move; k++) {
+                if (not_outside(idx*stride+j, idy*stride+k, 0, input_width)) {
+                    atomicAdd(&d_weights[h][idz][j+padding][k+padding], input[h][idx*stride+j][idy*stride+k]*output[idz][idx][idy]);
+                }
            }
        }
    }
-    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+__global__ void backward_convolution_propagate_kernel(float**** weights, float*** input, float*** output, int input_depth, int input_width, int output_width, int output_depth, int k_size, int stride, int padding) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+
+    if (idx >= input_depth || idy >= output_depth) {
+        return;
+    }
+    int max_move = k_size - padding;
+    for (int j=-padding; j < max_move; j++) {
+        for (int k=-padding; k < max_move; k++) {
+            for (int l=0; l < output_width; l++) {
+                for (int m=0; m < output_width; m++) {
+                    if (not_outside(l*stride+j, m*stride+k, 0, input_width)) {
+                        atomicAdd(&input[idx][l*stride+j][m*stride+k], output[idy][l][m]*weights[idx][idy][j+padding][k+padding]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+__global__ void backward_convolution_apply_propagate_kernel(float*** input, float*** input_z, int input_depth, int input_width, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+
+    if (idx >= input_depth || idy >= input_width) {
+        return;
+    }
+
+    for (int k=0; k < input_width; k++) {
+        input[idx][idy][k] = input[idx][idy][k]*d_f(input_z[idx][idy][k]);
+    }
 }

 void backward_convolution_device(Kernel_cnn* kernel, float*** input, float*** input_z, float*** output, int input_depth, int input_width, int output_depth, int output_width, int activation, int is_first, int kernel_size, int padding, int stride) {
@ -565,23 +579,30 @@ void backward_convolution_device(Kernel_cnn* kernel, float*** input, float*** in
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

-    dim3 gridSize2(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(output_depth, BLOCKSIZE_y), i_div_up(kernel_size*kernel_size, BLOCKSIZE_y));
+    dim3 gridSize2(i_div_up(output_width, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_depth, BLOCKSIZE_y));
    dim3 blockSize2(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

-    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(kernel->d_weights, input, output, input_depth, output_depth, output_width, kernel_size);
+    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(kernel->d_weights, input, output, input_depth, output_depth, input_width, output_width, kernel_size, stride, padding);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // input propagation Kernel
    if (is_first != 1) {
-        dim3 gridSize3(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(input_width, BLOCKSIZE_y), i_div_up(input_width, BLOCKSIZE_y));
-        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+        reset_3d_array(input, input_depth, input_width, input_width);
+
+        dim3 gridSize3(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(output_depth, BLOCKSIZE_y));
+        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y);
+
+        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(kernel->weights, input, output, input_depth, input_width, output_width, output_depth, kernel_size, stride, padding);
+
+        dim3 gridSize4(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(input_width, BLOCKSIZE_y));
+        dim3 blockSize4(BLOCKSIZE_x, BLOCKSIZE_y);

        funcPtr d_function = get_activation_function_cuda(activation);

-        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(kernel->weights, input, input_z, output, input_depth, input_width, output_depth, kernel_size, d_function);
-    
+        backward_convolution_apply_propagate_kernel<<<gridSize4, blockSize4>>>(input, input_z, input_depth, input_width, d_function);
+
        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
    }
@ -616,7 +637,7 @@ void backward_convolution_cpu(Kernel_cnn* ker, float*** input, float*** input_z,
                            }
                        }
                    }
-                    ker->d_weights[h][i][j][k] += tmp;
+                    ker->d_weights[h][i][j+padding][k+padding] += tmp;
                }
            }
        }
--- a/src/cnn/backpropagation.cu
+++ b/src/cnn/backpropagation.cu
@ -287,7 +287,7 @@ void backward_max_pooling(float*** input, float*** output, int input_width, int
 * Backward Dense
 */
 #ifdef __CUDACC__
-__global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* output, int size_input, int size_output) {
+__global__ void backward_dense_kernel_1(float** d_weights, float* d_bias, float* input, float* output, int size_input, int size_output) {
    // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_input
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < size_output
@ -297,9 +297,9 @@ __global__ void backward_dense_kernel_1(Kernel_nn* ker, float* input, float* out
    }

    if (idx == 0) {
-        ker->d_bias[idy] += output[idy];
+        d_bias[idy] += output[idy];
    }
-    ker->d_weights[idx][idy] += input[idx]*output[idy];
+    d_weights[idx][idy] += input[idx]*output[idy];
 }

 __global__ void backward_dense_kernel_2(float** weights, float* input, float* input_z, float* output, int size_input, int size_output, funcPtr d_f) {
@ -321,7 +321,7 @@ void backward_dense_device(Kernel_nn* ker, float* input, float* input_z, float*
    dim3 gridSize1(i_div_up(size_input, BLOCKSIZE_x), i_div_up(size_output, BLOCKSIZE_y));
    dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y);

-    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker, input, output, size_input, size_output);
+    backward_dense_kernel_1<<<gridSize1, blockSize1>>>(ker->d_weights, ker->d_bias, input, output, size_input, size_output);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
@ -387,7 +387,7 @@ void backward_dense(Kernel_nn* ker, float* input, float* input_z, float* output,
 * Backward linearisation
 */
 #ifdef __CUDACC__
-__global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input, float* output, int input_depth, int input_width, int size_output) {
+__global__ void backward_linearisation_kernel_1(float** d_weights, float* d_bias, float*** input, float* output, int input_depth, int input_width, int size_output) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < input_depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < input_width
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < input_width
@ -399,16 +399,16 @@ __global__ void backward_linearisation_kernel_1(Kernel_nn* ker, float*** input,
    int id = idx*input_width*input_width + idy*input_width + idz;
    
    for (int j=0; j < size_output; j++) {
-        ker->d_weights[id][j] += input[idx][idy][idz]*output[j];
+        d_weights[id][j] += input[idx][idy][idz]*output[j];
    }
    if (id == 0) {
        for (int j=0; j < size_output;  j++) {
-            ker->d_bias[j] += output[j];
+            d_bias[j] += output[j];
        }
    }
 }

-__global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input, float*** input_z, float* output, int input_depth, int input_width, int size_output, funcPtr d_f) {
+__global__ void backward_linearisation_kernel_2(float** weights, float*** input, float*** input_z, float* output, int input_depth, int input_width, int size_output, funcPtr d_f) {
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < input_depth
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < input_width
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < input_width
@ -420,7 +420,7 @@ __global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input,

    float tmp=0;
    for (int j=0; j < size_output; j++) {
-        tmp += output[j]*ker->weights[id][j];
+        tmp += output[j]*weights[id][j];
    }
    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
 }
@ -430,7 +430,7 @@ void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** inpu
    dim3 gridSize(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(input_width, BLOCKSIZE_y), i_div_up(input_width, BLOCKSIZE_y));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

-    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker, input, output, input_depth, input_width, size_output);
+    backward_linearisation_kernel_1<<<gridSize, blockSize>>>(ker->d_weights, ker->d_bias, input, output, input_depth, input_width, size_output);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
@ -438,7 +438,7 @@ void backward_linearisation_device(Kernel_nn* ker, float*** input, float*** inpu
    // Second kernel
    funcPtr d_function = get_activation_function_cuda(activation);

-    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker, input, input_z, output, input_depth, input_width, size_output, d_function);
+    backward_linearisation_kernel_2<<<gridSize, blockSize>>>(ker->weights, input, input_z, output, input_depth, input_width, size_output, d_function);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
@ -509,50 +509,64 @@ __global__ void backward_convolution_dbias_kernel(float*** d_bias, float*** outp
    d_bias[idx][idy][idz] += output[idx][idy][idz];
 }

-__global__ void backward_convolution_dweight_kernel(float**** d_weights, float*** input, float*** output, int input_depth, int output_depth, int output_width, int kernel_size) {
-    int idx = threadIdx.x + blockDim.x*blockIdx.x;
-    int idy = threadIdx.y + blockDim.y*blockIdx.y;
-    int idz = threadIdx.z + blockDim.z*blockIdx.z;
+__global__ void backward_convolution_dweight_kernel(float**** d_weights, float*** input, float*** output, int input_depth, int output_depth, int input_width, int output_width, int k_size, int stride, int padding) {
+    /*
+    * L'ordre des boucles a été changé par rapport à l'implémentation sur CPU
+    * afin d'utiliser possiblement plus de coeurs à la fois (car en général, depth << width)
+    * En gardant les indices des boucles sur CPU notées h,i,j,k,l,m; on fait donc l,m,i,h,j,k 
+    */
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // l
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // m
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // i

-    int idz1 = idz / kernel_size;
-    int idz2 = idz % kernel_size;
-    
-    if (idx >= input_depth || idy >= output_depth || idz1 >= kernel_size || idz2 >= kernel_size) {
-        return;
-    }
-    
-    float tmp = 0;
-    for (int l=0; l < output_width; l++) {
-        for (int m=0; m < output_width; m++) {
-            tmp += input[idx][l+idz1][m+idz2]*output[idy][l][m];
-        }
-    }
-    d_weights[idx][idy][idz1][idz2] += tmp;
-}
-
-__global__ void backward_convolution_propagate_kernel(float**** weights, float*** input, float*** input_z, float*** output, int input_depth, int input_width, int output_depth, int k_size, funcPtr d_f) {
-    int idx = threadIdx.x + blockDim.x*blockIdx.x;
-    int idy = threadIdx.y + blockDim.y*blockIdx.y;
-    int idz = threadIdx.z + blockDim.z*blockIdx.z;
-
-    if (idx >= input_depth || idy >= input_width || idz >= input_width) {
+    if (idx >= output_width || idy >= output_width || idz >= output_depth) {
        return;
    }

-    int min_m, max_m, min_n, max_n;
-    float tmp = 0;
-    for (int l=0; l < output_depth; l++) {
-        min_m = max(0, k_size-1-idy);
-        max_m = min(k_size, input_width - idy);
-        min_n = max(0, k_size-1-idz);
-        max_n = min(k_size, input_width-idz);
-        for (int m=min_m; m < max_m; m++) {
-            for (int n=min_n; n < max_n; n++) {
-                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*weights[idx][l][m][n];
+    int max_move = k_size - padding;
+    for (int h=0; h < input_depth; h++) {
+        for (int j=-padding; j < max_move; j++) {
+            for (int k=-padding; k < max_move; k++) {
+                if (not_outside(idx*stride+j, idy*stride+k, 0, input_width)) {
+                    atomicAdd(&d_weights[h][idz][j+padding][k+padding], input[h][idx*stride+j][idy*stride+k]*output[idz][idx][idy]);
+                }
            }
        }
    }
-    input[idx][idy][idz] = tmp*( (*d_f)(input_z[idx][idy][idz]) );
+}
+
+__global__ void backward_convolution_propagate_kernel(float**** weights, float*** input, float*** output, int input_depth, int input_width, int output_width, int output_depth, int k_size, int stride, int padding) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+
+    if (idx >= input_depth || idy >= output_depth) {
+        return;
+    }
+    int max_move = k_size - padding;
+    for (int j=-padding; j < max_move; j++) {
+        for (int k=-padding; k < max_move; k++) {
+            for (int l=0; l < output_width; l++) {
+                for (int m=0; m < output_width; m++) {
+                    if (not_outside(l*stride+j, m*stride+k, 0, input_width)) {
+                        atomicAdd(&input[idx][l*stride+j][m*stride+k], output[idy][l][m]*weights[idx][idy][j+padding][k+padding]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+__global__ void backward_convolution_apply_propagate_kernel(float*** input, float*** input_z, int input_depth, int input_width, funcPtr d_f) {
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idy = threadIdx.y + blockDim.y*blockIdx.y;
+
+    if (idx >= input_depth || idy >= input_width) {
+        return;
+    }
+
+    for (int k=0; k < input_width; k++) {
+        input[idx][idy][k] = input[idx][idy][k]*d_f(input_z[idx][idy][k]);
+    }
 }

 void backward_convolution_device(Kernel_cnn* kernel, float*** input, float*** input_z, float*** output, int input_depth, int input_width, int output_depth, int output_width, int activation, int is_first, int kernel_size, int padding, int stride) {
@ -565,23 +579,30 @@ void backward_convolution_device(Kernel_cnn* kernel, float*** input, float*** in
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

-    dim3 gridSize2(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(output_depth, BLOCKSIZE_y), i_div_up(kernel_size*kernel_size, BLOCKSIZE_y));
+    dim3 gridSize2(i_div_up(output_width, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_depth, BLOCKSIZE_y));
    dim3 blockSize2(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

-    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(kernel->d_weights, input, output, input_depth, output_depth, output_width, kernel_size);
+    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(kernel->d_weights, input, output, input_depth, output_depth, input_width, output_width, kernel_size, stride, padding);
    
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // input propagation Kernel
    if (is_first != 1) {
-        dim3 gridSize3(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(input_width, BLOCKSIZE_y), i_div_up(input_width, BLOCKSIZE_y));
-        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+        reset_3d_array(input, input_depth, input_width, input_width);
+
+        dim3 gridSize3(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(output_depth, BLOCKSIZE_y));
+        dim3 blockSize3(BLOCKSIZE_x, BLOCKSIZE_y);
+
+        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(kernel->weights, input, output, input_depth, input_width, output_width, output_depth, kernel_size, stride, padding);
+
+        dim3 gridSize4(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(input_width, BLOCKSIZE_y));
+        dim3 blockSize4(BLOCKSIZE_x, BLOCKSIZE_y);

        funcPtr d_function = get_activation_function_cuda(activation);

-        backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(kernel->weights, input, input_z, output, input_depth, input_width, output_depth, kernel_size, d_function);
-    
+        backward_convolution_apply_propagate_kernel<<<gridSize4, blockSize4>>>(input, input_z, input_depth, input_width, d_function);
+
        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
    }
@ -616,7 +637,7 @@ void backward_convolution_cpu(Kernel_cnn* ker, float*** input, float*** input_z,
                            }
                        }
                    }
-                    ker->d_weights[h][i][j][k] += tmp;
+                    ker->d_weights[h][i][j+padding][k+padding] += tmp;
                }
            }
        }
--- a/src/cnn/free.c
+++ b/src/cnn/free.c
@ -174,7 +174,7 @@ void free_network_creation(Network* network) {
 }

 void free_network(Network* network) {
-    #if defined(USE_CUDA) || defined(TEST_MEMORY_MANAGEMENT)
+    #if (defined(USE_CUDA) || defined(TEST_MEMORY_MANAGEMENT)) && defined(FREE_ALL_OPT)
        // Supprimer toute la mémoire allouée avec nalloc directement
        // Il n'y a alors plus besoin de parcourir tout le réseau,
        // mais il faut que TOUTE la mémoire du réseau ait été allouée de cette manière
--- a/src/cnn/include/config.h
+++ b/src/cnn/include/config.h
@ -2,7 +2,7 @@
 #define DEF_CONFIG_H


-//* Paramètres d'entraînement
+//** Paramètres d'entraînement
 #define EPOCHS 10 // Nombre d'époques par défaut (itérations sur toutes les images)
 #define BATCHES 32 // Nombre d'images à voir avant de mettre le réseau à jour
 #define LEARNING_RATE 3e-4 // Taux d'apprentissage
@ -25,14 +25,6 @@
 //#define ADAM_CNN_BIAS


-//* Paramètre d'optimisation pour un dataset Jpeg
-// keep images in ram e.g re-read and decompress each time
-// Enabling this will lead to a large amount of ram used while economizing not that
-// much computing power
-// Note: 50States10K dataset is 90Go once decompressed, use with caution
-//#define STORE_IMAGES_TO_RAM
-
-
 //* Limite du réseau
 // Des valeurs trop grandes dans le réseau risqueraient de provoquer des overflows notamment.
 // On utilise donc la méthode gradient_clipping,
@ -40,11 +32,29 @@
 // https://arxiv.org/pdf/1905.11881.pdf
 #define NETWORK_CLIP_VALUE 300

-//* Paramètres CUDA
+
+//** Paramètres CUDA
 // Le produit des 3 dimensions doit être au maximum 1024 (atteignable avec 8*8*16)
 // Le réduire permet d'éviter des erreurs "Out of memory" ou "too many resources requested" au lancement des Kernel
 #define BLOCKSIZE_x 8
 #define BLOCKSIZE_y 8
 #define BLOCKSIZE_z 8

+
+//** Paramètres d'optimisation
+//* Paramètre d'optimisation pour un dataset Jpeg
+// keep images in ram e.g re-read and decompress each time
+// Enabling this will lead to a large amount of ram used while economizing not that
+// much computing power
+// Note: 50States10K dataset is 90Go once decompressed, use with caution
+//#define STORE_IMAGES_TO_RAM
+
+//* Optimisation de libération de la mémoire pour de larges réseaux
+// En utilisant CUDA, de larges réseaux créés dans src/common/memory_management.cu
+// peuvent prendre jusqu'à plusieurs heures pour être libérés
+// Une optimisation consiste alors à considérer que seul le réseau est dans cet emplacement de mémoire.
+// La libération d'un réseau entraîne alors la libération de toute la mémoire, ce qui peut poser problème
+// dans certaines situations.
+#define FREE_ALL_OPT
+
 #endif
--- a/src/common/memory_management.c
+++ b/src/common/memory_management.c
@ -69,6 +69,7 @@ void free_all_memory() {
        tail = NULL;
    #endif

+    memory = NULL;
    pthread_mutex_unlock(&memory_lock);
 }

--- a/src/common/memory_management.cu
+++ b/src/common/memory_management.cu
@ -69,6 +69,7 @@ void free_all_memory() {
        tail = NULL;
    #endif

+    memory = NULL;
    pthread_mutex_unlock(&memory_lock);
 }
Author	SHA1	Message	Date
augustin64	94c14cedba	cnn/config: add FREE_ALL_OPT	2023-05-20 22:45:40 +02:00
augustin64	5d306f39ee	backpropagation: fix misaligned addresses	2023-05-20 20:15:36 +02:00
augustin64	bed3d3123e	backpropagation: Fix padding issues This issue can cause a segfault since 4633329	2023-05-20 20:04:22 +02:00