diff --git a/src/cnn/convolution.c b/src/cnn/convolution.c index 0589462..c280b71 100644 --- a/src/cnn/convolution.c +++ b/src/cnn/convolution.c @@ -42,27 +42,27 @@ void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, i #ifdef __CUDACC__ -__global__ void make_convolution_kernel(Kernel_cnn* kernel, float*** input, float*** output, int output_width, int stride, int padding) { +__global__ void make_convolution_kernel(float**** weights, float*** bias, int k_size, int rows, int columns, float*** input, float*** output, int output_width, int stride, int padding) { // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns int idy = threadIdx.y + blockDim.y*blockIdx.y; // < min(output_width, k_size) int idz = threadIdx.z + blockDim.z*blockIdx.z; // < min(output_width, k_size) - int max_move = kernel->k_size - padding; - int input_width = output_width*stride - 2*padding + kernel->k_size - stride; + int max_move = k_size - padding; + int input_width = output_width*stride - 2*padding + k_size - stride; - if (idx >= kernel->columns || idy >= output_width || idz >= output_width) { + if (idx >= columns || idy >= output_width || idz >= output_width) { return; } - float f = kernel->bias[idx][idy][idz]; + float f = bias[idx][idy][idz]; - for (int a=0; a < kernel->rows; a++) { + for (int a=0; a < rows; a++) { for (int b=-padding; b < max_move; b++) { for (int c=-padding; c < max_move; c++) { int idy_2 = idy*stride+b; int idz_2 = idz*stride+c; if (not_outside(idy_2, idz_2, 0, input_width)) { - f += kernel->weights[a][idx][b][c]*input[a][idy_2][idz_2]; + f += weights[a][idx][b][c]*input[a][idy_2][idz_2]; } } } @@ -76,7 +76,9 @@ void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z)); dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z); - make_convolution_kernel<<>>(kernel, input, output, output_width, stride, padding); + // We can't pass `kernel` directly to the CUDA kernel function + // as it will create a 'misaligned adress' error + make_convolution_kernel<<>>(kernel->weights, kernel->bias, kernel->k_size, kernel->rows, kernel->columns, input, output, output_width, stride, padding); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } diff --git a/src/cnn/convolution.cu b/src/cnn/convolution.cu index 0589462..c280b71 100644 --- a/src/cnn/convolution.cu +++ b/src/cnn/convolution.cu @@ -42,27 +42,27 @@ void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, i #ifdef __CUDACC__ -__global__ void make_convolution_kernel(Kernel_cnn* kernel, float*** input, float*** output, int output_width, int stride, int padding) { +__global__ void make_convolution_kernel(float**** weights, float*** bias, int k_size, int rows, int columns, float*** input, float*** output, int output_width, int stride, int padding) { // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns int idy = threadIdx.y + blockDim.y*blockIdx.y; // < min(output_width, k_size) int idz = threadIdx.z + blockDim.z*blockIdx.z; // < min(output_width, k_size) - int max_move = kernel->k_size - padding; - int input_width = output_width*stride - 2*padding + kernel->k_size - stride; + int max_move = k_size - padding; + int input_width = output_width*stride - 2*padding + k_size - stride; - if (idx >= kernel->columns || idy >= output_width || idz >= output_width) { + if (idx >= columns || idy >= output_width || idz >= output_width) { return; } - float f = kernel->bias[idx][idy][idz]; + float f = bias[idx][idy][idz]; - for (int a=0; a < kernel->rows; a++) { + for (int a=0; a < rows; a++) { for (int b=-padding; b < max_move; b++) { for (int c=-padding; c < max_move; c++) { int idy_2 = idy*stride+b; int idz_2 = idz*stride+c; if (not_outside(idy_2, idz_2, 0, input_width)) { - f += kernel->weights[a][idx][b][c]*input[a][idy_2][idz_2]; + f += weights[a][idx][b][c]*input[a][idy_2][idz_2]; } } } @@ -76,7 +76,9 @@ void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z)); dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z); - make_convolution_kernel<<>>(kernel, input, output, output_width, stride, padding); + // We can't pass `kernel` directly to the CUDA kernel function + // as it will create a 'misaligned adress' error + make_convolution_kernel<<>>(kernel->weights, kernel->bias, kernel->k_size, kernel->rows, kernel->columns, input, output, output_width, stride, padding); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } diff --git a/src/cnn/include/convolution.h b/src/cnn/include/convolution.h index 6919f5c..fc2c08a 100644 --- a/src/cnn/include/convolution.h +++ b/src/cnn/include/convolution.h @@ -10,7 +10,7 @@ void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, i /* * Kernel de la convolution sur carte graphique */ -__global__ void make_convolution_kernel(int k_size, int columns, int rows, float* bias, size_t pitch_bias, float**** weights, size_t pitch_weights, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_width, int stride, int padding); +__global__ void make_convolution_kernel(float**** weights, float*** bias, int k_size, int rows, int columns, float*** input, float*** output, int output_width, int stride, int padding); /* * Effectue la convolution naïvement sur la carte graphique