Mirror of https://github.com/augustin64/projet-tipe, synced 2025-02-02 19:39:39 +01:00
Compare commits
3 Commits
2f333bfc1d ... 0fb23c9b15

Commits:
- 0fb23c9b15
- 3d7b641965
- 37ba3a5976
@@ -416,7 +416,7 @@ __global__ void backward_linearisation_kernel_2(Kernel_nn* ker, float*** input,
     if (idx >= input_depth || idy >= input_width || idz >= input_width) {
         return;
     }
-    int id = idx*input_width*input_width + idy*input_width + idz;
+    int id = (idx*input_width+idy)*input_width + idz;
 
     float tmp=0;
     for (int j=0; j < size_output; j++) {
@@ -498,7 +498,7 @@ void backward_linearisation(Kernel_nn* ker, float*** input, float*** input_z, fl
 * Backward convolution
 */
 #ifdef __CUDACC__
-__global__ void backward_convolution_dbias_kernel(Kernel_cnn* ker, float*** output, int output_depth, int output_width) {
+__global__ void backward_convolution_dbias_kernel(float*** d_bias, float*** output, int output_depth, int output_width) {
     int idx = threadIdx.x + blockDim.x*blockIdx.x;
     int idy = threadIdx.y + blockDim.y*blockIdx.y;
     int idz = threadIdx.z + blockDim.z*blockIdx.z;
@@ -506,10 +506,10 @@ __global__ void backward_convolution_dbias_kernel(Kernel_cnn* ker, float*** outp
     if (idx >= output_depth || idy >= output_width || idz >= output_width) {
         return;
     }
-    ker->d_bias[idx][idy][idz] += output[idx][idy][idz];
+    d_bias[idx][idy][idz] += output[idx][idy][idz];
 }
 
-__global__ void backward_convolution_dweight_kernel(Kernel_cnn* ker, float*** input, float*** output, int input_depth, int output_depth, int output_width, int kernel_size) {
+__global__ void backward_convolution_dweight_kernel(float**** d_weights, float*** input, float*** output, int input_depth, int output_depth, int output_width, int kernel_size) {
     int idx = threadIdx.x + blockDim.x*blockIdx.x;
     int idy = threadIdx.y + blockDim.y*blockIdx.y;
     int idz = threadIdx.z + blockDim.z*blockIdx.z;
@@ -527,10 +527,10 @@ __global__ void backward_convolution_dweight_kernel(Kernel_cnn* ker, float*** in
             tmp += input[idx][l+idz1][m+idz2]*output[idy][l][m];
         }
     }
-    ker->d_weights[idx][idy][idz1][idz2] += tmp;
+    d_weights[idx][idy][idz1][idz2] += tmp;
 }
 
-__global__ void backward_convolution_propagate_kernel(Kernel_cnn* ker, float*** input, float*** input_z, float*** output, int input_depth, int input_width, int output_depth, int k_size, funcPtr d_f) {
+__global__ void backward_convolution_propagate_kernel(float**** weights, float*** input, float*** input_z, float*** output, int input_depth, int input_width, int output_depth, int k_size, funcPtr d_f) {
     int idx = threadIdx.x + blockDim.x*blockIdx.x;
     int idy = threadIdx.y + blockDim.y*blockIdx.y;
     int idz = threadIdx.z + blockDim.z*blockIdx.z;
@@ -548,7 +548,7 @@ __global__ void backward_convolution_propagate_kernel(Kernel_cnn* ker, float***
         max_n = min(k_size, input_width-idz);
         for (int m=min_m; m < max_m; m++) {
             for (int n=min_n; n < max_n; n++) {
-                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*ker->weights[idx][l][m][n];
+                tmp += output[l][idy-k_size+m+1][idz-k_size+n+1]*weights[idx][l][m][n];
             }
         }
     }
@@ -560,7 +560,7 @@ void backward_convolution_device(Kernel_cnn* kernel, float*** input, float*** in
     dim3 gridSize1(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_y));
     dim3 blockSize1(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
 
-    backward_convolution_dbias_kernel<<<gridSize1, blockSize1>>>(kernel, output, output_depth, output_width);
+    backward_convolution_dbias_kernel<<<gridSize1, blockSize1>>>(kernel->d_bias, output, output_depth, output_width);
 
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );
@@ -568,7 +568,7 @@ void backward_convolution_device(Kernel_cnn* kernel, float*** input, float*** in
     dim3 gridSize2(i_div_up(input_depth, BLOCKSIZE_x), i_div_up(output_depth, BLOCKSIZE_y), i_div_up(kernel_size*kernel_size, BLOCKSIZE_y));
     dim3 blockSize2(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
 
-    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(kernel, input, output, input_depth, output_depth, output_width, kernel_size);
+    backward_convolution_dweight_kernel<<<gridSize2, blockSize2>>>(kernel->d_weights, input, output, input_depth, output_depth, output_width, kernel_size);
 
     gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
@@ -580,7 +580,7 @@ void backward_convolution_device(Kernel_cnn* kernel, float*** input, float*** in
 
     funcPtr d_function = get_activation_function_cuda(activation);
 
-    backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(kernel, input, input_z, output, input_depth, input_width, output_depth, kernel_size, d_function);
+    backward_convolution_propagate_kernel<<<gridSize3, blockSize3>>>(kernel->weights, input, input_z, output, input_depth, input_width, output_depth, kernel_size, d_function);
 
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );
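The first hunk above only rewrites the flattened index into an algebraically identical form; every other hunk follows one pattern: the __global__ kernels stop receiving a Kernel_cnn* and instead take the specific arrays they touch (d_bias, d_weights, weights), while backward_convolution_device passes the corresponding struct members at launch time. Below is a minimal sketch of that pattern, with a hypothetical Layer struct and a flat float* bias standing in for the project's float*** tensors, assuming the member pointers refer to device memory:

    // Hypothetical, simplified illustration (not the project's real structs):
    // the struct lives in host memory, only its members point to device memory,
    // so the kernel receives the members, never the struct pointer itself.
    #include <cuda_runtime.h>

    struct Layer {        // host-side struct, stand-in for Kernel_cnn
        float* d_bias;    // device pointer stored in the host struct
        int n;
    };

    __global__ void add_bias(float* d_bias, const float* grad, int n) {
        int i = threadIdx.x + blockDim.x*blockIdx.x;
        if (i >= n) {
            return;
        }
        d_bias[i] += grad[i];   // same update shape as backward_convolution_dbias_kernel
    }

    void add_bias_device(Layer* layer, const float* grad) {
        // Passing `layer` would hand the kernel a host address; passing the member works.
        add_bias<<<1, 256>>>(layer->d_bias, grad, layer->n);
        cudaDeviceSynchronize();
    }

The same reasoning explains why backward_convolution_device may still dereference kernel->d_bias itself: that dereference happens on the host, and only the resulting device pointer crosses into the kernel launch.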
@@ -42,27 +42,27 @@ void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, i
 
 #ifdef __CUDACC__
 
-__global__ void make_convolution_kernel(Kernel_cnn* kernel, float*** input, float*** output, int output_width, int stride, int padding) {
+__global__ void make_convolution_kernel(float**** weights, float*** bias, int k_size, int rows, int columns, float*** input, float*** output, int output_width, int stride, int padding) {
     // Respective equivalents of i, j and k in the loop performed by the CPU
     int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns
     int idy = threadIdx.y + blockDim.y*blockIdx.y; // < min(output_width, k_size)
     int idz = threadIdx.z + blockDim.z*blockIdx.z; // < min(output_width, k_size)
-    int max_move = kernel->k_size - padding;
-    int input_width = output_width*stride - 2*padding + kernel->k_size - stride;
+    int max_move = k_size - padding;
+    int input_width = output_width*stride - 2*padding + k_size - stride;
 
-    if (idx >= kernel->columns || idy >= output_width || idz >= output_width) {
+    if (idx >= columns || idy >= output_width || idz >= output_width) {
         return;
     }
 
-    float f = kernel->bias[idx][idy][idz];
+    float f = bias[idx][idy][idz];
 
-    for (int a=0; a < kernel->rows; a++) {
+    for (int a=0; a < rows; a++) {
         for (int b=-padding; b < max_move; b++) {
             for (int c=-padding; c < max_move; c++) {
                 int idy_2 = idy*stride+b;
                 int idz_2 = idz*stride+c;
                 if (not_outside(idy_2, idz_2, 0, input_width)) {
-                    f += kernel->weights[a][idx][b][c]*input[a][idy_2][idz_2];
+                    f += weights[a][idx][b][c]*input[a][idy_2][idz_2];
                 }
             }
         }
@@ -76,7 +76,9 @@ void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output
     dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_width, BLOCKSIZE_y), i_div_up(output_width, BLOCKSIZE_z));
     dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
 
-    make_convolution_kernel<<<gridSize, blockSize>>>(kernel, input, output, output_width, stride, padding);
+    // We can't pass `kernel` directly to the CUDA kernel function
+    // as it will create a 'misaligned address' error
+    make_convolution_kernel<<<gridSize, blockSize>>>(kernel->weights, kernel->bias, kernel->k_size, kernel->rows, kernel->columns, input, output, output_width, stride, padding);
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );
 }
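As an aside on the line `input_width = output_width*stride - 2*padding + k_size - stride`: assuming the sizes divide evenly, this is just the usual convolution size relation output_width = (input_width + 2*padding - k_size)/stride + 1 solved for the input side, i.e. input_width = (output_width - 1)*stride + k_size - 2*padding, which expands to the expression kept in the kernel (now written with the plain k_size parameter instead of kernel->k_size).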
@@ -10,7 +10,7 @@ void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, i
 /*
 * Convolution kernel on the graphics card
 */
-__global__ void make_convolution_kernel(int k_size, int columns, int rows, float* bias, size_t pitch_bias, float**** weights, size_t pitch_weights, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_width, int stride, int padding);
+__global__ void make_convolution_kernel(float**** weights, float*** bias, int k_size, int rows, int columns, float*** input, float*** output, int output_width, int stride, int padding);
 
 /*
 * Performs the convolution naively on the graphics card
@@ -257,7 +257,7 @@ __global__ void make_dense_linearized_kernel(float** weights, float* bias, float
     for (int i=0; i < input_depth; i++) {
         for (int j=0; j < input_width; j++) {
             for (int k=0; k < input_width; k++) {
-                f += input[i][j][k]*weights[k + j*input_width + i*input_depth][idx];
+                f += input[i][j][k]*weights[k + (i*input_width+j)*input_width][idx];
             }
         }
     }
@@ -285,7 +285,7 @@ void make_dense_linearized_cpu(Kernel_nn* kernel, float*** input, float* output,
     for (int i=0; i < input_depth; i++) {
         for (int j=0; j < input_width; j++) {
             for (int k=0; k < input_width; k++) {
-                f += input[i][j][k]*kernel->weights[k + j*input_width + i*input_depth][l];
+                f += input[i][j][k]*kernel->weights[k + (i*input_width+j)*input_width][l];
             }
         }
     }
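These two hunks look like a genuine index fix rather than a pure refactor: for an element (i, j, k) of a tensor of shape (input_depth, input_width, input_width), row-major flattening gives (i*input_width + j)*input_width + k, as in the new code, whereas the old expression k + j*input_width + i*input_depth scales i by the wrong factor and only agrees with it when input_depth happens to equal input_width squared. A small check with hypothetical sizes:

    #include <assert.h>

    int main(void) {
        // Hypothetical sizes, only to exhibit the collision fixed above.
        int input_depth = 2, input_width = 3;

        // Element (i=1, j=0, k=0):
        int old_id = 0 + 0*input_width + 1*input_depth;      // = 2
        int new_id = 0 + (1*input_width + 0)*input_width;    // = 9

        // Old formula: (1,0,0) and (0,0,2) both map to index 2.
        assert(old_id == 2 + 0*input_width + 0*input_depth);
        // New formula: each (i,j,k) gets a distinct index in [0, depth*width*width).
        assert(new_id == 9 && new_id < input_depth*input_width*input_width);
        return 0;
    }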