Full implementation of forward convolution in CUDA

This commit is contained in:
augustin64 2023-02-15 11:42:24 +01:00
parent a9262cacc1
commit 0aa6fbc5f0
3 changed files with 526 additions and 11 deletions

View File

@ -16,21 +16,33 @@ void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int o
/* /*
* Effectue un average pooling avec stride=size * Effectue un average pooling avec stride=size
*/ */
#ifdef __CUDACC__
extern "C"
#endif
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim); void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim);
/* /*
* Effectue un max pooling avec stride=size * Effectue un max pooling avec stride=size
*/ */
#ifdef __CUDACC__
extern "C"
#endif
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim); void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim);
/* /*
* Effectue une full connection * Effectue une full connection
*/ */
#ifdef __CUDACC__
extern "C"
#endif
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output); void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output);
/* /*
* Effectue une full connection qui passe d'une matrice à un vecteur * Effectue une full connection qui passe d'une matrice à un vecteur
*/ */
#ifdef __CUDACC__
extern "C"
#endif
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output); void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output);
#endif #endif

View File

@ -1,39 +1,140 @@
#include <stdio.h> #include <stdio.h>
#include <float.h> #include <float.h>
#include "../include/colors.h"
#include "include/convolution.h" #include "include/convolution.h"
#include "../include/colors.h"
#include "../include/utils.h"
#include "include/make.h" #include "include/make.h"
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8
float max_flt(float a, float b) { float max_flt(float a, float b) {
// Return the max between the two floats // Return the max between the two floats
if (a>b) { if (a > b) {
return a; return a;
} }
return b; return b;
} }
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
/*
* Average Pooling
*/
#ifdef __CUDACC__
__global__ void make_average_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
int n = size*size;
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
return;
}
float m = FLT_MIN;
float temp;
for (int a=0; a < size; a++) {
for (int b=0; b < size; b++) {
temp = input[idx][size*idy +a][size*idz +b];
m = m > temp ? m : temp; // max(m, temp)
}
}
output[idx][idy][idz] = m/(float)n;
}
void make_average_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Make computation
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
make_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_average_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
// input[output_depth][output_dim+size-1][output_dim+size-1] // input[output_depth][output_dim+size-1][output_dim+size-1]
// output[output_depth][output_dim][output_dim] // output[output_depth][output_dim][output_dim]
float sum; float m;
int n = size*size; int n = size*size;
for (int i=0; i < output_depth; i++) { for (int i=0; i < output_depth; i++) {
for (int j=0; j < output_dim; j++) { for (int j=0; j < output_dim; j++) {
for (int k=0; k < output_dim; k++) { for (int k=0; k < output_dim; k++) {
sum = 0.; m = FLT_MIN;
for (int a=0; a < size; a++) { for (int a=0; a < size; a++) {
for (int b=0; b < size; b++) { for (int b=0; b < size; b++) {
sum += input[i][size*j +a][size*k +b]; m = max_flt(m, input[i][size*j +a][size*k +b]);
} }
} }
output[i][j][k] = sum/(float)n; output[i][j][k] = m/(float)n;
} }
} }
} }
} }
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { #ifdef __CUDACC__
extern "C"
#endif
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
#ifndef __CUDACC__
make_average_pooling_cpu(input, output, size, output_depth, output_dim);
#else
make_average_pooling_device(input, output, size, output_depth, output_dim);
#endif
}
/*
* Max Pooling
*/
#ifdef __CUDACC__
__global__ void make_max_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
return;
}
float m = FLT_MIN;
float temp;
for (int a=0; a < size; a++) {
for (int b=0; b < size; b++) {
temp = input[idx][size*idy +a][size*idz +b];
m = m > temp ? m : temp; // max(m, temp)
}
}
output[idx][idy][idz] = m;
}
void make_max_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Make computation
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
make_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_max_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
// input[output_depth][output_dim+size-1][output_dim+size-1] // input[output_depth][output_dim+size-1][output_dim+size-1]
// output[output_depth][output_dim][output_dim] // output[output_depth][output_dim][output_dim]
float m; float m;
@ -52,7 +153,55 @@ void make_max_pooling(float*** input, float*** output, int size, int output_dept
} }
} }
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { #ifdef __CUDACC__
extern "C"
#endif
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
#ifndef __CUDACC__
make_max_pooling_cpu(input, output, size, output_depth, output_dim);
#else
make_max_pooling_device(input, output, size, output_depth, output_dim);
#endif
}
/*
* Dense
*/
#ifdef __CUDACC__
__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
if (idx >= size_output) {
return;
}
float f = kernel->bias[idx];
for (int j=0; j < size_input; j++) {
f += kernel->weights[j][idx]*input[j];
}
output[idx] = f;
}
void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
// Make computation
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
make_dense_kernel<<<gridSize, blockSize>>>(kernel, input, output, size_input, size_output);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
#ifdef __CUDACC__
extern "C"
#endif
void make_dense_cpu(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
// input[size_input] // input[size_input]
// output[size_output] // output[size_output]
float f; float f;
@ -66,7 +215,56 @@ void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input,
} }
} }
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { #ifdef __CUDACC__
extern "C"
#endif
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
#ifndef __CUDACC__
make_dense_cpu(kernel, input, output, size_input, size_output);
#else
make_dense_device(kernel, input, output, size_input, size_output);
#endif
}
/*
* Dense linearised
*/
#ifdef __CUDACC__
__global__ void make_dense_linearised_kernel(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
if (idx >= size_output) {
return;
}
float f = 0;
for (int i=0; i < depth_input; i++) {
for (int j=0; j < dim_input; j++) {
for (int k=0; k < dim_input; k++) {
f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][idx];
}
}
}
output[idx] = f;
}
void make_dense_linearised_device(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
// Make computation
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
make_dense_linearised_kernel<<<gridSize, blockSize>>>(kernel, input, output, depth_input, dim_input, size_output);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_dense_linearised_cpu(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
// input[depth_input][dim_input][dim_input] // input[depth_input][dim_input][dim_input]
// output[size_output] // output[size_output]
float f; float f;
@ -82,4 +280,15 @@ void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int
} }
output[l] = f; output[l] = f;
} }
} }
#ifdef __CUDACC__
extern "C"
#endif
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
#ifndef __CUDACC__
make_dense_linearised_cpu(kernel, input, output, depth_input, dim_input, size_output);
#else
make_dense_linearised_device(kernel, input, output, depth_input, dim_input, size_output);
#endif
}

294
src/cnn/make.cu Normal file
View File

@ -0,0 +1,294 @@
#include <stdio.h>
#include <float.h>
#include "include/convolution.h"
#include "../include/colors.h"
#include "../include/utils.h"
#include "include/make.h"
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8
float max_flt(float a, float b) {
// Return the max between the two floats
if (a > b) {
return a;
}
return b;
}
/*
* Average Pooling
*/
#ifdef __CUDACC__
__global__ void make_average_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
int n = size*size;
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
return;
}
float m = FLT_MIN;
float temp;
for (int a=0; a < size; a++) {
for (int b=0; b < size; b++) {
temp = input[idx][size*idy +a][size*idz +b];
m = m > temp ? m : temp; // max(m, temp)
}
}
output[idx][idy][idz] = m/(float)n;
}
void make_average_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Make computation
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
make_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_average_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
// input[output_depth][output_dim+size-1][output_dim+size-1]
// output[output_depth][output_dim][output_dim]
float m;
int n = size*size;
for (int i=0; i < output_depth; i++) {
for (int j=0; j < output_dim; j++) {
for (int k=0; k < output_dim; k++) {
m = FLT_MIN;
for (int a=0; a < size; a++) {
for (int b=0; b < size; b++) {
m = max_flt(m, input[i][size*j +a][size*k +b]);
}
}
output[i][j][k] = m/(float)n;
}
}
}
}
#ifdef __CUDACC__
extern "C"
#endif
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
#ifndef __CUDACC__
make_average_pooling_cpu(input, output, size, output_depth, output_dim);
#else
make_average_pooling_device(input, output, size, output_depth, output_dim);
#endif
}
/*
* Max Pooling
*/
#ifdef __CUDACC__
__global__ void make_max_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
return;
}
float m = FLT_MIN;
float temp;
for (int a=0; a < size; a++) {
for (int b=0; b < size; b++) {
temp = input[idx][size*idy +a][size*idz +b];
m = m > temp ? m : temp; // max(m, temp)
}
}
output[idx][idy][idz] = m;
}
void make_max_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
// Make computation
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
make_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_max_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
// input[output_depth][output_dim+size-1][output_dim+size-1]
// output[output_depth][output_dim][output_dim]
float m;
for (int i=0; i < output_depth; i++) {
for (int j=0; j < output_dim; j++) {
for (int k=0; k < output_dim; k++) {
m = FLT_MIN;
for (int a=0; a < size; a++) {
for (int b=0; b < size; b++) {
m = max_flt(m, input[i][size*j +a][size*k +b]);
}
}
output[i][j][k] = m;
}
}
}
}
#ifdef __CUDACC__
extern "C"
#endif
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
#ifndef __CUDACC__
make_max_pooling_cpu(input, output, size, output_depth, output_dim);
#else
make_max_pooling_device(input, output, size, output_depth, output_dim);
#endif
}
/*
* Dense
*/
#ifdef __CUDACC__
__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
if (idx >= size_output) {
return;
}
float f = kernel->bias[idx];
for (int j=0; j < size_input; j++) {
f += kernel->weights[j][idx]*input[j];
}
output[idx] = f;
}
void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
// Make computation
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
make_dense_kernel<<<gridSize, blockSize>>>(kernel, input, output, size_input, size_output);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
#ifdef __CUDACC__
extern "C"
#endif
void make_dense_cpu(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
// input[size_input]
// output[size_output]
float f;
for (int i=0; i < size_output; i++) {
f = kernel->bias[i];
for (int j=0; j < size_input; j++) {
f += kernel->weights[j][i]*input[j];
}
output[i] = f;
}
}
#ifdef __CUDACC__
extern "C"
#endif
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
#ifndef __CUDACC__
make_dense_cpu(kernel, input, output, size_input, size_output);
#else
make_dense_device(kernel, input, output, size_input, size_output);
#endif
}
/*
* Dense linearised
*/
#ifdef __CUDACC__
__global__ void make_dense_linearised_kernel(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
if (idx >= size_output) {
return;
}
float f = 0;
for (int i=0; i < depth_input; i++) {
for (int j=0; j < dim_input; j++) {
for (int k=0; k < dim_input; k++) {
f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][idx];
}
}
}
output[idx] = f;
}
void make_dense_linearised_device(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
// Make computation
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
make_dense_linearised_kernel<<<gridSize, blockSize>>>(kernel, input, output, depth_input, dim_input, size_output);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_dense_linearised_cpu(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
// input[depth_input][dim_input][dim_input]
// output[size_output]
float f;
for (int l=0; l < size_output; l++) {
f = 0;
for (int i=0; i < depth_input; i++) {
for (int j=0; j < dim_input; j++) {
for (int k=0; k < dim_input; k++) {
f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][l];
}
}
}
output[l] = f;
}
}
#ifdef __CUDACC__
extern "C"
#endif
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
#ifndef __CUDACC__
make_dense_linearised_cpu(kernel, input, output, depth_input, dim_input, size_output);
#else
make_dense_linearised_device(kernel, input, output, depth_input, dim_input, size_output);
#endif
}