Add cuda convolution

augustin64 2022-11-01 17:24:29 +01:00
parent a1dba81e17
commit 1608256e43
9 changed files with 545 additions and 35 deletions

Makefile

@@ -65,13 +65,16 @@ $(BUILDDIR)/mnist_%.o: $(MNIST_SRCDIR)/%.c $(MNIST_SRCDIR)/include/%.h
#
cnn: $(BUILDDIR)/cnn-main;
$(BUILDDIR)/cnn-main: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
$(BUILDDIR)/cnn-main: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/cnn_convolution.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
$(CC) $(CFLAGS) $^ -o $@
$(BUILDDIR)/cnn-main-cuda: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/cnn_cuda_convolution.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
$(NVCC) $(NVCCFLAGS) $^ -o $@
$(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.c $(CNN_SRCDIR)/include/%.h
$(CC) $(CFLAGS) -c $< -o $@
$(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
$(BUILDDIR)/cnn_cuda_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
ifndef NVCC_INSTALLED
@echo "nvcc not found, skipping"
else
@@ -90,7 +93,7 @@ run-tests: build-tests
$(foreach file, $(wildcard $(BUILDDIR)/test-*), $(file);)
$(foreach file, $(wildcard $(TEST_SRCDIR)/*.sh), $(file);)
build-tests: prepare-tests $(TESTS_OBJ)
build-tests: prepare-tests $(TESTS_OBJ) $(BUILDDIR)/test-cnn_matrix_multiplication $(BUILDDIR)/test-cnn_convolution
prepare-tests:
@@ -104,7 +107,7 @@ build/test-cnn_%: test/cnn_%.c $(CNN_OBJ) $(BUILDDIR)/colors.o $(BUILDDIR)/mnist
build/test-mnist_%: test/mnist_%.c $(MNIST_OBJ) $(BUILDDIR)/colors.o
$(CC) $(CFLAGS) $^ -o $@
$(BUILDDIR)/test-cnn_matrix_multiplication: test/cnn_matrix_multiplication.cu $(BUILDDIR)/cnn_matrix_multiplication.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
$(BUILDDIR)/test-cnn_%: test/cnn_%.cu $(BUILDDIR)/cnn_cuda_%.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o $(CNN_OBJ)
ifndef NVCC_INSTALLED
@echo "nvcc not found, skipping"
else
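(Side note: this split works because src/cnn/convolution.c is built with $(CC) while the identical src/cnn/convolution.cu goes through $(NVCC), which defines __CUDACC__. A minimal sketch of the mechanism, as a hypothetical standalone file that is not part of this commit:

#include <stdio.h>

int main(void) {
#ifdef __CUDACC__
    // nvcc defines __CUDACC__, so this branch ends up in cnn-main-cuda
    puts("built with nvcc: make_convolution will dispatch to the GPU");
#else
    // a plain C compiler leaves __CUDACC__ undefined
    puts("built with cc: make_convolution will stay on the CPU");
#endif
    return 0;
}

Compiled once with cc and once with nvcc, the same source prints different lines; make_convolution() in the files below selects its backend the same way.)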

src/cnn/convolution.c Normal file

@@ -0,0 +1,159 @@
/* This file is a copy of src/cnn/convolution.cu */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include "include/struct.h"
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8
#ifdef __CUDACC__
/* CUDA memcheck */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#endif
void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
// this is the kernel applied to input
// input[kernel->rows][kernel->k_size + output_dim-1][kernel->k_size + output_dim-1]
// output[kernel->columns][output_dim][output_dim]
float f;
for (int i=0; i < kernel->columns; i++) {
for (int j=0; j < output_dim; j++) {
for (int k=0; k < output_dim; k++) {
f = kernel->bias[i][j][k];
for (int a=0; a < kernel->rows; a++) {
for (int b=0; b < kernel->k_size; b++) {
for (int c=0; c < kernel->k_size; c++) {
f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
}
}
}
output[i][j][k] = f/kernel->k_size; // Average
}
}
}
}
#ifdef __CUDACC__
int i_div_up(int a, int b) { // Ceiling of a/b
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim) {
// Respective equivalents of i, j and k in the CPU loop
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
int input_dim = output_dim+k_size - 1;
if (idx >= columns || idy >= output_dim || idz >= output_dim) {
return;
}
float* bias_offset;
float* w_offset;
float* input_offset;
float* output_offset;
bias_offset = (float*)((char*)bias + (idx*output_dim+idy)*pitch_bias);
float f = bias_offset[idz];
for (int a=0; a < rows; a++) {
for (int b=0; b < k_size; b++) {
for (int c=0; c < k_size; c++) {
w_offset = (float*)((char*)w + ((a*columns + idx)*k_size+b)*pitch_w);
input_offset = (float*)((char*)input + (a*input_dim + idy+b)*pitch_input);
f += w_offset[c]*input_offset[idz+c];
}
}
}
output_offset = (float*)((char*)output + (idx*output_dim+idy)*pitch_output);
output_offset[idz] = f/(k_size);
}
void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
// Copy arrays
size_t pitch_input;
size_t pitch_output;
size_t pitch_bias;
size_t pitch_weight;
float*** input_dev;
float*** output_dev;
float*** kernel_bias;
float**** kernel_weight;
int input_dim = output_dim+kernel->k_size - 1;
// Copy ***input
gpuErrchk( cudaMallocPitch((void**)&input_dev, &pitch_input, input_dim*sizeof(float), kernel->rows*input_dim));
for (int i=0; i < kernel->rows; i++) {
for (int j=0; j < input_dim; j++) {
gpuErrchk( cudaMemcpy((void*)((char*)input_dev + (i*input_dim+j)*pitch_input), (const void*)&(input[i][j][0]), input_dim*sizeof(float), cudaMemcpyHostToDevice));
}
}
// cudaMalloc ***output
gpuErrchk( cudaMallocPitch((void**)&output_dev, &pitch_output, output_dim*sizeof(float), kernel->columns*output_dim));
// Copy ***Kernel bias
gpuErrchk( cudaMallocPitch((void**)&kernel_bias, &pitch_bias, output_dim*sizeof(float), kernel->columns*output_dim));
for (int i=0; i < kernel->columns; i++) {
for (int j=0; j < output_dim; j++) {
gpuErrchk( cudaMemcpy((void*)((char*)kernel_bias + (i*output_dim+j)*pitch_bias), (const void*)&(kernel->bias[i][j][0]), output_dim*sizeof(float), cudaMemcpyHostToDevice));
}
}
// Copy ****Kernel weights
gpuErrchk( cudaMallocPitch((void**)&kernel_weight, &pitch_weight, kernel->k_size*sizeof(float), (kernel->rows*kernel->columns*kernel->k_size)));
for (int i=0; i < kernel->rows; i++) {
for (int j=0; j < kernel->columns; j++) {
for (int k=0; k < kernel->k_size; k++) {
gpuErrchk( cudaMemcpy((void*)((char*)kernel_weight + ((i*kernel->columns+j)*kernel->k_size+k)*pitch_weight), (const void*)&(kernel->w[i][j][k][0]), kernel->k_size*sizeof(float), cudaMemcpyHostToDevice));
}
}
}
// Run the computation
dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
make_convolution_kernel<<<gridSize, blockSize>>>(kernel->k_size, kernel->columns, kernel->rows, kernel_bias, pitch_bias, kernel_weight, pitch_weight, input_dev, pitch_input, output_dev, pitch_output, output_dim);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
// Copy output back
for (int i=0; i < kernel->columns; i++) {
for (int j=0; j < output_dim; j++) {
gpuErrchk( cudaMemcpy((void*)&(output[i][j][0]), (const void*)((char*)output_dev + (i*output_dim+j)*pitch_output), output_dim*sizeof(float), cudaMemcpyDeviceToHost));
}
}
// Free all the allocated memory
gpuErrchk( cudaFree(input_dev) );
gpuErrchk( cudaFree(output_dev) );
gpuErrchk( cudaFree(kernel_bias) );
gpuErrchk( cudaFree(kernel_weight) );
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
#ifndef __CUDACC__
make_convolution_cpu(kernel, input, output, output_dim);
#else
make_convolution_device(kernel, input, output, output_dim);
#endif
}
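To make the arithmetic of the CPU loop concrete, here is a self-contained rehearsal for a single input channel (rows = columns = 1), k_size = 2 and output_dim = 2; the plain arrays stand in for the Kernel_cnn fields and the values are illustrative:

#include <stdio.h>

int main(void) {
    const int k_size = 2, output_dim = 2; // input_dim = output_dim + k_size - 1 = 3
    float input[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    float w[2][2] = {{1, 0}, {0, 1}};
    float bias = 0.5f;
    for (int j = 0; j < output_dim; j++) {
        for (int k = 0; k < output_dim; k++) {
            float f = bias;
            for (int b = 0; b < k_size; b++)
                for (int c = 0; c < k_size; c++)
                    f += w[b][c] * input[j + b][k + c];
            printf("output[%d][%d] = %.2f\n", j, k, f / k_size); // same final division as above
        }
    }
    return 0;
}

For output[0][0] this gives (0.5 + 1*1 + 1*5)/2 = 3.25, matching what make_convolution_cpu computes on the same data.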

src/cnn/convolution.cu Normal file

@@ -0,0 +1,157 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include "include/struct.h"
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8
#ifdef __CUDACC__
/* CUDA memcheck */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#endif
void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
// this is the kernel applied to input
// input[kernel->rows][kernel->k_size + output_dim-1][kernel->k_size + output_dim-1]
// output[kernel->columns][output_dim][output_dim]
float f;
for (int i=0; i < kernel->columns; i++) {
for (int j=0; j < output_dim; j++) {
for (int k=0; k < output_dim; k++) {
f = kernel->bias[i][j][k];
for (int a=0; a < kernel->rows; a++) {
for (int b=0; b < kernel->k_size; b++) {
for (int c=0; c < kernel->k_size; c++) {
f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
}
}
}
output[i][j][k] = f/kernel->k_size; // Average
}
}
}
}
#ifdef __CUDACC__
int i_div_up(int a, int b) { // Ceiling of a/b
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim) {
// Respective equivalents of i, j and k in the CPU loop
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
int input_dim = output_dim+k_size - 1;
if (idx >= columns || idy >= output_dim || idz >= output_dim) {
return;
}
float* bias_offset;
float* w_offset;
float* input_offset;
float* output_offset;
bias_offset = (float*)((char*)bias + (idx*output_dim+idy)*pitch_bias);
float f = bias_offset[idz];
for (int a=0; a < rows; a++) {
for (int b=0; b < k_size; b++) {
for (int c=0; c < k_size; c++) {
w_offset = (float*)((char*)w + ((a*columns + idx)*k_size+b)*pitch_w);
input_offset = (float*)((char*)input + (a*input_dim + idy+b)*pitch_input);
f += w_offset[c]*input_offset[idz+c];
}
}
}
output_offset = (float*)((char*)output + (idx*output_dim+idy)*pitch_output);
output_offset[idz] = f/(k_size);
}
void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
// Copy arrays
size_t pitch_input;
size_t pitch_output;
size_t pitch_bias;
size_t pitch_weight;
float*** input_dev;
float*** output_dev;
float*** kernel_bias;
float**** kernel_weight;
int input_dim = output_dim+kernel->k_size - 1;
// Copy ***input
gpuErrchk( cudaMallocPitch((void**)&input_dev, &pitch_input, input_dim*sizeof(float), kernel->rows*input_dim));
for (int i=0; i < kernel->rows; i++) {
for (int j=0; j < input_dim; j++) {
gpuErrchk( cudaMemcpy((void*)((char*)input_dev + (i*input_dim+j)*pitch_input), (const void*)&(input[i][j][0]), input_dim*sizeof(float), cudaMemcpyHostToDevice));
}
}
// cudaMalloc ***output
gpuErrchk( cudaMallocPitch((void**)&output_dev, &pitch_output, output_dim*sizeof(float), kernel->columns*output_dim));
// Copy ***Kernel bias
gpuErrchk( cudaMallocPitch((void**)&kernel_bias, &pitch_bias, output_dim*sizeof(float), kernel->columns*output_dim));
for (int i=0; i < kernel->columns; i++) {
for (int j=0; j < output_dim; j++) {
gpuErrchk( cudaMemcpy((void*)((char*)kernel_bias + (i*output_dim+j)*pitch_bias), (const void*)&(kernel->bias[i][j][0]), output_dim*sizeof(float), cudaMemcpyHostToDevice));
}
}
// Copy ****Kernel weights
gpuErrchk( cudaMallocPitch((void**)&kernel_weight, &pitch_weight, kernel->k_size*sizeof(float), (kernel->rows*kernel->columns*kernel->k_size)));
for (int i=0; i < kernel->rows; i++) {
for (int j=0; j < kernel->columns; j++) {
for (int k=0; k < kernel->k_size; k++) {
gpuErrchk( cudaMemcpy((void*)((char*)kernel_weight + ((i*kernel->columns+j)*kernel->k_size+k)*pitch_weight), (const void*)&(kernel->w[i][j][k][0]), kernel->k_size*sizeof(float), cudaMemcpyHostToDevice));
}
}
}
// Run the computation
dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
make_convolution_kernel<<<gridSize, blockSize>>>(kernel->k_size, kernel->columns, kernel->rows, kernel_bias, pitch_bias, kernel_weight, pitch_weight, input_dev, pitch_input, output_dev, pitch_output, output_dim);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
// Copy output back
for (int i=0; i < kernel->columns; i++) {
for (int j=0; j < output_dim; j++) {
gpuErrchk( cudaMemcpy((void*)&(output[i][j][0]), (const void*)((char*)output_dev + (i*output_dim+j)*pitch_output), output_dim*sizeof(float), cudaMemcpyDeviceToHost));
}
}
// Free all the allocated memory
gpuErrchk( cudaFree(input_dev) );
gpuErrchk( cudaFree(output_dev) );
gpuErrchk( cudaFree(kernel_bias) );
gpuErrchk( cudaFree(kernel_weight) );
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
#endif
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
#ifndef __CUDACC__
make_convolution_cpu(kernel, input, output, output_dim);
#else
make_convolution_device(kernel, input, output, output_dim);
#endif
}
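All the pointer arithmetic in make_convolution_kernel and make_convolution_device follows the cudaMallocPitch convention: a conceptually 2D array of height rows is stored so that row r begins at (char*)base + r * pitch. A minimal, self-contained sketch of that pattern (illustrative sizes, error checking omitted; this is not code from this commit):

#include <stdio.h>

__global__ void fill_rows(float* base, size_t pitch, int width, int height) {
    int r = blockIdx.x;
    if (r >= height) return;
    float* row = (float*)((char*)base + r * pitch); // same offset formula as above
    for (int c = 0; c < width; c++) row[c] = r * 100.0f + c;
}

int main(void) {
    const int width = 4, height = 3;
    float* dev; size_t pitch;
    cudaMallocPitch((void**)&dev, &pitch, width * sizeof(float), height);
    fill_rows<<<height, 1>>>(dev, pitch, width, height);
    cudaDeviceSynchronize();
    float host[height][width];
    cudaMemcpy2D(host, width * sizeof(float), dev, pitch,
                 width * sizeof(float), height, cudaMemcpyDeviceToHost);
    for (int r = 0; r < height; r++)
        printf("%.0f %.0f %.0f %.0f\n", host[r][0], host[r][1], host[r][2], host[r][3]);
    cudaFree(dev);
    return 0;
}

The code above extends this 2D scheme to 3D and 4D arrays by folding the leading dimensions into the row index, e.g. (idx*output_dim+idy) for the bias and ((a*columns + idx)*k_size+b) for the weights.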

src/cnn/include/convolution.h Normal file

@@ -0,0 +1,28 @@
#include "struct.h"
/*
* Performs the convolution on the CPU
*/
void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
#ifdef __CUDACC__
/*
* Ceiling of a/b
*/
int i_div_up(int a, int b);
/*
* GPU kernel performing the convolution
*/
__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim);
/*
* Performs the convolution on the GPU
*/
void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
#endif
/*
* Decides at compile time whether the convolution can run on the GPU
*/
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
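For a sense of the launch geometry these declarations imply, here is a CPU-side rehearsal of i_div_up with illustrative sizes (standalone sketch; the BLOCKSIZE_* values are copied from convolution.cu):

#include <stdio.h>

static int i_div_up(int a, int b) { // ceiling of a/b, as declared above
    return (a % b != 0) ? a / b + 1 : a / b;
}

int main(void) {
    const int BLOCKSIZE_x = 16, BLOCKSIZE_y = 8, BLOCKSIZE_z = 8;
    int columns = 40, output_dim = 25;          // illustrative sizes
    int gx = i_div_up(columns, BLOCKSIZE_x);    // ceil(40/16) = 3
    int gy = i_div_up(output_dim, BLOCKSIZE_y); // ceil(25/8)  = 4
    int gz = i_div_up(output_dim, BLOCKSIZE_z); // ceil(25/8)  = 4
    printf("grid (%d, %d, %d): %d blocks of %d threads\n",
           gx, gy, gz, gx * gy * gz, BLOCKSIZE_x * BLOCKSIZE_y * BLOCKSIZE_z);
    return 0;
}

Threads whose indices overshoot columns or output_dim simply return through the guard at the top of make_convolution_kernel.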

src/cnn/include/make.h

@@ -4,22 +4,26 @@
#define DEF_MAKE_H
/*
* Performs a convolution without stride
* Performs a convolution without stride on the CPU
*/
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
/*
* Preforms an average pooling with stride=size
* Performs the convolution on the CPU or GPU
*/
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
/*
* Performs an average pooling with stride=size
*/
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim);
/*
* Preforms a full connection
* Performs a full connection
*/
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output);
/*
* Preforms a full connection that goes from a matrix to a vector
* Performs a full connection that goes from a matrix to a vector
*/
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output);
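Since make_convolution is now the single entry point the rest of the CNN code calls, the backend choice is invisible at call sites. A hedged sketch of a call site (dimensions illustrative; kernel, input and output are assumed to be fully initialised, as built in test/cnn_convolution.cu below):

int output_dim = 26; // e.g. a 28x28 input with a 3x3 kernel
// input:  [kernel->rows][output_dim + kernel->k_size - 1][output_dim + kernel->k_size - 1]
// output: [kernel->columns][output_dim][output_dim]
make_convolution(kernel, input, output, output_dim);
// Built with cc this runs make_convolution_cpu; built with nvcc, make_convolution_device.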

src/cnn/make.c

@@ -1,30 +1,9 @@
#include <stdio.h>
#include "../include/colors.h"
#include "include/convolution.h"
#include "include/make.h"
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
// this is the kernel applied to input
// input[kernel->rows][kernel->k_size + output_dim-1][kernel->k_size + output_dim-1]
// output[kernel->columns][output_dim][output_dim]
float f;
int n = kernel->k_size;
for (int i=0; i < kernel->columns; i++) {
for (int j=0; j < output_dim; j++) {
for (int k=0; k < output_dim; k++) {
f = kernel->bias[i][j][k];
for (int a=0; a < kernel->rows; a++) {
for (int b=0; b < n; b++) {
for (int c=0; c < n; c++) {
f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
}
}
}
output[i][j][k] = f/n; // Average
}
}
}
}
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
// input[output_depth][output_dim+size-1][output_dim+size-1]

src/cnn/matrix_multiplication.cu

@@ -41,7 +41,7 @@ __global__ void matrix_mul_kernel(float* Md, float* Nd, float* Pd, int n, int p,
for (int i = 0; i < n; i++) {
M_offset = (float *)((char*)Md + i * pitch_m);
P_offset = (float*)((char*)Pd + i * pitch_p); // P[i]; pitch_p is an offset in bytes
atomicAdd(&P_offset[ty], M_offset[tx] * Nxy); // P[i][ty] += P[i][tx] * N[tx][ty]
atomicAdd(&P_offset[ty], M_offset[tx] * Nxy); // P[i][ty] += M[i][tx] * N[tx][ty]
}
}

test/cnn_convolution.cu Normal file

@@ -0,0 +1,180 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include "../src/cnn/include/make.h"
#include "../src/cnn/include/convolution.h"
#include "../src/cnn/include/struct.h"
float random_float(float low, float high) {
float t = (float)rand() / (float)RAND_MAX;
return (1.0f - t) * low + t * high;
}
void fill_matrix_random(float ***matrix, int n, int p, int q, float max_val) {
for (int i=0; i < n; i++) {
for (int j=0; j < p; j++) {
for (int k=0; k < q; k++) {
matrix[i][j][k] = random_float(0.0f, max_val);
}
}
}
}
void print_matrix(float** mat, int n, int p) {
for (int i=0; i < n; i++) {
printf("[\t");
for (int j=0; j < p; j++) {
printf("%0.1f\t", mat[i][j]);
}
printf("]\n");
}
}
float*** create_matrix(int n, int p, int q, float max_val) {
float*** matrix = (float***)malloc(n*sizeof(float**));
for (int i=0; i < n; i++) {
matrix[i] = (float**)malloc(sizeof(float*)*p);
for (int j=0; j < p; j++) {
matrix[i][j] = (float*)malloc(sizeof(float)*q);
}
}
fill_matrix_random(matrix, n, p, q, max_val);
return matrix;
}
float*** create_empty_matrix(int n, int p, int q) {
float*** matrix = (float***)malloc(n*sizeof(float**));
for (int i=0; i < n; i++) {
matrix[i] = (float**)malloc(sizeof(float*)*p);
for (int j=0; j < p; j++) {
matrix[i][j] = (float*)malloc(sizeof(float)*q);
for (int k=0; k < q; k++) {
matrix[i][j][k] = 0.;
}
}
}
return matrix;
}
void free_matrix(float*** matrix, int n, int p) {
for (int i=0; i < n; i++) {
for (int j=0; j < p; j++) {
free(matrix[i][j]);
}
free(matrix[i]);
}
free(matrix);
}
bool check_matrices_equality(float*** m1, float*** m2, int n, int p, int q, int acceptation) {
for (int i=0; i < n; i++) {
for (int j=0; j < p; j++) {
for (int k=0; k < q; k++) {
if (fabs(m1[i][j][k] - m2[i][j][k]) > 0.01*acceptation) {
printf("diff %d %d %d: %f val: %f et %f\n", i, j, k, fabs(m1[i][j][k] - m2[i][j][k]), m1[i][j][k], m2[i][j][k]);
return false;
}
}
}
}
return true;
}
void run_convolution_test(int input_dim, int output_dim, int rows, int columns) {
assert(input_dim >= output_dim);
int k_size = input_dim - output_dim +1;
// Generate random data
Kernel_cnn* kernel = (Kernel_cnn*)malloc(sizeof(Kernel_cnn));
kernel->k_size = k_size;
kernel->rows = rows;
kernel->columns = columns;
// bias[kernel->columns][dim_output][dim_output]
kernel->bias = create_matrix(kernel->columns, output_dim, output_dim, 15.0f);
kernel->d_bias = create_matrix(kernel->columns, output_dim, output_dim, 1.5f);
kernel->last_d_bias = create_matrix(kernel->columns, output_dim, output_dim, 0.1f);
// w[rows][columns][k_size][k_size]
kernel->w = (float****)malloc(sizeof(float***)*kernel->rows);
kernel->d_w = (float****)malloc(sizeof(float***)*kernel->rows);
kernel->last_d_w = (float****)malloc(sizeof(float***)*kernel->rows);
for (int i=0; i < kernel->rows; i++) {
kernel->w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 15.0f);
kernel->d_w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 1.5f);
kernel->last_d_w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 0.1f);
}
float*** input = create_matrix(kernel->rows, input_dim, input_dim, 5.0f);
float*** output_cpu = create_empty_matrix(kernel->columns, output_dim, output_dim);
float*** output_gpu = create_empty_matrix(kernel->columns, output_dim, output_dim);
printf("(%d, %d, %d, %d) Data generation complete\n", rows, columns, input_dim, output_dim);
// Run the computations
clock_t start, end;
double cpu_time_used, gpu_time_used;
start = clock();
make_convolution_device(kernel, input, output_gpu, output_dim);
end = clock();
gpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("(%d, %d, %d, %d) Time used for GPU: %lf seconds\n", rows, columns, input_dim, output_dim, gpu_time_used);
start = clock();
make_convolution_cpu(kernel, input, output_cpu, output_dim);
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("(%d, %d, %d, %d) Time used for CPU: %lf seconds\n", rows, columns, input_dim, output_dim, cpu_time_used);
// Check that the matrices are equal
printf("(%d, %d, %d, %d) Checking equality.\n", rows, columns, input_dim, output_dim);
if (!check_matrices_equality(output_gpu, output_cpu, kernel->columns, output_dim, output_dim, kernel->k_size)) { // TODO: change acceptation
exit(1);
}
printf("OK\n");
free_matrix(kernel->bias, kernel->columns, output_dim);
free_matrix(kernel->d_bias, kernel->columns, output_dim);
free_matrix(kernel->last_d_bias, kernel->columns, output_dim);
for (int i=0; i < kernel->rows; i++) {
free_matrix(kernel->w[i], kernel->columns, kernel->k_size);
free_matrix(kernel->d_w[i], kernel->columns, kernel->k_size);
free_matrix(kernel->last_d_w[i], kernel->columns, kernel->k_size);
}
free(kernel->w);
free(kernel->d_w);
free(kernel->last_d_w);
free_matrix(input, kernel->rows, input_dim);
free_matrix(output_cpu, kernel->columns, output_dim);
free_matrix(output_gpu, kernel->columns, output_dim);
}
int main() {
srand(time(NULL));
run_convolution_test(20, 15, 30, 40);
run_convolution_test(30, 25, 40, 50);
run_convolution_test(200, 10, 40, 50);
return 0;
}
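One caveat on the measurements above: clock() reports CPU time, so for the GPU run it mostly measures the host-side copies and the cudaDeviceSynchronize() wait rather than kernel execution alone. A hedged fragment (reusing kernel, input, output_gpu and output_dim from run_convolution_test; not part of this commit) that would time the device call with CUDA events instead:

cudaEvent_t start_ev, stop_ev;
cudaEventCreate(&start_ev);
cudaEventCreate(&stop_ev);
cudaEventRecord(start_ev);
make_convolution_device(kernel, input, output_gpu, output_dim);
cudaEventRecord(stop_ev);
cudaEventSynchronize(stop_ev); // wait for the device work to finish
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start_ev, stop_ev);
printf("GPU time: %f ms\n", elapsed_ms);
cudaEventDestroy(start_ev);
cudaEventDestroy(stop_ev);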

test/cnn_matrix_multiplication.cu

@@ -69,7 +69,7 @@ bool check_matrices_equality(float** m1, float** m2, int n, int p, int acceptati
void run_matrices_test(int n, int p, int q) {
clock_t start, end;
double cpu_time_used;
double cpu_time_used, gpu_time_used;
float** matrix1 = create_matrix(n, p);
float** matrix2 = create_matrix(p, q);
@@ -90,8 +90,8 @@ void run_matrices_test(int n, int p, int q) {
matrix_multiplication_host(matrix1, matrix2, result_cpu, n, p, q);
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("(%d,%d)x(%d,%d) Time used for CPU: %lf seconds\n", n, p, p, q, cpu_time_used);
gpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("(%d,%d)x(%d,%d) Time used for CPU: %lf seconds\n", n, p, p, q, gpu_time_used);
printf("OK\n");
// Check that the matrices are equal