diff --git a/Makefile b/Makefile
index e24d34d..0797d8d 100644
--- a/Makefile
+++ b/Makefile
@@ -65,13 +65,16 @@ $(BUILDDIR)/mnist_%.o: $(MNIST_SRCDIR)/%.c $(MNIST_SRCDIR)/include/%.h
 #
 cnn: $(BUILDDIR)/cnn-main;
 
-$(BUILDDIR)/cnn-main: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
+$(BUILDDIR)/cnn-main: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/cnn_convolution.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
 	$(CC) $(CFLAGS) $^ -o $@
 
+$(BUILDDIR)/cnn-main-cuda: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/cnn_cuda_convolution.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
+	$(NVCC) $(NVCCFLAGS) $^ -o $@
+
 $(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.c $(CNN_SRCDIR)/include/%.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-$(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
+$(BUILDDIR)/cnn_cuda_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
 ifndef NVCC_INSTALLED
 	@echo "nvcc not found, skipping"
 else
@@ -90,7 +93,7 @@ run-tests: build-tests
 	$(foreach file, $(wildcard $(BUILDDIR)/test-*), $(file);)
 	$(foreach file, $(wildcard $(TEST_SRCDIR)/*.sh), $(file);)
 
-build-tests: prepare-tests $(TESTS_OBJ)
+build-tests: prepare-tests $(TESTS_OBJ) $(BUILDDIR)/test-cnn_matrix_multiplication $(BUILDDIR)/test-cnn_convolution
 
 prepare-tests:
@@ -104,7 +107,7 @@ build/test-cnn_%: test/cnn_%.c $(CNN_OBJ) $(BUILDDIR)/colors.o $(BUILDDIR)/mnist
 build/test-mnist_%: test/mnist_%.c $(MNIST_OBJ) $(BUILDDIR)/colors.o
 	$(CC) $(CFLAGS) $^ -o $@
 
-$(BUILDDIR)/test-cnn_matrix_multiplication: test/cnn_matrix_multiplication.cu $(BUILDDIR)/cnn_matrix_multiplication.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
+$(BUILDDIR)/test-cnn_%: test/cnn_%.cu $(BUILDDIR)/cnn_cuda_%.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o $(CNN_OBJ)
 ifndef NVCC_INSTALLED
 	@echo "nvcc not found, skipping"
 else
diff --git a/src/cnn/convolution.c b/src/cnn/convolution.c
new file mode 100644
index 0000000..57a14e1
--- /dev/null
+++ b/src/cnn/convolution.c
@@ -0,0 +1,159 @@
+/* This file is a copy of src/cnn/convolution.cu */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "include/struct.h"
+
+#define BLOCKSIZE_x 16
+#define BLOCKSIZE_y 8
+#define BLOCKSIZE_z 8
+
+
+#ifdef __CUDACC__
+/* CUDA memcheck */
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+    if (code != cudaSuccess) {
+        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+        if (abort) exit(code);
+    }
+}
+#endif
+
+void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
+    // this is the kernel applied to input
+    // input[kernel->rows][kernel_k_size + output_dim-1][kernel_k_size + output_dim-1]
+    // output[kernel->columns][output_dim][output_dim]
+    float f;
+
+    for (int i=0; i < kernel->columns; i++) {
+        for (int j=0; j < output_dim; j++) {
+            for (int k=0; k < output_dim; k++) {
+                f = kernel->bias[i][j][k];
+                for (int a=0; a < kernel->rows; a++) {
+                    for (int b=0; b < kernel->k_size; b++) {
+                        for (int c=0; c < kernel->k_size; c++) {
+                            f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
+                        }
+                    }
+                }
+                output[i][j][k] = f/kernel->k_size; // Average
+            }
+        }
+    }
+}
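+
+/*
+ * Worked example: this is a "valid" convolution, so
+ * input_dim = output_dim + k_size - 1. With k_size = 3 and output_dim = 4,
+ * each input channel is 6x6 and every output coefficient accumulates
+ * rows * 3 * 3 = rows * 9 weighted terms before the division above.
+ */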
+
+#ifdef __CUDACC__
+int i_div_up(int a, int b) { // Ceiling of a/b
+    return ((a % b) != 0) ? (a / b + 1) : (a / b);
+}
+
+__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim) {
+    // Respective equivalents of i, j and k in the loop run by the CPU
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
+
+    int input_dim = output_dim+k_size - 1;
+
+    if (idx >= columns || idy >= output_dim || idz >= output_dim) {
+        return;
+    }
+
+    float* bias_offset;
+    float* w_offset;
+    float* input_offset;
+    float* output_offset;
+
+    bias_offset = (float*)((char*)bias + (idx*output_dim+idy)*pitch_bias);
+    float f = bias_offset[idz];
+
+    for (int a=0; a < rows; a++) {
+        for (int b=0; b < k_size; b++) {
+            for (int c=0; c < k_size; c++) {
+                w_offset = (float*)((char*)w + ((a*columns + idx)*k_size+b)*pitch_w);
+                input_offset = (float*)((char*)input + (a*input_dim + idy+b)*pitch_input);
+                f += w_offset[c]*input_offset[idz+c];
+            }
+        }
+    }
+
+    output_offset = (float*)((char*)output + (idx*output_dim+idy)*pitch_output);
+    output_offset[idz] = f/(k_size);
+}
+
+void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
+    // Copy arrays
+    size_t pitch_input;
+    size_t pitch_output;
+    size_t pitch_bias;
+    size_t pitch_weight;
+    float*** input_dev;
+    float*** output_dev;
+    float*** kernel_bias;
+    float**** kernel_weight;
+
+    int input_dim = output_dim+kernel->k_size - 1;
+
+    // Copy ***input
+    gpuErrchk( cudaMallocPitch((void**)&input_dev, &pitch_input, input_dim*sizeof(float), kernel->rows*input_dim));
+    for (int i=0; i < kernel->rows; i++) {
+        for (int j=0; j < input_dim; j++) {
+            gpuErrchk( cudaMemcpy((void*)((char*)input_dev + (i*input_dim+j)*pitch_input), (const void*)&(input[i][j][0]), input_dim*sizeof(float), cudaMemcpyHostToDevice));
+        }
+    }
+    // cudaMalloc ***output
+    gpuErrchk( cudaMallocPitch((void**)&output_dev, &pitch_output, output_dim*sizeof(float), kernel->columns*output_dim));
+
+    // Copy ***Kernel bias
+    gpuErrchk( cudaMallocPitch((void**)&kernel_bias, &pitch_bias, output_dim*sizeof(float), kernel->columns*output_dim));
+    for (int i=0; i < kernel->columns; i++) {
+        for (int j=0; j < output_dim; j++) {
+            gpuErrchk( cudaMemcpy((void*)((char*)kernel_bias + (i*output_dim+j)*pitch_bias), (const void*)&(kernel->bias[i][j][0]), output_dim*sizeof(float), cudaMemcpyHostToDevice));
+        }
+    }
+
+    // Copy ****Kernel weights
+    gpuErrchk( cudaMallocPitch((void**)&kernel_weight, &pitch_weight, kernel->k_size*sizeof(float), (kernel->rows*kernel->columns*kernel->k_size)));
+    for (int i=0; i < kernel->rows; i++) {
+        for (int j=0; j < kernel->columns; j++) {
+            for (int k=0; k < kernel->k_size; k++) {
+                gpuErrchk( cudaMemcpy((void*)((char*)kernel_weight + ((i*kernel->columns+j)*kernel->k_size+k)*pitch_weight), (const void*)&(kernel->w[i][j][k][0]), kernel->k_size*sizeof(float), cudaMemcpyHostToDevice));
+            }
+        }
+    }
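+
+    /*
+     * Device layout: every host array [d1][d2][d3] is flattened to d1*d2
+     * rows of d3 floats, each padded to `pitch` bytes by cudaMallocPitch,
+     * so row (i, j) starts at base + (i*d2 + j)*pitch. The kernel above
+     * uses the same offset arithmetic to locate bias, weights and input.
+     */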
+
+    // Make computation
+    dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    make_convolution_kernel<<<gridSize, blockSize>>>(kernel->k_size, kernel->columns, kernel->rows, kernel_bias, pitch_bias, kernel_weight, pitch_weight, input_dev, pitch_input, output_dev, pitch_output, output_dim);
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Copy output back
+    for (int i=0; i < kernel->columns; i++) {
+        for (int j=0; j < output_dim; j++) {
+            gpuErrchk( cudaMemcpy((void*)&(output[i][j][0]), (const void*)((char*)output_dev + (i*output_dim+j)*pitch_output), output_dim*sizeof(float), cudaMemcpyDeviceToHost));
+        }
+    }
+
+    // Free all the allocated memory
+    gpuErrchk( cudaFree(input_dev) );
+    gpuErrchk( cudaFree(output_dev) );
+    gpuErrchk( cudaFree(kernel_bias) );
+    gpuErrchk( cudaFree(kernel_weight) );
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
+
+
+void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
+    #ifndef __CUDACC__
+    make_convolution_cpu(kernel, input, output, output_dim);
+    #else
+    make_convolution_device(kernel, input, output, output_dim);
+    #endif
+}
\ No newline at end of file
diff --git a/src/cnn/convolution.cu b/src/cnn/convolution.cu
new file mode 100644
index 0000000..5605590
--- /dev/null
+++ b/src/cnn/convolution.cu
@@ -0,0 +1,157 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "include/struct.h"
+
+#define BLOCKSIZE_x 16
+#define BLOCKSIZE_y 8
+#define BLOCKSIZE_z 8
+
+#ifdef __CUDACC__
+/* CUDA memcheck */
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+    if (code != cudaSuccess) {
+        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+        if (abort) exit(code);
+    }
+}
+#endif
+
+void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
+    // this is the kernel applied to input
+    // input[kernel->rows][kernel_k_size + output_dim-1][kernel_k_size + output_dim-1]
+    // output[kernel->columns][output_dim][output_dim]
+    float f;
+
+    for (int i=0; i < kernel->columns; i++) {
+        for (int j=0; j < output_dim; j++) {
+            for (int k=0; k < output_dim; k++) {
+                f = kernel->bias[i][j][k];
+                for (int a=0; a < kernel->rows; a++) {
+                    for (int b=0; b < kernel->k_size; b++) {
+                        for (int c=0; c < kernel->k_size; c++) {
+                            f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
+                        }
+                    }
+                }
+                output[i][j][k] = f/kernel->k_size; // Average
+            }
+        }
+    }
+}
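+
+/*
+ * Worked example: this is a "valid" convolution, so
+ * input_dim = output_dim + k_size - 1. With k_size = 3 and output_dim = 4,
+ * each input channel is 6x6 and every output coefficient accumulates
+ * rows * 3 * 3 = rows * 9 weighted terms before the division above.
+ */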
+
+#ifdef __CUDACC__
+int i_div_up(int a, int b) { // Ceiling of a/b
+    return ((a % b) != 0) ? (a / b + 1) : (a / b);
+}
+
+__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim) {
+    // Respective equivalents of i, j and k in the loop run by the CPU
+    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns
+    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
+    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
+
+    int input_dim = output_dim+k_size - 1;
+
+    if (idx >= columns || idy >= output_dim || idz >= output_dim) {
+        return;
+    }
+
+    float* bias_offset;
+    float* w_offset;
+    float* input_offset;
+    float* output_offset;
+
+    bias_offset = (float*)((char*)bias + (idx*output_dim+idy)*pitch_bias);
+    float f = bias_offset[idz];
+
+    for (int a=0; a < rows; a++) {
+        for (int b=0; b < k_size; b++) {
+            for (int c=0; c < k_size; c++) {
+                w_offset = (float*)((char*)w + ((a*columns + idx)*k_size+b)*pitch_w);
+                input_offset = (float*)((char*)input + (a*input_dim + idy+b)*pitch_input);
+                f += w_offset[c]*input_offset[idz+c];
+            }
+        }
+    }
+
+    output_offset = (float*)((char*)output + (idx*output_dim+idy)*pitch_output);
+    output_offset[idz] = f/(k_size);
+}
+
+void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
+    // Copy arrays
+    size_t pitch_input;
+    size_t pitch_output;
+    size_t pitch_bias;
+    size_t pitch_weight;
+    float*** input_dev;
+    float*** output_dev;
+    float*** kernel_bias;
+    float**** kernel_weight;
+
+    int input_dim = output_dim+kernel->k_size - 1;
+
+    // Copy ***input
+    gpuErrchk( cudaMallocPitch((void**)&input_dev, &pitch_input, input_dim*sizeof(float), kernel->rows*input_dim));
+    for (int i=0; i < kernel->rows; i++) {
+        for (int j=0; j < input_dim; j++) {
+            gpuErrchk( cudaMemcpy((void*)((char*)input_dev + (i*input_dim+j)*pitch_input), (const void*)&(input[i][j][0]), input_dim*sizeof(float), cudaMemcpyHostToDevice));
+        }
+    }
+    // cudaMalloc ***output
+    gpuErrchk( cudaMallocPitch((void**)&output_dev, &pitch_output, output_dim*sizeof(float), kernel->columns*output_dim));
+
+    // Copy ***Kernel bias
+    gpuErrchk( cudaMallocPitch((void**)&kernel_bias, &pitch_bias, output_dim*sizeof(float), kernel->columns*output_dim));
+    for (int i=0; i < kernel->columns; i++) {
+        for (int j=0; j < output_dim; j++) {
+            gpuErrchk( cudaMemcpy((void*)((char*)kernel_bias + (i*output_dim+j)*pitch_bias), (const void*)&(kernel->bias[i][j][0]), output_dim*sizeof(float), cudaMemcpyHostToDevice));
+        }
+    }
+
+    // Copy ****Kernel weights
+    gpuErrchk( cudaMallocPitch((void**)&kernel_weight, &pitch_weight, kernel->k_size*sizeof(float), (kernel->rows*kernel->columns*kernel->k_size)));
+    for (int i=0; i < kernel->rows; i++) {
+        for (int j=0; j < kernel->columns; j++) {
+            for (int k=0; k < kernel->k_size; k++) {
+                gpuErrchk( cudaMemcpy((void*)((char*)kernel_weight + ((i*kernel->columns+j)*kernel->k_size+k)*pitch_weight), (const void*)&(kernel->w[i][j][k][0]), kernel->k_size*sizeof(float), cudaMemcpyHostToDevice));
+            }
+        }
+    }
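+
+    /*
+     * Device layout: every host array [d1][d2][d3] is flattened to d1*d2
+     * rows of d3 floats, each padded to `pitch` bytes by cudaMallocPitch,
+     * so row (i, j) starts at base + (i*d2 + j)*pitch. The kernel above
+     * uses the same offset arithmetic to locate bias, weights and input.
+     */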
+
+    // Make computation
+    dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
+    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
+
+    make_convolution_kernel<<<gridSize, blockSize>>>(kernel->k_size, kernel->columns, kernel->rows, kernel_bias, pitch_bias, kernel_weight, pitch_weight, input_dev, pitch_input, output_dev, pitch_output, output_dim);
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // Copy output back
+    for (int i=0; i < kernel->columns; i++) {
+        for (int j=0; j < output_dim; j++) {
+            gpuErrchk( cudaMemcpy((void*)&(output[i][j][0]), (const void*)((char*)output_dev + (i*output_dim+j)*pitch_output), output_dim*sizeof(float), cudaMemcpyDeviceToHost));
+        }
+    }
+
+    // Free all the allocated memory
+    gpuErrchk( cudaFree(input_dev) );
+    gpuErrchk( cudaFree(output_dev) );
+    gpuErrchk( cudaFree(kernel_bias) );
+    gpuErrchk( cudaFree(kernel_weight) );
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+}
+#endif
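+
+
+/*
+ * __CUDACC__ is defined only when nvcc compiles this translation unit, so
+ * the dispatcher below resolves at compile time: the GPU path when built
+ * with nvcc, the CPU path when the plain C copy of this file is built
+ * with cc.
+ */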
"include/make.h" -void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) { - // c'est le kernel de input - // input[kernel->rows][kernel_k_size + output_dim-1][kernel_k_size + output_dim-1] - // output[kernel->columns][output_dim][output_dim] - float f; - int n = kernel->k_size; - for (int i=0; i < kernel->columns; i++) { - for (int j=0; j < output_dim; j++) { - for (int k=0; k < output_dim; k++) { - f = kernel->bias[i][j][k]; - for (int a=0; a < kernel->rows; a++) { - for (int b=0; b < n; b++) { - for (int c=0; c < n; c++) { - f += kernel->w[a][i][b][c]*input[a][j+b][k+c]; - } - } - } - output[i][j][k] = f/n; // Average - } - } - } -} void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { // input[output_depth][output_dim+size-1][output_dim+size-1] diff --git a/src/cnn/matrix_multiplication.cu b/src/cnn/matrix_multiplication.cu index bf0ffce..6812c4b 100644 --- a/src/cnn/matrix_multiplication.cu +++ b/src/cnn/matrix_multiplication.cu @@ -41,7 +41,7 @@ __global__ void matrix_mul_kernel(float* Md, float* Nd, float* Pd, int n, int p, for (int i = 0; i < n; i++) { M_offset = (float *)((char*)Md + i * pitch_m); P_offset = (float*)((char*)Pd + i * pitch_p); // P[i], pitch_p est un décalage en bytes - atomicAdd(&P_offset[ty], M_offset[tx] * Nxy); // P[i][ty] += P[i][tx] * N[tx][ty] + atomicAdd(&P_offset[ty], M_offset[tx] * Nxy); // P[i][ty] += P[i][tx] * N[tx][ty] } } diff --git a/test/cnn_convolution.cu b/test/cnn_convolution.cu new file mode 100644 index 0000000..eccee08 --- /dev/null +++ b/test/cnn_convolution.cu @@ -0,0 +1,180 @@ +#include +#include +#include +#include +#include +#include + +#include "../src/cnn/include/make.h" +#include "../src/cnn/include/convolution.h" +#include "../src/cnn/include/struct.h" + + +float random_float(float low, float high) { + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; +} + + +void fill_matrix_random(float ***matrix, int n, int p, int q, float max_val) { + for (int i=0; i < n; i++) { + for (int j=0; j < p; j++) { + for (int k=0; k < q; k++) { + matrix[i][j][k] = random_float(0.0f, max_val); + } + } + } +} + + +void print_matrix(float** mat, int n, int p) { + for (int i=0; i < n; i++) { + printf("[\t"); + for (int j=0; j < p; j++) { + printf("%0.1f\t", mat[i][j]); + } + printf("]\n"); + } +} + + +float*** create_matrix(int n, int p, int q, float max_val) { + float*** matrix = (float***)malloc(n*sizeof(float**)); + for (int i=0; i < n; i++) { + matrix[i] = (float**)malloc(sizeof(float*)*p); + for (int j=0; j < p; j++) { + matrix[i][j] = (float*)malloc(sizeof(float)*q); + } + } + + fill_matrix_random(matrix, n, p, q, max_val); + return matrix; +} + + +float*** create_empty_matrix(int n, int p, int q) { + float*** matrix = (float***)malloc(n*sizeof(float**)); + for (int i=0; i < n; i++) { + matrix[i] = (float**)malloc(sizeof(float*)*p); + for (int j=0; j < p; j++) { + matrix[i][j] = (float*)malloc(sizeof(float)*q); + for (int k=0; k < q; k++) { + matrix[i][j][k] = 0.; + } + } + } + return matrix; +} + +void free_matrix(float*** matrix, int n, int p) { + for (int i=0; i < n; i++) { + for (int j=0; j < p; j++) { + free(matrix[i][j]); + } + free(matrix[i]); + } + free(matrix); +} + + +bool check_matrices_equality(float*** m1, float*** m2, int n, int p, int q, int acceptation) { + for (int i=0; i < n; i++) { + for (int j=0; j < p; j++) { + for (int k=0; k < q; k++) { + if (fabs(m1[i][j][k] - m2[i][j][k]) > 0.01*acceptation) { + printf("diff %d 
+
+void run_convolution_test(int input_dim, int output_dim, int rows, int columns) {
+    assert(input_dim >= output_dim);
+    int k_size = input_dim - output_dim + 1;
+
+    // Generate random data
+    Kernel_cnn* kernel = (Kernel_cnn*)malloc(sizeof(Kernel_cnn));
+
+    kernel->k_size = k_size;
+    kernel->rows = rows;
+    kernel->columns = columns;
+
+    // bias[kernel->columns][dim_output][dim_output]
+    kernel->bias = create_matrix(kernel->columns, output_dim, output_dim, 15.0f);
+    kernel->d_bias = create_matrix(kernel->columns, output_dim, output_dim, 1.5f);
+    kernel->last_d_bias = create_matrix(kernel->columns, output_dim, output_dim, 0.1f);
+
+    // w[rows][columns][k_size][k_size]
+    kernel->w = (float****)malloc(sizeof(float***)*kernel->rows);
+    kernel->d_w = (float****)malloc(sizeof(float***)*kernel->rows);
+    kernel->last_d_w = (float****)malloc(sizeof(float***)*kernel->rows);
+    for (int i=0; i < kernel->rows; i++) {
+        kernel->w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 15.0f);
+        kernel->d_w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 1.5f);
+        kernel->last_d_w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 0.1f);
+    }
+
+    float*** input = create_matrix(kernel->rows, input_dim, input_dim, 5.0f);
+    float*** output_cpu = create_empty_matrix(kernel->columns, output_dim, output_dim);
+    float*** output_gpu = create_empty_matrix(kernel->columns, output_dim, output_dim);
+
+    printf("(%d, %d, %d, %d) Data generation complete\n", rows, columns, input_dim, output_dim);
+
+
+    // Run the computations
+    clock_t start, end;
+    double cpu_time_used, gpu_time_used;
+
+    start = clock();
+    make_convolution_device(kernel, input, output_gpu, output_dim);
+    end = clock();
+
+    gpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+    printf("(%d, %d, %d, %d) Time used for GPU: %lf seconds\n", rows, columns, input_dim, output_dim, gpu_time_used);
+
+
+    start = clock();
+    make_convolution_cpu(kernel, input, output_cpu, output_dim);
+    end = clock();
+
+    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+    printf("(%d, %d, %d, %d) Time used for CPU: %lf seconds\n", rows, columns, input_dim, output_dim, cpu_time_used);
+
+    // Check that the two matrices are equal
+    printf("(%d, %d, %d, %d) Checking equality.\n", rows, columns, input_dim, output_dim);
+    if (!check_matrices_equality(output_gpu, output_cpu, kernel->columns, output_dim, output_dim, kernel->k_size)) { // TODO: change acceptation
+        exit(1);
+    }
+    printf("OK\n");
+
+    free_matrix(kernel->bias, kernel->columns, output_dim);
+    free_matrix(kernel->d_bias, kernel->columns, output_dim);
+    free_matrix(kernel->last_d_bias, kernel->columns, output_dim);
+
+    for (int i=0; i < kernel->rows; i++) {
+        free_matrix(kernel->w[i], kernel->columns, kernel->k_size);
+        free_matrix(kernel->d_w[i], kernel->columns, kernel->k_size);
+        free_matrix(kernel->last_d_w[i], kernel->columns, kernel->k_size);
+    }
+    free(kernel->w);
+    free(kernel->d_w);
+    free(kernel->last_d_w);
+
+    free_matrix(input, kernel->rows, input_dim);
+    free_matrix(output_cpu, kernel->columns, output_dim);
+    free_matrix(output_gpu, kernel->columns, output_dim);
+}
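+
+
+/*
+ * Arguments are (input_dim, output_dim, rows, columns): k_size is derived
+ * as input_dim - output_dim + 1, so the last case below exercises a
+ * 191x191 kernel over 40 input channels.
+ */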
+int main() {
+    srand(time(NULL));
+
+    run_convolution_test(20, 15, 30, 40);
+    run_convolution_test(30, 25, 40, 50);
+    run_convolution_test(200, 10, 40, 50);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/test/cnn_matrix_multiplication.cu b/test/cnn_matrix_multiplication.cu
index e4035ee..3f7d875 100644
--- a/test/cnn_matrix_multiplication.cu
+++ b/test/cnn_matrix_multiplication.cu
@@ -69,7 +69,7 @@ bool check_matrices_equality(float** m1, float** m2, int n, int p, int acceptati
 void run_matrices_test(int n, int p, int q) {
     clock_t start, end;
-    double cpu_time_used;
+    double cpu_time_used, gpu_time_used;
 
     float** matrix1 = create_matrix(n, p);
     float** matrix2 = create_matrix(p, q);
@@ -90,8 +90,8 @@ void run_matrices_test(int n, int p, int q) {
     matrix_multiplication_host(matrix1, matrix2, result_cpu, n, p, q);
     end = clock();
 
-    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
-    printf("(%d,%d)x(%d,%d) Time used for CPU: %lf seconds\n", n, p, p, q, cpu_time_used);
+    gpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+    printf("(%d,%d)x(%d,%d) Time used for CPU: %lf seconds\n", n, p, p, q, gpu_time_used);
     printf("OK\n");
 
     // Vérification de l'égalité des matrices