Mirror of https://github.com/augustin64/projet-tipe, synced 2025-01-23 15:16:26 +01:00

Add cuda convolution

This commit is contained in:
parent a1dba81e17
commit 1608256e43
Makefile (11 lines changed)
@@ -65,13 +65,16 @@ $(BUILDDIR)/mnist_%.o: $(MNIST_SRCDIR)/%.c $(MNIST_SRCDIR)/include/%.h
 #
 cnn: $(BUILDDIR)/cnn-main;
 
-$(BUILDDIR)/cnn-main: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
+$(BUILDDIR)/cnn-main: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/cnn_convolution.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
 	$(CC) $(CFLAGS) $^ -o $@
 
+$(BUILDDIR)/cnn-main-cuda: $(CNN_SRCDIR)/main.c $(BUILDDIR)/cnn_train.o $(BUILDDIR)/cnn_cnn.o $(BUILDDIR)/cnn_creation.o $(BUILDDIR)/cnn_initialisation.o $(BUILDDIR)/cnn_make.o $(BUILDDIR)/cnn_neuron_io.o $(BUILDDIR)/cnn_function.o $(BUILDDIR)/cnn_utils.o $(BUILDDIR)/cnn_free.o $(BUILDDIR)/cnn_cuda_convolution.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
+	$(NVCC) $(NVCCFLAGS) $^ -o $@
+
 $(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.c $(CNN_SRCDIR)/include/%.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-$(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
+$(BUILDDIR)/cnn_cuda_%.o: $(CNN_SRCDIR)/%.cu $(CNN_SRCDIR)/include/%.h
 ifndef NVCC_INSTALLED
 	@echo "nvcc not found, skipping"
 else
@@ -90,7 +93,7 @@ run-tests: build-tests
 	$(foreach file, $(wildcard $(BUILDDIR)/test-*), $(file);)
 	$(foreach file, $(wildcard $(TEST_SRCDIR)/*.sh), $(file);)
 
-build-tests: prepare-tests $(TESTS_OBJ)
+build-tests: prepare-tests $(TESTS_OBJ) $(BUILDDIR)/test-cnn_matrix_multiplication $(BUILDDIR)/test-cnn_convolution
 
 
 prepare-tests:
@@ -104,7 +107,7 @@ build/test-cnn_%: test/cnn_%.c $(CNN_OBJ) $(BUILDDIR)/colors.o $(BUILDDIR)/mnist
 build/test-mnist_%: test/mnist_%.c $(MNIST_OBJ) $(BUILDDIR)/colors.o
 	$(CC) $(CFLAGS) $^ -o $@
 
-$(BUILDDIR)/test-cnn_matrix_multiplication: test/cnn_matrix_multiplication.cu $(BUILDDIR)/cnn_matrix_multiplication.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o
+$(BUILDDIR)/test-cnn_%: test/cnn_%.cu $(BUILDDIR)/cnn_cuda_%.o $(BUILDDIR)/colors.o $(BUILDDIR)/mnist.o $(CNN_OBJ)
 ifndef NVCC_INSTALLED
 	@echo "nvcc not found, skipping"
 else
src/cnn/convolution.c (new file, 159 lines)
@@ -0,0 +1,159 @@
/* This file is a copy of src/cnn/convolution.cu */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>

#include "include/struct.h"

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8


#ifdef __CUDACC__
/* CUDA memcheck */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
#endif

void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
    // This is the kernel applied to input
    // input[kernel->rows][kernel_k_size + output_dim-1][kernel_k_size + output_dim-1]
    // output[kernel->columns][output_dim][output_dim]
    float f;

    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            for (int k=0; k < output_dim; k++) {
                f = kernel->bias[i][j][k];
                for (int a=0; a < kernel->rows; a++) {
                    for (int b=0; b < kernel->k_size; b++) {
                        for (int c=0; c < kernel->k_size; c++) {
                            f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
                        }
                    }
                }
                output[i][j][k] = f/kernel->k_size; // Average
            }
        }
    }
}

#ifdef __CUDACC__
int i_div_up(int a, int b) { // Ceiling of a/b
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim) {
    // Respective equivalents of i, j and k in the loop run by the CPU
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim

    int input_dim = output_dim+k_size - 1;

    if (idx >= columns || idy >= output_dim || idz >= output_dim) {
        return;
    }

    float* bias_offset;
    float* w_offset;
    float* input_offset;
    float* output_offset;

    bias_offset = (float*)((char*)bias + (idx*output_dim+idy)*pitch_bias);
    float f = bias_offset[idz];

    for (int a=0; a < rows; a++) {
        for (int b=0; b < k_size; b++) {
            for (int c=0; c < k_size; c++) {
                w_offset = (float*)((char*)w + ((a*columns + idx)*k_size+b)*pitch_w);
                input_offset = (float*)((char*)input + (a*input_dim + idy+b)*pitch_input);
                f += w_offset[c]*input_offset[idz+c];
            }
        }
    }

    output_offset = (float*)((char*)output + (idx*output_dim+idy)*pitch_output);
    output_offset[idz] = f/(k_size);
}

void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
    // Copy arrays
    size_t pitch_input;
    size_t pitch_output;
    size_t pitch_bias;
    size_t pitch_weight;
    float*** input_dev;
    float*** output_dev;
    float*** kernel_bias;
    float**** kernel_weight;

    int input_dim = output_dim+kernel->k_size - 1;

    // Copy ***input
    gpuErrchk( cudaMallocPitch((void**)&input_dev, &pitch_input, input_dim*sizeof(float), kernel->rows*input_dim));
    for (int i=0; i < kernel->rows; i++) {
        for (int j=0; j < input_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)((char*)input_dev + (i*input_dim+j)*pitch_input), (const void*)&(input[i][j][0]), input_dim*sizeof(float), cudaMemcpyHostToDevice));
        }
    }
    // cudaMalloc ***output
    gpuErrchk( cudaMallocPitch((void**)&output_dev, &pitch_output, output_dim*sizeof(float), kernel->columns*output_dim));

    // Copy ***Kernel bias
    gpuErrchk( cudaMallocPitch((void**)&kernel_bias, &pitch_bias, output_dim*sizeof(float), kernel->columns*output_dim));
    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)((char*)kernel_bias + (i*output_dim+j)*pitch_bias), (const void*)&(kernel->bias[i][j][0]), output_dim*sizeof(float), cudaMemcpyHostToDevice));
        }
    }

    // Copy ****Kernel weights
    gpuErrchk( cudaMallocPitch((void**)&kernel_weight, &pitch_weight, kernel->k_size*sizeof(float), (kernel->rows*kernel->columns*kernel->k_size)));
    for (int i=0; i < kernel->rows; i++) {
        for (int j=0; j < kernel->columns; j++) {
            for (int k=0; k < kernel->k_size; k++) {
                gpuErrchk( cudaMemcpy((void*)((char*)kernel_weight + ((i*kernel->columns+j)*kernel->k_size+k)*pitch_weight), (const void*)&(kernel->w[i][j][k][0]), kernel->k_size*sizeof(float), cudaMemcpyHostToDevice));
            }
        }
    }

    // Make computation
    dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    make_convolution_kernel<<<gridSize, blockSize>>>(kernel->k_size, kernel->columns, kernel->rows, kernel_bias, pitch_bias, kernel_weight, pitch_weight, input_dev, pitch_input, output_dev, pitch_output, output_dim);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // Copy output back
    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)&(output[i][j][0]), (const void*)((char*)output_dev + (i*output_dim+j)*pitch_output), output_dim*sizeof(float), cudaMemcpyDeviceToHost));
        }
    }

    // Free all the allocated memory
    gpuErrchk( cudaFree(input_dev) );
    gpuErrchk( cudaFree(output_dev) );
    gpuErrchk( cudaFree(kernel_bias) );
    gpuErrchk( cudaFree(kernel_weight) );
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif


void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
#ifndef __CUDACC__
    make_convolution_cpu(kernel, input, output, output_dim);
#else
    make_convolution_device(kernel, input, output, output_dim);
#endif
}
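Aside: both make_convolution_cpu above and the GPU path rely on the valid-convolution size relation input_dim = output_dim + k_size - 1. A minimal standalone sketch of that arithmetic on a single channel (illustrative sizes and names, not project code):

/* Standalone sketch: one-channel valid convolution, showing why an
 * OUT x OUT output needs an (OUT + K - 1) x (OUT + K - 1) input. */
#include <stdio.h>

#define K   3              /* kernel side (k_size) */
#define OUT 2              /* output side (output_dim) */
#define IN  (OUT + K - 1)  /* input side: 4 */

int main(void) {
    float input[IN][IN], w[K][K], output[OUT][OUT];

    for (int i = 0; i < IN; i++)
        for (int j = 0; j < IN; j++)
            input[i][j] = (float)(i + j);
    for (int b = 0; b < K; b++)
        for (int c = 0; c < K; c++)
            w[b][c] = 1.0f;

    for (int j = 0; j < OUT; j++) {
        for (int k = 0; k < OUT; k++) {
            float f = 0.0f;  /* the project adds kernel->bias[i][j][k] here */
            for (int b = 0; b < K; b++)
                for (int c = 0; c < K; c++)
                    f += w[b][c] * input[j + b][k + c];  /* j+b <= OUT-1 + K-1 = IN-1 */
            output[j][k] = f;
            printf("output[%d][%d] = %.1f\n", j, k, output[j][k]);
        }
    }
    return 0;
}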
src/cnn/convolution.cu (new file, 157 lines)
@@ -0,0 +1,157 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>

#include "include/struct.h"

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8

#ifdef __CUDACC__
/* CUDA memcheck */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
#endif

void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
    // This is the kernel applied to input
    // input[kernel->rows][kernel_k_size + output_dim-1][kernel_k_size + output_dim-1]
    // output[kernel->columns][output_dim][output_dim]
    float f;

    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            for (int k=0; k < output_dim; k++) {
                f = kernel->bias[i][j][k];
                for (int a=0; a < kernel->rows; a++) {
                    for (int b=0; b < kernel->k_size; b++) {
                        for (int c=0; c < kernel->k_size; c++) {
                            f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
                        }
                    }
                }
                output[i][j][k] = f/kernel->k_size; // Average
            }
        }
    }
}

#ifdef __CUDACC__
int i_div_up(int a, int b) { // Ceiling of a/b
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim) {
    // Respective equivalents of i, j and k in the loop run by the CPU
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < kernel->columns
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim

    int input_dim = output_dim+k_size - 1;

    if (idx >= columns || idy >= output_dim || idz >= output_dim) {
        return;
    }

    float* bias_offset;
    float* w_offset;
    float* input_offset;
    float* output_offset;

    bias_offset = (float*)((char*)bias + (idx*output_dim+idy)*pitch_bias);
    float f = bias_offset[idz];

    for (int a=0; a < rows; a++) {
        for (int b=0; b < k_size; b++) {
            for (int c=0; c < k_size; c++) {
                w_offset = (float*)((char*)w + ((a*columns + idx)*k_size+b)*pitch_w);
                input_offset = (float*)((char*)input + (a*input_dim + idy+b)*pitch_input);
                f += w_offset[c]*input_offset[idz+c];
            }
        }
    }

    output_offset = (float*)((char*)output + (idx*output_dim+idy)*pitch_output);
    output_offset[idz] = f/(k_size);
}

void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
    // Copy arrays
    size_t pitch_input;
    size_t pitch_output;
    size_t pitch_bias;
    size_t pitch_weight;
    float*** input_dev;
    float*** output_dev;
    float*** kernel_bias;
    float**** kernel_weight;

    int input_dim = output_dim+kernel->k_size - 1;

    // Copy ***input
    gpuErrchk( cudaMallocPitch((void**)&input_dev, &pitch_input, input_dim*sizeof(float), kernel->rows*input_dim));
    for (int i=0; i < kernel->rows; i++) {
        for (int j=0; j < input_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)((char*)input_dev + (i*input_dim+j)*pitch_input), (const void*)&(input[i][j][0]), input_dim*sizeof(float), cudaMemcpyHostToDevice));
        }
    }
    // cudaMalloc ***output
    gpuErrchk( cudaMallocPitch((void**)&output_dev, &pitch_output, output_dim*sizeof(float), kernel->columns*output_dim));

    // Copy ***Kernel bias
    gpuErrchk( cudaMallocPitch((void**)&kernel_bias, &pitch_bias, output_dim*sizeof(float), kernel->columns*output_dim));
    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)((char*)kernel_bias + (i*output_dim+j)*pitch_bias), (const void*)&(kernel->bias[i][j][0]), output_dim*sizeof(float), cudaMemcpyHostToDevice));
        }
    }

    // Copy ****Kernel weights
    gpuErrchk( cudaMallocPitch((void**)&kernel_weight, &pitch_weight, kernel->k_size*sizeof(float), (kernel->rows*kernel->columns*kernel->k_size)));
    for (int i=0; i < kernel->rows; i++) {
        for (int j=0; j < kernel->columns; j++) {
            for (int k=0; k < kernel->k_size; k++) {
                gpuErrchk( cudaMemcpy((void*)((char*)kernel_weight + ((i*kernel->columns+j)*kernel->k_size+k)*pitch_weight), (const void*)&(kernel->w[i][j][k][0]), kernel->k_size*sizeof(float), cudaMemcpyHostToDevice));
            }
        }
    }

    // Make computation
    dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    make_convolution_kernel<<<gridSize, blockSize>>>(kernel->k_size, kernel->columns, kernel->rows, kernel_bias, pitch_bias, kernel_weight, pitch_weight, input_dev, pitch_input, output_dev, pitch_output, output_dim);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // Copy output back
    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)&(output[i][j][0]), (const void*)((char*)output_dev + (i*output_dim+j)*pitch_output), output_dim*sizeof(float), cudaMemcpyDeviceToHost));
        }
    }

    // Free all the allocated memory
    gpuErrchk( cudaFree(input_dev) );
    gpuErrchk( cudaFree(output_dev) );
    gpuErrchk( cudaFree(kernel_bias) );
    gpuErrchk( cudaFree(kernel_weight) );
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
#endif


void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
#ifndef __CUDACC__
    make_convolution_cpu(kernel, input, output, output_dim);
#else
    make_convolution_device(kernel, input, output, output_dim);
#endif
}
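Aside: make_convolution_device flattens each 3D array into a single pitched allocation, so element (i, j, k) of a [depth][dim][dim] array sits at byte offset (i*dim + j)*pitch plus k floats — the same arithmetic the kernel applies to bias, w, input and output. A standalone sketch of that convention (illustrative names, error checking omitted for brevity):

// Standalone sketch of the pitched 3D addressing used above (illustrative names).
#include <cstdio>

__global__ void touch(float *buf, size_t pitch, int depth, int dim) {
    // Element (i, j, k) lives at pitched row i*dim + j, column k.
    for (int i = 0; i < depth; i++)
        for (int j = 0; j < dim; j++) {
            float *row = (float *)((char *)buf + (i * dim + j) * pitch);
            for (int k = 0; k < dim; k++)
                row[k] += 1.0f;
        }
}

int main() {
    const int depth = 2, dim = 3;
    float host[depth][dim][dim] = {};
    float *dev; size_t pitch;
    // One pitched allocation holds depth*dim logical rows of dim floats each.
    cudaMallocPitch((void **)&dev, &pitch, dim * sizeof(float), depth * dim);
    for (int i = 0; i < depth; i++)
        for (int j = 0; j < dim; j++)
            cudaMemcpy((char *)dev + (i * dim + j) * pitch, host[i][j],
                       dim * sizeof(float), cudaMemcpyHostToDevice);
    touch<<<1, 1>>>(dev, pitch, depth, dim);
    cudaDeviceSynchronize();
    for (int i = 0; i < depth; i++)
        for (int j = 0; j < dim; j++)
            cudaMemcpy(host[i][j], (char *)dev + (i * dim + j) * pitch,
                       dim * sizeof(float), cudaMemcpyDeviceToHost);
    printf("host[1][2][0] = %f\n", host[1][2][0]);  // expected: 1.0
    cudaFree(dev);
    return 0;
}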
src/cnn/include/convolution.h (new file, 28 lines)
@@ -0,0 +1,28 @@
#include "struct.h"

/*
* Performs the convolution on the CPU
*/
void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);

#ifdef __CUDACC__
/*
* Ceiling of a/b
*/
int i_div_up(int a, int b);

/*
* Convolution kernel for the GPU
*/
__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim);

/*
* Performs the convolution on the GPU
*/
void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
#endif

/*
* Decides at compile time whether the convolution can be done on the GPU
*/
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
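Aside: i_div_up is what lets the launch grid cover output sizes that are not multiples of the block size: with output_dim = 20 and BLOCKSIZE_y = 8, i_div_up(20, 8) = 3 blocks span thread indices 0–23, and the kernel's bounds check discards 20–23. A quick standalone check of that arithmetic (illustrative values):

#include <assert.h>
#include <stdio.h>

/* Same ceiling division as declared above, redefined here to stay standalone. */
static int i_div_up(int a, int b) {
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

int main(void) {
    assert(i_div_up(20, 8) == 3);  /* 3 blocks of 8 threads cover 20 outputs */
    assert(i_div_up(16, 8) == 2);  /* exact multiple: no rounding needed */
    printf("grid size for output_dim=20, blocksize=8: %d\n", i_div_up(20, 8));
    return 0;
}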
src/cnn/include/make.h
@@ -4,22 +4,26 @@
 #define DEF_MAKE_H
 
 /*
- * Effectue une convolution sans stride
+ * Effectue une convolution sans stride sur le processeur
 */
-void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
+void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
 
 /*
- * Effecute un average pooling avec stride=size
+ * Effectue la convolution sur le CPU ou GPU
 */
+void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim);
+/*
+ * Effectue un average pooling avec stride=size
+*/
 void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim);
 
 /*
- * Effecute une full connection
+ * Effectue une full connection
 */
 void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output);
 
 /*
- * Effecute une full connection qui passe d'une matrice à un vecteur
+ * Effectue une full connection qui passe d'une matrice à un vecteur
 */
 void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output);
src/cnn/make.c
@@ -1,30 +1,9 @@
 #include <stdio.h>
 
+#include "../include/colors.h"
+#include "include/convolution.h"
 #include "include/make.h"
 
-void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
-	// This is the kernel applied to input
-	// input[kernel->rows][kernel_k_size + output_dim-1][kernel_k_size + output_dim-1]
-	// output[kernel->columns][output_dim][output_dim]
-	float f;
-	int n = kernel->k_size;
-	for (int i=0; i < kernel->columns; i++) {
-		for (int j=0; j < output_dim; j++) {
-			for (int k=0; k < output_dim; k++) {
-				f = kernel->bias[i][j][k];
-				for (int a=0; a < kernel->rows; a++) {
-					for (int b=0; b < n; b++) {
-						for (int c=0; c < n; c++) {
-							f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
-						}
-					}
-				}
-				output[i][j][k] = f/n; // Average
-			}
-		}
-	}
-}
-
 void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
 	// input[output_depth][output_dim+size-1][output_dim+size-1]
src/cnn/matrix_multiplication.cu
@@ -41,7 +41,7 @@ __global__ void matrix_mul_kernel(float* Md, float* Nd, float* Pd, int n, int p,
 	for (int i = 0; i < n; i++) {
 		M_offset = (float *)((char*)Md + i * pitch_m);
 		P_offset = (float*)((char*)Pd + i * pitch_p); // P[i], pitch_p is an offset in bytes
-		atomicAdd(&P_offset[ty], M_offset[tx] * Nxy); // P[i][ty] += P[i][tx] * N[tx][ty]
+		atomicAdd(&P_offset[ty], M_offset[tx] * Nxy); // P[i][ty] += M[i][tx] * N[tx][ty]
 	}
 }
test/cnn_convolution.cu (new file, 180 lines)
@@ -0,0 +1,180 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <assert.h>
#include <math.h>
#include <time.h>

#include "../src/cnn/include/make.h"
#include "../src/cnn/include/convolution.h"
#include "../src/cnn/include/struct.h"


float random_float(float low, float high) {
    float t = (float)rand() / (float)RAND_MAX;
    return (1.0f - t) * low + t * high;
}


void fill_matrix_random(float ***matrix, int n, int p, int q, float max_val) {
    for (int i=0; i < n; i++) {
        for (int j=0; j < p; j++) {
            for (int k=0; k < q; k++) {
                matrix[i][j][k] = random_float(0.0f, max_val);
            }
        }
    }
}


void print_matrix(float** mat, int n, int p) {
    for (int i=0; i < n; i++) {
        printf("[\t");
        for (int j=0; j < p; j++) {
            printf("%0.1f\t", mat[i][j]);
        }
        printf("]\n");
    }
}


float*** create_matrix(int n, int p, int q, float max_val) {
    float*** matrix = (float***)malloc(n*sizeof(float**));
    for (int i=0; i < n; i++) {
        matrix[i] = (float**)malloc(sizeof(float*)*p);
        for (int j=0; j < p; j++) {
            matrix[i][j] = (float*)malloc(sizeof(float)*q);
        }
    }

    fill_matrix_random(matrix, n, p, q, max_val);
    return matrix;
}


float*** create_empty_matrix(int n, int p, int q) {
    float*** matrix = (float***)malloc(n*sizeof(float**));
    for (int i=0; i < n; i++) {
        matrix[i] = (float**)malloc(sizeof(float*)*p);
        for (int j=0; j < p; j++) {
            matrix[i][j] = (float*)malloc(sizeof(float)*q);
            for (int k=0; k < q; k++) {
                matrix[i][j][k] = 0.;
            }
        }
    }
    return matrix;
}

void free_matrix(float*** matrix, int n, int p) {
    for (int i=0; i < n; i++) {
        for (int j=0; j < p; j++) {
            free(matrix[i][j]);
        }
        free(matrix[i]);
    }
    free(matrix);
}


bool check_matrices_equality(float*** m1, float*** m2, int n, int p, int q, int acceptation) {
    for (int i=0; i < n; i++) {
        for (int j=0; j < p; j++) {
            for (int k=0; k < q; k++) {
                if (fabs(m1[i][j][k] - m2[i][j][k]) > 0.01*acceptation) {
                    printf("diff %d %d %d: %f values: %f and %f\n", i, j, k, fabs(m1[i][j][k] - m2[i][j][k]), m1[i][j][k], m2[i][j][k]);
                    return false;
                }
            }
        }
    }
    return true;
}

void run_convolution_test(int input_dim, int output_dim, int rows, int columns) {
    assert(input_dim >= output_dim);
    int k_size = input_dim - output_dim + 1;

    // Random data generation
    Kernel_cnn* kernel = (Kernel_cnn*)malloc(sizeof(Kernel_cnn));

    kernel->k_size = k_size;
    kernel->rows = rows;
    kernel->columns = columns;

    // bias[kernel->columns][dim_output][dim_output]
    kernel->bias = create_matrix(kernel->columns, output_dim, output_dim, 15.0f);
    kernel->d_bias = create_matrix(kernel->columns, output_dim, output_dim, 1.5f);
    kernel->last_d_bias = create_matrix(kernel->columns, output_dim, output_dim, 0.1f);

    // w[rows][columns][k_size][k_size]
    kernel->w = (float****)malloc(sizeof(float***)*kernel->rows);
    kernel->d_w = (float****)malloc(sizeof(float***)*kernel->rows);
    kernel->last_d_w = (float****)malloc(sizeof(float***)*kernel->rows);
    for (int i=0; i < kernel->rows; i++) {
        kernel->w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 15.0f);
        kernel->d_w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 1.5f);
        kernel->last_d_w[i] = create_matrix(kernel->columns, kernel->k_size, kernel->k_size, 0.1f);
    }

    float*** input = create_matrix(kernel->rows, input_dim, input_dim, 5.0f);
    float*** output_cpu = create_empty_matrix(kernel->columns, output_dim, output_dim);
    float*** output_gpu = create_empty_matrix(kernel->columns, output_dim, output_dim);

    printf("(%d, %d, %d, %d) Data generation complete\n", rows, columns, input_dim, output_dim);


    // Run the computations
    clock_t start, end;
    double cpu_time_used, gpu_time_used;

    start = clock();
    make_convolution_device(kernel, input, output_gpu, output_dim);
    end = clock();

    gpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("(%d, %d, %d, %d) Time used for GPU: %lf seconds\n", rows, columns, input_dim, output_dim, gpu_time_used);


    start = clock();
    make_convolution_cpu(kernel, input, output_cpu, output_dim);
    end = clock();

    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("(%d, %d, %d, %d) Time used for CPU: %lf seconds\n", rows, columns, input_dim, output_dim, cpu_time_used);

    // Check that the two matrices are equal
    printf("(%d, %d, %d, %d) Checking equality.\n", rows, columns, input_dim, output_dim);
    if (!check_matrices_equality(output_gpu, output_cpu, kernel->columns, output_dim, output_dim, kernel->k_size)) {// TODO: change acceptation
        exit(1);
    }
    printf("OK\n");

    free_matrix(kernel->bias, kernel->columns, output_dim);
    free_matrix(kernel->d_bias, kernel->columns, output_dim);
    free_matrix(kernel->last_d_bias, kernel->columns, output_dim);

    for (int i=0; i < kernel->rows; i++) {
        free_matrix(kernel->w[i], kernel->columns, kernel->k_size);
        free_matrix(kernel->d_w[i], kernel->columns, kernel->k_size);
        free_matrix(kernel->last_d_w[i], kernel->columns, kernel->k_size);
    }
    free(kernel->w);
    free(kernel->d_w);
    free(kernel->last_d_w);

    free_matrix(input, kernel->rows, input_dim);
    free_matrix(output_cpu, kernel->columns, output_dim);
    free_matrix(output_gpu, kernel->columns, output_dim);
}


int main() {
    srand(time(NULL));

    run_convolution_test(20, 15, 30, 40);
    run_convolution_test(30, 25, 40, 50);
    run_convolution_test(200, 10, 40, 50);

    return 0;
}
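Aside: the test above times both paths with clock(), which measures CPU time; that is workable here only because make_convolution_device calls cudaDeviceSynchronize() before returning. For the GPU side, CUDA events are the more precise tool — a standalone sketch (illustrative, not part of the commit):

#include <cstdio>

__global__ void dummy() {}  // stand-in for make_convolution_kernel

int main() {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    dummy<<<1, 1>>>();           // the work being timed
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);  // wait until 'stop' has actually been reached

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds
    printf("kernel time: %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}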
test/cnn_matrix_multiplication.cu
@@ -69,7 +69,7 @@ bool check_matrices_equality(float** m1, float** m2, int n, int p, int acceptati
 
 void run_matrices_test(int n, int p, int q) {
 	clock_t start, end;
-	double cpu_time_used;
+	double cpu_time_used, gpu_time_used;
 
 	float** matrix1 = create_matrix(n, p);
 	float** matrix2 = create_matrix(p, q);
@@ -90,8 +90,8 @@ void run_matrices_test(int n, int p, int q) {
 	matrix_multiplication_host(matrix1, matrix2, result_cpu, n, p, q);
 	end = clock();
 
-	cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
-	printf("(%d,%d)x(%d,%d) Time used for CPU: %lf seconds\n", n, p, p, q, cpu_time_used);
+	gpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+	printf("(%d,%d)x(%d,%d) Time used for CPU: %lf seconds\n", n, p, p, q, gpu_time_used);
 	printf("OK\n");
 
 	// Check that the matrices are equal