mirror of
https://github.com/augustin64/projet-tipe
synced 2025-01-23 23:26:25 +01:00
Merge branch 'main' of https://github.com/julienChemillier/TIPE
This commit is contained in:
commit
98290092dc
15
Makefile
15
Makefile
@ -38,6 +38,7 @@ NVCCFLAGS = -g
|
||||
# -g
|
||||
# See memory leaks and Incorrect Read/Write
|
||||
# -fsanitize=address -lasan
|
||||
#! WARNING: test/cnn-neuron_io fails with this option enabled
|
||||
|
||||
all: mnist cnn;
|
||||
#
|
||||
@ -95,19 +96,19 @@ ifdef NVCC_INSTALLED
|
||||
$(BUILDDIR)/cnn-main-cuda: $(BUILDDIR)/cnn_main.cuda.o \
|
||||
$(BUILDDIR)/cnn_train.cuda.o \
|
||||
$(BUILDDIR)/cnn_test_network.cuda.o \
|
||||
$(BUILDDIR)/cnn_cnn.o \
|
||||
$(BUILDDIR)/cnn_cnn.cuda.o \
|
||||
$(BUILDDIR)/cnn_creation.cuda.o \
|
||||
$(BUILDDIR)/cnn_initialisation.o \
|
||||
$(BUILDDIR)/cnn_make.o \
|
||||
$(BUILDDIR)/cnn_initialisation.cuda.o \
|
||||
$(BUILDDIR)/cnn_cuda_make.o \
|
||||
$(BUILDDIR)/cnn_neuron_io.cuda.o \
|
||||
$(BUILDDIR)/cnn_function.o \
|
||||
$(BUILDDIR)/cnn_function.cuda.o \
|
||||
$(BUILDDIR)/cnn_utils.cuda.o \
|
||||
$(BUILDDIR)/cnn_update.o \
|
||||
$(BUILDDIR)/cnn_update.cuda.o \
|
||||
$(BUILDDIR)/cnn_free.cuda.o \
|
||||
$(BUILDDIR)/cnn_jpeg.cuda.o \
|
||||
$(BUILDDIR)/cnn_cuda_convolution.o \
|
||||
$(BUILDDIR)/cnn_backpropagation.o \
|
||||
$(BUILDDIR)/colors.o \
|
||||
$(BUILDDIR)/cnn_backpropagation.cuda.o \
|
||||
$(BUILDDIR)/colors.cuda.o \
|
||||
$(BUILDDIR)/mnist.cuda.o \
|
||||
$(BUILDDIR)/cuda_utils.o
|
||||
$(NVCC) $(LD_NVCCFLAGS) $(NVCCFLAGS) $^ -o $@
|
||||
|
@ -70,14 +70,11 @@ void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output
|
||||
}
|
||||
#endif
|
||||
|
||||
extern "C" {
|
||||
|
||||
extern "C"
|
||||
void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
|
||||
#ifndef __CUDACC__
|
||||
make_convolution_cpu(kernel, input, output, output_dim);
|
||||
#else
|
||||
make_convolution_device(kernel, input, output, output_dim);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
@ -16,21 +16,33 @@ void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int o
|
||||
/*
|
||||
* Effectue un average pooling avec stride=size
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim);
|
||||
|
||||
/*
|
||||
* Effectue un max pooling avec stride=size
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim);
|
||||
|
||||
/*
|
||||
* Effectue une full connection
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output);
|
||||
|
||||
/*
|
||||
* Effectue une full connection qui passe d'une matrice à un vecteur
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output);
|
||||
|
||||
#endif
|
231
src/cnn/make.c
231
src/cnn/make.c
@ -1,39 +1,140 @@
|
||||
#include <stdio.h>
|
||||
#include <float.h>
|
||||
|
||||
#include "../include/colors.h"
|
||||
#include "include/convolution.h"
|
||||
#include "../include/colors.h"
|
||||
#include "../include/utils.h"
|
||||
|
||||
#include "include/make.h"
|
||||
|
||||
#define BLOCKSIZE_x 16
|
||||
#define BLOCKSIZE_y 8
|
||||
#define BLOCKSIZE_z 8
|
||||
|
||||
float max_flt(float a, float b) {
|
||||
// Return the max between the two floats
|
||||
if (a>b) {
|
||||
if (a > b) {
|
||||
return a;
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Average Pooling
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_average_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
|
||||
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
|
||||
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
|
||||
int n = size*size;
|
||||
|
||||
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
|
||||
return;
|
||||
}
|
||||
|
||||
float m = FLT_MIN;
|
||||
float temp;
|
||||
|
||||
for (int a=0; a < size; a++) {
|
||||
for (int b=0; b < size; b++) {
|
||||
temp = input[idx][size*idy +a][size*idz +b];
|
||||
m = m > temp ? m : temp; // max(m, temp)
|
||||
}
|
||||
}
|
||||
output[idx][idy][idz] = m/(float)n;
|
||||
}
|
||||
|
||||
void make_average_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
|
||||
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
|
||||
|
||||
make_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
void make_average_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// input[output_depth][output_dim+size-1][output_dim+size-1]
|
||||
// output[output_depth][output_dim][output_dim]
|
||||
float sum;
|
||||
float m;
|
||||
int n = size*size;
|
||||
|
||||
for (int i=0; i < output_depth; i++) {
|
||||
for (int j=0; j < output_dim; j++) {
|
||||
for (int k=0; k < output_dim; k++) {
|
||||
sum = 0.;
|
||||
m = FLT_MIN;
|
||||
for (int a=0; a < size; a++) {
|
||||
for (int b=0; b < size; b++) {
|
||||
sum += input[i][size*j +a][size*k +b];
|
||||
m = max_flt(m, input[i][size*j +a][size*k +b]);
|
||||
}
|
||||
}
|
||||
output[i][j][k] = sum/(float)n;
|
||||
output[i][j][k] = m/(float)n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
#ifndef __CUDACC__
|
||||
make_average_pooling_cpu(input, output, size, output_depth, output_dim);
|
||||
#else
|
||||
make_average_pooling_device(input, output, size, output_depth, output_dim);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Max Pooling
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_max_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
|
||||
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
|
||||
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
|
||||
|
||||
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
|
||||
return;
|
||||
}
|
||||
|
||||
float m = FLT_MIN;
|
||||
float temp;
|
||||
|
||||
for (int a=0; a < size; a++) {
|
||||
for (int b=0; b < size; b++) {
|
||||
temp = input[idx][size*idy +a][size*idz +b];
|
||||
m = m > temp ? m : temp; // max(m, temp)
|
||||
}
|
||||
}
|
||||
output[idx][idy][idz] = m;
|
||||
}
|
||||
|
||||
void make_max_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
|
||||
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
|
||||
|
||||
make_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
void make_max_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// input[output_depth][output_dim+size-1][output_dim+size-1]
|
||||
// output[output_depth][output_dim][output_dim]
|
||||
float m;
|
||||
@ -52,7 +153,55 @@ void make_max_pooling(float*** input, float*** output, int size, int output_dept
|
||||
}
|
||||
}
|
||||
|
||||
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
#ifndef __CUDACC__
|
||||
make_max_pooling_cpu(input, output, size, output_depth, output_dim);
|
||||
#else
|
||||
make_max_pooling_device(input, output, size, output_depth, output_dim);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Dense
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
|
||||
|
||||
if (idx >= size_output) {
|
||||
return;
|
||||
}
|
||||
float f = kernel->bias[idx];
|
||||
|
||||
for (int j=0; j < size_input; j++) {
|
||||
f += kernel->weights[j][idx]*input[j];
|
||||
}
|
||||
output[idx] = f;
|
||||
}
|
||||
|
||||
void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
|
||||
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
|
||||
|
||||
make_dense_kernel<<<gridSize, blockSize>>>(kernel, input, output, size_input, size_output);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense_cpu(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
// input[size_input]
|
||||
// output[size_output]
|
||||
float f;
|
||||
@ -66,7 +215,56 @@ void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input,
|
||||
}
|
||||
}
|
||||
|
||||
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
#ifndef __CUDACC__
|
||||
make_dense_cpu(kernel, input, output, size_input, size_output);
|
||||
#else
|
||||
make_dense_device(kernel, input, output, size_input, size_output);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Dense linearised
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_dense_linearised_kernel(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
|
||||
|
||||
if (idx >= size_output) {
|
||||
return;
|
||||
}
|
||||
float f = 0;
|
||||
|
||||
for (int i=0; i < depth_input; i++) {
|
||||
for (int j=0; j < dim_input; j++) {
|
||||
for (int k=0; k < dim_input; k++) {
|
||||
f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
output[idx] = f;
|
||||
}
|
||||
|
||||
void make_dense_linearised_device(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
|
||||
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
|
||||
|
||||
make_dense_linearised_kernel<<<gridSize, blockSize>>>(kernel, input, output, depth_input, dim_input, size_output);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
void make_dense_linearised_cpu(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
// input[depth_input][dim_input][dim_input]
|
||||
// output[size_output]
|
||||
float f;
|
||||
@ -82,4 +280,15 @@ void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int
|
||||
}
|
||||
output[l] = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
#ifndef __CUDACC__
|
||||
make_dense_linearised_cpu(kernel, input, output, depth_input, dim_input, size_output);
|
||||
#else
|
||||
make_dense_linearised_device(kernel, input, output, depth_input, dim_input, size_output);
|
||||
#endif
|
||||
}
|
||||
|
294
src/cnn/make.cu
Normal file
294
src/cnn/make.cu
Normal file
@ -0,0 +1,294 @@
|
||||
#include <stdio.h>
|
||||
#include <float.h>
|
||||
|
||||
#include "include/convolution.h"
|
||||
#include "../include/colors.h"
|
||||
#include "../include/utils.h"
|
||||
|
||||
#include "include/make.h"
|
||||
|
||||
#define BLOCKSIZE_x 16
|
||||
#define BLOCKSIZE_y 8
|
||||
#define BLOCKSIZE_z 8
|
||||
|
||||
float max_flt(float a, float b) {
|
||||
// Return the max between the two floats
|
||||
if (a > b) {
|
||||
return a;
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Average Pooling
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_average_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
|
||||
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
|
||||
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
|
||||
int n = size*size;
|
||||
|
||||
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
|
||||
return;
|
||||
}
|
||||
|
||||
float m = FLT_MIN;
|
||||
float temp;
|
||||
|
||||
for (int a=0; a < size; a++) {
|
||||
for (int b=0; b < size; b++) {
|
||||
temp = input[idx][size*idy +a][size*idz +b];
|
||||
m = m > temp ? m : temp; // max(m, temp)
|
||||
}
|
||||
}
|
||||
output[idx][idy][idz] = m/(float)n;
|
||||
}
|
||||
|
||||
void make_average_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
|
||||
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
|
||||
|
||||
make_average_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
void make_average_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// input[output_depth][output_dim+size-1][output_dim+size-1]
|
||||
// output[output_depth][output_dim][output_dim]
|
||||
float m;
|
||||
int n = size*size;
|
||||
|
||||
for (int i=0; i < output_depth; i++) {
|
||||
for (int j=0; j < output_dim; j++) {
|
||||
for (int k=0; k < output_dim; k++) {
|
||||
m = FLT_MIN;
|
||||
for (int a=0; a < size; a++) {
|
||||
for (int b=0; b < size; b++) {
|
||||
m = max_flt(m, input[i][size*j +a][size*k +b]);
|
||||
}
|
||||
}
|
||||
output[i][j][k] = m/(float)n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
#ifndef __CUDACC__
|
||||
make_average_pooling_cpu(input, output, size, output_depth, output_dim);
|
||||
#else
|
||||
make_average_pooling_device(input, output, size, output_depth, output_dim);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Max Pooling
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_max_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth
|
||||
int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
|
||||
int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim
|
||||
|
||||
if (idx >= output_depth || idy >= output_dim || idz >= output_dim) {
|
||||
return;
|
||||
}
|
||||
|
||||
float m = FLT_MIN;
|
||||
float temp;
|
||||
|
||||
for (int a=0; a < size; a++) {
|
||||
for (int b=0; b < size; b++) {
|
||||
temp = input[idx][size*idy +a][size*idz +b];
|
||||
m = m > temp ? m : temp; // max(m, temp)
|
||||
}
|
||||
}
|
||||
output[idx][idy][idz] = m;
|
||||
}
|
||||
|
||||
void make_max_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
|
||||
dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);
|
||||
|
||||
make_max_pooling_kernel<<<gridSize, blockSize>>>(input, output, size, output_depth, output_dim);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
void make_max_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
// input[output_depth][output_dim+size-1][output_dim+size-1]
|
||||
// output[output_depth][output_dim][output_dim]
|
||||
float m;
|
||||
for (int i=0; i < output_depth; i++) {
|
||||
for (int j=0; j < output_dim; j++) {
|
||||
for (int k=0; k < output_dim; k++) {
|
||||
m = FLT_MIN;
|
||||
for (int a=0; a < size; a++) {
|
||||
for (int b=0; b < size; b++) {
|
||||
m = max_flt(m, input[i][size*j +a][size*k +b]);
|
||||
}
|
||||
}
|
||||
output[i][j][k] = m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) {
|
||||
#ifndef __CUDACC__
|
||||
make_max_pooling_cpu(input, output, size, output_depth, output_dim);
|
||||
#else
|
||||
make_max_pooling_device(input, output, size, output_depth, output_dim);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Dense
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
|
||||
|
||||
if (idx >= size_output) {
|
||||
return;
|
||||
}
|
||||
float f = kernel->bias[idx];
|
||||
|
||||
for (int j=0; j < size_input; j++) {
|
||||
f += kernel->weights[j][idx]*input[j];
|
||||
}
|
||||
output[idx] = f;
|
||||
}
|
||||
|
||||
void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
|
||||
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
|
||||
|
||||
make_dense_kernel<<<gridSize, blockSize>>>(kernel, input, output, size_input, size_output);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense_cpu(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
// input[size_input]
|
||||
// output[size_output]
|
||||
float f;
|
||||
|
||||
for (int i=0; i < size_output; i++) {
|
||||
f = kernel->bias[i];
|
||||
for (int j=0; j < size_input; j++) {
|
||||
f += kernel->weights[j][i]*input[j];
|
||||
}
|
||||
output[i] = f;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
|
||||
#ifndef __CUDACC__
|
||||
make_dense_cpu(kernel, input, output, size_input, size_output);
|
||||
#else
|
||||
make_dense_device(kernel, input, output, size_input, size_output);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Dense linearised
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
__global__ void make_dense_linearised_kernel(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
// Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
|
||||
int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
|
||||
|
||||
if (idx >= size_output) {
|
||||
return;
|
||||
}
|
||||
float f = 0;
|
||||
|
||||
for (int i=0; i < depth_input; i++) {
|
||||
for (int j=0; j < dim_input; j++) {
|
||||
for (int k=0; k < dim_input; k++) {
|
||||
f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
output[idx] = f;
|
||||
}
|
||||
|
||||
void make_dense_linearised_device(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
// Make computation
|
||||
dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
|
||||
dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
|
||||
|
||||
make_dense_linearised_kernel<<<gridSize, blockSize>>>(kernel, input, output, depth_input, dim_input, size_output);
|
||||
gpuErrchk( cudaPeekAtLastError() );
|
||||
gpuErrchk( cudaDeviceSynchronize() );
|
||||
}
|
||||
#endif
|
||||
|
||||
void make_dense_linearised_cpu(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
// input[depth_input][dim_input][dim_input]
|
||||
// output[size_output]
|
||||
float f;
|
||||
|
||||
for (int l=0; l < size_output; l++) {
|
||||
f = 0;
|
||||
for (int i=0; i < depth_input; i++) {
|
||||
for (int j=0; j < dim_input; j++) {
|
||||
for (int k=0; k < dim_input; k++) {
|
||||
f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][l];
|
||||
}
|
||||
}
|
||||
}
|
||||
output[l] = f;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C"
|
||||
#endif
|
||||
void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) {
|
||||
#ifndef __CUDACC__
|
||||
make_dense_linearised_cpu(kernel, input, output, depth_input, dim_input, size_output);
|
||||
#else
|
||||
make_dense_linearised_device(kernel, input, output, depth_input, dim_input, size_output);
|
||||
#endif
|
||||
}
|
@ -34,27 +34,18 @@ int i_div_up(int a, int b);
|
||||
* Vérification de la compatibilité CUDA
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
bool check_cuda_compatibility();
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void* nalloc(size_t sz);
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void gree(void* ptr);
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
25
src/utils.c
25
src/utils.c
@ -15,7 +15,7 @@ int i_div_up(int a, int b) { // Partie entière supérieure de a/b
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
bool check_cuda_compatibility() {
|
||||
#ifdef __CUDACC__
|
||||
@ -43,52 +43,37 @@ bool check_cuda_compatibility() {
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef USE_CUDA
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void* nalloc(size_t sz) {
|
||||
void* ptr = malloc(sz);
|
||||
return ptr;
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void gree(void* ptr) {
|
||||
free(ptr);
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void* nalloc(size_t sz) {
|
||||
void* ptr;
|
||||
cudaMallocManaged(&ptr, sz, cudaMemAttachHost);
|
||||
return ptr;
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void gree(void* ptr) {
|
||||
cudaFree(ptr);
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
25
src/utils.cu
25
src/utils.cu
@ -15,7 +15,7 @@ int i_div_up(int a, int b) { // Partie entière supérieure de a/b
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
bool check_cuda_compatibility() {
|
||||
#ifdef __CUDACC__
|
||||
@ -43,52 +43,37 @@ bool check_cuda_compatibility() {
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef USE_CUDA
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void* nalloc(size_t sz) {
|
||||
void* ptr = malloc(sz);
|
||||
return ptr;
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void gree(void* ptr) {
|
||||
free(ptr);
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void* nalloc(size_t sz) {
|
||||
void* ptr;
|
||||
cudaMallocManaged(&ptr, sz, cudaMemAttachHost);
|
||||
return ptr;
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
extern "C" {
|
||||
extern "C"
|
||||
#endif
|
||||
void gree(void* ptr) {
|
||||
cudaFree(ptr);
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user