diff --git a/Makefile b/Makefile index de4d738..b3e038b 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,7 @@ NVCCFLAGS = -g # -g # See memory leaks and Incorrect Read/Write # -fsanitize=address -lasan +#! WARNING: test/cnn-neuron_io fails with this option enabled all: mnist cnn; # @@ -95,19 +96,19 @@ ifdef NVCC_INSTALLED $(BUILDDIR)/cnn-main-cuda: $(BUILDDIR)/cnn_main.cuda.o \ $(BUILDDIR)/cnn_train.cuda.o \ $(BUILDDIR)/cnn_test_network.cuda.o \ - $(BUILDDIR)/cnn_cnn.o \ + $(BUILDDIR)/cnn_cnn.cuda.o \ $(BUILDDIR)/cnn_creation.cuda.o \ - $(BUILDDIR)/cnn_initialisation.o \ - $(BUILDDIR)/cnn_make.o \ + $(BUILDDIR)/cnn_initialisation.cuda.o \ + $(BUILDDIR)/cnn_cuda_make.o \ $(BUILDDIR)/cnn_neuron_io.cuda.o \ - $(BUILDDIR)/cnn_function.o \ + $(BUILDDIR)/cnn_function.cuda.o \ $(BUILDDIR)/cnn_utils.cuda.o \ - $(BUILDDIR)/cnn_update.o \ + $(BUILDDIR)/cnn_update.cuda.o \ $(BUILDDIR)/cnn_free.cuda.o \ $(BUILDDIR)/cnn_jpeg.cuda.o \ $(BUILDDIR)/cnn_cuda_convolution.o \ - $(BUILDDIR)/cnn_backpropagation.o \ - $(BUILDDIR)/colors.o \ + $(BUILDDIR)/cnn_backpropagation.cuda.o \ + $(BUILDDIR)/colors.cuda.o \ $(BUILDDIR)/mnist.cuda.o \ $(BUILDDIR)/cuda_utils.o $(NVCC) $(LD_NVCCFLAGS) $(NVCCFLAGS) $^ -o $@ diff --git a/src/cnn/convolution.cu b/src/cnn/convolution.cu index e3f4dc2..edd3fa5 100644 --- a/src/cnn/convolution.cu +++ b/src/cnn/convolution.cu @@ -70,14 +70,11 @@ void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output } #endif -extern "C" { - +extern "C" void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) { #ifndef __CUDACC__ make_convolution_cpu(kernel, input, output, output_dim); #else make_convolution_device(kernel, input, output, output_dim); #endif -} - } \ No newline at end of file diff --git a/src/cnn/include/make.h b/src/cnn/include/make.h index 1c859e9..af0f9ac 100644 --- a/src/cnn/include/make.h +++ b/src/cnn/include/make.h @@ -16,21 +16,33 @@ void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int o /* * Effectue un average pooling avec stride=size */ +#ifdef __CUDACC__ +extern "C" +#endif void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim); /* * Effectue un max pooling avec stride=size */ +#ifdef __CUDACC__ +extern "C" +#endif void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim); /* * Effectue une full connection */ +#ifdef __CUDACC__ +extern "C" +#endif void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output); /* * Effectue une full connection qui passe d'une matrice à un vecteur */ +#ifdef __CUDACC__ +extern "C" +#endif void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output); #endif \ No newline at end of file diff --git a/src/cnn/make.c b/src/cnn/make.c index 9aec661..b783808 100644 --- a/src/cnn/make.c +++ b/src/cnn/make.c @@ -1,39 +1,140 @@ #include #include -#include "../include/colors.h" #include "include/convolution.h" +#include "../include/colors.h" +#include "../include/utils.h" + #include "include/make.h" +#define BLOCKSIZE_x 16 +#define BLOCKSIZE_y 8 +#define BLOCKSIZE_z 8 + float max_flt(float a, float b) { // Return the max between the two floats - if (a>b) { + if (a > b) { return a; } return b; } -void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { + + + + +/* +* Average Pooling +*/ +#ifdef __CUDACC__ +__global__ void make_average_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth + int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim + int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim + int n = size*size; + + if (idx >= output_depth || idy >= output_dim || idz >= output_dim) { + return; + } + + float m = FLT_MIN; + float temp; + + for (int a=0; a < size; a++) { + for (int b=0; b < size; b++) { + temp = input[idx][size*idy +a][size*idz +b]; + m = m > temp ? m : temp; // max(m, temp) + } + } + output[idx][idy][idz] = m/(float)n; +} + +void make_average_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Make computation + dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z)); + dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z); + + make_average_pooling_kernel<<>>(input, output, size, output_depth, output_dim); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +void make_average_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) { // input[output_depth][output_dim+size-1][output_dim+size-1] // output[output_depth][output_dim][output_dim] - float sum; + float m; int n = size*size; + for (int i=0; i < output_depth; i++) { for (int j=0; j < output_dim; j++) { for (int k=0; k < output_dim; k++) { - sum = 0.; + m = FLT_MIN; for (int a=0; a < size; a++) { for (int b=0; b < size; b++) { - sum += input[i][size*j +a][size*k +b]; + m = max_flt(m, input[i][size*j +a][size*k +b]); } } - output[i][j][k] = sum/(float)n; + output[i][j][k] = m/(float)n; } } } } -void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { +#ifdef __CUDACC__ +extern "C" +#endif +void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { + #ifndef __CUDACC__ + make_average_pooling_cpu(input, output, size, output_depth, output_dim); + #else + make_average_pooling_device(input, output, size, output_depth, output_dim); + #endif +} + + + + + +/* +* Max Pooling +*/ +#ifdef __CUDACC__ +__global__ void make_max_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth + int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim + int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim + + if (idx >= output_depth || idy >= output_dim || idz >= output_dim) { + return; + } + + float m = FLT_MIN; + float temp; + + for (int a=0; a < size; a++) { + for (int b=0; b < size; b++) { + temp = input[idx][size*idy +a][size*idz +b]; + m = m > temp ? m : temp; // max(m, temp) + } + } + output[idx][idy][idz] = m; +} + +void make_max_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Make computation + dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z)); + dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z); + + make_max_pooling_kernel<<>>(input, output, size, output_depth, output_dim); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +void make_max_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) { // input[output_depth][output_dim+size-1][output_dim+size-1] // output[output_depth][output_dim][output_dim] float m; @@ -52,7 +153,55 @@ void make_max_pooling(float*** input, float*** output, int size, int output_dept } } -void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { +#ifdef __CUDACC__ +extern "C" +#endif +void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { + #ifndef __CUDACC__ + make_max_pooling_cpu(input, output, size, output_depth, output_dim); + #else + make_max_pooling_device(input, output, size, output_depth, output_dim); + #endif +} + + + + + +/* +* Dense +*/ +#ifdef __CUDACC__ +__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output + + if (idx >= size_output) { + return; + } + float f = kernel->bias[idx]; + + for (int j=0; j < size_input; j++) { + f += kernel->weights[j][idx]*input[j]; + } + output[idx] = f; +} + +void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { + // Make computation + dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1); + dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z); + + make_dense_kernel<<>>(kernel, input, output, size_input, size_output); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +#ifdef __CUDACC__ +extern "C" +#endif +void make_dense_cpu(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { // input[size_input] // output[size_output] float f; @@ -66,7 +215,56 @@ void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, } } -void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { +#ifdef __CUDACC__ +extern "C" +#endif +void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { + #ifndef __CUDACC__ + make_dense_cpu(kernel, input, output, size_input, size_output); + #else + make_dense_device(kernel, input, output, size_input, size_output); + #endif +} + + + + + +/* +* Dense linearised +*/ +#ifdef __CUDACC__ +__global__ void make_dense_linearised_kernel(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output + + if (idx >= size_output) { + return; + } + float f = 0; + + for (int i=0; i < depth_input; i++) { + for (int j=0; j < dim_input; j++) { + for (int k=0; k < dim_input; k++) { + f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][idx]; + } + } + } + output[idx] = f; +} + +void make_dense_linearised_device(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { + // Make computation + dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1); + dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z); + + make_dense_linearised_kernel<<>>(kernel, input, output, depth_input, dim_input, size_output); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +void make_dense_linearised_cpu(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { // input[depth_input][dim_input][dim_input] // output[size_output] float f; @@ -82,4 +280,15 @@ void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int } output[l] = f; } -} \ No newline at end of file +} + +#ifdef __CUDACC__ +extern "C" +#endif +void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { + #ifndef __CUDACC__ + make_dense_linearised_cpu(kernel, input, output, depth_input, dim_input, size_output); + #else + make_dense_linearised_device(kernel, input, output, depth_input, dim_input, size_output); + #endif +} diff --git a/src/cnn/make.cu b/src/cnn/make.cu new file mode 100644 index 0000000..b783808 --- /dev/null +++ b/src/cnn/make.cu @@ -0,0 +1,294 @@ +#include +#include + +#include "include/convolution.h" +#include "../include/colors.h" +#include "../include/utils.h" + +#include "include/make.h" + +#define BLOCKSIZE_x 16 +#define BLOCKSIZE_y 8 +#define BLOCKSIZE_z 8 + +float max_flt(float a, float b) { + // Return the max between the two floats + if (a > b) { + return a; + } + return b; +} + + + + + +/* +* Average Pooling +*/ +#ifdef __CUDACC__ +__global__ void make_average_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth + int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim + int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim + int n = size*size; + + if (idx >= output_depth || idy >= output_dim || idz >= output_dim) { + return; + } + + float m = FLT_MIN; + float temp; + + for (int a=0; a < size; a++) { + for (int b=0; b < size; b++) { + temp = input[idx][size*idy +a][size*idz +b]; + m = m > temp ? m : temp; // max(m, temp) + } + } + output[idx][idy][idz] = m/(float)n; +} + +void make_average_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Make computation + dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z)); + dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z); + + make_average_pooling_kernel<<>>(input, output, size, output_depth, output_dim); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +void make_average_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) { + // input[output_depth][output_dim+size-1][output_dim+size-1] + // output[output_depth][output_dim][output_dim] + float m; + int n = size*size; + + for (int i=0; i < output_depth; i++) { + for (int j=0; j < output_dim; j++) { + for (int k=0; k < output_dim; k++) { + m = FLT_MIN; + for (int a=0; a < size; a++) { + for (int b=0; b < size; b++) { + m = max_flt(m, input[i][size*j +a][size*k +b]); + } + } + output[i][j][k] = m/(float)n; + } + } + } +} + +#ifdef __CUDACC__ +extern "C" +#endif +void make_average_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { + #ifndef __CUDACC__ + make_average_pooling_cpu(input, output, size, output_depth, output_dim); + #else + make_average_pooling_device(input, output, size, output_depth, output_dim); + #endif +} + + + + + +/* +* Max Pooling +*/ +#ifdef __CUDACC__ +__global__ void make_max_pooling_kernel(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < output_depth + int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim + int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim + + if (idx >= output_depth || idy >= output_dim || idz >= output_dim) { + return; + } + + float m = FLT_MIN; + float temp; + + for (int a=0; a < size; a++) { + for (int b=0; b < size; b++) { + temp = input[idx][size*idy +a][size*idz +b]; + m = m > temp ? m : temp; // max(m, temp) + } + } + output[idx][idy][idz] = m; +} + +void make_max_pooling_device(float*** input, float*** output, int size, int output_depth, int output_dim) { + // Make computation + dim3 gridSize(i_div_up(output_depth, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z)); + dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z); + + make_max_pooling_kernel<<>>(input, output, size, output_depth, output_dim); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +void make_max_pooling_cpu(float*** input, float*** output, int size, int output_depth, int output_dim) { + // input[output_depth][output_dim+size-1][output_dim+size-1] + // output[output_depth][output_dim][output_dim] + float m; + for (int i=0; i < output_depth; i++) { + for (int j=0; j < output_dim; j++) { + for (int k=0; k < output_dim; k++) { + m = FLT_MIN; + for (int a=0; a < size; a++) { + for (int b=0; b < size; b++) { + m = max_flt(m, input[i][size*j +a][size*k +b]); + } + } + output[i][j][k] = m; + } + } + } +} + +#ifdef __CUDACC__ +extern "C" +#endif +void make_max_pooling(float*** input, float*** output, int size, int output_depth, int output_dim) { + #ifndef __CUDACC__ + make_max_pooling_cpu(input, output, size, output_depth, output_dim); + #else + make_max_pooling_device(input, output, size, output_depth, output_dim); + #endif +} + + + + + +/* +* Dense +*/ +#ifdef __CUDACC__ +__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output + + if (idx >= size_output) { + return; + } + float f = kernel->bias[idx]; + + for (int j=0; j < size_input; j++) { + f += kernel->weights[j][idx]*input[j]; + } + output[idx] = f; +} + +void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { + // Make computation + dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1); + dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z); + + make_dense_kernel<<>>(kernel, input, output, size_input, size_output); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +#ifdef __CUDACC__ +extern "C" +#endif +void make_dense_cpu(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { + // input[size_input] + // output[size_output] + float f; + + for (int i=0; i < size_output; i++) { + f = kernel->bias[i]; + for (int j=0; j < size_input; j++) { + f += kernel->weights[j][i]*input[j]; + } + output[i] = f; + } +} + +#ifdef __CUDACC__ +extern "C" +#endif +void make_dense(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) { + #ifndef __CUDACC__ + make_dense_cpu(kernel, input, output, size_input, size_output); + #else + make_dense_device(kernel, input, output, size_input, size_output); + #endif +} + + + + + +/* +* Dense linearised +*/ +#ifdef __CUDACC__ +__global__ void make_dense_linearised_kernel(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { + // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu + int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output + + if (idx >= size_output) { + return; + } + float f = 0; + + for (int i=0; i < depth_input; i++) { + for (int j=0; j < dim_input; j++) { + for (int k=0; k < dim_input; k++) { + f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][idx]; + } + } + } + output[idx] = f; +} + +void make_dense_linearised_device(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { + // Make computation + dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1); + dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z); + + make_dense_linearised_kernel<<>>(kernel, input, output, depth_input, dim_input, size_output); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); +} +#endif + +void make_dense_linearised_cpu(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { + // input[depth_input][dim_input][dim_input] + // output[size_output] + float f; + + for (int l=0; l < size_output; l++) { + f = 0; + for (int i=0; i < depth_input; i++) { + for (int j=0; j < dim_input; j++) { + for (int k=0; k < dim_input; k++) { + f += input[i][j][k]*kernel->weights[k + j*dim_input + i*depth_input][l]; + } + } + } + output[l] = f; + } +} + +#ifdef __CUDACC__ +extern "C" +#endif +void make_dense_linearised(Kernel_nn* kernel, float*** input, float* output, int depth_input, int dim_input, int size_output) { + #ifndef __CUDACC__ + make_dense_linearised_cpu(kernel, input, output, depth_input, dim_input, size_output); + #else + make_dense_linearised_device(kernel, input, output, depth_input, dim_input, size_output); + #endif +} diff --git a/src/include/utils.h b/src/include/utils.h index 5bc7ecc..96a78b4 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -34,27 +34,18 @@ int i_div_up(int a, int b); * Vérification de la compatibilité CUDA */ #ifdef __CUDACC__ -extern "C" { +extern "C" #endif bool check_cuda_compatibility(); -#ifdef __CUDACC__ -} -#endif #ifdef __CUDACC__ -extern "C" { +extern "C" #endif void* nalloc(size_t sz); -#ifdef __CUDACC__ -} -#endif #ifdef __CUDACC__ -extern "C" { +extern "C" #endif void gree(void* ptr); -#ifdef __CUDACC__ -} -#endif #endif \ No newline at end of file diff --git a/src/utils.c b/src/utils.c index 5868153..23f5847 100644 --- a/src/utils.c +++ b/src/utils.c @@ -15,7 +15,7 @@ int i_div_up(int a, int b) { // Partie entière supérieure de a/b } #ifdef __CUDACC__ -extern "C" { +extern "C" #endif bool check_cuda_compatibility() { #ifdef __CUDACC__ @@ -43,52 +43,37 @@ bool check_cuda_compatibility() { return false; #endif } -#ifdef __CUDACC__ -} -#endif #ifndef USE_CUDA #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void* nalloc(size_t sz) { void* ptr = malloc(sz); return ptr; } - #ifdef __CUDACC__ - } - #endif #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void gree(void* ptr) { free(ptr); } - #ifdef __CUDACC__ - } - #endif #else #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void* nalloc(size_t sz) { void* ptr; cudaMallocManaged(&ptr, sz, cudaMemAttachHost); return ptr; } - #ifdef __CUDACC__ - } - #endif #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void gree(void* ptr) { cudaFree(ptr); } - #ifdef __CUDACC__ - } - #endif #endif diff --git a/src/utils.cu b/src/utils.cu index 5868153..23f5847 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -15,7 +15,7 @@ int i_div_up(int a, int b) { // Partie entière supérieure de a/b } #ifdef __CUDACC__ -extern "C" { +extern "C" #endif bool check_cuda_compatibility() { #ifdef __CUDACC__ @@ -43,52 +43,37 @@ bool check_cuda_compatibility() { return false; #endif } -#ifdef __CUDACC__ -} -#endif #ifndef USE_CUDA #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void* nalloc(size_t sz) { void* ptr = malloc(sz); return ptr; } - #ifdef __CUDACC__ - } - #endif #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void gree(void* ptr) { free(ptr); } - #ifdef __CUDACC__ - } - #endif #else #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void* nalloc(size_t sz) { void* ptr; cudaMallocManaged(&ptr, sz, cudaMemAttachHost); return ptr; } - #ifdef __CUDACC__ - } - #endif #ifdef __CUDACC__ - extern "C" { + extern "C" #endif void gree(void* ptr) { cudaFree(ptr); } - #ifdef __CUDACC__ - } - #endif #endif