mirror of
https://github.com/augustin64/projet-tipe
synced 2025-03-13 22:35:21 +01:00
Add test.cu: remove redundancies in the code. Recolor the tests: add colors to make it clear whether the program runs correctly or not.
174 lines
6.3 KiB
C
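/*
 * Forward convolution of a CNN layer: a CPU implementation and, when the file
 * is compiled with nvcc (__CUDACC__ defined), a CUDA implementation that uses
 * pitched device memory.
 */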
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>

#include "include/struct.h"

#ifdef __CUDACC__
#include "../include/utils.h"
#else
bool check_cuda_compatibility() {
#ifdef __CUDACC__
    int nDevices;
    cudaDeviceProp prop;

    cudaGetDeviceCount(&nDevices);
    if (nDevices == 0) {
        printf("Pas d'utilisation du GPU\n\n");
        return false;
    }

    printf("GPUs disponibles:\n");

    for (int i=0; i < nDevices; i++) {
        cudaGetDeviceProperties(&prop, i);
        printf(" - %s\n", prop.name);
    }

    cudaGetDeviceProperties(&prop, 0);
    printf("Utilisation du GPU: " BLUE "%s" RESET " (Compute capability: %d.%d)\n\n", prop.name, prop.major, prop.minor);
    return true;
#else
    printf("Pas d'utilisation du GPU\n\n");
    return false;
#endif
}
#endif
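/*
 * Illustrative call site (a minimal sketch, not taken from this file): the
 * return value can be used to decide at runtime whether the GPU path is
 * available.
 *
 *     if (!check_cuda_compatibility()) {
 *         // no usable GPU: only the CPU implementation will run
 *     }
 */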

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 8
#define BLOCKSIZE_z 8
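// Note: 16 * 8 * 8 = 1024 threads per block, which is the per-block thread
// limit on current CUDA hardware, so one of these dimensions must shrink if
// another is increased.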

void make_convolution_cpu(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
    // Convolution of the input by this kernel
    // input[kernel->rows][kernel->k_size + output_dim-1][kernel->k_size + output_dim-1]
    // output[kernel->columns][output_dim][output_dim]
    float f;

    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            for (int k=0; k < output_dim; k++) {
                f = kernel->bias[i][j][k];
                for (int a=0; a < kernel->rows; a++) {
                    for (int b=0; b < kernel->k_size; b++) {
                        for (int c=0; c < kernel->k_size; c++) {
                            f += kernel->w[a][i][b][c]*input[a][j+b][k+c];
                        }
                    }
                }
                output[i][j][k] = f;
            }
        }
    }
}
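/*
 * In other words, the CPU path computes a "valid" (no padding, stride 1)
 * convolution:
 *
 *     output[i][j][k] = bias[i][j][k]
 *                     + sum over a, b, c of w[a][i][b][c] * input[a][j+b][k+c]
 *
 * with a < rows and b, c < k_size, so output_dim = input_dim - k_size + 1.
 */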

#ifdef __CUDACC__

__global__ void make_convolution_kernel(int k_size, int columns, int rows, float*** bias, size_t pitch_bias, float**** w, size_t pitch_w, float*** input, size_t pitch_input, float*** output, size_t pitch_output, int output_dim) {
    // Respective equivalents of i, j and k in the loop performed by the CPU
    int idx = threadIdx.x + blockDim.x*blockIdx.x; // < columns
    int idy = threadIdx.y + blockDim.y*blockIdx.y; // < output_dim
    int idz = threadIdx.z + blockDim.z*blockIdx.z; // < output_dim

    int input_dim = output_dim+k_size - 1;

    if (idx >= columns || idy >= output_dim || idz >= output_dim) {
        return;
    }

    float* bias_offset;
    float* w_offset;
    float* input_offset;
    float* output_offset;

    bias_offset = (float*)((char*)bias + (idx*output_dim+idy)*pitch_bias);
    float f = bias_offset[idz];

    for (int a=0; a < rows; a++) {
        for (int b=0; b < k_size; b++) {
            for (int c=0; c < k_size; c++) {
                w_offset = (float*)((char*)w + ((a*columns + idx)*k_size+b)*pitch_w);
                input_offset = (float*)((char*)input + (a*input_dim + idy+b)*pitch_input);
                f += w_offset[c]*input_offset[idz+c];
            }
        }
    }

    output_offset = (float*)((char*)output + (idx*output_dim+idy)*pitch_output);
    output_offset[idz] = f;
}
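/*
 * Pitched addressing used above: each 3D array arr[d0][d1][d2] is stored on
 * the device as a 2D pitched allocation with d0*d1 rows of d2 floats, so the
 * address of row (r0, r1) is (char*)base + (r0*d1 + r1)*pitch, and the last
 * index selects a float within that row.
 */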

void make_convolution_device(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
    // Copy arrays
    size_t pitch_input;
    size_t pitch_output;
    size_t pitch_bias;
    size_t pitch_weight;
    float*** input_dev;
    float*** output_dev;
    float*** kernel_bias;
    float**** kernel_weight;

    int input_dim = output_dim+kernel->k_size - 1;

    // Copy ***input
    gpuErrchk( cudaMallocPitch((void**)&input_dev, &pitch_input, input_dim*sizeof(float), kernel->rows*input_dim));
    for (int i=0; i < kernel->rows; i++) {
        for (int j=0; j < input_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)((char*)input_dev + (i*input_dim+j)*pitch_input), (const void*)&(input[i][j][0]), input_dim*sizeof(float), cudaMemcpyHostToDevice));
        }
    }
    // cudaMalloc ***output
    gpuErrchk( cudaMallocPitch((void**)&output_dev, &pitch_output, output_dim*sizeof(float), kernel->columns*output_dim));

    // Copy ***Kernel bias
    gpuErrchk( cudaMallocPitch((void**)&kernel_bias, &pitch_bias, output_dim*sizeof(float), kernel->columns*output_dim));
    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)((char*)kernel_bias + (i*output_dim+j)*pitch_bias), (const void*)&(kernel->bias[i][j][0]), output_dim*sizeof(float), cudaMemcpyHostToDevice));
        }
    }

    // Copy ****Kernel weights
    gpuErrchk( cudaMallocPitch((void**)&kernel_weight, &pitch_weight, kernel->k_size*sizeof(float), (kernel->rows*kernel->columns*kernel->k_size)));
    for (int i=0; i < kernel->rows; i++) {
        for (int j=0; j < kernel->columns; j++) {
            for (int k=0; k < kernel->k_size; k++) {
                gpuErrchk( cudaMemcpy((void*)((char*)kernel_weight + ((i*kernel->columns+j)*kernel->k_size+k)*pitch_weight), (const void*)&(kernel->w[i][j][k][0]), kernel->k_size*sizeof(float), cudaMemcpyHostToDevice));
            }
        }
    }

    // Make computation
    dim3 gridSize(i_div_up(kernel->columns, BLOCKSIZE_x), i_div_up(output_dim, BLOCKSIZE_y), i_div_up(output_dim, BLOCKSIZE_z));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y, BLOCKSIZE_z);

    make_convolution_kernel<<<gridSize, blockSize>>>(kernel->k_size, kernel->columns, kernel->rows, kernel_bias, pitch_bias, kernel_weight, pitch_weight, input_dev, pitch_input, output_dev, pitch_output, output_dim);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // Copy output back
    for (int i=0; i < kernel->columns; i++) {
        for (int j=0; j < output_dim; j++) {
            gpuErrchk( cudaMemcpy((void*)&(output[i][j][0]), (const void*)((char*)output_dev + (i*output_dim+j)*pitch_output), output_dim*sizeof(float), cudaMemcpyDeviceToHost));
        }
    }

    // Free all the allocated memory
    gpuErrchk( cudaFree(input_dev) );
    gpuErrchk( cudaFree(output_dev) );
    gpuErrchk( cudaFree(kernel_bias) );
    gpuErrchk( cudaFree(kernel_weight) );
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
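/*
 * Launch geometry: one thread per output element, i.e. a
 * columns x output_dim x output_dim set of threads, rounded up to whole
 * blocks by i_div_up; out-of-range threads exit early inside the kernel.
 */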
#endif


void make_convolution(Kernel_cnn* kernel, float*** input, float*** output, int output_dim) {
#ifndef __CUDACC__
    make_convolution_cpu(kernel, input, output, output_dim);
#else
    make_convolution_device(kernel, input, output, output_dim);
#endif
}
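/*
 * Illustrative usage (a minimal sketch; anything other than the rows, columns,
 * k_size, bias and w fields read above is an assumption, and allocation of the
 * nested arrays is left out):
 *
 *     int k_size = 3, output_dim = 26;   // e.g. a 28x28 input plane
 *     Kernel_cnn* kernel = ...;          // rows, columns, k_size, bias, w filled
 *     float*** input  = ...;             // [rows][output_dim+k_size-1][output_dim+k_size-1]
 *     float*** output = ...;             // [columns][output_dim][output_dim]
 *     make_convolution(kernel, input, output, output_dim);
 */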