From 957c17a1dfbd98d90d9fb312b5aaf70e56559611 Mon Sep 17 00:00:00 2001
From: augustin64 <augustin.lucas64@gmail.com>
Date: Sat, 27 May 2023 22:23:50 +0200
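Subject: [PATCH 1/3] Add cnn-export-cuda

Add a $(BUILDDIR)/cnn-export-cuda target that links
$(CNN_SRCDIR)/export.c with $(NVCC), and make it a prerequisite of the
`cnn` target.

Also allocate one more byte for the filename in write_image(): the
literal part of "%s_layer-%d_feature-%d.jpeg" is already 21 characters
long, so the previous size generally left no room for the terminating
NUL written by sprintf().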
---
 Makefile         | 20 +++++++++++++++++++-
 src/cnn/export.c |  2 +-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index c904d8b..c59348a 100644
--- a/Makefile
+++ b/Makefile
@@ -70,7 +70,7 @@ $(BUILDDIR)/dense_%.o: $(DENSE_SRCDIR)/%.c $(DENSE_SRCDIR)/include/%.h
 #
 # Build cnn
 #
-cnn: $(BUILDDIR)/cnn-main $(BUILDDIR)/cnn-main-cuda $(BUILDDIR)/cnn-preview $(BUILDDIR)/cnn-export;
+cnn: $(BUILDDIR)/cnn-main $(BUILDDIR)/cnn-main-cuda $(BUILDDIR)/cnn-preview $(BUILDDIR)/cnn-export $(BUILDDIR)/cnn-export-cuda;
 
 $(BUILDDIR)/cnn-main: $(CNN_SRCDIR)/main.c \
 		$(BUILDDIR)/cnn_backpropagation.o \
@@ -147,6 +147,24 @@ $(BUILDDIR)/cnn-export: $(CNN_SRCDIR)/export.c \
 	$(CC)  $^ -o $@  $(CFLAGS) $(LD_CFLAGS)
 
 
+$(BUILDDIR)/cnn-export-cuda: $(CNN_SRCDIR)/export.c \
+		$(BUILDDIR)/cnn_cuda_backpropagation.o \
+		$(BUILDDIR)/cnn_cuda_convolution.o \
+		$(BUILDDIR)/cnn_neuron_io.cuda.o \
+		$(BUILDDIR)/cnn_cuda_function.o \
+		$(BUILDDIR)/cnn_free.cuda.o \
+		$(BUILDDIR)/cnn_cuda_make.o \
+		$(BUILDDIR)/cnn_cnn.cuda.o \
+		$(BUILDDIR)/cnn_utils.cuda.o \
+		$(BUILDDIR)/cnn_jpeg.cuda.o \
+		\
+		$(BUILDDIR)/cuda_memory_management.o \
+		$(BUILDDIR)/colors.cuda.o \
+		$(BUILDDIR)/mnist.o \
+		$(BUILDDIR)/cuda_utils.o
+	$(NVCC)  $^ -o $@  $(NVCCFLAGS) $(LD_NVCCFLAGS)
+
+
 $(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.c $(CNN_SRCDIR)/include/%.h
 	$(CC)  -c $< -o $@  $(CFLAGS)
 
diff --git a/src/cnn/export.c b/src/cnn/export.c
index d304ab1..f571125 100644
--- a/src/cnn/export.c
+++ b/src/cnn/export.c
@@ -82,7 +82,7 @@ void print_poids_ker_cnn(char* modele) {
 
 
 void write_image(float** data, int width, int height, char* base_filename, int layer_id, int kernel_id) {
-    int filename_length = strlen(base_filename) + (int)log10(layer_id+1)+1 + (int)log10(kernel_id+1)+1 + 21;
+    int filename_length = strlen(base_filename) + (int)log10(layer_id+1)+1 + (int)log10(kernel_id+1)+1 + 21 +1;
     char* filename = (char*)malloc(sizeof(char)*filename_length);
 
     sprintf(filename, "%s_layer-%d_feature-%d.jpeg", base_filename, layer_id, kernel_id);

From 858c071bede9155c905cb824307ae73dcb9ce188 Mon Sep 17 00:00:00 2001
From: augustin64 <augustin.lucas64@gmail.com>
Date: Sat, 27 May 2023 22:26:09 +0200
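Subject: [PATCH 2/3] cnn-export-cuda: check if nvcc is available

Only define the real link rule when NVCC_INSTALLED is defined.
Otherwise the $(BUILDDIR)/cnn-export-cuda rule just prints
"$(NVCC) not found, skipping" instead of invoking $(NVCC).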
---
 Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index c59348a..0d66068 100644
--- a/Makefile
+++ b/Makefile
@@ -147,6 +147,7 @@ $(BUILDDIR)/cnn-export: $(CNN_SRCDIR)/export.c \
 	$(CC)  $^ -o $@  $(CFLAGS) $(LD_CFLAGS)
 
 
+ifdef NVCC_INSTALLED
 $(BUILDDIR)/cnn-export-cuda: $(CNN_SRCDIR)/export.c \
 		$(BUILDDIR)/cnn_cuda_backpropagation.o \
 		$(BUILDDIR)/cnn_cuda_convolution.o \
@@ -163,7 +164,10 @@ $(BUILDDIR)/cnn-export-cuda: $(CNN_SRCDIR)/export.c \
 		$(BUILDDIR)/mnist.o \
 		$(BUILDDIR)/cuda_utils.o
 	$(NVCC)  $^ -o $@  $(NVCCFLAGS) $(LD_NVCCFLAGS)
-
+else
+$(BUILDDIR)/cnn-export-cuda: $(CNN_SRCDIR)/export.c
+	@echo "$(NVCC) not found, skipping"
+endif
 
 $(BUILDDIR)/cnn_%.o: $(CNN_SRCDIR)/%.c $(CNN_SRCDIR)/include/%.h
 	$(CC)  -c $< -o $@  $(CFLAGS)

From d63fb2c8705b005e9a9b1d87e3b5e5887187f2f4 Mon Sep 17 00:00:00 2001
From: augustin64 <augustin.lucas64@gmail.com>
Date: Sun, 28 May 2023 09:26:12 +0200
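Subject: [PATCH 3/3] cnn/make: Fix misaligned address

make_dense_kernel no longer receives the Kernel_nn pointer. The
weights and bias pointers are now read from the struct on the host
side, in make_dense_device, and passed to the kernel as separate
arguments, so the device code never dereferences `kernel` itself.

The same change is applied to both src/cnn/make.c and src/cnn/make.cu.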
---
 src/cnn/make.c  | 8 ++++----
 src/cnn/make.cu | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/cnn/make.c b/src/cnn/make.c
index a205d39..96102bc 100644
--- a/src/cnn/make.c
+++ b/src/cnn/make.c
@@ -183,17 +183,17 @@ void make_max_pooling(float*** input, float*** output, int size, int output_dept
 * Dense
 */
 #ifdef __CUDACC__
-__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
+__global__ void make_dense_kernel(float** weights, float* bias, float* input, float* output, int size_input, int size_output) {
     // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
     int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
 
     if (idx >= size_output) {
         return;
     }
-    float f = kernel->bias[idx];
+    float f = bias[idx];
 
     for (int j=0; j < size_input; j++) {
-        f += kernel->weights[j][idx]*input[j];
+        f += weights[j][idx]*input[j];
     }
     output[idx] = f;
 }
@@ -203,7 +203,7 @@ void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_
     dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
     dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
 
-    make_dense_kernel<<<gridSize, blockSize>>>(kernel, input, output, size_input, size_output);
+    make_dense_kernel<<<gridSize, blockSize>>>(kernel->weights, kernel->bias, input, output, size_input, size_output);
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );
 }
diff --git a/src/cnn/make.cu b/src/cnn/make.cu
index a205d39..96102bc 100644
--- a/src/cnn/make.cu
+++ b/src/cnn/make.cu
@@ -183,17 +183,17 @@ void make_max_pooling(float*** input, float*** output, int size, int output_dept
 * Dense
 */
 #ifdef __CUDACC__
-__global__ void make_dense_kernel(Kernel_nn* kernel, float* input, float* output, int size_input, int size_output) {
+__global__ void make_dense_kernel(float** weights, float* bias, float* input, float* output, int size_input, int size_output) {
     // Équivalents respectifs de i, j et k dans la boucle effectuée par le cpu
     int idx = threadIdx.x + blockDim.x*blockIdx.x; // < size_output
 
     if (idx >= size_output) {
         return;
     }
-    float f = kernel->bias[idx];
+    float f = bias[idx];
 
     for (int j=0; j < size_input; j++) {
-        f += kernel->weights[j][idx]*input[j];
+        f += weights[j][idx]*input[j];
     }
     output[idx] = f;
 }
@@ -203,7 +203,7 @@ void make_dense_device(Kernel_nn* kernel, float* input, float* output, int size_
     dim3 gridSize(i_div_up(size_output, BLOCKSIZE_x*BLOCKSIZE_y), 1, 1);
     dim3 blockSize(BLOCKSIZE_x*BLOCKSIZE_y, 1, BLOCKSIZE_z);
 
-    make_dense_kernel<<<gridSize, blockSize>>>(kernel, input, output, size_input, size_output);
+    make_dense_kernel<<<gridSize, blockSize>>>(kernel->weights, kernel->bias, input, output, size_input, size_output);
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );
 }