mem management: Add optional tail optimisation

VGG16: memory usage increases by ~1%, but initialisation time (for CUDA) drops from about 1h to 2.4s
2025-04-19 04:23:53 +02:00 · 2023-05-15 10:07:00 +02:00 · 2023-05-15 10:07:00 +02:00 · 4cffcc1c95
commit 4cffcc1c95
parent 003183d3fd
3 changed files with 43 additions and 2 deletions
--- a/src/common/include/memory_management.h
+++ b/src/common/include/memory_management.h
@ -8,6 +8,11 @@
 // https://forums.developer.nvidia.com/t/find-the-limit-of-shared-memory-that-can-be-used-per-block/48556
 #define MEMORY_BLOCK 49152

+// Memory is only allocated in the most recently created block, so the list is not traversed
+// This slightly increases memory usage, but saves a substantial amount of time
+// For VGG16, about 1% more memory is used,
+// and initialisation drops from 1h02 to 2.4s on my hardware
+#define MEMORY_TAIL_OPT

 // We define our memory with a linked list of memory blocks
 typedef struct Memory {
--- a/src/common/memory_management.c
+++ b/src/common/memory_management.c
@ -8,8 +8,12 @@
 #include "include/utils.h"


-Memory* memory = NULL;
 pthread_mutex_t memory_lock = PTHREAD_MUTEX_INITIALIZER;
+Memory* memory = NULL;
+#ifdef MEMORY_TAIL_OPT
+    Memory* tail = NULL;
+#endif
+


 int get_distinct_allocations(Memory* mem) {
@ -68,6 +72,9 @@ Memory* create_memory_block(size_t size) {
    mem->nb_alloc = 0;
    mem->next = NULL;
    mem->id = rand() %100000;
+    #ifdef MEMORY_TAIL_OPT
+    tail = mem;
+    #endif
    
    return mem;
 }
@ -116,6 +123,13 @@ Memory* free_memory(void* ptr, Memory* mem) {
        // printf(GREEN "%p <= %p < %p\n" RESET, mem->start, ptr, (void*)((intptr_t)mem->start + mem->size));
        if (mem->nb_alloc == 0) {
            Memory* mem_next = mem->next;
+
+            #ifdef MEMORY_TAIL_OPT
+            if (tail == mem) {
+                tail = memory;
+            }
+            #endif
+
            #ifdef __CUDACC__
            cudaFree(mem->start);
            #else
@ -145,7 +159,11 @@ void* nalloc(int nb_elements, size_t size) {
        }
        //printf("Distinct allocations: %d Blocks: %d\n", get_distinct_allocations(memory), get_length(memory));
        //printf("Requested memory of size %ld\n", sz);
+        #ifdef MEMORY_TAIL_OPT
+        void* ptr = allocate_memory(nb_elements, size, tail);
+        #else
        void* ptr = allocate_memory(nb_elements, size, memory);
+        #endif

        pthread_mutex_unlock(&memory_lock);
        return ptr;
--- a/src/common/memory_management.cu
+++ b/src/common/memory_management.cu
@ -8,8 +8,12 @@
 #include "include/utils.h"


-Memory* memory = NULL;
 pthread_mutex_t memory_lock = PTHREAD_MUTEX_INITIALIZER;
+Memory* memory = NULL;
+#ifdef MEMORY_TAIL_OPT
+    Memory* tail = NULL;
+#endif
+


 int get_distinct_allocations(Memory* mem) {
@ -68,6 +72,9 @@ Memory* create_memory_block(size_t size) {
    mem->nb_alloc = 0;
    mem->next = NULL;
    mem->id = rand() %100000;
+    #ifdef MEMORY_TAIL_OPT
+    tail = mem;
+    #endif
    
    return mem;
 }
@ -116,6 +123,13 @@ Memory* free_memory(void* ptr, Memory* mem) {
        // printf(GREEN "%p <= %p < %p\n" RESET, mem->start, ptr, (void*)((intptr_t)mem->start + mem->size));
        if (mem->nb_alloc == 0) {
            Memory* mem_next = mem->next;
+
+            #ifdef MEMORY_TAIL_OPT
+            if (tail == mem) {
+                tail = memory;
+            }
+            #endif
+
            #ifdef __CUDACC__
            cudaFree(mem->start);
            #else
@ -145,7 +159,11 @@ void* nalloc(int nb_elements, size_t size) {
        }
        //printf("Distinct allocations: %d Blocks: %d\n", get_distinct_allocations(memory), get_length(memory));
        //printf("Requested memory of size %ld\n", sz);
+        #ifdef MEMORY_TAIL_OPT
+        void* ptr = allocate_memory(nb_elements, size, tail);
+        #else
        void* ptr = allocate_memory(nb_elements, size, memory);
+        #endif

        pthread_mutex_unlock(&memory_lock);
        return ptr;