From 4cffcc1c95b1636046ebc82d49181f451f373120 Mon Sep 17 00:00:00 2001 From: augustin64 Date: Mon, 15 May 2023 10:07:00 +0200 Subject: [PATCH] mem management: Add optional tail optimisation VGG16: memory usage increases of ~1%, but initialisation time (for CUDA) goes from 1h down to 2.4s --- src/common/include/memory_management.h | 5 +++++ src/common/memory_management.c | 20 +++++++++++++++++++- src/common/memory_management.cu | 20 +++++++++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/common/include/memory_management.h b/src/common/include/memory_management.h index 008a2b0..6f28cce 100644 --- a/src/common/include/memory_management.h +++ b/src/common/include/memory_management.h @@ -8,6 +8,11 @@ // https://forums.developer.nvidia.com/t/find-the-limit-of-shared-memory-that-can-be-used-per-block/48556 #define MEMORY_BLOCK 49152 +// On n'alloue de la mémoire que dans le dernier bloc créé, on ne parcourt donc pas la liste +// Cela augmente légèrement l'utilisation de la mémoire, mais permet un gain de temps conséquent +// Pour VGG16, environ 1% de mémoire supplémentaire utilisée, +// L'initialisation passe de 1h02 à 2.4s sur mon matériel +#define MEMORY_TAIL_OPT // We define our memory with a linked list of memory blocks typedef struct Memory { diff --git a/src/common/memory_management.c b/src/common/memory_management.c index c895694..5a403dc 100644 --- a/src/common/memory_management.c +++ b/src/common/memory_management.c @@ -8,8 +8,12 @@ #include "include/utils.h" -Memory* memory = NULL; pthread_mutex_t memory_lock = PTHREAD_MUTEX_INITIALIZER; +Memory* memory = NULL; +#ifdef MEMORY_TAIL_OPT + Memory* tail = NULL; +#endif + int get_distinct_allocations(Memory* mem) { @@ -68,6 +72,9 @@ Memory* create_memory_block(size_t size) { mem->nb_alloc = 0; mem->next = NULL; mem->id = rand() %100000; + #ifdef MEMORY_TAIL_OPT + tail = mem; + #endif return mem; } @@ -116,6 +123,13 @@ Memory* free_memory(void* ptr, Memory* mem) { // printf(GREEN "%p <= %p < %p\n" RESET, mem->start, ptr, (void*)((intptr_t)mem->start + mem->size)); if (mem->nb_alloc == 0) { Memory* mem_next = mem->next; + + #ifdef MEMORY_TAIL_OPT + if (tail == mem) { + tail = memory; + } + #endif + #ifdef __CUDACC__ cudaFree(mem->start); #else @@ -145,7 +159,11 @@ void* nalloc(int nb_elements, size_t size) { } //printf("Distinct allocations: %d Blocks: %d\n", get_distinct_allocations(memory), get_length(memory)); //printf("Requested memory of size %ld\n", sz); + #ifdef MEMORY_TAIL_OPT + void* ptr = allocate_memory(nb_elements, size, tail); + #else void* ptr = allocate_memory(nb_elements, size, memory); + #endif pthread_mutex_unlock(&memory_lock); return ptr; diff --git a/src/common/memory_management.cu b/src/common/memory_management.cu index c895694..5a403dc 100644 --- a/src/common/memory_management.cu +++ b/src/common/memory_management.cu @@ -8,8 +8,12 @@ #include "include/utils.h" -Memory* memory = NULL; pthread_mutex_t memory_lock = PTHREAD_MUTEX_INITIALIZER; +Memory* memory = NULL; +#ifdef MEMORY_TAIL_OPT + Memory* tail = NULL; +#endif + int get_distinct_allocations(Memory* mem) { @@ -68,6 +72,9 @@ Memory* create_memory_block(size_t size) { mem->nb_alloc = 0; mem->next = NULL; mem->id = rand() %100000; + #ifdef MEMORY_TAIL_OPT + tail = mem; + #endif return mem; } @@ -116,6 +123,13 @@ Memory* free_memory(void* ptr, Memory* mem) { // printf(GREEN "%p <= %p < %p\n" RESET, mem->start, ptr, (void*)((intptr_t)mem->start + mem->size)); if (mem->nb_alloc == 0) { Memory* mem_next = mem->next; + + #ifdef MEMORY_TAIL_OPT + if (tail == mem) { + tail = memory; + } + #endif + #ifdef __CUDACC__ cudaFree(mem->start); #else @@ -145,7 +159,11 @@ void* nalloc(int nb_elements, size_t size) { } //printf("Distinct allocations: %d Blocks: %d\n", get_distinct_allocations(memory), get_length(memory)); //printf("Requested memory of size %ld\n", sz); + #ifdef MEMORY_TAIL_OPT + void* ptr = allocate_memory(nb_elements, size, tail); + #else void* ptr = allocate_memory(nb_elements, size, memory); + #endif pthread_mutex_unlock(&memory_lock); return ptr;