From bc191015e387107f3163989cbaae60ac96d556a4 Mon Sep 17 00:00:00 2001 From: shivammonaka Date: Fri, 23 Feb 2024 18:36:13 +0530 Subject: [PATCH] Using OpenMP locks with NUM_PARALLEL --- driver/level3/level3_thread.c | 78 ++++++++++++++++++++++++++------- driver/others/blas_server_omp.c | 4 -- 2 files changed, 62 insertions(+), 20 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 02b60b50d9..ff32a74a97 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -548,13 +548,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { -#ifndef USE_OPENMP -#ifndef OS_WINDOWS -static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#ifdef USE_OPENMP + static omp_lock_t level3_lock, critical_section_lock; + static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0, + parallel_section_left = MAX_PARALLEL_NUMBER; + + // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c + while(omp_lock_initialized == 0) + { + blas_lock(&init_lock); + { + if(omp_lock_initialized == 0) + { + omp_init_lock(&level3_lock); + omp_init_lock(&critical_section_lock); + omp_lock_initialized = 1; + WMB; + } + blas_unlock(&init_lock); + } + } +#elif defined(OS_WINDOWS) + CRITICAL_SECTION level3_lock; + InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); #else -CRITICAL_SECTION level3_lock; -InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); -#endif + static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; #endif blas_arg_t newarg; @@ -597,12 +615,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); #endif #endif -#ifndef USE_OPENMP -#ifndef OS_WINDOWS -pthread_mutex_lock(&level3_lock); +#ifdef USE_OPENMP + omp_set_lock(&level3_lock); + omp_set_lock(&critical_section_lock); + + parallel_section_left--; + + /* + How OpenMP locks work with NUM_PARALLEL 
+ 1) parallel_section_left = Number of available concurrent executions of OpenBLAS - Number of currently executing OpenBLAS executions + 2) level3_lock is acting like a master lock or barrier which stops OpenBLAS calls when all the parallel_sections are currently busy executing other OpenBLAS calls + 3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently and for unlocking of master lock whenever required + 4) Unlock master lock only when we have not already exhausted all the parallel_sections and allow another thread with an OpenBLAS call to enter + */ + if(parallel_section_left != 0) + omp_unset_lock(&level3_lock); + + omp_unset_lock(&critical_section_lock); + +#elif defined(OS_WINDOWS) + EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); #else -EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); -#endif + pthread_mutex_lock(&level3_lock); #endif #ifdef USE_ALLOC_HEAP @@ -730,12 +764,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); free(job); #endif -#ifndef USE_OPENMP -#ifndef OS_WINDOWS - pthread_mutex_unlock(&level3_lock); -#else +#ifdef USE_OPENMP + omp_set_lock(&critical_section_lock); + parallel_section_left++; + + /* + Unlock master lock only when all the parallel_sections are already exhausted and one of the threads has completed its OpenBLAS call, otherwise just increment the parallel_section_left + The master lock is only locked when we have exhausted all the parallel_sections, so only unlock it then and otherwise just increment the count + */ + if(parallel_section_left == 1) + omp_unset_lock(&level3_lock); + + omp_unset_lock(&critical_section_lock); + +#elif defined(OS_WINDOWS) LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); -#endif +#else + pthread_mutex_unlock(&level3_lock); #endif return 0; diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 2e0c0f38c1..bcd9c29b5b 100644 --- a/driver/others/blas_server_omp.c +++ 
b/driver/others/blas_server_omp.c @@ -396,7 +396,6 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ } #endif - while(true) { for(i=0; i < MAX_PARALLEL_NUMBER; i++) { #ifdef HAVE_C11 _Bool inuse = false; @@ -409,9 +408,6 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } } - if(i != MAX_PARALLEL_NUMBER) - break; - } if (openblas_omp_adaptive_env() != 0) { #pragma omp parallel for num_threads(num) schedule(OMP_SCHED)