diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 41edc45d21..404ca7b552 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -192,6 +192,9 @@ static int main_status[MAX_CPU_NUMBER];
 BLASLONG exit_time[MAX_CPU_NUMBER];
 #endif
 
+/* Forward declarations; both helpers are defined at the end of this file. */
+static void exec_threads(int cpu, blas_queue_t *queue, int buf_index);
+static void adjust_thread_buffers(void);
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 
   if (!(mode & BLAS_COMPLEX)){
@@ -372,127 +375,6 @@ int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set)
 }
 #endif
 
-static void adjust_thread_buffers() {
-
-  int i=0;
-
-  //adjust buffer for each thread
-  for(i=0; i < blas_cpu_number; i++){
-    if(blas_thread_buffer[i] == NULL){
-      blas_thread_buffer[i] = blas_memory_alloc(2);
-    }
-  }
-  for(; i < MAX_CPU_NUMBER; i++){
-    if(blas_thread_buffer[i] != NULL){
-      blas_memory_free(blas_thread_buffer[i]);
-      blas_thread_buffer[i] = NULL;
-    }
-  }
-}
-
-static void exec_threads(int cpu, blas_queue_t *queue, int buf_index)
-{
-
-void *buffer, *sa, *sb;
-
-buffer = blas_thread_buffer[cpu];
-int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine;
-
-atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
-
-sa = queue -> sa;
-sb = queue -> sb;
-
-#ifdef SMP_DEBUG
-  if (queue -> args) {
-    fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
-      cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
-  }
-#endif
-#ifdef CONSISTENT_FPCSR
-#ifdef __aarch64__
-  __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
-#else
-  __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
-  __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
-#endif
-#endif
-
-
-#ifdef MONITOR
-  main_status[cpu] = MAIN_RUNNING1;
-#endif
-
-  if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
-
-  if (sb == NULL) {
-    if (!(queue -> mode & BLAS_COMPLEX)){
-#ifdef EXPRECISION
-      if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
-        sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
-          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-      } else
-#endif
-      if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
-#ifdef BUILD_DOUBLE
-        sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
-          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-      } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
-#ifdef BUILD_SINGLE
-        sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
-          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-      } else {
-        /* Other types in future */
-      }
-    } else {
-#ifdef EXPRECISION
-      if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
-        sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
-          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-      } else
-#endif
-      if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
-#ifdef BUILD_COMPLEX16
-        sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
-          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-      } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
-#ifdef BUILD_COMPLEX
-        sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
-          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-      } else {
-        /* Other types in future */
-      }
-    }
-    queue->sb=sb;
-  }
-
-#ifdef MONITOR
-  main_status[cpu] = MAIN_RUNNING2;
-#endif
-
-  if (queue -> mode & BLAS_LEGACY) {
-    legacy_exec(routine, queue -> mode, queue -> args, sb);
-  } else
-    if (queue -> mode & BLAS_PTHREAD) {
-      void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine;
-      (pthreadcompat)(queue -> args);
-    } else
-      (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
-
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
-#endif
-
-#ifdef MONITOR
-  main_status[cpu] = MAIN_FINISH;
-#endif
-
-}
-
 static void* blas_thread_server(void *arg){
 
   /* Thread identifier */
@@ -1148,4 +1030,150 @@ int BLASFUNC(blas_thread_shutdown)(void){
   return 0;
 }
 
+/*
+ * Bring the per-thread scratch buffers in line with blas_cpu_number:
+ * slots [0, blas_cpu_number) get a buffer if they have none, and every
+ * slot from blas_cpu_number up to MAX_CPU_NUMBER is released and cleared.
+ */
+static void adjust_thread_buffers(void) {
+
+  int i=0;
+
+  //adjust buffer for each thread
+  for(i=0; i < blas_cpu_number; i++){
+    if(blas_thread_buffer[i] == NULL){
+      blas_thread_buffer[i] = blas_memory_alloc(2);
+    }
+  }
+  for(; i < MAX_CPU_NUMBER; i++){
+    if(blas_thread_buffer[i] != NULL){
+      blas_memory_free(blas_thread_buffer[i]);
+      blas_thread_buffer[i] = NULL;
+    }
+  }
+}
+
+/*
+ * Run one queued work item on behalf of thread slot cpu.
+ *
+ *   cpu       - index into blas_thread_buffer[] and thread_status[]
+ *   queue     - work item; supplies routine, args, ranges, mode and the
+ *               optional sa/sb buffers (queue->sb is written back when it
+ *               was NULL on entry)
+ *   buf_index - not referenced in this function
+ *
+ * When queue->sa / queue->sb are NULL they are derived from the slot's
+ * buffer: sa at GEMM_OFFSET_A, sb after a GEMM_ALIGN-rounded P*Q panel for
+ * the precision selected by queue->mode.  If no branch matches, or the
+ * matching BUILD_* macro is not defined, sb stays NULL.
+ */
+static void exec_threads(int cpu, blas_queue_t *queue, int buf_index)
+{
+
+void *buffer, *sa, *sb;
+
+buffer = blas_thread_buffer[cpu];
+int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine;
+
+/* NOTE(review): the value 1 looks like an "in progress" marker for this
+   slot - confirm against the code that reads thread_status[].queue. */
+atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
+
+sa = queue -> sa;
+sb = queue -> sb;
+
+#ifdef SMP_DEBUG
+  if (queue -> args) {
+    /* Casts make each argument match its %ld / %x conversion. */
+    fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
+      (long)cpu, (unsigned int)queue->mode, (long)queue-> args ->m, (long)queue->args->n, (long)queue->args->k);
+  }
+#endif
+#ifdef CONSISTENT_FPCSR
+  /* Load the floating-point control state carried in the queue entry. */
+#ifdef __aarch64__
+  __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
+#else
+  __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
+  __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
+#endif
+#endif
+
+
+#ifdef MONITOR
+  main_status[cpu] = MAIN_RUNNING1;
+#endif
+
+  if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
+
+  /* No sb supplied by the queue entry: place it after the sa panel. */
+  if (sb == NULL) {
+    if (!(queue -> mode & BLAS_COMPLEX)){
+#ifdef EXPRECISION
+      if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
+        sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+      } else
+#endif
+      if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
+#ifdef BUILD_DOUBLE
+        sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
+#ifdef BUILD_SINGLE
+        sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else {
+        /* Other types in future */
+      }
+    } else {
+#ifdef EXPRECISION
+      if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
+        sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
+          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+      } else
+#endif
+      if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
+#ifdef BUILD_COMPLEX16
+        sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
+          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
+#ifdef BUILD_COMPLEX
+        sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
+          + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else {
+        /* Other types in future */
+      }
+    }
+    queue->sb=sb;
+  }
+
+#ifdef MONITOR
+  main_status[cpu] = MAIN_RUNNING2;
+#endif
+
+  /* Dispatch on queue->mode: BLAS_LEGACY, BLAS_PTHREAD, else routine(). */
+  if (queue -> mode & BLAS_LEGACY) {
+    legacy_exec(routine, queue -> mode, queue -> args, sb);
+  } else
+    if (queue -> mode & BLAS_PTHREAD) {
+      void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine;
+      (pthreadcompat)(queue -> args);
+    } else
+      (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
+
+#ifdef SMP_DEBUG
+  fprintf(STDERR, "Server[%2ld] Calculation finished!\n", (long)cpu);
+#endif
+
+#ifdef MONITOR
+  main_status[cpu] = MAIN_FINISH;
+#endif
+
+}
+
 #endif
\ No newline at end of file