From 2718b37fedc81d13f3917b5644bfa32081434e9b Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:57:13 +0530 Subject: [PATCH 01/66] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a6d25b50bd..cf74524c8d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -229,3 +229,13 @@ In chronological order: * Christopher Daley * [2024-01-24] Optimize GEMV forwarding on ARM64 systems + +* Aniket P. Garade + * [2024-10-30] Optimized scal Level-1 BLAS routines with ARM SVE + +* Sushil Pratap Singh + * [2024-10-30] Optimized swap Level-1 BLAS routines with ARM SVE + +* Juliya James + * [2024-10-30] Optimized rot Level-1 BLAS routines with ARM SVE + From 0667cf6c92396e3813c0dc58460312ff70df6c71 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:01:09 +0530 Subject: [PATCH 02/66] Added optimized scal routine files --- kernel/arm64/scal.c | 40 +++++++++++++++++++++++++ kernel/arm64/scal_kernel_c.c | 43 +++++++++++++++++++++++++++ kernel/arm64/scal_kernel_sve.c | 54 ++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+) create mode 100644 kernel/arm64/scal.c create mode 100644 kernel/arm64/scal_kernel_c.c create mode 100644 kernel/arm64/scal_kernel_sve.c diff --git a/kernel/arm64/scal.c b/kernel/arm64/scal.c new file mode 100644 index 0000000000..e64b0075e8 --- /dev/null +++ b/kernel/arm64/scal.c @@ -0,0 +1,40 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include "scal_kernel_sve.c" +#include "scal_kernel_c.c" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ((n <= 0) || (inc_x <= 0)) + return (0); + if (inc_x == 1) + scal_kernel_sve(n, x, da); + else + scal_kernel_c(n, da, x, inc_x, y, inc_y); + return (0); +} diff --git a/kernel/arm64/scal_kernel_c.c b/kernel/arm64/scal_kernel_c.c new file mode 100644 index 0000000000..659168da54 --- /dev/null +++ b/kernel/arm64/scal_kernel_c.c @@ -0,0 +1,43 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#include "common.h" + +static int scal_kernel_c(BLASLONG n, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i = 0, j = 0; + + while (j < n) + { + if (da == 0.0) + x[i] = 0.0; + else + x[i] = da * x[i]; + i += inc_x; + j++; + } + return (0); +} diff --git a/kernel/arm64/scal_kernel_sve.c b/kernel/arm64/scal_kernel_sve.c new file mode 100644 index 0000000000..ccd5a4cd2b --- /dev/null +++ b/kernel/arm64/scal_kernel_sve.c @@ -0,0 +1,54 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static int scal_kernel_sve(int n, FLOAT *x, FLOAT da) +{ + for (int i = 0; i < n; i += SVE_WIDTH) + { + svbool_t pg = SVE_WHILELT(i, n); + SVE_TYPE x_vec = svld1(pg, &x[i]); + SVE_TYPE result = svmul_z(pg, x_vec, da); + svst1(pg, &x[i], result); + } + return (0); +} From b8bc2a752eb66ff696a5e6ebc951d615cf61b854 Mon Sep 17 00:00:00 2001 From: SushilPratap04 Date: Wed, 30 Oct 2024 14:02:57 +0530 Subject: [PATCH 03/66] Added sve optimized kernels for swap routine --- kernel/arm64/swap.c | 40 ++++++++++++++++++++++ kernel/arm64/swap_kernel_c.c | 46 +++++++++++++++++++++++++ kernel/arm64/swap_kernel_sve.c | 62 ++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+) create mode 100644 kernel/arm64/swap.c create mode 100644 kernel/arm64/swap_kernel_c.c create mode 100644 kernel/arm64/swap_kernel_sve.c diff --git a/kernel/arm64/swap.c b/kernel/arm64/swap.c new file mode 100644 index 0000000000..c5af18e6ba --- /dev/null +++ b/kernel/arm64/swap.c @@ -0,0 +1,40 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#include "common.h" +#include "swap_kernel_sve.c" +#include "swap_kernel_c.c" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if (n <= 0) + return 0; + if (inc_x == 1 && inc_y == 1) + swap_kernel_sve(n, x, y); + else + swap_kernel_c(n, x, inc_x, y, inc_y); + return (0); +} diff --git a/kernel/arm64/swap_kernel_c.c b/kernel/arm64/swap_kernel_c.c new file mode 100644 index 0000000000..c1d7cc619a --- /dev/null +++ b/kernel/arm64/swap_kernel_c.c @@ -0,0 +1,46 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include "common.h" +#include + +static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + + while (i < n) + { + temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c new file mode 100644 index 0000000000..fed7e6d0f5 --- /dev/null +++ b/kernel/arm64/swap_kernel_sve.c @@ -0,0 +1,62 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG sve_width = SVE_WIDTH; + + for (BLASLONG i = 0; i < n; i += sve_width * 2) + { + svbool_t pg_a = SVE_WHILELT(i, n); + svbool_t pg_b = SVE_WHILELT((i + sve_width), n); + SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); + SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); + SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); + SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + svst1(pg_a, &x[i], y_vec_a); + svst1(pg_a, &y[i], x_vec_a); + svst1(pg_b, &x[i + sve_width], y_vec_b); + svst1(pg_b, &y[i + sve_width], x_vec_b); + } + return (0); +} From 7822ae961784234f21d95b3de3aff53dfb0f799a Mon Sep 17 00:00:00 2001 From: SushilPratap04 Date: Wed, 30 Oct 2024 14:05:21 +0530 Subject: [PATCH 04/66] Added sve kernels for 
rot routine. --- kernel/arm64/rot.c | 40 ++++++++++++++++++++++++ kernel/arm64/rot_kernel_c.c | 44 ++++++++++++++++++++++++++ kernel/arm64/rot_kernel_sve.c | 59 +++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100644 kernel/arm64/rot.c create mode 100644 kernel/arm64/rot_kernel_c.c create mode 100644 kernel/arm64/rot_kernel_sve.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c new file mode 100644 index 0000000000..abddc15381 --- /dev/null +++ b/kernel/arm64/rot.c @@ -0,0 +1,40 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include "rot_kernel_sve.c" +#include "rot_kernel_c.c" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + if (n <= 0) + return (0); + if (inc_x == 1 && inc_y == 1) + rot_kernel_sve(n, x, y, c, s); + else + rot_kernel_c(n, x, inc_x, y, inc_y, c, s); + return (0); +} diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c new file mode 100644 index 0000000000..f37d2db169 --- /dev/null +++ b/kernel/arm64/rot_kernel_c.c @@ -0,0 +1,44 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" + +static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + while (i < n) + { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c new file mode 100644 index 0000000000..0a790824f0 --- /dev/null +++ b/kernel/arm64/rot_kernel_sve.c @@ -0,0 +1,59 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#include "common.h" +#include + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + for (int i = 0; i < n; i += SVE_WIDTH) + { + svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n); + SVE_TYPE x_vec = svld1(pg, &x[i]); + SVE_TYPE y_vec = svld1(pg, &y[i]); + SVE_TYPE cx_vec = svmul_z(pg, x_vec, c); + SVE_TYPE sy_vec = svmul_z(pg, y_vec, s); + SVE_TYPE sx_vec = svmul_z(pg, x_vec, s); + SVE_TYPE cy_vec = svmul_z(pg, y_vec, c); + svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec)); + svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec)); + } + return (0); +} From fa880ab1cfed1b449a4cbfbd9e55a0d6c78d2e9e Mon Sep 17 00:00:00 2001 From: SushilPratap04 Date: Wed, 30 Oct 2024 14:09:37 +0530 Subject: [PATCH 05/66] Update KERNEL.ARMV8SVE updated KERNEL.ARMV8SVE for level 1 sve (swap, rot and scal) kernels. 
--- kernel/arm64/KERNEL.ARMV8SVE | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index bfadf5cba9..cecc72cf96 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -64,13 +64,13 @@ DAXPYKERNEL = daxpy_thunderx2t99.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S -SROTKERNEL = rot.S -DROTKERNEL = rot.S +SROTKERNEL = rot.c +DROTKERNEL = rot.c CROTKERNEL = zrot.S ZROTKERNEL = zrot.S -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S +SSCALKERNEL = scal.c +DSCALKERNEL = scal.c CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S @@ -94,8 +94,8 @@ DCOPYKERNEL = copy_thunderx2t99.c CCOPYKERNEL = copy_thunderx2t99.c ZCOPYKERNEL = copy_thunderx2t99.c -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S +SSWAPKERNEL = swap.c +DSWAPKERNEL = swap.c CSWAPKERNEL = swap_thunderx2t99.S ZSWAPKERNEL = swap_thunderx2t99.S From 668e28adc445edc5d905713daef246733bc62444 Mon Sep 17 00:00:00 2001 From: Juliya32 <116022942+Juliya32@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:22:31 +0530 Subject: [PATCH 06/66] Delete kernel/arm64/rot.c --- kernel/arm64/rot.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/rot.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c deleted file mode 100644 index abddc15381..0000000000 --- a/kernel/arm64/rot.c +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include "rot_kernel_sve.c" -#include "rot_kernel_c.c" - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - if (n <= 0) - return (0); - if (inc_x == 1 && inc_y == 1) - rot_kernel_sve(n, x, y, c, s); - else - rot_kernel_c(n, x, inc_x, y, inc_y, c, s); - return (0); -} From d90ee00f8595ef46c31cb30fa045a75e8ba0056b Mon Sep 17 00:00:00 2001 From: Juliya32 <116022942+Juliya32@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:22:51 +0530 Subject: [PATCH 07/66] Delete kernel/arm64/rot_kernel_c.c --- kernel/arm64/rot_kernel_c.c | 44 ------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_c.c diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c deleted file mode 100644 index f37d2db169..0000000000 --- a/kernel/arm64/rot_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#include "common.h" - -static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; - while (i < n) - { - temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - ix += inc_x; - iy += inc_y; - i++; - } - return (0); -} From 012fe4da36a31586965b1e25d70c62f7ad8ac713 Mon Sep 17 00:00:00 2001 From: Juliya32 <116022942+Juliya32@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:23:15 +0530 Subject: [PATCH 08/66] Delete kernel/arm64/rot_kernel_sve.c --- kernel/arm64/rot_kernel_sve.c | 59 ----------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_sve.c diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c deleted file mode 100644 index 0a790824f0..0000000000 --- a/kernel/arm64/rot_kernel_sve.c +++ /dev/null @@ -1,59 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include <arm_sve.h> - -#ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() -#else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() -#endif - -static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) -{ - for (int i = 0; i < n; i += SVE_WIDTH) - { - svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n); - SVE_TYPE x_vec = svld1(pg, &x[i]); - SVE_TYPE y_vec = svld1(pg, &y[i]); - SVE_TYPE cx_vec = svmul_z(pg, x_vec, c); - SVE_TYPE sy_vec = svmul_z(pg, y_vec, s); - SVE_TYPE sx_vec = svmul_z(pg, x_vec, s); - SVE_TYPE cy_vec = svmul_z(pg, y_vec, c); - svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec)); - svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec)); - } - return (0); -} From 3b2421cba0c73db40ba796e9d4f79161cba0b2d9 Mon Sep 17 00:00:00 2001 From: Juliya32 <116022942+Juliya32@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:23:42 +0530 Subject: [PATCH 09/66] Add files via upload --- kernel/arm64/rot.c | 40 ++++++++++++++++++++++++ kernel/arm64/rot_kernel_c.c | 44 ++++++++++++++++++++++++++ kernel/arm64/rot_kernel_sve.c | 59 +++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100644 kernel/arm64/rot.c create mode 100644 kernel/arm64/rot_kernel_c.c create mode 100644 kernel/arm64/rot_kernel_sve.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c new file mode 100644 index 0000000000..abddc15381 --- /dev/null +++ b/kernel/arm64/rot.c @@ -0,0 +1,40 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#include "common.h" +#include "rot_kernel_sve.c" +#include "rot_kernel_c.c" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + if (n <= 0) + return (0); + if (inc_x == 1 && inc_y == 1) + rot_kernel_sve(n, x, y, c, s); + else + rot_kernel_c(n, x, inc_x, y, inc_y, c, s); + return (0); +} diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c new file mode 100644 index 0000000000..f37d2db169 --- /dev/null +++ b/kernel/arm64/rot_kernel_c.c @@ -0,0 +1,44 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" + +static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + while (i < n) + { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c new file mode 100644 index 0000000000..0a790824f0 --- /dev/null +++ b/kernel/arm64/rot_kernel_sve.c @@ -0,0 +1,59 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include <arm_sve.h> + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + for (int i = 0; i < n; i += SVE_WIDTH) + { + svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n); + SVE_TYPE x_vec = svld1(pg, &x[i]); + SVE_TYPE y_vec = svld1(pg, &y[i]); + SVE_TYPE cx_vec = svmul_z(pg, x_vec, c); + SVE_TYPE sy_vec = svmul_z(pg, y_vec, s); + SVE_TYPE sx_vec = svmul_z(pg, x_vec, s); + SVE_TYPE cy_vec = svmul_z(pg, y_vec, c); + svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec)); + svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec)); + } + return (0); +} From 4060dd43e308e4dfa32a7024b98ef62314e9ed18 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Nov 2024 15:16:17 -0800 Subject: [PATCH 10/66] Add dummy implementations of openblas_get/set_affinity ---
driver/others/blas_server_omp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4341389d81..38b48fc842 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -126,6 +126,18 @@ void openblas_set_num_threads(int num_threads) { goto_set_num_threads(num_threads); } +#ifdef OS_LINUX + +int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { + fprintf(stderr,"OpenBLAS: use OpenMP environment variables for setting cpu affinity\n"); + return -1; +} +int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { + fprintf(stderr,"OpenBLAS: use OpenMP environment variables for querying cpu affinity\n"); + return -1; +} +#endif + int blas_thread_init(void){ #if defined(__FreeBSD__) && defined(__clang__) From 9db51f790a53a0f9af295ca284bc76ce648537ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2024 23:19:58 +0100 Subject: [PATCH 11/66] Remove any optimization flags from DEBUG builds on POWER architecture --- Makefile.system | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.system b/Makefile.system index 21a0fc3caa..29ea819f13 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1615,6 +1615,13 @@ NO_AFFINITY = 1 endif endif +ifeq ($(ARCH), POWER) +ifeq ($(DEBUG), 1) +CCOMMON_OPT := $(filter-out -O%, $(CCOMMON_OPT)) -O0 +FCOMMON_OPT := $(filter-out -O%, $(FCOMMON_OPT)) -O0 +endif +endif + ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) override undefine NO_AFFINITY From a0131e56e09c75372740579981becf75fab11edd Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 21 Nov 2024 13:56:54 +0100 Subject: [PATCH 12/66] doc: update README to link to the html docs and fix links Also some minor formatting improvements and linking the home page. 
--- README.md | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a31588be02..f527fd429c 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,14 @@ OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=O OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. -Please read the documentation in the OpenBLAS folder: . +For more information about OpenBLAS, please see: + +- The documentation at [openmathlib.org/OpenBLAS/docs/](http://www.openmathlib.org/OpenBLAS/docs), +- The home page at [openmathlib.org/OpenBLAS/](http://www.openmathlib.org/OpenBLAS). For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: . On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six -20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare or Youtube may be helpful. +20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare [here](https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/) or YouTube [here](https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek) may be helpful. ## Binary Packages @@ -27,15 +30,17 @@ We provide official binary packages for the following platform: * Windows x86/x86_64 -You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/OpenMathLib/OpenBLAS/releases](https://github.com/OpenMathLib/OpenBLAS/releases). 
+You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the [Releases section of the GitHub project page](https://github.com/OpenMathLib/OpenBLAS/releases). + +OpenBLAS is also packaged for many package managers - see [the installation section of the docs](http://www.openmathlib.org/OpenBLAS/docs/install/) for details. ## Installation from Source -Download from project homepage, https://github.com/OpenMathLib/OpenBLAS/, or check out the code -using Git from https://github.com/OpenMathLib/OpenBLAS.git. (If you want the most up to date version, be -sure to use the develop branch - master is several years out of date due to a change of maintainership.) -Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. -Most can also be given directly on the make or cmake command line. +Obtain the source code from https://github.com/OpenMathLib/OpenBLAS/. Note that the default branch +is `develop` (a `master` branch is still present, but far out of date). + +Build-time parameters can be chosen in `Makefile.rule`, see there for a short description of each option. +Most options can also be given directly on the command line as parameters to your `make` or `cmake` invocation. ### Dependencies @@ -60,6 +65,9 @@ For building with `cmake`, the usual conventions apply, i.e. create a build dire OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any build options you plan to set. +For more details, see the [Building from source](http://www.openmathlib.org/OpenBLAS/docs/install/#building-from-source) +section in the docs. + ### Cross compile Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler. 
@@ -76,10 +84,12 @@ Examples: make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc ``` -You can find instructions for other cases both in the "Supported Systems" section below and in the docs folder. The .yml scripts included with the sources (which contain the +You can find instructions for other cases both in the "Supported Systems" section below and in +the [Building from source docs](http://www.openmathlib.org/OpenBLAS/docs/install). +The `.yml` scripts included with the sources (which contain the build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints. -When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build. +When compiling for a more modern CPU target of the same architecture, e.g. `TARGET=SKYLAKEX` on a `HASWELL` host, option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build. ### Debug version @@ -325,7 +335,7 @@ Please see Changelog.txt. ## Troubleshooting -* Please read the [FAQ](https://github.com/OpenMathLib/OpenBLAS/docs/faq,md) in the docs folder first. +* Please read the [FAQ](http://www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. @@ -350,4 +360,4 @@ Please see Changelog.txt. ## Donation -Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). +Please see [the donations section](http://www.openmathlib.org/OpenBLAS/docs/about/#donations) in the docs.
From 0b3db03d4b41b86bb26170e4f4e36785ced9d947 Mon Sep 17 00:00:00 2001 From: daichengrong Date: Fri, 22 Nov 2024 11:13:24 +0800 Subject: [PATCH 13/66] added optimizations for RISC-V YIELDING --- common.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common.h b/common.h index b8bac1ad27..b3fc6d7ded 100644 --- a/common.h +++ b/common.h @@ -372,6 +372,12 @@ typedef int blasint; #endif #endif +#if defined(ARCH_RISCV64) +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + #ifdef __EMSCRIPTEN__ #define YIELDING From 3a63bbabd1e032b4e0e5ef4199f7c19ff1a5594e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 22 Nov 2024 12:10:56 +0100 Subject: [PATCH 14/66] Add compiler version notes and mention the f2c fallback LAPACK --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f527fd429c..d8e73b2022 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,12 @@ Most options can also be given directly on the command line as parameters to you Building OpenBLAS requires the following to be installed: * GNU Make or CMake -* A C compiler, e.g. GCC or Clang +* A C compiler, e.g. GCC or Clang * A Fortran compiler (optional, for LAPACK) +In general, using a recent version of the compiler is strongly recommended. +If a Fortran compiler is not available, it is possible to compile an older version of the included LAPACK +that has been machine-translated to C. ### Normal compile @@ -339,7 +342,10 @@ Please see Changelog.txt. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. -* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. 
+* Please use GCC version 6 or LLVM version 6 and above to compile Skylake/CooperLake AVX512 kernels +* Please use LLVM version 18 and above (version 19 and above on Windows) if you plan to use + its new flang compiler for Fortran +* Please use GCC version 11 and above to compile OpenBLAS on the POWER architecture * The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. From 760a5371f317fda909c2d6850e3d3a71b2d7d280 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 22 Nov 2024 15:59:45 +0100 Subject: [PATCH 15/66] Update build instructions for WoA (use LLVM19 and its flang-new) --- docs/install.md | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/docs/install.md b/docs/install.md index ffb4659d82..33e9323cd9 100644 --- a/docs/install.md +++ b/docs/install.md @@ -439,49 +439,34 @@ To then use the built OpenBLAS shared library in Visual Studio: #### Windows on Arm +While OpenBLAS can be built with Microsoft VisualStudio (Community Edition or commercial), you would only be able to build for the GENERIC target +that does not use optimized assembly kernels, also the stock VisualStudio lacks the Fortran compiler necessary for building the LAPACK component. +It is therefore highly recommended to download the free LLVM compiler suite and use it to compile OpenBLAS outside of VisualStudio. + The following tools needs to be installed to build for Windows on Arm (WoA): -- Clang for Windows on Arm. - Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/). - E.g: LLVM 12 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.0/LLVM-12.0.0-woa64.exe) - Run the LLVM installer and ensure that LLVM is added to environment PATH. 
-- Download and install classic Flang for Windows on Arm. - Classic Flang is the only available Fortran compiler for Windows on Arm for now. - A pre-release build can be found [here](https://github.com/kaadam/flang/releases/tag/v0.1) - There is no installer for classic flang and the zip package can be - extracted and the path needs to be added to environment `PATH`. - E.g., in PowerShell: - ``` - $env:Path += ";C:\flang_woa\bin" - ``` +- LLVM for Windows on Arm. + Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/) - you want the package whose name ends in "woa64.exe". + E.g: LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.4/LLVM-19.1.4-woa64.exe) + Run the LLVM installer and ensure that LLVM is added to environment PATH. (If you do not want to add it to the PATH, you will need to specify + both C and Fortran compiler to Make or CMake with their full path later on) -The following steps describe how to build the static library for OpenBLAS with and without LAPACK: +The following steps describe how to build the static library for OpenBLAS with either Make or CMake: -1. Build OpenBLAS static library with BLAS and LAPACK routines with Make: +1. Build OpenBLAS with Make: ```bash - $ make CC="clang-cl" HOSTCC="clang-cl" AR="llvm-ar" BUILD_WITHOUT_LAPACK=0 NOFORTRAN=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 USE_OPENMP=0 PARALLEL=1 RANLIB="llvm-ranlib" MAKE=make F_COMPILER=FLANG FC=FLANG FFLAGS_NOOPT="-march=armv8-a -cpp" FFLAGS="-march=armv8-a -cpp" NEED_PIC=0 HOSTARCH=arm64 libs netlib + $ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make ``` -2. Build static library with BLAS routines using CMake: - - Classic Flang has compatibility issues with CMake, hence only BLAS routines can be compiled with CMake: - +2. Build OpenBLAS with CMake ```bash $ mkdir build $ cd build - $ cmake .. 
-G Ninja -DCMAKE_C_COMPILER=clang -DBUILD_WITHOUT_LAPACK=1 -DNOFORTRAN=1 -DDYNAMIC_ARCH=0 -DTARGET=ARMV8 -DARCH=arm64 -DBINARY=64 -DUSE_OPENMP=0 -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_CROSSCOMPILING=1 -DCMAKE_SYSTEM_NAME=Windows - $ cmake --build . --config Release + $ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new -DTARGET=ARMV8 -DCMAKE_BUILD_TYPE=Release + $ cmake --build . ``` -!!! tip "`getarch.exe` execution error" - - If you notice that platform-specific headers by `getarch.exe` are not - generated correctly, this could be due to a known debug runtime DLL issue for - arm64 platforms. Please check out [this page](https://linaro.atlassian.net/wiki/spaces/WOAR/pages/28677636097/Debug+run-time+DLL+issue#Workaround) - for a workaround. - - #### Generating an import library Microsoft Windows has this thing called "import libraries". You need it for From 009c1e0387357eff7cc9f6f7713ce92e2f17ef5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Nov 2024 14:15:04 +0100 Subject: [PATCH 16/66] fix download link for the current WoA binary of LLVM --- docs/install.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/install.md b/docs/install.md index 33e9323cd9..b842d3355b 100644 --- a/docs/install.md +++ b/docs/install.md @@ -447,8 +447,9 @@ The following tools needs to be installed to build for Windows on Arm (WoA): - LLVM for Windows on Arm. Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/) - you want the package whose name ends in "woa64.exe". - E.g: LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.4/LLVM-19.1.4-woa64.exe) - Run the LLVM installer and ensure that LLVM is added to environment PATH. (If you do not want to add it to the PATH, you will need to specify + (This may not always be present in the very latest point release, as building and uploading the binaries takes time.) 
+ E.g: a LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.2/LLVM-19.1.2-woa64.exe). + Run the LLVM installer and ensure that LLVM is added to the environment variable PATH. (If you do not want to add it to the PATH, you will need to specify both C and Fortran compiler to Make or CMake with their full path later on) The following steps describe how to build the static library for OpenBLAS with either Make or CMake: From 7452af4471d6e71c40a5a9bec444eaaed6db5a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Sat, 5 Aug 2023 16:48:04 +0200 Subject: [PATCH 17/66] CI (MinGW): Remove work-around with NO_AVX512 that was needed for older versions of LLVM Flang. --- .github/workflows/dynamic_arch.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 669aa81168..df61eccff6 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -174,9 +174,6 @@ jobs: idx: int32 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc - # Compiling with Flang 16 seems to cause test errors on machines - # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. - no-avx512-flags: -DNO_AVX512=1 - msystem: CLANG32 idx: int32 target-prefix: mingw-w64-clang-i686 @@ -192,9 +189,6 @@ jobs: idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc - # Compiling with Flang 16 seems to cause test errors on machines - # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. - no-avx512-flags: -DNO_AVX512=1 - msystem: UCRT64 idx: int32 target-prefix: mingw-w64-ucrt-x86_64 @@ -281,7 +275,6 @@ jobs: -DTARGET=CORE2 \ ${{ matrix.idx64-flags }} \ ${{ matrix.c-lapack-flags }} \ - ${{ matrix.no-avx512-flags }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ .. 
From f5e6b5b5c91f3bc6b6f5f8f47a320864ef940538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Tue, 26 Nov 2024 13:14:23 +0100 Subject: [PATCH 18/66] CI (MinGW): Remove CLANG32 environment from build matrix. The CLANG32 environment is in the process of being removed from MSYS2 currently: https://www.msys2.org/news/#2024-09-23-starting-to-drop-the-clang32-environment Remove it from the build matrix ahead of its complete removal from MSYS2. --- .github/workflows/dynamic_arch.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index df61eccff6..9e55e73467 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -158,7 +158,7 @@ jobs: strategy: fail-fast: false matrix: - msystem: [UCRT64, MINGW32, CLANG64, CLANG32] + msystem: [UCRT64, MINGW32, CLANG64] idx: [int32, int64] build-type: [Release] include: @@ -174,11 +174,6 @@ jobs: idx: int32 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc - - msystem: CLANG32 - idx: int32 - target-prefix: mingw-w64-clang-i686 - fc-pkg: cc - c-lapack-flags: -DC_LAPACK=ON - msystem: UCRT64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 @@ -197,8 +192,6 @@ jobs: exclude: - msystem: MINGW32 idx: int64 - - msystem: CLANG32 - idx: int64 defaults: run: @@ -274,7 +267,6 @@ jobs: -DNUM_THREADS=64 \ -DTARGET=CORE2 \ ${{ matrix.idx64-flags }} \ - ${{ matrix.c-lapack-flags }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ .. 
From 57a51d74c915e7c957a0209eef1d18a1e5eb9b32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Nov 2024 09:52:56 +0100 Subject: [PATCH 19/66] translate CMAKE_SYSTEM_NAME in compilations on or for IOS --- cmake/system_check.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 59a1358789..fc81e9797d 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -10,6 +10,10 @@ if (${HOST_OS} STREQUAL "WINDOWS") set(HOST_OS WINNT) endif () +if (${HOST_OS} STREQUAL "IOS") + set(HOST_OS DARWIN) +endif () + if (${HOST_OS} STREQUAL "LINUX") # check if we're building natively on Android (TERMUX) EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) From 4918beecbef845cce1e5b496a6330dfc035d7964 Mon Sep 17 00:00:00 2001 From: "Iha, Taisei" Date: Mon, 2 Dec 2024 18:46:00 +0900 Subject: [PATCH 20/66] Loop-unrolled transposed [SD]GEMV kernels for A64FX and Neoverse V1 --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 4 +- kernel/arm64/gemv_t_sve_v1x3.c | 152 +++++++++++++++++++++ kernel/arm64/gemv_t_sve_v4x3.c | 234 +++++++++++++++++++++++++++++++++ 4 files changed, 390 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/gemv_t_sve_v1x3.c create mode 100644 kernel/arm64/gemv_t_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 4abc840405..75f0f39a7e 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -2,5 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE SGEMVNKERNEL = gemv_n_sve.c DGEMVNKERNEL = gemv_n_sve.c -SGEMVTKERNEL = gemv_t_sve.c -DGEMVTKERNEL = gemv_t_sve.c +SGEMVTKERNEL = gemv_t_sve_v4x3.c +DGEMVTKERNEL = gemv_t_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 53d157a0aa..859466409e 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,4 +1,4 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE -SGEMVTKERNEL = 
gemv_t_sve.c -DGEMVTKERNEL = gemv_t_sve.c +SGEMVTKERNEL = gemv_t_sve_v1x3.c +DGEMVTKERNEL = gemv_t_sve_v1x3.c diff --git a/kernel/arm64/gemv_t_sve_v1x3.c b/kernel/arm64/gemv_t_sve_v1x3.c new file mode 100644 index 0000000000..e481abec7c --- /dev/null +++ b/kernel/arm64/gemv_t_sve_v1x3.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <arm_sve.h> + +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + iy = 0; + + if (inc_x == 1) { + BLASLONG width = (n + 3 - 1) / 3; + + FLOAT *a0_ptr = a + lda * width * 0; + FLOAT *a1_ptr = a + lda * width * 1; + FLOAT *a2_ptr = a + lda * width * 2; + + FLOAT *y0_ptr = y + inc_y * width * 0; + FLOAT *y1_ptr = y + inc_y * width * 1; + FLOAT *y2_ptr = y + inc_y * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = ((j + width * 2) < n) ?
SV_TRUE() : svpfalse(); + + SV_TYPE temp00_vec = SV_DUP(0.0); + SV_TYPE temp01_vec = SV_DUP(0.0); + SV_TYPE temp02_vec = SV_DUP(0.0); + + i = 0; + BLASLONG sve_size = SV_COUNT(); + while ((i + sve_size * 1 - 1) < m) { + SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); + temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); + temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + + i += sve_size * 1; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + + SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); + temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); + temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + } + + if ((j + width * 0) < n) { + temp = svaddv(SV_TRUE(), temp00_vec); + y0_ptr[iy] += alpha * temp; + } + if ((j + width * 1) < n) { + temp = svaddv(SV_TRUE(), temp01_vec); + y1_ptr[iy] += alpha * temp; + } + if ((j + width * 2) < n) { + temp = svaddv(SV_TRUE(), temp02_vec); + y2_ptr[iy] += alpha * temp; + } + iy += inc_y; + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + } + + return(0); + } + + a_ptr = a; + for (j = 0; j < n; j++) { + temp = 0.0; + ix = 0; + for (i = 0; i < m; i++) { + temp += a_ptr[i] * x[ix]; + ix += inc_x; + } + y[iy] += alpha * temp; + iy += inc_y; + a_ptr += lda; + } + return(0); +} diff --git a/kernel/arm64/gemv_t_sve_v4x3.c b/kernel/arm64/gemv_t_sve_v4x3.c new file mode 100644 index 0000000000..77c46feb34 --- /dev/null +++ 
b/kernel/arm64/gemv_t_sve_v4x3.c @@ -0,0 +1,234 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include <arm_sve.h>
+
+#include "common.h"
+
+/* Precision-dependent aliases for the SVE ACLE intrinsics used below:
+ * 64-bit lanes when DOUBLE is defined, 32-bit lanes otherwise. */
+#ifdef DOUBLE
+#define SV_COUNT svcntd
+#define SV_TYPE svfloat64_t
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64_s64
+#define SV_DUP svdup_f64
+#else
+#define SV_COUNT svcntw
+#define SV_TYPE svfloat32_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32_s64
+#define SV_DUP svdup_f32
+#endif
+
+/*
+ * Transposed matrix-vector update. For every j in [0, n):
+ *
+ *     y[j * inc_y] += alpha * sum(a[j * lda + i] * x[i * inc_x], i in [0, m))
+ *
+ * m         length of each dot product
+ * n         number of dot products (elements of y that are updated)
+ * dummy1    unused
+ * alpha     scale applied to each dot product before it is added to y
+ * a, lda    matrix data; dot product j reads a[j * lda] .. a[j * lda + m - 1]
+ * x, inc_x  input vector and its element stride
+ * y, inc_y  vector updated in place and its element stride
+ * buffer    unused
+ *
+ * Always returns 0.
+ *
+ * When inc_x == 1 the n dot products are split into three stripes of
+ * width = ceil(n / 3); iteration j computes dot products j, j + width and
+ * j + 2 * width together. Each of them consumes four SVE vectors of x per
+ * step into four separate accumulators, which are added together at the
+ * end. Any other inc_x takes the scalar loop at the end of the function.
+ *
+ * Naming inside the vector path: in aXY_vec / tempXY_vec / pgXY, X is the
+ * vector slot (0..3) within the current step and Y is the stripe (0..2).
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer)
+{
+    BLASLONG i;
+    BLASLONG ix, iy;
+    BLASLONG j;
+    FLOAT *a_ptr;
+    FLOAT temp;
+
+    iy = 0;
+
+    if (inc_x == 1) {
+        /* ceil(n / 3): number of dot products in each of the three stripes */
+        BLASLONG width = (n + 3 - 1) / 3;
+        /* Elements per SVE vector; loop-invariant, so queried once. */
+        BLASLONG sve_size = SV_COUNT();
+
+        FLOAT *a0_ptr = a + lda * width * 0;
+        FLOAT *a1_ptr = a + lda * width * 1;
+        FLOAT *a2_ptr = a + lda * width * 2;
+
+        FLOAT *y0_ptr = y + inc_y * width * 0;
+        FLOAT *y1_ptr = y + inc_y * width * 1;
+        FLOAT *y2_ptr = y + inc_y * width * 2;
+
+        for (j = 0; j < width; j++) {
+            /* One predicate per stripe: all-true when the stripe maps to a
+             * real output element, all-false otherwise. An all-false
+             * predicate disables that stripe's loads and accumulation. */
+            svbool_t col0 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
+            svbool_t col1 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
+            svbool_t col2 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
+
+            SV_TYPE temp00_vec = SV_DUP(0.0);
+            SV_TYPE temp10_vec = SV_DUP(0.0);
+            SV_TYPE temp20_vec = SV_DUP(0.0);
+            SV_TYPE temp30_vec = SV_DUP(0.0);
+            SV_TYPE temp01_vec = SV_DUP(0.0);
+            SV_TYPE temp11_vec = SV_DUP(0.0);
+            SV_TYPE temp21_vec = SV_DUP(0.0);
+            SV_TYPE temp31_vec = SV_DUP(0.0);
+            SV_TYPE temp02_vec = SV_DUP(0.0);
+            SV_TYPE temp12_vec = SV_DUP(0.0);
+            SV_TYPE temp22_vec = SV_DUP(0.0);
+            SV_TYPE temp32_vec = SV_DUP(0.0);
+
+            /* Main loop: four full vectors of x per iteration. */
+            i = 0;
+            while ((i + sve_size * 4 - 1) < m) {
+                SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);
+                SV_TYPE x1_vec = svld1_vnum(SV_TRUE(), x + i, 1);
+                SV_TYPE x2_vec = svld1_vnum(SV_TRUE(), x + i, 2);
+                SV_TYPE x3_vec = svld1_vnum(SV_TRUE(), x + i, 3);
+
+                SV_TYPE a00_vec = svld1_vnum(col0, a0_ptr + i, 0);
+                SV_TYPE a10_vec = svld1_vnum(col0, a0_ptr + i, 1);
+                SV_TYPE a20_vec = svld1_vnum(col0, a0_ptr + i, 2);
+                SV_TYPE a30_vec = svld1_vnum(col0, a0_ptr + i, 3);
+                SV_TYPE a01_vec = svld1_vnum(col1, a1_ptr + i, 0);
+                SV_TYPE a11_vec = svld1_vnum(col1, a1_ptr + i, 1);
+                SV_TYPE a21_vec = svld1_vnum(col1, a1_ptr + i, 2);
+                SV_TYPE a31_vec = svld1_vnum(col1, a1_ptr + i, 3);
+                SV_TYPE a02_vec = svld1_vnum(col2, a2_ptr + i, 0);
+                SV_TYPE a12_vec = svld1_vnum(col2, a2_ptr + i, 1);
+                SV_TYPE a22_vec = svld1_vnum(col2, a2_ptr + i, 2);
+                SV_TYPE a32_vec = svld1_vnum(col2, a2_ptr + i, 3);
+
+                temp00_vec = svmla_m(col0, temp00_vec, a00_vec, x0_vec);
+                temp10_vec = svmla_m(col0, temp10_vec, a10_vec, x1_vec);
+                temp20_vec = svmla_m(col0, temp20_vec, a20_vec, x2_vec);
+                temp30_vec = svmla_m(col0, temp30_vec, a30_vec, x3_vec);
+                temp01_vec = svmla_m(col1, temp01_vec, a01_vec, x0_vec);
+                temp11_vec = svmla_m(col1, temp11_vec, a11_vec, x1_vec);
+                temp21_vec = svmla_m(col1, temp21_vec, a21_vec, x2_vec);
+                temp31_vec = svmla_m(col1, temp31_vec, a31_vec, x3_vec);
+                temp02_vec = svmla_m(col2, temp02_vec, a02_vec, x0_vec);
+                temp12_vec = svmla_m(col2, temp12_vec, a12_vec, x1_vec);
+                temp22_vec = svmla_m(col2, temp22_vec, a22_vec, x2_vec);
+                temp32_vec = svmla_m(col2, temp32_vec, a32_vec, x3_vec);
+
+                i += sve_size * 4;
+            }
+
+            /* Tail: fewer than 4 * sve_size elements remain. Each of the
+             * four vector slots gets a mask of the lanes still below m,
+             * combined with the stripe predicate. */
+            if (i < m) {
+                svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);
+                svbool_t pg1 = SV_WHILE(i + sve_size * 1, m);
+                svbool_t pg2 = SV_WHILE(i + sve_size * 2, m);
+                svbool_t pg3 = SV_WHILE(i + sve_size * 3, m);
+
+                svbool_t pg00 = svand_z(SV_TRUE(), pg0, col0);
+                svbool_t pg10 = svand_z(SV_TRUE(), pg1, col0);
+                svbool_t pg20 = svand_z(SV_TRUE(), pg2, col0);
+                svbool_t pg30 = svand_z(SV_TRUE(), pg3, col0);
+                svbool_t pg01 = svand_z(SV_TRUE(), pg0, col1);
+                svbool_t pg11 = svand_z(SV_TRUE(), pg1, col1);
+                svbool_t pg21 = svand_z(SV_TRUE(), pg2, col1);
+                svbool_t pg31 = svand_z(SV_TRUE(), pg3, col1);
+                svbool_t pg02 = svand_z(SV_TRUE(), pg0, col2);
+                svbool_t pg12 = svand_z(SV_TRUE(), pg1, col2);
+                svbool_t pg22 = svand_z(SV_TRUE(), pg2, col2);
+                svbool_t pg32 = svand_z(SV_TRUE(), pg3, col2);
+
+                SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);
+                SV_TYPE x1_vec = svld1_vnum(pg1, x + i, 1);
+                SV_TYPE x2_vec = svld1_vnum(pg2, x + i, 2);
+                SV_TYPE x3_vec = svld1_vnum(pg3, x + i, 3);
+
+                SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
+                SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1);
+                SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2);
+                SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3);
+                SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
+                SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1);
+                SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2);
+                SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3);
+                SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
+                SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1);
+                SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2);
+                SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3);
+
+                temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
+                temp10_vec = svmla_m(pg10, temp10_vec, a10_vec, x1_vec);
+                temp20_vec = svmla_m(pg20, temp20_vec, a20_vec, x2_vec);
+                temp30_vec = svmla_m(pg30, temp30_vec, a30_vec, x3_vec);
+                temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
+                temp11_vec = svmla_m(pg11, temp11_vec, a11_vec, x1_vec);
+                temp21_vec = svmla_m(pg21, temp21_vec, a21_vec, x2_vec);
+                temp31_vec = svmla_m(pg31, temp31_vec, a31_vec, x3_vec);
+                temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
+                temp12_vec = svmla_m(pg12, temp12_vec, a12_vec, x1_vec);
+                temp22_vec = svmla_m(pg22, temp22_vec, a22_vec, x2_vec);
+                temp32_vec = svmla_m(pg32, temp32_vec, a32_vec, x3_vec);
+            }
+
+            /* Fold the four accumulators of each stripe into slot 0:
+             * (0 + 1) + (2 + 3). */
+            temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp10_vec);
+            temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp11_vec);
+            temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp12_vec);
+            temp20_vec = svadd_x(SV_TRUE(), temp20_vec, temp30_vec);
+            temp21_vec = svadd_x(SV_TRUE(), temp21_vec, temp31_vec);
+            temp22_vec = svadd_x(SV_TRUE(), temp22_vec, temp32_vec);
+            temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp20_vec);
+            temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp21_vec);
+            temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp22_vec);
+
+            /* Sum the lanes of each accumulator and update y, but only for
+             * stripes that correspond to a real output element. */
+            if ((j + width * 0) < n) {
+                temp = svaddv(SV_TRUE(), temp00_vec);
+                y0_ptr[iy] += alpha * temp;
+            }
+            if ((j + width * 1) < n) {
+                temp = svaddv(SV_TRUE(), temp01_vec);
+                y1_ptr[iy] += alpha * temp;
+            }
+            if ((j + width * 2) < n) {
+                temp = svaddv(SV_TRUE(), temp02_vec);
+                y2_ptr[iy] += alpha * temp;
+            }
+            iy += inc_y;
+
+            a0_ptr += lda;
+            a1_ptr += lda;
+            a2_ptr += lda;
+        }
+
+        return(0);
+    }
+
+    /* Scalar path for any inc_x other than 1. */
+    a_ptr = a;
+    for (j = 0; j < n; j++) {
+        temp = 0.0;
+        ix = 0;
+        for (i = 0; i < m; i++) {
+            temp += a_ptr[i] * x[ix];
+            ix += inc_x;
+        }
+        y[iy] += alpha * temp;
+        iy += inc_y;
+        a_ptr += lda;
+    }
+    return(0);
+}
From dc905636d12efa91b2e690ab2b1f07de45f0a6d2 Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Tue, 3 Dec 2024 07:42:44 +0100 Subject: [PATCH 21/66] arm: Declare symbols as .type function --- common_arm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common_arm.h b/common_arm.h index 80aabc7b02..d6291018b1 100644 --- a/common_arm.h +++ b/common_arm.h @@ -102,9 +102,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && 
!defined(NEEDPARAM) +#if !defined(__APPLE__) && !defined(_WIN32) +#define OPENBLAS_ARM_TYPE_FUNCTION .type REALNAME, %function ; +#else +#define OPENBLAS_ARM_TYPE_FUNCTION +#endif + #define PROLOGUE \ .arm ;\ .global REALNAME ;\ + OPENBLAS_ARM_TYPE_FUNCTION \ REALNAME: #define EPILOGUE From 93eb42fdc836871943bc2582599db854716e7659 Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Tue, 3 Dec 2024 09:45:04 +0100 Subject: [PATCH 22/66] Fix redefinition of FAILED --- ctest/cblas_test.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h index 3eeb46ac2c..24ea677637 100644 --- a/ctest/cblas_test.h +++ b/ctest/cblas_test.h @@ -10,6 +10,11 @@ #define int long #endif +/* e.g. mingw64/x86_64-w64-mingw32/include/winerror.h */ +#ifdef FAILED +#undef FAILED +#endif + #define TRUE 1 #define PASSED 1 #define TEST_ROW_MJR 1 From a8b1705dbd39f079ecf120622fa889ecdd92ac04 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 26 Nov 2024 15:21:28 -0500 Subject: [PATCH 23/66] CMake build has wrong PIC flag for NAG --- cmake/system.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 6b891ca0ef..df1095c045 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -382,6 +382,8 @@ if (NEED_PIC) if (NOT NOFORTRAN) if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") + elseif (${F_COMPILER} STREQUAL "NAG") + set(FCOMMON_OPT "${FCOMMON_OPT} -PIC") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") endif () From 2eaf285de53d5f064e15e8e7ee9d3dd4cef61455 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 26 Nov 2024 15:26:55 -0500 Subject: [PATCH 24/66] Use F_COMPILER name --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index df1095c045..82d16c92fa 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -382,7 +382,7 @@ if (NEED_PIC) if (NOT NOFORTRAN) if (${F_COMPILER} STREQUAL "SUN") 
set(FCOMMON_OPT "${FCOMMON_OPT} -pic") - elseif (${F_COMPILER} STREQUAL "NAG") + elseif (${F_COMPILER} STREQUAL "NAGFOR") set(FCOMMON_OPT "${FCOMMON_OPT} -PIC") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") From be19966d3b7618625febeca35fe6be57899f4aea Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 10:52:43 -0500 Subject: [PATCH 25/66] Fixes for NAG CMake --- cmake/fc.cmake | 25 +++++++++++++++++++++++++ cmake/lapack.cmake | 7 ++++++- cmake/system.cmake | 2 +- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 4ce1c99d4b..38bd406a3a 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -269,6 +269,31 @@ if (${F_COMPILER} STREQUAL "CRAY") endif () endif () +if (${F_COMPILER} STREQUAL "NAGFOR") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_NAG") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + # Options from Makefile.system + # -dcfuns: Enable non-standard double precision complex intrinsic functions + # -ieee=full: enables all IEEE arithmetic facilities including non-stop arithmetic. + # -w=obs: Suppress warning messages about obsolescent features + # -thread_safe: Compile code for safe execution in a multi-threaded environment. + # -recursive: Specifies that procedures are RECURSIVE by default. + set(FCOMMON_OPT "${FCOMMON_OPT} -dcfuns -recursive -ieee=full -w=obs -thread_safe") + # Options from Reference-LAPACK + # Suppress compiler banner and summary + set(FCOMMON_OPT "${FCOMMON_OPT} -quiet") + # Disable other common warnings + # -w=x77: Suppress warning messages about Fortran 77 features + # -w=ques: Suppress warning messages about questionable usage + # -w=unused: Suppress warning messages about unused variables + set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + # from the root Makefile - this is for lapack-netlib to compile the correct secnd file. 
if (${F_COMPILER} STREQUAL "GFORTRAN") set(TIMER "INT_ETIME") diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 003a8b3c17..6a74fb7640 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1018,7 +1018,12 @@ foreach (LA_FILE ${LA_GEN_SRC}) endforeach () if (NOT C_LAPACK) - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + # The below line is duplicating Fortran flags but NAG has a few flags + # that cannot be specified twice. It's possible this is not needed for + # any compiler, but for safety, we only turn off for NAG + if (NOT ${F_COMPILER} STREQUAL "NAGFOR") + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") endif() diff --git a/cmake/system.cmake b/cmake/system.cmake index 82d16c92fa..b58a0f4b55 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -642,7 +642,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif () if (CMAKE_Fortran_COMPILER) -if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") message(STATUS "removing fortran flags") From 35334ed2ea7cd5859b1ac7b767df2854d69d8f55 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 10:53:05 -0500 Subject: [PATCH 26/66] Fixes for Fortran Standards violations for lapack-netlib --- lapack-netlib/SRC/claqp2rk.f | 6 +++--- lapack-netlib/SRC/claqp3rk.f | 6 +++--- lapack-netlib/TESTING/EIG/cchkhb2stg.f | 5 +++-- lapack-netlib/TESTING/EIG/dchksb2stg.f | 5 +++-- lapack-netlib/TESTING/EIG/schksb2stg.f | 5 +++-- 
lapack-netlib/TESTING/EIG/zchkhb2stg.f | 5 +++-- lapack-netlib/TESTING/LIN/alahd.f | 2 +- 7 files changed, 19 insertions(+), 15 deletions(-) diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f index 6b1db085aa..0501c50bb4 100644 --- a/lapack-netlib/SRC/claqp2rk.f +++ b/lapack-netlib/SRC/claqp2rk.f @@ -378,7 +378,7 @@ SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, EXTERNAL CLARF, CLARFG, CSWAP * .. * .. Intrinsic Functions .. - INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT + INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN, SQRT * .. * .. External Functions .. LOGICAL SISNAN @@ -599,8 +599,8 @@ SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, * IF( SISNAN( REAL( TAU(KK) ) ) ) THEN TAUNAN = REAL( TAU(KK) ) - ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN - TAUNAN = IMAG( TAU(KK) ) + ELSE IF( SISNAN( AIMAG( TAU(KK) ) ) ) THEN + TAUNAN = AIMAG( TAU(KK) ) ELSE TAUNAN = ZERO END IF diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f index 3703bcbd65..8fe5a220ff 100644 --- a/lapack-netlib/SRC/claqp3rk.f +++ b/lapack-netlib/SRC/claqp3rk.f @@ -431,7 +431,7 @@ SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, EXTERNAL CGEMM, CGEMV, CLARFG, CSWAP * .. * .. Intrinsic Functions .. - INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT + INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN, SQRT * .. * .. External Functions .. 
LOGICAL SISNAN @@ -739,8 +739,8 @@ SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, * IF( SISNAN( REAL( TAU(K) ) ) ) THEN TAUNAN = REAL( TAU(K) ) - ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN - TAUNAN = IMAG( TAU(K) ) + ELSE IF( SISNAN( AIMAG( TAU(K) ) ) ) THEN + TAUNAN = AIMAG( TAU(K) ) ELSE TAUNAN = ZERO END IF diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index 1a11ac5eaf..7500c22791 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -852,8 +852,9 @@ SUBROUTINE CCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL SLASUM( 'CHB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' CCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' CCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) 9998 FORMAT( / 1X, A3, $ ' -- Complex Hermitian Banded Tridiagonal Reduction Routines' $ ) diff --git a/lapack-netlib/TESTING/EIG/dchksb2stg.f b/lapack-netlib/TESTING/EIG/dchksb2stg.f index 878da8b6f0..4e807f1c88 100644 --- a/lapack-netlib/TESTING/EIG/dchksb2stg.f +++ b/lapack-netlib/TESTING/EIG/dchksb2stg.f @@ -840,8 +840,9 @@ SUBROUTINE DCHKSB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL DLASUM( 'DSB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' DCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' DCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) * 9998 FORMAT( / 1X, A3, $ ' -- Real Symmetric Banded Tridiagonal Reduction Routines' ) diff --git a/lapack-netlib/TESTING/EIG/schksb2stg.f b/lapack-netlib/TESTING/EIG/schksb2stg.f index 5de9204979..eee486ade7 100644 --- a/lapack-netlib/TESTING/EIG/schksb2stg.f +++ b/lapack-netlib/TESTING/EIG/schksb2stg.f @@ -840,8 +840,9 @@ 
SUBROUTINE SCHKSB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL SLASUM( 'SSB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' SCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' SCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) * 9998 FORMAT( / 1X, A3, $ ' -- Real Symmetric Banded Tridiagonal Reduction Routines' ) diff --git a/lapack-netlib/TESTING/EIG/zchkhb2stg.f b/lapack-netlib/TESTING/EIG/zchkhb2stg.f index 786df7882c..bfe6ceadca 100644 --- a/lapack-netlib/TESTING/EIG/zchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkhb2stg.f @@ -849,8 +849,9 @@ SUBROUTINE ZCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL DLASUM( 'ZHB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' ZCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' ZCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) 9998 FORMAT( / 1X, A3, $ ' -- Complex Hermitian Banded Tridiagonal Reduction Routines' $ ) diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index 8f966c5841..c0334b5de9 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -954,7 +954,7 @@ SUBROUTINE ALAHD( IOUNIT, PATH ) $ 4X, '10. Random, Last columns are zero starting from', $ ' MINMN/2+1, CNDNUM = 2', / $ 4X, '11. Random, Half MINMN columns in the middle are', - $ ' zero starting from MINMN/2-(MINMN/2)/2+1,' + $ ' zero starting from MINMN/2-(MINMN/2)/2+1,', $ ' CNDNUM = 2', / $ 4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', / $ 4X, '13. 
Random, Even columns are ZERO, CNDNUM = 2', / From d3b2036d49c16cb9f7520d3ee5d3d39d349c18bf Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 12:09:24 -0500 Subject: [PATCH 27/66] Move to use ERROR STOP instead of ABORT --- ctest/c_cblat1.f | 6 +++--- ctest/c_cblat2.f | 10 +++++----- ctest/c_cblat3.f | 14 +++++++------- ctest/c_cblat3_3m.f | 14 +++++++------- ctest/c_dblat1.f | 10 +++++----- ctest/c_dblat2.f | 10 +++++----- ctest/c_dblat3.f | 14 +++++++------- ctest/c_sblat1.f | 10 +++++----- ctest/c_sblat2.f | 10 +++++----- ctest/c_sblat3.f | 14 +++++++------- ctest/c_zblat1.f | 6 +++--- ctest/c_zblat2.f | 10 +++++----- ctest/c_zblat3.f | 14 +++++++------- ctest/c_zblat3_3m.f | 14 +++++++------- 14 files changed, 78 insertions(+), 78 deletions(-) diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 73ab485bbd..2af54e7a65 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -41,7 +41,7 @@ PROGRAM CCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -231,7 +231,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE @@ -515,7 +515,7 @@ SUBROUTINE CHECK2(SFAC) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f index d48c10b7c8..d31884cddc 100644 --- a/ctest/c_cblat2.f +++ b/ctest/c_cblat2.f @@ -10,7 +10,7 @@ PROGRAM CBLAT2 * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ PROGRAM CBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ PROGRAM CBLAT2 SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ PROGRAM CBLAT2 SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -419,7 +419,7 @@ PROGRAM CBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f index 5d289aafe0..f713b2dd0a 100644 --- a/ctest/c_cblat3.f +++ b/ctest/c_cblat3.f @@ -10,7 +10,7 @@ PROGRAM CBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ PROGRAM CBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -386,7 +386,7 @@ PROGRAM CBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_cblat3_3m.f b/ctest/c_cblat3_3m.f index 73fca5664f..3f8157b0ed 100644 --- a/ctest/c_cblat3_3m.f +++ b/ctest/c_cblat3_3m.f @@ -10,7 +10,7 @@ PROGRAM CBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ PROGRAM CBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
@@ -386,7 +386,7 @@ PROGRAM CBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index 99c8b5da49..4877ea62b8 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -47,7 +47,7 @@ PROGRAM DCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -139,7 +139,7 @@ SUBROUTINE CHECK0(SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - CALL ABORT + ERROR STOP END IF 20 CONTINUE 40 RETURN @@ -232,7 +232,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF 60 CONTINUE 80 CONTINUE @@ -387,7 +387,7 @@ SUBROUTINE CHECK2(SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF 100 CONTINUE 120 CONTINUE @@ -475,7 +475,7 @@ SUBROUTINE CHECK3(SFAC) 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - CALL ABORT + ERROR STOP END IF 40 CONTINUE 60 CONTINUE diff --git a/ctest/c_dblat2.f b/ctest/c_dblat2.f index 01a21a7163..342382c9ed 100644 --- a/ctest/c_dblat2.f +++ b/ctest/c_dblat2.f @@ -10,7 +10,7 @@ PROGRAM DBLAT2 * 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ PROGRAM DBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ PROGRAM DBLAT2 SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ PROGRAM DBLAT2 SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -415,7 +415,7 @@ PROGRAM DBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f index 00d16c2961..cbd95b8544 100644 --- a/ctest/c_dblat3.f +++ b/ctest/c_dblat3.f @@ -10,7 +10,7 @@ PROGRAM DBLAT3 * 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -189,7 +189,7 @@ PROGRAM DBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -232,7 +232,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -241,7 +241,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -259,7 +259,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -268,7 +268,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
@@ -380,7 +380,7 @@ PROGRAM DBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index b88c2b7835..2e7c1d9b3f 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -47,7 +47,7 @@ PROGRAM SCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -139,7 +139,7 @@ SUBROUTINE CHECK0(SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - CALL ABORT + ERROR STOP END IF 20 CONTINUE 40 RETURN @@ -232,7 +232,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF 60 CONTINUE 80 CONTINUE @@ -387,7 +387,7 @@ SUBROUTINE CHECK2(SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF 100 CONTINUE 120 CONTINUE @@ -482,7 +482,7 @@ SUBROUTINE CHECK3(SFAC) 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - CALL ABORT + ERROR STOP END IF 40 CONTINUE 60 CONTINUE diff --git a/ctest/c_sblat2.f b/ctest/c_sblat2.f index 18d568d5d3..00cbc8f011 100644 --- a/ctest/c_sblat2.f +++ b/ctest/c_sblat2.f @@ -10,7 +10,7 @@ PROGRAM SBLAT2 * 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ PROGRAM SBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ PROGRAM SBLAT2 SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ PROGRAM SBLAT2 SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -415,7 +415,7 @@ PROGRAM SBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f index bbb58d04f6..61bf46997f 100644 --- a/ctest/c_sblat3.f +++ b/ctest/c_sblat3.f @@ -10,7 +10,7 @@ PROGRAM SBLAT3 * 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -188,7 +188,7 @@ PROGRAM SBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -231,7 +231,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -240,7 +240,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -258,7 +258,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -267,7 +267,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
@@ -379,7 +379,7 @@ PROGRAM SBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 43486433e3..1d48159c91 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -41,7 +41,7 @@ PROGRAM ZCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -231,7 +231,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE @@ -515,7 +515,7 @@ SUBROUTINE CHECK2(SFAC) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f index daa1a603b2..220e2fd259 100644 --- a/ctest/c_zblat2.f +++ b/ctest/c_zblat2.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT2 * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ PROGRAM ZBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ PROGRAM ZBLAT2 SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ PROGRAM ZBLAT2 SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
@@ -419,7 +419,7 @@ PROGRAM ZBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f index 83eb9e9184..e14f5af65a 100644 --- a/ctest/c_zblat3.f +++ b/ctest/c_zblat3.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ PROGRAM ZBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
@@ -387,7 +387,7 @@ PROGRAM ZBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_zblat3_3m.f b/ctest/c_zblat3_3m.f index d0923439e8..6f52b64036 100644 --- a/ctest/c_zblat3_3m.f +++ b/ctest/c_zblat3_3m.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ PROGRAM ZBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
@@ -387,7 +387,7 @@ PROGRAM ZBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) From c4e8bac5a5e306731550f6ee39db99c184c31ed0 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 12:11:35 -0500 Subject: [PATCH 28/66] Fix indent --- cmake/system.cmake | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index b58a0f4b55..4ac244e3ea 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -643,16 +643,16 @@ endif () if (CMAKE_Fortran_COMPILER) if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") - set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") - if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") -message(STATUS "removing fortran flags") - set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") + if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + message(STATUS "removing fortran flags") + set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + endif () + foreach (FILTER_FLAG ${FILTER_FLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) + endforeach () endif () - foreach (FILTER_FLAG ${FILTER_FLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) - endforeach () -endif () endif () if ("${F_COMPILER}" STREQUAL "GFORTRAN") From 1a6ecda3983c9daab2f94dfe5bc1fdcd759a94ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Dec 2024 15:32:26 -0800 Subject: [PATCH 29/66] utilize /proc/cpuinfo on NetBSD too --- cpuid_arm64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 
aaf5084395..fbb78e7943 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -127,7 +127,7 @@ static char *cpuname_lower[] = { int get_feature(char *search) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -163,7 +163,7 @@ int get_feature(char *search) int detect(void) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; @@ -314,7 +314,7 @@ void get_cpucount(void) { int n=0; -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -608,7 +608,7 @@ void get_libname(void) void get_features(void) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From a791912cbb06260e1d0271b31959f16a41ddef4c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Dec 2024 15:34:57 -0800 Subject: [PATCH 30/66] handle uname returning evbarm on NetBSD --- c_check | 3 +++ 1 file changed, 3 insertions(+) diff --git a/c_check b/c_check index c2b52c81b0..c3c2901712 100755 --- a/c_check +++ b/c_check @@ -6,6 +6,9 @@ hostarch=`uname -m | sed -e 's/i.86/x86/'` if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then hostarch=`uname -p` fi +if [ "$hostarch" = "evbarm" ]; then + hostarch=`uname -p` +fi case "$hostarch" in amd64) hostarch=x86_64 ;; arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;; From 5fe983db29381a86a0dcef2a1750b4da86cca69c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 5 Dec 2024 21:09:53 +0100 Subject: [PATCH 31/66] retire the thunderx2 nrm2 kernels for now due to NAN and inaccuracies --- kernel/arm64/KERNEL.ARMV8SVE | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index bfadf5cba9..7904011a82 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ 
b/kernel/arm64/KERNEL.ARMV8SVE @@ -104,10 +104,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.c SDOTKERNEL = dot.c From 3345007d8f4559fcd65ea7166695fda3a161e7ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 5 Dec 2024 21:12:06 +0100 Subject: [PATCH 32/66] retire the thunderx2 NRM2 kernels due to reported inaccuracies and NAN --- kernel/arm64/KERNEL.NEOVERSEN2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index cabacad46e..2f7400113b 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.c SDOTKERNEL = dot.c From 0f8ff82592b7eaf6bb61613673e64aa8ecb5ebfa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 6 Dec 2024 01:35:42 -0800 Subject: [PATCH 33/66] Add build notes for Windows and flang from gh Discussion 5008 --- benchmark/pybench/README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmark/pybench/README.md b/benchmark/pybench/README.md index 7523ca75ab..43c1b3665d 100644 --- a/benchmark/pybench/README.md +++ b/benchmark/pybench/README.md @@ -43,7 +43,17 @@ have all what it takes to build OpenBLAS from source, plus `python` and $ python -mpip install numpy meson ninja pytest pytest-benchmark ``` 
-The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/test_blas.py`. +The Meson build system looks for the installed OpenBLAS using pkgconfig, so the openblas.pc created during the OpenBLAS build needs +to be somewhere on the search path of pkgconfig or in a folder pointed to by the environment variable PKG_CONFIG_PATH. + +If you want to build the benchmark suite using flang (or flang-new) instead of gfortran for the Fortran parts, you currently need +to edit the meson.build file and change the line `'fortran_std=legacy'` to `'fortran_std=none'` to work around an incompatibility +between Meson and flang. + +If you are building and running the benchmark under MS Windows, it may be necessary to copy the generated openblas_wrap module from +your build folder to the `benchmarks` folder. + +The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`. An ASV compatible benchmark suite is planned but currently not implemented. 
From 5aea097df068e87033d46efb081640e8bf41caa0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Dec 2024 23:52:05 +0100 Subject: [PATCH 34/66] add missing lapack 3.11+ symbols --- exports/gensymbol | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/exports/gensymbol b/exports/gensymbol index f3ca9a427e..d886e6d143 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -869,8 +869,12 @@ lapackobjs2z="$lapackobjs2z #functions added post 3.11 lapackobjs2c="$lapackobjs2c + cgelst + cgeqp3rk claqp2rk claqp3rk + clatrs3 + crscl ctrsyl3 " # claqz0 @@ -894,6 +898,17 @@ lapackobjs2d="$lapackobjs2d # dlaqz3 # dlaqz4 +lapackobjs2s="$lapackobjs2s + sgelst + sgeqp2rk + sgeqp3rk + slaqp2rk + slaqp3rk + slarmm + slatrs3 + strsyl3 + " + lapackobjs2z="$lapackobjs2z zgelst zgeqp3rk From 61d5aec7c1298969cb007686695dcadefa0e9f7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Dec 2024 00:41:56 +0100 Subject: [PATCH 35/66] remove typo --- exports/gensymbol | 1 - 1 file changed, 1 deletion(-) diff --git a/exports/gensymbol b/exports/gensymbol index d886e6d143..f747dd091f 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -900,7 +900,6 @@ lapackobjs2d="$lapackobjs2d lapackobjs2s="$lapackobjs2s sgelst - sgeqp2rk sgeqp3rk slaqp2rk slaqp3rk From b9f51a5cf7723c7fb383812c75f786d3e5c4a1ba Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:58:06 +0530 Subject: [PATCH 36/66] Delete kernel/arm64/rot.c --- kernel/arm64/rot.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/rot.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c deleted file mode 100644 index abddc15381..0000000000 --- a/kernel/arm64/rot.c +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include "rot_kernel_sve.c" -#include "rot_kernel_c.c" - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - if (n <= 0) - return (0); - if (inc_x == 1 && inc_y == 1) - rot_kernel_sve(n, x, y, c, s); - else - rot_kernel_c(n, x, inc_x, y, inc_y, c, s); - return (0); -} From 10857c9df4f915871e989496c8b2bb78f81af8e2 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:58:51 +0530 Subject: [PATCH 37/66] Delete kernel/arm64/rot_kernel_c.c --- kernel/arm64/rot_kernel_c.c | 44 ------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_c.c diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c deleted file mode 100644 index f37d2db169..0000000000 --- a/kernel/arm64/rot_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#include "common.h" - -static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; - while (i < n) - { - temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - ix += inc_x; - iy += inc_y; - i++; - } - return (0); -} From f62519cc87521e1c6e09972cd03f3695b01b086f Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:59:35 +0530 Subject: [PATCH 38/66] Delete kernel/arm64/rot_kernel_sve.c --- kernel/arm64/rot_kernel_sve.c | 59 ----------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_sve.c diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c deleted file mode 100644 index 0a790824f0..0000000000 --- a/kernel/arm64/rot_kernel_sve.c +++ /dev/null @@ -1,59 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include - -#ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() -#else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() -#endif - -static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) -{ - for (int i = 0; i < n; i += SVE_WIDTH) - { - svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n); - SVE_TYPE x_vec = svld1(pg, &x[i]); - SVE_TYPE y_vec = svld1(pg, &y[i]); - SVE_TYPE cx_vec = svmul_z(pg, x_vec, c); - SVE_TYPE sy_vec = svmul_z(pg, y_vec, s); - SVE_TYPE sx_vec = svmul_z(pg, x_vec, s); - SVE_TYPE cy_vec = svmul_z(pg, y_vec, c); - svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec)); - svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec)); - } - return (0); -} From 5540f2121e2304e8e4682708b00af18fade7465b Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:00:12 +0530 Subject: [PATCH 39/66] Delete kernel/arm64/scal.c --- kernel/arm64/scal.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/scal.c diff --git a/kernel/arm64/scal.c b/kernel/arm64/scal.c deleted file mode 100644 index e64b0075e8..0000000000 --- a/kernel/arm64/scal.c +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include "scal_kernel_sve.c" -#include "scal_kernel_c.c" - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - if ((n <= 0) || (inc_x <= 0)) - return (0); - if (inc_x == 1) - scal_kernel_sve(n, x, da); - else - scal_kernel_c(n, da, x, inc_x, y, inc_y); - return (0); -} From 95a97012e8e7350df05c3e3ee749dbe34feff05a Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:00:45 +0530 Subject: [PATCH 40/66] Delete kernel/arm64/scal_kernel_c.c --- kernel/arm64/scal_kernel_c.c | 43 ------------------------------------ 1 file changed, 43 deletions(-) delete mode 100644 kernel/arm64/scal_kernel_c.c diff --git a/kernel/arm64/scal_kernel_c.c b/kernel/arm64/scal_kernel_c.c deleted file mode 100644 index 659168da54..0000000000 --- a/kernel/arm64/scal_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#include "common.h" - -static int scal_kernel_c(BLASLONG n, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i = 0, j = 0; - - while (j < n) - { - if (da == 0.0) - x[i] = 0.0; - else - x[i] = da * x[i]; - i += inc_x; - j++; - } - return (0); -} From 3b7b74664c125e8589b8f5c4255bdb972f666dff Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:01:03 +0530 Subject: [PATCH 41/66] Delete kernel/arm64/scal_kernel_sve.c --- kernel/arm64/scal_kernel_sve.c | 54 ---------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 kernel/arm64/scal_kernel_sve.c diff --git a/kernel/arm64/scal_kernel_sve.c b/kernel/arm64/scal_kernel_sve.c deleted file mode 100644 index ccd5a4cd2b..0000000000 --- a/kernel/arm64/scal_kernel_sve.c +++ /dev/null @@ -1,54 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include <arm_sve.h> - -#ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() -#else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() -#endif - -static int scal_kernel_sve(int n, FLOAT *x, FLOAT da) -{ - for (int i = 0; i < n; i += SVE_WIDTH) - { - svbool_t pg = SVE_WHILELT(i, n); - SVE_TYPE x_vec = svld1(pg, &x[i]); - SVE_TYPE result = svmul_z(pg, x_vec, da); - svst1(pg, &x[i], result); - } - return (0); -} From f6416c0e3702a1d1d825b1500993f66a60677281 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:01:32 +0530 Subject: [PATCH 42/66] Delete kernel/arm64/swap.c --- kernel/arm64/swap.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/swap.c diff --git a/kernel/arm64/swap.c b/kernel/arm64/swap.c deleted file mode 100644 index c5af18e6ba..0000000000 --- a/kernel/arm64/swap.c +++ /dev/null @@ -1,40 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3.
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ -#include "common.h" -#include "swap_kernel_sve.c" -#include "swap_kernel_c.c" - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - if (n <= 0) - return 0; - if (inc_x == 1 && inc_y == 1) - swap_kernel_sve(n, x, y); - else - swap_kernel_c(n, x, inc_x, y, inc_y); - return (0); -} From c17c19fbcf4d6fd90326c564b6bddd06c93cfe23 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:01:46 +0530 Subject: [PATCH 43/66] Delete kernel/arm64/swap_kernel_c.c --- kernel/arm64/swap_kernel_c.c | 46 ------------------------------------ 1 file changed, 46 deletions(-) delete mode 100644 kernel/arm64/swap_kernel_c.c diff --git a/kernel/arm64/swap_kernel_c.c b/kernel/arm64/swap_kernel_c.c deleted file mode 100644 index c1d7cc619a..0000000000 --- a/kernel/arm64/swap_kernel_c.c +++ /dev/null @@ -1,46 +0,0 
@@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#include "common.h" -#include - -static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; - - while (i < n) - { - temp = x[ix]; - x[ix] = y[iy]; - y[iy] = temp; - ix += inc_x; - iy += inc_y; - i++; - } - return (0); -} From 765850194e2529433433be20ae9d74ff54f6c673 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:02:01 +0530 Subject: [PATCH 44/66] Delete kernel/arm64/swap_kernel_sve.c --- kernel/arm64/swap_kernel_sve.c | 62 ---------------------------------- 1 file changed, 62 deletions(-) delete mode 100644 kernel/arm64/swap_kernel_sve.c diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c deleted file mode 100644 index fed7e6d0f5..0000000000 --- a/kernel/arm64/swap_kernel_sve.c +++ /dev/null @@ -1,62 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#include "common.h" -#include <arm_sve.h> - -#ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() -#else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() -#endif - -static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) -{ - BLASLONG sve_width = SVE_WIDTH; - - for (BLASLONG i = 0; i < n; i += sve_width * 2) - { - svbool_t pg_a = SVE_WHILELT(i, n); - svbool_t pg_b = SVE_WHILELT((i + sve_width), n); - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); - svst1(pg_a, &x[i], y_vec_a); - svst1(pg_a, &y[i], x_vec_a); - svst1(pg_b, &x[i + sve_width], y_vec_b); - svst1(pg_b, &y[i + sve_width], x_vec_b); - } - return (0); -} From 41912f9c22615bf2d94cecd7ea7d239a5f94e666 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:05:10 +0530
Subject: [PATCH 45/66] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index cf74524c8d..508dbcd0e6 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -230,12 +230,5 @@ In chronological order: * Christopher Daley * [2024-01-24] Optimize GEMV forwarding on ARM64 systems -* Aniket P. Garade - * [2024-10-30] Optimized scal Level-1 BLAS routines with ARM SVE - -* Sushil Pratap Singh - * [2024-10-30] Optimized swap Level-1 BLAS routines with ARM SVE - -* Juliya James - * [2024-10-30] Optimized rot Level-1 BLAS routines with ARM SVE - +* Aniket P. Garade Sushil Pratap Singh Juliya James + * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE From 06ffd411a588734793cb2057c254090aac07f7a7 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:05:47 +0530 Subject: [PATCH 46/66] Update KERNEL.ARMV8SVE --- kernel/arm64/KERNEL.ARMV8SVE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index cecc72cf96..133fab9d62 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -69,8 +69,8 @@ DROTKERNEL = rot.c CROTKERNEL = zrot.S ZROTKERNEL = zrot.S -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S From dd71e4234a0cb3469168f7e6a8f55a17cd02db58 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:15:29 +0530 Subject: [PATCH 47/66] Added Updated swap and rot sve kernels. 
--- kernel/arm64/rot.c | 40 ++++++++++++++++++++++ kernel/arm64/rot_kernel_c.c | 44 ++++++++++++++++++++++++ kernel/arm64/rot_kernel_sve.c | 59 ++++++++++++++++++++++++++++++++ kernel/arm64/swap.c | 40 ++++++++++++++++++++++ kernel/arm64/swap_kernel_c.c | 46 +++++++++++++++++++++++++ kernel/arm64/swap_kernel_sve.c | 62 ++++++++++++++++++++++++++++++++++ 6 files changed, 291 insertions(+) create mode 100644 kernel/arm64/rot.c create mode 100644 kernel/arm64/rot_kernel_c.c create mode 100644 kernel/arm64/rot_kernel_sve.c create mode 100644 kernel/arm64/swap.c create mode 100644 kernel/arm64/swap_kernel_c.c create mode 100644 kernel/arm64/swap_kernel_sve.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c new file mode 100644 index 0000000000..09b708494c --- /dev/null +++ b/kernel/arm64/rot.c @@ -0,0 +1,40 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include "rot_kernel_sve.c" +#include "rot_kernel_c.c" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + if (n <= 0) + return (0); + if (inc_x == 1 && inc_y == 1) + rot_kernel_sve(n, x, y, c, s); + else + rot_kernel_c(n, x, inc_x, y, inc_y, c, s); + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c new file mode 100644 index 0000000000..788beed7a5 --- /dev/null +++ b/kernel/arm64/rot_kernel_c.c @@ -0,0 +1,44 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" + +static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + while (i < n) + { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c new file mode 100644 index 0000000000..1d54a2907e --- /dev/null +++ b/kernel/arm64/rot_kernel_sve.c @@ -0,0 +1,59 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#include "common.h" +#include <arm_sve.h> + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + for (BLASLONG i = 0; i < n; i += SVE_WIDTH) + { + svbool_t pg = SVE_WHILELT((uint64_t)i, (uint64_t)n); + SVE_TYPE x_vec = svld1(pg, &x[i]); + SVE_TYPE y_vec = svld1(pg, &y[i]); + SVE_TYPE cx_vec = svmul_z(pg, x_vec, c); + SVE_TYPE sy_vec = svmul_z(pg, y_vec, s); + SVE_TYPE sx_vec = svmul_z(pg, x_vec, s); + SVE_TYPE cy_vec = svmul_z(pg, y_vec, c); + svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec)); + svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec)); + } + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/swap.c b/kernel/arm64/swap.c new file mode 100644 index 0000000000..6a9117cf0e --- /dev/null +++ b/kernel/arm64/swap.c @@ -0,0 +1,40 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include "common.h" +#include "swap_kernel_sve.c" +#include "swap_kernel_c.c" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if (n <= 0) + return 0; + if (inc_x == 1 && inc_y == 1) + swap_kernel_sve(n, x, y); + else + swap_kernel_c(n, x, inc_x, y, inc_y); + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/swap_kernel_c.c b/kernel/arm64/swap_kernel_c.c new file mode 100644 index 0000000000..4029350962 --- /dev/null +++ b/kernel/arm64/swap_kernel_c.c @@ -0,0 +1,46 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#include "common.h" +#include + +static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + + while (i < n) + { + temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c new file mode 100644 index 0000000000..db3c0fae57 --- /dev/null +++ b/kernel/arm64/swap_kernel_sve.c @@ -0,0 +1,62 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include <arm_sve.h> + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG sve_width = SVE_WIDTH; + + for (BLASLONG i = 0; i < n; i += sve_width * 2) + { + svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); + svbool_t pg_b = SVE_WHILELT((i + sve_width), n); + SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); + SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); + SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); + SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + svst1(pg_a, &x[i], y_vec_a); + svst1(pg_a, &y[i], x_vec_a); + svst1(pg_b, &x[i + sve_width], y_vec_b); + svst1(pg_b, &y[i + sve_width], x_vec_b); + } + return (0); +} \ No newline at end of file From 3368a4e697c45a5de4370b1e6861c9ab7178b297 Mon Sep 17 00:00:00 2001 From: SushilPratap04 Date: Fri, 13 Dec 2024 16:47:58 +0530 Subject: [PATCH 48/66] Update swap_kernel_sve.c --- kernel/arm64/swap_kernel_sve.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c index
db3c0fae57..1efdce48bd 100644 --- a/kernel/arm64/swap_kernel_sve.c +++ b/kernel/arm64/swap_kernel_sve.c @@ -48,7 +48,7 @@ static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) for (BLASLONG i = 0; i < n; i += sve_width * 2) { svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); - svbool_t pg_b = SVE_WHILELT((i + sve_width), n); + svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); @@ -59,4 +59,4 @@ static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) svst1(pg_b, &y[i + sve_width], x_vec_b); } return (0); -} \ No newline at end of file +} From d00cc400b17155d6f5b624e272a3ec458f93a1fe Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Wed, 18 Dec 2024 08:35:26 +0800 Subject: [PATCH 49/66] Replaced the __riscv_vid_v_i32m2 and __riscv_vid_v_i64m2 with __riscv_vid_v_u32m2 and __riscv_vid_v_u64m2 for riscv64-unknown-linux-gnu-gcc compiling. Signed-off-by: tingbo.liao --- kernel/riscv64/symm_lcopy_rvv_v1.c | 9 +++++---- kernel/riscv64/symm_ucopy_rvv_v1.c | 10 ++++++---- kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 9 +++++---- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 10 ++++++---- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 9 +++++---- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 10 ++++++---- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 12 +++++++----- 7 files changed, 40 insertions(+), 29 deletions(-) diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c index a615db44d9..2e5bfc6caf 100644 --- a/kernel/riscv64/symm_lcopy_rvv_v1.c +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -35,11 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -48,11 +49,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif // Optimizes the implementation in ../generic/symm_lcopy_4.c @@ -70,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -98,4 +100,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON return 0; } - diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c index 464f97b3a6..faab88a678 100644 --- a/kernel/riscv64/symm_ucopy_rvv_v1.c +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -35,11 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -48,11 +49,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif // Optimizes the implementation in ../generic/symm_ucopy_4.c @@ -70,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -97,4 +99,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index 97013895ae..15dfc229d8 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -41,7 +41,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 @@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -64,7 +65,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 @@ -73,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif @@ -92,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); vzero = VFMVVF_FLOAT(ZERO, vl); for (js = n; js > 0; js -= vl, posX += vl) { @@ -136,4 +138,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON return 0; } - diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index 59029e9e59..cc7c44e12c 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -41,7 +41,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 @@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -64,7 +65,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 @@ -73,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif @@ -90,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); vzero = VFMVVF_FLOAT(ZERO, vl); for (js = n; js > 0; js -= vl, posX += vl) { @@ -132,4 +134,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index f4d8061909..ed0e00b547 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -41,11 +41,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -60,11 +61,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) @@ -81,7 +83,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -118,4 +120,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON return 0; } - diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index 069551bb0e..5f3ac3d07d 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -41,11 +41,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -60,11 +61,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif @@ -83,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -118,4 +120,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index ae664561b4..9264f13781 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -42,10 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vint32m2_t -#define VID_V_UINT __riscv_vid_v_i32m2 +#define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 #define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 #define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t @@ -63,6 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 #define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#define V_UM2_TO_IM2(values) values #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -99,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - do + do { if (X > posY) { @@ -119,9 +121,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } - else + else { - vindex = VID_V_UINT(vl); + vindex = V_UM2_TO_IM2(VID_V_UINT(vl)); for (unsigned int j = 0; j < vl; j++) { vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); @@ -152,4 +154,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file From 48caf2303d4b953d74b3caba0f8fc4ad94c9cdd8 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 18 Dec 2024 08:53:29 +0100 Subject: [PATCH 50/66] Fix build warning about discarding volatile qualifier in memory.c The warning was: ``` [4339/5327] Building C object driver/others/CMakeFiles/driver_others.dir/memory.c.o /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c: In function 'blas_shutdown': /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c:3257:10: warning: passing argument 1 of 'free' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers] 3257 | free(newmemory); | ^~~~~~~~~ In file included from /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/common.h:83, from /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c:74: /home/rgommers/code/pixi-dev-scipystack/openblas/.pixi/envs/default/x86_64-conda-linux-gnu/sysroot/usr/include/stdlib.h:482:25: note: expected 'void *' but argument is of type 'volatile struct newmemstruct *' 482 | extern void free (void *__ptr) __THROW; | ~~~~~~^~~~~ ``` The 
use of `volatile` for `newmemstruct` seems on purpose, and there are more such constructs in this file. The warning appeared after gh-4451 and is correct. The `free` prototype doesn't expect a volatile pointer, hence this change adds a cast to silence the warning. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6343a3785e..276e39ece0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3254,7 +3254,7 @@ void blas_shutdown(void){ #endif newmemory[pos].lock = 0; } - free(newmemory); + free((void*)newmemory); newmemory = NULL; memory_overflowed = 0; } From 765ad8bcd2bee89d8393a2200a6777989a8d4db0 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 18 Dec 2024 09:39:07 +0100 Subject: [PATCH 51/66] Fix guard around `alloc_hugetlb`, fixes compile warning The warning was: ``` /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c: At top level: /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c:2565:14: warning: 'alloc_hugetlb' defined but not used [-Wunused-function] 2565 | static void *alloc_hugetlb(void *address){ | ^~~~~~~~~~~~~ ``` The added define is the same as is already present in the TLS part of `memory.c`. This follows up on gh-4681. 
--- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 276e39ece0..c53e798bc1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2538,7 +2538,7 @@ static void *alloc_shm(void *address){ } #endif -#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS +#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) static void alloc_hugetlb_free(struct release_t *release){ From e460512685b3004c3796b4620c1454150cf61ef0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Dec 2024 00:50:37 +0100 Subject: [PATCH 52/66] Update WoA build instructions from rewording in issue #5001 --- docs/install.md | 66 +++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/docs/install.md b/docs/install.md index b842d3355b..7155263056 100644 --- a/docs/install.md +++ b/docs/install.md @@ -437,36 +437,54 @@ To then use the built OpenBLAS shared library in Visual Studio: [Qt Creator](http://qt.nokia.com/products/developer-tools/). -#### Windows on Arm - -While OpenBLAS can be built with Microsoft VisualStudio (Community Edition or commercial), you would only be able to build for the GENERIC target -that does not use optimized assembly kernels, also the stock VisualStudio lacks the Fortran compiler necessary for building the LAPACK component. -It is therefore highly recommended to download the free LLVM compiler suite and use it to compile OpenBLAS outside of VisualStudio. - -The following tools needs to be installed to build for Windows on Arm (WoA): - -- LLVM for Windows on Arm. - Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/) - you want the package whose name ends in "woa64.exe". - (This may not always be present in the very latest point release, as building and uploading the binaries takes time.) 
- E.g: a LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.2/LLVM-19.1.2-woa64.exe). - Run the LLVM installer and ensure that LLVM is added to the environment variable PATH. (If you do not want to add it to the PATH, you will need to specify - both C and Fortran compiler to Make or CMake with their full path later on) +## Windows on Arm + +A fully functional native OpenBLAS for WoA that can be built as both a static and dynamic library using LLVM toolchain and Visual Studio 2022. Before starting to build, make sure that you have installed Visual Studio 2022 on your ARM device, including the "Desktop Development with C++" component (that contains the cmake tool). +(Note that you can use the free "Visual Studio 2022 Community Edition" for this task. In principle it would be possible to build with VisualStudio alone, but using +the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own) + + 1. Clone OpenBLAS to your local machine and checkout to latest release of OpenBLAS (unless you want to build the latest development snapshot - here we are using the 0.3.28 release as the example, of course this exact version may be outdated by the time you read this) + + ```cmd + git clone https://github.com/OpenMathLib/OpenBLAS.git + cd OpenBLAS + git checkout v0.3.28 + ``` + + 2. Install Latest LLVM toolchain for WoA: + + Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages + usually lag a week or two behind their corresponding source release). 
+ Make sure to enable the option “Add LLVM to the system PATH for all the users” + Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path + + 3. Launch the Native Command Prompt for Windows ARM64: + + From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022” + Alternatively open command prompt, run the following command to activate the environment: + "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat" + + Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: + + ```cmd + cd OpenBLAS + mkdir build + cd build + + cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new -The following steps describe how to build the static library for OpenBLAS with either Make or CMake: + ninja -j16 + ``` + +Note: You might want to include additional options in the cmake command here. For example, the default configuration only generates a static .lib version of the library. If you prefer a DLL, you can add -DBUILD_SHARED_LIBS=ON. -1. Build OpenBLAS with Make: +Note that it is also possible to use the same setup to build OpenBLAS with Make, if you prefer Makefiles over the CMake build for some reason: - ```bash + ```cmd $ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make ``` -2. Build OpenBLAS with CMake - ```bash - $ mkdir build - $ cd build - $ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new -DTARGET=ARMV8 -DCMAKE_BUILD_TYPE=Release - $ cmake --build . 
- ``` + #### Generating an import library From a93d3db34a7e2fe70bbeb3a43c20323d85802a74 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Dec 2024 00:53:10 +0100 Subject: [PATCH 53/66] fix formatting of WoA section --- docs/install.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/install.md b/docs/install.md index 7155263056..5bb88cccd8 100644 --- a/docs/install.md +++ b/docs/install.md @@ -437,13 +437,13 @@ To then use the built OpenBLAS shared library in Visual Studio: [Qt Creator](http://qt.nokia.com/products/developer-tools/). -## Windows on Arm +### Windows on Arm A fully functional native OpenBLAS for WoA that can be built as both a static and dynamic library using LLVM toolchain and Visual Studio 2022. Before starting to build, make sure that you have installed Visual Studio 2022 on your ARM device, including the "Desktop Development with C++" component (that contains the cmake tool). (Note that you can use the free "Visual Studio 2022 Community Edition" for this task. In principle it would be possible to build with VisualStudio alone, but using the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own) - 1. Clone OpenBLAS to your local machine and checkout to latest release of OpenBLAS (unless you want to build the latest development snapshot - here we are using the 0.3.28 release as the example, of course this exact version may be outdated by the time you read this) +1. 
Clone OpenBLAS to your local machine and checkout to latest release of OpenBLAS (unless you want to build the latest development snapshot - here we are using the 0.3.28 release as the example, of course this exact version may be outdated by the time you read this) ```cmd git clone https://github.com/OpenMathLib/OpenBLAS.git @@ -451,20 +451,20 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a git checkout v0.3.28 ``` - 2. Install Latest LLVM toolchain for WoA: +2. Install Latest LLVM toolchain for WoA: - Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages - usually lag a week or two behind their corresponding source release). - Make sure to enable the option “Add LLVM to the system PATH for all the users” - Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path +Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages +usually lag a week or two behind their corresponding source release). +Make sure to enable the option “Add LLVM to the system PATH for all the users” +Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path - 3. Launch the Native Command Prompt for Windows ARM64: +3. 
Launch the Native Command Prompt for Windows ARM64: - From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022” - Alternatively open command prompt, run the following command to activate the environment: - "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat" +From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022” +Alternatively open command prompt, run the following command to activate the environment: +"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat" - Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: +Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: ```cmd cd OpenBLAS From 1c4401ebf16dd4ff3c0de8a7517bea9724a63a45 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Dec 2024 14:32:24 -0800 Subject: [PATCH 54/66] Add target-specific options to enable SVE with the NVIDIA compiler --- Makefile.arm64 | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index fccc0d0d0f..2909a83e0e 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -351,4 +351,31 @@ endif endif +else +# NVIDIA HPC options necessary to enable SVE in the compiler +ifeq ($(CORE), THUNDERX2T99) +CCOMMON_OPT += -tp=thunderx2t99 +FCOMMON_OPT += -tp=thunderx2t99 +endif +ifeq ($(CORE), NEOVERSEN1) +CCOMMON_OPT += -tp=neoverse-n1 +FCOMMON_OPT += -tp=neoverse-n1 +endif +ifeq ($(CORE), NEOVERSEV1) +CCOMMON_OPT += -tp=neoverse-v1 +FCOMMON_OPT += -tp=neoverse-v1 +endif +ifeq ($(CORE), NEOVERSEV2) +CCOMMON_OPT += -tp=neoverse-v2 +FCOMMON_OPT += -tp=neoverse-v2 +endif +ifeq ($(CORE), ARMV8SVE) +CCOMMON_OPT += -tp=neoverse-v2 +FCOMMON_OPT += -tp=neoverse-v2 +endif +ifeq ($(CORE), ARMV9SVE) +CCOMMON_OPT += -tp=neoverse-v2 +FCOMMON_OPT += -tp=neoverse-v2 +endif + endif From 32319a33ac5e7c1562ce9763cae0a5118a8ec2bd Mon 
Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:00:48 +0100 Subject: [PATCH 55/66] Add options for Intel oneAPI 2025.0 ifx on Windows --- cmake/f_check.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 4c4f5ac044..dc0f5e0ac5 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -45,13 +45,15 @@ if (NOT ONLY_CBLAS) # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile # TODO: set FEXTRALIB flags a la f_check? - + if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND ${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")) set(BU "_") file(APPEND ${TARGET_CONF_TEMP} "#define BUNDERSCORE _\n" "#define NEEDBUNDERSCORE 1\n" "#define NEED2UNDERSCORES 0\n") - + else () + set (FCOMMON_OPT "${FCOMMON_OPT} /fp:precise /recursive /names:lowercase /assume:nounderscore") + endif() else () #When we only build CBLAS, we set NOFORTRAN=2 From 30188a55d180a493922dc9ffc4ff0c17696cdf41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:02:34 +0100 Subject: [PATCH 56/66] Don't assume underlined symbols for ifx; make cpuid.S inclusion conditional --- cmake/prebuild.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 53a78d782f..bdc0f7f927 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -58,7 +58,7 @@ set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp") # c_check set(FU "") -if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) +if (APPLE OR (MSVC AND NOT (${CMAKE_C_COMPILER_ID} MATCHES "Clang" OR ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM"))) set(FU "_") endif() if(MINGW AND NOT MINGW64) @@ -1433,7 +1433,9 @@ else(NOT CMAKE_CROSSCOMPILING) message(STATUS "MSVC") set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) else() - list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) + if ("${CMAKE_SYSTEM_NAME}" STREQUAL 
"Darwin") + list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) + endif() if (DEFINED TARGET_CORE) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE}) endif () From d78fbe425c4ea0a79f005d8c6b1014b4b16743b2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:04:50 +0100 Subject: [PATCH 57/66] Assume no underline suffixes on symbols when compiling with ifx on Windows --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4ac244e3ea..7413c88c80 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -672,6 +672,9 @@ endif () if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") +endif () if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") if ("${F_COMPILER}" STREQUAL "FLANG") From 5d81e514e4d289879921ff3be9b432afdc5fc53f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:06:03 +0100 Subject: [PATCH 58/66] Assume no underline suffixes on symbols when compiling with ifx on Windows --- ctest/cblas_test.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h index 24ea677637..502a2fee20 100644 --- a/ctest/cblas_test.h +++ b/ctest/cblas_test.h @@ -10,6 +10,10 @@ #define int long #endif +#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) +//#define LAPACK_COMPLEX_STRUCTURE +#define NOCHANGE +#endif /* e.g. 
mingw64/x86_64-w64-mingw32/include/winerror.h */ #ifdef FAILED #undef FAILED From 5c9417d3061650a26062f3759da4f8586fa790f0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:07:39 +0100 Subject: [PATCH 59/66] Assume no underline suffixes on symbols when compiling with ifx on Windows --- lapack-netlib/LAPACKE/include/lapacke_config.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 798a5eb2ef..4ef542fb10 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -67,8 +67,14 @@ extern "C" { #define lapack_logical lapack_int #endif +#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) +#define LAPACK_COMPLEX_STRUCTURE +#define LAPACK_GLOBAL(lcname,UCNAME) lcname +#define NOCHANGE +#endif + #ifndef LAPACK_COMPLEX_CUSTOM -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER) #define _CRT_USE_C_COMPLEX_H #include #define LAPACK_COMPLEX_CUSTOM From 64c6c7920175b6b1603a3a876b86c83e4a4a3cdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:09:34 +0100 Subject: [PATCH 60/66] Assume no underline suffixes on symbols when compiling with Intel ifx on Windows --- utest/openblas_utest.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utest/openblas_utest.h b/utest/openblas_utest.h index abe381a924..1851c60c56 100644 --- a/utest/openblas_utest.h +++ b/utest/openblas_utest.h @@ -36,7 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "ctest.h" - +#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) +//#define LAPACK_COMPLEX_STRUCTURE +#define NOCHANGE +#endif #include #include From 05fe49ddafc438b564879e5fc19b6ab8083a2e3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:12:17 +0100 Subject: [PATCH 61/66] Rename local copy functions to avoid name clash with the standard BLAS ones --- utest/test_extensions/common.c | 8 ++++---- utest/test_extensions/common.h | 10 +++++----- utest/test_extensions/test_cimatcopy.c | 2 +- utest/test_extensions/test_comatcopy.c | 2 +- utest/test_extensions/test_dimatcopy.c | 2 +- utest/test_extensions/test_domatcopy.c | 2 +- utest/test_extensions/test_simatcopy.c | 2 +- utest/test_extensions/test_somatcopy.c | 2 +- utest/test_extensions/test_zimatcopy.c | 2 +- utest/test_extensions/test_zomatcopy.c | 2 +- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c index 808aa54557..a5d3196aaf 100644 --- a/utest/test_extensions/common.c +++ b/utest/test_extensions/common.c @@ -206,7 +206,7 @@ void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int ld * param lda_dst - leading dimension of output matrix A * param conj specifies conjugation */ -void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, +void my_scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst) { blasint i, j; @@ -217,7 +217,7 @@ void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, } } -void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, +void my_dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst) { blasint i, j; @@ -228,7 +228,7 @@ void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, } } -void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int 
lda_src, +void my_ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst, int conj) { blasint i, j; @@ -243,7 +243,7 @@ void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, } } -void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, +void my_zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst, int conj) { blasint i, j; diff --git a/utest/test_extensions/common.h b/utest/test_extensions/common.h index 62b84325c2..f8c60d2686 100644 --- a/utest/test_extensions/common.h +++ b/utest/test_extensions/common.h @@ -65,12 +65,12 @@ extern void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, i extern void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst, int conj); -extern void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, +extern void my_scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst); -extern void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, +extern void my_dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst); -extern void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, +extern void my_ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst, int conj); -extern void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, +extern void my_zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst, int conj); -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c index 0c96a3b17c..41c0a0f6b0 100644 --- a/utest/test_extensions/test_cimatcopy.c +++ 
b/utest/test_extensions/test_cimatcopy.c @@ -91,7 +91,7 @@ static float check_cimatcopy(char api, char order, char trans, blasint rows, bla ctranspose(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); } else { - ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); + my_ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); } if (api == 'F') { diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c index b493c93a6f..dc6beeeaee 100644 --- a/utest/test_extensions/test_comatcopy.c +++ b/utest/test_extensions/test_comatcopy.c @@ -92,7 +92,7 @@ static float check_comatcopy(char api, char order, char trans, blasint rows, bla ctranspose(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); } else { - ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); + my_ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); } if (api == 'F') { diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c index eebb7669eb..f57707eeea 100644 --- a/utest/test_extensions/test_dimatcopy.c +++ b/utest/test_extensions/test_dimatcopy.c @@ -86,7 +86,7 @@ static double check_dimatcopy(char api, char order, char trans, blasint rows, bl dtranspose(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); } else { - dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); + my_dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); } if (api == 'F') { diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c index e892271d2d..8869f7b453 100644 --- a/utest/test_extensions/test_domatcopy.c +++ b/utest/test_extensions/test_domatcopy.c @@ -87,7 +87,7 @@ static double check_domatcopy(char api, char order, char trans, blasint rows, 
bl dtranspose(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); } else { - dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); + my_dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); } if (api == 'F') { diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c index c00ea0c8f0..6b70881bf9 100644 --- a/utest/test_extensions/test_simatcopy.c +++ b/utest/test_extensions/test_simatcopy.c @@ -86,7 +86,7 @@ static float check_simatcopy(char api, char order, char trans, blasint rows, bla stranspose(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); } else { - scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); + my_scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); } if (api == 'F') { diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c index 62a6056d92..bcc2eabf51 100644 --- a/utest/test_extensions/test_somatcopy.c +++ b/utest/test_extensions/test_somatcopy.c @@ -87,7 +87,7 @@ static float check_somatcopy(char api, char order, char trans, blasint rows, bla stranspose(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); } else { - scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); + my_scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); } if (api == 'F') { diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c index 86bc4670f2..349050b9c1 100644 --- a/utest/test_extensions/test_zimatcopy.c +++ b/utest/test_extensions/test_zimatcopy.c @@ -91,7 +91,7 @@ static double check_zimatcopy(char api, char order, char trans, blasint rows, bl ztranspose(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); } else { - zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, 
conj); + my_zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); } if (api == 'F') { diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c index 208cfd981c..eb13d10830 100644 --- a/utest/test_extensions/test_zomatcopy.c +++ b/utest/test_extensions/test_zomatcopy.c @@ -92,7 +92,7 @@ static double check_zomatcopy(char api, char order, char trans, blasint rows, bl ztranspose(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); } else { - zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); + my_zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); } if (api == 'F') { From e6fd62977056b95aad33a10c433ce686e4f852e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 23:18:52 +0100 Subject: [PATCH 62/66] Expressly declare the .S extension for assembly (documented as standard, but current cmake does not set it for icx) --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddff73c2cd..3c6508edff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ cmake_minimum_required(VERSION 3.16.0) +set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S") project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) From fbf594b62f4d1ee015a03a5df6e58fe796e63c98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 24 Dec 2024 13:34:33 +0100 Subject: [PATCH 63/66] Guard against empty CMAKE_Fortran_COMPILER_ID --- cmake/f_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index dc0f5e0ac5..3f713807ea 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -45,7 +45,7 @@ if (NOT ONLY_CBLAS) # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile # TODO: set FEXTRALIB flags a la f_check? 
- if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND ${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")) + if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND x${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")) set(BU "_") file(APPEND ${TARGET_CONF_TEMP} "#define BUNDERSCORE _\n" From 762fa1afa9aedebb32e4516b9b5b35a70869dd0e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 24 Dec 2024 19:48:04 +0100 Subject: [PATCH 64/66] fix link to faq --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d8e73b2022..cc9325d39d 100644 --- a/README.md +++ b/README.md @@ -338,7 +338,7 @@ Please see Changelog.txt. ## Troubleshooting -* Please read the [FAQ](www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. +* Please read the [FAQ](http://www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. From df42f79c4c7bc94e5b861af129638229ec2c0ce9 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 26 Dec 2024 21:09:26 +0100 Subject: [PATCH 65/66] docs: update extensions and install pages with last wiki edits I went through the wiki pages and found two pages with edits that weren't reflected in the html docs yet, so syncing that content here. --- docs/extensions.md | 16 ++++++++-------- docs/install.md | 14 ++++++++++---- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/docs/extensions.md b/docs/extensions.md index 483b009289..bc015910d3 100644 --- a/docs/extensions.md +++ b/docs/extensions.md @@ -5,14 +5,14 @@ This page documents those non-standard APIs. 
## BLAS-like extensions -| Routine | Data Types | Description | -| ------------- |:------------- | :---------------| -| ?axpby | s,d,c,z | like axpy with a multiplier for y | -| ?gemm3m | c,z | gemm3m | -| ?imatcopy | s,d,c,z | in-place transpositon/copying | -| ?omatcopy | s,d,c,z | out-of-place transpositon/copying | -| ?geadd | s,d,c,z | matrix add | -| ?gemmt | s,d,c,z | gemm but only a triangular part updated| +| Routine | Data Types | Description | +| ------------- |:------------- | :-----------------------------------------------| +| ?axpby | s,d,c,z | like `axpy` with a multiplier for `y` | +| ?gemm3m | c,z | `gemm3m` | +| ?imatcopy | s,d,c,z | in-place transposition/copying | +| ?omatcopy | s,d,c,z | out-of-place transposition/copying | +| ?geadd | s,d,c,z | ATLAS-like matrix add `B = α*A+β*B` | +| ?gemmt | s,d,c,z | `gemm` but only a triangular part updated | ## bfloat16 functionality diff --git a/docs/install.md b/docs/install.md index 5bb88cccd8..3bc7ffc8f9 100644 --- a/docs/install.md +++ b/docs/install.md @@ -536,7 +536,6 @@ In your shell, move to this directory: `cd exports`. To build OpenBLAS for Android, you will need the following tools installed on your machine: - [The Android NDK](https://developer.android.com/ndk/) -- Perl - Clang compiler on the build machine The next two sections below describe how to build with Clang for ARMV7 and @@ -578,7 +577,9 @@ utility in the make command above, like so: AR=${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-gcc-ar ``` otherwise you may get a linker error complaining like `malformed archive header -name at 8` when the native macOS `ar` command was invoked instead. +name at 8` when the native macOS `ar` command was invoked instead. Note that +with recent NDK versions, the AR tool may be named `llvm-ar` rather than what +is assumed above. 
#### Building for ARMV8 @@ -608,12 +609,17 @@ Note: for NDK 23b, something as simple as: export PATH=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8 ``` -appears to be sufficient on Linux. +appears to be sufficient on Linux. On OSX, setting AR to the ar provided in the +"bin" path of the NDK (probably `llvm-ar`) is also necessary. ??? note "Alternative build script for 3 architectures" - This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, `X86`) and install them to `/opt/OpenBLAS/lib`. + This script will build OpenBLAS for 3 architectures (`ARMV7`, `ARMV8`, + `X86`) and install them to `/opt/OpenBLAS/lib`. Of course you can also copy + only the section that is of interest to you - also notice that the `AR=` + line may need adapting to the name of the ar tool provided in your + `$TOOLCHAIN/bin` - for example `llvm-ar` in some recent NDK versions. It was tested on macOS with NDK version 21.3.6528147.
```bash From d5e255519e5cbdd496a816dce939ae54f59896f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Dec 2024 22:38:23 +0100 Subject: [PATCH 66/66] Improve OpenBLASConfig.cmake contents --- Makefile.install | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile.install b/Makefile.install index 129ed9a137..bfed157a49 100644 --- a/Makefile.install +++ b/Makefile.install @@ -191,22 +191,29 @@ endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" - @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_INCLUDE_DIRS \$${_OpenBLAS_ROOT_DIR}/include)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/bin/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), Darwin) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >>
"$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" +endif + @echo "add_library(OpenBLAS::OpenBLAS SHARED IMPORTED)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "target_include_directories(OpenBLAS::OpenBLAS INTERFACE \$${OpenBLAS_INCLUDE_DIRS})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" +ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) + @echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_LOCATION \$${OpenBLAS_LIBRARIES})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_IMPLIB \$${_OpenBLAS_ROOT_DIR}/lib/libopenblas.lib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif else #only static - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif #Generating OpenBLASConfigVersion.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)