Skip to content

Commit

Permalink
Merge pull request OpenMathLib#5073 from XiWeiGu/la64_update_symv_lsx…
Browse files Browse the repository at this point in the history
…_version

LoongArch64: Update symv lsx version
  • Loading branch information
martin-frbg authored Jan 14, 2025
2 parents 9b98103 + e0a8216 commit eba7338
Show file tree
Hide file tree
Showing 4 changed files with 464 additions and 345 deletions.
208 changes: 123 additions & 85 deletions kernel/loongarch64/dsymv_L_lsx.S
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
Expand Down Expand Up @@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16
#define T7 $r12

/* LSX vectors */
#define U0 $vr31
Expand Down Expand Up @@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9

/*
 * LOAD_Y_8 - load eight y elements into U4, U6, U8, U10 (two doubles per
 * vector). The first element is read at byte offset IY + INCY from Y.
 *
 * T5 selects the path. It is derived as INCY - 1 before INCY is shifted,
 * so T5 == 0 means unit stride.
 *
 *   T5 != 0 (strided): eight scalar fldx.d loads into $f4..$f11, then the
 *       odd registers are packed into the high lane of the even vectors.
 *       This assumes U4..U11 overlay $f4..$f11 (the U* defines are outside
 *       this view - confirm). Clobbers T2 and U5/U7/U9/U11.
 *
 *   T5 == 0 (unit stride): four vldx loads. The four offsets are left in
 *       T7, T2, T3, T4 and STORE_Y_8 reuses them, so nothing between
 *       LOAD_Y_8 and STORE_Y_8 may overwrite those registers.
 *
 * Labels carry the macro expansion counter so that expanding the macro
 * more than once does not produce duplicate symbols.
 */
.macro LOAD_Y_8
    beqz        T5, .L01_Y_0_\@

    // Strided y: gather one element per INCY step.
    add.d       T2, IY, INCY
    fldx.d      $f4, Y, T2
    add.d       T2, T2, INCY
    fldx.d      $f5, Y, T2
    add.d       T2, T2, INCY
    fldx.d      $f6, Y, T2
    add.d       T2, T2, INCY
    fldx.d      $f7, Y, T2

    add.d       T2, T2, INCY
    fldx.d      $f8, Y, T2
    add.d       T2, T2, INCY
    fldx.d      $f9, Y, T2
    add.d       T2, T2, INCY
    fldx.d      $f10, Y, T2
    add.d       T2, T2, INCY
    fldx.d      $f11, Y, T2

    // Pack pairs: high lane of the even vector = low lane of the odd one.
    vextrins.d  U4, U5, 0x10
    vextrins.d  U6, U7, 0x10
    vextrins.d  U8, U9, 0x10
    vextrins.d  U10, U11, 0x10
    b           .L01_Y_1_\@

.L01_Y_0_\@:
    // Unit-stride y: four vector loads, each offset 2 * INCY past the last.
    add.d       T7, IY, INCY
    vldx        U4, Y, T7
    alsl.d      T2, INCY, T7, 1     // T2 = T7 + (INCY << 1)
    vldx        U6, Y, T2
    alsl.d      T3, INCY, T2, 1     // T3 = T2 + (INCY << 1)
    vldx        U8, Y, T3
    alsl.d      T4, INCY, T3, 1     // T4 = T3 + (INCY << 1)
    vldx        U10, Y, T4
.L01_Y_1_\@:
.endm

/*
 * LOAD_X_8 - load eight x elements into U4, U6, U8, U10 (two doubles per
 * vector). The first element is read at byte offset IX + INCX from X.
 *
 * T6 selects the path. It is derived as INCX - 1 before INCX is shifted,
 * so T6 == 0 means unit stride.
 *
 *   T6 != 0 (strided): eight scalar fldx.d loads into $f4..$f11, then the
 *       odd registers are packed into the high lane of the even vectors.
 *       This assumes U4..U11 overlay $f4..$f11 (the U* defines are outside
 *       this view - confirm). Clobbers T2 and U5/U7/U9/U11.
 *
 *   T6 == 0 (unit stride): four vldx loads. Clobbers T7, T2, T3, T4.
 *
 * The destination vectors and the scratch offsets are the same ones used
 * by LOAD_Y_8 / STORE_Y_8, so any pending y values must be stored before
 * this macro is expanded.
 *
 * Labels carry the macro expansion counter so that expanding the macro
 * more than once does not produce duplicate symbols.
 */
.macro LOAD_X_8
    beqz        T6, .L01_X_0_\@

    // Strided x: gather one element per INCX step.
    add.d       T2, IX, INCX
    fldx.d      $f4, X, T2
    add.d       T2, T2, INCX
    fldx.d      $f5, X, T2
    add.d       T2, T2, INCX
    fldx.d      $f6, X, T2
    add.d       T2, T2, INCX
    fldx.d      $f7, X, T2

    add.d       T2, T2, INCX
    fldx.d      $f8, X, T2
    add.d       T2, T2, INCX
    fldx.d      $f9, X, T2
    add.d       T2, T2, INCX
    fldx.d      $f10, X, T2
    add.d       T2, T2, INCX
    fldx.d      $f11, X, T2

    // Pack pairs: high lane of the even vector = low lane of the odd one.
    vextrins.d  U4, U5, 0x10
    vextrins.d  U6, U7, 0x10
    vextrins.d  U8, U9, 0x10
    vextrins.d  U10, U11, 0x10
    b           .L01_X_1_\@

.L01_X_0_\@:
    // Unit-stride x: four vector loads, each offset 2 * INCX past the last.
    add.d       T7, IX, INCX
    vldx        U4, X, T7
    alsl.d      T2, INCX, T7, 1     // T2 = T7 + (INCX << 1)
    vldx        U6, X, T2
    alsl.d      T3, INCX, T2, 1     // T3 = T2 + (INCX << 1)
    vldx        U8, X, T3
    alsl.d      T4, INCX, T3, 1     // T4 = T3 + (INCX << 1)
    vldx        U10, X, T4
.L01_X_1_\@:
.endm

/*
 * STORE_Y_8 - write the eight y elements held in U4, U6, U8, U10 back to
 * the locations LOAD_Y_8 read them from.
 *
 * T5 selects the path (T5 == 0 means unit stride, see LOAD_Y_8).
 *
 *   T5 != 0 (strided): the high lane of each even vector is moved to the
 *       low lane of the following odd vector, then $f4..$f11 are stored
 *       one per INCY step starting at IY + INCY. This assumes U4..U11
 *       overlay $f4..$f11 (the U* defines are outside this view - confirm).
 *       Clobbers T2 and U5/U7/U9/U11.
 *
 *   T5 == 0 (unit stride): four vstx stores. This path does NOT compute
 *       its offsets; it depends on T7, T2, T3, T4 still holding the values
 *       left by the unit-stride path of LOAD_Y_8.
 *
 * Labels carry the macro expansion counter so that expanding the macro
 * more than once does not produce duplicate symbols.
 */
.macro STORE_Y_8
    beqz        T5, .L01_Y_2_\@

    // Unpack pairs: low lane of the odd vector = high lane of the even one.
    vextrins.d  U5, U4, 0x01
    vextrins.d  U7, U6, 0x01
    vextrins.d  U9, U8, 0x01
    vextrins.d  U11, U10, 0x01

    // Strided y: scatter one element per INCY step.
    add.d       T2, IY, INCY
    fstx.d      $f4, Y, T2
    add.d       T2, T2, INCY
    fstx.d      $f5, Y, T2
    add.d       T2, T2, INCY
    fstx.d      $f6, Y, T2
    add.d       T2, T2, INCY
    fstx.d      $f7, Y, T2

    add.d       T2, T2, INCY
    fstx.d      $f8, Y, T2
    add.d       T2, T2, INCY
    fstx.d      $f9, Y, T2
    add.d       T2, T2, INCY
    fstx.d      $f10, Y, T2
    add.d       T2, T2, INCY
    fstx.d      $f11, Y, T2
    b           .L01_Y_3_\@

.L01_Y_2_\@:
    // Unit-stride y: offsets come from LOAD_Y_8 (T7, T2, T3, T4).
    vstx        U4, Y, T7
    vstx        U6, Y, T2
    vstx        U8, Y, T3
    vstx        U10, Y, T4
.L01_Y_3_\@:
.endm

LDARG BUFFER, $sp, 0
PROLOGUE

addi.d $sp, $sp, -88

Expand All @@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

vldrepl.d VALPHA, $sp, 80 // replicate the doubleword at $sp + 80 into every lane of VALPHA

// Path selectors tested by LOAD_Y_8 / STORE_Y_8 (T5) and LOAD_X_8 (T6):
// each is zero exactly when the corresponding increment equals 1.
// They must be derived here, before INCX / INCY are shifted below.
addi.d T5, INCY, -1
addi.d T6, INCX, -1
// Shift LDA and both increments left by BASE_SHIFT. Presumably this turns
// element counts into byte offsets; BASE_SHIFT is defined outside this
// view - confirm.
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
Expand All @@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, N, .L999

.L01:
MTC a2, $r0 //temp2
vxor.v U2, U2, U2
fldx.d a6, X, JX
fmul.d a3, ALPHA, a6 //temp1
vshuf4i.d U3, U3, 0x00
vshuf4i.d U2, U2, 0x00

mul.d T0, J, LDA
slli.d T1, J, BASE_SHIFT
Expand Down Expand Up @@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vldx U16, AO1, T1
addi.d T1, T1, 16

add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
LOAD_Y_8

vfmadd.d U4, U3, U1, U4
vfmadd.d U6, U3, U14, U6
vfmadd.d U8, U3, U15, U8
vfmadd.d U10, U3, U16, U10

vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01

add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2
STORE_Y_8

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
alsl.d IY, INCY, IY, 3

vand.v $vr12, $vr2, $vr2
LOAD_X_8

vfmadd.d U2, U1, U4, U2
vfsub.d U2, U2, $vr12
vfmadd.d U2, U14, U6, U2
vfmadd.d U2, U15, U8, U2
vfmadd.d U2, U16, U10, U2

vextrins.d U4, U2, 0x01

fadd.d $f2, $f2, $f4
fadd.d $f2, $f2, $f12

vextrins.d U2, U2, 0x10

slli.d T2, INCX, 3
add.d IX, IX, T2
alsl.d IX, INCX, IX, 3

addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02

// Acc U2
GACC vf, d, U4, U2
vilvl.d U2, U4, U4

.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
Expand Down Expand Up @@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE
EPILOGUE
Loading

0 comments on commit eba7338

Please sign in to comment.