Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vperm: refactor vcompress #97

Merged
merged 1 commit into from
Jan 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
309 changes: 270 additions & 39 deletions src/main/scala/yunsuan/vector/VectorPerm/Permutation.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ package yunsuan.vector.perm

import chisel3._
import chisel3.util._
import scala.language.postfixOps
import chisel3.util.experimental.decode.TruthTable
import scala.language.{existentials, postfixOps}
import yunsuan.vector._

class Permutation extends Module {
Expand Down Expand Up @@ -60,6 +61,188 @@ class Permutation extends Module {
val vrgather16_sew8 = opcode.isVrgather && (srcTypeVs1 === 1.U) && (srcTypeVs2 === 0.U)
val vslide = vslideup || vslide1up || vslidedn || vslide1dn

val vlRemain_vcompress = Wire(UInt(8.W))
val elements = Wire(UInt(5.W))
val base_ele = Wire(UInt(5.W))
val mask_start_idx = Wire(UInt(7.W))
val mask_selected = Wire(UInt(16.W))
val ones_sum_base = Wire(UInt(8.W))
val ones_sum = WireInit(VecInit(Seq.fill(vlenb + 1)(0.U(8.W))))
val compressed_vs2_en = WireInit(VecInit(Seq.fill(vlenb)(0.U(16.W))))
val compressed_vs2 = WireInit(VecInit(Seq.fill(vlenb)(0.U(VLEN.W))))
val compressed_vs2_masked = WireInit(VecInit(Seq.fill(vlenb)(0.U(VLEN.W))))
val compressed_res = Wire(UInt(VLEN.W))
val compressed_res_8 = WireInit(VecInit(Seq.fill(16)(0.U(8.W))))
val compressed_res_16 = WireInit(VecInit(Seq.fill(8)(0.U(16.W))))
val compressed_res_32 = WireInit(VecInit(Seq.fill(4)(0.U(32.W))))
val compressed_res_64 = WireInit(VecInit(Seq.fill(2)(0.U(64.W))))
val select_compressed_vs2 = Wire(UInt(vlenb.W))
val compressed_vs2_merged = Wire(UInt(VLEN.W))

val vlRemain1H = VecInit(Seq(
uopIdx === 0.U || uopIdx === 1.U,
uopIdx === 2.U || uopIdx === 3.U || uopIdx === 4.U,
uopIdx === 5.U || uopIdx === 6.U || uopIdx === 7.U || uopIdx === 8.U,
uopIdx === 9.U || uopIdx === 10.U || uopIdx === 11.U || uopIdx === 12.U || uopIdx === 13.U,
uopIdx === 14.U || uopIdx === 15.U || uopIdx === 16.U || uopIdx === 17.U || uopIdx === 18.U || uopIdx === 19.U,
uopIdx === 20.U || uopIdx === 21.U || uopIdx === 22.U || uopIdx === 23.U || uopIdx === 24.U || uopIdx === 25.U || uopIdx === 26.U,
uopIdx === 27.U || uopIdx === 28.U || uopIdx === 29.U || uopIdx === 30.U || uopIdx === 31.U || uopIdx === 32.U || uopIdx === 33.U || uopIdx === 34.U,
uopIdx === 35.U || uopIdx === 36.U || uopIdx === 37.U || uopIdx === 38.U || uopIdx === 39.U || uopIdx === 40.U || uopIdx === 41.U || uopIdx === 42.U
)).asUInt
val vlRemainOut = Seq(
Mux(vl > ele_cnt.asUInt, ele_cnt.asUInt, vl),
Mux(vl > (ele_cnt.asUInt << 1.U), ele_cnt.asUInt, Mux(vl > ele_cnt.asUInt , vl - ele_cnt.asUInt , 0.U)),
Mux(vl > (ele_cnt.asUInt * 3.U), ele_cnt.asUInt, Mux(vl > (ele_cnt.asUInt << 1.U), vl - (ele_cnt.asUInt << 1.U), 0.U)),
Mux(vl > (ele_cnt.asUInt << 2.U), ele_cnt.asUInt, Mux(vl > (ele_cnt.asUInt * 3.U), vl - (ele_cnt.asUInt * 3.U), 0.U)),
Mux(vl > (ele_cnt.asUInt * 5.U), ele_cnt.asUInt, Mux(vl > (ele_cnt.asUInt << 2.U), vl - (ele_cnt.asUInt << 2.U), 0.U)),
Mux(vl > (ele_cnt.asUInt * 6.U), ele_cnt.asUInt, Mux(vl > (ele_cnt.asUInt * 5.U), vl - (ele_cnt.asUInt * 5.U), 0.U)),
Mux(vl > (ele_cnt.asUInt * 7.U), ele_cnt.asUInt, Mux(vl > (ele_cnt.asUInt * 6.U), vl - (ele_cnt.asUInt * 6.U), 0.U)),
Mux(vl > (ele_cnt.asUInt * 7.U), vl - (ele_cnt.asUInt * 7.U), 0.U)
)
vlRemain_vcompress := Mux1H(vlRemain1H, vlRemainOut)

when(vlmul === 5.U) {
elements := Mux(vl < Cat(ele_cnt >> 3.U), vl, ele_cnt >> 3.U)
}.elsewhen(vlmul === 6.U) {
elements := Mux(vl < Cat(ele_cnt >> 2.U), vl, ele_cnt >> 2.U)
}.elsewhen(vlmul === 7.U) {
elements := Mux(vl < Cat(ele_cnt >> 1.U), vl, ele_cnt >> 1.U)
}.elsewhen(vlmul === 1.U) {
elements := Mux(vl < Cat(ele_cnt << 1.U), vlRemain_vcompress, ele_cnt)
}.elsewhen(vlmul === 2.U) {
elements := Mux(vl < Cat(ele_cnt << 2.U), vlRemain_vcompress, ele_cnt)
}.elsewhen(vlmul === 3.U) {
elements := Mux(vl < Cat(ele_cnt << 3.U), vlRemain_vcompress, ele_cnt)
}.otherwise {
elements := Mux(vl < Cat(ele_cnt), vl, ele_cnt)
}

when(vlmul === 5.U) {
base_ele := ele_cnt >> 3.U
}.elsewhen(vlmul === 6.U) {
base_ele := ele_cnt >> 2.U
}.elsewhen(vlmul === 7.U) {
base_ele := ele_cnt >> 1.U
}.otherwise {
base_ele := ele_cnt
}

val eNum = Mux1H(UIntToOH(vsew), Seq(16, 8, 4, 2).map(num => num.U))
val maskUopIdx1H = Seq(
uopIdx >= 0.U && uopIdx <= 1.U,
uopIdx >= 2.U && uopIdx <= 4.U,
uopIdx >= 5.U && uopIdx <= 8.U,
uopIdx >= 9.U && uopIdx <= 13.U,
uopIdx >= 14.U && uopIdx <= 19.U,
uopIdx >= 20.U && uopIdx <= 26.U,
uopIdx >= 27.U && uopIdx <= 34.U,
uopIdx >= 35.U && uopIdx <= 42.U
)
val startIdx1H = Mux1H(maskUopIdx1H, Seq.tabulate(8){num => num.U})
mask_start_idx := eNum * startIdx1H
val maskPart = vmask >> mask_start_idx
mask_selected := Mux1H(UIntToOH(vsew), Seq(16, 8, 4, 2).map(num => maskPart(num - 1 ,0)))

val baseUopIdx1H = Seq(
uopIdx === 0.U || uopIdx === 2.U || uopIdx === 5.U || uopIdx === 9.U || uopIdx === 14.U || uopIdx === 20.U || uopIdx === 27.U || uopIdx === 35.U,
uopIdx === 3.U || uopIdx === 6.U || uopIdx === 10.U || uopIdx === 15.U || uopIdx === 21.U || uopIdx === 28.U || uopIdx === 36.U,
uopIdx === 7.U || uopIdx === 11.U || uopIdx === 16.U || uopIdx === 22.U || uopIdx === 29.U || uopIdx === 37.U,
uopIdx === 12.U || uopIdx === 17.U || uopIdx === 23.U || uopIdx === 30.U || uopIdx === 38.U,
uopIdx === 18.U || uopIdx === 24.U || uopIdx === 31.U || uopIdx === 39.U,
uopIdx === 25.U || uopIdx === 32.U || uopIdx === 40.U,
uopIdx === 33.U || uopIdx === 41.U,
uopIdx === 42.U
)
ones_sum_base := Mux1H(baseUopIdx1H, Seq.tabulate(8){num => num.U * base_ele})
ones_sum(0) := vmask(7, 0)

for (i <- 1 to vlenb) {
when(i.U <= elements) {
ones_sum(i) := ones_sum(i - 1) + vs1(mask_start_idx + i.U - 1.U)
}
}

dontTouch(vlRemain1H)
dontTouch(vlRemain_vcompress)
dontTouch(elements)
dontTouch(base_ele)
dontTouch(eNum)
dontTouch(mask_start_idx)
dontTouch(mask_selected)
dontTouch(Cat(ones_sum.reverse))

when(vsew === 0.U) {
for (i <- 0 until 16) {
when(i.U < elements) {
compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0)
compressed_vs2(i) := vs2(8 * i + 7, 8 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 3.U)
compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i)
}
}
}.elsewhen(vsew === 1.U) {
for (i <- 0 until 8) {
when(i.U < elements) {
compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0)
compressed_vs2(i) := vs2(16 * i + 15, 16 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 4.U)
compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i)
}
}
}.elsewhen(vsew === 2.U) {
for (i <- 0 until 4) {
when(i.U < elements) {
compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0)
compressed_vs2(i) := vs2(32 * i + 31, 32 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 5.U)
compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i)
}
}
}.otherwise {
for (i <- 0 until 2) {
when(i.U < elements) {
compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0)
compressed_vs2(i) := vs2(64 * i + 63, 64 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 6.U)
compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i)
}
}
}

select_compressed_vs2 := compressed_vs2_en.reduce(_ | _)
compressed_vs2_merged := compressed_vs2_masked.reduce(_ | _)

when(vsew === 0.U) {
for (i <- 0 until 16) {
compressed_res_8(i):= Mux(select_compressed_vs2(i), compressed_vs2_merged(8 * i + 7, 8 * i), old_vd(8 * i + 7, 8 * i))
}
}.elsewhen(vsew === 1.U) {
for (i <- 0 until 8) {
compressed_res_16(i) := Mux(select_compressed_vs2(i), compressed_vs2_merged(16 * i + 15, 16 * i), old_vd(16 * i + 15, 16 * i))
}
}.elsewhen(vsew === 2.U) {
for (i <- 0 until 4) {
compressed_res_32(i) := Mux(select_compressed_vs2(i), compressed_vs2_merged(32 * i + 31, 32 * i), old_vd(32 * i + 31, 32 * i))
}
}.otherwise {
for (i <- 0 until 2) {
compressed_res_64(i) := Mux(select_compressed_vs2(i), compressed_vs2_merged(64 * i + 63, 64 * i), old_vd(64 * i + 63, 64 * i))
}
}

when(vsew === 0.U) {
compressed_res := Cat(compressed_res_8.reverse)
}.elsewhen(vsew === 1.U) {
compressed_res := Cat(compressed_res_16.reverse)
}.elsewhen(vsew === 2.U) {
compressed_res := Cat(compressed_res_32.reverse)
}.otherwise {
compressed_res := Cat(compressed_res_64.reverse)
}

dontTouch(select_compressed_vs2)
dontTouch(Cat(compressed_vs2_en.reverse))
dontTouch(Cat(compressed_vs2_masked.reverse))
dontTouch(Cat(compressed_vs2.reverse))
dontTouch(compressed_vs2_merged)
dontTouch(compressed_res)

val base = Wire(UInt(7.W))
val vmask0 = Mux(vcompress, vs1, vmask)
val vmask1 = Mux(vcompress, vs1 >> ele_cnt, vmask >> ele_cnt)
Expand Down Expand Up @@ -328,29 +511,9 @@ class Permutation extends Module {
vstartRemain := Mux(vstart >= (uopIdx(5, 1) << vsew_plus1), vstart - (uopIdx(5, 1) << vsew_plus1), 0.U)
}

val in_previous_ones_sum = Wire(UInt(8.W))
val out_previous_ones_sum = Wire(UInt(8.W))
val res_idx = Wire(Vec(vlenb, UInt(7.W)))
val res_valid = Wire(Vec(vlenb, Bool()))
val current_ones_sum = Wire(Vec(vlenb, UInt(8.W)))
val current_uop_ones_sum = Wire(Vec(vlenb, UInt(5.W)))
val vd_reg = RegInit(0.U(VLEN.W))

in_previous_ones_sum := vmask(7, 0)
out_previous_ones_sum := current_uop_ones_sum(vlenb - 1)

for (i <- 0 until vlenb) {
current_uop_ones_sum(i) := PopCount(Cat(vmask_byte_strb.reverse)(i, 0))
current_ones_sum(i) := in_previous_ones_sum + current_uop_ones_sum(i)
}

when(vcompress && fire) {
when(uopIdx === 1.U) {
vd_reg := Cat(0.U((VLEN - 8).W), out_previous_ones_sum)
}.otherwise {
vd_reg := Cat(current_ones_sum.reverse)
}
}.elsewhen(vmvnr && fire) {
when(vmvnr && fire) {
vd_reg := vs2
}.elsewhen(vslideup && fire) {
vd_reg := Cat(vslideup_vd.reverse)
Expand Down Expand Up @@ -384,18 +547,19 @@ class Permutation extends Module {
val vlRemain_reg = RegEnable(vlRemain, 0.U, fire)
val vsew_reg = RegEnable(vsew, 0.U, fire)
val old_vd_reg = RegEnable(old_vd, 0.U, fire)
val vs2_reg = RegEnable(vs2, 0.U, fire)
val vmask_byte_strb_reg = RegEnable(Cat(vmask_byte_strb.reverse), 0.U, fire)
val ta_reg = RegEnable(ta, false.B, fire)
val vstartRemain_reg = RegEnable(vstartRemain, 0.U, fire)
val base_reg = RegEnable(base, 0.U, fire)
val one_reg = RegEnable(in_previous_ones_sum, 0.U, fire)
val vstart_reg = RegEnable(vstart, 0.U, fire)
val vl_reg = RegEnable(Mux(vmvnr, evl, vl), 0.U, fire)
val ones_sum_base_reg = RegEnable(ones_sum_base, 0.U, fire)
val vm_reg = RegEnable(vm, 0.U, fire)
val mask_selected_reg = RegEnable(mask_selected, 0.U, fire)
val ma_reg = RegEnable(ma, 0.U, fire)
val compressed_res_reg = RegEnable(compressed_res, 0.U, fire)
val ones_sum_reg = RegEnable(ones_sum(elements), 0.U, fire)

val vlRemainBytes_reg = vlRemain_reg << vsew_reg
val vstartRemainBytes_reg = vstartRemain_reg << vsew_reg
val current_res_boundary = Mux(uopIdx_reg === 3.U, (2 * vlenb).U, vlenb.U)

vslideup_vl := Mux(vslideup & (slide_ele > vl), Mux(slide_ele > VLEN.U, VLEN.U, slide_ele), vl)
val tail_bytes = Mux((vlRemainBytes_reg >= vlenb.U), 0.U, vlenb.U - vlRemainBytes_reg)
Expand All @@ -413,17 +577,83 @@ class Permutation extends Module {
vmask_vstart_bits := vd_mask << vstart_bits
val vstart_old_vd = old_vd_reg & (~vmask_vstart_bits)

val cmprs_vd = Wire(Vec(vlenb, UInt(8.W)))
for (i <- 0 until vlenb) {
cmprs_vd(i) := Mux(ta_reg && ((i.U >= one_reg) && (uopIdx_reg =/= 3.U) || (uopIdx_reg === 3.U)), "hff".U, old_vd_reg(i * 8 + 7, i * 8))
val cmprs_vd = Wire(UInt(VLEN.W))
val cmprs_vd_8 = WireInit(VecInit(Seq.fill(16)(0.U(8.W))))
val cmprs_vd_16 = WireInit(VecInit(Seq.fill(8)(0.U(16.W))))
val cmprs_vd_32 = WireInit(VecInit(Seq.fill(4)(0.U(32.W))))
val cmprs_vd_64 = WireInit(VecInit(Seq.fill(2)(0.U(64.W))))
val res_keep_old_vd_8 = WireInit(VecInit(Seq.fill(16)(false.B)))
val res_keep_old_vd_16 = WireInit(VecInit(Seq.fill(8)(false.B)))
val res_keep_old_vd_32 = WireInit(VecInit(Seq.fill(4)(false.B)))
val res_keep_old_vd_64 = WireInit(VecInit(Seq.fill(2)(false.B)))
val res_agnostic_8 = WireInit(VecInit(Seq.fill(16)(false.B)))
val res_agnostic_16 = WireInit(VecInit(Seq.fill(8)(false.B)))
val res_agnostic_32 = WireInit(VecInit(Seq.fill(4)(false.B)))
val res_agnostic_64 = WireInit(VecInit(Seq.fill(2)(false.B)))

when(vsew_reg === 0.U) {
for (i <- 0 until 16) {
res_keep_old_vd_8(i) := ((~vm_reg & ~mask_selected_reg(i)) & ~ma_reg) | (ones_sum_base_reg + i.U < vstart_reg) | ((ones_sum_base_reg + i.U >= ones_sum_reg) & ~ta_reg)
res_agnostic_8(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) | ((~vm_reg & ~mask_selected_reg(i)) & ma_reg)
when(res_keep_old_vd_8(i)) {
cmprs_vd_8(i) := old_vd_reg(8 * i + 7, 8 * i)
}.elsewhen(res_agnostic_8(i)) {
cmprs_vd_8(i) := Fill(8, 1.U)
}.otherwise {
cmprs_vd_8(i) := compressed_res_reg(8 * i + 7, 8 * i)
}
}
}.elsewhen(vsew === 1.U) {
for (i <- 0 until 8) {
res_keep_old_vd_16(i) := ((~vm_reg & ~mask_selected_reg(i)) & ~ma_reg) | (ones_sum_base_reg + i.U < vstart_reg) | ((ones_sum_base_reg + i.U >= ones_sum_reg) & ~ta_reg)
res_agnostic_16(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) | ((~vm_reg & ~mask_selected_reg(i)) & ma_reg)
when(res_keep_old_vd_16(i)) {
cmprs_vd_16(i) := old_vd_reg(16 * i + 15, 16 * i)
}.elsewhen(res_agnostic_16(i)) {
cmprs_vd_16(i) := Fill(16, 1.U)
}.otherwise {
cmprs_vd_16(i) := compressed_res_reg(16 * i + 15, 16 * i)
}
}
}.elsewhen(vsew === 2.U) {
for (i <- 0 until 4) {
res_keep_old_vd_32(i) := ((~vm_reg & ~mask_selected_reg(i)) & ~ma_reg) | (ones_sum_base_reg + i.U < vstart_reg) | ((ones_sum_base_reg + i.U >= ones_sum_reg) & ~ta_reg)
res_agnostic_32(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) | ((~vm_reg & ~mask_selected_reg(i)) & ma_reg)
when(res_keep_old_vd_32(i)) {
cmprs_vd_32(i) := old_vd_reg(32 * i + 31, 32 * i)
}.elsewhen(res_agnostic_32(i)) {
cmprs_vd_32(i) := Fill(32, 1.U)
}.otherwise {
cmprs_vd_32(i) := compressed_res_reg(32 * i + 31, 32 * i)
}
}
}.otherwise {
for (i <- 0 until 2) {
res_keep_old_vd_64(i) := ((~vm_reg & ~mask_selected_reg(i)) & ~ma_reg) | (ones_sum_base_reg + i.U < vstart_reg) | ((ones_sum_base_reg + i.U >= ones_sum_reg) & ~ta_reg)
res_agnostic_64(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) | ((~vm_reg & ~mask_selected_reg(i)) & ma_reg)
when(res_keep_old_vd_64(i)) {
cmprs_vd_64(i) := old_vd_reg(64 * i + 63, 64 * i)
}.elsewhen(res_agnostic_64(i)) {
cmprs_vd_64(i) := Fill(64, 1.U)
}.otherwise {
cmprs_vd_64(i) := compressed_res_reg(64 * i + 63, 64 * i)
}
}
}

for (i <- 0 until vlenb) {
res_idx(i) := vd_reg(8 * (i + 1) - 1, 8 * i) - base_reg - 1.U
res_valid(i) := vd_reg(8 * (i + 1) - 1, 8 * i) >= base_reg + 1.U
when((vmask_byte_strb_reg(i) === 1.U) && res_valid(i) && (res_idx(i) < current_res_boundary)) {
cmprs_vd(res_idx(i)) := vs2_reg(i * 8 + 7, i * 8)
}
dontTouch(Cat(cmprs_vd_8.reverse))
dontTouch(Cat(cmprs_vd_16.reverse))
dontTouch(Cat(cmprs_vd_32.reverse))
dontTouch(Cat(cmprs_vd_64.reverse))

when(vsew_reg === 0.U) {
cmprs_vd := Cat(cmprs_vd_8.reverse)
}.elsewhen(vsew_reg === 1.U) {
cmprs_vd := Cat(cmprs_vd_16.reverse)
}.elsewhen(vsew_reg === 2.U) {
cmprs_vd := Cat(cmprs_vd_32.reverse)
}.otherwise {
cmprs_vd := Cat(cmprs_vd_64.reverse)
}

perm_tail_mask_vd := vd_reg
Expand All @@ -437,10 +667,11 @@ class Permutation extends Module {
when(vstart_reg >= vl_reg) {
perm_vd := old_vd_reg
}.elsewhen(is_vcompress_reg) {
when(uopIdx_reg === 1.U) {
perm_vd := vd_reg
when(uopIdx_reg === 1.U || uopIdx_reg === 4.U || uopIdx_reg === 8.U || uopIdx_reg === 13.U || uopIdx_reg === 19.U ||
uopIdx_reg === 26.U || uopIdx_reg === 34.U) {
perm_vd := ones_sum_reg
}.otherwise {
perm_vd := Cat(cmprs_vd.reverse)
perm_vd := cmprs_vd
}
}

Expand Down
Loading