diff --git a/src/main/scala/yunsuan/vector/VectorPerm/Permutation.scala b/src/main/scala/yunsuan/vector/VectorPerm/Permutation.scala index 09f8014..8bba9a2 100644 --- a/src/main/scala/yunsuan/vector/VectorPerm/Permutation.scala +++ b/src/main/scala/yunsuan/vector/VectorPerm/Permutation.scala @@ -6,6 +6,7 @@ import chisel3.util.experimental.decode.TruthTable import scala.language.{existentials, postfixOps} import yunsuan.vector._ import chisel3.util.experimental.decode.{QMCMinimizer, TruthTable, decoder} +import yunsuan.util.GatedValidRegNext class slideupVs2VdTable() extends Module { // convert uop index of slide instruction to offset of vs2 and vd @@ -106,6 +107,7 @@ class Permutation extends Module { val vl = io.in.bits.info.vl val uopIdx = io.in.bits.info.uopIdx val fire = io.in.valid + val fire_reg0 = GatedValidRegNext(fire) val vsew = srcTypeVs2(1, 0) val vsew_plus1 = Wire(UInt(3.W)) @@ -116,7 +118,6 @@ class Permutation extends Module { val ele_cnt = vlenb.U >> vsew val vlRemain = Wire(UInt(8.W)) - val vlRemainBytes = vlRemain << vsew val eewVs1 = SewOH(srcTypeVs1(1, 0)) val eewVs2 = SewOH(srcTypeVs2(1, 0)) val eewVd = SewOH(vdType(1, 0)) @@ -133,23 +134,41 @@ class Permutation extends Module { val vrgather16_sew8 = opcode.isVrgather && (srcTypeVs1 === 1.U) && (srcTypeVs2 === 0.U) val vslide = vslideup || vslide1up || vslidedn || vslide1dn + /** + * vcompress + * + * cycle0 + * vlRemain_vcompress (mux8) + * elements (mux7) + * mask_start_idx(mux8, mul) + * ones_sum_base(mux8, mul) + * ones_sum(mask_start_idx, acc sum) + * + * cycle1 + * compressed_vs2_en(mask_start_idx) + * compressed_vs2 + * compressed_vs2_masked(compressed_vs2) + * select_compressed_vs2(compressed_vs2_en, or) + * compressed_vs2_merged(compressed_vs2_masked, or) + * compressed_res_8 + * compressed_res + * + * cycle2 + * res_agnostic_8 + * cmprd_vd_8 + * cmprd_vd(mux4) + * + */ + + /** + * vcompress cycle0 + * -----begin----- + */ val vlRemain_vcompress = Wire(UInt(8.W)) val elements = Wire(UInt(5.W)) val base_ele = Wire(UInt(5.W)) val mask_start_idx = Wire(UInt(7.W)) - val mask_selected = Wire(UInt(16.W)) val ones_sum_base = Wire(UInt(8.W)) - val ones_sum = WireInit(VecInit(Seq.fill(vlenb + 1)(0.U(8.W)))) - val compressed_vs2_en = WireInit(VecInit(Seq.fill(vlenb)(0.U(16.W)))) - val compressed_vs2 = WireInit(VecInit(Seq.fill(vlenb)(0.U(VLEN.W)))) - val compressed_vs2_masked = WireInit(VecInit(Seq.fill(vlenb)(0.U(VLEN.W)))) - val compressed_res = Wire(UInt(VLEN.W)) - val compressed_res_8 = WireInit(VecInit(Seq.fill(16)(0.U(8.W)))) - val compressed_res_16 = WireInit(VecInit(Seq.fill(8)(0.U(16.W)))) - val compressed_res_32 = WireInit(VecInit(Seq.fill(4)(0.U(32.W)))) - val compressed_res_64 = WireInit(VecInit(Seq.fill(2)(0.U(64.W)))) - val select_compressed_vs2 = Wire(UInt(vlenb.W)) - val compressed_vs2_merged = Wire(UInt(VLEN.W)) val vlRemain1H = VecInit(Seq( uopIdx === 0.U || uopIdx === 1.U, @@ -199,7 +218,7 @@ class Permutation extends Module { base_ele := ele_cnt } - val eNum = Mux1H(UIntToOH(vsew), Seq(16, 8, 4, 2).map(num => num.U)) + val eNum = Mux1H(UIntToOH(vsew), Seq(4, 3, 2, 1).map(num => num.U)) val maskUopIdx1H = Seq( uopIdx >= 0.U && uopIdx <= 1.U, uopIdx >= 2.U && uopIdx <= 4.U, @@ -210,10 +229,7 @@ class Permutation extends Module { uopIdx >= 27.U && uopIdx <= 34.U, uopIdx >= 35.U && uopIdx <= 42.U ) - val startIdx1H = Mux1H(maskUopIdx1H, Seq.tabulate(8){num => num.U}) - mask_start_idx := eNum * startIdx1H - val maskPart = vmask >> mask_start_idx - mask_selected := Mux1H(UIntToOH(vsew), Seq(16, 8, 4, 
2).map(num => maskPart(num - 1 ,0))) + mask_start_idx := Mux1H(maskUopIdx1H, Seq.tabulate(8){num => num.U << eNum}) val baseUopIdx1H = Seq( uopIdx === 0.U || uopIdx === 2.U || uopIdx === 5.U || uopIdx === 9.U || uopIdx === 14.U || uopIdx === 20.U || uopIdx === 27.U || uopIdx === 35.U, @@ -226,99 +242,174 @@ class Permutation extends Module { uopIdx === 42.U ) ones_sum_base := Mux1H(baseUopIdx1H, Seq.tabulate(8){num => num.U * base_ele}) - ones_sum(0) := vmask(7, 0) - for (i <- 1 to vlenb) { - when(i.U <= elements) { - ones_sum(i) := ones_sum(i - 1) + vs1(mask_start_idx + i.U - 1.U) + val in_previous_ones_sum = Wire(UInt(8.W)) + val out_previous_ones_sum = Wire(UInt(8.W)) + val current_uop_ones_sum = WireInit(VecInit(Seq.fill(vlenb)(0.U(5.W)))) + val current_ones_sum = WireInit(VecInit(Seq.fill(vlenb)(0.U(8.W)))) + + val vs1_mask = Wire(Vec(vlenb, UInt(1.W))) + for (i <- 0 until vlenb) { + vs1_mask(i) := vs1(mask_start_idx + i.U) + } + + in_previous_ones_sum := vmask(7, 0) + out_previous_ones_sum := current_uop_ones_sum(elements - 1.U) + + for (i <- 0 until vlenb) { + when(i.U < elements) { + current_uop_ones_sum(i) := PopCount(Cat(vs1_mask.reverse)(i, 0)) + } + when(i.U < elements - 1.U) { + current_ones_sum(i) := in_previous_ones_sum + current_uop_ones_sum(i) } } - dontTouch(vlRemain1H) - dontTouch(vlRemain_vcompress) - dontTouch(elements) - dontTouch(base_ele) - dontTouch(eNum) - dontTouch(mask_start_idx) - dontTouch(mask_selected) - dontTouch(Cat(ones_sum.reverse)) + /** + * vcompress cycle0 + * ----end----- + */ + + + /** + * vcompress cycle1 + * -----begin----- + */ + val vsew_reg0 = RegEnable(vsew, 0.U, fire) + val elements_reg0 = RegEnable(elements, 0.U, fire) + val mask_start_idx_reg0 = RegEnable(mask_start_idx, 0.U, fire) + val vs1_reg0 = RegEnable(vs1, 0.U, fire) + val vs2_reg0 = RegEnable(vs2, 0.U, fire) + val old_vd_reg0 = RegEnable(old_vd, 0.U, fire) + val current_ones_sum_reg0 = RegEnable(current_ones_sum, fire) + val in_previous_ones_sum_reg0 = RegEnable(in_previous_ones_sum, fire) + val out_previous_ones_sum_reg0 = RegEnable(out_previous_ones_sum, fire) + val ones_sum_base_reg0 = RegEnable(ones_sum_base, fire) + val ele_cnt_reg0 = RegEnable(ele_cnt, fire) + + val ones_sum_reg0 = WireInit(VecInit(Seq.fill(vlenb)(0.U(8.W)))) + ones_sum_reg0(0) := in_previous_ones_sum_reg0 + for (i <- 1 until vlenb) { + when(i.U < elements_reg0) { + ones_sum_reg0(i) := current_ones_sum_reg0(i-1) + } + } - when(vsew === 0.U) { + val ones_sum_eles_reg0 = Wire(UInt(8.W)) + ones_sum_eles_reg0 := in_previous_ones_sum_reg0 + out_previous_ones_sum_reg0 + + val compressed_vs2_en_reg0 = WireInit(VecInit(Seq.fill(vlenb)(0.U(16.W)))) + val compressed_vs2_reg0 = WireInit(VecInit(Seq.fill(vlenb)(0.U(VLEN.W)))) + val compressed_vs2_masked_reg0 = WireInit(VecInit(Seq.fill(vlenb)(0.U(VLEN.W)))) + val compressed_res_reg0 = Wire(UInt(VLEN.W)) + val compressed_res_8_reg0 = WireInit(VecInit(Seq.fill(16)(0.U(8.W)))) + val compressed_res_16_reg0 = WireInit(VecInit(Seq.fill(8)(0.U(16.W)))) + val compressed_res_32_reg0 = WireInit(VecInit(Seq.fill(4)(0.U(32.W)))) + val compressed_res_64_reg0 = WireInit(VecInit(Seq.fill(2)(0.U(64.W)))) + val select_compressed_vs2_reg0 = Wire(UInt(vlenb.W)) + val compressed_vs2_merged_reg0 = Wire(UInt(VLEN.W)) + + when(vsew_reg0 === 0.U) { for (i <- 0 until 16) { - when(i.U < elements) { - compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0) - compressed_vs2(i) := 
vs2(8 * i + 7, 8 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 3.U) - compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i) + when(i.U < elements_reg0) { + compressed_vs2_en_reg0(i) := (vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) << (ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) + compressed_vs2_reg0(i) := vs2_reg0(8 * i + 7, 8 * i) << ((ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) << 3.U) + compressed_vs2_masked_reg0(i) := Fill(VLEN, vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) & compressed_vs2_reg0(i) } } - }.elsewhen(vsew === 1.U) { + }.elsewhen(vsew_reg0 === 1.U) { for (i <- 0 until 8) { - when(i.U < elements) { - compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0) - compressed_vs2(i) := vs2(16 * i + 15, 16 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 4.U) - compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i) + when(i.U < elements_reg0) { + compressed_vs2_en_reg0(i) := (vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) << (ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) + compressed_vs2_reg0(i) := vs2_reg0(16 * i + 15, 16 * i) << ((ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) << 4.U) + compressed_vs2_masked_reg0(i) := Fill(VLEN, vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) & compressed_vs2_reg0(i) } } - }.elsewhen(vsew === 2.U) { + }.elsewhen(vsew_reg0 === 2.U) { for (i <- 0 until 4) { - when(i.U < elements) { - compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0) - compressed_vs2(i) := vs2(32 * i + 31, 32 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 5.U) - compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i) + when(i.U < elements_reg0) { + compressed_vs2_en_reg0(i) := (vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) << (ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) + compressed_vs2_reg0(i) := vs2_reg0(32 * i + 31, 32 * i) << ((ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) << 5.U) + compressed_vs2_masked_reg0(i) := Fill(VLEN, vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) & compressed_vs2_reg0(i) } } }.otherwise { for (i <- 0 until 2) { - when(i.U < elements) { - compressed_vs2_en(i) := (vs1(mask_start_idx + i.U) & (0.U <= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) << (ones_sum(i) - ones_sum_base)(3, 0) - compressed_vs2(i) := vs2(64 * i + 63, 64 * i) << ((ones_sum(i) - ones_sum_base)(3, 0) << 6.U) - compressed_vs2_masked(i) := Fill(VLEN, vs1(mask_start_idx + i.U) & (0.U 
<= (ones_sum(i) - ones_sum_base)) & ((ones_sum(i) - ones_sum_base) < ele_cnt)) & compressed_vs2(i) + when(i.U < elements_reg0) { + compressed_vs2_en_reg0(i) := (vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) << (ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) + compressed_vs2_reg0(i) := vs2_reg0(64 * i + 63, 64 * i) << ((ones_sum_reg0(i) - ones_sum_base_reg0)(3, 0) << 6.U) + compressed_vs2_masked_reg0(i) := Fill(VLEN, vs1_reg0(mask_start_idx_reg0 + i.U) & (0.U <= (ones_sum_reg0(i) - ones_sum_base_reg0)) & ((ones_sum_reg0(i) - ones_sum_base_reg0) < ele_cnt_reg0)) & compressed_vs2_reg0(i) } } } - select_compressed_vs2 := compressed_vs2_en.reduce(_ | _) - compressed_vs2_merged := compressed_vs2_masked.reduce(_ | _) + select_compressed_vs2_reg0 := compressed_vs2_en_reg0.reduce(_ | _) + compressed_vs2_merged_reg0 := compressed_vs2_masked_reg0.reduce(_ | _) - when(vsew === 0.U) { + when(vsew_reg0 === 0.U) { for (i <- 0 until 16) { - compressed_res_8(i):= Mux(select_compressed_vs2(i), compressed_vs2_merged(8 * i + 7, 8 * i), old_vd(8 * i + 7, 8 * i)) + compressed_res_8_reg0(i):= Mux(select_compressed_vs2_reg0(i), compressed_vs2_merged_reg0(8 * i + 7, 8 * i), old_vd_reg0(8 * i + 7, 8 * i)) } - }.elsewhen(vsew === 1.U) { + }.elsewhen(vsew_reg0 === 1.U) { for (i <- 0 until 8) { - compressed_res_16(i) := Mux(select_compressed_vs2(i), compressed_vs2_merged(16 * i + 15, 16 * i), old_vd(16 * i + 15, 16 * i)) + compressed_res_16_reg0(i) := Mux(select_compressed_vs2_reg0(i), compressed_vs2_merged_reg0(16 * i + 15, 16 * i), old_vd_reg0(16 * i + 15, 16 * i)) } - }.elsewhen(vsew === 2.U) { + }.elsewhen(vsew_reg0 === 2.U) { for (i <- 0 until 4) { - compressed_res_32(i) := Mux(select_compressed_vs2(i), compressed_vs2_merged(32 * i + 31, 32 * i), old_vd(32 * i + 31, 32 * i)) + compressed_res_32_reg0(i) := Mux(select_compressed_vs2_reg0(i), compressed_vs2_merged_reg0(32 * i + 31, 32 * i), old_vd_reg0(32 * i + 31, 32 * i)) } }.otherwise { for (i <- 0 until 2) { - compressed_res_64(i) := Mux(select_compressed_vs2(i), compressed_vs2_merged(64 * i + 63, 64 * i), old_vd(64 * i + 63, 64 * i)) + compressed_res_64_reg0(i) := Mux(select_compressed_vs2_reg0(i), compressed_vs2_merged_reg0(64 * i + 63, 64 * i), old_vd_reg0(64 * i + 63, 64 * i)) } } - when(vsew === 0.U) { - compressed_res := Cat(compressed_res_8.reverse) - }.elsewhen(vsew === 1.U) { - compressed_res := Cat(compressed_res_16.reverse) - }.elsewhen(vsew === 2.U) { - compressed_res := Cat(compressed_res_32.reverse) + when(vsew_reg0 === 0.U) { + compressed_res_reg0 := Cat(compressed_res_8_reg0.reverse) + }.elsewhen(vsew_reg0 === 1.U) { + compressed_res_reg0 := Cat(compressed_res_16_reg0.reverse) + }.elsewhen(vsew_reg0 === 2.U) { + compressed_res_reg0 := Cat(compressed_res_32_reg0.reverse) }.otherwise { - compressed_res := Cat(compressed_res_64.reverse) + compressed_res_reg0 := Cat(compressed_res_64_reg0.reverse) } - dontTouch(select_compressed_vs2) - dontTouch(Cat(compressed_vs2_en.reverse)) - dontTouch(Cat(compressed_vs2_masked.reverse)) - dontTouch(Cat(compressed_vs2.reverse)) - dontTouch(compressed_vs2_merged) - dontTouch(compressed_res) +// dontTouch(select_compressed_vs2) +// dontTouch(Cat(compressed_vs2_en.reverse)) +// dontTouch(Cat(compressed_vs2_masked.reverse)) +// dontTouch(Cat(compressed_vs2.reverse)) +// dontTouch(compressed_vs2_merged) +// dontTouch(compressed_res) + /** + * vcompress cycle1 + * -----end------ + */ + + /** + * vslideup,vslidedn + * + * 
cycle0 + * vlRemain + * base + * vmask_byte_strb + * vrgather_byte_sel + * + * clcye1 + * vrgather_vd + * vslide1dn_vd + * vslideup_vd + * vslide1up_vd + * vstartRemain + * + * cycle2 + * vd_reg + * + */ val base = Wire(UInt(7.W)) val vmask0 = vmask val vmask_uop = Wire(UInt(VLEN.W)) - val vmask_byte_strb = Wire(Vec(vlenb, UInt(1.W))) + val vmask_byte_strb_reg0 = Wire(Vec(vlenb, UInt(1.W))) val vs1_bytes = VecInit(Seq.tabulate(vlenb)(i => vs1((i + 1) * 8 - 1, i * 8))) val vs2_bytes = VecInit(Seq.tabulate(vlenb)(i => vs2((i + 1) * 8 - 1, i * 8))) val emul = vlmul(1, 0) @@ -373,8 +464,8 @@ class Permutation extends Module { (vslidedn) -> vslidednVs2Id, )) - dontTouch(vdId) - dontTouch(vs2Id) + // dontTouch(vdId) + // dontTouch(vs2Id) val vslideup_vl = Wire(UInt(8.W)) vlRemain := vslideup_vl @@ -396,19 +487,26 @@ class Permutation extends Module { } base := Mux1H(Seq.tabulate(8)(i => (vs2Id === i.U) -> (vlenb * i).U)) + // dontTouch(base) + + val vlRemain_reg0 = RegEnable(vlRemain, 0.U, fire) + val vmask_uop_reg0 = RegEnable(vmask_uop, fire) + val vm_reg0 = RegEnable(vm, fire) + + val vlRemainBytes_reg0 = vlRemain_reg0 << vsew_reg0 for (i <- 0 until vlenb) { - when(i.U < vlRemainBytes) { - vmask_byte_strb(i) := vmask_uop(i) | vm - when(vsew === 1.U(3.W)) { - vmask_byte_strb(i) := vmask_uop(i / 2) | vm - }.elsewhen(vsew === 2.U(3.W)) { - vmask_byte_strb(i) := vmask_uop(i / 4) | vm - }.elsewhen(vsew === 3.U(3.W)) { - vmask_byte_strb(i) := vmask_uop(i / 8) | vm + when(i.U < vlRemainBytes_reg0) { + vmask_byte_strb_reg0(i) := vmask_uop_reg0(i) | vm_reg0 + when(vsew_reg0 === 1.U(3.W)) { + vmask_byte_strb_reg0(i) := vmask_uop_reg0(i / 2) | vm_reg0 + }.elsewhen(vsew_reg0 === 2.U(3.W)) { + vmask_byte_strb_reg0(i) := vmask_uop_reg0(i / 4) | vm_reg0 + }.elsewhen(vsew_reg0 === 3.U(3.W)) { + vmask_byte_strb_reg0(i) := vmask_uop_reg0(i / 8) | vm_reg0 } }.otherwise { - vmask_byte_strb(i) := 0.U + vmask_byte_strb_reg0(i) := 0.U } } @@ -421,9 +519,9 @@ class Permutation extends Module { (vs2Id === 0.U) -> vlmax_bytes, ) ++ (1 until 8).map(i => (vs2Id === i.U) -> (vlenb * (i + 1)).U)) - dontTouch(vs2_bytes_min) - dontTouch(vs2_bytes_max) - val vrgather_vd = Wire(Vec(vlenb, UInt(8.W))) + // dontTouch(vs2_bytes_min) + // dontTouch(vs2_bytes_max) + val vrgather_vd_reg0 = Wire(Vec(vlenb, UInt(8.W))) vlmax_bytes := vlenb.U when(vlmul === 5.U) { @@ -436,7 +534,7 @@ class Permutation extends Module { for (i <- 0 until vlenb) { vrgather_byte_sel(i) := 0.U - vrgather_vd(i) := 0.U + vrgather_vd_reg0(i) := 0.U } for (i <- 0 until vlenb / 2) { @@ -527,16 +625,26 @@ class Permutation extends Module { } } - when((vrgather || vrgather_vx) && fire) { + val base_reg0 = RegEnable(base, fire) + val vrgather_byte_sel_reg0 = RegEnable(vrgather_byte_sel, fire) + val is_vrgather_reg0 = RegEnable(vrgather, false.B, fire) + val is_vrgather_vx_reg0 = RegEnable(vrgather_vx, false.B, fire) + val ma_reg0 = RegEnable(ma, fire) + val vs2_bytes_min_reg0 = RegEnable(vs2_bytes_min, fire) + val vs2_bytes_max_reg0 = RegEnable(vs2_bytes_max, fire) + val vs2_bytes_reg0 = RegEnable(vs2_bytes, fire) + val first_gather_reg0 = RegEnable(first_gather, fire) + + when((is_vrgather_reg0 || is_vrgather_vx_reg0) && fire_reg0) { for (i <- 0 until vlenb) { - vrgather_vd(i) := Mux(ma, "hff".U, old_vd((i + 1) * 8 - 1, i * 8)) - when(vmask_byte_strb(i).asBool) { - when((vrgather_byte_sel(i) >= vs2_bytes_min) && (vrgather_byte_sel(i) < vs2_bytes_max)) { - vrgather_vd(i) := vs2_bytes((vrgather_byte_sel(i) - vs2_bytes_min)(vlenbWidth - 1, 0)) - 
}.elsewhen(first_gather) { - vrgather_vd(i) := 0.U + vrgather_vd_reg0(i) := Mux(ma_reg0, "hff".U, old_vd_reg0((i + 1) * 8 - 1, i * 8)) + when(vmask_byte_strb_reg0(i).asBool) { + when((vrgather_byte_sel_reg0(i) >= vs2_bytes_min_reg0) && (vrgather_byte_sel_reg0(i) < vs2_bytes_max_reg0)) { + vrgather_vd_reg0(i) := vs2_bytes_reg0((vrgather_byte_sel_reg0(i) - vs2_bytes_min_reg0)(vlenbWidth - 1, 0)) + }.elsewhen(first_gather_reg0) { + vrgather_vd_reg0(i) := 0.U }.otherwise { - vrgather_vd(i) := old_vd((i + 1) * 8 - 1, i * 8) + vrgather_vd_reg0(i) := old_vd_reg0((i + 1) * 8 - 1, i * 8) } } } @@ -545,76 +653,85 @@ class Permutation extends Module { // vslideup/vslide1up val slide_ele = Mux(vslide1up || vslide1dn, 1.U, vs1(xLen, 0)) val slide_bytes = slide_ele << vsew - val vslideup_vd = Wire(Vec(vlenb, UInt(8.W))) - val vslide1up_vd = Wire(Vec(vlenb, UInt(8.W))) - val vslidedn_vd = Wire(Vec(vlenb, UInt(8.W))) - val vslide1dn_vd_wo_rs1 = Wire(Vec(vlenb, UInt(8.W))) - val vslide1dn_vd_rs1 = Wire(UInt(VLEN.W)) + val vslideup_vd_reg0 = Wire(Vec(vlenb, UInt(8.W))) + val vslide1up_vd_reg0 = Wire(Vec(vlenb, UInt(8.W))) + val vslidedn_vd_reg0 = Wire(Vec(vlenb, UInt(8.W))) + val vslide1dn_vd_wo_rs1_reg0 = Wire(Vec(vlenb, UInt(8.W))) + val vslide1dn_vd_rs1_reg0 = Wire(UInt(VLEN.W)) val first_slidedn = vslidedn && vslidednOffset.outIsFirst val load_rs1 = (((vlmul >= 4.U) || (vlmul === 0.U)) && (uopIdx === 0.U)) || ((vlmul === 1.U) && (uopIdx === 2.U)) || ((vlmul === 2.U) && (uopIdx === 6.U)) || (uopIdx === 14.U) - val vslide1dn_vd = Mux((load_rs1 || uopIdx(0)), VecInit(Seq.tabulate(vlenb)(i => vslide1dn_vd_rs1((i + 1) * 8 - 1, i * 8))), vslide1dn_vd_wo_rs1) - dontTouch(base) + + val uopIdx_reg0 = RegEnable(uopIdx, 0.U, fire) + val load_rs1_reg0 = RegEnable(load_rs1, fire) + val slide_bytes_reg0 = RegEnable(slide_bytes, fire) + val vlmax_bytes_reg0 = RegEnable(vlmax_bytes, fire) + val first_slidedn_reg0 = RegEnable(first_slidedn, fire) + val is_vslide1up_reg0 = RegEnable(vslide1up, false.B, fire) + val is_vslide1dn_reg0 = RegEnable(vslide1dn, false.B, fire) + val vsew_bytes_reg0 = RegEnable(vsew_bytes, fire) + val vs1_bytes_reg0 = RegEnable(vs1_bytes, fire) for (i <- 0 until vlenb) { - vslideup_vd(i) := Mux(ma, "hff".U, old_vd(i * 8 + 7, i * 8)) - when(vmask_byte_strb(i).asBool) { - when(((base +& i.U) >= slide_bytes) && ((base +& i.U - slide_bytes) < vlmax_bytes)) { - vslideup_vd(i) := vs2_bytes((base +& i.U - slide_bytes)(vlenbWidth - 1, 0)) + vslideup_vd_reg0(i) := Mux(ma_reg0, "hff".U, old_vd_reg0(i * 8 + 7, i * 8)) + when(vmask_byte_strb_reg0(i).asBool) { + when(((base_reg0 +& i.U) >= slide_bytes_reg0) && ((base_reg0 +& i.U - slide_bytes_reg0) < vlmax_bytes_reg0)) { + vslideup_vd_reg0(i) := vs2_bytes_reg0((base_reg0 +& i.U - slide_bytes_reg0)(vlenbWidth - 1, 0)) }.otherwise { - vslideup_vd(i) := old_vd(i * 8 + 7, i * 8) + vslideup_vd_reg0(i) := old_vd_reg0(i * 8 + 7, i * 8) } } } for (i <- 0 until vlenb) { - vslidedn_vd(i) := Mux(ma, "hff".U, old_vd(i * 8 + 7, i * 8)) - when(vmask_byte_strb(i).asBool) { - when(((i.U +& slide_bytes) >= base) && ((i.U +& slide_bytes - base) < vlmax_bytes)) { - vslidedn_vd(i) := vs2_bytes((i.U +& slide_bytes - base)(vlenbWidth - 1, 0)) - }.elsewhen(first_slidedn) { - vslidedn_vd(i) := 0.U + vslidedn_vd_reg0(i) := Mux(ma_reg0, "hff".U, old_vd_reg0(i * 8 + 7, i * 8)) + when(vmask_byte_strb_reg0(i).asBool) { + when(((i.U +& slide_bytes_reg0) >= base_reg0) && ((i.U +& slide_bytes_reg0 - base_reg0) < vlmax_bytes_reg0)) { + vslidedn_vd_reg0(i) := vs2_bytes_reg0((i.U +& 
slide_bytes_reg0 - base_reg0)(vlenbWidth - 1, 0)) + }.elsewhen(first_slidedn_reg0) { + vslidedn_vd_reg0(i) := 0.U }.otherwise { - vslidedn_vd(i) := old_vd(i * 8 + 7, i * 8) + vslidedn_vd_reg0(i) := old_vd_reg0(i * 8 + 7, i * 8) } } } for (i <- 0 until vlenb) { - vslide1up_vd(i) := Mux(ma, "hff".U, old_vd(i * 8 + 7, i * 8)) - when(vslide1up && (vmask_byte_strb(i) === 1.U)) { - when((i.U < vsew_bytes)) { - vslide1up_vd(i) := vs1_bytes((vlenb.U - vsew_bytes + i.U)(vlenbWidth - 1, 0)) + vslide1up_vd_reg0(i) := Mux(ma_reg0, "hff".U, old_vd_reg0(i * 8 + 7, i * 8)) + when(is_vslide1up_reg0 && (vmask_byte_strb_reg0(i) === 1.U)) { + when((i.U < vsew_bytes_reg0)) { + vslide1up_vd_reg0(i) := vs1_bytes_reg0((vlenb.U - vsew_bytes_reg0 + i.U)(vlenbWidth - 1, 0)) }.otherwise { - vslide1up_vd(i) := vs2_bytes(i.U - vsew_bytes) + vslide1up_vd_reg0(i) := vs2_bytes_reg0(i.U - vsew_bytes_reg0) } } } for (i <- 0 until vlenb) { - vslide1dn_vd_wo_rs1(i) := Mux(ma, "hff".U, old_vd(i * 8 + 7, i * 8)) - when(vslide1dn && !uopIdx(0) && (vmask_byte_strb(i) === 1.U)) { - when(i.U < (vlenb.U - vsew_bytes)) { - vslide1dn_vd_wo_rs1(i) := vs2_bytes(vsew_bytes + i.U) + vslide1dn_vd_wo_rs1_reg0(i) := Mux(ma_reg0, "hff".U, old_vd_reg0(i * 8 + 7, i * 8)) + when(is_vslide1dn_reg0 && !uopIdx_reg0(0) && (vmask_byte_strb_reg0(i) === 1.U)) { + when(i.U < (vlenb.U - vsew_bytes_reg0)) { + vslide1dn_vd_wo_rs1_reg0(i) := vs2_bytes_reg0(vsew_bytes_reg0 + i.U) }.otherwise { - vslide1dn_vd_wo_rs1(i) := vs1_bytes(i.U + vsew_bytes - vlenb.U) + vslide1dn_vd_wo_rs1_reg0(i) := vs1_bytes_reg0(i.U + vsew_bytes_reg0 - vlenb.U) } } } - val rs1_old_vd = Mux(load_rs1, Cat(vslide1dn_vd_wo_rs1.reverse), old_vd) - vslide1dn_vd_rs1 := Mux(load_rs1, Cat(vslide1dn_vd_wo_rs1.reverse), old_vd) - when(load_rs1 || uopIdx(0)) { - when((vlRemainBytes > 0.U) && (vlRemainBytes <= vlenb.U) && (vmask_byte_strb(vlRemainBytes - 1.U).asBool)) { - vslide1dn_vd_rs1 := (rs1_old_vd & (vd_mask >> (VLEN.U - Cat((vlRemainBytes - vsew_bytes), 0.U(3.W))))) | - (vs1 & (vd_mask << Cat((vlRemainBytes - vsew_bytes), 0.U(3.W)))) + val rs1_old_vd_reg0 = Mux(load_rs1_reg0, Cat(vslide1dn_vd_wo_rs1_reg0.reverse), old_vd_reg0) + vslide1dn_vd_rs1_reg0 := Mux(load_rs1_reg0, Cat(vslide1dn_vd_wo_rs1_reg0.reverse), old_vd_reg0) + when(load_rs1_reg0 || uopIdx_reg0(0)) { + when((vlRemainBytes_reg0 > 0.U) && (vlRemainBytes_reg0 <= vlenb.U) && (vmask_byte_strb_reg0(vlRemainBytes_reg0 - 1.U).asBool)) { + vslide1dn_vd_rs1_reg0 := (rs1_old_vd_reg0 & (vd_mask >> (VLEN.U - Cat((vlRemainBytes_reg0 - vsew_bytes_reg0), 0.U(3.W))))) | + (vs1_reg0 & (vd_mask << Cat((vlRemainBytes_reg0 - vsew_bytes_reg0), 0.U(3.W)))) } } + val vslide1dn_vd_reg0 = Mux(load_rs1_reg0 || uopIdx_reg0(0), VecInit(Seq.tabulate(vlenb)(i => vslide1dn_vd_rs1_reg0((i + 1) * 8 - 1, i * 8))), vslide1dn_vd_wo_rs1_reg0) + val vstartRemain = Wire(UInt(8.W)) - val vstartRemainBytes = vstartRemain << vsew val vslideup_vstart = Mux(vslideup & (slide_ele > vstart), Mux(slide_ele > VLEN.U, VLEN.U, slide_ele), vstart) vstartRemain := vslideup_vstart @@ -626,70 +743,80 @@ class Permutation extends Module { vstartRemain := Mux1H(Seq.tabulate(8)(i => (vdId === i.U) -> (if (i == 0) vslideup_vstart else Mux(vslideup_vstart >= (ele_cnt * i.U), vslideup_vstart - (ele_cnt * i.U), 0.U)))) } + val is_vcompress_reg0 = RegEnable(vcompress, false.B, fire) + val is_vslideup_reg0 = RegEnable(vslideup, false.B, fire) + val is_vslidedn_reg0 = RegEnable(vslidedn, false.B, fire) + val is_vrgather16_sew8_reg0 = RegEnable(vrgather16_sew8, false.B, fire) + val 
is_vmvnr_reg0 = RegEnable(vmvnr, false.B, fire) + val is_vslide_reg0 = RegEnable(vslide, false.B, fire) + val ta_reg0 = RegEnable(ta, false.B, fire) + val vstartRemain_reg0 = RegEnable(vstartRemain, 0.U, fire) + val vstart_reg0 = RegEnable(vstart, 0.U, fire) + val vl_reg0 = RegEnable(Mux(vmvnr, evl, vl), 0.U, fire) + val vd_reg = RegInit(0.U(VLEN.W)) - when(vmvnr && fire) { - vd_reg := vs2 - }.elsewhen(vslideup && fire) { - vd_reg := Cat(vslideup_vd.reverse) - }.elsewhen(vslide1up && fire) { - vd_reg := Cat(vslide1up_vd.reverse) - }.elsewhen(vslidedn && fire) { - vd_reg := Cat(vslidedn_vd.reverse) - }.elsewhen(vslide1dn && fire) { - vd_reg := Cat(vslide1dn_vd.reverse) - }.elsewhen((vrgather || vrgather_vx) && !(vrgather16_sew8) && fire) { - vd_reg := Cat(vrgather_vd.reverse) - }.elsewhen(vrgather16_sew8 && fire) { - when(uopIdx(0)) { - vd_reg := Cat(Cat(vrgather_vd.reverse)(VLEN - 1, VLEN / 2), old_vd(VLEN / 2 - 1, 0)) + when(is_vmvnr_reg0 && fire_reg0) { + vd_reg := vs2_reg0 + }.elsewhen(is_vslideup_reg0 && fire_reg0) { + vd_reg := Cat(vslideup_vd_reg0.reverse) + }.elsewhen(is_vslide1up_reg0 && fire_reg0) { + vd_reg := Cat(vslide1up_vd_reg0.reverse) + }.elsewhen(is_vslidedn_reg0 && fire_reg0) { + vd_reg := Cat(vslidedn_vd_reg0.reverse) + }.elsewhen(is_vslide1dn_reg0 && fire_reg0) { + vd_reg := Cat(vslide1dn_vd_reg0.reverse) + }.elsewhen((is_vrgather_reg0 || is_vrgather_vx_reg0) && !(is_vrgather16_sew8_reg0) && fire_reg0) { + vd_reg := Cat(vrgather_vd_reg0.reverse) + }.elsewhen(is_vrgather16_sew8_reg0 && fire_reg0) { + when(uopIdx_reg0(0)) { + vd_reg := Cat(Cat(vrgather_vd_reg0.reverse)(VLEN - 1, VLEN / 2), old_vd_reg0(VLEN / 2 - 1, 0)) }.otherwise { - vd_reg := Cat(old_vd(VLEN - 1, VLEN / 2), Cat(vrgather_vd.reverse)(VLEN / 2 - 1, 0)) + vd_reg := Cat(old_vd_reg0(VLEN - 1, VLEN / 2), Cat(vrgather_vd_reg0.reverse)(VLEN / 2 - 1, 0)) } } - val is_vcompress_reg = RegEnable(vcompress, false.B, fire) - val is_vslideup_reg = RegEnable(vslideup, false.B, fire) - val is_vslidedn_reg = RegEnable(vslidedn, false.B, fire) - val is_vslide1up_reg = RegEnable(vslide1up, false.B, fire) - val is_vslide1dn_reg = RegEnable(vslide1dn, false.B, fire) - val is_vrgather_reg = RegEnable(vrgather, false.B, fire) - val is_vrgather_vx_reg = RegEnable(vrgather_vx, false.B, fire) - val is_vmvnr_reg = RegEnable(vmvnr, false.B, fire) - val is_vslide_reg = RegEnable(vslide, false.B, fire) - val uopIdx_reg = RegEnable(uopIdx, 0.U, fire) - val load_rs1_reg = RegEnable(load_rs1, false.B, fire) - val vlRemain_reg = RegEnable(vlRemain, 0.U, fire) - val vsew_reg = RegEnable(vsew, 0.U, fire) - val old_vd_reg = RegEnable(old_vd, 0.U, fire) - val ta_reg = RegEnable(ta, false.B, fire) - val vstartRemain_reg = RegEnable(vstartRemain, 0.U, fire) - val vstart_reg = RegEnable(vstart, 0.U, fire) - val vl_reg = RegEnable(Mux(vmvnr, evl, vl), 0.U, fire) - val ones_sum_base_reg = RegEnable(ones_sum_base, 0.U, fire) - val mask_selected_reg = RegEnable(mask_selected, 0.U, fire) - val compressed_res_reg = RegEnable(compressed_res, 0.U, fire) - val ones_sum_reg = RegEnable(ones_sum(elements), 0.U, fire) - - val vlRemainBytes_reg = vlRemain_reg << vsew_reg - val vstartRemainBytes_reg = vstartRemain_reg << vsew_reg + val is_vcompress_reg1 = RegEnable(is_vcompress_reg0, fire_reg0) + val is_vrgather_reg1 = RegEnable(is_vrgather_reg0, fire_reg0) + val is_vrgather_vx_reg1 = RegEnable(is_vrgather_vx_reg0, fire_reg0) + val is_vslide_reg1 = RegEnable(is_vslide_reg0, fire_reg0) + val uopIdx_reg1 = RegEnable(uopIdx_reg0, fire_reg0) + val old_vd_reg1 = 
RegEnable(old_vd_reg0, fire_reg0) + val ta_reg1 = RegEnable(ta_reg0, fire_reg0) + val vstart_reg1 = RegEnable(vstart_reg0, fire_reg0) + val vl_reg1 = RegEnable(vl_reg0, fire_reg0) + val vsew_reg1 = RegEnable(vsew_reg0, fire_reg0) + val ones_sum_eles_reg1 = RegEnable(ones_sum_eles_reg0, fire_reg0) + val ones_sum_base_reg1 = RegEnable(ones_sum_base_reg0, fire_reg0) + val compressed_res_reg1 = RegEnable(compressed_res_reg0, fire_reg0) + + val vstartRemainBytes_reg0 = vstartRemain_reg0 << vsew_reg0 vslideup_vl := Mux(vslideup & (slide_ele > vl), Mux(slide_ele > VLEN.U, VLEN.U, slide_ele), vl) - val tail_bytes = Mux((vlRemainBytes_reg >= vlenb.U), 0.U, vlenb.U - vlRemainBytes_reg) + val tail_bytes = Mux(vlRemainBytes_reg0 >= vlenb.U, 0.U, vlenb.U - vlRemainBytes_reg0) val tail_bits = Cat(tail_bytes, 0.U(3.W)) val vmask_tail_bits = Wire(UInt(VLEN.W)) vmask_tail_bits := vd_mask >> tail_bits - val tail_old_vd = old_vd_reg & (~vmask_tail_bits) + val tail_old_vd = old_vd_reg0 & (~vmask_tail_bits) val tail_ones_vd = ~vmask_tail_bits - val tail_vd = Mux(ta_reg, tail_ones_vd, tail_old_vd) + val tail_vd = Mux(ta_reg0, tail_ones_vd, tail_old_vd) val perm_tail_mask_vd = Wire(UInt(VLEN.W)) - val vstart_bytes = Mux(vstartRemainBytes_reg >= vlenb.U, vlenb.U, vstartRemainBytes_reg) + val vstart_bytes = Mux(vstartRemainBytes_reg0 >= vlenb.U, vlenb.U, vstartRemainBytes_reg0) val vstart_bits = Cat(vstart_bytes, 0.U(3.W)) val vmask_vstart_bits = Wire(UInt(VLEN.W)) vmask_vstart_bits := vd_mask << vstart_bits - val vstart_old_vd = old_vd_reg & (~vmask_vstart_bits) + val vstart_old_vd = old_vd_reg0 & (~vmask_vstart_bits) + val vmask_tail_bits_reg1 = RegEnable(vmask_tail_bits, fire_reg0) + val vmask_vstart_bits_reg1 = RegEnable(vmask_vstart_bits, fire_reg0) + val tail_vd_reg1 = RegEnable(tail_vd, fire_reg0) + val vstart_old_vd_reg1 = RegEnable(vstart_old_vd, fire_reg0) + + /** + * vcompress cycle2 + * ----begin----- + */ val cmprs_vd = Wire(UInt(VLEN.W)) val cmprs_vd_8 = WireInit(VecInit(Seq.fill(16)(0.U(8.W)))) val cmprs_vd_16 = WireInit(VecInit(Seq.fill(8)(0.U(16.W)))) @@ -700,73 +827,78 @@ class Permutation extends Module { val res_agnostic_32 = WireInit(VecInit(Seq.fill(4)(false.B))) val res_agnostic_64 = WireInit(VecInit(Seq.fill(2)(false.B))) - when(vsew_reg === 0.U) { + when(vsew_reg1 === 0.U) { for (i <- 0 until 16) { - res_agnostic_8(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) + res_agnostic_8(i) := ((ones_sum_base_reg1 + i.U >= ones_sum_eles_reg1) & ta_reg1) when(res_agnostic_8(i)) { cmprs_vd_8(i) := Fill(8, 1.U) }.otherwise { - cmprs_vd_8(i) := compressed_res_reg(8 * i + 7, 8 * i) + cmprs_vd_8(i) := compressed_res_reg1(8 * i + 7, 8 * i) } } - }.elsewhen(vsew_reg === 1.U) { + }.elsewhen(vsew_reg1 === 1.U) { for (i <- 0 until 8) { - res_agnostic_16(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) + res_agnostic_16(i) := ((ones_sum_base_reg1 + i.U >= ones_sum_eles_reg1) & ta_reg1) when(res_agnostic_16(i)) { cmprs_vd_16(i) := Fill(16, 1.U) }.otherwise { - cmprs_vd_16(i) := compressed_res_reg(16 * i + 15, 16 * i) + cmprs_vd_16(i) := compressed_res_reg1(16 * i + 15, 16 * i) } } - }.elsewhen(vsew_reg === 2.U) { + }.elsewhen(vsew_reg1 === 2.U) { for (i <- 0 until 4) { - res_agnostic_32(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) + res_agnostic_32(i) := ((ones_sum_base_reg1 + i.U >= ones_sum_eles_reg1) & ta_reg1) when(res_agnostic_32(i)) { cmprs_vd_32(i) := Fill(32, 1.U) }.otherwise { - cmprs_vd_32(i) := compressed_res_reg(32 * i + 31, 32 * i) + cmprs_vd_32(i) := 
compressed_res_reg1(32 * i + 31, 32 * i) } } }.otherwise { for (i <- 0 until 2) { - res_agnostic_64(i) := ((ones_sum_base_reg + i.U >= ones_sum_reg) & ta_reg) + res_agnostic_64(i) := ((ones_sum_base_reg1 + i.U >= ones_sum_eles_reg1) & ta_reg1) when(res_agnostic_64(i)) { cmprs_vd_64(i) := Fill(64, 1.U) }.otherwise { - cmprs_vd_64(i) := compressed_res_reg(64 * i + 63, 64 * i) + cmprs_vd_64(i) := compressed_res_reg1(64 * i + 63, 64 * i) } } } - dontTouch(Cat(cmprs_vd_8.reverse)) - dontTouch(Cat(cmprs_vd_16.reverse)) - dontTouch(Cat(cmprs_vd_32.reverse)) - dontTouch(Cat(cmprs_vd_64.reverse)) + // dontTouch(Cat(cmprs_vd_8.reverse)) + // dontTouch(Cat(cmprs_vd_16.reverse)) + // dontTouch(Cat(cmprs_vd_32.reverse)) + // dontTouch(Cat(cmprs_vd_64.reverse)) - when(vsew_reg === 0.U) { + when(vsew_reg1 === 0.U) { cmprs_vd := Cat(cmprs_vd_8.reverse) - }.elsewhen(vsew_reg === 1.U) { + }.elsewhen(vsew_reg1 === 1.U) { cmprs_vd := Cat(cmprs_vd_16.reverse) - }.elsewhen(vsew_reg === 2.U) { + }.elsewhen(vsew_reg1 === 2.U) { cmprs_vd := Cat(cmprs_vd_32.reverse) }.otherwise { cmprs_vd := Cat(cmprs_vd_64.reverse) } + /** + * vcompress cycle2 + * -----end---- + */ + perm_tail_mask_vd := vd_reg - when(is_vslide_reg || is_vrgather_reg || is_vrgather_vx_reg) { - perm_tail_mask_vd := (vd_reg & vmask_tail_bits & vmask_vstart_bits) | tail_vd | vstart_old_vd + when(is_vslide_reg1 || is_vrgather_reg1 || is_vrgather_vx_reg1) { + perm_tail_mask_vd := (vd_reg & vmask_tail_bits_reg1 & vmask_vstart_bits_reg1) | tail_vd_reg1 | vstart_old_vd_reg1 } val perm_vd = Wire(UInt(VLEN.W)) perm_vd := perm_tail_mask_vd - when(vstart_reg >= vl_reg) { - perm_vd := old_vd_reg - }.elsewhen(is_vcompress_reg) { - when(uopIdx_reg === 1.U || uopIdx_reg === 4.U || uopIdx_reg === 8.U || uopIdx_reg === 13.U || uopIdx_reg === 19.U || - uopIdx_reg === 26.U || uopIdx_reg === 34.U) { - perm_vd := ones_sum_reg + when(vstart_reg1 >= vl_reg1) { + perm_vd := old_vd_reg1 + }.elsewhen(is_vcompress_reg1) { + when(uopIdx_reg1 === 1.U || uopIdx_reg1 === 4.U || uopIdx_reg1 === 8.U || uopIdx_reg1 === 13.U || uopIdx_reg1 === 19.U || + uopIdx_reg1 === 26.U || uopIdx_reg1 === 34.U) { + perm_vd := ones_sum_eles_reg1 }.otherwise { perm_vd := cmprs_vd } diff --git a/src/test/scala/vector/VectorALU/VPermWrapper.scala b/src/test/scala/vector/VectorALU/VPermWrapper.scala index fb43694..f7d9dfe 100644 --- a/src/test/scala/vector/VectorALU/VPermWrapper.scala +++ b/src/test/scala/vector/VectorALU/VPermWrapper.scala @@ -19,6 +19,6 @@ class VPermWrapper extends Module { vPerm.io.in.valid := io.in.valid - io.out.valid := RegNext(io.in.valid) + io.out.valid := RegNext(RegNext(io.in.valid)) io.in.ready := io.out.ready }
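Not part of the patch: the standalone Scala sketch below models, at a purely behavioral level, the indexing scheme the new pipelined vcompress datapath implements — cycle0 computes a running popcount of the mask slice (current_uop_ones_sum via PopCount), cycle1 scatters each active vs2 element to index popcount-1, and cycle2 applies the tail policy. It assumes SEW=8 within a single register group and ignores vl/vstart/uop splitting; all names in the sketch are illustrative and do not appear in the RTL.

// Behavioral sketch (assumption: standard RVV vcompress.vm semantics, SEW=8, single register group).
object VcompressModel {
  // vs2: source elements, mask: one bit per element (vcompress uses vs1 as the mask),
  // oldVd: destination before the instruction, ta: tail-agnostic (fill with all ones).
  def vcompress(vs2: Seq[Int], mask: Seq[Boolean], oldVd: Seq[Int], ta: Boolean): Seq[Int] = {
    require(vs2.length == mask.length && vs2.length == oldVd.length)
    // cycle0 analogue: inclusive prefix popcount of the mask gives each active
    // element its destination index (current_ones_sum in the RTL).
    val onesSum = mask.scanLeft(0)((acc, m) => acc + (if (m) 1 else 0)).tail
    // cycle1 analogue: scatter each masked-in vs2 element to index popcount-1.
    val packed = vs2.zip(mask).zip(onesSum).collect {
      case ((elem, true), cnt) => (cnt - 1) -> elem
    }.toMap
    // cycle2 analogue: destination elements at or past the number of packed
    // elements follow the tail policy (all-ones when ta, old_vd otherwise),
    // matching res_agnostic_* / cmprs_vd_* in the RTL.
    oldVd.indices.map { i =>
      packed.getOrElse(i, if (ta && i >= packed.size) 0xff else oldVd(i))
    }
  }

  def main(args: Array[String]): Unit = {
    val vd = vcompress(Seq(10, 11, 12, 13), Seq(true, false, true, true), Seq(0, 0, 0, 0), ta = false)
    println(vd) // Vector(10, 12, 13, 0): active elements packed to the front, rest undisturbed
  }
}

The extra *_reg0 register stage introduced in the datapath is also why the VPermWrapper test change delays io.out.valid by two cycles (RegNext(RegNext(io.in.valid))) instead of one.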