diff --git a/apps/riscv-tests/isa/rv64uv/vid.c b/apps/riscv-tests/isa/rv64uv/vid.c index 7db9a1fc5..465d6fdd4 100644 --- a/apps/riscv-tests/isa/rv64uv/vid.c +++ b/apps/riscv-tests/isa/rv64uv/vid.c @@ -11,6 +11,12 @@ void TEST_CASE1() { VSET(16, e8, m1); __asm__ volatile("vid.v v1"); VCMP_U8(1, v1, 0, 1, 2, 3, 4, 5, 6, 7); + VSET(10, e8, m1); + + VLOAD_8(v1, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100, 0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100); + VSET(77, e8, m1); + asm volatile("vid.v v2"); + VCMP_U8(2, v2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76); } void TEST_CASE2() { @@ -18,7 +24,7 @@ void TEST_CASE2() { VLOAD_8(v0, 85, 0, 0, 0, 0, 0, 0, 0); VCLEAR(v1); __asm__ volatile("vid.v v1, v0.t"); - VCMP_U8(2, v1, 0, 0, 2, 0, 4, 0, 6, 0); + VCMP_U8(3, v1, 0, 0, 2, 0, 4, 0, 6, 0); } int main(void) { diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 818207b45..ca3e95801 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -1300,7 +1300,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = eew_q[ara_req_d.vd]; end 5'b10000: ara_req_d.op = ara_pkg::VIOTA; - 5'b10001: ara_req_d.op = ara_pkg::VID; + 5'b10001: begin + ara_req_d.op = ara_pkg::VID; + ara_req_d.use_vs2 = 1'b0; + end endcase end 6'b001000: ara_req_d.op = ara_pkg::VAADDU; diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index a61242f67..9bb8fd3e7 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -445,10 +445,10 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; prevent_commit = 1'b0; // How many elements are we processing this cycle? - element_cnt_buf_issue = (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); + element_cnt_buf_issue = 1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); element_cnt_issue = vinsn_issue_q.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_issue}; - element_cnt_buf_commit = (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew)); + element_cnt_buf_commit = 1 << (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew)); element_cnt_commit = vinsn_commit.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_commit}; //////////////////////////////////////// diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index d7bc525c6..9766ebfc4 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -560,14 +560,14 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( always_comb begin // Tail-agnostic bus - alu_result = '0; - alu_result_vm = '0; - alu_result_vm_m = '0; - alu_result_vm_shuf = '0; - alu_result_vmsif_vm = '0; - alu_result_vmsbf_vm = '0; - alu_result_vmsof_vm = '0; - alu_result_vm = '0; + alu_result = '1; + alu_result_vm = '1; + alu_result_vm_m = '1; + alu_result_vm_shuf = '1; + alu_result_vmsif_vm = '1; + alu_result_vmsbf_vm = '1; + alu_result_vmsof_vm = '1; + alu_result_vm = '1; vcpop_operand = '0; @@ -624,9 +624,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Mask the input vector // VID uses the same datapath of VIOTA, but with implicit input vector at '1 masku_operand_alu_seq_m = (vinsn_issue.op == VID) - ? masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}} + ? '1 // VID mask does NOT modify the count : masku_operand_alu_seq - & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); + & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); // VIOTA mask DOES modify the count // Compute output results on `ViotaParallelism 16-bit adders viota_res[0] = viota_acc_q; @@ -656,19 +656,19 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( unique case (vinsn_issue.vtype.vsew) EW8: for (int i = 0; i < ViotaParallelism; i++) begin be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism * 1 + 1*i +: 1] = - {1{vinsn_issue.vm}} | {1{masku_operand_m_seq[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; + {1{vinsn_issue.vm}} | {1{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; end EW16: for (int i = 0; i < ViotaParallelism; i++) begin be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16/ViotaParallelism)-1:0] * ViotaParallelism * 2 + 2*i +: 2] = - {2{vinsn_issue.vm}} | {2{masku_operand_m_seq[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; + {2{vinsn_issue.vm}} | {2{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; end EW32: for (int i = 0; i < ViotaParallelism; i++) begin be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32/ViotaParallelism)-1:0] * ViotaParallelism * 4 + 4*i +: 4] = - {4{vinsn_issue.vm}} | {4{masku_operand_m_seq[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; + {4{vinsn_issue.vm}} | {4{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; end default: for (int i = 0; i < ViotaParallelism; i++) begin // EW64 be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64/ViotaParallelism)-1:0] * ViotaParallelism * 8 + 8*i +: 8] = - {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; + {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; end endcase end @@ -689,7 +689,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Shuffle the VIOTA,VID BE for (int b = 0; b < (NrLanes*StrbWidth); b++) begin automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); - be_viota_shuf[shuffle_byte] = be_viota_seq_q[b]; + be_viota_shuf[shuffle_byte] = be_viota_seq_d[b]; end // alu_result propagation mux @@ -920,7 +920,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata, // Retain the last-cycle's data // VIOTA, VID generate a non-mask vector and should comply with undisturbed policy // This means that we can use the byte-enable signal - be : vinsn_issue.op inside {[VIOTA:VID]} ? be(elm_per_lane + additional_elm[lane], vinsn_issue.vtype.vsew) | be_viota_shuf : '1, + be : vinsn_issue.op inside {[VIOTA:VID]} ? be(elm_per_lane + additional_elm[lane], vinsn_issue.vtype.vsew) & be_viota_shuf[lane*StrbWidth +: StrbWidth] : '1, addr : vaddr(vinsn_issue.vd, NrLanes, VLEN) + iteration_cnt_q, id : vinsn_issue.id }; @@ -941,7 +941,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( for (int unsigned lane = 0; lane < NrLanes; lane++) begin result_queue_background_data[lane] = (out_valid_cnt_q != '0) ? result_queue_q[result_queue_write_pnt_q][lane].wdata - : masku_operand_vd[lane]; + : vinsn_issue.op inside {[VIOTA:VID]} ? '1 : masku_operand_vd[lane]; end for (int unsigned lane = 0; lane < NrLanes; lane++) begin // The mask vector writes at 1 (tail-agnostic ok value) both the background body @@ -1007,8 +1007,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Have we finished insn execution? Clear MASKU ALU state if (issue_cnt_d == '0) begin + be_viota_seq_d = '1; // Default: write viota_acc_d = '0; - be_viota_seq_d = '0; found_one_d = '0; end end @@ -1291,7 +1291,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( out_valid_threshold_q <= '0; viota_acc_q <= '0; found_one_q <= '0; - be_viota_seq_q <= '0; + be_viota_seq_q <= '1; // Default: write end else begin vinsn_running_q <= vinsn_running_d; read_cnt_q <= read_cnt_d; diff --git a/hardware/src/masku/masku_operands.sv b/hardware/src/masku/masku_operands.sv index ff1c246cc..f63f6b44e 100644 --- a/hardware/src/masku/masku_operands.sv +++ b/hardware/src/masku/masku_operands.sv @@ -219,7 +219,7 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( // Compress ALU/FPU results into a mask vector // ------------------------------------------- always_comb begin - alu_result_compressed_o = '0; + alu_result_compressed_o = '1; for (int b = 0; b < ELENB * NrLanes; b++) begin if ((b % (1 << vinsn_issue_i.vtype.vsew)) == '0) begin automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew);