diff --git a/neon2rvv.h b/neon2rvv.h
index 6c3cf58e..8ad7f8e0 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -2472,7 +2472,30 @@ FORCE_INLINE int8x8x2_t vtrn_s8(int8x8_t __a, int8x8_t __b) {
 
 // FORCE_INLINE uint32x4x2_t vzipq_u32(uint32x4_t __a, uint32x4_t __b);
 
-// FORCE_INLINE int8x8x2_t vuzp_s8(int8x8_t __a, int8x8_t __b);
+// De-interleave two 8-lane int8 vectors into a pair of result vectors:
+//   first  = { a0, a2, a4, a6, b0, b2, b4, b6 }  (even lanes)
+//   second = { a1, a3, a5, a7, b1, b3, b5, b7 }  (odd lanes)
+FORCE_INLINE int8x8x2_t vuzp_s8(int8x8_t __a, int8x8_t __b) {
+  // One mask bit per lane, lane 0 in the least significant bit.
+  const uint8_t even_bits[] = {0x55};  // lanes 0, 2, 4, 6
+  const uint8_t odd_bits[] = {0xAA};   // lanes 1, 3, 5, 7
+  vbool16_t even_lanes = __riscv_vlm_v_b16(even_bits, 8);
+  vbool16_t odd_lanes = __riscv_vlm_v_b16(odd_bits, 8);
+
+  // vcompress packs the selected lanes of each input into lanes 0..3.
+  vint8mf2_t a_even = __riscv_vcompress_vm_i8mf2(__a, even_lanes, 8);
+  vint8mf2_t b_even = __riscv_vcompress_vm_i8mf2(__b, even_lanes, 8);
+  vint8mf2_t a_odd = __riscv_vcompress_vm_i8mf2(__a, odd_lanes, 8);
+  vint8mf2_t b_odd = __riscv_vcompress_vm_i8mf2(__b, odd_lanes, 8);
+
+  // vslideup leaves the destination lanes below the offset untouched, so
+  // sliding b's packed lanes up by 4 onto a's packed lanes gives the final
+  // vector directly; no zero vector or vmerge is needed.
+  vint8mf2_t evens = __riscv_vslideup_vx_i8mf2(a_even, b_even, 4, 8);
+  vint8mf2_t odds = __riscv_vslideup_vx_i8mf2(a_odd, b_odd, 4, 8);
+
+  return __riscv_vcreate_v_i8mf2x2(evens, odds);
+}
 
 // FORCE_INLINE int16x4x2_t vuzp_s16(int16x4_t __a, int16x4_t __b);
 
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 4a77153f..7d936a8f 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -3120,7 +3120,36 @@ result_t test_vzipq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
 
 result_t test_vzipq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
-result_t test_vuzp_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vuzp_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // Checks vuzp_s8 against a scalar reference built from the same inputs.
+  // Row 0 of the reference holds the even-indexed elements of a followed by
+  // those of b; row 1 holds the odd-indexed elements in the same order.
+  const int8_t *src_a = (int8_t *)impl.test_cases_int_pointer1;
+  const int8_t *src_b = (int8_t *)impl.test_cases_int_pointer2;
+
+  int8_t expected[2][8];
+  for (int row = 0; row < 2; row++) {
+    for (int i = 0; i < 4; i++) {
+      // Columns 0..3 come from a, columns 4..7 from b.
+      expected[row][i] = src_a[2 * i + row];
+      expected[row][i + 4] = src_b[2 * i + row];
+    }
+  }
+
+  // Run the intrinsic under test on vectors loaded from the same data.
+  int8x8_t a = vld1_s8(src_a);
+  int8x8_t b = vld1_s8(src_b);
+  int8x8x2_t actual = vuzp_s8(a, b);
+
+  // All 16 expected values are passed individually: row 0 first, then row 1.
+  const int8_t *even = expected[0];
+  const int8_t *odd = expected[1];
+  return validate_int8(actual,
+                       even[0], even[1], even[2], even[3],
+                       even[4], even[5], even[6], even[7],
+                       odd[0], odd[1], odd[2], odd[3],
+                       odd[4], odd[5], odd[6], odd[7]);
+}
 
 result_t test_vuzp_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 