howjmay · howjmay · Nov 3, 2023 · Nov 3, 2023
diff --git a/neon2rvv.h b/neon2rvv.h
@@ -2472,7 +2472,27 @@ FORCE_INLINE int8x8x2_t vtrn_s8(int8x8_t __a, int8x8_t __b) {
 
 // FORCE_INLINE uint32x4x2_t vzipq_u32(uint32x4_t __a, uint32x4_t __b);
 
-// FORCE_INLINE int8x8x2_t vuzp_s8(int8x8_t __a, int8x8_t __b);
+// Emulates NEON vuzp_s8: de-interleaves the 16 lanes of the pair {__a, __b}.
+//   val[0] = {a0, a2, a4, a6, b0, b2, b4, b6}  (even-indexed lanes)
+//   val[1] = {a1, a3, a5, a7, b1, b3, b5, b7}  (odd-indexed lanes)
+FORCE_INLINE int8x8x2_t vuzp_s8(int8x8_t __a, int8x8_t __b) {
+  // One mask bit per lane, bit i controls lane i: 0x55 -> lanes 0,2,4,6; 0xAA -> lanes 1,3,5,7.
+  const uint8_t even_mask_arr[] = {0x55};
+  const uint8_t odd_mask_arr[] = {0xAA};
+  vbool16_t even_mask = __riscv_vlm_v_b16(even_mask_arr, 8);
+  vbool16_t odd_mask = __riscv_vlm_v_b16(odd_mask_arr, 8);
+
+  // vcompress packs the selected lanes into lanes 0..3; lanes 4..7 of each result are not relied upon.
+  vint8mf2_t a_even = __riscv_vcompress_vm_i8mf2(__a, even_mask, 8);
+  vint8mf2_t b_even = __riscv_vcompress_vm_i8mf2(__b, even_mask, 8);
+  vint8mf2_t a_odd = __riscv_vcompress_vm_i8mf2(__a, odd_mask, 8);
+  vint8mf2_t b_odd = __riscv_vcompress_vm_i8mf2(__b, odd_mask, 8);
+
+  // vslideup keeps lanes 0..3 of its destination operand and writes the source's
+  // lanes 0..3 into lanes 4..7, so no zero vector or vmerge step is required.
+  return __riscv_vcreate_v_i8mf2x2(__riscv_vslideup_vx_i8mf2(a_even, b_even, 4, 8),
+                                   __riscv_vslideup_vx_i8mf2(a_odd, b_odd, 4, 8));
+}
 
 // FORCE_INLINE int16x4x2_t vuzp_s16(int16x4_t __a, int16x4_t __b);
 

diff --git a/tests/impl.cpp b/tests/impl.cpp
@@ -3120,7 +3120,36 @@ result_t test_vzipq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
 
 result_t test_vzipq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
-result_t test_vuzp_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+// Scalar reference check for vuzp_s8 (de-interleave of two 8-lane vectors).
+//   val[0] = even-indexed lanes of a, followed by even-indexed lanes of b
+//   val[1] = odd-indexed lanes of a, followed by odd-indexed lanes of b
+// The iter argument is not read by this test.
+result_t test_vuzp_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // The two test-case buffers are viewed as int8 lanes through the cast.
+  const int8_t *_a = (int8_t *)impl.test_cases_int_pointer1;
+  const int8_t *_b = (int8_t *)impl.test_cases_int_pointer2;
+
+  // expected[0] / expected[1] hold the reference lanes for val[0] / val[1].
+  int8_t expected[2][8];
+  for (int parity = 0; parity < 2; parity++) {
+    for (int k = 0; k < 4; k++) {
+      // Lanes 0..3 come from _a, lanes 4..7 from _b, both read with stride two.
+      expected[parity][k] = _a[2 * k + parity];
+      expected[parity][k + 4] = _b[2 * k + parity];
+    }
+  }
+
+  // Load both inputs, run the intrinsic under test, then compare all 16 lanes.
+  int8x8_t a = vld1_s8(_a);
+  int8x8_t b = vld1_s8(_b);
+  int8x8x2_t c = vuzp_s8(a, b);
+
+  const int8_t *lo = expected[0];
+  const int8_t *hi = expected[1];
+  return validate_int8(c,
+                       lo[0], lo[1], lo[2], lo[3], lo[4], lo[5], lo[6], lo[7],
+                       hi[0], hi[1], hi[2], hi[3], hi[4], hi[5], hi[6], hi[7]);
+}
 
 result_t test_vuzp_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }