Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add vuzp_s8 #24

Merged
merged 1 commit into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -2472,7 +2472,27 @@ FORCE_INLINE int8x8x2_t vtrn_s8(int8x8_t __a, int8x8_t __b) {

// FORCE_INLINE uint32x4x2_t vzipq_u32(uint32x4_t __a, uint32x4_t __b);

// FORCE_INLINE int8x8x2_t vuzp_s8(int8x8_t __a, int8x8_t __b);
// Unzip (de-interleave) two 8-lane int8 vectors.
//
// The returned pair is laid out as:
//   first vector  = { a0, a2, a4, a6, b0, b2, b4, b6 }   (even lanes)
//   second vector = { a1, a3, a5, a7, b1, b3, b5, b7 }   (odd lanes)
FORCE_INLINE int8x8x2_t vuzp_s8(int8x8_t __a, int8x8_t __b) {
  // Mask bytes carry one bit per lane, least-significant bit = lane 0.
  const uint8_t even_lane_bits[] = {0x55};  // lanes 0, 2, 4, 6
  const uint8_t odd_lane_bits[] = {0xAA};   // lanes 1, 3, 5, 7
  vbool16_t even_lanes = __riscv_vlm_v_b16(even_lane_bits, 8);
  vbool16_t odd_lanes = __riscv_vlm_v_b16(odd_lane_bits, 8);

  // vcompress packs the selected lanes contiguously into lanes 0..3.
  // Lanes 4..7 of each compressed vector are not relied upon below.
  vint8mf2_t a_even = __riscv_vcompress_vm_i8mf2(__a, even_lanes, 8);
  vint8mf2_t b_even = __riscv_vcompress_vm_i8mf2(__b, even_lanes, 8);
  vint8mf2_t a_odd = __riscv_vcompress_vm_i8mf2(__a, odd_lanes, 8);
  vint8mf2_t b_odd = __riscv_vcompress_vm_i8mf2(__b, odd_lanes, 8);

  // vslideup leaves destination lanes below the offset untouched, so using
  // the compressed `a` half as the destination places a's four lanes in
  // 0..3 and b's four lanes in 4..7 without a separate zero vector or merge.
  vint8mf2_t res_low = __riscv_vslideup_vx_i8mf2(a_even, b_even, 4, 8);
  vint8mf2_t res_high = __riscv_vslideup_vx_i8mf2(a_odd, b_odd, 4, 8);

  return __riscv_vcreate_v_i8mf2x2(res_low, res_high);
}

// FORCE_INLINE int16x4x2_t vuzp_s16(int16x4_t __a, int16x4_t __b);

Expand Down
31 changes: 30 additions & 1 deletion tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3120,7 +3120,36 @@ result_t test_vzipq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return

// Stub: returns TEST_UNIMPL unconditionally; neither `impl` nor `iter` is used
// and no vzipq_u32 behaviour is checked.
result_t test_vzipq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

// Stub: returns TEST_UNIMPL unconditionally; neither `impl` nor `iter` is used
// and no vuzp_s8 behaviour is checked.
result_t test_vuzp_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
// Checks vuzp_s8 against a scalar reference built from the two int test buffers.
// Expected layout: row 0 holds the even-indexed bytes of _a followed by the
// even-indexed bytes of _b; row 1 holds the odd-indexed bytes in the same order.
result_t test_vuzp_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  const int8_t *_a = (int8_t *)impl.test_cases_int_pointer1;
  const int8_t *_b = (int8_t *)impl.test_cases_int_pointer2;

  // expected[parity][k]: k in 0..3 comes from _a, k in 4..7 comes from _b.
  int8_t expected[2][8];
  for (int parity = 0; parity < 2; ++parity) {
    for (int k = 0; k < 4; ++k) {
      const int src = 2 * k + parity;
      expected[parity][k] = _a[src];
      expected[parity][k + 4] = _b[src];
    }
  }

  const int8x8_t a = vld1_s8(_a);
  const int8x8_t b = vld1_s8(_b);
  const int8x8x2_t c = vuzp_s8(a, b);

  // validate_int8 receives the 16 expected bytes, row 0 first, then row 1.
  return validate_int8(c, expected[0][0], expected[0][1], expected[0][2], expected[0][3], expected[0][4],
                       expected[0][5], expected[0][6], expected[0][7], expected[1][0], expected[1][1],
                       expected[1][2], expected[1][3], expected[1][4], expected[1][5], expected[1][6],
                       expected[1][7]);
}

// Stub: returns TEST_UNIMPL unconditionally; neither `impl` nor `iter` is used
// and no vuzp_s16 behaviour is checked.
result_t test_vuzp_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

Expand Down