Skip to content

Commit

Permalink
feat: Add vmul[q]_lane[q]_[s16|s32|u16|u32]
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Jul 29, 2024
1 parent 1a1ffb7 commit 907c8e3
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 24 deletions.
40 changes: 32 additions & 8 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -8882,21 +8882,45 @@ FORCE_INLINE float64_t vmuld_lane_f64(float64_t a, float64x1_t b, const int lane
return a * b_lane;
}

// FORCE_INLINE int16x4_t vmul_laneq_s16(int16x4_t a, int16x8_t b, const int lane);
// Multiply each lane of the 4-lane vector `a` by element `lane` of the
// 8-lane vector `b` (NEON vmul_laneq_s16 semantics).
FORCE_INLINE int16x4_t vmul_laneq_s16(int16x4_t a, int16x8_t b, const int lane) {
  // Splat the selected element across all 8 positions, then multiply at
  // the destination width of 4 elements.
  vint16m1_t lane_splat = __riscv_vrgather_vx_i16m1(b, lane, 8);
  return __riscv_vmul_vv_i16m1(a, lane_splat, 4);
}

// FORCE_INLINE int16x8_t vmulq_laneq_s16(int16x8_t a, int16x8_t b, const int lane);
// Multiply each lane of the 8-lane vector `a` by element `lane` of the
// 8-lane vector `b` (NEON vmulq_laneq_s16 semantics).
FORCE_INLINE int16x8_t vmulq_laneq_s16(int16x8_t a, int16x8_t b, const int lane) {
  // Broadcast b[lane] to all 8 positions and multiply element-wise.
  vint16m1_t lane_splat = __riscv_vrgather_vx_i16m1(b, lane, 8);
  return __riscv_vmul_vv_i16m1(a, lane_splat, 8);
}

// FORCE_INLINE int32x2_t vmul_laneq_s32(int32x2_t a, int32x4_t b, const int lane);
// Multiply each lane of the 2-lane vector `a` by element `lane` of the
// 4-lane vector `b` (NEON vmul_laneq_s32 semantics).
FORCE_INLINE int32x2_t vmul_laneq_s32(int32x2_t a, int32x4_t b, const int lane) {
  // Splat b[lane] across the 4-element source width, multiply at the
  // 2-element destination width.
  vint32m1_t lane_splat = __riscv_vrgather_vx_i32m1(b, lane, 4);
  return __riscv_vmul_vv_i32m1(a, lane_splat, 2);
}

// FORCE_INLINE int32x4_t vmulq_laneq_s32(int32x4_t a, int32x4_t b, const int lane);
// Multiply each lane of the 4-lane vector `a` by element `lane` of the
// 4-lane vector `b` (NEON vmulq_laneq_s32 semantics).
FORCE_INLINE int32x4_t vmulq_laneq_s32(int32x4_t a, int32x4_t b, const int lane) {
  // Broadcast b[lane] to all 4 positions and multiply element-wise.
  vint32m1_t lane_splat = __riscv_vrgather_vx_i32m1(b, lane, 4);
  return __riscv_vmul_vv_i32m1(a, lane_splat, 4);
}

// FORCE_INLINE uint16x4_t vmul_laneq_u16(uint16x4_t a, uint16x8_t b, const int lane);
// Multiply each lane of the 4-lane vector `a` by element `lane` of the
// 8-lane vector `b` (NEON vmul_laneq_u16 semantics, unsigned).
FORCE_INLINE uint16x4_t vmul_laneq_u16(uint16x4_t a, uint16x8_t b, const int lane) {
  // Splat the selected element across the 8-element source width, then
  // multiply at the 4-element destination width.
  vuint16m1_t lane_splat = __riscv_vrgather_vx_u16m1(b, lane, 8);
  return __riscv_vmul_vv_u16m1(a, lane_splat, 4);
}

// FORCE_INLINE uint16x8_t vmulq_laneq_u16(uint16x8_t a, uint16x8_t b, const int lane);
// Multiply each lane of the 8-lane vector `a` by element `lane` of the
// 8-lane vector `b` (NEON vmulq_laneq_u16 semantics, unsigned).
FORCE_INLINE uint16x8_t vmulq_laneq_u16(uint16x8_t a, uint16x8_t b, const int lane) {
  // Broadcast b[lane] to all 8 positions and multiply element-wise.
  vuint16m1_t lane_splat = __riscv_vrgather_vx_u16m1(b, lane, 8);
  return __riscv_vmul_vv_u16m1(a, lane_splat, 8);
}

// FORCE_INLINE uint32x2_t vmul_laneq_u32(uint32x2_t a, uint32x4_t b, const int lane);
// Multiply each lane of the 2-lane vector `a` by element `lane` of the
// 4-lane vector `b` (NEON vmul_laneq_u32 semantics, unsigned).
FORCE_INLINE uint32x2_t vmul_laneq_u32(uint32x2_t a, uint32x4_t b, const int lane) {
  // Splat b[lane] across the 4-element source width, multiply at the
  // 2-element destination width.
  vuint32m1_t lane_splat = __riscv_vrgather_vx_u32m1(b, lane, 4);
  return __riscv_vmul_vv_u32m1(a, lane_splat, 2);
}

// FORCE_INLINE uint32x4_t vmulq_laneq_u32(uint32x4_t a, uint32x4_t b, const int lane);
// Multiply each lane of the 4-lane vector `a` by element `lane` of the
// 4-lane vector `b` (NEON vmulq_laneq_u32 semantics, unsigned).
FORCE_INLINE uint32x4_t vmulq_laneq_u32(uint32x4_t a, uint32x4_t b, const int lane) {
  // Broadcast b[lane] to all 4 positions and multiply element-wise.
  vuint32m1_t lane_splat = __riscv_vrgather_vx_u32m1(b, lane, 4);
  return __riscv_vmul_vv_u32m1(a, lane_splat, 4);
}

FORCE_INLINE float32x2_t vmul_laneq_f32(float32x2_t a, float32x4_t b, const int lane) {
vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
Expand Down
200 changes: 192 additions & 8 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31585,21 +31585,205 @@ result_t test_vmuld_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

// Exercise vmul_laneq_s16 for every legal lane index (0..7): the scalar
// reference multiplies each of the 4 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused here; inputs come from the harness pools —
// presumably pre-populated test data, confirm against the runner.
result_t test_vmul_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
  int16_t _c[4];
  int16x4_t a = vld1_s16(_a);
  int16x8_t b = vld1q_s16(_b);
  int16x4_t c;

// TEST_IMPL is instantiated once per lane index by IMM_8_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX)              \
  for (int i = 0; i < 4; i++) {     \
    _c[i] = _a[i] * _b[IDX];        \
  }                                 \
  c = vmul_laneq_s16(a, b, IDX);    \
  CHECK_RESULT(validate_int16(c, _c[0], _c[1], _c[2], _c[3]))

  IMM_8_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Exercise vmulq_laneq_s16 for every legal lane index (0..7): the scalar
// reference multiplies each of the 8 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused; inputs come from the harness pools.
result_t test_vmulq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
int16_t _c[8];
int16x8_t a = vld1q_s16(_a);
int16x8_t b = vld1q_s16(_b);
int16x8_t c;

// TEST_IMPL is instantiated once per lane index by IMM_8_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX) \
for (int i = 0; i < 8; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulq_laneq_s16(a, b, IDX); \
CHECK_RESULT(validate_int16(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7]))

IMM_8_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Exercise vmul_laneq_s32 for every legal lane index (0..3): the scalar
// reference multiplies each of the 2 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused; inputs come from the harness pools.
result_t test_vmul_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
int32_t _c[2];
int32x2_t a = vld1_s32(_a);
int32x4_t b = vld1q_s32(_b);
int32x2_t c;

// TEST_IMPL is instantiated once per lane index by IMM_4_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX) \
for (int i = 0; i < 2; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmul_laneq_s32(a, b, IDX); \
CHECK_RESULT(validate_int32(c, _c[0], _c[1]))

IMM_4_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Exercise vmulq_laneq_s32 for every legal lane index (0..3): the scalar
// reference multiplies each of the 4 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused; inputs come from the harness pools.
result_t test_vmulq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
int32_t _c[4];
int32x4_t a = vld1q_s32(_a);
int32x4_t b = vld1q_s32(_b);
int32x4_t c;

// TEST_IMPL is instantiated once per lane index by IMM_4_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX) \
for (int i = 0; i < 4; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulq_laneq_s32(a, b, IDX); \
CHECK_RESULT(validate_int32(c, _c[0], _c[1], _c[2], _c[3]))

IMM_4_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Exercise vmul_laneq_u16 for every legal lane index (0..7): the scalar
// reference multiplies each of the 4 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused; inputs come from the harness pools.
result_t test_vmul_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const uint16_t *_a = (uint16_t *)impl.test_cases_int_pointer1;
const uint16_t *_b = (uint16_t *)impl.test_cases_int_pointer2;
uint16_t _c[4];
uint16x4_t a = vld1_u16(_a);
uint16x8_t b = vld1q_u16(_b);
uint16x4_t c;

// TEST_IMPL is instantiated once per lane index by IMM_8_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX) \
for (int i = 0; i < 4; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmul_laneq_u16(a, b, IDX); \
CHECK_RESULT(validate_uint16(c, _c[0], _c[1], _c[2], _c[3]))

IMM_8_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Exercise vmulq_laneq_u16 for every legal lane index (0..7): the scalar
// reference multiplies each of the 8 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused; inputs come from the harness pools.
result_t test_vmulq_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const uint16_t *_a = (uint16_t *)impl.test_cases_int_pointer1;
const uint16_t *_b = (uint16_t *)impl.test_cases_int_pointer2;
uint16_t _c[8];
uint16x8_t a = vld1q_u16(_a);
uint16x8_t b = vld1q_u16(_b);
uint16x8_t c;

// TEST_IMPL is instantiated once per lane index by IMM_8_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX) \
for (int i = 0; i < 8; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulq_laneq_u16(a, b, IDX); \
CHECK_RESULT(validate_uint16(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7]))

IMM_8_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Exercise vmul_laneq_u32 for every legal lane index (0..3): the scalar
// reference multiplies each of the 2 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused here; inputs come from the harness pools.
result_t test_vmul_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const uint32_t *_a = (uint32_t *)impl.test_cases_int_pointer1;
  const uint32_t *_b = (uint32_t *)impl.test_cases_int_pointer2;
  uint32_t _c[2];
  uint32x2_t a = vld1_u32(_a);
  uint32x4_t b = vld1q_u32(_b);
  uint32x2_t c;

// TEST_IMPL is instantiated once per lane index by IMM_4_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX)              \
  for (int i = 0; i < 2; i++) {     \
    _c[i] = _a[i] * _b[IDX];        \
  }                                 \
  c = vmul_laneq_u32(a, b, IDX);    \
  CHECK_RESULT(validate_uint32(c, _c[0], _c[1]))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Exercise vmulq_laneq_u32 for every legal lane index (0..3): the scalar
// reference multiplies each of the 4 lanes of `a` by b[IDX], then the
// intrinsic result is validated lane by lane.
// NOTE(review): `iter` is unused here; inputs come from the harness pools.
result_t test_vmulq_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const uint32_t *_a = (uint32_t *)impl.test_cases_int_pointer1;
  const uint32_t *_b = (uint32_t *)impl.test_cases_int_pointer2;
  uint32_t _c[4];
  uint32x4_t a = vld1q_u32(_a);
  uint32x4_t b = vld1q_u32(_b);
  uint32x4_t c;

// TEST_IMPL is instantiated once per lane index by IMM_4_ITER, because the
// lane argument must be a compile-time constant.
#define TEST_IMPL(IDX)              \
  for (int i = 0; i < 4; i++) {     \
    _c[i] = _a[i] * _b[IDX];        \
  }                                 \
  c = vmulq_laneq_u32(a, b, IDX);   \
  CHECK_RESULT(validate_uint32(c, _c[0], _c[1], _c[2], _c[3]))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

result_t test_vmul_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
Expand Down
16 changes: 8 additions & 8 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1894,14 +1894,14 @@
_(vmulq_lane_f64) \
_(vmuls_lane_f32) \
_(vmuld_lane_f64) \
/*_(vmul_laneq_s16) */ \
/*_(vmulq_laneq_s16) */ \
/*_(vmul_laneq_s32) */ \
/*_(vmulq_laneq_s32) */ \
/*_(vmul_laneq_u16) */ \
/*_(vmulq_laneq_u16) */ \
/*_(vmul_laneq_u32) */ \
/*_(vmulq_laneq_u32) */ \
_(vmul_laneq_s16) \
_(vmulq_laneq_s16) \
_(vmul_laneq_s32) \
_(vmulq_laneq_s32) \
_(vmul_laneq_u16) \
_(vmulq_laneq_u16) \
_(vmul_laneq_u32) \
_(vmulq_laneq_u32) \
_(vmul_laneq_f32) \
_(vmulq_laneq_f32) \
_(vmul_laneq_f64) \
Expand Down

0 comments on commit 907c8e3

Please sign in to comment.