Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add vmla[q]_lane[q]_[s16|s32|u16|u32|f32] #483

Merged
merged 1 commit into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 40 additions & 10 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9036,25 +9036,55 @@ FORCE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_
return __riscv_vfmacc_vv_f32m1(a, b, c_dup_lane, 4);
}

// vmla_laneq_s16: d[i] = a[i] + b[i] * v[lane] for the 4 low int16 lanes,
// where v is a 128-bit (8-lane) vector. Removed the stale commented-out
// prototype that duplicated this implementation.
FORCE_INLINE int16x4_t vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
  // Broadcast element `lane` of c to all 8 positions (vrgather.vx with a
  // scalar index acts as a lane splat), then multiply-accumulate on vl=4.
  vint16m1_t c_dup_lane = __riscv_vrgather_vx_i16m1(c, lane, 8);
  return __riscv_vmacc_vv_i16m1(a, b, c_dup_lane, 4);
}

// vmlaq_laneq_s16: d[i] = a[i] + b[i] * v[lane] for all 8 int16 lanes.
// Removed the stale commented-out prototype that duplicated this implementation.
FORCE_INLINE int16x8_t vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
  // Splat lane `lane` of c, then vmacc over the full 8-lane vector.
  vint16m1_t c_dup_lane = __riscv_vrgather_vx_i16m1(c, lane, 8);
  return __riscv_vmacc_vv_i16m1(a, b, c_dup_lane, 8);
}

// vmla_laneq_s32: d[i] = a[i] + b[i] * v[lane] for the 2 low int32 lanes,
// where v is a 128-bit (4-lane) vector. Removed the stale commented-out
// prototype that duplicated this implementation.
FORCE_INLINE int32x2_t vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t c, const int lane) {
  // Splat lane `lane` across all 4 source positions, accumulate on vl=2.
  vint32m1_t c_dup_lane = __riscv_vrgather_vx_i32m1(c, lane, 4);
  return __riscv_vmacc_vv_i32m1(a, b, c_dup_lane, 2);
}

// vmlaq_laneq_s32: d[i] = a[i] + b[i] * v[lane] for all 4 int32 lanes.
// Removed the stale commented-out prototype that duplicated this implementation.
FORCE_INLINE int32x4_t vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t c, const int lane) {
  // Splat lane `lane` of c, then vmacc over the full 4-lane vector.
  vint32m1_t c_dup_lane = __riscv_vrgather_vx_i32m1(c, lane, 4);
  return __riscv_vmacc_vv_i32m1(a, b, c_dup_lane, 4);
}

// vmla_laneq_u16: d[i] = a[i] + b[i] * v[lane] for the 4 low uint16 lanes,
// where v is a 128-bit (8-lane) vector. Removed the stale commented-out
// prototype that duplicated this implementation.
FORCE_INLINE uint16x4_t vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t c, const int lane) {
  // Splat lane `lane` across all 8 source positions, accumulate on vl=4.
  vuint16m1_t c_dup_lane = __riscv_vrgather_vx_u16m1(c, lane, 8);
  return __riscv_vmacc_vv_u16m1(a, b, c_dup_lane, 4);
}

// vmlaq_laneq_u16: d[i] = a[i] + b[i] * v[lane] for all 8 uint16 lanes.
// Removed the stale commented-out prototype that duplicated this implementation.
FORCE_INLINE uint16x8_t vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c, const int lane) {
  // Splat lane `lane` of c, then vmacc over the full 8-lane vector.
  vuint16m1_t c_dup_lane = __riscv_vrgather_vx_u16m1(c, lane, 8);
  return __riscv_vmacc_vv_u16m1(a, b, c_dup_lane, 8);
}

// vmla_laneq_u32: d[i] = a[i] + b[i] * v[lane] for the 2 low uint32 lanes,
// where v is a 128-bit (4-lane) vector. Removed the stale commented-out
// prototype that duplicated this implementation.
FORCE_INLINE uint32x2_t vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t c, const int lane) {
  // Splat lane `lane` across all 4 source positions, accumulate on vl=2.
  vuint32m1_t c_dup_lane = __riscv_vrgather_vx_u32m1(c, lane, 4);
  return __riscv_vmacc_vv_u32m1(a, b, c_dup_lane, 2);
}

// vmlaq_laneq_u32: d[i] = a[i] + b[i] * v[lane] for all 4 uint32 lanes.
// Removed the stale commented-out prototype that duplicated this implementation.
FORCE_INLINE uint32x4_t vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c, const int lane) {
  // Splat lane `lane` of c, then vmacc over the full 4-lane vector.
  vuint32m1_t c_dup_lane = __riscv_vrgather_vx_u32m1(c, lane, 4);
  return __riscv_vmacc_vv_u32m1(a, b, c_dup_lane, 4);
}

// vmla_laneq_f32: d[i] = a[i] + b[i] * v[lane] for the 2 low float32 lanes,
// where v is a 128-bit (4-lane) vector. Removed the stale commented-out
// prototype that duplicated this implementation.
FORCE_INLINE float32x2_t vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t c, const int lane) {
  // Splat lane `lane` of c, then fused multiply-accumulate on vl=2.
  vfloat32m1_t c_dup_lane = __riscv_vrgather_vx_f32m1(c, lane, 4);
  return __riscv_vfmacc_vv_f32m1(a, b, c_dup_lane, 2);
}

// vmlaq_laneq_f32: d[i] = a[i] + b[i] * v[lane] for all 4 float32 lanes.
// Removed the stale commented-out prototype that duplicated this implementation.
FORCE_INLINE float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, const int lane) {
  // Splat lane `lane` of c, then fused multiply-accumulate on vl=4.
  vfloat32m1_t c_dup_lane = __riscv_vrgather_vx_f32m1(c, lane, 4);
  return __riscv_vfmacc_vv_f32m1(a, b, c_dup_lane, 4);
}

FORCE_INLINE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c, const int lane) {
vuint16m1_t c_dup_lane = __riscv_vrgather_vx_u16m1(c, lane, 8);
Expand Down
270 changes: 260 additions & 10 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32555,25 +32555,275 @@ result_t test_vmlaq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

// Validates vmla_laneq_s16 against a scalar reference for every lane index.
// Fix: removed the stale one-line `return TEST_UNIMPL;` definition that
// preceded this implementation and made the function doubly defined.
result_t test_vmla_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
  const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
  int16_t _d[4];
  int16x4_t a = vld1_s16(_a);
  int16x4_t b = vld1_s16(_b);
  int16x8_t c = vld1q_s16(_c);
  int16x4_t d;

// Scalar reference: d[i] = a[i] + b[i] * c[IDX], then compare against the
// intrinsic result. IDX is expanded by IMM_8_ITER for lanes 0..7.
#define TEST_IMPL(IDX)                    \
  for (int i = 0; i < 4; i++) {           \
    _d[i] = _a[i] + _b[i] * _c[IDX];      \
  }                                       \
  d = vmla_laneq_s16(a, b, c, IDX);       \
  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3]))

  IMM_8_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Checks vmlaq_laneq_s16: for every immediate lane 0..7, compare the
// intrinsic output with a scalar multiply-accumulate reference.
result_t test_vmlaq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
  const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
  int16x8_t a = vld1q_s16(_a);
  int16x8_t b = vld1q_s16(_b);
  int16x8_t c = vld1q_s16(_c);
  int16x8_t d;
  int16_t _d[8];

// Reference lane-by-lane computation, then a full-vector comparison.
#define TEST_IMPL(IDX)                 \
  for (int k = 0; k < 8; k++) {        \
    _d[k] = _a[k] + _b[k] * _c[IDX];   \
  }                                    \
  d = vmlaq_laneq_s16(a, b, c, IDX);   \
  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))

  IMM_8_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Validates vmla_laneq_s32 against a scalar reference for every lane index.
// Fix: removed a stray stale `test_vmlaq_laneq_s16 ... TEST_UNIMPL` one-liner
// that was interleaved inside this function body and broke the syntax.
result_t test_vmla_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
  const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
  int32_t _d[2];
  int32x2_t a = vld1_s32(_a);
  int32x2_t b = vld1_s32(_b);
  int32x4_t c = vld1q_s32(_c);
  int32x2_t d;

// Scalar reference vs. intrinsic; IDX covers lanes 0..3 via IMM_4_ITER.
#define TEST_IMPL(IDX)                    \
  for (int i = 0; i < 2; i++) {           \
    _d[i] = _a[i] + _b[i] * _c[IDX];      \
  }                                       \
  d = vmla_laneq_s32(a, b, c, IDX);       \
  CHECK_RESULT(validate_int32(d, _d[0], _d[1]))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Validates vmlaq_laneq_s32 against a scalar reference for every lane index.
// Fix: removed three stray stale `return TEST_UNIMPL;` one-liner definitions
// (vmla_laneq_s32, vmlaq_laneq_s32, vmla_laneq_u16) interleaved in this body.
result_t test_vmlaq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
  const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
  int32_t _d[4];
  int32x4_t a = vld1q_s32(_a);
  int32x4_t b = vld1q_s32(_b);
  int32x4_t c = vld1q_s32(_c);
  int32x4_t d;

// Scalar reference vs. intrinsic; IDX covers lanes 0..3 via IMM_4_ITER.
#define TEST_IMPL(IDX)                    \
  for (int i = 0; i < 4; i++) {           \
    _d[i] = _a[i] + _b[i] * _c[IDX];      \
  }                                       \
  d = vmlaq_laneq_s32(a, b, c, IDX);      \
  CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3]))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Validates vmla_laneq_u16 against a scalar reference for every lane index.
// Fix: removed four stray stale `return TEST_UNIMPL;` one-liner definitions
// (vmlaq_laneq_u16, vmla_laneq_u32, vmlaq_laneq_u32, vmla_laneq_f32) that
// were interleaved around and inside this body by the broken merge.
result_t test_vmla_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1;
  const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2;
  const uint16_t *_c = (const uint16_t *)impl.test_cases_int_pointer3;
  uint16_t _d[4];
  uint16x4_t a = vld1_u16(_a);
  uint16x4_t b = vld1_u16(_b);
  uint16x8_t c = vld1q_u16(_c);
  uint16x4_t d;

// Scalar reference vs. intrinsic; IDX covers lanes 0..7 via IMM_8_ITER
// (c is a full 128-bit, 8-lane vector).
#define TEST_IMPL(IDX)                    \
  for (int i = 0; i < 4; i++) {           \
    _d[i] = _a[i] + _b[i] * _c[IDX];      \
  }                                       \
  d = vmla_laneq_u16(a, b, c, IDX);       \
  CHECK_RESULT(validate_uint16(d, _d[0], _d[1], _d[2], _d[3]))

  IMM_8_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Validates vmlaq_laneq_u16 against a scalar reference for every lane index.
// Fix: removed the stray stale `test_vmlaq_laneq_f32 ... TEST_UNIMPL`
// one-liner that preceded this function in the broken merge.
result_t test_vmlaq_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1;
  const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2;
  const uint16_t *_c = (const uint16_t *)impl.test_cases_int_pointer3;
  uint16_t _d[8];
  uint16x8_t a = vld1q_u16(_a);
  uint16x8_t b = vld1q_u16(_b);
  uint16x8_t c = vld1q_u16(_c);
  uint16x8_t d;

// Scalar reference vs. intrinsic; IDX covers lanes 0..7 via IMM_8_ITER.
#define TEST_IMPL(IDX)                    \
  for (int i = 0; i < 8; i++) {           \
    _d[i] = _a[i] + _b[i] * _c[IDX];      \
  }                                       \
  d = vmlaq_laneq_u16(a, b, c, IDX);      \
  CHECK_RESULT(validate_uint16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))

  IMM_8_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Checks vmla_laneq_u32: for every immediate lane 0..3, compare the
// intrinsic output with a scalar multiply-accumulate reference.
result_t test_vmla_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1;
  const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2;
  const uint32_t *_c = (const uint32_t *)impl.test_cases_int_pointer3;
  uint32x2_t a = vld1_u32(_a);
  uint32x2_t b = vld1_u32(_b);
  uint32x4_t c = vld1q_u32(_c);
  uint32x2_t d;
  uint32_t _d[2];

// Reference lane-by-lane computation, then a full-vector comparison.
#define TEST_IMPL(IDX)                 \
  for (int k = 0; k < 2; k++) {        \
    _d[k] = _a[k] + _b[k] * _c[IDX];   \
  }                                    \
  d = vmla_laneq_u32(a, b, c, IDX);    \
  CHECK_RESULT(validate_uint32(d, _d[0], _d[1]))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Checks vmlaq_laneq_u32: for every immediate lane 0..3, compare the
// intrinsic output with a scalar multiply-accumulate reference.
result_t test_vmlaq_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1;
  const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2;
  const uint32_t *_c = (const uint32_t *)impl.test_cases_int_pointer3;
  uint32x4_t a = vld1q_u32(_a);
  uint32x4_t b = vld1q_u32(_b);
  uint32x4_t c = vld1q_u32(_c);
  uint32x4_t d;
  uint32_t _d[4];

// Reference lane-by-lane computation, then a full-vector comparison.
#define TEST_IMPL(IDX)                 \
  for (int k = 0; k < 4; k++) {        \
    _d[k] = _a[k] + _b[k] * _c[IDX];   \
  }                                    \
  d = vmlaq_laneq_u32(a, b, c, IDX);   \
  CHECK_RESULT(validate_uint32(d, _d[0], _d[1], _d[2], _d[3]))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Validates vmla_laneq_f32 against a scalar reference within a float
// tolerance, for every lane index 0..3.
// Fix: _d was declared `float _d[4]` although only 2 elements are ever
// written/read (the destination is a 2-lane vector) — sized to 2 for
// consistency with the other 2-lane tests (e.g. test_vmla_laneq_u32).
result_t test_vmla_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = (const float *)impl.test_cases_float_pointer1;
  const float *_b = (const float *)impl.test_cases_float_pointer2;
  const float *_c = (const float *)impl.test_cases_float_pointer3;
  float _d[2];
  float32x2_t a = vld1_f32(_a);
  float32x2_t b = vld1_f32(_b);
  float32x4_t c = vld1q_f32(_c);
  float32x2_t d;

// Scalar reference vs. intrinsic; float comparison uses a small epsilon
// because the RVV fused multiply-add may round differently.
#define TEST_IMPL(IDX)                    \
  for (int i = 0; i < 2; i++) {           \
    _d[i] = _a[i] + _b[i] * _c[IDX];      \
  }                                       \
  d = vmla_laneq_f32(a, b, c, IDX);       \
  CHECK_RESULT(validate_float_error(d, _d[0], _d[1], 0.0001f))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Checks vmlaq_laneq_f32: for every immediate lane 0..3, compare the
// intrinsic output with a scalar reference within a small float tolerance.
result_t test_vmlaq_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = (const float *)impl.test_cases_float_pointer1;
  const float *_b = (const float *)impl.test_cases_float_pointer2;
  const float *_c = (const float *)impl.test_cases_float_pointer3;
  float32x4_t a = vld1q_f32(_a);
  float32x4_t b = vld1q_f32(_b);
  float32x4_t c = vld1q_f32(_c);
  float32x4_t d;
  float _d[4];

// Reference lane-by-lane computation, then an epsilon-based comparison.
#define TEST_IMPL(IDX)                 \
  for (int k = 0; k < 4; k++) {        \
    _d[k] = _a[k] + _b[k] * _c[IDX];   \
  }                                    \
  d = vmlaq_laneq_f32(a, b, c, IDX);   \
  CHECK_RESULT(validate_float_error(d, _d[0], _d[1], _d[2], _d[3], 0.0001f))

  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

result_t test_vmlaq_lane_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
Expand Down
20 changes: 10 additions & 10 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1918,16 +1918,16 @@
_(vmlaq_lane_s16) \
_(vmlaq_lane_s32) \
_(vmlaq_lane_f32) \
/*_(vmla_laneq_s16) */ \
/*_(vmlaq_laneq_s16) */ \
/*_(vmla_laneq_s32) */ \
/*_(vmlaq_laneq_s32) */ \
/*_(vmla_laneq_u16) */ \
/*_(vmlaq_laneq_u16) */ \
/*_(vmla_laneq_u32) */ \
/*_(vmlaq_laneq_u32) */ \
/*_(vmla_laneq_f32) */ \
/*_(vmlaq_laneq_f32) */ \
_(vmla_laneq_s16) \
_(vmlaq_laneq_s16) \
_(vmla_laneq_s32) \
_(vmlaq_laneq_s32) \
_(vmla_laneq_u16) \
_(vmlaq_laneq_u16) \
_(vmla_laneq_u32) \
_(vmlaq_laneq_u32) \
_(vmla_laneq_f32) \
_(vmlaq_laneq_f32) \
_(vmlaq_lane_u16) \
_(vmlaq_lane_u32) \
_(vmlal_lane_s16) \
Expand Down
Loading