howjmay · howjmay · Jan 20, 2024 · Apr 3, 2024
diff --git a/neon2rvv.h b/neon2rvv.h
@@ -5853,7 +5853,15 @@ FORCE_INLINE uint8x16_t vcntq_u8(uint8x16_t a) {
 
 FORCE_INLINE float32x2_t vrecpe_f32(float32x2_t a) { return __riscv_vfrec7_v_f32m1(a, 2); }
 
-// FORCE_INLINE uint32x2_t vrecpe_u32(uint32x2_t a);
+FORCE_INLINE uint32x2_t vrecpe_u32(uint32x2_t a) {
+  // Arm URECPE: a lane with bit 31 clear saturates to all-ones; otherwise x = a[31:23]
+  // (256..511) gives r = ((2^19 / (2x + 1)) + 1) / 2, and the lane result is r << 23.
+  vbool32_t msb_clear = __riscv_vmsltu_vx_u32m1_b32(a, 0x80000000, 2);
+  vuint32m1_t odd = __riscv_vor_vx_u32m1(__riscv_vsrl_vx_u32m1(a, 22, 2), 1, 2);  // 2x + 1
+  vuint32m1_t quot = __riscv_vdivu_vv_u32m1(__riscv_vmv_v_x_u32m1(1u << 19, 2), odd, 2);
+  vuint32m1_t est = __riscv_vsrl_vx_u32m1(__riscv_vadd_vx_u32m1(quot, 1, 2), 1, 2);
+  return __riscv_vmerge_vxm_u32m1(__riscv_vsll_vx_u32m1(est, 23, 2), UINT32_MAX, msb_clear, 2);
+}
 
 FORCE_INLINE float32x4_t vrecpeq_f32(float32x4_t a) { return __riscv_vfrec7_v_f32m1(a, 4); }
 

diff --git a/tests/impl.cpp b/tests/impl.cpp
@@ -18881,7 +18881,36 @@ result_t test_vrecpe_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif  // ENABLE_TEST_ALL
 }
 
-result_t test_vrecpe_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vrecpe_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // Checks vrecpe_u32 against a scalar model of the Arm URECPE instruction
+  // (pseudocode UnsignedRecipEstimate() / RecipEstimate() in the Arm ARM).
+  // refer: https://community.arm.com/support-forums/f/armds-forum/930/division-with-neon
+  // https://www.cnblogs.com/pepetang/p/7777243.html
+  const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1;
+  // Expected per-lane results produced by the scalar model below.
+  uint32_t _c[2];
+  const uint32_t sign_bit = 0x80000000;
+  for (int i = 0; i < 2; i++) {
+    if (!(_a[i] & sign_bit)) {
+      // Bit 31 clear: the pseudocode returns all-ones for such operands.
+      _c[i] = UINT32_MAX;
+    } else {
+      // x is the top nine bits of the operand, so 256 <= x < 512.
+      uint32_t x = _a[i] >> 23;
+      // Round to nearest on the way in (2x + 1) and on the way out ((q + 1) / 2);
+      // the divisor is at least 513, so the division is always defined.
+      uint32_t q = (1u << 19) / (x * 2 + 1);
+      uint32_t r = (q + 1) / 2;
+      // r is 256..511; it fills bits 31:23 of the result and the low 23 bits are zero.
+      _c[i] = r << 23;
+    }
+  }
+
+  // Run the intrinsic on the same two lanes and require an exact match.
+  uint32x2_t a = vld1_u32(_a);
+  uint32x2_t c = vrecpe_u32(a);
+  return validate_uint32(c, _c[0], _c[1]);
+}
 
 result_t test_vrecpeq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL