Skip to content

Commit

Permalink
Merge pull request #502 from howjmay/vmaxnm
Browse files Browse the repository at this point in the history
feat: Add v[max|min]nm[q]_[f32|f64]
  • Loading branch information
howjmay authored Aug 2, 2024
2 parents c235c2a + e3bd6a9 commit 1e64c4a
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 16 deletions.
64 changes: 56 additions & 8 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -3829,21 +3829,69 @@ FORCE_INLINE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b) {
return __riscv_vmerge_vvm_f64m1(vdupq_n_f64(NAN), max_res, mask, 2);
}

FORCE_INLINE float32x2_t vmaxnm_f32(float32x2_t a, float32x2_t b) { return __riscv_vfmax_vv_f32m1(a, b, 2); }
FORCE_INLINE float32x2_t vmaxnm_f32(float32x2_t a, float32x2_t b) {
vbool32_t a_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, 2);
vbool32_t b_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(b, b, 2);
float32x2_t a_replace = __riscv_vmerge_vvm_f32m1(b, a, a_non_nan_mask, 2);
float32x2_t b_replace = __riscv_vmerge_vvm_f32m1(a, b, b_non_nan_mask, 2);
return __riscv_vfmax_vv_f32m1(a_replace, b_replace, 2);
}

FORCE_INLINE float32x4_t vmaxnmq_f32(float32x4_t a, float32x4_t b) { return __riscv_vfmax_vv_f32m1(a, b, 4); }
FORCE_INLINE float32x4_t vmaxnmq_f32(float32x4_t a, float32x4_t b) {
vbool32_t a_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, 4);
vbool32_t b_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(b, b, 4);
float32x2_t a_replace = __riscv_vmerge_vvm_f32m1(b, a, a_non_nan_mask, 4);
float32x2_t b_replace = __riscv_vmerge_vvm_f32m1(a, b, b_non_nan_mask, 4);
return __riscv_vfmax_vv_f32m1(a_replace, b_replace, 4);
}

// FORCE_INLINE float64x1_t vmaxnm_f64(float64x1_t a, float64x1_t b);
FORCE_INLINE float64x1_t vmaxnm_f64(float64x1_t a, float64x1_t b) {
vbool64_t a_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, 1);
vbool64_t b_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(b, b, 1);
float64x2_t a_replace = __riscv_vmerge_vvm_f64m1(b, a, a_non_nan_mask, 1);
float64x2_t b_replace = __riscv_vmerge_vvm_f64m1(a, b, b_non_nan_mask, 1);
return __riscv_vfmax_vv_f64m1(a_replace, b_replace, 1);
}

// FORCE_INLINE float64x2_t vmaxnmq_f64(float64x2_t a, float64x2_t b);
FORCE_INLINE float64x2_t vmaxnmq_f64(float64x2_t a, float64x2_t b) {
vbool64_t a_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, 2);
vbool64_t b_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(b, b, 2);
float64x2_t a_replace = __riscv_vmerge_vvm_f64m1(b, a, a_non_nan_mask, 2);
float64x2_t b_replace = __riscv_vmerge_vvm_f64m1(a, b, b_non_nan_mask, 2);
return __riscv_vfmax_vv_f64m1(a_replace, b_replace, 2);
}

FORCE_INLINE float32x2_t vminnm_f32(float32x2_t a, float32x2_t b) { return __riscv_vfmin_vv_f32m1(a, b, 2); }
FORCE_INLINE float32x2_t vminnm_f32(float32x2_t a, float32x2_t b) {
vbool32_t a_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, 2);
vbool32_t b_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(b, b, 2);
float32x2_t a_replace = __riscv_vmerge_vvm_f32m1(b, a, a_non_nan_mask, 2);
float32x2_t b_replace = __riscv_vmerge_vvm_f32m1(a, b, b_non_nan_mask, 2);
return __riscv_vfmin_vv_f32m1(a_replace, b_replace, 2);
}

FORCE_INLINE float32x4_t vminnmq_f32(float32x4_t a, float32x4_t b) { return __riscv_vfmin_vv_f32m1(a, b, 4); }
FORCE_INLINE float32x4_t vminnmq_f32(float32x4_t a, float32x4_t b) {
vbool32_t a_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, 4);
vbool32_t b_non_nan_mask = __riscv_vmfeq_vv_f32m1_b32(b, b, 4);
float32x2_t a_replace = __riscv_vmerge_vvm_f32m1(b, a, a_non_nan_mask, 4);
float32x2_t b_replace = __riscv_vmerge_vvm_f32m1(a, b, b_non_nan_mask, 4);
return __riscv_vfmin_vv_f32m1(a_replace, b_replace, 4);
}

// FORCE_INLINE float64x1_t vminnm_f64(float64x1_t a, float64x1_t b);
FORCE_INLINE float64x1_t vminnm_f64(float64x1_t a, float64x1_t b) {
vbool64_t a_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, 1);
vbool64_t b_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(b, b, 1);
float64x2_t a_replace = __riscv_vmerge_vvm_f64m1(b, a, a_non_nan_mask, 1);
float64x2_t b_replace = __riscv_vmerge_vvm_f64m1(a, b, b_non_nan_mask, 1);
return __riscv_vfmin_vv_f64m1(a_replace, b_replace, 1);
}

// FORCE_INLINE float64x2_t vminnmq_f64(float64x2_t a, float64x2_t b);
FORCE_INLINE float64x2_t vminnmq_f64(float64x2_t a, float64x2_t b) {
vbool64_t a_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, 2);
vbool64_t b_non_nan_mask = __riscv_vmfeq_vv_f64m1_b64(b, b, 2);
float64x2_t a_replace = __riscv_vmerge_vvm_f64m1(b, a, a_non_nan_mask, 2);
float64x2_t b_replace = __riscv_vmerge_vvm_f64m1(a, b, b_non_nan_mask, 2);
return __riscv_vfmin_vv_f64m1(a_replace, b_replace, 2);
}

FORCE_INLINE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b) { return __riscv_vmaxu_vv_u8m1(a, b, 16); }

Expand Down
72 changes: 68 additions & 4 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14080,9 +14080,41 @@ result_t test_vmaxnmq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

result_t test_vmaxnm_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vmaxnm_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (const double *)impl.test_cases_float_pointer1;
const double *_b = (const double *)impl.test_cases_float_pointer2;
double _c[1];
for (int i = 0; i < 1; i++) {
_c[i] = _a[i] > _b[i] ? _a[i] : _b[i];
}

float64x1_t a = vld1_f64(_a);
float64x1_t b = vld1_f64(_b);
float64x1_t c = vmaxnm_f64(a, b);
return validate_double(c, _c[0]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmaxnmq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vmaxnmq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (const double *)impl.test_cases_float_pointer1;
const double *_b = (const double *)impl.test_cases_float_pointer2;
double _c[2];
for (int i = 0; i < 2; i++) {
_c[i] = _a[i] > _b[i] ? _a[i] : _b[i];
}

float64x2_t a = vld1q_f64(_a);
float64x2_t b = vld1q_f64(_b);
float64x2_t c = vmaxnmq_f64(a, b);
return validate_double(c, _c[0], _c[1]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vminnm_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
Expand Down Expand Up @@ -14120,9 +14152,41 @@ result_t test_vminnmq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

result_t test_vminnm_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vminnm_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (const double *)impl.test_cases_float_pointer1;
const double *_b = (const double *)impl.test_cases_float_pointer2;
double _c[1];
for (int i = 0; i < 1; i++) {
_c[i] = _a[i] < _b[i] ? _a[i] : _b[i];
}

float64x1_t a = vld1_f64(_a);
float64x1_t b = vld1_f64(_b);
float64x1_t c = vminnm_f64(a, b);
return validate_double(c, _c[0]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vminnmq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vminnmq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (const double *)impl.test_cases_float_pointer1;
const double *_b = (const double *)impl.test_cases_float_pointer2;
double _c[2];
for (int i = 0; i < 2; i++) {
_c[i] = _a[i] < _b[i] ? _a[i] : _b[i];
}

float64x2_t a = vld1q_f64(_a);
float64x2_t b = vld1q_f64(_b);
float64x2_t c = vminnmq_f64(a, b);
return validate_double(c, _c[0], _c[1]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmaxq_u8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
Expand Down
8 changes: 4 additions & 4 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -776,12 +776,12 @@
_(vmaxq_f64) \
_(vmaxnm_f32) \
_(vmaxnmq_f32) \
/*_(vmaxnm_f64) */ \
/*_(vmaxnmq_f64) */ \
_(vmaxnm_f64) \
_(vmaxnmq_f64) \
_(vminnm_f32) \
_(vminnmq_f32) \
/*_(vminnm_f64) */ \
/*_(vminnmq_f64) */ \
_(vminnm_f64) \
_(vminnmq_f64) \
_(vmaxq_u8) \
_(vmaxq_u16) \
_(vmaxq_u32) \
Expand Down

0 comments on commit 1e64c4a

Please sign in to comment.