From 399b69fcdff56f267dd167525ac1bb9b4c59b298 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Thu, 21 Sep 2023 00:26:08 +0300 Subject: [PATCH] Optimizations for ARM NEON --- .../private/dsp/arch/arm/neon-d32/pmath/sqr.h | 120 ++++++++++++ .../dsp/arch/arm/neon-d32/pmath/ssqrt.h | 176 ++++++++++++++++++ src/main/arm/neon-d32.cpp | 7 + src/test/utest/pmath/sqr1.cpp | 4 +- src/test/utest/pmath/sqr2.cpp | 4 +- src/test/utest/pmath/ssqrt1.cpp | 4 +- src/test/utest/pmath/ssqrt2.cpp | 4 +- 7 files changed, 311 insertions(+), 8 deletions(-) create mode 100644 include/private/dsp/arch/arm/neon-d32/pmath/sqr.h create mode 100644 include/private/dsp/arch/arm/neon-d32/pmath/ssqrt.h diff --git a/include/private/dsp/arch/arm/neon-d32/pmath/sqr.h b/include/private/dsp/arch/arm/neon-d32/pmath/sqr.h new file mode 100644 index 00000000..0298fdae --- /dev/null +++ b/include/private/dsp/arch/arm/neon-d32/pmath/sqr.h @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2023 Linux Studio Plugins Project + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 20 сент. 2023 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_
+#define PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_
+
+#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
+    #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */
+
+namespace lsp
+{
+    namespace neon_d32
+    {
+        /*
+         * Assembly body shared by sqr1() and sqr2(): multiplies every float by
+         * itself, reading through the SRC operand and writing through the DST
+         * operand, for %[count] elements.
+         *
+         *   DST - name of the asm operand holding the store pointer; every store
+         *         uses writeback ("!"), so the pointer advances as data is written
+         *   SRC - name of the asm operand holding the load pointer
+         *   INC - writeback suffix appended to SRC in the loads: "!" when SRC is a
+         *         separate register, "" when SRC and DST name the same register
+         *         (the store writeback then advances the shared pointer)
+         *
+         * Data is processed in stages of 32, 16, 8, 4 and 1 element(s). Before
+         * each stage the counter is re-biased to (remaining - block size) and the
+         * stage is skipped when that value is negative; therefore a stage must
+         * subtract exactly the number of elements it has processed.
+         *
+         * Uses q0-q7 and modifies the condition flags.
+         */
+        #define SQR_CORE(DST, SRC, INC) \
+            /* 32x blocks: q0-q7 hold 8 * 4 = 32 floats per iteration */ \
+            __ASM_EMIT("subs %[count], #32") \
+            __ASM_EMIT("blo 2f") \
+            __ASM_EMIT("1:") \
+            __ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q7}") \
+            __ASM_EMIT("vmul.f32 q0, q0, q0") \
+            __ASM_EMIT("vmul.f32 q1, q1, q1") \
+            __ASM_EMIT("vmul.f32 q2, q2, q2") \
+            __ASM_EMIT("vmul.f32 q3, q3, q3") \
+            __ASM_EMIT("vmul.f32 q4, q4, q4") \
+            __ASM_EMIT("vmul.f32 q5, q5, q5") \
+            __ASM_EMIT("vmul.f32 q6, q6, q6") \
+            __ASM_EMIT("vmul.f32 q7, q7, q7") \
+            __ASM_EMIT("subs %[count], #32") /* 32 elements consumed per iteration */ \
+            __ASM_EMIT("vstm %[" DST "]!, {q0-q7}") /* vstm leaves the flags of subs intact */ \
+            __ASM_EMIT("bhs 1b") \
+            __ASM_EMIT("2:") \
+            /* 16x block */ \
+            __ASM_EMIT("adds %[count], #16") \
+            __ASM_EMIT("blt 4f") \
+            __ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q3}") \
+            __ASM_EMIT("vmul.f32 q0, q0, q0") \
+            __ASM_EMIT("vmul.f32 q1, q1, q1") \
+            __ASM_EMIT("vmul.f32 q2, q2, q2") \
+            __ASM_EMIT("vmul.f32 q3, q3, q3") \
+            __ASM_EMIT("subs %[count], #16") \
+            __ASM_EMIT("vstm %[" DST "]!, {q0-q3}") \
+            __ASM_EMIT("4:") \
+            /* 8x block */ \
+            __ASM_EMIT("adds %[count], #8") \
+            __ASM_EMIT("blt 6f") \
+            __ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q1}") \
+            __ASM_EMIT("vmul.f32 q0, q0, q0") \
+            __ASM_EMIT("vmul.f32 q1, q1, q1") \
+            __ASM_EMIT("subs %[count], #8") \
+            __ASM_EMIT("vstm %[" DST "]!, {q0-q1}") \
+            __ASM_EMIT("6:") \
+            /* 4x block */ \
+            __ASM_EMIT("adds %[count], #4") \
+            __ASM_EMIT("blt 8f") \
+            __ASM_EMIT("vldm %[" SRC "]" INC ", {q0}") \
+            __ASM_EMIT("vmul.f32 q0, q0, q0") \
+            __ASM_EMIT("subs %[count], #4") \
+            __ASM_EMIT("vstm %[" DST "]!, {q0}") \
+            __ASM_EMIT("8:") \
+            /* 1x block: one element per iteration, only lane 0 is stored */ \
+            __ASM_EMIT("adds %[count], #3") /* 4 - 3 */ \
+            __ASM_EMIT("blt 10f") \
+            __ASM_EMIT("9:") \
+            __ASM_EMIT("vld1.32 {d0[], d1[]}, [%[" SRC "]]" INC) \
+            __ASM_EMIT("vmul.f32 q0, q0, q0") \
+            __ASM_EMIT("subs %[count], #1") \
+            __ASM_EMIT("vst1.32 {d0[0]}, [%[" DST "]]!") \
+            __ASM_EMIT("bge 9b") \
+            __ASM_EMIT("10:")
+
+        /**
+         * Square every element of a buffer in place: dst[i] = dst[i] * dst[i].
+         *
+         * @param dst buffer that is both read and overwritten
+         * @param count number of float elements to process
+         */
+        void sqr1(float *dst, size_t count)
+        {
+            ARCH_ARM_ASM(
+                /* Same operand for loads and stores; the store writeback moves it */
+                SQR_CORE("dst", "dst", "")
+                : [dst] "+r" (dst), [count] "+r" (count)
+                :
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3",
+                  "q4", "q5", "q6", "q7"
+            );
+        }
+
+        /**
+         * Square every element of a source buffer: dst[i] = src[i] * src[i].
+         *
+         * @param dst destination buffer
+         * @param src source buffer
+         * @param count number of float elements to process
+         */
+        void sqr2(float *dst, const float *src, size_t count)
+        {
+            ARCH_ARM_ASM(
+                /* Separate load pointer, advanced by its own writeback */
+                SQR_CORE("dst", "src", "!")
+                : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+                :
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3",
+                  "q4", "q5", "q6", "q7"
+            );
+        }
+
+        #undef SQR_CORE
+
+    } /* namespace neon_d32 */
+} /* namespace lsp */
+
+
+#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_ */
diff --git a/include/private/dsp/arch/arm/neon-d32/pmath/ssqrt.h b/include/private/dsp/arch/arm/neon-d32/pmath/ssqrt.h
new file mode 100644
index 00000000..96f38f3c
--- /dev/null
+++ b/include/private/dsp/arch/arm/neon-d32/pmath/ssqrt.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2023 Linux Studio Plugins Project
+ * (C) 2023 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 20 Sep 2023
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_
+#define PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_
+
+#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
+    #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */
+
+namespace lsp
+{
+    namespace neon_d32
+    {
+        /*
+         * Assembly body shared by ssqrt1() and ssqrt2(): for %[count] floats read
+         * through the SRC operand, clamps the value from below with 0 (vmax) and
+         * stores an approximated square root through the DST operand.
+         *
+         * The square root of R is obtained as R * (1/sqrt(R)), where 1/sqrt(R)
+         * starts from the vrsqrte estimate x0 and is refined by two steps of
+         * x' = x * (3 - R * x * x) / 2 (vmul + vrsqrts + vmul).
+         *
+         *   DST - name of the asm operand holding the store pointer; every store
+         *         uses writeback ("!"), so the pointer advances as data is written
+         *   SRC - name of the asm operand holding the load pointer
+         *   INC - writeback suffix appended to SRC in the loads: "!" when SRC is a
+         *         separate register, "" when SRC and DST name the same register
+         *         (the store writeback then advances the shared pointer)
+         *
+         * Data is processed in stages of 16, 8, 4 and 1 element(s). Before each
+         * stage the counter is re-biased to (remaining - block size) and the stage
+         * is skipped when that value is negative; therefore a stage must subtract
+         * exactly the number of elements it has processed.
+         *
+         * Register usage: q0-q3 data, q4-q7 estimates, q8-q11 products,
+         * q12-q15 refinement factors; q14/q15 additionally serve as the zero
+         * constant for vmax, which is why the 16x loop re-zeroes them on every
+         * iteration and they are zeroed again after the loop.
+         *
+         * NOTE(review): for lanes clamped to 0 the estimate is presumably +Inf and
+         * the following product with R = 0 would then be NaN - please confirm the
+         * result for zero/negative input against the generic implementation.
+         */
+        #define SSQRT_CORE(DST, SRC, INC) \
+            /* 16x blocks */ \
+            __ASM_EMIT("subs %[count], #16") \
+            __ASM_EMIT("blo 2f") \
+            __ASM_EMIT("1:") \
+            __ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q3}") \
+            __ASM_EMIT("veor q14, q14, q14") /* q14 = 0 */ \
+            __ASM_EMIT("veor q15, q15, q15") /* q15 = 0 */ \
+            __ASM_EMIT("vmax.f32 q0, q0, q14") /* q0 = R = max(x, 0) */ \
+            __ASM_EMIT("vmax.f32 q1, q1, q15") \
+            __ASM_EMIT("vmax.f32 q2, q2, q14") \
+            __ASM_EMIT("vmax.f32 q3, q3, q15") \
+            __ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
+            __ASM_EMIT("vrsqrte.f32 q5, q1") \
+            __ASM_EMIT("vrsqrte.f32 q6, q2") \
+            __ASM_EMIT("vrsqrte.f32 q7, q3") \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
+            __ASM_EMIT("vmul.f32 q9, q5, q1") \
+            __ASM_EMIT("vmul.f32 q10, q6, q2") \
+            __ASM_EMIT("vmul.f32 q11, q7, q3") \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
+            __ASM_EMIT("vrsqrts.f32 q14, q10, q6") \
+            __ASM_EMIT("vrsqrts.f32 q15, q11, q7") \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vmul.f32 q5, q5, q13") \
+            __ASM_EMIT("vmul.f32 q6, q6, q14") \
+            __ASM_EMIT("vmul.f32 q7, q7, q15") \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
+            __ASM_EMIT("vmul.f32 q9, q5, q1") \
+            __ASM_EMIT("vmul.f32 q10, q6, q2") \
+            __ASM_EMIT("vmul.f32 q11, q7, q3") \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
+            __ASM_EMIT("vrsqrts.f32 q14, q10, q6") \
+            __ASM_EMIT("vrsqrts.f32 q15, q11, q7") \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vmul.f32 q5, q5, q13") \
+            __ASM_EMIT("vmul.f32 q6, q6, q14") \
+            __ASM_EMIT("vmul.f32 q7, q7, q15") \
+            __ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
+            __ASM_EMIT("vmul.f32 q1, q1, q5") \
+            __ASM_EMIT("vmul.f32 q2, q2, q6") \
+            __ASM_EMIT("vmul.f32 q3, q3, q7") \
+            __ASM_EMIT("subs %[count], #16") \
+            __ASM_EMIT("vstm %[" DST "]!, {q0-q3}") \
+            __ASM_EMIT("bhs 1b") \
+            __ASM_EMIT("2:") \
+            /* q14/q15 were overwritten inside the loop (or never set): zero them for the tail stages */ \
+            __ASM_EMIT("veor q14, q14, q14") \
+            __ASM_EMIT("veor q15, q15, q15") \
+            /* 8x block */ \
+            __ASM_EMIT("adds %[count], #8") \
+            __ASM_EMIT("blt 4f") \
+            __ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q1}") \
+            __ASM_EMIT("vmax.f32 q0, q0, q14") \
+            __ASM_EMIT("vmax.f32 q1, q1, q15") \
+            __ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
+            __ASM_EMIT("vrsqrte.f32 q5, q1") \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
+            __ASM_EMIT("vmul.f32 q9, q5, q1") \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vmul.f32 q5, q5, q13") \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
+            __ASM_EMIT("vmul.f32 q9, q5, q1") \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vmul.f32 q5, q5, q13") \
+            __ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
+            __ASM_EMIT("vmul.f32 q1, q1, q5") \
+            __ASM_EMIT("subs %[count], #8") /* q0-q1 = 8 elements consumed */ \
+            __ASM_EMIT("vstm %[" DST "]!, {q0-q1}") \
+            __ASM_EMIT("4:") \
+            /* 4x block */ \
+            __ASM_EMIT("adds %[count], #4") \
+            __ASM_EMIT("blt 6f") \
+            __ASM_EMIT("vldm %[" SRC "]" INC ", {q0}") \
+            __ASM_EMIT("vmax.f32 q0, q0, q14") \
+            __ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
+            __ASM_EMIT("subs %[count], #4") \
+            __ASM_EMIT("vstm %[" DST "]!, {q0}") \
+            __ASM_EMIT("6:") \
+            /* 1x block: one element per iteration, only lane 0 is stored */ \
+            __ASM_EMIT("adds %[count], #3") /* 4 - 3 */ \
+            __ASM_EMIT("blt 8f") \
+            __ASM_EMIT("7:") \
+            __ASM_EMIT("vld1.32 {d0[], d1[]}, [%[" SRC "]]" INC) \
+            __ASM_EMIT("vmax.f32 q0, q0, q14") \
+            __ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
+            __ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
+            __ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
+            __ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
+            __ASM_EMIT("subs %[count], #1") \
+            __ASM_EMIT("vst1.32 {d0[0]}, [%[" DST "]]!") \
+            __ASM_EMIT("bge 7b") \
+            __ASM_EMIT("8:")
+
+        /**
+         * Saturated square root computed in place: every element is clamped from
+         * below with 0 and replaced by the approximated square root of the result.
+         *
+         * @param dst buffer that is both read and overwritten
+         * @param count number of float elements to process
+         */
+        void ssqrt1(float *dst, size_t count)
+        {
+            ARCH_ARM_ASM(
+                /* Same operand for loads and stores; the store writeback moves it */
+                SSQRT_CORE("dst", "dst", "")
+                : [dst] "+r" (dst), [count] "+r" (count)
+                :
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3",
+                  "q4", "q5", "q6", "q7",
+                  "q8", "q9", "q10", "q11",
+                  "q12", "q13", "q14", "q15"
+            );
+        }
+
+        /**
+         * Saturated square root from source to destination: every source element
+         * is clamped from below with 0 and its approximated square root is stored
+         * to the destination buffer.
+         *
+         * @param dst destination buffer
+         * @param src source buffer
+         * @param count number of float elements to process
+         */
+        void ssqrt2(float *dst, const float *src, size_t count)
+        {
+            ARCH_ARM_ASM(
+                /* Separate load pointer, advanced by its own writeback */
+                SSQRT_CORE("dst", "src", "!")
+                : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+                :
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3",
+                  "q4", "q5", "q6", "q7",
+                  "q8", "q9", "q10", "q11",
+                  "q12", "q13", "q14", "q15"
+            );
+        }
+
+        #undef SSQRT_CORE
+
+    } /*
namespace neon_d32 */ +} /* namespace lsp */ + +#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_ */ diff --git a/src/main/arm/neon-d32.cpp b/src/main/arm/neon-d32.cpp index e4546763..b68616f1 100644 --- a/src/main/arm/neon-d32.cpp +++ b/src/main/arm/neon-d32.cpp @@ -78,6 +78,8 @@ #include #include #include + #include + #include #include #include #include @@ -294,6 +296,11 @@ EXPORT1(powvx1); EXPORT1(powvx2); + EXPORT1(sqr1); + EXPORT1(sqr2); + EXPORT1(ssqrt1); + EXPORT1(ssqrt2); + EXPORT1(h_sum); EXPORT1(h_abs_sum); EXPORT1(h_sqr_sum); diff --git a/src/test/utest/pmath/sqr1.cpp b/src/test/utest/pmath/sqr1.cpp index cbc32b5b..f9a1f9a6 100644 --- a/src/test/utest/pmath/sqr1.cpp +++ b/src/test/utest/pmath/sqr1.cpp @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", sqr1) IF_ARCH_X86(CALL(generic::sqr1, avx::sqr1, 32)); IF_ARCH_X86(CALL(generic::sqr1, avx512::sqr1, 64)); -// IF_ARCH_ARM(CALL(generic::sqr1, neon_d32::sqr1, 16)); -// + IF_ARCH_ARM(CALL(generic::sqr1, neon_d32::sqr1, 16)); + // IF_ARCH_AARCH64(CALL(generic::sqr1, asimd::sqr1, 16)); } UTEST_END diff --git a/src/test/utest/pmath/sqr2.cpp b/src/test/utest/pmath/sqr2.cpp index 8fdc1765..207d72ee 100644 --- a/src/test/utest/pmath/sqr2.cpp +++ b/src/test/utest/pmath/sqr2.cpp @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", sqr2) IF_ARCH_X86(CALL(generic::sqr2, avx::sqr2, 32)); IF_ARCH_X86(CALL(generic::sqr2, avx512::sqr2, 64)); -// IF_ARCH_ARM(CALL(generic::sqr2, neon_d32::sqr2, 16)); -// + IF_ARCH_ARM(CALL(generic::sqr2, neon_d32::sqr2, 16)); + // IF_ARCH_AARCH64(CALL(generic::sqr2, asimd::sqr2, 16)); } UTEST_END diff --git a/src/test/utest/pmath/ssqrt1.cpp b/src/test/utest/pmath/ssqrt1.cpp index dccebc2e..341008e0 100644 --- a/src/test/utest/pmath/ssqrt1.cpp +++ b/src/test/utest/pmath/ssqrt1.cpp @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", ssqrt1) IF_ARCH_X86(CALL(generic::ssqrt1, avx::ssqrt1, 32)); IF_ARCH_X86(CALL(generic::ssqrt1, avx512::ssqrt1, 64)); -// IF_ARCH_ARM(CALL(generic::ssqrt1, neon_d32::ssqrt1, 
16)); -// + IF_ARCH_ARM(CALL(generic::ssqrt1, neon_d32::ssqrt1, 16)); + // IF_ARCH_AARCH64(CALL(generic::ssqrt1, asimd::ssqrt1, 16)); } UTEST_END diff --git a/src/test/utest/pmath/ssqrt2.cpp b/src/test/utest/pmath/ssqrt2.cpp index e6ed5210..8f6b9303 100644 --- a/src/test/utest/pmath/ssqrt2.cpp +++ b/src/test/utest/pmath/ssqrt2.cpp @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", ssqrt2) IF_ARCH_X86(CALL(generic::ssqrt2, avx::ssqrt2, 32)); IF_ARCH_X86(CALL(generic::ssqrt2, avx512::ssqrt2, 64)); -// IF_ARCH_ARM(CALL(generic::ssqrt2, neon_d32::ssqrt2, 16)); -// + IF_ARCH_ARM(CALL(generic::ssqrt2, neon_d32::ssqrt2, 16)); + // IF_ARCH_AARCH64(CALL(generic::ssqrt2, asimd::ssqrt2, 16)); } UTEST_END