Skip to content

Commit

Permalink
Optimizations for ARM NEON
Browse files Browse the repository at this point in the history
  • Loading branch information
sadko4u committed Sep 20, 2023
1 parent 09b6bd1 commit c5aace4
Show file tree
Hide file tree
Showing 7 changed files with 311 additions and 8 deletions.
120 changes: 120 additions & 0 deletions include/private/dsp/arch/arm/neon-d32/pmath/sqr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 20 Sep 2023
*
* lsp-dsp-lib is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* lsp-dsp-lib is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_
#define PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
#error "This header should not be included directly"
#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */

namespace lsp
{
namespace neon_d32
{
/*
 * SQR_CORE: dst[i] = src[i] * src[i] for %[count] floats.
 *
 * DST/SRC are the names of the pointer operands of the enclosing asm block;
 * INC is "!" (post-increment on load) for the two-pointer variant, or ""
 * when SRC aliases DST (in-place variant: the store advances the pointer).
 *
 * Unrolling: 32-element main loop, then single 16/8/4-element tail blocks,
 * then a scalar loop. Each stage biases %[count] before testing
 * (subs/adds pattern), so %[count] is negative on exit.
 */
#define SQR_CORE(DST, SRC, INC) \
/* 32x blocks */ \
__ASM_EMIT("subs %[count], #32") \
__ASM_EMIT("blo 2f") \
__ASM_EMIT("1:") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q7}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("vmul.f32 q1, q1, q1") \
__ASM_EMIT("vmul.f32 q2, q2, q2") \
__ASM_EMIT("vmul.f32 q3, q3, q3") \
__ASM_EMIT("vmul.f32 q4, q4, q4") \
__ASM_EMIT("vmul.f32 q5, q5, q5") \
__ASM_EMIT("vmul.f32 q6, q6, q6") \
__ASM_EMIT("vmul.f32 q7, q7, q7") \
__ASM_EMIT("subs %[count], #32") /* FIX: was #16 — loop consumes 32 floats (q0-q7); under-counting made the tail blocks run past the buffer */ \
__ASM_EMIT("vstm %[" DST "]!, {q0-q7}") \
__ASM_EMIT("bhs 1b") \
__ASM_EMIT("2:") \
/* 16x block */ \
__ASM_EMIT("adds %[count], #16") \
__ASM_EMIT("blt 4f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q3}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("vmul.f32 q1, q1, q1") \
__ASM_EMIT("vmul.f32 q2, q2, q2") \
__ASM_EMIT("vmul.f32 q3, q3, q3") \
__ASM_EMIT("subs %[count], #16") \
__ASM_EMIT("vstm %[" DST "]!, {q0-q3}") \
__ASM_EMIT("4:") \
/* 8x block */ \
__ASM_EMIT("adds %[count], #8") \
__ASM_EMIT("blt 6f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q1}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("vmul.f32 q1, q1, q1") \
__ASM_EMIT("subs %[count], #8") \
__ASM_EMIT("vstm %[" DST "]!, {q0-q1}") \
__ASM_EMIT("6:") \
/* 4x block */ \
__ASM_EMIT("adds %[count], #4") \
__ASM_EMIT("blt 8f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("subs %[count], #4") \
__ASM_EMIT("vstm %[" DST "]!, {q0}") \
__ASM_EMIT("8:") \
/* 1x block */ \
__ASM_EMIT("adds %[count], #3") /* 4 - 3 */ \
__ASM_EMIT("blt 10f") \
__ASM_EMIT("9:") \
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[" SRC "]]" INC) \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("subs %[count], #1") \
__ASM_EMIT("vst1.32 {d0[0]}, [%[" DST "]]!") \
__ASM_EMIT("bge 9b") \
__ASM_EMIT("10:")

/*
 * In-place square: dst[i] = dst[i] * dst[i] for count elements.
 * Passes the same operand as both DST and SRC with an empty INC,
 * so only the store ("!") advances the pointer between loads.
 */
void sqr1(float *dst, size_t count)
{
ARCH_ARM_ASM(
SQR_CORE("dst", "dst", "")
: [dst] "+r" (dst), [count] "+r" (count) /* both are read and updated by the asm */
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7"
);
}

/*
 * Two-buffer square: dst[i] = src[i] * src[i] for count elements.
 * INC is "!", so loads post-increment src; stores advance dst.
 */
void sqr1(float *dst, const float *src, size_t count)
{
ARCH_ARM_ASM(
SQR_CORE("dst", "src", "!")
: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7"
);
}

#undef SQR_CORE /* keep the helper macro local to this header */

} /* namespace neon_d32 */
} /* namespace lsp */


#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_ */
176 changes: 176 additions & 0 deletions include/private/dsp/arch/arm/neon-d32/pmath/ssqrt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/*
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 20 Sep 2023
*
* lsp-dsp-lib is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* lsp-dsp-lib is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_
#define PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
#error "This header should not be included directly"
#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */

namespace lsp
{
namespace neon_d32
{
/*
 * SSQRT_CORE: dst[i] = sqrt(max(src[i], 0)) for %[count] floats.
 *
 * Negative inputs are clamped to zero with vmax against zeroed q14/q15,
 * then sqrt(R) is computed as R * (1/sqrt(R)): a vrsqrte estimate refined
 * by two Newton-Raphson steps via vrsqrts.
 * NOTE(review): for R == 0 vrsqrte produces +Inf, so the refinement chain
 * multiplies 0 by Inf; confirm the result for exact zeros matches the
 * generic ssqrt implementation in the unit tests.
 *
 * Unrolling: 16-element main loop, then 8/4-element tails, then a scalar
 * loop. q14/q15 are re-zeroed inside the 16x loop (vrsqrts clobbers them)
 * and once more after it for the tail blocks.
 */
#define SSQRT_CORE(DST, SRC, INC) \
/* 16x blocks */ \
__ASM_EMIT("subs %[count], #16") \
__ASM_EMIT("blo 2f") \
__ASM_EMIT("1:") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q3}") \
__ASM_EMIT("veor q14, q14, q14") \
__ASM_EMIT("veor q15, q15, q15") \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vmax.f32 q1, q1, q15") \
__ASM_EMIT("vmax.f32 q2, q2, q14") \
__ASM_EMIT("vmax.f32 q3, q3, q15") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vrsqrte.f32 q5, q1") \
__ASM_EMIT("vrsqrte.f32 q6, q2") \
__ASM_EMIT("vrsqrte.f32 q7, q3") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vmul.f32 q10, q6, q2") \
__ASM_EMIT("vmul.f32 q11, q7, q3") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vrsqrts.f32 q14, q10, q6") \
__ASM_EMIT("vrsqrts.f32 q15, q11, q7") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q6, q6, q14") \
__ASM_EMIT("vmul.f32 q7, q7, q15") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vmul.f32 q10, q6, q2") \
__ASM_EMIT("vmul.f32 q11, q7, q3") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vrsqrts.f32 q14, q10, q6") \
__ASM_EMIT("vrsqrts.f32 q15, q11, q7") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q6, q6, q14") \
__ASM_EMIT("vmul.f32 q7, q7, q15") \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("vmul.f32 q1, q1, q5") \
__ASM_EMIT("vmul.f32 q2, q2, q6") \
__ASM_EMIT("vmul.f32 q3, q3, q7") \
__ASM_EMIT("subs %[count], #16") \
__ASM_EMIT("vstm %[" DST "]!, {q0-q3}") \
__ASM_EMIT("bhs 1b") \
__ASM_EMIT("2:") \
__ASM_EMIT("veor q14, q14, q14") /* re-zero clamps for the tail blocks */ \
__ASM_EMIT("veor q15, q15, q15") \
/* 8x block */ \
__ASM_EMIT("adds %[count], #8") \
__ASM_EMIT("blt 4f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q1}") \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vmax.f32 q1, q1, q15") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vrsqrte.f32 q5, q1") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("vmul.f32 q1, q1, q5") \
__ASM_EMIT("subs %[count], #8") /* FIX: was #4 — this block consumes 8 floats; the tail then re-processed 4 elements */ \
__ASM_EMIT("vstm %[" DST "]!, {q0-q1}") \
__ASM_EMIT("4:") /* FIX: was a duplicate "6:" — "blt 4f" above had no target and the block failed to assemble */ \
/* 4x block */ \
__ASM_EMIT("adds %[count], #4") \
__ASM_EMIT("blt 6f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0}") \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("subs %[count], #4") \
__ASM_EMIT("vstm %[" DST "]!, {q0}") \
__ASM_EMIT("6:") \
/* 1x block */ \
__ASM_EMIT("adds %[count], #3") /* 4 - 3 */ \
__ASM_EMIT("blt 8f") \
__ASM_EMIT("7:") \
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[" SRC "]]" INC) \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("subs %[count], #1") \
__ASM_EMIT("vst1.32 {d0[0]}, [%[" DST "]]!") \
__ASM_EMIT("bge 7b") \
__ASM_EMIT("8:")

/*
 * In-place saturated square root: dst[i] = sqrt(max(dst[i], 0)).
 * Same operand used as DST and SRC with an empty INC: only the store
 * ("!") advances the pointer between loads.
 */
void ssqrt1(float *dst, size_t count)
{
ARCH_ARM_ASM(
SSQRT_CORE("dst", "dst", "")
: [dst] "+r" (dst), [count] "+r" (count) /* both are read and updated by the asm */
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
}

/*
 * Two-buffer saturated square root: dst[i] = sqrt(max(src[i], 0)).
 * INC is "!", so loads post-increment src; stores advance dst.
 */
void ssqrt1(float *dst, const float *src, size_t count)
{
ARCH_ARM_ASM(
SSQRT_CORE("dst", "src", "!")
: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
}

#undef SSQRT_CORE /* FIX: was "#undef SQR_CORE" (copy-paste from sqr.h) — the macro defined in this header leaked into every translation unit including it */

} /* namespace neon_d32 */
} /* namespace lsp */

#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_ */
7 changes: 7 additions & 0 deletions src/main/arm/neon-d32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@
#include <private/dsp/arch/arm/neon-d32/pmath/op_kx.h>
#include <private/dsp/arch/arm/neon-d32/pmath/op_vv.h>
#include <private/dsp/arch/arm/neon-d32/pmath/pow.h>
#include <private/dsp/arch/arm/neon-d32/pmath/sqr.h>
#include <private/dsp/arch/arm/neon-d32/pmath/ssqrt.h>
#include <private/dsp/arch/arm/neon-d32/resampling.h>
#include <private/dsp/arch/arm/neon-d32/search/iminmax.h>
#include <private/dsp/arch/arm/neon-d32/search/minmax.h>
Expand Down Expand Up @@ -294,6 +296,11 @@
EXPORT1(powvx1);
EXPORT1(powvx2);

EXPORT1(sqr1);
EXPORT1(sqr2);
EXPORT1(ssqrt1);
EXPORT1(ssqrt2);

EXPORT1(h_sum);
EXPORT1(h_abs_sum);
EXPORT1(h_sqr_sum);
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/sqr1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", sqr1)
IF_ARCH_X86(CALL(generic::sqr1, avx::sqr1, 32));
IF_ARCH_X86(CALL(generic::sqr1, avx512::sqr1, 64));

// IF_ARCH_ARM(CALL(generic::sqr1, neon_d32::sqr1, 16));
//
IF_ARCH_ARM(CALL(generic::sqr1, neon_d32::sqr1, 16));

// IF_ARCH_AARCH64(CALL(generic::sqr1, asimd::sqr1, 16));
}
UTEST_END
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/sqr2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", sqr2)
IF_ARCH_X86(CALL(generic::sqr2, avx::sqr2, 32));
IF_ARCH_X86(CALL(generic::sqr2, avx512::sqr2, 64));

// IF_ARCH_ARM(CALL(generic::sqr2, neon_d32::sqr2, 16));
//
IF_ARCH_ARM(CALL(generic::sqr2, neon_d32::sqr2, 16));

// IF_ARCH_AARCH64(CALL(generic::sqr2, asimd::sqr2, 16));
}
UTEST_END
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/ssqrt1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", ssqrt1)
IF_ARCH_X86(CALL(generic::ssqrt1, avx::ssqrt1, 32));
IF_ARCH_X86(CALL(generic::ssqrt1, avx512::ssqrt1, 64));

// IF_ARCH_ARM(CALL(generic::ssqrt1, neon_d32::ssqrt1, 16));
//
IF_ARCH_ARM(CALL(generic::ssqrt1, neon_d32::ssqrt1, 16));

// IF_ARCH_AARCH64(CALL(generic::ssqrt1, asimd::ssqrt1, 16));
}
UTEST_END
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/ssqrt2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", ssqrt2)
IF_ARCH_X86(CALL(generic::ssqrt2, avx::ssqrt2, 32));
IF_ARCH_X86(CALL(generic::ssqrt2, avx512::ssqrt2, 64));

// IF_ARCH_ARM(CALL(generic::ssqrt2, neon_d32::ssqrt2, 16));
//
IF_ARCH_ARM(CALL(generic::ssqrt2, neon_d32::ssqrt2, 16));

// IF_ARCH_AARCH64(CALL(generic::ssqrt2, asimd::ssqrt2, 16));
}
UTEST_END
Expand Down

0 comments on commit c5aace4

Please sign in to comment.