Skip to content

Commit

Permalink
Optimizations for ARM NEON
Browse files Browse the repository at this point in the history
  • Loading branch information
sadko4u committed Sep 20, 2023
1 parent 09b6bd1 commit c5aace4
Show file tree
Hide file tree
Showing 7 changed files with 311 additions and 8 deletions.
120 changes: 120 additions & 0 deletions include/private/dsp/arch/arm/neon-d32/pmath/sqr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 20 Sep 2023
*
* lsp-dsp-lib is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* lsp-dsp-lib is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_
#define PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
#error "This header should not be included directly"
#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */

namespace lsp
{
namespace neon_d32
{
/*
 * SQR_CORE: dst[i] = src[i] * src[i] for %[count] floats.
 *
 * DST/SRC are the names of the pointer operands of the enclosing asm block;
 * INC is "!" (post-increment on load) for the two-pointer variant, or ""
 * when SRC aliases DST (in-place variant: the store advances the pointer).
 *
 * Unrolling: 32-element main loop, then single 16/8/4-element tail blocks,
 * then a scalar loop. Each stage biases %[count] before testing
 * (subs/adds pattern), so %[count] is negative on exit.
 */
#define SQR_CORE(DST, SRC, INC) \
/* 32x blocks */ \
__ASM_EMIT("subs %[count], #32") \
__ASM_EMIT("blo 2f") \
__ASM_EMIT("1:") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q7}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("vmul.f32 q1, q1, q1") \
__ASM_EMIT("vmul.f32 q2, q2, q2") \
__ASM_EMIT("vmul.f32 q3, q3, q3") \
__ASM_EMIT("vmul.f32 q4, q4, q4") \
__ASM_EMIT("vmul.f32 q5, q5, q5") \
__ASM_EMIT("vmul.f32 q6, q6, q6") \
__ASM_EMIT("vmul.f32 q7, q7, q7") \
__ASM_EMIT("subs %[count], #32") /* FIX: was #16 — loop consumes 32 floats (q0-q7); under-counting made the tail blocks run past the buffer */ \
__ASM_EMIT("vstm %[" DST "]!, {q0-q7}") \
__ASM_EMIT("bhs 1b") \
__ASM_EMIT("2:") \
/* 16x block */ \
__ASM_EMIT("adds %[count], #16") \
__ASM_EMIT("blt 4f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q3}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("vmul.f32 q1, q1, q1") \
__ASM_EMIT("vmul.f32 q2, q2, q2") \
__ASM_EMIT("vmul.f32 q3, q3, q3") \
__ASM_EMIT("subs %[count], #16") \
__ASM_EMIT("vstm %[" DST "]!, {q0-q3}") \
__ASM_EMIT("4:") \
/* 8x block */ \
__ASM_EMIT("adds %[count], #8") \
__ASM_EMIT("blt 6f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q1}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("vmul.f32 q1, q1, q1") \
__ASM_EMIT("subs %[count], #8") \
__ASM_EMIT("vstm %[" DST "]!, {q0-q1}") \
__ASM_EMIT("6:") \
/* 4x block */ \
__ASM_EMIT("adds %[count], #4") \
__ASM_EMIT("blt 8f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0}") \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("subs %[count], #4") \
__ASM_EMIT("vstm %[" DST "]!, {q0}") \
__ASM_EMIT("8:") \
/* 1x block */ \
__ASM_EMIT("adds %[count], #3") /* 4 - 3 */ \
__ASM_EMIT("blt 10f") \
__ASM_EMIT("9:") \
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[" SRC "]]" INC) \
__ASM_EMIT("vmul.f32 q0, q0, q0") \
__ASM_EMIT("subs %[count], #1") \
__ASM_EMIT("vst1.32 {d0[0]}, [%[" DST "]]!") \
__ASM_EMIT("bge 9b") \
__ASM_EMIT("10:")

/*
 * In-place square: dst[i] = dst[i] * dst[i] for count elements.
 * Passes the same operand as both DST and SRC with an empty INC,
 * so only the store ("!") advances the pointer between loads.
 */
void sqr1(float *dst, size_t count)
{
ARCH_ARM_ASM(
SQR_CORE("dst", "dst", "")
: [dst] "+r" (dst), [count] "+r" (count) /* both are read and updated by the asm */
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7"
);
}

/*
 * Two-buffer square: dst[i] = src[i] * src[i] for count elements.
 * INC is "!", so loads post-increment src; stores advance dst.
 */
void sqr1(float *dst, const float *src, size_t count)
{
ARCH_ARM_ASM(
SQR_CORE("dst", "src", "!")
: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7"
);
}

#undef SQR_CORE /* keep the helper macro local to this header */

} /* namespace neon_d32 */
} /* namespace lsp */


#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SQR_H_ */
176 changes: 176 additions & 0 deletions include/private/dsp/arch/arm/neon-d32/pmath/ssqrt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/*
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 20 Sep 2023
*
* lsp-dsp-lib is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* lsp-dsp-lib is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_
#define PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_

#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
#error "This header should not be included directly"
#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */

namespace lsp
{
namespace neon_d32
{
/*
 * SSQRT_CORE: dst[i] = sqrt(max(src[i], 0)) for %[count] floats.
 *
 * Negative inputs are clamped to zero with vmax against zeroed q14/q15,
 * then sqrt(R) is computed as R * (1/sqrt(R)): a vrsqrte estimate refined
 * by two Newton-Raphson steps via vrsqrts.
 * NOTE(review): for R == 0 vrsqrte produces +Inf, so the refinement chain
 * multiplies 0 by Inf; confirm the result for exact zeros matches the
 * generic ssqrt implementation in the unit tests.
 *
 * Unrolling: 16-element main loop, then 8/4-element tails, then a scalar
 * loop. q14/q15 are re-zeroed inside the 16x loop (vrsqrts clobbers them)
 * and once more after it for the tail blocks.
 */
#define SSQRT_CORE(DST, SRC, INC) \
/* 16x blocks */ \
__ASM_EMIT("subs %[count], #16") \
__ASM_EMIT("blo 2f") \
__ASM_EMIT("1:") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q3}") \
__ASM_EMIT("veor q14, q14, q14") \
__ASM_EMIT("veor q15, q15, q15") \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vmax.f32 q1, q1, q15") \
__ASM_EMIT("vmax.f32 q2, q2, q14") \
__ASM_EMIT("vmax.f32 q3, q3, q15") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vrsqrte.f32 q5, q1") \
__ASM_EMIT("vrsqrte.f32 q6, q2") \
__ASM_EMIT("vrsqrte.f32 q7, q3") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vmul.f32 q10, q6, q2") \
__ASM_EMIT("vmul.f32 q11, q7, q3") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vrsqrts.f32 q14, q10, q6") \
__ASM_EMIT("vrsqrts.f32 q15, q11, q7") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q6, q6, q14") \
__ASM_EMIT("vmul.f32 q7, q7, q15") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vmul.f32 q10, q6, q2") \
__ASM_EMIT("vmul.f32 q11, q7, q3") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vrsqrts.f32 q14, q10, q6") \
__ASM_EMIT("vrsqrts.f32 q15, q11, q7") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q6, q6, q14") \
__ASM_EMIT("vmul.f32 q7, q7, q15") \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("vmul.f32 q1, q1, q5") \
__ASM_EMIT("vmul.f32 q2, q2, q6") \
__ASM_EMIT("vmul.f32 q3, q3, q7") \
__ASM_EMIT("subs %[count], #16") \
__ASM_EMIT("vstm %[" DST "]!, {q0-q3}") \
__ASM_EMIT("bhs 1b") \
__ASM_EMIT("2:") \
__ASM_EMIT("veor q14, q14, q14") /* re-zero clamps for the tail blocks */ \
__ASM_EMIT("veor q15, q15, q15") \
/* 8x block */ \
__ASM_EMIT("adds %[count], #8") \
__ASM_EMIT("blt 4f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0-q1}") \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vmax.f32 q1, q1, q15") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vrsqrte.f32 q5, q1") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vmul.f32 q9, q5, q1") \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vrsqrts.f32 q13, q9, q5") \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q5, q5, q13") \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("vmul.f32 q1, q1, q5") \
__ASM_EMIT("subs %[count], #8") /* FIX: was #4 — this block consumes 8 floats; the tail then re-processed 4 elements */ \
__ASM_EMIT("vstm %[" DST "]!, {q0-q1}") \
__ASM_EMIT("4:") /* FIX: was a duplicate "6:" — "blt 4f" above had no target and the block failed to assemble */ \
/* 4x block */ \
__ASM_EMIT("adds %[count], #4") \
__ASM_EMIT("blt 6f") \
__ASM_EMIT("vldm %[" SRC "]" INC ", {q0}") \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("subs %[count], #4") \
__ASM_EMIT("vstm %[" DST "]!, {q0}") \
__ASM_EMIT("6:") \
/* 1x block */ \
__ASM_EMIT("adds %[count], #3") /* 4 - 3 */ \
__ASM_EMIT("blt 8f") \
__ASM_EMIT("7:") \
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[" SRC "]]" INC) \
__ASM_EMIT("vmax.f32 q0, q0, q14") \
__ASM_EMIT("vrsqrte.f32 q4, q0") /* q4 = x0 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x0 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ \
__ASM_EMIT("vmul.f32 q8, q4, q0") /* q8 = R * x1 */ \
__ASM_EMIT("vrsqrts.f32 q12, q8, q4") /* q12 = (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = 1/sqrt(R) = x2 = x1 * (3 - R * x1 * x1) / 2 */ \
__ASM_EMIT("vmul.f32 q0, q0, q4") /* q0 = R/sqrt(R) = sqrt(R) */ \
__ASM_EMIT("subs %[count], #1") \
__ASM_EMIT("vst1.32 {d0[0]}, [%[" DST "]]!") \
__ASM_EMIT("bge 7b") \
__ASM_EMIT("8:")

/*
 * In-place saturated square root: dst[i] = sqrt(max(dst[i], 0)).
 * Same operand used as DST and SRC with an empty INC: only the store
 * ("!") advances the pointer between loads.
 */
void ssqrt1(float *dst, size_t count)
{
ARCH_ARM_ASM(
SSQRT_CORE("dst", "dst", "")
: [dst] "+r" (dst), [count] "+r" (count) /* both are read and updated by the asm */
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
}

/*
 * Two-buffer saturated square root: dst[i] = sqrt(max(src[i], 0)).
 * INC is "!", so loads post-increment src; stores advance dst.
 */
void ssqrt1(float *dst, const float *src, size_t count)
{
ARCH_ARM_ASM(
SSQRT_CORE("dst", "src", "!")
: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
:
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
}

#undef SSQRT_CORE /* FIX: was "#undef SQR_CORE" (copy-paste from sqr.h) — the macro defined in this header leaked into every translation unit including it */

} /* namespace neon_d32 */
} /* namespace lsp */

#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_PMATH_SSQRT_H_ */
7 changes: 7 additions & 0 deletions src/main/arm/neon-d32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@
#include <private/dsp/arch/arm/neon-d32/pmath/op_kx.h>
#include <private/dsp/arch/arm/neon-d32/pmath/op_vv.h>
#include <private/dsp/arch/arm/neon-d32/pmath/pow.h>
#include <private/dsp/arch/arm/neon-d32/pmath/sqr.h>
#include <private/dsp/arch/arm/neon-d32/pmath/ssqrt.h>
#include <private/dsp/arch/arm/neon-d32/resampling.h>
#include <private/dsp/arch/arm/neon-d32/search/iminmax.h>
#include <private/dsp/arch/arm/neon-d32/search/minmax.h>
Expand Down Expand Up @@ -294,6 +296,11 @@
EXPORT1(powvx1);
EXPORT1(powvx2);

EXPORT1(sqr1);
EXPORT1(sqr2);
EXPORT1(ssqrt1);
EXPORT1(ssqrt2);

EXPORT1(h_sum);
EXPORT1(h_abs_sum);
EXPORT1(h_sqr_sum);
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/sqr1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", sqr1)
IF_ARCH_X86(CALL(generic::sqr1, avx::sqr1, 32));
IF_ARCH_X86(CALL(generic::sqr1, avx512::sqr1, 64));

// IF_ARCH_ARM(CALL(generic::sqr1, neon_d32::sqr1, 16));
//
IF_ARCH_ARM(CALL(generic::sqr1, neon_d32::sqr1, 16));

// IF_ARCH_AARCH64(CALL(generic::sqr1, asimd::sqr1, 16));
}
UTEST_END
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/sqr2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", sqr2)
IF_ARCH_X86(CALL(generic::sqr2, avx::sqr2, 32));
IF_ARCH_X86(CALL(generic::sqr2, avx512::sqr2, 64));

// IF_ARCH_ARM(CALL(generic::sqr2, neon_d32::sqr2, 16));
//
IF_ARCH_ARM(CALL(generic::sqr2, neon_d32::sqr2, 16));

// IF_ARCH_AARCH64(CALL(generic::sqr2, asimd::sqr2, 16));
}
UTEST_END
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/ssqrt1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", ssqrt1)
IF_ARCH_X86(CALL(generic::ssqrt1, avx::ssqrt1, 32));
IF_ARCH_X86(CALL(generic::ssqrt1, avx512::ssqrt1, 64));

// IF_ARCH_ARM(CALL(generic::ssqrt1, neon_d32::ssqrt1, 16));
//
IF_ARCH_ARM(CALL(generic::ssqrt1, neon_d32::ssqrt1, 16));

// IF_ARCH_AARCH64(CALL(generic::ssqrt1, asimd::ssqrt1, 16));
}
UTEST_END
Expand Down
4 changes: 2 additions & 2 deletions src/test/utest/pmath/ssqrt2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ UTEST_BEGIN("dsp.pmath", ssqrt2)
IF_ARCH_X86(CALL(generic::ssqrt2, avx::ssqrt2, 32));
IF_ARCH_X86(CALL(generic::ssqrt2, avx512::ssqrt2, 64));

// IF_ARCH_ARM(CALL(generic::ssqrt2, neon_d32::ssqrt2, 16));
//
IF_ARCH_ARM(CALL(generic::ssqrt2, neon_d32::ssqrt2, 16));

// IF_ARCH_AARCH64(CALL(generic::ssqrt2, asimd::ssqrt2, 16));
}
UTEST_END
Expand Down

0 comments on commit c5aace4

Please sign in to comment.