Skip to content

Commit

Permalink
Implemented abs_amax2, abs_amin2, abs_amax3 and abs_amin3 functions f…
Browse files Browse the repository at this point in the history
…or ARM ASIMD
  • Loading branch information
sadko4u committed Nov 2, 2024
1 parent b45c2db commit b538f69
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 13 deletions.
118 changes: 105 additions & 13 deletions include/private/dsp/arch/aarch64/asimd/pmath/abs_vv.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@ namespace lsp

#define OP_DSEL(a, b) a
#define OP_RSEL(a, b) b
#define OP_WITHOUT_ABS(x)
#define OP_WITH_ABS(x) x

/*
 * Shared assembly body for the two-operand kernels (the result overwrites DST).
 *   DST, SRC - names of the asm operands holding the destination and source pointers
 *   OP       - mnemonic of the vector instruction that combines the two operands
 *   SEL      - OP_DSEL or OP_RSEL: selects which pointer is loaded into v0.. / v16..
 *              and which register set receives the unconditional fabs
 *   ABS      - OP_WITH_ABS or OP_WITHOUT_ABS: controls whether the additional fabs
 *              on the other register set is emitted
 * ABS has to be a formal parameter: the body wraps instructions in ABS(...) and every
 * invocation passes five arguments, so a four-parameter definition does not preprocess.
 */
#define OP_ABS_VV2_CORE(DST, SRC, OP, SEL, ABS) \
__ASM_EMIT("subs %[count], %[count], #32") \
Expand All @@ -241,6 +243,14 @@ namespace lsp
__ASM_EMIT("fabs " SEL("v21.4s, v21.4s", "v5.4s, v5.4s")) \
__ASM_EMIT("fabs " SEL("v22.4s, v22.4s", "v6.4s, v6.4s")) \
__ASM_EMIT("fabs " SEL("v23.4s, v23.4s", "v7.4s, v7.4s")) \
ABS(__ASM_EMIT("fabs " SEL("v0.4s, v0.4s", "v16.4s, v16.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v1.4s, v1.4s", "v17.4s, v17.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v2.4s, v2.4s", "v18.4s, v18.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v3.4s, v3.4s", "v19.4s, v19.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v4.4s, v4.4s", "v20.4s, v20.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v5.4s, v5.4s", "v21.4s, v21.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v6.4s, v6.4s", "v22.4s, v22.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v7.4s, v7.4s", "v23.4s, v23.4s"))) \
__ASM_EMIT(OP " v0.4s, v0.4s, v16.4s") \
__ASM_EMIT(OP " v1.4s, v1.4s, v17.4s") \
__ASM_EMIT(OP " v2.4s, v2.4s, v18.4s") \
Expand Down Expand Up @@ -269,6 +279,10 @@ namespace lsp
__ASM_EMIT("fabs " SEL("v17.4s, v17.4s", "v1.4s, v1.4s")) \
__ASM_EMIT("fabs " SEL("v18.4s, v18.4s", "v2.4s, v2.4s")) \
__ASM_EMIT("fabs " SEL("v19.4s, v19.4s", "v3.4s, v3.4s")) \
ABS(__ASM_EMIT("fabs " SEL("v0.4s, v0.4s", "v16.4s, v16.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v1.4s, v1.4s", "v17.4s, v17.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v2.4s, v2.4s", "v18.4s, v18.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v3.4s, v3.4s", "v19.4s, v19.4s"))) \
__ASM_EMIT(OP " v0.4s, v0.4s, v16.4s") \
__ASM_EMIT(OP " v1.4s, v1.4s, v17.4s") \
__ASM_EMIT(OP " v2.4s, v2.4s, v18.4s") \
Expand All @@ -288,6 +302,8 @@ namespace lsp
__ASM_EMIT("ldp q18, q19, [%[" SEL(SRC, DST) "], #0x20]") \
__ASM_EMIT("fabs " SEL("v16.4s, v16.4s", "v0.4s, v0.4s")) \
__ASM_EMIT("fabs " SEL("v17.4s, v17.4s", "v1.4s, v1.4s")) \
ABS(__ASM_EMIT("fabs " SEL("v0.4s, v0.4s", "v16.4s, v16.4s"))) \
ABS(__ASM_EMIT("fabs " SEL("v1.4s, v1.4s", "v17.4s, v17.4s"))) \
__ASM_EMIT(OP " v0.4s, v0.4s, v16.4s") \
__ASM_EMIT(OP " v1.4s, v1.4s, v17.4s") \
__ASM_EMIT("stp q0, q1, [%[" DST "], #0x00]") \
Expand All @@ -301,6 +317,7 @@ namespace lsp
__ASM_EMIT("ldr q0, [%[" SEL(DST, SRC) "], #0x00]") \
__ASM_EMIT("ldr q16, [%[" SEL(SRC, DST) "], #0x00]") \
__ASM_EMIT("fabs " SEL("v16.4s, v16.4s", "v0.4s, v0.4s")) \
ABS(__ASM_EMIT("fabs " SEL("v0.4s, v0.4s", "v16.4s, v16.4s"))) \
__ASM_EMIT(OP " v0.4s, v0.4s, v16.4s") \
__ASM_EMIT("str q0, [%[" DST "], #0x00]") \
__ASM_EMIT("sub %[count], %[count], #4") \
Expand All @@ -314,6 +331,7 @@ namespace lsp
__ASM_EMIT("ld1r {v0.4s}, [%[" SEL(DST, SRC) "]]") \
__ASM_EMIT("ld1r {v16.4s}, [%[" SEL(SRC, DST) "]]") \
__ASM_EMIT("fabs " SEL("v16.4s, v16.4s", "v0.4s, v0.4s")) \
ABS(__ASM_EMIT("fabs " SEL("v0.4s, v0.4s", "v16.4s, v16.4s"))) \
__ASM_EMIT(OP " v0.4s, v0.4s, v16.4s") \
__ASM_EMIT("st1 {v0.s}[0], [%[" DST "]]") \
__ASM_EMIT("subs %[count], %[count], #1") \
Expand All @@ -326,7 +344,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fadd", OP_DSEL)
OP_ABS_VV2_CORE("dst", "src", "fadd", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
Expand All @@ -340,7 +358,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fsub", OP_DSEL)
OP_ABS_VV2_CORE("dst", "src", "fsub", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
Expand All @@ -354,7 +372,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fsub", OP_RSEL)
OP_ABS_VV2_CORE("dst", "src", "fsub", OP_RSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
Expand All @@ -368,7 +386,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fmul", OP_DSEL)
OP_ABS_VV2_CORE("dst", "src", "fmul", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
Expand All @@ -382,7 +400,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fmax", OP_DSEL)
OP_ABS_VV2_CORE("dst", "src", "fmax", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
Expand All @@ -396,7 +414,35 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fmin", OP_DSEL)
OP_ABS_VV2_CORE("dst", "src", "fmin", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
: "cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
}

/**
 * In-place maximum of absolute values.
 * Instantiates OP_ABS_VV2_CORE with the "fmax" instruction, OP_DSEL and OP_WITH_ABS,
 * so both the values loaded from dst and the values loaded from src go through fabs
 * before fmax combines them, and the result is stored back to dst:
 *   dst[i] = fmax(|dst[i]|, |src[i]|)
 *
 * @param dst   destination buffer, also used as the first operand
 * @param src   source buffer (second operand)
 * @param count number of float elements to process
 */
void abs_amax2(float *dst, const float *src, size_t count)
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fmax", OP_DSEL, OP_WITH_ABS)
/* All three arguments are bound as read-write operands: the assembly modifies */
/* the local copies of the pointers and the counter. */
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
/* Clobbers: condition flags, memory, and the vector registers named by the macro body */
: "cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
}

void abs_amin2(float *dst, const float *src, size_t count)
{
ARCH_AARCH64_ASM
(
OP_ABS_VV2_CORE("dst", "src", "fmin", OP_DSEL, OP_WITH_ABS)
: [dst] "+r" (dst), [src] "+r" (src),
[count] "+r" (count)
:
Expand Down Expand Up @@ -545,7 +591,7 @@ namespace lsp

#undef OP_ABS_DIV2_CORE

#define OP_ABS_VV3_CORE(DST, SRC1, SRC2, OP, SEL) \
#define OP_ABS_VV3_CORE(DST, SRC1, SRC2, OP, SEL, ABS) \
__ASM_EMIT("subs %[count], %[count], #32") \
__ASM_EMIT("blo 2f") \
/* 32x blocks */ \
Expand All @@ -566,6 +612,14 @@ namespace lsp
__ASM_EMIT("fabs v21.4s, v21.4s") \
__ASM_EMIT("fabs v22.4s, v22.4s") \
__ASM_EMIT("fabs v23.4s, v23.4s") \
ABS(__ASM_EMIT("fabs v0.4s, v0.4s")) \
ABS(__ASM_EMIT("fabs v1.4s, v1.4s")) \
ABS(__ASM_EMIT("fabs v2.4s, v2.4s")) \
ABS(__ASM_EMIT("fabs v3.4s, v3.4s")) \
ABS(__ASM_EMIT("fabs v4.4s, v4.4s")) \
ABS(__ASM_EMIT("fabs v5.4s, v5.4s")) \
ABS(__ASM_EMIT("fabs v6.4s, v6.4s")) \
ABS(__ASM_EMIT("fabs v7.4s, v7.4s")) \
__ASM_EMIT(OP " v0.4s, " SEL("v0.4s", "v16.4s") ", " SEL("v16.4s", "v0.4s") ) \
__ASM_EMIT(OP " v1.4s, " SEL("v1.4s", "v17.4s") ", " SEL("v17.4s", "v1.4s") ) \
__ASM_EMIT(OP " v2.4s, " SEL("v2.4s", "v18.4s") ", " SEL("v18.4s", "v2.4s") ) \
Expand Down Expand Up @@ -595,6 +649,10 @@ namespace lsp
__ASM_EMIT("fabs v17.4s, v17.4s") \
__ASM_EMIT("fabs v18.4s, v18.4s") \
__ASM_EMIT("fabs v19.4s, v19.4s") \
ABS(__ASM_EMIT("fabs v0.4s, v0.4s")) \
ABS(__ASM_EMIT("fabs v1.4s, v1.4s")) \
ABS(__ASM_EMIT("fabs v2.4s, v2.4s")) \
ABS(__ASM_EMIT("fabs v3.4s, v3.4s")) \
__ASM_EMIT(OP " v0.4s, " SEL("v0.4s", "v16.4s") ", " SEL("v16.4s", "v0.4s") ) \
__ASM_EMIT(OP " v1.4s, " SEL("v1.4s", "v17.4s") ", " SEL("v17.4s", "v1.4s") ) \
__ASM_EMIT(OP " v2.4s, " SEL("v2.4s", "v18.4s") ", " SEL("v18.4s", "v2.4s") ) \
Expand All @@ -613,6 +671,8 @@ namespace lsp
__ASM_EMIT("ldp q0, q1, [%[" SRC1 "], #0x00]") \
__ASM_EMIT("fabs v16.4s, v16.4s") \
__ASM_EMIT("fabs v17.4s, v17.4s") \
ABS(__ASM_EMIT("fabs v0.4s, v0.4s")) \
ABS(__ASM_EMIT("fabs v1.4s, v1.4s")) \
__ASM_EMIT(OP " v0.4s, " SEL("v0.4s", "v16.4s") ", " SEL("v16.4s", "v0.4s") ) \
__ASM_EMIT(OP " v1.4s, " SEL("v1.4s", "v17.4s") ", " SEL("v17.4s", "v1.4s") ) \
__ASM_EMIT("stp q0, q1, [%[" DST "], #0x00]") \
Expand All @@ -627,6 +687,7 @@ namespace lsp
__ASM_EMIT("ldr q16, [%[" SRC2 "], #0x00]") \
__ASM_EMIT("ldr q0, [%[" SRC1 "], #0x00]") \
__ASM_EMIT("fabs v16.4s, v16.4s") \
ABS(__ASM_EMIT("fabs v0.4s, v0.4s")) \
__ASM_EMIT(OP " v0.4s, " SEL("v0.4s", "v16.4s") ", " SEL("v16.4s", "v0.4s") ) \
__ASM_EMIT("str q0, [%[" DST "], #0x00]") \
__ASM_EMIT("sub %[count], %[count], #4") \
Expand All @@ -641,6 +702,7 @@ namespace lsp
__ASM_EMIT("ld1r {v16.4s}, [%[" SRC2 "]]") \
__ASM_EMIT("ld1r {v0.4s}, [%[" SRC1 "]]") \
__ASM_EMIT("fabs v16.4s, v16.4s") \
ABS(__ASM_EMIT("fabs v0.4s, v0.4s")) \
__ASM_EMIT(OP " v0.4s, " SEL("v0.4s", "v16.4s") ", " SEL("v16.4s", "v0.4s") ) \
__ASM_EMIT("st1 {v0.s}[0], [%[" DST "]]") \
__ASM_EMIT("subs %[count], %[count], #1") \
Expand All @@ -654,7 +716,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fadd", OP_DSEL)
OP_ABS_VV3_CORE("dst", "src1", "src2", "fadd", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
Expand All @@ -668,7 +730,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fsub", OP_DSEL)
OP_ABS_VV3_CORE("dst", "src1", "src2", "fsub", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
Expand All @@ -682,7 +744,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fsub", OP_RSEL)
OP_ABS_VV3_CORE("dst", "src1", "src2", "fsub", OP_RSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
Expand All @@ -696,7 +758,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmul", OP_DSEL)
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmul", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
Expand All @@ -710,7 +772,7 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmax", OP_DSEL)
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmax", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
Expand All @@ -724,7 +786,35 @@ namespace lsp
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmin", OP_DSEL)
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmin", OP_DSEL, OP_WITHOUT_ABS)
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
: "cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
}

/**
 * Maximum of absolute values of two sources.
 * Instantiates OP_ABS_VV3_CORE with the "fmax" instruction, OP_DSEL and OP_WITH_ABS,
 * so the values loaded from src1 and from src2 both go through fabs before fmax
 * combines them, and the result is stored to dst:
 *   dst[i] = fmax(|src1[i]|, |src2[i]|)
 *
 * @param dst   destination buffer
 * @param src1  first source buffer
 * @param src2  second source buffer
 * @param count number of float elements to process
 */
void abs_amax3(float *dst, const float *src1, const float *src2, size_t count)
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmax", OP_DSEL, OP_WITH_ABS)
/* All arguments are bound as read-write operands: the assembly modifies */
/* the local copies of the pointers and the counter. */
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
/* Clobbers: condition flags, memory, and the vector registers named by the macro body */
: "cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
}

void abs_amin3(float *dst, const float *src1, const float *src2, size_t count)
{
ARCH_AARCH64_ASM
(
OP_ABS_VV3_CORE("dst", "src1", "src2", "fmin", OP_DSEL, OP_WITH_ABS)
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
[count] "+r" (count)
:
Expand Down Expand Up @@ -879,6 +969,8 @@ namespace lsp

/* Drop the selector and conditional-emission helper macros so they do not leak */
/* out of this header into code that includes it. */
#undef OP_DSEL
#undef OP_RSEL
#undef OP_WITHOUT_ABS
#undef OP_WITH_ABS

} /* namespace asimd */
} /* namespace lsp */
Expand Down
4 changes: 4 additions & 0 deletions src/main/aarch64/asimd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@
EXPORT1(abs_rdiv2);
EXPORT1(abs_max2);
EXPORT1(abs_min2);
EXPORT1(abs_amax2);
EXPORT1(abs_amin2);

EXPORT1(abs_add3);
EXPORT1(abs_sub3);
Expand All @@ -275,6 +277,8 @@
EXPORT1(abs_rdiv3);
EXPORT1(abs_max3);
EXPORT1(abs_min3);
EXPORT1(abs_amax3);
EXPORT1(abs_amin3);

EXPORT1(h_sum);
EXPORT1(h_sqr_sum);
Expand Down

0 comments on commit b538f69

Please sign in to comment.