Merge branch 'avx512' into devel
sadko4u committed Oct 24, 2023
2 parents 443c960 + abb51f5 commit 5aabf12
Showing 7 changed files with 232 additions and 1 deletion.
4 changes: 3 additions & 1 deletion CHANGELOG
@@ -3,7 +3,9 @@
*******************************************************************************

=== 1.0.19 ===
-* AVX2-optimization of search functions for maximum and minimum.
+* AVX2 optimization of search functions for maximum and minimum.
+* Implemented SIMD-optimized gate functions.
+* AVX512 optimization of packed complex functions.

=== 1.0.18 ===
* Fixed compilation regression for 32-bit Clang compiler.
203 changes: 203 additions & 0 deletions include/private/dsp/arch/x86/avx512/pcomplex.h
@@ -1123,6 +1123,209 @@ namespace lsp
);
}

void pcomplex_r2c(float *dst, const float *src, size_t count)
{
IF_ARCH_X86(size_t off);
ARCH_X86_ASM
(
__ASM_EMIT("xor %[off], %[off]")
/* x64 blocks */
__ASM_EMIT("sub $64, %[count]")
__ASM_EMIT("jb 2f")
__ASM_EMIT("kmovw %[CC], %%k4") /* k4 = 0x5555 */
__ASM_EMIT("1:")
__ASM_EMIT("vexpandps 0x00(%[src], %[off]), %%zmm0 %{%%k4%}%{z%}")/* zmm0 = s0 0 s1 0 s2 0 s3 0 s4 0 s5 0 s6 0 s7 0 */
__ASM_EMIT("vexpandps 0x20(%[src], %[off]), %%zmm1 %{%%k4%}%{z%}")
__ASM_EMIT("vexpandps 0x40(%[src], %[off]), %%zmm2 %{%%k4%}%{z%}")
__ASM_EMIT("vexpandps 0x60(%[src], %[off]), %%zmm3 %{%%k4%}%{z%}")
__ASM_EMIT("vexpandps 0x80(%[src], %[off]), %%zmm4 %{%%k4%}%{z%}")
__ASM_EMIT("vexpandps 0xa0(%[src], %[off]), %%zmm5 %{%%k4%}%{z%}")
__ASM_EMIT("vexpandps 0xc0(%[src], %[off]), %%zmm6 %{%%k4%}%{z%}")
__ASM_EMIT("vexpandps 0xe0(%[src], %[off]), %%zmm7 %{%%k4%}%{z%}")
__ASM_EMIT("vmovups %%zmm0, 0x000(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%zmm1, 0x040(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%zmm2, 0x080(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%zmm3, 0x0c0(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%zmm4, 0x100(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%zmm5, 0x140(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%zmm6, 0x180(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%zmm7, 0x1c0(%[dst], %[off], 2)")
__ASM_EMIT("add $0x100, %[off]")
__ASM_EMIT("sub $64, %[count]")
__ASM_EMIT("jae 1b")
__ASM_EMIT("2:")
/* x16 blocks */
__ASM_EMIT("add $48, %[count]")
__ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 */
__ASM_EMIT("vxorps %%ymm7, %%ymm7, %%ymm7") /* ymm7 = 0 */
__ASM_EMIT("jl 4f") \
__ASM_EMIT("3:") \
__ASM_EMIT("vmovups 0x00(%[src], %[off]), %%ymm0") /* ymm0 = s0 s1 s2 s3 s4 s5 s6 s7 */
__ASM_EMIT("vmovups 0x20(%[src], %[off]), %%ymm2")
__ASM_EMIT("vunpckhps %%ymm6, %%ymm0, %%ymm1") /* ymm1 = s2 0 s3 0 s6 0 s7 0 */
__ASM_EMIT("vunpckhps %%ymm6, %%ymm2, %%ymm3")
__ASM_EMIT("vunpcklps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = s0 0 s1 0 s4 0 s5 0 */
__ASM_EMIT("vunpcklps %%ymm6, %%ymm2, %%ymm2")
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") /* xmm4 = s4 0 s5 0 */
__ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5")
__ASM_EMIT("vinsertf128 $1, %%xmm1, %%ymm0, %%ymm0") /* ymm0 = s0 0 s1 0 s2 0 s3 0 */
__ASM_EMIT("vinsertf128 $1, %%xmm3, %%ymm2, %%ymm2")
__ASM_EMIT("vinsertf128 $0, %%xmm4, %%ymm1, %%ymm1") /* ymm1 = s4 0 s5 0 s6 0 s7 0 */
__ASM_EMIT("vinsertf128 $0, %%xmm5, %%ymm3, %%ymm3")
__ASM_EMIT("vmovups %%ymm0, 0x00(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%ymm1, 0x20(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%ymm2, 0x40(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%ymm3, 0x60(%[dst], %[off], 2)")
__ASM_EMIT("add $0x40, %[off]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("jge 3b")
__ASM_EMIT("4:")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("jl 6f")
__ASM_EMIT("vmovups 0x00(%[src], %[off]), %%xmm0") /* xmm0 = s0 s1 s2 s3 */
__ASM_EMIT("vmovups 0x10(%[src], %[off]), %%xmm2")
__ASM_EMIT("vunpckhps %%xmm6, %%xmm0, %%xmm1") /* xmm1 = s2 0 s3 0 */
__ASM_EMIT("vunpckhps %%xmm6, %%xmm2, %%xmm3")
__ASM_EMIT("vunpcklps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = s0 0 s1 0 */
__ASM_EMIT("vunpcklps %%xmm6, %%xmm2, %%xmm2")
__ASM_EMIT("vmovups %%xmm0, 0x00(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%xmm1, 0x10(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%xmm2, 0x20(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%xmm3, 0x30(%[dst], %[off], 2)")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("add $0x20, %[off]")
__ASM_EMIT("6:")
/* x4 blocks */
__ASM_EMIT("add $4, %[count]")
__ASM_EMIT("jl 8f")
__ASM_EMIT("vmovups 0x00(%[src], %[off]), %%xmm0") /* xmm0 = s0 s1 s2 s3 */
__ASM_EMIT("vunpckhps %%xmm6, %%xmm0, %%xmm1") /* xmm1 = s2 0 s3 0 */
__ASM_EMIT("vunpcklps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = s0 0 s1 0 */
__ASM_EMIT("vmovups %%xmm0, 0x00(%[dst], %[off], 2)")
__ASM_EMIT("vmovups %%xmm1, 0x10(%[dst], %[off], 2)")
__ASM_EMIT("sub $4, %[count]")
__ASM_EMIT("add $0x10, %[off]")
__ASM_EMIT("8:")
/* x1 blocks */
__ASM_EMIT("add $3, %[count]")
__ASM_EMIT("jl 10f")
__ASM_EMIT("9:")
__ASM_EMIT("vmovss 0x00(%[src], %[off]), %%xmm0") /* xmm0 = s0 */
__ASM_EMIT("vmovlps %%xmm0, 0x00(%[dst], %[off], 2)")
__ASM_EMIT("add $0x04, %[off]")
__ASM_EMIT("dec %[count]")
__ASM_EMIT("jge 9b")
__ASM_EMIT("10:")

: [count] "+r" (count), [off] "=&r" (off)
: [dst] "r" (dst), [src] "r" (src),
[CC] "o" (pcomplex_r2c_expand)
: "cc", "memory",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%k4"
);
}
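
            /* For readers less fluent in AT&T inline assembly: the core trick above is the
               masked expand-load. With k4 = 0x5555, vexpandps reads eight consecutive reals
               and scatters them into the even lanes of a zmm register, zeroing the odd
               (imaginary) lanes. A minimal intrinsics sketch of the same idea is shown below;
               it is illustrative only and not part of this commit (pcomplex_r2c_sketch and
               its loop structure are assumptions, not the committed code).

               #include <immintrin.h>
               #include <cstddef>

               // Sketch only: one zmm per step, 8 reals -> 8 (re, 0) pairs.
               void pcomplex_r2c_sketch(float *dst, const float *src, size_t count)
               {
                   const __mmask16 even = 0x5555;              // even lanes = real parts
                   size_t i = 0;
                   for (; i + 8 <= count; i += 8)
                   {
                       // vexpandps {k}{z}: load 8 floats, place in even lanes, zero the rest
                       __m512 v = _mm512_maskz_expandloadu_ps(even, &src[i]);
                       _mm512_storeu_ps(&dst[i * 2], v);       // 8 complex pairs = 16 floats
                   }
                   for (; i < count; ++i)                      // scalar tail
                   {
                       dst[i * 2]     = src[i];
                       dst[i * 2 + 1] = 0.0f;
                   }
               }

               Note also the counter-bias trick in the assembly: after the initial "sub $64",
               %[count] is negative by the shortfall, and "add $48" re-biases it so the
               16-wide loop can be driven by a single "sub $16" / "jge" pair. */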

void pcomplex_c2r(float *dst, const float *src, size_t count)
{
IF_ARCH_X86(size_t off);
ARCH_X86_ASM
(
__ASM_EMIT("xor %[off], %[off]")
/* x64 blocks */
__ASM_EMIT("sub $64, %[count]")
__ASM_EMIT("kmovw %[CC], %%k4") /* k4 = 0x5555 */
__ASM_EMIT("jb 2f")
__ASM_EMIT("1:")
__ASM_EMIT("vmovups 0x000(%[src], %[off], 2), %%zmm0") /* zmm0 = s0 i0 s1 i0 s2 i2 s3 i3 s4 i4 s5 i5 s6 i6 s7 i7 */
__ASM_EMIT("vmovups 0x040(%[src], %[off], 2), %%zmm1")
__ASM_EMIT("vmovups 0x080(%[src], %[off], 2), %%zmm2")
__ASM_EMIT("vmovups 0x0c0(%[src], %[off], 2), %%zmm3")
__ASM_EMIT("vmovups 0x100(%[src], %[off], 2), %%zmm4")
__ASM_EMIT("vmovups 0x140(%[src], %[off], 2), %%zmm5")
__ASM_EMIT("vmovups 0x180(%[src], %[off], 2), %%zmm6")
__ASM_EMIT("vmovups 0x1c0(%[src], %[off], 2), %%zmm7")
__ASM_EMIT("vcompressps %%zmm0, %%zmm0 %{%%k4%}%{z%}") /* zmm0 = s0 s1 s2 s3 s4 s5 s6 s7 */
__ASM_EMIT("vcompressps %%zmm1, %%zmm1 %{%%k4%}%{z%}")
__ASM_EMIT("vcompressps %%zmm2, %%zmm2 %{%%k4%}%{z%}")
__ASM_EMIT("vcompressps %%zmm3, %%zmm3 %{%%k4%}%{z%}")
__ASM_EMIT("vcompressps %%zmm4, %%zmm4 %{%%k4%}%{z%}")
__ASM_EMIT("vcompressps %%zmm5, %%zmm5 %{%%k4%}%{z%}")
__ASM_EMIT("vcompressps %%zmm6, %%zmm6 %{%%k4%}%{z%}")
__ASM_EMIT("vcompressps %%zmm7, %%zmm7 %{%%k4%}%{z%}")
__ASM_EMIT("vmovups %%ymm0, 0x000(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm1, 0x020(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm2, 0x040(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm3, 0x060(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm4, 0x080(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm5, 0x0a0(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm6, 0x0c0(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm7, 0x0e0(%[dst], %[off])")
__ASM_EMIT("add $0x100, %[off]")
__ASM_EMIT("sub $64, %[count]")
__ASM_EMIT("jae 1b")
__ASM_EMIT("2:")
/* x16 blocks */
__ASM_EMIT("add $48, %[count]")
__ASM_EMIT("jl 4f") \
__ASM_EMIT("3:") \
__ASM_EMIT("vmovups 0x000(%[src], %[off], 2), %%zmm0") /* zmm0 = s0 i0 s1 i1 s2 i2 s3 i3 s4 i4 s5 i5 s6 i6 s7 i7 */
__ASM_EMIT("vmovups 0x040(%[src], %[off], 2), %%zmm1")
__ASM_EMIT("vcompressps %%zmm0, %%zmm0 %{%%k4%}%{z%}") /* zmm0 = s0 s1 s2 s3 s4 s5 s6 s7 */
__ASM_EMIT("vcompressps %%zmm1, %%zmm1 %{%%k4%}%{z%}")
__ASM_EMIT("vmovups %%ymm0, 0x000(%[dst], %[off])")
__ASM_EMIT("vmovups %%ymm1, 0x020(%[dst], %[off])")
__ASM_EMIT("add $0x40, %[off]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("jge 3b")
__ASM_EMIT("4:")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("jl 6f")
__ASM_EMIT("vmovups 0x00(%[src], %[off], 2), %%xmm0") /* xmm0 = s0 i0 s1 i1 */
__ASM_EMIT("vmovups 0x10(%[src], %[off], 2), %%xmm1") /* xmm1 = s2 i2 s3 i3 */
__ASM_EMIT("vmovups 0x20(%[src], %[off], 2), %%xmm2")
__ASM_EMIT("vmovups 0x30(%[src], %[off], 2), %%xmm3")
__ASM_EMIT("vshufps $0x88, %%xmm1, %%xmm0, %%xmm0") /* xmm0 = s0 s1 s2 s3 */
__ASM_EMIT("vshufps $0x88, %%xmm3, %%xmm2, %%xmm2")
__ASM_EMIT("vmovups %%xmm0, 0x00(%[dst], %[off])")
__ASM_EMIT("vmovups %%xmm2, 0x10(%[dst], %[off])")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("add $0x20, %[off]")
__ASM_EMIT("6:")
/* x4 blocks */
__ASM_EMIT("add $4, %[count]")
__ASM_EMIT("jl 8f")
__ASM_EMIT("vmovups 0x00(%[src], %[off], 2), %%xmm0") /* xmm0 = s0 i0 s1 i1 */
__ASM_EMIT("vmovups 0x10(%[src], %[off], 2), %%xmm1") /* xmm1 = s2 i2 s3 i3 */
__ASM_EMIT("vshufps $0x88, %%xmm1, %%xmm0, %%xmm0") /* xmm0 = s0 s1 s2 s3 */
__ASM_EMIT("vmovups %%xmm0, 0x00(%[dst], %[off])")
__ASM_EMIT("sub $4, %[count]")
__ASM_EMIT("add $0x10, %[off]")
__ASM_EMIT("8:")
/* x1 blocks */
__ASM_EMIT("add $3, %[count]")
__ASM_EMIT("jl 10f")
__ASM_EMIT("9:")
__ASM_EMIT("vmovss 0x00(%[src], %[off], 2), %%xmm0") /* xmm0 = s0 */
__ASM_EMIT("vmovss %%xmm0, 0x00(%[dst], %[off])")
__ASM_EMIT("add $0x04, %[off]")
__ASM_EMIT("dec %[count]")
__ASM_EMIT("jge 9b")
__ASM_EMIT("10:")

: [count] "+r" (count), [off] "=&r" (off)
: [dst] "r" (dst), [src] "r" (src),
[CC] "o" (pcomplex_r2c_expand)
: "cc", "memory",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%k4"
);
}
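
            /* The inverse direction uses the complementary instruction: vcompressps with the
               same 0x5555 mask packs the even (real) lanes contiguously and discards the
               imaginary lanes. A hedged intrinsics sketch follows; pcomplex_c2r_sketch is
               illustrative, not the committed code.

               #include <immintrin.h>
               #include <cstddef>

               // Sketch only: 8 (re, im) pairs per step -> 8 reals.
               void pcomplex_c2r_sketch(float *dst, const float *src, size_t count)
               {
                   const __mmask16 even = 0x5555;              // real parts sit in even lanes
                   size_t i = 0;
                   for (; i + 8 <= count; i += 8)
                   {
                       __m512 v = _mm512_loadu_ps(&src[i * 2]);    // load 8 complex pairs
                       // vcompressps store form: pack the masked lanes and write them out
                       _mm512_mask_compressstoreu_ps(&dst[i], even, v);
                   }
                   for (; i < count; ++i)                      // scalar tail
                       dst[i] = src[i * 2];
               } */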

} /* namespace avx512 */
} /* namespace lsp */

2 changes: 2 additions & 0 deletions src/main/x86/avx512.cpp
@@ -99,11 +99,13 @@
CEXPORT1(vl, pcomplex_div2);
CEXPORT1(vl, pcomplex_rdiv2);
CEXPORT1(vl, pcomplex_div3);
CEXPORT1(vl, pcomplex_r2c);
CEXPORT1(vl, pcomplex_r2c_add2);
CEXPORT1(vl, pcomplex_r2c_rsub2);
CEXPORT1(vl, pcomplex_r2c_sub2);
CEXPORT1(vl, pcomplex_r2c_mul2);
CEXPORT1(vl, pcomplex_r2c_div2);
CEXPORT1(vl, pcomplex_c2r);

CEXPORT1(vl, lr_to_ms);
CEXPORT1(vl, lr_to_mid);
6 changes: 6 additions & 0 deletions src/test/ptest/pcomplex/c2r.cpp
@@ -40,6 +40,11 @@ namespace lsp
{
void pcomplex_c2r(float *dst, const float *src, size_t count);
}

namespace avx512
{
void pcomplex_c2r(float *dst, const float *src, size_t count);
}
)

IF_ARCH_ARM(
@@ -98,6 +103,7 @@ PTEST_BEGIN("dsp.pcomplex", c2r, 5, 1000)

CALL(generic::pcomplex_c2r);
IF_ARCH_X86(CALL(sse::pcomplex_c2r));
IF_ARCH_X86(CALL(avx512::pcomplex_c2r));
IF_ARCH_ARM(CALL(neon_d32::pcomplex_c2r));
IF_ARCH_AARCH64(CALL(asimd::pcomplex_c2r));

6 changes: 6 additions & 0 deletions src/test/ptest/pcomplex/r2c.cpp
@@ -45,6 +45,11 @@ namespace lsp
{
void pcomplex_r2c(float *dst, const float *src, size_t count);
}

namespace avx512
{
void pcomplex_r2c(float *dst, const float *src, size_t count);
}
)

IF_ARCH_ARM(
@@ -104,6 +109,7 @@ PTEST_BEGIN("dsp.pcomplex", r2c, 5, 1000)
CALL(generic::pcomplex_r2c);
IF_ARCH_X86(CALL(sse::pcomplex_r2c));
IF_ARCH_X86(CALL(avx::pcomplex_r2c));
IF_ARCH_X86(CALL(avx512::pcomplex_r2c));
IF_ARCH_ARM(CALL(neon_d32::pcomplex_r2c));
IF_ARCH_AARCH64(CALL(asimd::pcomplex_r2c));

6 changes: 6 additions & 0 deletions src/test/utest/pcomplex/c2r.cpp
@@ -35,6 +35,11 @@ namespace lsp
{
void pcomplex_c2r(float *dst, const float *src, size_t count);
}

namespace avx512
{
void pcomplex_c2r(float *dst, const float *src, size_t count);
}
)

IF_ARCH_ARM(
@@ -100,6 +105,7 @@ UTEST_BEGIN("dsp.pcomplex", c2r)
call(#func, align, generic::pcomplex_c2r, func)

IF_ARCH_X86(CALL(sse::pcomplex_c2r, 16));
IF_ARCH_X86(CALL(avx512::pcomplex_c2r, 64));

IF_ARCH_ARM(CALL(neon_d32::pcomplex_c2r, 16));

6 changes: 6 additions & 0 deletions src/test/utest/pcomplex/r2c.cpp
@@ -40,6 +40,11 @@ namespace lsp
{
void pcomplex_r2c(float *dst, const float *src, size_t count);
}

namespace avx512
{
void pcomplex_r2c(float *dst, const float *src, size_t count);
}
)

IF_ARCH_ARM(
@@ -106,6 +111,7 @@ UTEST_BEGIN("dsp.pcomplex", r2c)

IF_ARCH_X86(CALL(sse::pcomplex_r2c, 16));
IF_ARCH_X86(CALL(avx::pcomplex_r2c, 32));
IF_ARCH_X86(CALL(avx512::pcomplex_r2c, 64));

IF_ARCH_ARM(CALL(neon_d32::pcomplex_r2c, 16));

