Skip to content

Commit

Permalink
Implemented AVX-optimized fill routine
Browse files Browse the repository at this point in the history
  • Loading branch information
sadko4u committed Oct 19, 2023
1 parent 7e6c96c commit bfa12bc
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 4 deletions.
124 changes: 120 additions & 4 deletions include/private/dsp/arch/x86/avx/copy.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2020 Vladimir Sadovnikov <[email protected]>
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
Expand Down Expand Up @@ -442,7 +442,123 @@ namespace lsp
"%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
}
}
}

/**
 * Fill a buffer of floats with a constant value (AVX implementation).
 *
 * The routine works in three phases:
 *   1. Head: single floats are stored one by one until the destination
 *      pointer becomes 32-byte aligned or the counter reaches zero.
 *   2. Body: aligned stores in blocks of 64, 32, 16, 8 and 4 floats.
 *   3. Tail: the remaining 0..3 floats are stored one by one.
 *
 * Note that the head loop advances the pointer by 4 bytes per step, so a
 * destination that is not 4-byte aligned never reaches the aligned phase
 * and is processed completely by the scalar head loop.
 *
 * @param dst destination buffer
 * @param value value to store into each element of the buffer
 * @param count number of floats to store, may be zero
 */
void fill(float *dst, float value, size_t count)
{
    ARCH_X86_ASM
    (
        /* Nothing to do for empty buffer */
        __ASM_EMIT("test %[count], %[count]")
        __ASM_EMIT("jz 2000f")
        /* Replicate the value across all lanes of ymm0 */
        __ASM_EMIT("vbroadcastss %[value], %%ymm0")

        /* Align destination to 32-byte boundary, one float per iteration */
        __ASM_EMIT("1:")
        __ASM_EMIT("test $0x01f, %[dst]")
        __ASM_EMIT("jz 2f")
        __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])")
        __ASM_EMIT("add $0x4, %[dst]")
        __ASM_EMIT("dec %[count]")
        __ASM_EMIT("jnz 1b")
        /* Counter exhausted before the pointer became aligned */
        __ASM_EMIT("jmp 2000f")
        __ASM_EMIT("2:")
        /* Replicate the pattern to ymm1-ymm3, used by all blocks below */
        __ASM_EMIT("vmovaps %%ymm0, %%ymm1")
        __ASM_EMIT("vmovaps %%ymm0, %%ymm2")
        __ASM_EMIT("vmovaps %%ymm1, %%ymm3")

        /* Destination is aligned */
        /* x64 blocks */
        __ASM_EMIT("sub $0x40, %[count]")
        __ASM_EMIT("jb 4f")
        /* ymm4-ymm7 are needed by the x64 loop only */
        __ASM_EMIT("vmovaps %%ymm0, %%ymm4")
        __ASM_EMIT("vmovaps %%ymm1, %%ymm5")
        __ASM_EMIT("vmovaps %%ymm2, %%ymm6")
        __ASM_EMIT("vmovaps %%ymm3, %%ymm7")
        __ASM_EMIT("3:")
        __ASM_EMIT("vmovaps %%ymm0, 0x00(%[dst])")
        __ASM_EMIT("vmovaps %%ymm1, 0x20(%[dst])")
        __ASM_EMIT("vmovaps %%ymm2, 0x40(%[dst])")
        __ASM_EMIT("vmovaps %%ymm3, 0x60(%[dst])")
        __ASM_EMIT("vmovaps %%ymm4, 0x80(%[dst])")
        __ASM_EMIT("vmovaps %%ymm5, 0xa0(%[dst])")
        __ASM_EMIT("vmovaps %%ymm6, 0xc0(%[dst])")
        __ASM_EMIT("vmovaps %%ymm7, 0xe0(%[dst])")
        __ASM_EMIT("add $0x100, %[dst]")
        __ASM_EMIT("sub $0x40, %[count]")
        __ASM_EMIT("jae 3b")
        __ASM_EMIT("4:")
        /* x32 block: the counter is biased by -64 here, compensate step by step */
        __ASM_EMIT("add $0x20, %[count]")
        __ASM_EMIT("jl 6f")
        __ASM_EMIT("vmovaps %%ymm0, 0x00(%[dst])")
        __ASM_EMIT("vmovaps %%ymm1, 0x20(%[dst])")
        __ASM_EMIT("vmovaps %%ymm2, 0x40(%[dst])")
        __ASM_EMIT("vmovaps %%ymm3, 0x60(%[dst])")
        __ASM_EMIT("sub $0x20, %[count]")
        __ASM_EMIT("add $0x80, %[dst]")
        __ASM_EMIT("6:")
        /* x16 block: four 128-bit stores */
        __ASM_EMIT("add $0x10, %[count]")
        __ASM_EMIT("jl 8f")
        __ASM_EMIT("vmovaps %%xmm0, 0x00(%[dst])")
        __ASM_EMIT("vmovaps %%xmm1, 0x10(%[dst])")
        __ASM_EMIT("vmovaps %%xmm2, 0x20(%[dst])")
        __ASM_EMIT("vmovaps %%xmm3, 0x30(%[dst])")
        __ASM_EMIT("sub $0x10, %[count]")
        __ASM_EMIT("add $0x40, %[dst]")
        __ASM_EMIT("8:")
        /* x8 block: two 128-bit stores */
        __ASM_EMIT("add $0x8, %[count]")
        __ASM_EMIT("jl 10f")
        __ASM_EMIT("vmovaps %%xmm0, 0x00(%[dst])")
        __ASM_EMIT("vmovaps %%xmm1, 0x10(%[dst])")
        __ASM_EMIT("sub $0x8, %[count]")
        __ASM_EMIT("add $0x20, %[dst]")
        __ASM_EMIT("10:")
        /* x4 block: one 128-bit store */
        __ASM_EMIT("add $0x4, %[count]")
        __ASM_EMIT("jl 12f")
        __ASM_EMIT("vmovaps %%xmm0, 0x00(%[dst])")
        __ASM_EMIT("sub $0x4, %[count]")
        __ASM_EMIT("add $0x10, %[dst]")
        __ASM_EMIT("12:")
        /* x1 block: the counter becomes (remaining - 1), loop while it is non-negative */
        __ASM_EMIT("add $0x3, %[count]")
        __ASM_EMIT("jl 14f")
        __ASM_EMIT("13:")
        __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])")
        __ASM_EMIT("add $0x04, %[dst]")
        __ASM_EMIT("dec %[count]")
        __ASM_EMIT("jge 13b")
        __ASM_EMIT("14:")
        /* end */
        __ASM_EMIT("2000:")

        : [dst] "+r"(dst), [count] "+r" (count)
        : [value] "o" (value)
        /* Every vector register written above must be listed exactly once, */
        /* including xmm0 which receives the broadcasted value */
        : "cc", "memory",
          "%xmm0", "%xmm1", "%xmm2", "%xmm3",
          "%xmm4", "%xmm5", "%xmm6", "%xmm7"
    );

    /* NOTE(review): FILL_CORE is not defined inside this function; the #undef is kept */
    /* unchanged in case the macro is defined earlier in the file - confirm and remove if unused */
    #undef FILL_CORE
}

/**
 * Fill a buffer of floats with zeros.
 * Delegates to fill() with the constant 0.0f.
 *
 * @param dst destination buffer
 * @param count number of floats to store
 */
void fill_zero(float *dst, size_t count)
{
    const float zero = 0.0f;
    fill(dst, zero, count);
}

/**
 * Fill a buffer of floats with ones.
 * Delegates to fill() with the constant 1.0f.
 *
 * @param dst destination buffer
 * @param count number of floats to store
 */
void fill_one(float *dst, size_t count)
{
    const float one = 1.0f;
    fill(dst, one, count);
}

/**
 * Fill a buffer of floats with minus ones.
 * Delegates to fill() with the constant -1.0f.
 *
 * @param dst destination buffer
 * @param count number of floats to store
 */
void fill_minus_one(float *dst, size_t count)
{
    const float minus_one = -1.0f;
    fill(dst, minus_one, count);
}

} /* namespace avx */
} /* namespace lsp */

#endif /* PRIVATE_DSP_ARCH_X86_AVX_COPY_H_ */
4 changes: 4 additions & 0 deletions src/main/x86/avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@

CEXPORT1(favx, copy);
CEXPORT1(favx, move);
CEXPORT1(favx, fill);
CEXPORT1(favx, fill_zero);
CEXPORT1(favx, fill_one);
CEXPORT1(favx, fill_minus_one);

CEXPORT1(favx, limit1);
CEXPORT1(favx, limit2);
Expand Down
12 changes: 12 additions & 0 deletions src/test/ptest/copy/fill.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ namespace lsp
void fill_zero(float *dst, size_t count);
void fill_minus_one(float *dst, size_t count);
}

/* Forward declarations of the AVX fill routines measured by this performance test */
namespace avx
{
    /* Generic fill with an arbitrary value */
    void fill(float *dst, float value, size_t count);

    /* Specializations for the fixed values 0.0f, 1.0f and -1.0f */
    void fill_zero(float *dst, size_t count);
    void fill_one(float *dst, size_t count);
    void fill_minus_one(float *dst, size_t count);
}
)

IF_ARCH_ARM(
Expand Down Expand Up @@ -133,6 +141,7 @@ PTEST_BEGIN("dsp.copy", fill, 5, 5000)

CALL(generic::fill);
IF_ARCH_X86(CALL(sse::fill));
IF_ARCH_X86(CALL(avx::fill));
IF_ARCH_ARM(CALL(neon_d32::fill));
IF_ARCH_AARCH64(CALL(asimd::fill));
PTEST_SEPARATOR;
Expand All @@ -141,18 +150,21 @@ PTEST_BEGIN("dsp.copy", fill, 5, 5000)
CALL(fill_zero_memset);
CALL(fill_zero_bzero);
IF_ARCH_X86(CALL(sse::fill_zero));
IF_ARCH_X86(CALL(avx::fill_zero));
IF_ARCH_ARM(CALL(neon_d32::fill_zero));
IF_ARCH_AARCH64(CALL(asimd::fill_zero));
PTEST_SEPARATOR;

CALL(generic::fill_one);
IF_ARCH_X86(CALL(sse::fill_one));
IF_ARCH_X86(CALL(avx::fill_one));
IF_ARCH_ARM(CALL(neon_d32::fill_one));
IF_ARCH_AARCH64(CALL(asimd::fill_one));
PTEST_SEPARATOR;

CALL(generic::fill_minus_one);
IF_ARCH_X86(CALL(sse::fill_minus_one));
IF_ARCH_X86(CALL(avx::fill_minus_one));
IF_ARCH_ARM(CALL(neon_d32::fill_minus_one));
IF_ARCH_AARCH64(CALL(asimd::fill_minus_one));
PTEST_SEPARATOR2;
Expand Down
13 changes: 13 additions & 0 deletions src/test/utest/copy/fill.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ namespace lsp
void fill_zero(float *dst, size_t count);
void fill_minus_one(float *dst, size_t count);
}

/* Forward declarations of the AVX fill routines verified by this unit test */
namespace avx
{
    /* Generic fill with an arbitrary value */
    void fill(float *dst, float value, size_t count);

    /* Specializations for the fixed values 0.0f, 1.0f and -1.0f */
    void fill_zero(float *dst, size_t count);
    void fill_one(float *dst, size_t count);
    void fill_minus_one(float *dst, size_t count);
}
)

IF_ARCH_ARM(
Expand Down Expand Up @@ -141,6 +149,11 @@ UTEST_BEGIN("dsp.copy", fill)
IF_ARCH_X86(CALL(generic::fill_zero, sse::fill_zero, 16));
IF_ARCH_X86(CALL(generic::fill_minus_one, sse::fill_minus_one, 16));

IF_ARCH_X86(CALL(generic::fill, avx::fill, 32));
IF_ARCH_X86(CALL(generic::fill_one, avx::fill_one, 32));
IF_ARCH_X86(CALL(generic::fill_zero, avx::fill_zero, 32));
IF_ARCH_X86(CALL(generic::fill_minus_one, avx::fill_minus_one, 32));

IF_ARCH_ARM(CALL(generic::fill, neon_d32::fill, 16));
IF_ARCH_ARM(CALL(generic::fill_one, neon_d32::fill_one, 16));
IF_ARCH_ARM(CALL(generic::fill_zero, neon_d32::fill_zero, 16));
Expand Down

0 comments on commit bfa12bc

Please sign in to comment.