From a8d84a14eed8de722dd1a2229b503d2ee202b327 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Thu, 19 Oct 2023 00:39:51 +0300 Subject: [PATCH 1/4] Implemented generic gate functions --- include/lsp-plug.in/dsp/common/dynamics.h | 1 + .../lsp-plug.in/dsp/common/dynamics/gate.h | 33 ++++ .../lsp-plug.in/dsp/common/dynamics/types.h | 28 ++- include/private/dsp/arch/generic/dynamics.h | 1 + .../private/dsp/arch/generic/dynamics/gate.h | 73 ++++++++ src/main/generic/generic.cpp | 2 + src/test/ptest/dynamics/gate_x1_curve.cpp | 142 ++++++++++++++++ src/test/ptest/dynamics/gate_x1_gain2.cpp | 142 ++++++++++++++++ src/test/utest/dynamics/gate_x1_curve.cpp | 160 ++++++++++++++++++ src/test/utest/dynamics/gate_x1_gain.cpp | 160 ++++++++++++++++++ 10 files changed, 740 insertions(+), 2 deletions(-) create mode 100644 include/lsp-plug.in/dsp/common/dynamics/gate.h create mode 100644 include/private/dsp/arch/generic/dynamics/gate.h create mode 100644 src/test/ptest/dynamics/gate_x1_curve.cpp create mode 100644 src/test/ptest/dynamics/gate_x1_gain2.cpp create mode 100644 src/test/utest/dynamics/gate_x1_curve.cpp create mode 100644 src/test/utest/dynamics/gate_x1_gain.cpp diff --git a/include/lsp-plug.in/dsp/common/dynamics.h b/include/lsp-plug.in/dsp/common/dynamics.h index 5887ba50..2b13667b 100644 --- a/include/lsp-plug.in/dsp/common/dynamics.h +++ b/include/lsp-plug.in/dsp/common/dynamics.h @@ -26,6 +26,7 @@ #include #include +#include #endif /* LSP_PLUG_IN_DSP_COMMON_DYNAMICS_H_ */ diff --git a/include/lsp-plug.in/dsp/common/dynamics/gate.h b/include/lsp-plug.in/dsp/common/dynamics/gate.h new file mode 100644 index 00000000..e95cb7de --- /dev/null +++ b/include/lsp-plug.in/dsp/common/dynamics/gate.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2023 Linux Studio Plugins Project + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. 
+ * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#ifndef LSP_PLUG_IN_DSP_COMMON_DYNAMICS_GATE_H_ +#define LSP_PLUG_IN_DSP_COMMON_DYNAMICS_GATE_H_ + +#include +#include + +LSP_DSP_LIB_SYMBOL(void, gate_x1_gain, float *dst, const float *src, const LSP_DSP_LIB_TYPE(gate_knee_t) *c, size_t count); + +LSP_DSP_LIB_SYMBOL(void, gate_x1_curve, float *dst, const float *src, const LSP_DSP_LIB_TYPE(gate_knee_t) *c, size_t count); + + +#endif /* LSP_PLUG_IN_DSP_COMMON_DYNAMICS_GATE_H_ */ diff --git a/include/lsp-plug.in/dsp/common/dynamics/types.h b/include/lsp-plug.in/dsp/common/dynamics/types.h index cc2038ee..e9e02b2d 100644 --- a/include/lsp-plug.in/dsp/common/dynamics/types.h +++ b/include/lsp-plug.in/dsp/common/dynamics/types.h @@ -29,7 +29,7 @@ LSP_DSP_LIB_BEGIN_NAMESPACE #pragma pack(push, 1) /** - * Compressor knee is a curve that constists of three parts: + * Compressor knee is a curve that consists of three parts: * 1. Part with constant gain amplification in the range [-inf .. start] dB * 2. Soft compression knee in the range (start .. end) dB present by the quadratic function (2nd-order polynom) * 3. Gain reduction part in the range [end .. +inf] dB present by the linear function (1st-order polynom) @@ -40,7 +40,7 @@ LSP_DSP_LIB_BEGIN_NAMESPACE * 3. Compute the natural logarithm of the x: lx = logf(x). * 4. 
If x < end then compute the gain using the 2nd-order polynom: gain = (herm[0]*lx + herm[1])*lx + herm[2] + * 5. Otherwise compute the gain using the 1st-order polynom: gain = tilt[0]*lx + tilt[1] - * 6. return expf(gain) + * 6. return expf(gain) * x */ typedef struct LSP_DSP_LIB_TYPE(compressor_knee_t) { @@ -61,6 +61,30 @@ typedef struct LSP_DSP_LIB_TYPE(compressor_x2_t) LSP_DSP_LIB_TYPE(compressor_knee_t) k[2]; } LSP_DSP_LIB_TYPE(compressor_x2_t); + +/** + * Gate knee is a curve that consists of three parts: + * 1. Part with constant gain amplification in the range [-inf .. start] dB + * 2. Transition zone in the range (start .. end) dB present by the quadratic function (2nd-order polynom) + * 3. Part with constant gain amplification in the range [end .. +inf] dB + * + * The typical algorithm of computing the gate's curve: + * 1. Take absolute value of the sample: x = fabsf(in) + * 2. If x <= start then return gain_start * x + * 3. If x >= end then return gain_end * x + * 4. Compute the natural logarithm of the x: lx = logf(x). + * 5. Compute the gain using the 3rd-order polynom: gain = ((herm[0]*lx + herm[1])*lx + herm[2])*lx + herm[3] + * 6. 
return expf(gain) * x + */ +typedef struct LSP_DSP_LIB_TYPE(gate_knee_t) +{ + float start; // The start of the knee, in gain units + float end; // The end of the knee, in gain units + float gain_start; // Gain below the start threshold + float gain_end; // Gain above the end threshold + float herm[4]; // Hermite interpolation of the knee with the 3rd-order polynom +} LSP_DSP_LIB_TYPE(gate_knee_t); + #pragma pack(pop) LSP_DSP_LIB_END_NAMESPACE diff --git a/include/private/dsp/arch/generic/dynamics.h b/include/private/dsp/arch/generic/dynamics.h index 2ad90164..efd7e8b3 100644 --- a/include/private/dsp/arch/generic/dynamics.h +++ b/include/private/dsp/arch/generic/dynamics.h @@ -27,5 +27,6 @@ #endif /* PRIVATE_DSP_ARCH_GENERIC_IMPL */ #include +#include #endif /* PRIVATE_DSP_ARCH_GENERIC_DYNAMICS_H_ */ diff --git a/include/private/dsp/arch/generic/dynamics/gate.h b/include/private/dsp/arch/generic/dynamics/gate.h new file mode 100644 index 00000000..6af2adc0 --- /dev/null +++ b/include/private/dsp/arch/generic/dynamics/gate.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2023 Linux Studio Plugins Project + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#ifndef PRIVATE_DSP_ARCH_GENERIC_DYNAMICS_GATE_H_ +#define PRIVATE_DSP_ARCH_GENERIC_DYNAMICS_GATE_H_ + +#ifndef PRIVATE_DSP_ARCH_GENERIC_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_GENERIC_IMPL */ + +namespace lsp +{ + namespace generic + { + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + for (size_t i=0; istart) + x = c->gain_start; + else if (x >= c->end) + x = c->gain_end; + else + { + float lx = logf(x); + x = expf(((c->herm[0]*lx + c->herm[1])*lx + c->herm[2])*lx + c->herm[3]); + } + dst[i] = x; + } + } + + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + for (size_t i=0; istart) + x *= c->gain_start; + else if (x >= c->end) + x *= c->gain_end; + else + { + float lx = logf(x); + x *= expf(((c->herm[0]*lx + c->herm[1])*lx + c->herm[2])*lx + c->herm[3]); + } + dst[i] = x; + } + } + } /* namespace generic */ +} /* namespace lsp */ + + + +#endif /* PRIVATE_DSP_ARCH_GENERIC_DYNAMICS_GATE_H_ */ diff --git a/src/main/generic/generic.cpp b/src/main/generic/generic.cpp index 801612ef..5a0d1d90 100644 --- a/src/main/generic/generic.cpp +++ b/src/main/generic/generic.cpp @@ -635,6 +635,8 @@ namespace lsp EXPORT1(compressor_x2_gain) EXPORT1(compressor_x2_curve) + EXPORT1(gate_x1_gain) + EXPORT1(gate_x1_curve) } #undef EXPORT1 diff --git a/src/test/ptest/dynamics/gate_x1_curve.cpp b/src/test/ptest/dynamics/gate_x1_curve.cpp new file mode 100644 index 00000000..5d9d4382 --- /dev/null +++ b/src/test/ptest/dynamics/gate_x1_curve.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2023 Linux Studio Plugins Project + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. 
+ * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#define MIN_RANK 8 +#define MAX_RANK 16 + +namespace lsp +{ + namespace generic + { + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + IF_ARCH_X86( + namespace sse2 + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + namespace avx2 + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_X86_64( + namespace avx2 + { +// void x64_gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void x64_gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) +} + +typedef void (* gate_x1_func_t)(float *dst, const float *src, const lsp::dsp::gate_knee_t *c, size_t count); + +//----------------------------------------------------------------------------- +// 
Performance test for logarithmic axis calculation +PTEST_BEGIN("dsp.dynamics", gate_x1_curve, 5, 1000) + + void call(const char *label, float *dst, const float *src, const dsp::gate_knee_t *gate, size_t count, gate_x1_func_t func) + { + if (!PTEST_SUPPORTED(func)) + return; + + char buf[80]; + sprintf(buf, "%s x %d", label, int(count)); + printf("Testing %s points...\n", buf); + + PTEST_LOOP(buf, + func(dst, src, gate, count); + ); + } + + PTEST_MAIN + { + size_t buf_size = 1 << MAX_RANK; + uint8_t *data = NULL; + float *ptr = alloc_aligned(data, buf_size * 2, 64); + + dsp::gate_knee_t gate; + gate = { + 0.0316244587f, + 0.0631000027f, + 0.0631000027f, + 1.0f, + {-16.7640247f, -156.329346f, -479.938873f, -486.233582f}}; + + float *src = ptr; + float *dst = &src[buf_size]; + float k = 72.0f / (1 << MIN_RANK); + + for (size_t i=0; i + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#include +#include +#include +#include +#include +#include + +#define MIN_RANK 8 +#define MAX_RANK 16 + +namespace lsp +{ + namespace generic + { + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + IF_ARCH_X86( + namespace sse2 + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + namespace avx2 + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_X86_64( + namespace avx2 + { +// void x64_gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void x64_gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) +} + +typedef void (* gate_x1_func_t)(float *dst, const float *src, const lsp::dsp::gate_knee_t *c, size_t count); + +//----------------------------------------------------------------------------- +// Performance test for logarithmic axis calculation +PTEST_BEGIN("dsp.dynamics", gate_x1_gain, 5, 1000) + + void call(const char *label, float *dst, const float *src, const dsp::gate_knee_t *gate, size_t count, gate_x1_func_t func) + { + if (!PTEST_SUPPORTED(func)) + return; + + char buf[80]; + sprintf(buf, "%s x %d", label, int(count)); + printf("Testing %s points...\n", buf); + + PTEST_LOOP(buf, + func(dst, src, gate, count); + ); + } + + PTEST_MAIN + { + size_t buf_size = 1 << MAX_RANK; + uint8_t *data = NULL; + float *ptr = alloc_aligned(data, buf_size * 2, 64); + + dsp::gate_knee_t gate; + gate = { + 0.0316244587f, + 0.0631000027f, + 
0.0631000027f, + 1.0f, + {-16.7640247f, -156.329346f, -479.938873f, -486.233582f}}; + + float *src = ptr; + float *dst = &src[buf_size]; + float k = 72.0f / (1 << MIN_RANK); + + for (size_t i=0; i + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#include +#include +#include +#include + +#define MIN_RANK 8 +#define MAX_RANK 16 + +namespace lsp +{ + namespace generic + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + IF_ARCH_X86( + namespace sse2 + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + namespace avx2 + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_X86_64( + namespace avx2 + { +// void x64_gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void x64_gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { +// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { +// void gate_x1_curve(float 
*dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) +} + +typedef void (* gate_x1_func_t)(float *dst, const float *src, const lsp::dsp::gate_knee_t *c, size_t count); + +//----------------------------------------------------------------------------- +// Unit test for simple operations +UTEST_BEGIN("dsp.dynamics", gate_x1_curve) + + void call(const char *label, size_t align, gate_x1_func_t func1, gate_x1_func_t func2) + { + if (!UTEST_SUPPORTED(func1)) + return; + if (!UTEST_SUPPORTED(func2)) + return; + + dsp::gate_knee_t gate[2]; + gate[0] = { + 0.0316244587f, + 0.0631000027f, + 0.0631000027f, + 1.0f, + {-16.7640247f, -156.329346f, -479.938873f, -486.233582f}}; + + gate[1] = { + 0.0316244587f, + 0.0631000027f, + 1.0f, + 0.0630957335f, + {16.7644348f, 156.33316f, 479.950592f, 483.48233f}}; + + UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 64, 65, 100, 999, 0xfff) + { + for (size_t mask=0; mask <= 0x03; ++mask) + { + for (size_t i=0; i<2; ++i) + { + printf("Testing %s on compressor %d, input buffer of %d numbers, mask=0x%x...\n", label, int(i), int(count), int(mask)); + + FloatBuffer src(count, align, mask & 0x01); + FloatBuffer dst(count, align, mask & 0x02); + + src.randomize_0to1(); + dst.randomize_sign(); + FloatBuffer dst1(dst); + FloatBuffer dst2(dst); + + // Call functions + func1(dst1, src, &gate[i], count); + func2(dst2, src, &gate[i], count); + + UTEST_ASSERT_MSG(src.valid(), "Source buffer corrupted"); + UTEST_ASSERT_MSG(dst.valid(), "Destination buffer corrupted"); + UTEST_ASSERT_MSG(dst1.valid(), "Destination buffer 1 corrupted"); + UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); + + // Compare buffers + if (!dst1.equals_relative(dst2, 1e-4)) + { + src.dump("src "); + dst.dump("dst "); + dst1.dump("dst1"); + dst2.dump("dst2"); + printf("index=%d, %.6f vs %.6f\n", dst1.last_diff(), dst1.get_diff(), dst2.get_diff()); + UTEST_FAIL_MSG("Output 
of functions for test '%s' differs", label); + } + } + } + } + } + + UTEST_MAIN + { + #define CALL(generic, func, align) \ + call(#func, align, generic, func); + +// IF_ARCH_X86(CALL(generic::gate_x1_curve, sse2::gate_x1_curve, 16)); +// IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve, 32)); +// IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve_fma3, 32)); +// IF_ARCH_X86_64(CALL(generic::gate_x1_curve, avx2::x64_gate_x1_curve, 32)); +// IF_ARCH_X86_64(CALL(generic::gate_x1_curve, avx2::x64_gate_x1_curve_fma3, 32)); +// +// IF_ARCH_ARM(CALL(generic::gate_x1_curve, neon_d32::gate_x1_curve, 16)); +// +// IF_ARCH_AARCH64(CALL(generic::gate_x1_curve, asimd::gate_x1_curve, 16)); + } +UTEST_END + + + diff --git a/src/test/utest/dynamics/gate_x1_gain.cpp b/src/test/utest/dynamics/gate_x1_gain.cpp new file mode 100644 index 00000000..b80dcb84 --- /dev/null +++ b/src/test/utest/dynamics/gate_x1_gain.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2023 Linux Studio Plugins Project + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#include +#include +#include +#include + +#define MIN_RANK 8 +#define MAX_RANK 16 + +namespace lsp +{ + namespace generic + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + IF_ARCH_X86( + namespace sse2 + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + + namespace avx2 + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_X86_64( + namespace avx2 + { +// void x64_gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); +// void x64_gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { +// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + } + ) +} + +typedef void (* gate_x1_func_t)(float *dst, const float *src, const lsp::dsp::gate_knee_t *c, size_t count); + +//----------------------------------------------------------------------------- +// Unit test for simple operations +UTEST_BEGIN("dsp.dynamics", gate_x1_gain) + + void call(const char *label, size_t align, gate_x1_func_t func1, gate_x1_func_t func2) + { + if (!UTEST_SUPPORTED(func1)) + return; + if (!UTEST_SUPPORTED(func2)) + return; + + dsp::gate_knee_t gate[2]; + gate[0] = { + 0.0316244587f, + 0.0631000027f, + 0.0631000027f, + 1.0f, + {-16.7640247f, -156.329346f, -479.938873f, -486.233582f}}; + + gate[1] = { + 0.0316244587f, + 0.0631000027f, + 1.0f, + 0.0630957335f, + {16.7644348f, 156.33316f, 479.950592f, 483.48233f}}; + + UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
32, 64, 65, 100, 999, 0xfff) + { + for (size_t mask=0; mask <= 0x03; ++mask) + { + for (size_t i=0; i<2; ++i) + { + printf("Testing %s on compressor %d, input buffer of %d numbers, mask=0x%x...\n", label, int(i), int(count), int(mask)); + + FloatBuffer src(count, align, mask & 0x01); + FloatBuffer dst(count, align, mask & 0x02); + + src.randomize_0to1(); + dst.randomize_sign(); + FloatBuffer dst1(dst); + FloatBuffer dst2(dst); + + // Call functions + func1(dst1, src, &gate[i], count); + func2(dst2, src, &gate[i], count); + + UTEST_ASSERT_MSG(src.valid(), "Source buffer corrupted"); + UTEST_ASSERT_MSG(dst.valid(), "Destination buffer corrupted"); + UTEST_ASSERT_MSG(dst1.valid(), "Destination buffer 1 corrupted"); + UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); + + // Compare buffers + if (!dst1.equals_relative(dst2, 1e-4)) + { + src.dump("src "); + dst.dump("dst "); + dst1.dump("dst1"); + dst2.dump("dst2"); + printf("index=%d, %.6f vs %.6f\n", dst1.last_diff(), dst1.get_diff(), dst2.get_diff()); + UTEST_FAIL_MSG("Output of functions for test '%s' differs", label); + } + } + } + } + } + + UTEST_MAIN + { + #define CALL(generic, func, align) \ + call(#func, align, generic, func); + +// IF_ARCH_X86(CALL(generic::gate_x1_gain, sse2::gate_x1_gain, 16)); +// IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain, 32)); +// IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain_fma3, 32)); +// IF_ARCH_X86_64(CALL(generic::gate_x1_gain, avx2::x64_gate_x1_gain, 32)); +// IF_ARCH_X86_64(CALL(generic::gate_x1_gain, avx2::x64_gate_x1_gain_fma3, 32)); +// +// IF_ARCH_ARM(CALL(generic::gate_x1_gain, neon_d32::gate_x1_gain, 16)); +// +// IF_ARCH_AARCH64(CALL(generic::gate_x1_gain, asimd::gate_x1_gain, 16)); + } +UTEST_END + + + From 7f41770efbe5be8047fd8be327a70f833719a6b1 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Thu, 19 Oct 2023 21:16:15 +0300 Subject: [PATCH 2/4] Implemented SSE2-optimized version of gate functions --- 
include/private/dsp/arch/x86/sse2/dynamics.h | 1 + .../private/dsp/arch/x86/sse2/dynamics/gate.h | 364 ++++++++++++++++++ src/main/x86/sse2.cpp | 2 + src/test/ptest/dynamics/gate_x1_curve.cpp | 4 +- src/test/ptest/dynamics/gate_x1_gain2.cpp | 4 +- src/test/utest/dynamics/gate_x1_curve.cpp | 10 +- src/test/utest/dynamics/gate_x1_gain.cpp | 10 +- 7 files changed, 381 insertions(+), 14 deletions(-) create mode 100644 include/private/dsp/arch/x86/sse2/dynamics/gate.h diff --git a/include/private/dsp/arch/x86/sse2/dynamics.h b/include/private/dsp/arch/x86/sse2/dynamics.h index bf7daf1e..d452ec3e 100644 --- a/include/private/dsp/arch/x86/sse2/dynamics.h +++ b/include/private/dsp/arch/x86/sse2/dynamics.h @@ -27,5 +27,6 @@ #endif /* PRIVATE_DSP_ARCH_X86_SSE2_IMPL */ #include +#include #endif /* PRIVATE_DSP_ARCH_X86_SSE2_DYNAMICS_H_ */ diff --git a/include/private/dsp/arch/x86/sse2/dynamics/gate.h b/include/private/dsp/arch/x86/sse2/dynamics/gate.h new file mode 100644 index 00000000..ba759e17 --- /dev/null +++ b/include/private/dsp/arch/x86/sse2/dynamics/gate.h @@ -0,0 +1,364 @@ +/* + * Copyright (C) 2023 Linux Studio Plugins Project + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE2_DYNAMICS_GATE_H_ +#define PRIVATE_DSP_ARCH_X86_SSE2_DYNAMICS_GATE_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE2_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_SSE2_IMPL */ + +#include +#include + +namespace lsp +{ + namespace sse2 + { + #pragma pack(push, 1) + typedef struct gate_knee_t + { + float start[4]; // +0x00 + float end[4]; // +0x10 + float gain_start[4]; // +0x20 + float gain_end[4]; // +0x30 + float herm[16]; // +0x40 + } gate_knee_t; + #pragma pack(pop) + + #define UNPACK_GATE_KNEE(DST, SRC) \ + __ASM_EMIT("movss 0x00(%[" SRC "]), %%xmm0") \ + __ASM_EMIT("movss 0x04(%[" SRC "]), %%xmm1") \ + __ASM_EMIT("movss 0x08(%[" SRC "]), %%xmm2") \ + __ASM_EMIT("movss 0x0c(%[" SRC "]), %%xmm3") \ + __ASM_EMIT("movss 0x10(%[" SRC "]), %%xmm4") \ + __ASM_EMIT("movss 0x14(%[" SRC "]), %%xmm5") \ + __ASM_EMIT("movss 0x18(%[" SRC "]), %%xmm6") \ + __ASM_EMIT("movss 0x1c(%[" SRC "]), %%xmm7") \ + __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0") \ + __ASM_EMIT("shufps $0x00, %%xmm1, %%xmm1") \ + __ASM_EMIT("shufps $0x00, %%xmm2, %%xmm2") \ + __ASM_EMIT("shufps $0x00, %%xmm3, %%xmm3") \ + __ASM_EMIT("shufps $0x00, %%xmm4, %%xmm4") \ + __ASM_EMIT("shufps $0x00, %%xmm5, %%xmm5") \ + __ASM_EMIT("shufps $0x00, %%xmm6, %%xmm6") \ + __ASM_EMIT("shufps $0x00, %%xmm7, %%xmm7") \ + __ASM_EMIT("movaps %%xmm0, 0x00 + %[" DST "]") \ + __ASM_EMIT("movaps %%xmm1, 0x10 + %[" DST "]") \ + __ASM_EMIT("movaps %%xmm2, 0x20 + %[" DST "]") \ + __ASM_EMIT("movaps %%xmm3, 0x30 + %[" DST "]") \ + __ASM_EMIT("movaps %%xmm4, 0x40 + %[" DST "]") \ + __ASM_EMIT("movaps %%xmm5, 0x50 + %[" DST "]") \ + __ASM_EMIT("movaps %%xmm6, 0x60 + %[" DST "]") \ + __ASM_EMIT("movaps %%xmm7, 0x70 + %[" DST "]") + + #define PROCESS_KNEE_SINGLE_X8 \ + /* in: xmm0 = lx0, xmm4 = lx1 */ \ + __ASM_EMIT("movaps 0x40 + %[knee], %%xmm1") /* xmm1 = herm[0] */ \ + __ASM_EMIT("movaps 0x40 + %[knee], %%xmm5") \ + __ASM_EMIT("mulps %%xmm0, %%xmm1") /* 
xmm1 = herm[0]*lx0 */ \ + __ASM_EMIT("mulps %%xmm4, %%xmm5") \ + __ASM_EMIT("addps 0x50 + %[knee], %%xmm1") /* xmm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("addps 0x50 + %[knee], %%xmm5") \ + __ASM_EMIT("mulps %%xmm0, %%xmm1") /* xmm1 = (herm[0]*lx0+herm[1])*lx0 */ \ + __ASM_EMIT("mulps %%xmm4, %%xmm5") \ + __ASM_EMIT("addps 0x60 + %[knee], %%xmm1") /* xmm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("addps 0x60 + %[knee], %%xmm5") \ + __ASM_EMIT("mulps %%xmm0, %%xmm1") /* xmm1 = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0 */ \ + __ASM_EMIT("mulps %%xmm4, %%xmm5") \ + __ASM_EMIT("addps 0x70 + %[knee], %%xmm1") /* xmm1 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + __ASM_EMIT("addps 0x70 + %[knee], %%xmm5") \ + __ASM_EMIT("movaps %%xmm1, %%xmm0") /* xmm0 = KV */ \ + __ASM_EMIT("movaps %%xmm5, %%xmm4") \ + EXP_CORE_X8 /* xmm0 = expf(KV) */ \ + __ASM_EMIT("movaps 0x00 + %[mem], %%xmm1") /* xmm1 = x0 */ \ + __ASM_EMIT("movaps 0x10 + %[mem], %%xmm5") \ + __ASM_EMIT("movaps %%xmm1, %%xmm2") /* xmm2 = x0 */ \ + __ASM_EMIT("movaps %%xmm5, %%xmm6") \ + __ASM_EMIT("cmpps $6, 0x00 + %[knee], %%xmm1") /* xmm0 = [x0 > start] */ \ + __ASM_EMIT("cmpps $6, 0x00 + %[knee], %%xmm5") \ + __ASM_EMIT("cmpps $1, 0x10 + %[knee], %%xmm2") /* xmm2 = [x0 < end] */ \ + __ASM_EMIT("cmpps $1, 0x10 + %[knee], %%xmm6") \ + __ASM_EMIT("andps %%xmm1, %%xmm0") /* xmm0 = [x0 > start] & expf(KV) */ \ + __ASM_EMIT("andps %%xmm5, %%xmm4") \ + __ASM_EMIT("andps %%xmm2, %%xmm0") /* xmm0 = ([x0 > start] && [x0 < end]) & expf(KV) */ \ + __ASM_EMIT("andps %%xmm6, %%xmm4") \ + __ASM_EMIT("andnps 0x20 + %[knee], %%xmm1") /* xmm1 = [x0 <= start] & gain_start */ \ + __ASM_EMIT("andnps 0x20 + %[knee], %%xmm5") \ + __ASM_EMIT("andnps 0x30 + %[knee], %%xmm2") /* xmm2 = [x0 >= end] & gain_end */ \ + __ASM_EMIT("andnps 0x30 + %[knee], %%xmm6") \ + __ASM_EMIT("orps %%xmm1, %%xmm0") /* xmm0 = ([x0 <= start]) ? gain_start : ([x0 < end]) ? 
expf(KV) : 0 */ \ + __ASM_EMIT("orps %%xmm5, %%xmm4") \ + __ASM_EMIT("orps %%xmm2, %%xmm0") /* xmm0 = ([x0 <= start]) ? gain_start : ([x0 < end]) ? expf(KV) : gain_end */ \ + __ASM_EMIT("orps %%xmm6, %%xmm4") \ + /* out: xmm0 = g0, xmm4 = g1 */ + + #define PROCESS_KNEE_SINGLE_X4 \ + /* in: xmm0 = lx0 */ \ + __ASM_EMIT("movaps 0x40 + %[knee], %%xmm1") /* xmm1 = herm[0] */ \ + __ASM_EMIT("mulps %%xmm0, %%xmm1") /* xmm1 = herm[0]*lx0 */ \ + __ASM_EMIT("addps 0x50 + %[knee], %%xmm1") /* xmm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("mulps %%xmm0, %%xmm1") /* xmm1 = (herm[0]*lx0+herm[1])*lx0 */ \ + __ASM_EMIT("addps 0x60 + %[knee], %%xmm1") /* xmm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("mulps %%xmm0, %%xmm1") /* xmm1 = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0 */ \ + __ASM_EMIT("addps 0x70 + %[knee], %%xmm1") /* xmm1 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + __ASM_EMIT("movaps %%xmm1, %%xmm0") /* xmm0 = KV */ \ + EXP_CORE_X4 /* xmm0 = expf(KV) */ \ + __ASM_EMIT("movaps 0x00 + %[mem], %%xmm1") /* xmm1 = x0 */ \ + __ASM_EMIT("movaps %%xmm1, %%xmm2") /* xmm2 = x0 */ \ + __ASM_EMIT("cmpps $6, 0x00 + %[knee], %%xmm1") /* xmm0 = [x0 > start] */ \ + __ASM_EMIT("cmpps $1, 0x10 + %[knee], %%xmm2") /* xmm2 = [x0 < end] */ \ + __ASM_EMIT("andps %%xmm1, %%xmm0") /* xmm0 = [x0 > start] & expf(KV) */ \ + __ASM_EMIT("andps %%xmm2, %%xmm0") /* xmm0 = ([x0 > start] && [x0 < end]) & expf(KV) */ \ + __ASM_EMIT("andnps 0x20 + %[knee], %%xmm1") /* xmm1 = [x0 <= start] & gain_start */ \ + __ASM_EMIT("andnps 0x30 + %[knee], %%xmm2") /* xmm2 = [x0 >= end] & gain_end */ \ + __ASM_EMIT("orps %%xmm1, %%xmm0") /* xmm0 = ([x0 <= start]) ? gain_start : ([x0 < end]) ? expf(KV) : 0 */ \ + __ASM_EMIT("orps %%xmm2, %%xmm0") /* xmm0 = ([x0 <= start]) ? gain_start : ([x0 < end]) ? 
expf(KV) : gain_end */ \ + /* out: xmm0 = g0 */ + + #define PROCESS_GATE_FULL_X8 \ + /* in: xmm0 = x0, xmm4 = x1 */ \ + __ASM_EMIT("andps 0x00 + %[G2C], %%xmm0") /* xmm0 = fabsf(x0) */ \ + __ASM_EMIT("andps 0x00 + %[G2C], %%xmm4") \ + __ASM_EMIT("movaps %%xmm0, %%xmm1") \ + __ASM_EMIT("movaps %%xmm4, %%xmm5") \ + __ASM_EMIT("movaps %%xmm0, %%xmm2") \ + __ASM_EMIT("movaps %%xmm4, %%xmm6") \ + __ASM_EMIT("cmpps $6, 0x00 + %[knee], %%xmm1") /* xmm1 = [x0 > start] */ \ + __ASM_EMIT("cmpps $6, 0x00 + %[knee], %%xmm5") \ + __ASM_EMIT("cmpps $1, 0x10 + %[knee], %%xmm2") /* xmm2 = [x0 < end] */ \ + __ASM_EMIT("cmpps $1, 0x10 + %[knee], %%xmm6") \ + __ASM_EMIT("movaps %%xmm1, %%xmm3") /* xmm3 = [x0 > start] */ \ + __ASM_EMIT("movaps %%xmm5, %%xmm7") \ + __ASM_EMIT("andps %%xmm2, %%xmm3") /* xmm3 = [x0 > start] && [x0 < end] */ \ + __ASM_EMIT("andps %%xmm6, %%xmm7") \ + __ASM_EMIT("orps %%xmm7, %%xmm3") \ + __ASM_EMIT("movmskps %%xmm3, %[mask]") \ + __ASM_EMIT("test %[mask], %[mask]") \ + __ASM_EMIT("jnz 100f") \ + /* Case when we do not need logarithm because there is no sample at the knee */ \ + __ASM_EMIT("andnps 0x20 + %[knee], %%xmm1") /* xmm1 = [x0 <= start] & gain_start */ \ + __ASM_EMIT("andnps 0x20 + %[knee], %%xmm5") \ + __ASM_EMIT("andnps 0x30 + %[knee], %%xmm2") /* xmm2 = [x0 >= end] & gain_end */ \ + __ASM_EMIT("andnps 0x30 + %[knee], %%xmm6") \ + __ASM_EMIT("movaps %%xmm1, %%xmm0") /* xmm0 = [x0 <= start] & gain_start */ \ + __ASM_EMIT("movaps %%xmm5, %%xmm4") \ + __ASM_EMIT("orps %%xmm2, %%xmm0") /* xmm0 = [x0 <= start] & gain_start | [x0 >= end] & gain_end */ \ + __ASM_EMIT("orps %%xmm6, %%xmm4") \ + __ASM_EMIT("jmp 200f") \ + __ASM_EMIT("100:") \ + __ASM_EMIT("movaps %%xmm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + __ASM_EMIT("movaps %%xmm4, 0x10 + %[mem]") \ + LOGE_CORE_X8 /* xmm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X8 /* apply knee 0 */ \ + __ASM_EMIT("200:") \ + /* out: xmm0 = g0, xmm4 = g1 */ + + #define PROCESS_GATE_FULL_X4 \ + /* in: 
xmm0 = x0 */ \ + __ASM_EMIT("andps 0x00 + %[G2C], %%xmm0") /* xmm0 = fabsf(x0) */ \ + __ASM_EMIT("movaps %%xmm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + LOGE_CORE_X4 /* xmm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X4 /* apply knee 0 */ \ + /* out: xmm0 = g */ + + static const uint32_t gate_const[] __lsp_aligned16 = + { + LSP_DSP_VEC4(0x7fffffff) + }; + + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned16; + float mem[8] __lsp_aligned16; + float stub[4] __lsp_aligned16; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 8x blocks + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("movups 0x00(%[src]), %%xmm0") + __ASM_EMIT("movups 0x10(%[src]), %%xmm4") + PROCESS_GATE_FULL_X8 + __ASM_EMIT("movups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("movups %%xmm4, 0x10(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("movups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("movups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("4:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 12f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 6f") + __ASM_EMIT("movss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("6:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 8f") + __ASM_EMIT("movhps 0x00(%[src]), %%xmm0") + __ASM_EMIT("8:") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("movss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("10:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 12f") + 
__ASM_EMIT("movhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("12:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned16; + float mem[8] __lsp_aligned16; + float stub[4] __lsp_aligned16; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 8x blocks + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("movups 0x00(%[src]), %%xmm0") + __ASM_EMIT("movups 0x10(%[src]), %%xmm4") + PROCESS_GATE_FULL_X8 + __ASM_EMIT("movups 0x00(%[src]), %%xmm1") + __ASM_EMIT("movups 0x10(%[src]), %%xmm5") + __ASM_EMIT("mulps %%xmm1, %%xmm0") + __ASM_EMIT("mulps %%xmm5, %%xmm4") + __ASM_EMIT("movups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("movups %%xmm4, 0x10(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("movups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("movups 0x00(%[src]), %%xmm1") + __ASM_EMIT("mulps %%xmm1, %%xmm0") + __ASM_EMIT("movups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("4:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 12f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 6f") + __ASM_EMIT("movss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("6:") + __ASM_EMIT("test $2, 
%[count]") + __ASM_EMIT("jz 8f") + __ASM_EMIT("movhps 0x00(%[src]), %%xmm0") + __ASM_EMIT("8:") + __ASM_EMIT("movaps %%xmm0, %%xmm4") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("mulps %%xmm4, %%xmm0") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("movss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("10:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("movhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("12:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + #undef PROCESS_KNEE_SINGLE_X4 + #undef PROCESS_KNEE_SINGLE_X8 + #undef PROCESS_GATE_FULL_X4 + #undef PROCESS_GATE_FULL_X8 + #undef UNPACK_GATE_KNEE + + } /* namespace sse2 */ +} /* namespace lsp */ + + +#endif /* PRIVATE_DSP_ARCH_X86_SSE2_DYNAMICS_GATE_H_ */ diff --git a/src/main/x86/sse2.cpp b/src/main/x86/sse2.cpp index e9cb76fc..509faab8 100644 --- a/src/main/x86/sse2.cpp +++ b/src/main/x86/sse2.cpp @@ -149,6 +149,8 @@ EXPORT1(compressor_x2_gain) EXPORT1(compressor_x2_curve) + EXPORT1(gate_x1_gain) + EXPORT1(gate_x1_curve) } #undef EXPORT1 diff --git a/src/test/ptest/dynamics/gate_x1_curve.cpp b/src/test/ptest/dynamics/gate_x1_curve.cpp index 5d9d4382..784c8716 100644 --- a/src/test/ptest/dynamics/gate_x1_curve.cpp +++ b/src/test/ptest/dynamics/gate_x1_curve.cpp @@ -39,7 +39,7 @@ namespace lsp IF_ARCH_X86( namespace sse2 { -// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } namespace avx2 @@ -124,7 +124,7 @@ PTEST_BEGIN("dsp.dynamics", gate_x1_curve, 5, 1000) size_t count = 
1 << i; CALL(generic::gate_x1_curve); -// IF_ARCH_X86(CALL(sse2::gate_x1_curve)); + IF_ARCH_X86(CALL(sse2::gate_x1_curve)); // IF_ARCH_X86(CALL(avx2::gate_x1_curve)); // IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_curve)); // IF_ARCH_X86(CALL(avx2::gate_x1_curve_fma3)); diff --git a/src/test/ptest/dynamics/gate_x1_gain2.cpp b/src/test/ptest/dynamics/gate_x1_gain2.cpp index f91dcae0..277e4993 100644 --- a/src/test/ptest/dynamics/gate_x1_gain2.cpp +++ b/src/test/ptest/dynamics/gate_x1_gain2.cpp @@ -39,7 +39,7 @@ namespace lsp IF_ARCH_X86( namespace sse2 { -// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } namespace avx2 @@ -124,7 +124,7 @@ PTEST_BEGIN("dsp.dynamics", gate_x1_gain, 5, 1000) size_t count = 1 << i; CALL(generic::gate_x1_gain); -// IF_ARCH_X86(CALL(sse2::gate_x1_gain)); + IF_ARCH_X86(CALL(sse2::gate_x1_gain)); // IF_ARCH_X86(CALL(avx2::gate_x1_gain)); // IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_gain)); // IF_ARCH_X86(CALL(avx2::gate_x1_gain_fma3)); diff --git a/src/test/utest/dynamics/gate_x1_curve.cpp b/src/test/utest/dynamics/gate_x1_curve.cpp index 45228131..457aa6a5 100644 --- a/src/test/utest/dynamics/gate_x1_curve.cpp +++ b/src/test/utest/dynamics/gate_x1_curve.cpp @@ -31,13 +31,13 @@ namespace lsp { namespace generic { -// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } IF_ARCH_X86( namespace sse2 { -// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } namespace avx2 @@ -105,7 +105,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_curve) { for (size_t i=0; i<2; ++i) { - printf("Testing %s on compressor %d, input buffer of %d numbers, mask=0x%x...\n", label, 
int(i), int(count), int(mask)); + printf("Testing %s on gate %d, input buffer of %d numbers, mask=0x%x...\n", label, int(i), int(count), int(mask)); FloatBuffer src(count, align, mask & 0x01); FloatBuffer dst(count, align, mask & 0x02); @@ -125,7 +125,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_curve) UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); // Compare buffers - if (!dst1.equals_relative(dst2, 1e-4)) + if (!dst1.equals_absolute(dst2, 1e-4)) { src.dump("src "); dst.dump("dst "); @@ -144,7 +144,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_curve) #define CALL(generic, func, align) \ call(#func, align, generic, func); -// IF_ARCH_X86(CALL(generic::gate_x1_curve, sse2::gate_x1_curve, 16)); + IF_ARCH_X86(CALL(generic::gate_x1_curve, sse2::gate_x1_curve, 16)); // IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve, 32)); // IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve_fma3, 32)); // IF_ARCH_X86_64(CALL(generic::gate_x1_curve, avx2::x64_gate_x1_curve, 32)); diff --git a/src/test/utest/dynamics/gate_x1_gain.cpp b/src/test/utest/dynamics/gate_x1_gain.cpp index b80dcb84..81110550 100644 --- a/src/test/utest/dynamics/gate_x1_gain.cpp +++ b/src/test/utest/dynamics/gate_x1_gain.cpp @@ -31,13 +31,13 @@ namespace lsp { namespace generic { -// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } IF_ARCH_X86( namespace sse2 { -// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } namespace avx2 @@ -105,7 +105,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_gain) { for (size_t i=0; i<2; ++i) { - printf("Testing %s on compressor %d, input buffer of %d numbers, mask=0x%x...\n", label, int(i), int(count), int(mask)); + printf("Testing %s on gate %d, input buffer of %d numbers, 
mask=0x%x...\n", label, int(i), int(count), int(mask)); FloatBuffer src(count, align, mask & 0x01); FloatBuffer dst(count, align, mask & 0x02); @@ -125,7 +125,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_gain) UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); // Compare buffers - if (!dst1.equals_relative(dst2, 1e-4)) + if (!dst1.equals_absolute(dst2, 1e-4)) { src.dump("src "); dst.dump("dst "); @@ -144,7 +144,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_gain) #define CALL(generic, func, align) \ call(#func, align, generic, func); -// IF_ARCH_X86(CALL(generic::gate_x1_gain, sse2::gate_x1_gain, 16)); + IF_ARCH_X86(CALL(generic::gate_x1_gain, sse2::gate_x1_gain, 16)); // IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain, 32)); // IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain_fma3, 32)); // IF_ARCH_X86_64(CALL(generic::gate_x1_gain, avx2::x64_gate_x1_gain, 32)); From b101544dae685439922bfcc23372ee098a08f6b5 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Thu, 19 Oct 2023 23:19:25 +0300 Subject: [PATCH 3/4] Implemened AVX2-optimized gate functions --- include/private/dsp/arch/x86/avx2/dynamics.h | 1 + .../private/dsp/arch/x86/avx2/dynamics/gate.h | 1310 +++++++++++++++++ src/main/x86/avx2.cpp | 11 +- src/test/ptest/dynamics/gate_x1_curve.cpp | 16 +- src/test/ptest/dynamics/gate_x1_gain2.cpp | 16 +- src/test/utest/dynamics/gate_x1_curve.cpp | 22 +- src/test/utest/dynamics/gate_x1_gain.cpp | 22 +- 7 files changed, 1359 insertions(+), 39 deletions(-) create mode 100644 include/private/dsp/arch/x86/avx2/dynamics/gate.h diff --git a/include/private/dsp/arch/x86/avx2/dynamics.h b/include/private/dsp/arch/x86/avx2/dynamics.h index fde1c388..908ec4ad 100644 --- a/include/private/dsp/arch/x86/avx2/dynamics.h +++ b/include/private/dsp/arch/x86/avx2/dynamics.h @@ -27,6 +27,7 @@ #endif /* PRIVATE_DSP_ARCH_X86_AVX2_IMPL */ #include +#include #endif /* PRIVATE_DSP_ARCH_X86_AVX2_DYNAMICS_H_ */ diff --git a/include/private/dsp/arch/x86/avx2/dynamics/gate.h 
b/include/private/dsp/arch/x86/avx2/dynamics/gate.h new file mode 100644 index 00000000..378f3949 --- /dev/null +++ b/include/private/dsp/arch/x86/avx2/dynamics/gate.h @@ -0,0 +1,1310 @@ +/* + * Copyright (C) 2023 Linux Studio Plugins Project + * (C) 2023 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 19 окт. 2023 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX2_DYNAMICS_GATE_H_ +#define PRIVATE_DSP_ARCH_X86_AVX2_DYNAMICS_GATE_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX2_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_AVX2_IMPL */ + +#include +#include + +namespace lsp +{ + namespace avx2 + { + #pragma pack(push, 1) + typedef struct gate_knee_t + { + float start[8]; // +0x00 + float end[8]; // +0x20 + float gain_start[8]; // +0x40 + float gain_end[8]; // +0x60 + float herm[32]; // +0x80 + } gate_knee_t; + #pragma pack(pop) + + static const uint32_t gate_const[] __lsp_aligned32 = + { + LSP_DSP_VEC8(0x7fffffff) + }; + + + #define UNPACK_GATE_KNEE(DST, SRC) \ + __ASM_EMIT("vbroadcastss 0x00(%[" SRC "]), %%ymm0") \ + __ASM_EMIT("vbroadcastss 0x04(%[" SRC "]), %%ymm1") \ + __ASM_EMIT("vbroadcastss 0x08(%[" SRC "]), %%ymm2") \ + __ASM_EMIT("vbroadcastss 0x0c(%[" SRC "]), %%ymm3") \ + __ASM_EMIT("vbroadcastss 0x10(%[" SRC "]), %%ymm4") \ + __ASM_EMIT("vbroadcastss 0x14(%[" SRC "]), 
%%ymm5") \ + __ASM_EMIT("vbroadcastss 0x18(%[" SRC "]), %%ymm6") \ + __ASM_EMIT("vbroadcastss 0x1c(%[" SRC "]), %%ymm7") \ + __ASM_EMIT("vmovaps %%ymm0, 0x000 + %[" DST "]") \ + __ASM_EMIT("vmovaps %%ymm1, 0x020 + %[" DST "]") \ + __ASM_EMIT("vmovaps %%ymm2, 0x040 + %[" DST "]") \ + __ASM_EMIT("vmovaps %%ymm3, 0x060 + %[" DST "]") \ + __ASM_EMIT("vmovaps %%ymm4, 0x080 + %[" DST "]") \ + __ASM_EMIT("vmovaps %%ymm5, 0x0a0 + %[" DST "]") \ + __ASM_EMIT("vmovaps %%ymm6, 0x0c0 + %[" DST "]") \ + __ASM_EMIT("vmovaps %%ymm7, 0x0e0 + %[" DST "]") + + #define PROCESS_KNEE_SINGLE_X32 \ + /* in: ymm0 = lx0, ymm4 = lx1, ymm8 = lx2, ymm12 = lx3 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm1") /* ymm1 = herm[0] */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm5") \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm9") \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm13") \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = herm[0]*lx0 */ \ + __ASM_EMIT("vmulps %%ymm4, %%ymm5, %%ymm5") \ + __ASM_EMIT("vmulps %%ymm8, %%ymm9, %%ymm9") \ + __ASM_EMIT("vmulps %%ymm12, %%ymm13, %%ymm13") \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%ymm1, %%ymm1") /* ymm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%ymm9, %%ymm9") \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%ymm13, %%ymm13") \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0 */ \ + __ASM_EMIT("vmulps %%ymm4, %%ymm5, %%ymm5") \ + __ASM_EMIT("vmulps %%ymm8, %%ymm9, %%ymm9") \ + __ASM_EMIT("vmulps %%ymm12, %%ymm13, %%ymm13") \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%ymm1, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%ymm9, %%ymm9") \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%ymm13, %%ymm13") \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0 */ \ + __ASM_EMIT("vmulps %%ymm4, %%ymm5, %%ymm5") \ + 
__ASM_EMIT("vmulps %%ymm8, %%ymm9, %%ymm9") \ + __ASM_EMIT("vmulps %%ymm12, %%ymm13, %%ymm13") \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%ymm1, %%ymm0") /* ymm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%ymm5, %%ymm4") \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%ymm9, %%ymm8") \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%ymm13, %%ymm12") \ + EXP_CORE_X32 /* ymm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%ymm1") /* ymm1 = x0 */ \ + __ASM_EMIT("vmovaps 0x20 + %[mem], %%ymm5") \ + __ASM_EMIT("vmovaps 0x40 + %[mem], %%ymm9") \ + __ASM_EMIT("vmovaps 0x60 + %[mem], %%ymm13") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm1, %%ymm2") /* ymm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm5, %%ymm6") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm9, %%ymm10") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm13, %%ymm14") \ + __ASM_EMIT("vblendvps %%ymm2, 0x40 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm6, 0x40 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm10, 0x40 + %[knee], %%ymm8, %%ymm8") \ + __ASM_EMIT("vblendvps %%ymm14, 0x40 + %[knee], %%ymm12, %%ymm12") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm1, %%ymm1") /* ymm1 = [x0 >= end] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm9, %%ymm9") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm13, %%ymm13") \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 >= end] ? gain_end : [x0 <= start] ? 
gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm5, 0x60 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm9, 0x60 + %[knee], %%ymm8, %%ymm8") \ + __ASM_EMIT("vblendvps %%ymm13, 0x60 + %[knee], %%ymm12, %%ymm12") \ + /* out: ymm0 = g0, ymm4 = g1, ymm8 = g3, ymm12 = g4 */ + + #define PROCESS_KNEE_SINGLE_X16 \ + /* in: ymm0 = lx0, ymm4 = lx1 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm1") /* ymm1 = herm[0] */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm5") \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = herm[0]*lx0 */ \ + __ASM_EMIT("vmulps %%ymm4, %%ymm5, %%ymm5") \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%ymm1, %%ymm1") /* ymm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0 */ \ + __ASM_EMIT("vmulps %%ymm4, %%ymm5, %%ymm5") \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%ymm1, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0 */ \ + __ASM_EMIT("vmulps %%ymm4, %%ymm5, %%ymm5") \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%ymm1, %%ymm0") /* ymm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%ymm5, %%ymm4") \ + EXP_CORE_X16 /* ymm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%ymm1") /* ymm1 = x0 */ \ + __ASM_EMIT("vmovaps 0x20 + %[mem], %%ymm5") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm1, %%ymm2") /* ymm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm5, %%ymm6") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm1, %%ymm1") /* ymm1 = [x0 >= end] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vblendvps %%ymm2, 0x40 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 <= start] ? 
gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm6, 0x40 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 >= end] ? gain_end : [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm5, 0x60 + %[knee], %%ymm4, %%ymm4") \ + /* out: ymm0 = g0, ymm4 = g1 */ + + #define PROCESS_KNEE_SINGLE_X8 \ + /* in: ymm0 = lx0 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm1") /* ymm1 = herm[0] */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = herm[0]*lx0 */ \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%ymm1, %%ymm1") /* ymm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0 */ \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%ymm1, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm1, %%ymm1") /* ymm1 = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0 */ \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%ymm1, %%ymm0") /* ymm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + EXP_CORE_X8 /* ymm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%ymm1") /* ymm1 = x0 */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm1, %%ymm2") /* ymm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm1, %%ymm1") /* ymm1 = [x0 >= end] */ \ + __ASM_EMIT("vblendvps %%ymm2, 0x40 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 >= end] ? gain_end : [x0 <= start] ? 
gain_start : expf(KV) */ \ + /* out: ymm0 = g0, ymm4 = g1 */ + + #define PROCESS_KNEE_SINGLE_X4 \ + /* in: xmm0 = lx0 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%xmm1") /* xmm1 = herm[0] */ \ + __ASM_EMIT("vmulps %%xmm0, %%xmm1, %%xmm1") /* xmm1 = herm[0]*lx0 */ \ + __ASM_EMIT("vaddps 0xa0 + %[knee], %%xmm1, %%xmm1") /* xmm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vmulps %%xmm0, %%xmm1, %%xmm1") /* xmm1 = (herm[0]*lx0+herm[1])*lx0 */ \ + __ASM_EMIT("vaddps 0xc0 + %[knee], %%xmm1, %%xmm1") /* xmm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vmulps %%xmm0, %%xmm1, %%xmm1") /* xmm1 = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0 */ \ + __ASM_EMIT("vaddps 0xe0 + %[knee], %%xmm1, %%xmm0") /* xmm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + EXP_CORE_X4 /* xmm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%xmm1") /* xmm1 = x0 */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%xmm1, %%xmm2") /* xmm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%xmm1, %%xmm1") /* xmm1 = [x0 >= end] */ \ + __ASM_EMIT("vblendvps %%xmm2, 0x40 + %[knee], %%xmm0, %%xmm0")/* xmm0 = [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%xmm1, 0x60 + %[knee], %%xmm0, %%xmm0")/* xmm0 = [x0 >= end] ? gain_end : [x0 <= start] ? 
gain_start : expf(KV) */ \ + /* out: xmm0 = g0, xmm4 = g1 */ + + #define PROCESS_GATE_FULL_X32 \ + /* in: ymm0 = x0, ymm4 = x1, ymm8 = x2, ymm12 = x4 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm0, %%ymm0") /* ymm0 = fabsf(x0) */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm4, %%ymm4") \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm8, %%ymm8") \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm12, %%ymm12") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm0, %%ymm1") /* ymm1 = [x0 > start] */ \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm8, %%ymm9") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm12, %%ymm13") \ + __ASM_EMIT("vorps %%ymm5, %%ymm1, %%ymm1") \ + __ASM_EMIT("vorps %%ymm13, %%ymm9, %%ymm9") \ + __ASM_EMIT("vorps %%ymm9, %%ymm1, %%ymm1") \ + __ASM_EMIT("vmovmskps %%ymm1, %[mask]") \ + __ASM_EMIT("test %[mask], %[mask]") \ + __ASM_EMIT("jnz 100f") \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm0") /* ymm0 = gain_start */ \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm4") \ + __ASM_EMIT("vmovaps %%ymm0, %%ymm8") \ + __ASM_EMIT("vmovaps %%ymm4, %%ymm12") \ + __ASM_EMIT("jmp 300f") \ + __ASM_EMIT("100:") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm0, %%ymm1") /* ymm1 = [x0 < end] */ \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm8, %%ymm9") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm12, %%ymm13") \ + __ASM_EMIT("vorps %%ymm5, %%ymm1, %%ymm1") \ + __ASM_EMIT("vorps %%ymm13, %%ymm9, %%ymm9") \ + __ASM_EMIT("vorps %%ymm9, %%ymm1, %%ymm1") \ + __ASM_EMIT("vmovmskps %%ymm1, %[mask]") \ + __ASM_EMIT("test %[mask], %[mask]") \ + __ASM_EMIT("jnz 200f") \ + __ASM_EMIT("vmovaps 0x60 + %[knee], %%ymm0") /* ymm0 = gain_start */ \ + __ASM_EMIT("vmovaps 0x60 + %[knee], %%ymm4") \ + __ASM_EMIT("vmovaps %%ymm0, %%ymm8") \ + __ASM_EMIT("vmovaps %%ymm4, %%ymm12") \ + __ASM_EMIT("jmp 300f") \ + __ASM_EMIT("200:") \ + __ASM_EMIT("vmovaps %%ymm0, 0x00 + %[mem]") /* 
store fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%ymm4, 0x20 + %[mem]") \ + __ASM_EMIT("vmovaps %%ymm8, 0x40 + %[mem]") \ + __ASM_EMIT("vmovaps %%ymm12, 0x60 + %[mem]") \ + LOGE_CORE_X32 /* ymm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X32 /* apply knee */ \ + __ASM_EMIT("300:") \ + /* out: ymm0 = g0, ymm4 = g1, ymm8 = g2, ymm12 = g3 */ + + #define PROCESS_GATE_FULL_X16 \ + /* in: ymm0 = x0, ymm4 = x1 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm0, %%ymm0") /* ymm0 = fabsf(x0) */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm4, %%ymm4") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm0, %%ymm1") /* ymm1 = [x0 > start] */ \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm0, %%ymm2") /* ymm2 = [x0 < end] */ \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm4, %%ymm6") \ + __ASM_EMIT("vandps %%ymm2, %%ymm1, %%ymm3") /* ymm3 = [x0 > start] && [x0 < end] */ \ + __ASM_EMIT("vandps %%ymm6, %%ymm5, %%ymm7") \ + __ASM_EMIT("vorps %%ymm7, %%ymm3, %%ymm3") \ + __ASM_EMIT("vmovmskps %%ymm3, %[mask]") \ + __ASM_EMIT("test %[mask], %[mask]") \ + __ASM_EMIT("jnz 100f") \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm0") /* ymm0 = gain_start */ \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 > start] ? 
gain_end : gain_start */ \ + __ASM_EMIT("vblendvps %%ymm5, 0x60 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("jmp 200f") \ + __ASM_EMIT("100:") \ + __ASM_EMIT("vmovaps %%ymm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%ymm4, 0x20 + %[mem]") \ + LOGE_CORE_X16 /* ymm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X16 /* apply knee */ \ + __ASM_EMIT("200:") \ + /* out: ymm0 = g0, ymm4 = g1 */ + + #define PROCESS_GATE_FULL_X8 \ + /* in: ymm0 = x0, ymm4 = x1 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm0, %%ymm0") /* ymm0 = fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%ymm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + LOGE_CORE_X8 /* ymm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X8 /* apply knee */ \ + /* out: ymm0 = g */ + + #define PROCESS_GATE_FULL_X4 \ + /* in: xmm0 = x0, xmm4 = x1 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%xmm0, %%xmm0") /* xmm0 = fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%xmm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + LOGE_CORE_X4 /* xmm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X4 /* apply knee */ \ + /* out: xmm0 = G0 */ + + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned32; + float mem[16] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 16x blocks + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + PROCESS_GATE_FULL_X8 + __ASM_EMIT("vmovups 
%%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("4:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("6:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 14f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 8f") + __ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("8:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("10:") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("12:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 14f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("14:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + IF_ARCH_X86_64( + void x64_gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned32; + float mem[32] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 32x blocks + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 
0x20(%[src]), %%ymm4") + __ASM_EMIT("vmovups 0x40(%[src]), %%ymm8") + __ASM_EMIT("vmovups 0x60(%[src]), %%ymm12") + PROCESS_GATE_FULL_X32 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("vmovups %%ymm8, 0x40(%[dst])") + __ASM_EMIT("vmovups %%ymm12, 0x60(%[dst])") + __ASM_EMIT("add $0x80, %[src]") + __ASM_EMIT("add $0x80, %[dst]") + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 16x blocks + __ASM_EMIT("add $16, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("4:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + PROCESS_GATE_FULL_X8 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("6:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("8:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 16f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("10:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("12:") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 14f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + 
__ASM_EMIT("14:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 16f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("16:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15" + ); + } + ) + + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned32; + float mem[16] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 16x blocks + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmulps 0x20(%[src]), %%ymm4, %%ymm4") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + PROCESS_GATE_FULL_X8 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("4:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("vmulps 
0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("6:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 14f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 8f") + __ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("8:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("10:") + __ASM_EMIT("vmovaps %%xmm0, %%xmm4") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("vmulps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("12:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 14f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("14:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + IF_ARCH_X86_64( + void x64_gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + comp_knee_t knee __lsp_aligned32; + float mem[32] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 32x blocks + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + __ASM_EMIT("vmovups 0x40(%[src]), %%ymm8") + __ASM_EMIT("vmovups 0x60(%[src]), %%ymm12") + PROCESS_GATE_FULL_X32 + __ASM_EMIT("vmulps 
0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmulps 0x20(%[src]), %%ymm4, %%ymm4") + __ASM_EMIT("vmulps 0x40(%[src]), %%ymm8, %%ymm8") + __ASM_EMIT("vmulps 0x60(%[src]), %%ymm12, %%ymm12") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("vmovups %%ymm8, 0x40(%[dst])") + __ASM_EMIT("vmovups %%ymm12, 0x60(%[dst])") + __ASM_EMIT("add $0x80, %[src]") + __ASM_EMIT("add $0x80, %[dst]") + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 16x blocks + __ASM_EMIT("add $16, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmulps 0x20(%[src]), %%ymm4, %%ymm4") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("4:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + PROCESS_GATE_FULL_X8 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("6:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("vmulps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("8:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 16f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("10:") + __ASM_EMIT("test $2, %[count]") + 
__ASM_EMIT("jz 12f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("12:") + __ASM_EMIT("vmovaps %%xmm0, %%xmm4") + PROCESS_GATE_FULL_X4 + __ASM_EMIT("vmulps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 14f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("14:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 16f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("16:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15" + ); + } + ) + + #undef PROCESS_GATE_FULL_X4 + #undef PROCESS_GATE_FULL_X8 + #undef PROCESS_GATE_FULL_X16 + #undef PROCESS_GATE_FULL_X32 + #undef PROCESS_KNEE_SINGLE_X4 + #undef PROCESS_KNEE_SINGLE_X8 + #undef PROCESS_KNEE_SINGLE_X16 + #undef PROCESS_KNEE_SINGLE_X32 + + #define PROCESS_KNEE_SINGLE_X32_FMA3 \ + /* in: ymm0 = lx0, ymm4 = lx1, ymm8 = lx2, ymm12 = lx3 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm1") /* ymm1 = herm[0] */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm5") \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm9") \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm13") \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%ymm0, %%ymm1") /* ymm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%ymm8, %%ymm9") \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%ymm12, %%ymm13") \ + __ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%ymm0, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%ymm4, %%ymm5") \ + 
__ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%ymm8, %%ymm9") \ + __ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%ymm12, %%ymm13") \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%ymm1, %%ymm0") /* ymm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%ymm5, %%ymm4") \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%ymm9, %%ymm8") \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%ymm13, %%ymm12") \ + EXP_CORE_X32_FMA3 /* ymm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%ymm1") /* ymm1 = x0 */ \ + __ASM_EMIT("vmovaps 0x20 + %[mem], %%ymm5") \ + __ASM_EMIT("vmovaps 0x40 + %[mem], %%ymm9") \ + __ASM_EMIT("vmovaps 0x60 + %[mem], %%ymm13") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm1, %%ymm2") /* ymm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm5, %%ymm6") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm9, %%ymm10") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm13, %%ymm14") \ + __ASM_EMIT("vblendvps %%ymm2, 0x40 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm6, 0x40 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm10, 0x40 + %[knee], %%ymm8, %%ymm8") \ + __ASM_EMIT("vblendvps %%ymm14, 0x40 + %[knee], %%ymm12, %%ymm12") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm1, %%ymm1") /* ymm1 = [x0 >= end] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm9, %%ymm9") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm13, %%ymm13") \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 >= end] ? gain_end : [x0 <= start] ? 
gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm5, 0x60 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm9, 0x60 + %[knee], %%ymm8, %%ymm8") \ + __ASM_EMIT("vblendvps %%ymm13, 0x60 + %[knee], %%ymm12, %%ymm12") \ + /* out: ymm0 = g0, ymm4 = g1, ymm8 = g3, ymm12 = g4 */ + + #define PROCESS_KNEE_SINGLE_X16_FMA3 \ + /* in: ymm0 = lx0, ymm4 = lx1 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm1") /* ymm1 = herm[0] */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm5") \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%ymm0, %%ymm1") /* ymm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%ymm0, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%ymm1, %%ymm0") /* ymm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%ymm5, %%ymm4") \ + EXP_CORE_X16_FMA3 /* ymm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%ymm1") /* ymm1 = x0 */ \ + __ASM_EMIT("vmovaps 0x20 + %[mem], %%ymm5") \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm1, %%ymm2") /* ymm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm5, %%ymm6") \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm1, %%ymm1") /* ymm1 = [x0 >= end] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm5, %%ymm5") \ + __ASM_EMIT("vblendvps %%ymm2, 0x40 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm6, 0x40 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 >= end] ? gain_end : [x0 <= start] ? 
gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm5, 0x60 + %[knee], %%ymm4, %%ymm4") \ + /* out: ymm0 = g0, ymm4 = g1 */ + + #define PROCESS_KNEE_SINGLE_X8_FMA3 \ + /* in: ymm0 = lx0 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%ymm1") /* ymm1 = herm[0] */ \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%ymm0, %%ymm1") /* ymm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%ymm0, %%ymm1") /* ymm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%ymm1, %%ymm0") /* ymm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + EXP_CORE_X8_FMA3 /* ymm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%ymm1") /* ymm1 = x0 */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%ymm1, %%ymm2") /* ymm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%ymm1, %%ymm1") /* ymm1 = [x0 >= end] */ \ + __ASM_EMIT("vblendvps %%ymm2, 0x40 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 >= end] ? gain_end : [x0 <= start] ? 
gain_start : expf(KV) */ \ + /* out: ymm0 = g0, ymm4 = g1 */ + + #define PROCESS_KNEE_SINGLE_X4_FMA3 \ + /* in: xmm0 = lx0 */ \ + __ASM_EMIT("vmovaps 0x80 + %[knee], %%xmm1") /* xmm1 = herm[0] */ \ + __ASM_EMIT("vfmadd213ps 0xa0 + %[knee], %%xmm0, %%xmm1") /* xmm1 = herm[0]*lx0+herm[1] */ \ + __ASM_EMIT("vfmadd213ps 0xc0 + %[knee], %%xmm0, %%xmm1") /* xmm1 = (herm[0]*lx0+herm[1])*lx0+herm[2] */ \ + __ASM_EMIT("vfmadd213ps 0xe0 + %[knee], %%xmm1, %%xmm0") /* xmm0 = KV = ((herm[0]*lx0+herm[1])*lx0+herm[2])*lx0+herm[3] */ \ + EXP_CORE_X4_FMA3 /* xmm0 = EV = expf(KV) */ \ + __ASM_EMIT("vmovaps 0x00 + %[mem], %%xmm1") /* xmm1 = x0 */ \ + __ASM_EMIT("vcmpps $2, 0x00 + %[knee], %%xmm1, %%xmm2") /* xmm2 = [x0 <= start] */ \ + __ASM_EMIT("vcmpps $5, 0x20 + %[knee], %%xmm1, %%xmm1") /* xmm1 = [x0 >= end] */ \ + __ASM_EMIT("vblendvps %%xmm2, 0x40 + %[knee], %%xmm0, %%xmm0")/* xmm0 = [x0 <= start] ? gain_start : expf(KV) */ \ + __ASM_EMIT("vblendvps %%xmm1, 0x60 + %[knee], %%xmm0, %%xmm0")/* xmm0 = [x0 >= end] ? gain_end : [x0 <= start] ? 
gain_start : expf(KV) */ \ + /* out: xmm0 = g0, xmm4 = g1 */ + + #define PROCESS_GATE_FULL_X32_FMA3 \ + /* in: ymm0 = x0, ymm4 = x1, ymm8 = x2, ymm12 = x4 */ \ + /* in: ymm0 = x0, ymm4 = x1, ymm8 = x2, ymm12 = x4 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm0, %%ymm0") /* ymm0 = fabsf(x0) */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm4, %%ymm4") \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm8, %%ymm8") \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm12, %%ymm12") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm0, %%ymm1") /* ymm1 = [x0 > start] */ \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm8, %%ymm9") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm12, %%ymm13") \ + __ASM_EMIT("vorps %%ymm5, %%ymm1, %%ymm1") \ + __ASM_EMIT("vorps %%ymm13, %%ymm9, %%ymm9") \ + __ASM_EMIT("vorps %%ymm9, %%ymm1, %%ymm1") \ + __ASM_EMIT("vmovmskps %%ymm1, %[mask]") \ + __ASM_EMIT("test %[mask], %[mask]") \ + __ASM_EMIT("jnz 100f") \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm0") /* ymm0 = gain_start */ \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm4") \ + __ASM_EMIT("vmovaps %%ymm0, %%ymm8") \ + __ASM_EMIT("vmovaps %%ymm4, %%ymm12") \ + __ASM_EMIT("jmp 300f") \ + __ASM_EMIT("100:") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm0, %%ymm1") /* ymm1 = [x0 < end] */ \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm8, %%ymm9") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm12, %%ymm13") \ + __ASM_EMIT("vorps %%ymm5, %%ymm1, %%ymm1") \ + __ASM_EMIT("vorps %%ymm13, %%ymm9, %%ymm9") \ + __ASM_EMIT("vorps %%ymm9, %%ymm1, %%ymm1") \ + __ASM_EMIT("vmovmskps %%ymm1, %[mask]") \ + __ASM_EMIT("test %[mask], %[mask]") \ + __ASM_EMIT("jnz 200f") \ + __ASM_EMIT("vmovaps 0x60 + %[knee], %%ymm0") /* ymm0 = gain_start */ \ + __ASM_EMIT("vmovaps 0x60 + %[knee], %%ymm4") \ + __ASM_EMIT("vmovaps %%ymm0, %%ymm8") \ + __ASM_EMIT("vmovaps %%ymm4, %%ymm12") \ + __ASM_EMIT("jmp 300f") \ + 
__ASM_EMIT("200:") \ + __ASM_EMIT("vmovaps %%ymm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%ymm4, 0x20 + %[mem]") \ + __ASM_EMIT("vmovaps %%ymm8, 0x40 + %[mem]") \ + __ASM_EMIT("vmovaps %%ymm12, 0x60 + %[mem]") \ + LOGE_CORE_X32_FMA3 /* ymm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X32_FMA3 /* apply knee */ \ + __ASM_EMIT("300:") \ + /* out: ymm0 = g0, ymm4 = g1, ymm8 = g2, ymm12 = g3 */ + + #define PROCESS_GATE_FULL_X16_FMA3 \ + /* in: ymm0 = x0, ymm4 = x1 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm0, %%ymm0") /* ymm0 = fabsf(x0) */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm4, %%ymm4") \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm0, %%ymm1") /* ymm1 = [x0 > start] */ \ + __ASM_EMIT("vcmpps $6, 0x00 + %[knee], %%ymm4, %%ymm5") \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm0, %%ymm2") /* ymm2 = [x0 < end] */ \ + __ASM_EMIT("vcmpps $1, 0x20 + %[knee], %%ymm4, %%ymm6") \ + __ASM_EMIT("vandps %%ymm2, %%ymm1, %%ymm3") /* ymm3 = [x0 > start] && [x0 < end] */ \ + __ASM_EMIT("vandps %%ymm6, %%ymm5, %%ymm7") \ + __ASM_EMIT("vorps %%ymm7, %%ymm3, %%ymm3") \ + __ASM_EMIT("vmovmskps %%ymm3, %[mask]") \ + __ASM_EMIT("test %[mask], %[mask]") \ + __ASM_EMIT("jnz 100f") \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm0") /* ymm0 = gain_start */ \ + __ASM_EMIT("vmovaps 0x40 + %[knee], %%ymm4") \ + __ASM_EMIT("vblendvps %%ymm1, 0x60 + %[knee], %%ymm0, %%ymm0")/* ymm0 = [x0 > start] ? 
gain_end : gain_start */ \ + __ASM_EMIT("vblendvps %%ymm5, 0x60 + %[knee], %%ymm4, %%ymm4") \ + __ASM_EMIT("jmp 200f") \ + __ASM_EMIT("100:") \ + __ASM_EMIT("vmovaps %%ymm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%ymm4, 0x20 + %[mem]") \ + LOGE_CORE_X16_FMA3 /* ymm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X16_FMA3 /* apply knee */ \ + __ASM_EMIT("200:") \ + /* out: ymm0 = g0, ymm4 = g1 */ + + #define PROCESS_GATE_FULL_X8_FMA3 \ + /* in: ymm0 = x0, ymm4 = x1 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%ymm0, %%ymm0") /* ymm0 = fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%ymm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + LOGE_CORE_X8_FMA3 /* ymm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X8_FMA3 /* apply knee */ \ + /* out: ymm0 = g */ + + #define PROCESS_GATE_FULL_X4_FMA3 \ + /* in: xmm0 = x0, xmm4 = x1 */ \ + __ASM_EMIT("vandps 0x00 + %[G2C], %%xmm0, %%xmm0") /* xmm0 = fabsf(x0) */ \ + __ASM_EMIT("vmovaps %%xmm0, 0x00 + %[mem]") /* store fabsf(x0) */ \ + LOGE_CORE_X4_FMA3 /* xmm0 = lx0 = logf(fabsf(x0)) */ \ + PROCESS_KNEE_SINGLE_X4_FMA3 /* apply knee */ \ + /* out: xmm0 = G0 */ + + void gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned32; + float mem[16] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 16x blocks + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16_FMA3 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), 
%%ymm0") + PROCESS_GATE_FULL_X8_FMA3 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("4:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("6:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 14f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 8f") + __ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("8:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("10:") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("12:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 14f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("14:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + IF_ARCH_X86_64( + void x64_gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned32; + float mem[32] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 32x blocks + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jb 2f") + 
__ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + __ASM_EMIT("vmovups 0x40(%[src]), %%ymm8") + __ASM_EMIT("vmovups 0x60(%[src]), %%ymm12") + PROCESS_GATE_FULL_X32_FMA3 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("vmovups %%ymm8, 0x40(%[dst])") + __ASM_EMIT("vmovups %%ymm12, 0x60(%[dst])") + __ASM_EMIT("add $0x80, %[src]") + __ASM_EMIT("add $0x80, %[dst]") + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 16x blocks + __ASM_EMIT("add $16, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16_FMA3 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("4:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + PROCESS_GATE_FULL_X8_FMA3 + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("6:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("8:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 16f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("10:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("12:") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("test $1, %[count]") + 
__ASM_EMIT("jz 14f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("14:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 16f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("16:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15" + ); + } + ) + + void gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + gate_knee_t knee __lsp_aligned32; + float mem[16] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 16x blocks + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16_FMA3 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmulps 0x20(%[src]), %%ymm4, %%ymm4") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + PROCESS_GATE_FULL_X8_FMA3 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("4:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + 
__ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("vmulps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("6:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 14f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 8f") + __ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("8:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 10f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("10:") + __ASM_EMIT("vmovaps %%xmm0, %%xmm4") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("vmulps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("12:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 14f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("14:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + IF_ARCH_X86_64( + void x64_gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count) + { + IF_ARCH_X86( + comp_knee_t knee __lsp_aligned32; + float mem[32] __lsp_aligned32; + float stub[8] __lsp_aligned32; + size_t mask; + ); + + ARCH_X86_ASM + ( + // Prepare stuff + UNPACK_GATE_KNEE("knee", "gate") + + // 32x blocks + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + __ASM_EMIT("vmovups 
0x40(%[src]), %%ymm8") + __ASM_EMIT("vmovups 0x60(%[src]), %%ymm12") + PROCESS_GATE_FULL_X32_FMA3 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmulps 0x20(%[src]), %%ymm4, %%ymm4") + __ASM_EMIT("vmulps 0x40(%[src]), %%ymm8, %%ymm8") + __ASM_EMIT("vmulps 0x60(%[src]), %%ymm12, %%ymm12") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("vmovups %%ymm8, 0x40(%[dst])") + __ASM_EMIT("vmovups %%ymm12, 0x60(%[dst])") + __ASM_EMIT("add $0x80, %[src]") + __ASM_EMIT("add $0x80, %[dst]") + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + // 16x blocks + __ASM_EMIT("add $16, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + __ASM_EMIT("vmovups 0x20(%[src]), %%ymm4") + PROCESS_GATE_FULL_X16_FMA3 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmulps 0x20(%[src]), %%ymm4, %%ymm4") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("vmovups %%ymm4, 0x20(%[dst])") + __ASM_EMIT("add $0x40, %[src]") + __ASM_EMIT("add $0x40, %[dst]") + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("4:") + // 8x blocks + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") + PROCESS_GATE_FULL_X8_FMA3 + __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm0") + __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[src]") + __ASM_EMIT("add $0x20, %[dst]") + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("6:") + // 4x blocks + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("vmovups 0x00(%[src]), %%xmm0") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("vmulps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[src]") + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("8:") + // Tail: 1x-3x block + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jle 16f") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 10f") + 
__ASM_EMIT("vmovss 0x00(%[src]), %%xmm0") + __ASM_EMIT("add $4, %[src]") + __ASM_EMIT("10:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 12f") + __ASM_EMIT("vmovhps 0x00(%[src]), %%xmm0, %%xmm0") + __ASM_EMIT("12:") + __ASM_EMIT("vmovaps %%xmm0, %%xmm4") + PROCESS_GATE_FULL_X4_FMA3 + __ASM_EMIT("vmulps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("test $1, %[count]") + __ASM_EMIT("jz 14f") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $4, %[dst]") + __ASM_EMIT("14:") + __ASM_EMIT("test $2, %[count]") + __ASM_EMIT("jz 16f") + __ASM_EMIT("vmovhps %%xmm0, 0x00(%[dst])") + __ASM_EMIT("16:") + + : [dst] "+r" (dst), [src] "+r" (src), + [count] "+r" (count), + [mask] "=&r" (mask) + : [gate] "r" (c), + [knee] "o" (knee), + [mem] "o" (mem), + [stub] "o" (stub), + [G2C] "o" (gate_const), + [L2C] "o" (LOG2_CONST), + [LOGC] "o" (LOGE_C), + [E2C] "o" (EXP2_CONST), + [LOG2E] "m" (EXP_LOG2E) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15" + ); + } + ) + + #undef PROCESS_GATE_FULL_X4_FMA3 + #undef PROCESS_GATE_FULL_X8_FMA3 + #undef PROCESS_GATE_FULL_X16_FMA3 + #undef PROCESS_GATE_FULL_X32_FMA3 + #undef PROCESS_KNEE_SINGLE_X4_FMA3 + #undef PROCESS_KNEE_SINGLE_X8_FMA3 + #undef PROCESS_KNEE_SINGLE_X16_FMA3 + #undef PROCESS_KNEE_SINGLE_X32_FMA3 + + #undef UNPACK_GATE_KNEE + + } /* namespace avx2 */ +} /* namespace lsp */ + + + + +#endif /* PRIVATE_DSP_ARCH_X86_AVX2_DYNAMICS_GATE_H_ */ diff --git a/src/main/x86/avx2.cpp b/src/main/x86/avx2.cpp index cc6e7ef3..a18b4d24 100644 --- a/src/main/x86/avx2.cpp +++ b/src/main/x86/avx2.cpp @@ -199,10 +199,14 @@ namespace lsp CEXPORT1(favx, compressor_x2_gain); CEXPORT1(favx, compressor_x2_curve); - CEXPORT2_X64(favx, compressor_x2_gain, x64_compressor_x2_gain); CEXPORT2_X64(favx, compressor_x2_curve, x64_compressor_x2_curve); + CEXPORT1(favx, gate_x1_gain); + CEXPORT1(favx, gate_x1_curve); + 
CEXPORT2_X64(favx, gate_x1_gain, x64_gate_x1_gain); + CEXPORT2_X64(favx, gate_x1_curve, x64_gate_x1_curve); + if (f->features & CPU_OPTION_FMA3) { CEXPORT2(favx, mod_k2, mod_k2_fma3); @@ -254,6 +258,11 @@ namespace lsp CEXPORT2(favx, compressor_x2_curve, compressor_x2_curve_fma3); CEXPORT2_X64(favx, compressor_x2_gain, x64_compressor_x2_gain_fma3); CEXPORT2_X64(favx, compressor_x2_curve, x64_compressor_x2_curve_fma3); + + CEXPORT2(favx, gate_x1_gain, gate_x1_gain_fma3); + CEXPORT2(favx, gate_x1_curve, gate_x1_curve_fma3); + CEXPORT2_X64(favx, gate_x1_gain, x64_gate_x1_gain_fma3); + CEXPORT2_X64(favx, gate_x1_curve, x64_gate_x1_curve_fma3); } } } /* namespace avx2 */ diff --git a/src/test/ptest/dynamics/gate_x1_curve.cpp b/src/test/ptest/dynamics/gate_x1_curve.cpp index 784c8716..1f769314 100644 --- a/src/test/ptest/dynamics/gate_x1_curve.cpp +++ b/src/test/ptest/dynamics/gate_x1_curve.cpp @@ -44,16 +44,16 @@ namespace lsp namespace avx2 { -// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) IF_ARCH_X86_64( namespace avx2 { -// void x64_gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void x64_gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) @@ -125,10 +125,10 @@ PTEST_BEGIN("dsp.dynamics", gate_x1_curve, 5, 1000) CALL(generic::gate_x1_curve); IF_ARCH_X86(CALL(sse2::gate_x1_curve)); -// IF_ARCH_X86(CALL(avx2::gate_x1_curve)); -// 
IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_curve)); -// IF_ARCH_X86(CALL(avx2::gate_x1_curve_fma3)); -// IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_curve_fma3)); + IF_ARCH_X86(CALL(avx2::gate_x1_curve)); + IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_curve)); + IF_ARCH_X86(CALL(avx2::gate_x1_curve_fma3)); + IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_curve_fma3)); // IF_ARCH_ARM(CALL(neon_d32::gate_x1_curve)); // IF_ARCH_AARCH64(CALL(asimd::gate_x1_curve)); PTEST_SEPARATOR; diff --git a/src/test/ptest/dynamics/gate_x1_gain2.cpp b/src/test/ptest/dynamics/gate_x1_gain2.cpp index 277e4993..fb8eddef 100644 --- a/src/test/ptest/dynamics/gate_x1_gain2.cpp +++ b/src/test/ptest/dynamics/gate_x1_gain2.cpp @@ -44,16 +44,16 @@ namespace lsp namespace avx2 { -// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) IF_ARCH_X86_64( namespace avx2 { -// void x64_gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void x64_gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) @@ -125,10 +125,10 @@ PTEST_BEGIN("dsp.dynamics", gate_x1_gain, 5, 1000) CALL(generic::gate_x1_gain); IF_ARCH_X86(CALL(sse2::gate_x1_gain)); -// IF_ARCH_X86(CALL(avx2::gate_x1_gain)); -// IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_gain)); -// IF_ARCH_X86(CALL(avx2::gate_x1_gain_fma3)); -// IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_gain_fma3)); + IF_ARCH_X86(CALL(avx2::gate_x1_gain)); + IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_gain)); + 
IF_ARCH_X86(CALL(avx2::gate_x1_gain_fma3)); + IF_ARCH_X86_64(CALL(avx2::x64_gate_x1_gain_fma3)); // IF_ARCH_ARM(CALL(neon_d32::gate_x1_gain)); // IF_ARCH_AARCH64(CALL(asimd::gate_x1_gain)); PTEST_SEPARATOR; diff --git a/src/test/utest/dynamics/gate_x1_curve.cpp b/src/test/utest/dynamics/gate_x1_curve.cpp index 457aa6a5..53f48672 100644 --- a/src/test/utest/dynamics/gate_x1_curve.cpp +++ b/src/test/utest/dynamics/gate_x1_curve.cpp @@ -42,16 +42,16 @@ namespace lsp namespace avx2 { -// void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) IF_ARCH_X86_64( namespace avx2 { -// void x64_gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void x64_gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_curve(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_curve_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) @@ -125,7 +125,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_curve) UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); // Compare buffers - if (!dst1.equals_absolute(dst2, 1e-4)) + if (!dst1.equals_absolute(dst2, 2e-4)) { src.dump("src "); dst.dump("dst "); @@ -145,13 +145,13 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_curve) call(#func, align, generic, func); IF_ARCH_X86(CALL(generic::gate_x1_curve, sse2::gate_x1_curve, 16)); -// IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve, 32)); -// IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve_fma3, 32)); -// IF_ARCH_X86_64(CALL(generic::gate_x1_curve, avx2::x64_gate_x1_curve, 32)); -// 
IF_ARCH_X86_64(CALL(generic::gate_x1_curve, avx2::x64_gate_x1_curve_fma3, 32)); -// + IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve, 32)); + IF_ARCH_X86(CALL(generic::gate_x1_curve, avx2::gate_x1_curve_fma3, 32)); + IF_ARCH_X86_64(CALL(generic::gate_x1_curve, avx2::x64_gate_x1_curve, 32)); + IF_ARCH_X86_64(CALL(generic::gate_x1_curve, avx2::x64_gate_x1_curve_fma3, 32)); + // IF_ARCH_ARM(CALL(generic::gate_x1_curve, neon_d32::gate_x1_curve, 16)); -// + // IF_ARCH_AARCH64(CALL(generic::gate_x1_curve, asimd::gate_x1_curve, 16)); } UTEST_END diff --git a/src/test/utest/dynamics/gate_x1_gain.cpp b/src/test/utest/dynamics/gate_x1_gain.cpp index 81110550..8812fe12 100644 --- a/src/test/utest/dynamics/gate_x1_gain.cpp +++ b/src/test/utest/dynamics/gate_x1_gain.cpp @@ -42,16 +42,16 @@ namespace lsp namespace avx2 { -// void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) IF_ARCH_X86_64( namespace avx2 { -// void x64_gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); -// void x64_gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_gain(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); + void x64_gate_x1_gain_fma3(float *dst, const float *src, const dsp::gate_knee_t *c, size_t count); } ) @@ -125,7 +125,7 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_gain) UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); // Compare buffers - if (!dst1.equals_absolute(dst2, 1e-4)) + if (!dst1.equals_absolute(dst2, 2e-4)) { src.dump("src "); dst.dump("dst "); @@ -145,13 +145,13 @@ UTEST_BEGIN("dsp.dynamics", gate_x1_gain) call(#func, 
align, generic, func); IF_ARCH_X86(CALL(generic::gate_x1_gain, sse2::gate_x1_gain, 16)); -// IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain, 32)); -// IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain_fma3, 32)); -// IF_ARCH_X86_64(CALL(generic::gate_x1_gain, avx2::x64_gate_x1_gain, 32)); -// IF_ARCH_X86_64(CALL(generic::gate_x1_gain, avx2::x64_gate_x1_gain_fma3, 32)); -// + IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain, 32)); + IF_ARCH_X86(CALL(generic::gate_x1_gain, avx2::gate_x1_gain_fma3, 32)); + IF_ARCH_X86_64(CALL(generic::gate_x1_gain, avx2::x64_gate_x1_gain, 32)); + IF_ARCH_X86_64(CALL(generic::gate_x1_gain, avx2::x64_gate_x1_gain_fma3, 32)); + // IF_ARCH_ARM(CALL(generic::gate_x1_gain, neon_d32::gate_x1_gain, 16)); -// + // IF_ARCH_AARCH64(CALL(generic::gate_x1_gain, asimd::gate_x1_gain, 16)); } UTEST_END From 9c1c89a4b1fabf244b7b564a74eec77bc029357c Mon Sep 17 00:00:00 2001 From: sadko4u Date: Fri, 20 Oct 2023 00:05:08 +0300 Subject: [PATCH 4/4] Updated tests --- src/test/ptest/dynamics/gate_x1_curve.cpp | 6 ++++-- .../ptest/dynamics/{gate_x1_gain2.cpp => gate_x1_gain.cpp} | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) rename src/test/ptest/dynamics/{gate_x1_gain2.cpp => gate_x1_gain.cpp} (97%) diff --git a/src/test/ptest/dynamics/gate_x1_curve.cpp b/src/test/ptest/dynamics/gate_x1_curve.cpp index 1f769314..1500924e 100644 --- a/src/test/ptest/dynamics/gate_x1_curve.cpp +++ b/src/test/ptest/dynamics/gate_x1_curve.cpp @@ -99,12 +99,14 @@ PTEST_BEGIN("dsp.dynamics", gate_x1_curve, 5, 1000) float *ptr = alloc_aligned(data, buf_size * 2, 64); dsp::gate_knee_t gate; + gate = { - 0.0316244587f, + 0.00794381928f, 0.0631000027f, 0.0631000027f, 1.0f, - {-16.7640247f, -156.329346f, -479.938873f, -486.233582f}}; + {-0.620928824f, -7.07709408f, -24.8873253f, -27.8333282f} + }; float *src = ptr; float *dst = &src[buf_size]; diff --git a/src/test/ptest/dynamics/gate_x1_gain2.cpp b/src/test/ptest/dynamics/gate_x1_gain.cpp 
similarity index 97% rename from src/test/ptest/dynamics/gate_x1_gain2.cpp rename to src/test/ptest/dynamics/gate_x1_gain.cpp index fb8eddef..60440bc5 100644 --- a/src/test/ptest/dynamics/gate_x1_gain2.cpp +++ b/src/test/ptest/dynamics/gate_x1_gain.cpp @@ -100,11 +100,12 @@ PTEST_BEGIN("dsp.dynamics", gate_x1_gain, 5, 1000) dsp::gate_knee_t gate; gate = { - 0.0316244587f, + 0.00794381928f, 0.0631000027f, 0.0631000027f, 1.0f, - {-16.7640247f, -156.329346f, -479.938873f, -486.233582f}}; + {-0.620928824f, -7.07709408f, -24.8873253f, -27.8333282f} + }; float *src = ptr; float *dst = &src[buf_size];