From 1122108bd4a622f98c0a38ca5f050b7ee24cc4af Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sat, 25 Nov 2023 15:17:08 +0300 Subject: [PATCH] Improved logarithm calculations for x64 and x86_64 --- include/private/dsp/arch/x86/avx2/pmath/log.h | 78 ++++--------------- .../private/dsp/arch/x86/avx512/pmath/log.h | 42 +++------- include/private/dsp/arch/x86/sse2/pmath/log.h | 34 +++----- 3 files changed, 34 insertions(+), 120 deletions(-) diff --git a/include/private/dsp/arch/x86/avx2/pmath/log.h b/include/private/dsp/arch/x86/avx2/pmath/log.h index be7bf341..50995f3d 100644 --- a/include/private/dsp/arch/x86/avx2/pmath/log.h +++ b/include/private/dsp/arch/x86/avx2/pmath/log.h @@ -36,14 +36,12 @@ namespace lsp LSP_DSP_VEC8(0x007fffff), // frac LSP_DSP_VEC8(0x0000007f), // 127 LSP_DSP_VEC8(0x3f800000), // 1.0f - LSP_DSP_VEC8(0x3d888889), // C0 = 1/15 = 0.0666666701436043 - LSP_DSP_VEC8(0x3d9d89d9), // C1 = 1/13 = 0.0769230797886848 - LSP_DSP_VEC8(0x3dba2e8c), // C2 = 1/11 = 0.0909090936183929 - LSP_DSP_VEC8(0x3de38e39), // C3 = 1/9 = 0.1111111119389534 - LSP_DSP_VEC8(0x3e124925), // C4 = 1/7 = 0.1428571492433548 - LSP_DSP_VEC8(0x3e4ccccd), // C5 = 1/5 = 0.2000000029802322 - LSP_DSP_VEC8(0x3eaaaaab), // C6 = 1/3 = 0.3333333432674408 - LSP_DSP_VEC8(0x3f800000) // C7 = 1/1 = 1.0000000000000000 + LSP_DSP_VEC8(0x3dba2e8c), // C0 = 1/11 = 0.0909090936183929 + LSP_DSP_VEC8(0x3de38e39), // C1 = 1/9 = 0.1111111119389534 + LSP_DSP_VEC8(0x3e124925), // C2 = 1/7 = 0.1428571492433548 + LSP_DSP_VEC8(0x3e4ccccd), // C3 = 1/5 = 0.2000000029802322 + LSP_DSP_VEC8(0x3eaaaaab), // C4 = 1/3 = 0.3333333432674408 + LSP_DSP_VEC8(0x3f800000) // C5 = 1/1 = 1.0000000000000000 }; static const float LOGB_C[] __lsp_aligned32 = @@ -142,23 +140,7 @@ namespace lsp __ASM_EMIT("vaddps 0x100 + %[L2C], %%ymm7, %%ymm7") \ __ASM_EMIT("vaddps 0x100 + %[L2C], %%ymm11, %%ymm11") \ __ASM_EMIT("vaddps 0x100 + %[L2C], %%ymm15, %%ymm15") \ - __ASM_EMIT("vmulps %%ymm2, %%ymm3, %%ymm3") /* ymm3 = 
Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vmulps %%ymm6, %%ymm7, %%ymm7") \ - __ASM_EMIT("vmulps %%ymm10, %%ymm11, %%ymm11") \ - __ASM_EMIT("vmulps %%ymm14, %%ymm15, %%ymm15") \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%ymm7, %%ymm7") \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%ymm11, %%ymm11") \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%ymm15, %%ymm15") \ - __ASM_EMIT("vmulps %%ymm2, %%ymm3, %%ymm3") /* ymm3 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%ymm6, %%ymm7, %%ymm7") \ - __ASM_EMIT("vmulps %%ymm10, %%ymm11, %%ymm11") \ - __ASM_EMIT("vmulps %%ymm14, %%ymm15, %%ymm15") \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%ymm7, %%ymm7") \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%ymm11, %%ymm11") \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%ymm15, %%ymm15") \ - __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ __ASM_EMIT("vmulps %%ymm4, %%ymm7, %%ymm4") \ __ASM_EMIT("vmulps %%ymm8, %%ymm11, %%ymm8") \ __ASM_EMIT("vmulps %%ymm12, %%ymm15, %%ymm12") \ @@ -203,15 +185,7 @@ namespace lsp __ASM_EMIT("vmulps %%ymm6, %%ymm7, %%ymm7") \ __ASM_EMIT("vaddps 0x100 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ __ASM_EMIT("vaddps 0x100 + %[L2C], %%ymm7, %%ymm7") \ - __ASM_EMIT("vmulps %%ymm2, %%ymm3, %%ymm3") /* ymm3 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vmulps %%ymm6, %%ymm7, %%ymm7") \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%ymm7, %%ymm7") \ - __ASM_EMIT("vmulps %%ymm2, %%ymm3, %%ymm3") /* ymm3 = 
Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%ymm6, %%ymm7, %%ymm7") \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%ymm7, %%ymm7") \ - __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ __ASM_EMIT("vmulps %%ymm4, %%ymm7, %%ymm4") \ #define LOGN_CORE_X8 \ @@ -235,11 +209,7 @@ namespace lsp __ASM_EMIT("vaddps 0x0e0 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("vmulps %%ymm2, %%ymm3, %%ymm3") /* ymm3 = Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ __ASM_EMIT("vaddps 0x100 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("vmulps %%ymm2, %%ymm3, %%ymm3") /* ymm3 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vmulps %%ymm2, %%ymm3, %%ymm3") /* ymm3 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%ymm3, %%ymm3") /* ymm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ #define LOGN_CORE_X4 \ __ASM_EMIT("vpsrld $23, %%xmm0, %%xmm1") /* xmm1 = ilog2(x) + 127 */ \ @@ -262,11 +232,7 @@ namespace lsp __ASM_EMIT("vaddps 0x0e0 + %[L2C], %%xmm3, %%xmm3") /* xmm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("vmulps %%xmm2, %%xmm3, %%xmm3") /* xmm3 = Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ __ASM_EMIT("vaddps 0x100 + %[L2C], %%xmm3, %%xmm3") /* xmm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("vmulps %%xmm2, %%xmm3, 
%%xmm3") /* xmm3 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vaddps 0x120 + %[L2C], %%xmm3, %%xmm3") /* xmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vmulps %%xmm2, %%xmm3, %%xmm3") /* xmm3 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vaddps 0x140 + %[L2C], %%xmm3, %%xmm3") /* xmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%xmm0, %%xmm3, %%xmm0") /* xmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%xmm0, %%xmm3, %%xmm0") /* xmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ #define LOGB_CORE_X32 \ LOGN_CORE_X32 \ @@ -1364,15 +1330,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x100 + %[L2C], %%ymm6, %%ymm7") \ __ASM_EMIT("vfmadd213ps 0x100 + %[L2C], %%ymm10, %%ymm11") \ __ASM_EMIT("vfmadd213ps 0x100 + %[L2C], %%ymm14, %%ymm15") \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%ymm6, %%ymm7") \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%ymm10, %%ymm11") \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%ymm14, %%ymm15") \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%ymm6, %%ymm7") \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%ymm10, %%ymm11") \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%ymm14, %%ymm15") \ - __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ __ASM_EMIT("vmulps %%ymm4, %%ymm7, %%ymm4") \ __ASM_EMIT("vmulps %%ymm8, %%ymm11, %%ymm8") \ __ASM_EMIT("vmulps %%ymm12, %%ymm15, %%ymm12") \ @@ -1409,11 +1367,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x0e0 + %[L2C], %%ymm6, %%ymm7") \ __ASM_EMIT("vfmadd213ps 0x100 + %[L2C], %%ymm2, 
%%ymm3") /* ymm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ __ASM_EMIT("vfmadd213ps 0x100 + %[L2C], %%ymm6, %%ymm7") \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%ymm6, %%ymm7") \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%ymm6, %%ymm7") \ - __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ __ASM_EMIT("vmulps %%ymm4, %%ymm7, %%ymm4") \ #define LOGN_CORE_X8_FMA3 \ @@ -1433,9 +1387,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x0c0 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C3+Y*(C2+Y*(C1+C0*Y)) */ \ __ASM_EMIT("vfmadd213ps 0x0e0 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("vfmadd213ps 0x100 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ #define LOGN_CORE_X4_FMA3 \ __ASM_EMIT("vpsrld $23, %%xmm0, %%xmm1") /* xmm1 = ilog2(x) + 127 */ \ @@ -1454,9 +1406,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x0c0 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C3+Y*(C2+Y*(C1+C0*Y)) */ \ __ASM_EMIT("vfmadd213ps 0x0e0 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("vfmadd213ps 0x100 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = 
C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("vfmadd213ps 0x120 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x140 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%xmm0, %%xmm3, %%xmm0") /* xmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%xmm0, %%xmm3, %%xmm0") /* xmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ #define LOGB_CORE_X32_FMA3 \ diff --git a/include/private/dsp/arch/x86/avx512/pmath/log.h b/include/private/dsp/arch/x86/avx512/pmath/log.h index 0673f367..4ddcd896 100644 --- a/include/private/dsp/arch/x86/avx512/pmath/log.h +++ b/include/private/dsp/arch/x86/avx512/pmath/log.h @@ -36,14 +36,12 @@ namespace lsp LSP_DSP_VEC16(0x007fffff), // frac LSP_DSP_VEC16(0x0000007f), // 127 LSP_DSP_VEC16(0x3f800000), // 1.0f - LSP_DSP_VEC16(0x3d888889), // C0 = 1/15 = 0.0666666701436043 - LSP_DSP_VEC16(0x3d9d89d9), // C1 = 1/13 = 0.0769230797886848 - LSP_DSP_VEC16(0x3dba2e8c), // C2 = 1/11 = 0.0909090936183929 - LSP_DSP_VEC16(0x3de38e39), // C3 = 1/9 = 0.1111111119389534 - LSP_DSP_VEC16(0x3e124925), // C4 = 1/7 = 0.1428571492433548 - LSP_DSP_VEC16(0x3e4ccccd), // C5 = 1/5 = 0.2000000029802322 - LSP_DSP_VEC16(0x3eaaaaab), // C6 = 1/3 = 0.3333333432674408 - LSP_DSP_VEC16(0x3f800000) // C7 = 1/1 = 1.0000000000000000 + LSP_DSP_VEC16(0x3dba2e8c), // C0 = 1/11 = 0.0909090936183929 + LSP_DSP_VEC16(0x3de38e39), // C1 = 1/9 = 0.1111111119389534 + LSP_DSP_VEC16(0x3e124925), // C2 = 1/7 = 0.1428571492433548 + LSP_DSP_VEC16(0x3e4ccccd), // C3 = 1/5 = 0.2000000029802322 + LSP_DSP_VEC16(0x3eaaaaab), // C4 = 1/3 = 0.3333333432674408 + LSP_DSP_VEC16(0x3f800000) // C5 = 1/1 = 1.0000000000000000 }; static const float LOGB_C[] __lsp_aligned64 = @@ -126,15 +124,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%zmm6, %%zmm7") \ __ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%zmm10, %%zmm11") \ 
__ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%zmm14, %%zmm15") \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%zmm6, %%zmm7") \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%zmm10, %%zmm11") \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%zmm14, %%zmm15") \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%zmm6, %%zmm7") \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%zmm10, %%zmm11") \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%zmm14, %%zmm15") \ - __ASM_EMIT("vmulps %%zmm0, %%zmm3, %%zmm0") /* zmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%zmm0, %%zmm3, %%zmm0") /* zmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ __ASM_EMIT("vmulps %%zmm4, %%zmm7, %%zmm4") \ __ASM_EMIT("vmulps %%zmm8, %%zmm11, %%zmm8") \ __ASM_EMIT("vmulps %%zmm12, %%zmm15, %%zmm12") \ @@ -171,11 +161,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x1c0 + %[L2C], %%zmm6, %%zmm7") \ __ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ __ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%zmm6, %%zmm7") \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%zmm6, %%zmm7") \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%zmm6, %%zmm7") \ - __ASM_EMIT("vmulps %%zmm0, %%zmm3, %%zmm0") /* zmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%zmm0, %%zmm3, %%zmm0") /* zmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ __ASM_EMIT("vmulps %%zmm4, %%zmm7, %%zmm4") #define LOGN_CORE_X16 \ @@ -195,9 +181,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 
0x180 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C3+Y*(C2+Y*(C1+C0*Y)) */ \ __ASM_EMIT("vfmadd213ps 0x1c0 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%zmm2, %%zmm3") /* zmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%zmm0, %%zmm3, %%zmm0") /* zmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%zmm0, %%zmm3, %%zmm0") /* zmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ #define LOGN_CORE_X8 \ __ASM_EMIT("vpsrld $23, %%ymm0, %%ymm1") /* ymm1 = ilog2(x) + 127 */ \ @@ -216,9 +200,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x180 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C3+Y*(C2+Y*(C1+C0*Y)) */ \ __ASM_EMIT("vfmadd213ps 0x1c0 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%ymm2, %%ymm3") /* ymm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%ymm0, %%ymm3, %%ymm0") /* ymm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ #define LOGN_CORE_X4 \ __ASM_EMIT("vpsrld $23, %%xmm0, %%xmm1") /* xmm1 = ilog2(x) + 127 */ \ @@ -237,9 +219,7 @@ namespace lsp __ASM_EMIT("vfmadd213ps 0x180 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C3+Y*(C2+Y*(C1+C0*Y)) */ \ __ASM_EMIT("vfmadd213ps 0x1c0 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("vfmadd213ps 0x200 + %[L2C], %%xmm2, 
%%xmm3") /* xmm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("vfmadd213ps 0x240 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("vfmadd213ps 0x280 + %[L2C], %%xmm2, %%xmm3") /* xmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("vmulps %%xmm0, %%xmm3, %%xmm0") /* xmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("vmulps %%xmm0, %%xmm3, %%xmm0") /* xmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ #define LOGB_CORE_X64 \ diff --git a/include/private/dsp/arch/x86/sse2/pmath/log.h b/include/private/dsp/arch/x86/sse2/pmath/log.h index 62558f46..80fbd713 100644 --- a/include/private/dsp/arch/x86/sse2/pmath/log.h +++ b/include/private/dsp/arch/x86/sse2/pmath/log.h @@ -36,14 +36,12 @@ namespace lsp LSP_DSP_VEC4(0x007fffff), // frac LSP_DSP_VEC4(0x3f800000), // 1.0f LSP_DSP_VEC4(0x0000007f), // 127 - LSP_DSP_VEC4(0x3d888889), // C0 = 1/15 = 0.0666666701436043 - LSP_DSP_VEC4(0x3d9d89d9), // C1 = 1/13 = 0.0769230797886848 - LSP_DSP_VEC4(0x3dba2e8c), // C2 = 1/11 = 0.0909090936183929 - LSP_DSP_VEC4(0x3de38e39), // C3 = 1/9 = 0.1111111119389534 - LSP_DSP_VEC4(0x3e124925), // C4 = 1/7 = 0.1428571492433548 - LSP_DSP_VEC4(0x3e4ccccd), // C5 = 1/5 = 0.2000000029802322 - LSP_DSP_VEC4(0x3eaaaaab), // C6 = 1/3 = 0.3333333432674408 - LSP_DSP_VEC4(0x3f800000) // C7 = 1/1 = 1.0000000000000000 + LSP_DSP_VEC4(0x3dba2e8c), // C0 = 1/11 = 0.0909090936183929 + LSP_DSP_VEC4(0x3de38e39), // C1 = 1/9 = 0.1111111119389534 + LSP_DSP_VEC4(0x3e124925), // C2 = 1/7 = 0.1428571492433548 + LSP_DSP_VEC4(0x3e4ccccd), // C3 = 1/5 = 0.2000000029802322 + LSP_DSP_VEC4(0x3eaaaaab), // C4 = 1/3 = 0.3333333432674408 + LSP_DSP_VEC4(0x3f800000) // C5 = 1/1 = 1.0000000000000000 }; static const float LOGB_C[] __lsp_aligned16 = @@ -112,15 +110,7 @@ namespace lsp __ASM_EMIT("mulps %%xmm6, %%xmm7") \ __ASM_EMIT("addps 0x80 + %[L2C], %%xmm3") /* xmm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ 
__ASM_EMIT("addps 0x80 + %[L2C], %%xmm7") \ - __ASM_EMIT("mulps %%xmm2, %%xmm3") /* xmm3 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("mulps %%xmm6, %%xmm7") \ - __ASM_EMIT("addps 0x90 + %[L2C], %%xmm3") /* xmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("addps 0x90 + %[L2C], %%xmm7") \ - __ASM_EMIT("mulps %%xmm2, %%xmm3") /* xmm3 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("mulps %%xmm6, %%xmm7") \ - __ASM_EMIT("addps 0xa0 + %[L2C], %%xmm3") /* xmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("addps 0xa0 + %[L2C], %%xmm7") \ - __ASM_EMIT("mulps %%xmm3, %%xmm0") /* xmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("mulps %%xmm3, %%xmm0") /* xmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ __ASM_EMIT("mulps %%xmm7, %%xmm4") \ /* xmm0 = y*L, xmm1 = R */ @@ -150,11 +140,7 @@ namespace lsp __ASM_EMIT("addps 0x70 + %[L2C], %%xmm3") /* xmm3 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \ __ASM_EMIT("mulps %%xmm2, %%xmm3") /* xmm3 = Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ __ASM_EMIT("addps 0x80 + %[L2C], %%xmm3") /* xmm3 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \ - __ASM_EMIT("mulps %%xmm2, %%xmm3") /* xmm3 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("addps 0x90 + %[L2C], %%xmm3") /* xmm3 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ - __ASM_EMIT("mulps %%xmm2, %%xmm3") /* xmm3 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("addps 0xa0 + %[L2C], %%xmm3") /* xmm3 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \ - __ASM_EMIT("mulps %%xmm3, %%xmm0") /* xmm0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \ + __ASM_EMIT("mulps %%xmm3, %%xmm0") /* xmm0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \ /* xmm0 = y*L, xmm1 = R */ #define LOGB_CORE_X8 \ @@ -576,9 +562,7 @@ namespace lsp float y = (X-1)/(X+1); float y2 = y*y; - float L = 1/13.0f + y2 * 1/15.0f; - L = 1/11.0f + y2 * L; - L = 1/9.0f + y2 * L; + float L = 1/9.0f 
+ y2 * 1/11.0f; L = 1/7.0f + y2 * L; L = 1/5.0f + y2 * L; L = 1/3.0f + y2 * L;