From 2aff629c1e7f594f4a8a987e591c8fbda6b09955 Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Tue, 5 Mar 2024 01:56:41 +0300 Subject: [PATCH 01/22] Version up --- CHANGELOG | 3 +++ include/lsp-plug.in/dsp/version.h | 2 +- project.mk | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4d445478..93de5d48 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,9 @@ * RECENT CHANGES ******************************************************************************* +=== 1.0.22 === + + === 1.0.21 === * Updated build scripts. * Updated module versions in dependencies. diff --git a/include/lsp-plug.in/dsp/version.h b/include/lsp-plug.in/dsp/version.h index bd21c8d5..8fa10d85 100644 --- a/include/lsp-plug.in/dsp/version.h +++ b/include/lsp-plug.in/dsp/version.h @@ -25,7 +25,7 @@ // Define version of headers #define LSP_DSP_LIB_MAJOR 1 #define LSP_DSP_LIB_MINOR 0 -#define LSP_DSP_LIB_MICRO 21 +#define LSP_DSP_LIB_MICRO 22 #if defined(__WINDOWS__) || defined(__WIN32__) || defined(__WIN64__) || defined(_WIN64) || defined(_WIN32) || defined(__WINNT) || defined(__WINNT__) #define LSP_DSP_LIB_EXPORT_MODIFIER __declspec(dllexport) diff --git a/project.mk b/project.mk index 86bee130..4952c1e0 100644 --- a/project.mk +++ b/project.mk @@ -23,4 +23,4 @@ ARTIFACT_ID = LSP_DSP_LIB ARTIFACT_NAME = lsp-dsp-lib ARTIFACT_DESC = DSP library for digital signal processing ARTIFACT_HEADERS = lsp-plug.in -ARTIFACT_VERSION = 1.0.21 +ARTIFACT_VERSION = 1.0.22-devel From 6a1254f749eaf67ac4f1fd0f014c8dbc0da953fb Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Fri, 8 Mar 2024 01:01:06 +0300 Subject: [PATCH 02/22] Working on correlation function --- include/lsp-plug.in/dsp/common/correlation.h | 52 ++++++ include/lsp-plug.in/dsp/dsp.h | 1 + .../private/dsp/arch/generic/correlation.h | 150 ++++++++++++++++++ src/main/generic/generic.cpp | 2 + 4 files changed, 205 insertions(+) create mode 100644 include/lsp-plug.in/dsp/common/correlation.h create 
mode 100644 include/private/dsp/arch/generic/correlation.h diff --git a/include/lsp-plug.in/dsp/common/correlation.h b/include/lsp-plug.in/dsp/common/correlation.h new file mode 100644 index 00000000..e8521d75 --- /dev/null +++ b/include/lsp-plug.in/dsp/common/correlation.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 7 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ +#ifndef LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_ +#define LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_ + +#include + +LSP_DSP_LIB_BEGIN_NAMESPACE + +#pragma pack(push, 1) +/** + * DSP context to store and restore machine state + */ +typedef struct LSP_DSP_LIB_TYPE(correlation_t) +{ + float v; // the a*b aggregated value + float a; // the a*a aggregated value + float b; // the b*b aggregated value + float pad; // unused value +} LSP_DSP_LIB_TYPE(correlation_t); +#pragma pack(pop) + +/** Compute normalized correlation between two signals + * + * @param dst destination buffer to store result + * @param a the first signal data + * @param b the second signal data + * @param value the accumulated data from previous iteration + * @param tail the offset of the last element relative to the first one in correlation (length of the window minus one) + * @param count number of samples to process + */ +LSP_DSP_LIB_SYMBOL(void, correlation, correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); + +#endif /* LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_ */ diff --git a/include/lsp-plug.in/dsp/dsp.h b/include/lsp-plug.in/dsp/dsp.h index 900547f1..38c0a815 100644 --- a/include/lsp-plug.in/dsp/dsp.h +++ b/include/lsp-plug.in/dsp/dsp.h @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/include/private/dsp/arch/generic/correlation.h b/include/private/dsp/arch/generic/correlation.h new file mode 100644 index 00000000..a6fc6072 --- /dev/null +++ b/include/private/dsp/arch/generic/correlation.h @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 8 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. 
+ * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + + +#ifndef PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_GENERIC_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_GENERIC_IMPL */ + +namespace lsp +{ + namespace generic + { + void correlation(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count) + { + float vv = corr->v; + float va = corr->a; + float vb = corr->b; + + for (size_t i=0; i= 1e-10f) ? vv / sqrtf(d) : 0.0f; + } + + corr->v = vv; + corr->a = va; + corr->b = vb; + } + + void correlation_v2(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count) + { + float T[4], BA[4], BB[4], B[4], P[4], DV[4], DA[4], DB[4], DD[4]; + + float vv = corr->v; + float va = corr->a; + float vb = corr->b; + + for ( ; count >= 4; count -= 4) + { + DV[0] = a[tail+0]*b[tail+0] - a[0]*b[0]; + DV[1] = a[tail+1]*b[tail+1] - a[1]*b[1]; + DV[2] = a[tail+2]*b[tail+2] - a[2]*b[2]; + DV[3] = a[tail+3]*b[tail+3] - a[3]*b[3]; + + DA[0] = a[tail+0]*a[tail+0] - a[0]*a[0]; + DA[1] = a[tail+1]*a[tail+1] - a[1]*a[1]; + DA[2] = a[tail+2]*a[tail+2] - a[2]*a[2]; + DA[3] = a[tail+3]*a[tail+3] - a[3]*a[3]; + + DB[0] = b[tail+0]*b[tail+0] - b[0]*b[0]; + DB[1] = b[tail+1]*b[tail+1] - b[1]*b[1]; + DB[2] = b[tail+2]*b[tail+2] - b[2]*b[2]; + DB[3] = b[tail+3]*b[tail+3] - b[3]*b[3]; + + T[0] = vv + DV[0]; + T[1] = T[0] + DV[1]; + T[2] = T[1] + DV[2]; + T[3] = T[2] + DV[3]; + + BA[0] = va + DA[0]; + BA[1] = BA[0] + DA[1]; + BA[2] = BA[1] + DA[2]; + BA[3] = BA[2] + DA[3]; + + BB[0] = vb 
+ DB[0]; + BB[1] = BB[0] + DB[1]; + BB[2] = BB[1] + DB[2]; + BB[3] = BB[2] + DB[3]; + + B[0] = BA[0] * BB[0]; + B[1] = BA[1] * BB[1]; + B[2] = BA[2] * BB[2]; + B[3] = BA[3] * BB[3]; + + dst[0] = (B[0] >= 1e-10f) ? T[0] / sqrtf(B[0]) : 0.0f; + dst[1] = (B[1] >= 1e-10f) ? T[1] / sqrtf(B[1]) : 0.0f; + dst[2] = (B[2] >= 1e-10f) ? T[2] / sqrtf(B[2]) : 0.0f; + dst[3] = (B[3] >= 1e-10f) ? T[3] / sqrtf(B[3]) : 0.0f; + + vv = T[3]; + va = BA[3]; + vb = BB[3]; + + a += 4; + b += 4; + dst += 4; + } + + for (; count > 0; --count) + { + DV[0] = a[tail+0]*b[tail+0] - a[0]*b[0]; + DA[0] = a[tail+0]*a[tail+0] - a[0]*a[0]; + DB[0] = b[tail+0]*b[tail+0] - b[0]*b[0]; + T[0] = vv + DV[0]; + BA[0] = va + DA[0]; + BB[0] = vb + DB[0]; + B[0] = BA[0] * BB[0]; + + dst[0] = (B[0] >= 1e-10f) ? T[0] / sqrtf(B[0]) : 0.0f; + + vv = T[0]; + va = BA[0]; + vb = BB[0]; + + a += 1; + b += 1; + dst += 1; + } + + corr->v = vv; + corr->a = va; + corr->b = vb; + } + + } /* namespace generic */ +} /* namespace lsp */ + + + +#endif /* PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_ */ diff --git a/src/main/generic/generic.cpp b/src/main/generic/generic.cpp index 12324237..bddc57ac 100644 --- a/src/main/generic/generic.cpp +++ b/src/main/generic/generic.cpp @@ -47,6 +47,7 @@ namespace lsp #include #include #include + #include #include @@ -598,6 +599,7 @@ namespace lsp EXPORT1(unit_vector_p1pv); EXPORT1(convolve); + EXPORT1(correlation); EXPORT1(base64_enc); EXPORT1(base64_dec); From d123c8759b7f14047fe169b0bc6fb54ecd818b8f Mon Sep 17 00:00:00 2001 From: sadko4u Date: Fri, 8 Mar 2024 16:58:53 +0300 Subject: [PATCH 03/22] Basic implementation + simple test --- include/lsp-plug.in/dsp/common/correlation.h | 4 +- .../private/dsp/arch/generic/correlation.h | 28 +--- src/test/utest/correlation.cpp | 137 ++++++++++++++++++ src/test/utest/resampling/oversampling.cpp | 1 + 4 files changed, 142 insertions(+), 28 deletions(-) create mode 100644 src/test/utest/correlation.cpp diff --git 
a/include/lsp-plug.in/dsp/common/correlation.h b/include/lsp-plug.in/dsp/common/correlation.h index e8521d75..82dd7eea 100644 --- a/include/lsp-plug.in/dsp/common/correlation.h +++ b/include/lsp-plug.in/dsp/common/correlation.h @@ -38,6 +38,8 @@ typedef struct LSP_DSP_LIB_TYPE(correlation_t) } LSP_DSP_LIB_TYPE(correlation_t); #pragma pack(pop) +LSP_DSP_LIB_END_NAMESPACE + /** Compute normalized correlation between two signals * * @param dst destination buffer to store result @@ -47,6 +49,6 @@ typedef struct LSP_DSP_LIB_TYPE(correlation_t) * @param tail the offset of the last element relative to the first one in correlation (length of the window minus one) * @param count number of samples to process */ -LSP_DSP_LIB_SYMBOL(void, correlation, correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); +LSP_DSP_LIB_SYMBOL(void, correlation, LSP_DSP_LIB_TYPE(correlation_t) *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); #endif /* LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_ */ diff --git a/include/private/dsp/arch/generic/correlation.h b/include/private/dsp/arch/generic/correlation.h index a6fc6072..0cdb150b 100644 --- a/include/private/dsp/arch/generic/correlation.h +++ b/include/private/dsp/arch/generic/correlation.h @@ -33,33 +33,7 @@ namespace lsp { void correlation(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count) { - float vv = corr->v; - float va = corr->a; - float vb = corr->b; - - for (size_t i=0; i= 1e-10f) ? 
vv / sqrtf(d) : 0.0f; - } - - corr->v = vv; - corr->a = va; - corr->b = vb; - } - - void correlation_v2(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count) - { - float T[4], BA[4], BB[4], B[4], P[4], DV[4], DA[4], DB[4], DD[4]; + float T[4], BA[4], BB[4], B[4], DV[4], DA[4], DB[4]; float vv = corr->v; float va = corr->a; diff --git a/src/test/utest/correlation.cpp b/src/test/utest/correlation.cpp new file mode 100644 index 00000000..d0f7deee --- /dev/null +++ b/src/test/utest/correlation.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 8 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#include +#include +#include +#include +#include + +namespace lsp +{ + namespace generic + { + void correlation(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); + } + + static void correlation(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count) + { + float vv = corr->v; + float va = corr->a; + float vb = corr->b; + + for (size_t i=0; i= 1e-10f) ? 
vv / sqrtf(d) : 0.0f; + } + + corr->v = vv; + corr->a = va; + corr->b = vb; + } + + typedef void (* corr_t)(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); +} + +UTEST_BEGIN("dsp", correlation) + void call(const char *label, size_t align, corr_t func) + { + if (!UTEST_SUPPORTED(func)) + return; + + for (size_t mask=0; mask <= 0x07; ++mask) + { + UTEST_FOREACH(tail, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff) + { + UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff) + { + if ((tail == 0x80) && (count == 0x80)) + printf("Break\n"); + + FloatBuffer a(tail + count + 1, align, mask & 0x01); + FloatBuffer b(tail + count + 1, align, mask & 0x02); + FloatBuffer dst1(count, align, mask & 0x04); + FloatBuffer dst2(count, align, mask & 0x04); + + dsp::correlation_t corr_a, corr_b; + corr_a.v = randf(-1.0f, 1.0f); + corr_a.a = randf(0.0f, 1.0f); + corr_a.b = randf(0.0f, 1.0f); + corr_a.pad = 0.0f; + corr_b = corr_a; + + + printf("Tesing %s correlation tail=%d on buffer count=%d mask=0x%x\n", label, int(tail), int(count), int(mask)); + + correlation(&corr_a, dst1, a, b, tail, count); + func(&corr_b, dst2, a, b, tail, count); + + UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted"); + UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted"); + UTEST_ASSERT_MSG(dst1.valid(), "Destination buffer 1 corrupted"); + UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); + + // Compare buffers + if (!dst1.equals_relative(dst2, 1e-5)) + { + a.dump("a "); + b.dump("b "); + dst1.dump("dst1"); + dst2.dump("dst2"); + UTEST_FAIL_MSG("Output of functions for test '%s' differs", label); + } + + // Compare state + if ((!float_equals_adaptive(corr_a.v, corr_b.v)) || + (!float_equals_adaptive(corr_a.a, corr_b.a)) || + (!float_equals_adaptive(corr_a.b, corr_b.b))) + { + UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}", + corr_a.v, corr_a.a, corr_a.b, + corr_b.v, corr_b.a, corr_b.b); 
+ } + } + } + } + } + + UTEST_MAIN + { + #define CALL(func, align) \ + call(#func, align, func) + + CALL(generic::correlation, 16); + } + +UTEST_END; + + + diff --git a/src/test/utest/resampling/oversampling.cpp b/src/test/utest/resampling/oversampling.cpp index 6ec2ae51..2e484c9f 100644 --- a/src/test/utest/resampling/oversampling.cpp +++ b/src/test/utest/resampling/oversampling.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace lsp { From 1287a862ca6448ed7cc18e55800b1f070bb4db0d Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Fri, 8 Mar 2024 19:04:39 +0300 Subject: [PATCH 04/22] Refactoring, function redesign --- include/lsp-plug.in/dsp/common/correlation.h | 55 +++++++-- .../private/dsp/arch/generic/correlation.h | 116 ++++++++++++++---- src/main/generic/generic.cpp | 3 +- .../utest/{correlation.cpp => corr_incr.cpp} | 42 ++++--- src/test/utest/corr_init.cpp | 109 ++++++++++++++++ 5 files changed, 276 insertions(+), 49 deletions(-) rename src/test/utest/{correlation.cpp => corr_incr.cpp} (76%) create mode 100644 src/test/utest/corr_init.cpp diff --git a/include/lsp-plug.in/dsp/common/correlation.h b/include/lsp-plug.in/dsp/common/correlation.h index 82dd7eea..bd86c0d9 100644 --- a/include/lsp-plug.in/dsp/common/correlation.h +++ b/include/lsp-plug.in/dsp/common/correlation.h @@ -26,29 +26,62 @@ LSP_DSP_LIB_BEGIN_NAMESPACE #pragma pack(push, 1) + /** - * DSP context to store and restore machine state + * Object to store correlation state. + * + * The correlation is computed using the following formula: + * + * sum(a[i] * b[i]) + * corr = --------------------------------------- + * sqrt(sum(a[i]*a[i]) * sum(b[i]*b[i])) + * + * where i is in range of 0 to count-1. 
+ * */ typedef struct LSP_DSP_LIB_TYPE(correlation_t) { - float v; // the a*b aggregated value - float a; // the a*a aggregated value - float b; // the b*b aggregated value - float pad; // unused value + float v; // the aggregated value of sum(a*b) + float a; // the aggregated value of sum(a*a) + float b; // the aggregated value of sum(b*b) } LSP_DSP_LIB_TYPE(correlation_t); + #pragma pack(pop) LSP_DSP_LIB_END_NAMESPACE -/** Compute normalized correlation between two signals +/** + * Compute the initial intermediate values of correlation between two signals, + * the function can be called multiple times, so the value of corr structure + * should be cleared before first call. + * + * @param corr the object to initialize with intermediate results + * @param a the pointer to the first signal buffer + * @param b the pointer to the second signal buffer + * @param count number of samples to process + */ +LSP_DSP_LIB_SYMBOL(void, corr_init, + LSP_DSP_LIB_TYPE(correlation_t) *corr, + const float *a, const float *b, + size_t count); + +/** + * Compute incremental value of normalized correlation between two signals * + * @param corr the object that holds intermediate results * @param dst destination buffer to store result - * @param a the first signal data - * @param b the second signal data - * @param value the accumulated data from previous iteration - * @param tail the offset of the last element relative to the first one in correlation (length of the window minus one) + * @param a_head the pointer to the head of the first signal buffer + * @param b_head the pointer to the head of the second signal buffer + * @param a_tail the pointer to the tail of the first signal buffer + * @param b_tail the pointer to the tail of the second signal buffer + * @param head the offset of the head element relative to the first one in correlation (length of the window minus one) * @param count number of samples to process */ -LSP_DSP_LIB_SYMBOL(void, correlation, 
LSP_DSP_LIB_TYPE(correlation_t) *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); +LSP_DSP_LIB_SYMBOL(void, corr_incr, + LSP_DSP_LIB_TYPE(correlation_t) *corr, + float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); #endif /* LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_ */ diff --git a/include/private/dsp/arch/generic/correlation.h b/include/private/dsp/arch/generic/correlation.h index 0cdb150b..9b6dc311 100644 --- a/include/private/dsp/arch/generic/correlation.h +++ b/include/private/dsp/arch/generic/correlation.h @@ -13,7 +13,7 @@ * lsp-dsp-lib is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with lsp-dsp-lib. If not, see . 
@@ -31,7 +31,76 @@ namespace lsp { namespace generic { - void correlation(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count) + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + float xv = 0.0f; + float xa = 0.0f; + float xb = 0.0f; + + if (count >= 4) + { + float T[4], A[4], B[4]; + + T[0] = 0.0f; + T[1] = 0.0f; + T[2] = 0.0f; + T[3] = 0.0f; + + A[0] = 0.0f; + A[1] = 0.0f; + A[2] = 0.0f; + A[3] = 0.0f; + + B[0] = 0.0f; + B[1] = 0.0f; + B[2] = 0.0f; + B[3] = 0.0f; + + for ( ; count >= 4; count -= 4) + { + T[0] += a[0] * b[0]; + T[1] += a[1] * b[1]; + T[2] += a[2] * b[2]; + T[3] += a[3] * b[3]; + + A[0] += a[0] * a[0]; + A[1] += a[1] * a[1]; + A[2] += a[2] * a[2]; + A[3] += a[3] * a[3]; + + B[0] += b[0] * b[0]; + B[1] += b[1] * b[1]; + B[2] += b[2] * b[2]; + B[3] += b[3] * b[3]; + + a += 4; + b += 4; + } + + xv = T[0] + T[1] + T[2] + T[3]; + xa = A[0] + A[1] + A[2] + A[3]; + xb = B[0] + B[1] + B[2] + B[3]; + } + + for ( ; count > 0; --count) + { + xv += a[0] * b[0]; + xa += a[0] * a[0]; + xb += b[0] * b[0]; + + a += 1; + b += 1; + } + + corr->v += xv; + corr->a += xa; + corr->b += xb; + } + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) { float T[4], BA[4], BB[4], B[4], DV[4], DA[4], DB[4]; @@ -41,20 +110,20 @@ namespace lsp for ( ; count >= 4; count -= 4) { - DV[0] = a[tail+0]*b[tail+0] - a[0]*b[0]; - DV[1] = a[tail+1]*b[tail+1] - a[1]*b[1]; - DV[2] = a[tail+2]*b[tail+2] - a[2]*b[2]; - DV[3] = a[tail+3]*b[tail+3] - a[3]*b[3]; + DV[0] = a_head[0]*b_head[0] - a_tail[0]*b_tail[0]; + DV[1] = a_head[1]*b_head[1] - a_tail[1]*b_tail[1]; + DV[2] = a_head[2]*b_head[2] - a_tail[2]*b_tail[2]; + DV[3] = a_head[3]*b_head[3] - a_tail[3]*b_tail[3]; - DA[0] = a[tail+0]*a[tail+0] - a[0]*a[0]; - DA[1] = a[tail+1]*a[tail+1] - a[1]*a[1]; - DA[2] = a[tail+2]*a[tail+2] - a[2]*a[2]; - DA[3] 
= a[tail+3]*a[tail+3] - a[3]*a[3]; + DA[0] = a_head[0]*a_head[0] - a_tail[0]*a_tail[0]; + DA[1] = a_head[1]*a_head[1] - a_tail[1]*a_tail[1]; + DA[2] = a_head[2]*a_head[2] - a_tail[2]*a_tail[2]; + DA[3] = a_head[3]*a_head[3] - a_tail[3]*a_tail[3]; - DB[0] = b[tail+0]*b[tail+0] - b[0]*b[0]; - DB[1] = b[tail+1]*b[tail+1] - b[1]*b[1]; - DB[2] = b[tail+2]*b[tail+2] - b[2]*b[2]; - DB[3] = b[tail+3]*b[tail+3] - b[3]*b[3]; + DB[0] = b_head[0]*b_head[0] - b_tail[0]*b_tail[0]; + DB[1] = b_head[1]*b_head[1] - b_tail[1]*b_tail[1]; + DB[2] = b_head[2]*b_head[2] - b_tail[2]*b_tail[2]; + DB[3] = b_head[3]*b_head[3] - b_tail[3]*b_tail[3]; T[0] = vv + DV[0]; T[1] = T[0] + DV[1]; @@ -85,16 +154,19 @@ namespace lsp va = BA[3]; vb = BB[3]; - a += 4; - b += 4; + a_head += 4; + b_head += 4; + a_tail += 4; + b_tail += 4; dst += 4; } for (; count > 0; --count) { - DV[0] = a[tail+0]*b[tail+0] - a[0]*b[0]; - DA[0] = a[tail+0]*a[tail+0] - a[0]*a[0]; - DB[0] = b[tail+0]*b[tail+0] - b[0]*b[0]; + DV[0] = a_head[0]*b_head[0] - a_tail[0]*b_tail[0]; + DA[0] = a_head[0]*a_head[0] - a_tail[0]*a_tail[0]; + DB[0] = b_head[0]*b_head[0] - b_tail[0]*b_tail[0]; + T[0] = vv + DV[0]; BA[0] = va + DA[0]; BB[0] = vb + DB[0]; @@ -106,8 +178,10 @@ namespace lsp va = BA[0]; vb = BB[0]; - a += 1; - b += 1; + a_head += 1; + b_head += 1; + a_tail += 1; + b_tail += 1; dst += 1; } diff --git a/src/main/generic/generic.cpp b/src/main/generic/generic.cpp index bddc57ac..a99fffed 100644 --- a/src/main/generic/generic.cpp +++ b/src/main/generic/generic.cpp @@ -599,7 +599,8 @@ namespace lsp EXPORT1(unit_vector_p1pv); EXPORT1(convolve); - EXPORT1(correlation); + EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(base64_enc); EXPORT1(base64_dec); diff --git a/src/test/utest/correlation.cpp b/src/test/utest/corr_incr.cpp similarity index 76% rename from src/test/utest/correlation.cpp rename to src/test/utest/corr_incr.cpp index d0f7deee..83e8b57b 100644 --- a/src/test/utest/correlation.cpp +++ b/src/test/utest/corr_incr.cpp 
@@ -29,10 +29,16 @@ namespace lsp { namespace generic { - void correlation(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); } - static void correlation(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count) + static void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) { float vv = corr->v; float va = corr->a; @@ -40,10 +46,10 @@ namespace lsp for (size_t i=0; ib = vb; } - typedef void (* corr_t)(dsp::correlation_t *corr, float *dst, const float *a, const float *b, size_t tail, size_t count); + typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); } -UTEST_BEGIN("dsp", correlation) - void call(const char *label, size_t align, corr_t func) +UTEST_BEGIN("dsp", corr_incr) + void call(const char *label, size_t align, corr_incr_t func) { if (!UTEST_SUPPORTED(func)) return; @@ -73,9 +82,6 @@ UTEST_BEGIN("dsp", correlation) { UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff) { - if ((tail == 0x80) && (count == 0x80)) - printf("Break\n"); - FloatBuffer a(tail + count + 1, align, mask & 0x01); FloatBuffer b(tail + count + 1, align, mask & 0x02); FloatBuffer dst1(count, align, mask & 0x04); @@ -85,14 +91,18 @@ UTEST_BEGIN("dsp", correlation) corr_a.v = randf(-1.0f, 1.0f); corr_a.a = randf(0.0f, 1.0f); corr_a.b = randf(0.0f, 1.0f); - corr_a.pad = 0.0f; corr_b = corr_a; printf("Tesing %s correlation tail=%d on buffer count=%d mask=0x%x\n", label, int(tail), int(count), int(mask)); - correlation(&corr_a, dst1, a, b, tail, count); - func(&corr_b, dst2, a, b, tail, count); + const float 
*a_tail = a; + const float *b_tail = b; + const float *a_head = &a_tail[tail]; + const float *b_head = &b_tail[tail]; + + corr_incr(&corr_a, dst1, a_head, b_head, a_tail, b_tail, count); + func(&corr_b, dst1, a_head, b_head, a_tail, b_tail, count); UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted"); UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted"); @@ -128,7 +138,7 @@ UTEST_BEGIN("dsp", correlation) #define CALL(func, align) \ call(#func, align, func) - CALL(generic::correlation, 16); + CALL(generic::corr_incr, 16); } UTEST_END; diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp new file mode 100644 index 00000000..7a81a926 --- /dev/null +++ b/src/test/utest/corr_init.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 8 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#include +#include +#include +#include +#include + +namespace lsp +{ + namespace generic + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + + static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + float vv = 0.0f; + float va = 0.0f; + float vb = 0.0f; + + for (size_t i=0; iv += vv; + corr->a += va; + corr->b += vb; + } + + typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count); +} + +UTEST_BEGIN("dsp", corr_init) + void call(const char *label, size_t align, corr_init_t func) + { + if (!UTEST_SUPPORTED(func)) + return; + + for (size_t mask=0; mask <= 0x07; ++mask) + { + UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff) + { + FloatBuffer a(count, align, mask & 0x01); + FloatBuffer b(count, align, mask & 0x02); + + dsp::correlation_t corr_a, corr_b; + corr_a.v = 0.0f; + corr_a.a = 0.0f; + corr_a.b = 0.0f; + corr_b.v = 0.0f; + corr_b.a = 0.0f; + corr_b.b = 0.0f; + + printf("Tesing %s corr_init on buffer count=%d mask=0x%x\n", label, int(count), int(mask)); + + corr_init(&corr_a, a, b, count); + func(&corr_b, a, b, count); + + UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted"); + UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted"); + + // Compare state + if ((!float_equals_adaptive(corr_a.v, corr_b.v)) || + (!float_equals_adaptive(corr_a.a, corr_b.a)) || + (!float_equals_adaptive(corr_a.b, corr_b.b))) + { + UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}", + corr_a.v, corr_a.a, corr_a.b, + corr_b.v, corr_b.a, corr_b.b); + } + } + } + } + + UTEST_MAIN + { + #define CALL(func, align) \ + call(#func, align, func) + + CALL(generic::corr_init, 16); + } + +UTEST_END; + + + From 6de9df194802a607db027ed24b1a5778a3120c1e Mon Sep 17 00:00:00 2001 From: sadko4u Date: Fri, 8 Mar 2024 19:14:32 +0300 Subject: [PATCH 05/22] Bugfixes in tests --- src/test/utest/corr_incr.cpp | 9 ++++----- 1 
file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp index 83e8b57b..9d019c24 100644 --- a/src/test/utest/corr_incr.cpp +++ b/src/test/utest/corr_incr.cpp @@ -51,9 +51,9 @@ namespace lsp float at = a_tail[i]; float bt = b_tail[i]; - vv += at*bt - ah*bh; - va += at*at - ah*ah; - vb += bt*bt - bh*bh; + vv += ah*bh - at*bt; + va += ah*ah - at*at; + vb += bh*bh - bt*bt; float d = va * vb; dst[i] = (d >= 1e-10f) ? vv / sqrtf(d) : 0.0f; @@ -93,7 +93,6 @@ UTEST_BEGIN("dsp", corr_incr) corr_a.b = randf(0.0f, 1.0f); corr_b = corr_a; - printf("Tesing %s correlation tail=%d on buffer count=%d mask=0x%x\n", label, int(tail), int(count), int(mask)); const float *a_tail = a; @@ -102,7 +101,7 @@ UTEST_BEGIN("dsp", corr_incr) const float *b_head = &b_tail[tail]; corr_incr(&corr_a, dst1, a_head, b_head, a_tail, b_tail, count); - func(&corr_b, dst1, a_head, b_head, a_tail, b_tail, count); + func(&corr_b, dst2, a_head, b_head, a_tail, b_tail, count); UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted"); UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted"); From e40e633fe5f28db553babc33b98ba9c33a760001 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sat, 9 Mar 2024 16:17:10 +0300 Subject: [PATCH 06/22] SSE implementation of corr_init function --- .../private/dsp/arch/x86/sse/correlation.h | 144 ++++++++++++++++++ src/main/x86/sse.cpp | 2 + src/test/ptest/convolve.cpp | 3 + src/test/ptest/corr_init.cpp | 123 +++++++++++++++ src/test/utest/corr_init.cpp | 24 +-- 5 files changed, 287 insertions(+), 9 deletions(-) create mode 100644 include/private/dsp/arch/x86/sse/correlation.h create mode 100644 src/test/ptest/corr_init.cpp diff --git a/include/private/dsp/arch/x86/sse/correlation.h b/include/private/dsp/arch/x86/sse/correlation.h new file mode 100644 index 00000000..6cd4bb4f --- /dev/null +++ b/include/private/dsp/arch/x86/sse/correlation.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 
Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_SSE_IMPL */ + +namespace lsp +{ + namespace sse + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("xorps %%xmm0, %%xmm0") /* xv = 0 */ + __ASM_EMIT("xorps %%xmm1, %%xmm1") /* xa = 0 */ + __ASM_EMIT("xorps %%xmm2, %%xmm2") /* xb = 0 */ + /* 8x blocks */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("movups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("movups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */ + __ASM_EMIT("movups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("movups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("mulps %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("mulps %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */ + 
__ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("movaps %%xmm4, %%xmm7") /* xmm7 = a1 */ + __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("mulps %%xmm6, %%xmm7") /* xmm7 = a1*b1 */ + __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = a1*a1 */ + __ASM_EMIT("mulps %%xmm6, %%xmm6") /* xmm6 = b1*b1 */ + __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("addps %%xmm6, %%xmm2") /* xmm2 = xb + b1*b1 */ + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("movups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("movups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */ + __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("mulps %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("mulps %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("4:") + /* Do horizontal sum */ + __ASM_EMIT("movhlps %%xmm0, %%xmm4") /* xmm4 = xv2 xv3 ? ? */ + __ASM_EMIT("movhlps %%xmm1, %%xmm5") /* xmm5 = xa2 xa3 ? ? */ + __ASM_EMIT("movhlps %%xmm2, %%xmm6") /* xmm6 = xb2 xb3 ? ? */ + __ASM_EMIT("addps %%xmm4, %%xmm0") /* xmm0 = xv0+xv2 xv1+xv3 ? ? */ + __ASM_EMIT("addps %%xmm5, %%xmm1") /* xmm1 = xa0+xa2 xa1+xa3 ? ? */ + __ASM_EMIT("addps %%xmm6, %%xmm2") /* xmm2 = xb0+xb2 xb1+xb3 ? ? 
*/ + __ASM_EMIT("unpcklps %%xmm4, %%xmm0") /* xmm0 = xv0+xv2 xv2 xv1+xv3 xv3 */ + __ASM_EMIT("unpcklps %%xmm5, %%xmm1") /* xmm1 = xa0+xa2 xa2 xa1+xa3 xa3 */ + __ASM_EMIT("unpcklps %%xmm6, %%xmm2") /* xmm2 = xb0+xb2 xb2 xb1+xb3 xb3 */ + __ASM_EMIT("movhlps %%xmm0, %%xmm4") /* xmm4 = xv1+xv3 xv3 ? ? */ + __ASM_EMIT("movhlps %%xmm1, %%xmm5") /* xmm5 = xa1+xa3 xa3 ? ? */ + __ASM_EMIT("movhlps %%xmm2, %%xmm6") /* xmm6 = xb1+xb3 xb3 ? ? */ + __ASM_EMIT("addss %%xmm4, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 xv2+xv3 ? ? */ + __ASM_EMIT("addss %%xmm5, %%xmm1") /* xmm1 = xa0+xa1+xa2+xa3 xa2+xa3 ? ? */ + __ASM_EMIT("addss %%xmm6, %%xmm2") /* xmm2 = xb0+xb1+xb2+xb3 xb2+xb3 ? ? */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("5:") + __ASM_EMIT("movss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("movss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */ + __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("mulss %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("mulss %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("addss %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("addss %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("addss %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 5b") + __ASM_EMIT("6:") + /* Store result */ + __ASM_EMIT("movss 0x00(%[corr]), %%xmm4") + __ASM_EMIT("movss 0x04(%[corr]), %%xmm5") + __ASM_EMIT("movss 0x08(%[corr]), %%xmm6") + __ASM_EMIT("addss %%xmm4, %%xmm0") + __ASM_EMIT("addss %%xmm5, %%xmm1") + __ASM_EMIT("addss %%xmm6, %%xmm2") + __ASM_EMIT("movss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("movss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("movss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", 
"%xmm7" + ); + } + + } /* namespace sse */ +} /* namespace lsp */ + + + +#endif /* PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_ */ diff --git a/src/main/x86/sse.cpp b/src/main/x86/sse.cpp index 96d344ae..676aea07 100644 --- a/src/main/x86/sse.cpp +++ b/src/main/x86/sse.cpp @@ -66,6 +66,7 @@ #include #include + #include #include #include @@ -492,6 +493,7 @@ EXPORT1(cull_triangle_raw); EXPORT1(convolve); + EXPORT1(corr_init); EXPORT1(lin_inter_set); EXPORT1(lin_inter_mul2); diff --git a/src/test/ptest/convolve.cpp b/src/test/ptest/convolve.cpp index 5393dc6b..9f2d2a42 100644 --- a/src/test/ptest/convolve.cpp +++ b/src/test/ptest/convolve.cpp @@ -109,6 +109,9 @@ PTEST_BEGIN("dsp", convolve, 5, 1000) float *in = &out[buf_size*2]; float *conv = &in[buf_size]; float *backup = &conv[buf_size]; + lsp_finally { + free_aligned(data); + }; for (size_t i=0; i < buf_size*4; ++i) out[i] = randf(-1.0f, 1.0f); diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp new file mode 100644 index 00000000..1b6730d2 --- /dev/null +++ b/src/test/ptest/corr_init.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#include +#include +#include +#include +#include + +#define MIN_RANK 5 +#define MAX_RANK 15 + +namespace lsp +{ + namespace generic + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + + namespace sse + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + + namespace test + { + static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + float vv = 0.0f; + float va = 0.0f; + float vb = 0.0f; + + for (size_t i=0; iv += vv; + corr->a += va; + corr->b += vb; + } + } + + typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count); +} + +//----------------------------------------------------------------------------- +// Performance test for lanczos resampling +PTEST_BEGIN("dsp", corr_init, 5, 1000) + + void call(const char *label, const float *a, const float *b, size_t count, corr_init_t func) + { + if (!PTEST_SUPPORTED(func)) + return; + + char buf[80]; + sprintf(buf, "%s x %d", label, int(count)); + printf("Testing %s correlation ...\n", buf); + + dsp::correlation_t corr; + + PTEST_LOOP(buf, + corr.v = 0.0f; + corr.a = 0.0f; + corr.b = 0.0f; + + func(&corr, a, b, count); + ); + } + + PTEST_MAIN + { + size_t buf_size = 1 << MAX_RANK; + uint8_t *data = NULL; + float *a = alloc_aligned(data, buf_size * 2, 64); + float *b = &a[buf_size]; + lsp_finally { + free_aligned(data); + }; + + for (size_t i=0; i < buf_size*2; ++i) + a[i] = randf(-1.0f, 1.0f); + + #define CALL(func, count) \ + call(#func, a, b, count, func) + + TEST_EXPORT(test::corr_init); + + for (size_t i=MIN_RANK; i<=MAX_RANK; ++i) + { + const size_t count = 1 << i; + + CALL(test::corr_init, count); + CALL(generic::corr_init, count); + CALL(sse::corr_init, count); + + PTEST_SEPARATOR; + } + } + +PTEST_END + + diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp index 7a81a926..78c4ca14 100644 --- 
a/src/test/utest/corr_init.cpp +++ b/src/test/utest/corr_init.cpp @@ -32,6 +32,11 @@ namespace lsp void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } + namespace sse + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) { float vv = 0.0f; @@ -67,12 +72,12 @@ UTEST_BEGIN("dsp", corr_init) FloatBuffer b(count, align, mask & 0x02); dsp::correlation_t corr_a, corr_b; - corr_a.v = 0.0f; - corr_a.a = 0.0f; - corr_a.b = 0.0f; - corr_b.v = 0.0f; - corr_b.a = 0.0f; - corr_b.b = 0.0f; + corr_a.v = 0.1f; + corr_a.a = 0.2f; + corr_a.b = 0.3f; + corr_b.v = 0.1f; + corr_b.a = 0.2f; + corr_b.b = 0.3f; printf("Tesing %s corr_init on buffer count=%d mask=0x%x\n", label, int(count), int(mask)); @@ -83,9 +88,9 @@ UTEST_BEGIN("dsp", corr_init) UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted"); // Compare state - if ((!float_equals_adaptive(corr_a.v, corr_b.v)) || - (!float_equals_adaptive(corr_a.a, corr_b.a)) || - (!float_equals_adaptive(corr_a.b, corr_b.b))) + if ((!float_equals_adaptive(corr_a.v, corr_b.v, 1e-5f)) || + (!float_equals_adaptive(corr_a.a, corr_b.a, 1e-5f)) || + (!float_equals_adaptive(corr_a.b, corr_b.b, 1e-5f))) { UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}", corr_a.v, corr_a.a, corr_a.b, @@ -101,6 +106,7 @@ UTEST_BEGIN("dsp", corr_init) call(#func, align, func) CALL(generic::corr_init, 16); + IF_ARCH_X86(CALL(sse::corr_init, 16)); } UTEST_END; From b357d5af9e74bf149ee4f678f71bb899bef00ab7 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sat, 9 Mar 2024 16:55:55 +0300 Subject: [PATCH 07/22] Implemented AVX-optimized corr_init --- .../private/dsp/arch/x86/avx/correlation.h | 256 ++++++++++++++++++ src/main/x86/avx.cpp | 3 + src/test/ptest/corr_init.cpp | 10 +- src/test/utest/corr_init.cpp | 8 + 4 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 
include/private/dsp/arch/x86/avx/correlation.h diff --git a/include/private/dsp/arch/x86/avx/correlation.h b/include/private/dsp/arch/x86/avx/correlation.h new file mode 100644 index 00000000..03275df9 --- /dev/null +++ b/include/private/dsp/arch/x86/avx/correlation.h @@ -0,0 +1,256 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_AVX_IMPL */ + +namespace lsp +{ + namespace avx + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("vxorps %%ymm0, %%ymm0, %%ymm0") /* xv = 0 */ + __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") /* xa = 0 */ + __ASM_EMIT("vxorps %%ymm2, %%ymm2, %%ymm2") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ + __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ + __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ + __ASM_EMIT("vmulps %%ymm3, %%ymm5, %%ymm7") /* ymm7 = a0*b0 */ + __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = a0*a0 */ + __ASM_EMIT("vmulps %%ymm5, %%ymm5, %%ymm5") /* ymm5 = b0*b0 */ + __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = xb + b0*b0 */ + __ASM_EMIT("vmulps %%ymm4, %%ymm6, %%ymm7") /* ymm7 = a0*b0 */ + __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = a0*a0 */ + __ASM_EMIT("vmulps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = b0*b0 */ + __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a1*b1 */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = xa + a1*a1 */ + __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = xb + b1*b1 */ + __ASM_EMIT("add $0x40, %[off]") /* ++off */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") + __ASM_EMIT("vextractf128 
$1, %%ymm1, %%xmm5") + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") + __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("vmulps %%xmm4, %%xmm6, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = xb + b1*b1 */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 
= xa + a0*a0 */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("6:") + /* Do horizontal sum */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmulss %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulss %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddss %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddss %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") + __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") + __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + void 
corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("vxorps %%ymm0, %%ymm0, %%ymm0") /* xv = 0 */ + __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") /* xa = 0 */ + __ASM_EMIT("vxorps %%ymm2, %%ymm2, %%ymm2") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ + __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ + __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm5, %%ymm0") /* ymm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm3, %%ymm1") /* ymm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm2") /* ymm2 = xv + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm6, %%ymm0") /* ymm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%ymm6, %%ymm6, %%ymm2") /* ymm2 = xv + b1*b1 */ + __ASM_EMIT("add $0x40, %[off]") /* ++off */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5") + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") + __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + 
__ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xv + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm6, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%xmm6, %%xmm6, %%xmm2") /* xmm2 = xv + b1*b1 */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xv + b0*b0 */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("6:") + /* Do horizontal sum */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ss %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xv + b0*b0 */ + __ASM_EMIT("add 
$0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") + __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") + __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + } /* namespace avx */ +} /* namespace lsp */ + + + + +#endif /* PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_ */ diff --git a/src/main/x86/avx.cpp b/src/main/x86/avx.cpp index 99e76b59..eff3d99f 100644 --- a/src/main/x86/avx.cpp +++ b/src/main/x86/avx.cpp @@ -69,6 +69,7 @@ #include #include #include + #include #include @@ -391,6 +392,7 @@ CEXPORT1(favx, downsample_8x); CEXPORT1(favx, convolve); + CEXPORT1(favx, corr_init); CEXPORT1(favx, lin_inter_set); CEXPORT1(favx, lin_inter_mul2); @@ -483,6 +485,7 @@ CEXPORT2(favx, filter_transfer_apply_pc, filter_transfer_apply_pc_fma3); CEXPORT2(favx, convolve, convolve_fma3); + CEXPORT2(favx, corr_init, corr_init_fma3); CEXPORT2(favx, axis_apply_lin1, axis_apply_lin1_fma3); diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp index 1b6730d2..7266c0be 100644 --- a/src/test/ptest/corr_init.cpp +++ b/src/test/ptest/corr_init.cpp @@ -40,6 +40,12 @@ namespace lsp void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } + namespace avx + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + namespace test { static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) @@ -112,7 +118,9 @@ PTEST_BEGIN("dsp", 
corr_init, 5, 1000) CALL(test::corr_init, count); CALL(generic::corr_init, count); - CALL(sse::corr_init, count); + IF_ARCH_X86(CALL(sse::corr_init, count)); + IF_ARCH_X86(CALL(avx::corr_init, count)); + IF_ARCH_X86(CALL(avx::corr_init_fma3, count)); PTEST_SEPARATOR; } diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp index 78c4ca14..cf2c6345 100644 --- a/src/test/utest/corr_init.cpp +++ b/src/test/utest/corr_init.cpp @@ -37,6 +37,12 @@ namespace lsp void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } + namespace avx + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) { float vv = 0.0f; @@ -107,6 +113,8 @@ UTEST_BEGIN("dsp", corr_init) CALL(generic::corr_init, 16); IF_ARCH_X86(CALL(sse::corr_init, 16)); + IF_ARCH_X86(CALL(avx::corr_init, 32)); + IF_ARCH_X86(CALL(avx::corr_init_fma3, 32)); } UTEST_END; From 8598b34f79cc6ac6c503e683ba7ab269d9d398ea Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sat, 9 Mar 2024 17:09:35 +0300 Subject: [PATCH 08/22] AVX-512 implementation of correlation function --- .../private/dsp/arch/x86/avx512/correlation.h | 307 ++++++++++++++++++ src/main/x86/avx512.cpp | 5 + src/test/ptest/corr_init.cpp | 34 +- src/test/utest/corr_init.cpp | 28 +- 4 files changed, 353 insertions(+), 21 deletions(-) create mode 100644 include/private/dsp/arch/x86/avx512/correlation.h diff --git a/include/private/dsp/arch/x86/avx512/correlation.h b/include/private/dsp/arch/x86/avx512/correlation.h new file mode 100644 index 00000000..c16146fa --- /dev/null +++ b/include/private/dsp/arch/x86/avx512/correlation.h @@ -0,0 +1,307 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 
мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */ + +namespace lsp +{ + namespace avx512 + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") /* xv = 0 */ + __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") /* xa = 0 */ + __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") /* xb = 0 */ + /* 32x blocks */ + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm3") /* zmm3 = a0 */ + __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm4") /* zmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%zmm5") /* zmm5 = b0 */ + __ASM_EMIT("vmovups 0x40(%[b], %[off]), %%zmm6") /* zmm6 = b1 */ + __ASM_EMIT("vmulps %%zmm3, %%zmm5, %%zmm7") /* zmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%zmm3, %%zmm3, %%zmm3") /* zmm3 = a0*a0 */ + __ASM_EMIT("vmulps %%zmm5, %%zmm5, %%zmm5") /* zmm5 = b0*b0 */ + __ASM_EMIT("vaddps %%zmm7, %%zmm0, %%zmm0") /* zmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1") /* zmm1 = xa + a0*a0 
*/ + __ASM_EMIT("vaddps %%zmm5, %%zmm2, %%zmm2") /* zmm2 = xb + b0*b0 */ + __ASM_EMIT("vmulps %%zmm4, %%zmm6, %%zmm7") /* zmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%zmm4, %%zmm4, %%zmm4") /* zmm4 = a0*a0 */ + __ASM_EMIT("vmulps %%zmm6, %%zmm6, %%zmm6") /* zmm6 = b0*b0 */ + __ASM_EMIT("vaddps %%zmm7, %%zmm0, %%zmm0") /* zmm0 = xv + a1*b1 */ + __ASM_EMIT("vaddps %%zmm4, %%zmm1, %%zmm1") /* zmm1 = xa + a1*a1 */ + __ASM_EMIT("vaddps %%zmm6, %%zmm2, %%zmm2") /* zmm2 = xb + b1*b1 */ + __ASM_EMIT("add $0x80, %[off]") /* ++off */ + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm4") + __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm5") + __ASM_EMIT("vextractf64x4 $1, %%zmm2, %%ymm6") + __ASM_EMIT("vaddps %%ymm4, %%ymm0, %%ymm0") + __ASM_EMIT("vaddps %%ymm5, %%ymm1, %%ymm1") + __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") + __ASM_EMIT("2:") + /* 16x blocks */ + __ASM_EMIT("add $16, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ + __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ + __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ + __ASM_EMIT("vmulps %%ymm3, %%ymm5, %%ymm7") /* ymm7 = a0*b0 */ + __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = a0*a0 */ + __ASM_EMIT("vmulps %%ymm5, %%ymm5, %%ymm5") /* ymm5 = b0*b0 */ + __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = xb + b0*b0 */ + __ASM_EMIT("vmulps %%ymm4, %%ymm6, %%ymm7") /* ymm7 = a0*b0 */ + __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = a0*a0 */ + __ASM_EMIT("vmulps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = b0*b0 */ + __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a1*b1 */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = xa + a1*a1 */ + __ASM_EMIT("vaddps %%ymm6, 
%%ymm2, %%ymm2") /* ymm2 = xb + b1*b1 */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("add $0x40, %[off]") /* ++off */ + __ASM_EMIT("4:") + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5") + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") + __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") + /* 8x block */ + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("vmulps %%xmm4, %%xmm6, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = xb + b1*b1 */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("6:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = 
a0*a0 */ + __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("8:") + /* Do horizontal sum */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 10f") + __ASM_EMIT("9:") + __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmulss %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulss %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddss %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddss %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 9b") + __ASM_EMIT("10:") + /* Store result */ + __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") + __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") + __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" 
(corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") /* xv = 0 */ + __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") /* xa = 0 */ + __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") /* xb = 0 */ + /* 32x blocks */ + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm3") /* zmm3 = a0 */ + __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm4") /* zmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%zmm5") /* zmm5 = b0 */ + __ASM_EMIT("vmovups 0x40(%[b], %[off]), %%zmm6") /* zmm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%zmm3, %%zmm5, %%zmm0") /* zmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%zmm3, %%zmm3, %%zmm1") /* zmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm2") /* zmm2 = xb + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm6, %%zmm0") /* zmm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm1") /* zmm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%zmm6, %%zmm6, %%zmm2") /* zmm2 = xb + b1*b1 */ + __ASM_EMIT("add $0x80, %[off]") /* ++off */ + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm4") + __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm5") + __ASM_EMIT("vextractf64x4 $1, %%zmm2, %%ymm6") + __ASM_EMIT("vaddps %%ymm4, %%ymm0, %%ymm0") + __ASM_EMIT("vaddps %%ymm5, %%ymm1, %%ymm1") + __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") + __ASM_EMIT("2:") + /* 16x blocks */ + __ASM_EMIT("add $16, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ + __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ + __ASM_EMIT("vmovups 
0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ + __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm5, %%ymm0") /* ymm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm3, %%ymm1") /* ymm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm2") /* ymm2 = xb + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm6, %%ymm0") /* ymm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%ymm6, %%ymm6, %%ymm2") /* ymm2 = xb + b1*b1 */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("add $0x40, %[off]") /* ++off */ + __ASM_EMIT("4:") + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5") + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") + __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") + /* 8x block */ + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm6, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%xmm6, %%xmm6, %%xmm2") /* xmm2 = xb + b1*b1 */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("6:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 
0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("8:") + /* Do horizontal sum */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 10f") + __ASM_EMIT("9:") + __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ss %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 9b") + __ASM_EMIT("10:") + /* Store result */ + __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") + __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") + __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + } /* namespace 
avx512 */ +} /* namespace lsp */ + + + + + +#endif /* PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_ */ diff --git a/src/main/x86/avx512.cpp b/src/main/x86/avx512.cpp index 64329fb1..524a14ef 100644 --- a/src/main/x86/avx512.cpp +++ b/src/main/x86/avx512.cpp @@ -49,6 +49,8 @@ #include #include #include + + #include #undef PRIVATE_DSP_ARCH_X86_AVX512_IMPL namespace lsp @@ -275,6 +277,9 @@ CEXPORT1(vl, uexpander_x1_curve); CEXPORT1(vl, dexpander_x1_gain); CEXPORT1(vl, dexpander_x1_curve); + + CEXPORT1(vl, corr_init); + CEXPORT2(vl, corr_init, corr_init_fma3); } } /* namespace avx2 */ } /* namespace lsp */ diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp index 7266c0be..f8f7d427 100644 --- a/src/test/ptest/corr_init.cpp +++ b/src/test/ptest/corr_init.cpp @@ -25,7 +25,7 @@ #include #include -#define MIN_RANK 5 +#define MIN_RANK 8 #define MAX_RANK 15 namespace lsp @@ -35,20 +35,28 @@ namespace lsp void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } - namespace sse - { - void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - } + IF_ARCH_X86( + namespace sse + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } - namespace avx - { - void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - } + namespace avx + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + + namespace avx512 + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + ) namespace test { - static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, 
size_t count) + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) { float vv = 0.0f; float va = 0.0f; @@ -72,7 +80,7 @@ namespace lsp //----------------------------------------------------------------------------- // Performance test for lanczos resampling -PTEST_BEGIN("dsp", corr_init, 5, 1000) +PTEST_BEGIN("dsp", corr_init, 5, 10000) void call(const char *label, const float *a, const float *b, size_t count, corr_init_t func) { @@ -121,6 +129,8 @@ PTEST_BEGIN("dsp", corr_init, 5, 1000) IF_ARCH_X86(CALL(sse::corr_init, count)); IF_ARCH_X86(CALL(avx::corr_init, count)); IF_ARCH_X86(CALL(avx::corr_init_fma3, count)); + IF_ARCH_X86(CALL(avx512::corr_init, count)); + IF_ARCH_X86(CALL(avx512::corr_init_fma3, count)); PTEST_SEPARATOR; } diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp index cf2c6345..a153b13a 100644 --- a/src/test/utest/corr_init.cpp +++ b/src/test/utest/corr_init.cpp @@ -32,16 +32,24 @@ namespace lsp void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } - namespace sse - { - void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - } + IF_ARCH_X86( + namespace sse + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } - namespace avx - { - void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - } + namespace avx + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + + namespace avx512 + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + ) static void corr_init(dsp::correlation_t 
*corr, const float *a, const float *b, size_t count) { @@ -115,6 +123,8 @@ UTEST_BEGIN("dsp", corr_init) IF_ARCH_X86(CALL(sse::corr_init, 16)); IF_ARCH_X86(CALL(avx::corr_init, 32)); IF_ARCH_X86(CALL(avx::corr_init_fma3, 32)); + IF_ARCH_X86(CALL(avx512::corr_init, 64)); + IF_ARCH_X86(CALL(avx512::corr_init_fma3, 64)); } UTEST_END; From 874bc6954a2bbeb0d9f8d8875c9102a4dc3c5d61 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sat, 9 Mar 2024 18:49:37 +0300 Subject: [PATCH 09/22] NEON-D32 implementation of corr_init --- .../dsp/arch/arm/neon-d32/correlation.h | 135 ++++++++++++++++++ src/main/arm/neon-d32.cpp | 2 + src/test/ptest/corr_init.cpp | 26 +--- src/test/utest/corr_init.cpp | 8 ++ 4 files changed, 150 insertions(+), 21 deletions(-) create mode 100644 include/private/dsp/arch/arm/neon-d32/correlation.h diff --git a/include/private/dsp/arch/arm/neon-d32/correlation.h b/include/private/dsp/arch/arm/neon-d32/correlation.h new file mode 100644 index 00000000..e3cd0f98 --- /dev/null +++ b/include/private/dsp/arch/arm/neon-d32/correlation.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */ + +namespace lsp +{ + namespace neon_d32 + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + ARCH_ARM_ASM( + __ASM_EMIT("veor q0, q0, q0") /* xv = 0 */ + __ASM_EMIT("veor q1, q1, q1") /* xa = 0 */ + __ASM_EMIT("veor q2, q2, q2") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("subs %[count], #16") + __ASM_EMIT("blo 2f") + __ASM_EMIT("veor q3, q3, q3") /* xv = 0 */ + __ASM_EMIT("veor q4, q4, q4") /* xa = 0 */ + __ASM_EMIT("veor q5, q5, q5") /* xb = 0 */ + __ASM_EMIT("1:") + __ASM_EMIT("vldm %[a]!, {q8-q11}") /* q8 = a0, q9 = a1, q10 = a2, q11 = a3 */ + __ASM_EMIT("vldm %[b]!, {q12-q15}") /* q12 = b0, q13 = b1, q14 = b2, q15 = b3 */ + __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q3, q9, q13") /* q3 = xv + a1*b1 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q4, q9, q9") /* q4 = xa + a1*a1 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("vmla.f32 q5, q13, q13") /* q5 = xb + b1*b1 */ + __ASM_EMIT("vmla.f32 q0, q10, q14") /* q0 = xv + a2*b2 */ + __ASM_EMIT("vmla.f32 q3, q11, q15") /* q3 = xv + a3*b3 */ + __ASM_EMIT("vmla.f32 q1, q10, q10") /* q1 = xa + a2*a2 */ + __ASM_EMIT("vmla.f32 q4, q11, q11") /* q4 = xa + a3*a3 */ + __ASM_EMIT("vmla.f32 q2, q14, q14") /* q2 = xb + b2*b3 */ + __ASM_EMIT("vmla.f32 q5, q15, q15") /* q5 = xb + b3*b3 */ + __ASM_EMIT("subs %[count], #16") + __ASM_EMIT("bhs 1b") + __ASM_EMIT("vadd.f32 q0, q0, q3") + __ASM_EMIT("vadd.f32 q1, q1, q4") + __ASM_EMIT("vadd.f32 q2, q2, q5") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("adds %[count], #8") + __ASM_EMIT("blt 4f") + __ASM_EMIT("vldm %[a]!, {q8-q9}") /* q8 = a0, q9 = a1 */ + 
__ASM_EMIT("vldm %[b]!, {q12-q13}") /* q12 = b0, q13 = b1 */ + __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("vmla.f32 q0, q9, q13") /* q0 = xv + a1*b1 */ + __ASM_EMIT("vmla.f32 q1, q9, q9") /* q1 = xa + a1*a1 */ + __ASM_EMIT("vmla.f32 q2, q13, q13") /* q2 = xb + b1*b1 */ + __ASM_EMIT("sub %[count], #8") + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("adds %[count], #4") + __ASM_EMIT("blt 6f") + __ASM_EMIT("vldm %[a]!, {q8}") /* q8 = a0 */ + __ASM_EMIT("vldm %[b]!, {q12}") /* q12 = b0 */ + __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("sub %[count], #4") + __ASM_EMIT("6:") + __ASM_EMIT("veor q6, q6, q6") /* q6 = 0 */ + __ASM_EMIT("vext.32 q3, q0, q6, #2") /* q3 = xv2 xv3 0 0 */ + __ASM_EMIT("vext.32 q4, q1, q6, #2") /* q4 = xa2 xa3 0 0 */ + __ASM_EMIT("vext.32 q5, q2, q6, #2") /* q5 = xb2 xb3 0 0 */ + __ASM_EMIT("vadd.f32 q0, q0, q3") /* q0 = xv0+xv2 xv1+xv3 xv2 xv3 */ + __ASM_EMIT("vadd.f32 q1, q1, q4") /* q1 = xa0+xa2 xa1+xa3 xv2 xv3 */ + __ASM_EMIT("vadd.f32 q2, q2, q5") /* q2 = xb0+xb2 xb1+xb3 xv2 xv3 */ + __ASM_EMIT("vext.32 q3, q0, q6, #1") /* q3 = xv1+xv3 xv2 xv3 0 */ + __ASM_EMIT("vext.32 q4, q1, q6, #1") /* q4 = xa1+xa3 xv2 xv3 0 */ + __ASM_EMIT("vext.32 q5, q2, q6, #1") /* q5 = xb1+xb3 xv2 xv3 */ + __ASM_EMIT("vadd.f32 q0, q0, q3") /* q0 = xv0+xv1+xv2+xv3 xv1+xv2+xv3 xv2+xv3 xv3 */ + __ASM_EMIT("vadd.f32 q1, q1, q4") /* q1 = xa0+xa1+xa2+xa3 xa1+xa2+xa3 xa2+xa3 xa3 */ + __ASM_EMIT("vadd.f32 q2, q2, q5") /* q2 = xb0+xb1+xb2+xb3 xb1+xb2+xb3 xb2+xb3 xb3 */ + /* 1x blocks */ + __ASM_EMIT("adds %[count], #3") + __ASM_EMIT("blt 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vld1.32 {d16[], d17[]}, [%[a]]!") /* q8 = a0 */ + __ASM_EMIT("vld1.32 {d24[], d25[]}, [%[b]]!") /* q12 = b0 */ + 
__ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("subs %[count], #1") + __ASM_EMIT("bge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("vld3.32 {d6[], d8[], d10[]}, [%[corr]]") /* q3 = v, q4 = a, q5 = b */ + __ASM_EMIT("vadd.f32 q0, q0, q3") + __ASM_EMIT("vadd.f32 q1, q1, q4") + __ASM_EMIT("vadd.f32 q2, q2, q5") + __ASM_EMIT("vst3.32 {d0[0], d2[0], d4[0]}, [%[corr]]") + + : [a] "+r" (a), [b] "+r" (b), [count] "+r" (count) + : [corr] "r" (corr) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + } + + } /* namespace neon_d32 */ +} /* namespace lsp */ + + + +#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_ */ diff --git a/src/main/arm/neon-d32.cpp b/src/main/arm/neon-d32.cpp index edf61ec2..29a3eead 100644 --- a/src/main/arm/neon-d32.cpp +++ b/src/main/arm/neon-d32.cpp @@ -50,6 +50,7 @@ #define PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL #include #include + #include #include #include #include @@ -141,6 +142,7 @@ EXPORT1(pcomplex_r2c_rdiv2); EXPORT1(convolve); + EXPORT1(corr_init); EXPORT1(axis_apply_lin1); EXPORT1(axis_apply_log1); diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp index f8f7d427..3f42b0a2 100644 --- a/src/test/ptest/corr_init.cpp +++ b/src/test/ptest/corr_init.cpp @@ -54,26 +54,12 @@ namespace lsp } ) - namespace test - { - void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + IF_ARCH_ARM( + namespace neon_d32 { - float vv = 0.0f; - float va = 0.0f; - float vb = 0.0f; - - for (size_t i=0; iv += vv; - corr->a += va; - corr->b += vb; + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } - } + ) typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } @@ -118,19 +104,17 @@ PTEST_BEGIN("dsp", 
corr_init, 5, 10000) #define CALL(func, count) \ call(#func, a, b, count, func) - TEST_EXPORT(test::corr_init); - for (size_t i=MIN_RANK; i<=MAX_RANK; ++i) { const size_t count = 1 << i; - CALL(test::corr_init, count); CALL(generic::corr_init, count); IF_ARCH_X86(CALL(sse::corr_init, count)); IF_ARCH_X86(CALL(avx::corr_init, count)); IF_ARCH_X86(CALL(avx::corr_init_fma3, count)); IF_ARCH_X86(CALL(avx512::corr_init, count)); IF_ARCH_X86(CALL(avx512::corr_init_fma3, count)); + IF_ARCH_ARM(CALL(neon_d32::corr_init, count)); PTEST_SEPARATOR; } diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp index a153b13a..3b18e724 100644 --- a/src/test/utest/corr_init.cpp +++ b/src/test/utest/corr_init.cpp @@ -51,6 +51,13 @@ namespace lsp } ) + IF_ARCH_ARM( + namespace neon_d32 + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + ) + static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) { float vv = 0.0f; @@ -125,6 +132,7 @@ UTEST_BEGIN("dsp", corr_init) IF_ARCH_X86(CALL(avx::corr_init_fma3, 32)); IF_ARCH_X86(CALL(avx512::corr_init, 64)); IF_ARCH_X86(CALL(avx512::corr_init_fma3, 64)); + IF_ARCH_ARM(CALL(neon_d32::corr_init, 16)); } UTEST_END; From 3e8fff81ff203bb861f1f3a4c3d786402a15d1a5 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sat, 9 Mar 2024 20:01:47 +0300 Subject: [PATCH 10/22] ASIMD implementation of corr_init --- .../dsp/arch/aarch64/asimd/correlation.h | 145 ++++++++++++++++++ src/main/aarch64/asimd.cpp | 2 + src/test/ptest/corr_init.cpp | 8 + src/test/utest/corr_init.cpp | 8 + 4 files changed, 163 insertions(+) create mode 100644 include/private/dsp/arch/aarch64/asimd/correlation.h diff --git a/include/private/dsp/arch/aarch64/asimd/correlation.h b/include/private/dsp/arch/aarch64/asimd/correlation.h new file mode 100644 index 00000000..c3d6e6b1 --- /dev/null +++ b/include/private/dsp/arch/aarch64/asimd/correlation.h @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2024 
Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL */ + +namespace lsp +{ + namespace asimd + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + ARCH_AARCH64_ASM( + __ASM_EMIT("eor v0.16b, v0.16b, v0.16b") /* xv = 0 */ + __ASM_EMIT("eor v1.16b, v1.16b, v1.16b") /* xa = 0 */ + __ASM_EMIT("eor v2.16b, v2.16b, v2.16b") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("subs %[count], %[count], #16") + __ASM_EMIT("b.lo 2f") + __ASM_EMIT("eor v3.16b, v3.16b, v3.16b") /* xv = 0 */ + __ASM_EMIT("eor v4.16b, v4.16b, v4.16b") /* xa = 0 */ + __ASM_EMIT("eor v5.16b, v5.16b, v5.16b") /* xb = 0 */ + __ASM_EMIT("1:") + __ASM_EMIT("ldp q8, q9, [%[a], 0x00]") /* v8 = a0, v9 = a1 */ + __ASM_EMIT("ldp q10, q11, [%[a], 0x20]") /* v10 = a2, v11 = a3 */ + __ASM_EMIT("ldp q12, q13, [%[b], 0x00]") /* v12 = b0, v13 = b1 */ + __ASM_EMIT("ldp q14, q15, [%[b], 0x20]") /* v14 = b2, v15 = b3 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v3.4s, v9.4s, 
v13.4s") /* v3 = xv + a1*b1 */ + __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v4.4s, v9.4s, v9.4s") /* v4 = xa + a1*a1 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("fmla v5.4s, v13.4s, v13.4s") /* v5 = xb + b1*b1 */ + __ASM_EMIT("fmla v0.4s, v10.4s, v14.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v3.4s, v11.4s, v15.4s") /* v3 = xv + a1*b1 */ + __ASM_EMIT("fmla v1.4s, v10.4s, v10.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v4.4s, v11.4s, v11.4s") /* v4 = xa + a1*a1 */ + __ASM_EMIT("fmla v2.4s, v14.4s, v14.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("fmla v5.4s, v15.4s, v15.4s") /* v5 = xb + b1*b1 */ + __ASM_EMIT("subs %[count], %[count], #16") + __ASM_EMIT("add %[a], %[a], #0x40") + __ASM_EMIT("add %[b], %[b], #0x40") + __ASM_EMIT("b.hs 1b") + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") + __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("adds %[count], %[count], #8") + __ASM_EMIT("b.lt 4f") + __ASM_EMIT("ldp q8, q9, [%[a], 0x00]") /* v8 = a0, v9 = a1 */ + __ASM_EMIT("ldp q12, q13, [%[b], 0x00]") /* v12 = b0, v13 = b1 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("fmla v0.4s, v9.4s, v13.4s") /* v0 = xv + a1*b1 */ + __ASM_EMIT("fmla v1.4s, v9.4s, v9.4s") /* v1 = xa + a1*a1 */ + __ASM_EMIT("fmla v2.4s, v13.4s, v13.4s") /* v2 = xb + b1*b1 */ + __ASM_EMIT("sub %[count], %[count], #8") + __ASM_EMIT("add %[a], %[a], #0x20") + __ASM_EMIT("add %[b], %[b], #0x20") + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("adds %[count], %[count], #4") + __ASM_EMIT("b.lt 6f") + __ASM_EMIT("ldr q8, [%[a], 0x00]") /* v8 = a0, v9 = a1 */ + __ASM_EMIT("ldr q12, [%[b], 0x00]") /* v12 = b0, v13 = b1 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla 
v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("sub %[count], %[count], #4") + __ASM_EMIT("add %[a], %[a], #0x10") + __ASM_EMIT("add %[b], %[b], #0x10") + __ASM_EMIT("6:") + __ASM_EMIT("eor v6.16b, v6.16b, v6.16b") /* v6 = 0 */ + __ASM_EMIT("ext v3.16b, v0.16b, v6.16b, #8")/* v3 = xv2 xv3 0 0 */ + __ASM_EMIT("ext v4.16b, v1.16b, v6.16b, #8")/* v4 = xa2 xb3 0 0 */ + __ASM_EMIT("ext v5.16b, v2.16b, v6.16b, #8")/* v5 = xa2 xb3 0 0 */ + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") /* v0 = xv0+xv2 xv1+xv3 xv2 xv3 */ + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") /* v1 = xa0+xa2 xa1+xa3 xa2 xa3 */ + __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") /* v2 = xb0+xb2 xb1+xb3 xb2 xb3 */ + __ASM_EMIT("ext v3.16b, v0.16b, v6.16b, #4")/* v3 = xv1+xv3 xv2 xv3 0 */ + __ASM_EMIT("ext v4.16b, v1.16b, v6.16b, #4")/* v4 = xa1+xa3 xa2 xa3 0 */ + __ASM_EMIT("ext v5.16b, v2.16b, v6.16b, #4")/* v5 = xb1+xb3 xb2 xb3 0 */ + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") /* v0 = xv0+xv1+xv2+xv3 xv1+xv2+xv3 xv2+xv3 xv3 */ + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") /* v1 = xa0+xa1+xa2+xa3 xa1+xa2+xa3 xa2+xa3 xa3 */ + __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") /* v2 = xb0+xb1+xb2+xb3 xb1+xb2+xb3 xb2+xb3 xb3 */ + /* 1x blocks */ + __ASM_EMIT("adds %[count], %[count], #3") + __ASM_EMIT("b.lt 8f") + __ASM_EMIT("7:") + __ASM_EMIT("ld1r {v8.4s}, [%[a]]") /* v8 = a0 */ + __ASM_EMIT("ld1r {v12.4s}, [%[b]]") /* q12 = b0 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("subs %[count], %[count], #1") + __ASM_EMIT("add %[a], %[a], #0x04") + __ASM_EMIT("add %[b], %[b], #0x04") + __ASM_EMIT("b.ge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("ld3r {v3.4s, v4.4s, v5.4s}, [%[corr]]") /* v3 = v, v4 = a, v5 = b */ + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") + 
__ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") + __ASM_EMIT("st3 {v0.s, v1.s, v2.s}[0], [%[corr]]") + + : [a] "+r" (a), [b] "+r" (b), [count] "+r" (count) + : [corr] "r" (corr) + : "cc", "memory", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); + } + + } /* namespace asimd */ +} /* namespace lsp */ + + + +#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_ */ diff --git a/src/main/aarch64/asimd.cpp b/src/main/aarch64/asimd.cpp index 417c89af..f69b3b98 100644 --- a/src/main/aarch64/asimd.cpp +++ b/src/main/aarch64/asimd.cpp @@ -52,6 +52,7 @@ #define PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL #include #include + #include #include #include #include @@ -429,6 +430,7 @@ EXPORT1(downsample_8x); EXPORT1(convolve); + EXPORT1(corr_init); EXPORT1(abgr32_to_bgrff32); EXPORT1(rgba32_to_bgra32); diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp index 3f42b0a2..582be71b 100644 --- a/src/test/ptest/corr_init.cpp +++ b/src/test/ptest/corr_init.cpp @@ -61,6 +61,13 @@ namespace lsp } ) + IF_ARCH_AARCH64( + namespace asimd + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + ) + typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } @@ -115,6 +122,7 @@ PTEST_BEGIN("dsp", corr_init, 5, 10000) IF_ARCH_X86(CALL(avx512::corr_init, count)); IF_ARCH_X86(CALL(avx512::corr_init_fma3, count)); IF_ARCH_ARM(CALL(neon_d32::corr_init, count)); + IF_ARCH_AARCH64(CALL(asimd::corr_init, count)); PTEST_SEPARATOR; } diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp index 3b18e724..62ecb367 100644 --- a/src/test/utest/corr_init.cpp +++ b/src/test/utest/corr_init.cpp @@ -58,6 +58,13 @@ namespace lsp } ) + IF_ARCH_AARCH64( + namespace asimd + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); + } + ) + static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, 
size_t count) { float vv = 0.0f; @@ -133,6 +140,7 @@ UTEST_BEGIN("dsp", corr_init) IF_ARCH_X86(CALL(avx512::corr_init, 64)); IF_ARCH_X86(CALL(avx512::corr_init_fma3, 64)); IF_ARCH_ARM(CALL(neon_d32::corr_init, 16)); + IF_ARCH_AARCH64(CALL(asimd::corr_init, 16)); } UTEST_END; From e240d12e64a593cd1372b6e58de7e775008ed5e2 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sun, 10 Mar 2024 10:52:41 +0300 Subject: [PATCH 11/22] Implemented SSE-optimized and SSE3-optimized corr_incr functions --- .../private/dsp/arch/x86/sse/correlation.h | 180 ++++++++++++++++++ .../private/dsp/arch/x86/sse3/correlation.h | 179 +++++++++++++++++ src/main/x86/sse.cpp | 1 + src/main/x86/sse3.cpp | 3 + src/test/ptest/corr_incr.cpp | 128 +++++++++++++ src/test/utest/corr_incr.cpp | 39 +++- 6 files changed, 521 insertions(+), 9 deletions(-) create mode 100644 include/private/dsp/arch/x86/sse3/correlation.h create mode 100644 src/test/ptest/corr_incr.cpp diff --git a/include/private/dsp/arch/x86/sse/correlation.h b/include/private/dsp/arch/x86/sse/correlation.h index 6cd4bb4f..e40d11a3 100644 --- a/include/private/dsp/arch/x86/sse/correlation.h +++ b/include/private/dsp/arch/x86/sse/correlation.h @@ -136,6 +136,186 @@ namespace lsp ); } + static const float corr_const[] __lsp_aligned16 = + { + LSP_DSP_VEC4(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 4x blocks */ + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("movups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("movups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = ah */ + __ASM_EMIT("movups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("movups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("movaps %%xmm3, 
%%xmm5") /* xmm5 = at */ + __ASM_EMIT("mulps %%xmm1, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("mulps %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("mulps %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("mulps %%xmm4, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("subps %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("subps %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + __ASM_EMIT("subps %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + + __ASM_EMIT("xorps %%xmm3, %%xmm3") /* xmm3 = 0 0 0 0 */ + __ASM_EMIT("xorps %%xmm4, %%xmm4") /* xmm4 = 0 0 0 0 */ + __ASM_EMIT("xorps %%xmm5, %%xmm5") /* xmm5 = 0 0 0 0 */ + __ASM_EMIT("movlhps %%xmm0, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("movlhps %%xmm1, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("movlhps %%xmm2, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("addps %%xmm3, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("shufps $0x99, %%xmm0, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("shufps $0x99, %%xmm1, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("shufps $0x99, %%xmm2, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("addps %%xmm0, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("addps %%xmm1, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("addps %%xmm2, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("movss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("movss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("movss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("movss 0x00(%[corr]), 
%%xmm0") + __ASM_EMIT64("movss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("movss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0") /* xmm0 = xv xv xv xv */ + __ASM_EMIT("shufps $0x00, %%xmm1, %%xmm1") /* xmm1 = xa xa xa xa */ + __ASM_EMIT("shufps $0x00, %%xmm2, %%xmm2") /* xmm2 = xb xb xb xb */ + __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("addps %%xmm4, %%xmm2") /* xmm2 = BB = xb+DV[0] xb+DV[0]+DV[1] xb+DV[0]+DV[1]+DV[2] xb+DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("movaps %%xmm1, %%xmm6") /* xmm6 = BA */ + __ASM_EMIT("addps %%xmm5, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("mulps %%xmm2, %%xmm1") /* xmm1 = B = BA*BB */ + __ASM_EMIT("movaps %%xmm0, %%xmm5") /* xmm5 = T */ + __ASM_EMIT("shufps $0xff, %%xmm6, %%xmm6") /* xmm6 = BA[3] BA[3] BA[3] BA[3] */ + __ASM_EMIT("shufps $0xff, %%xmm5, %%xmm5") /* xmm5 = T[3] T[3] T[3] T[3] */ + __ASM_EMIT("shufps $0xff, %%xmm2, %%xmm2") /* xmm7 = BB[3] BB[3] BB[3] BB[3] */ + + __ASM_EMIT32("movss %%xmm5, 0x00(%[ptr])") + __ASM_EMIT32("movss %%xmm6, 0x04(%[ptr])") + __ASM_EMIT32("movss %%xmm2, 0x08(%[ptr])") + __ASM_EMIT64("movss %%xmm5, 0x00(%[corr])") + __ASM_EMIT64("movss %%xmm6, 0x04(%[corr])") + __ASM_EMIT64("movss %%xmm2, 0x08(%[corr])") + + __ASM_EMIT("sqrtps %%xmm1, %%xmm7") /* xmm7 = sqrtf(B) */ + __ASM_EMIT("cmpps $5, %[CORR_CC], %%xmm1") /* xmm1 = B >= 1e-10f */ + __ASM_EMIT("divps %%xmm7, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("andps %%xmm1, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT32("movups %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("movups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x10, %[ptr]") + __ASM_EMIT64("add $0x10, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("3:") + __ASM_EMIT("movss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("movss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("movss %%xmm0, %%xmm2") /* xmm2 = ah */ + __ASM_EMIT("movss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("movss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("movss %%xmm3, %%xmm5") /* xmm5 = at */ + __ASM_EMIT("mulss %%xmm1, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("mulss %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("mulss %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("mulss %%xmm4, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("mulss %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("subss %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("subss %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + __ASM_EMIT("subss %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("movss 0x00(%[ptr]), %%xmm3") + __ASM_EMIT32("movss 0x04(%[ptr]), %%xmm4") + __ASM_EMIT32("movss 0x08(%[ptr]), %%xmm5") + __ASM_EMIT64("movss 0x00(%[corr]), %%xmm3") + __ASM_EMIT64("movss 0x04(%[corr]), %%xmm4") + __ASM_EMIT64("movss 0x08(%[corr]), %%xmm5") + + __ASM_EMIT("addss %%xmm4, %%xmm0") /* xmm0 = BA = xa+DA */ + __ASM_EMIT("addss %%xmm3, %%xmm2") /* xmm2 = T = xv+DV */ + __ASM_EMIT("movaps %%xmm0, 
%%xmm3") /* xmm3 = BA */ + __ASM_EMIT("addss %%xmm5, %%xmm1") /* xmm1 = BB = xb+DB */ + __ASM_EMIT("mulss %%xmm1, %%xmm3") /* xmm3 = B = BA*BB */ + + __ASM_EMIT32("movss %%xmm2, 0x00(%[ptr])") + __ASM_EMIT32("movss %%xmm0, 0x04(%[ptr])") + __ASM_EMIT32("movss %%xmm1, 0x08(%[ptr])") + __ASM_EMIT64("movss %%xmm2, 0x00(%[corr])") + __ASM_EMIT64("movss %%xmm0, 0x04(%[corr])") + __ASM_EMIT64("movss %%xmm1, 0x08(%[corr])") + + __ASM_EMIT("sqrtss %%xmm3, %%xmm7") /* xmm7 = sqrtf(B) */ + __ASM_EMIT("cmpss $5, %[CORR_CC], %%xmm3") /* xmm3 = B >= 1e-10f */ + __ASM_EMIT("divss %%xmm7, %%xmm2") /* xmm2 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("andps %%xmm3, %%xmm2") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("movss %%xmm2, 0x00(%[ptr])") + __ASM_EMIT64("movss %%xmm2, 0x00(%[dst])") + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 3b") + __ASM_EMIT("4:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + } /* namespace sse */ } /* namespace lsp */ diff --git a/include/private/dsp/arch/x86/sse3/correlation.h b/include/private/dsp/arch/x86/sse3/correlation.h new file mode 100644 index 00000000..dd90428d --- /dev/null +++ 
b/include/private/dsp/arch/x86/sse3/correlation.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 10 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE3_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_SSE3_IMPL */ + +namespace lsp +{ + namespace sse3 + { + static const float corr_const[] __lsp_aligned16 = + { + LSP_DSP_VEC4(1e-10f) + }; + + void x64_corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + ARCH_X86_64_ASM + ( + /* load data */ + __ASM_EMIT("movss 0x00(%[corr]), %%xmm8") /* xmm8 = xv */ + __ASM_EMIT("movss 0x04(%[corr]), %%xmm9") /* xmm9 = xa */ + __ASM_EMIT("movss 0x08(%[corr]), %%xmm10") /* xmm10 = xb */ + __ASM_EMIT("movaps %[CORR_CC], %%xmm11") /* xmm11 = 1e-10f */ + /* 4x blocks */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("shufps $0x00, %%xmm8, %%xmm8") + __ASM_EMIT("shufps $0x00, %%xmm9, %%xmm9") + __ASM_EMIT("shufps $0x00, %%xmm10, %%xmm10") + __ASM_EMIT(".align 0x10") + __ASM_EMIT("1:") + __ASM_EMIT("movups 0x00(%[a_head]), %%xmm0") /* 
xmm0 = ah */ + __ASM_EMIT("movups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = ah */ + __ASM_EMIT("movups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("movups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("movaps %%xmm3, %%xmm5") /* xmm5 = at */ + __ASM_EMIT("mulps %%xmm1, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("mulps %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("mulps %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("mulps %%xmm4, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("subps %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("subps %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + __ASM_EMIT("subps %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + + __ASM_EMIT("xorps %%xmm3, %%xmm3") /* xmm3 = 0 0 0 0 */ + __ASM_EMIT("xorps %%xmm4, %%xmm4") /* xmm4 = 0 0 0 0 */ + __ASM_EMIT("xorps %%xmm5, %%xmm5") /* xmm5 = 0 0 0 0 */ + __ASM_EMIT("movlhps %%xmm0, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("movlhps %%xmm1, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("movlhps %%xmm2, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("addps %%xmm3, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("shufps $0x99, %%xmm0, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("shufps $0x99, %%xmm1, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("shufps $0x99, %%xmm2, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("addps %%xmm0, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("addps %%xmm1, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("addps %%xmm2, %%xmm5") 
/* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + + __ASM_EMIT("addps %%xmm3, %%xmm9") /* xmm9 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("addps %%xmm4, %%xmm10") /* xmm10 = BB = xb+DV[0] xb+DV[0]+DV[1] xb+DV[0]+DV[1]+DV[2] xb+DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("movaps %%xmm9, %%xmm6") /* xmm6 = BA */ + __ASM_EMIT("addps %%xmm5, %%xmm8") /* xmm8 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("mulps %%xmm10, %%xmm6") /* xmm6 = B = BA*BB */ + __ASM_EMIT("movaps %%xmm8, %%xmm5") /* xmm5 = T */ + __ASM_EMIT("shufps $0xff, %%xmm8, %%xmm8") /* xmm8 = T[3] T[3] T[3] T[3] */ + __ASM_EMIT("shufps $0xff, %%xmm9, %%xmm9") /* xmm9 = BA[3] BA[3] BA[3] BA[3] */ + __ASM_EMIT("shufps $0xff, %%xmm10, %%xmm10") /* xmm10 = BB[3] BB[3] BB[3] BB[3] */ + + __ASM_EMIT("sqrtps %%xmm6, %%xmm7") /* xmm7 = sqrtf(B) */ + __ASM_EMIT("cmpps $5, %%xmm11, %%xmm6") /* xmm6 = B >= 1e-10f */ + __ASM_EMIT("divps %%xmm7, %%xmm5") /* xmm5 = T/sqrtf(B) */ + __ASM_EMIT("andps %%xmm6, %%xmm5") /* xmm5 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT("movups %%xmm5, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("3:") + __ASM_EMIT("movss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("movss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("movss %%xmm0, %%xmm2") /* xmm2 = ah */ + __ASM_EMIT("movss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("movss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("movss %%xmm3, %%xmm5") /* xmm5 = at */ + __ASM_EMIT("mulss %%xmm1, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("mulss %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("mulss %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("mulss %%xmm4, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("mulss %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("subss %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("subss %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + __ASM_EMIT("subss %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + + __ASM_EMIT("addss %%xmm0, %%xmm9") /* xmm9 = BA = xa+DA */ + __ASM_EMIT("addss %%xmm1, %%xmm10") /* xmm10 = BB = xb+DB */ + __ASM_EMIT("movaps %%xmm9, %%xmm3") /* xmm3 = BA */ + __ASM_EMIT("addss %%xmm2, %%xmm8") /* xmm8 = T = xv+DV */ + __ASM_EMIT("mulss %%xmm1, %%xmm3") /* xmm3 = B = BA*BB */ + __ASM_EMIT("movaps %%xmm8, %%xmm0") /* xmm0 = T */ + + __ASM_EMIT("sqrtss %%xmm3, %%xmm7") /* xmm7 = sqrtf(B) */ + __ASM_EMIT("cmpss $5, %[CORR_CC], %%xmm3") /* xmm3 = B >= 1e-10f */ + __ASM_EMIT("divss %%xmm7, %%xmm0") /* xmm2 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("andps %%xmm3, %%xmm0") /* xmm0 = 
(B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT("movss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT("add $0x04, %[dst]") + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 3b") + __ASM_EMIT("4:") + + /* Store result */ + __ASM_EMIT("movss %%xmm8, 0x00(%[corr])") + __ASM_EMIT("movss %%xmm9, 0x04(%[corr])") + __ASM_EMIT("movss %%xmm10, 0x08(%[corr])") + + : [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + : [corr] "r" (corr), + [CORR_CC] "o" (corr_const) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm9", "%xmm10" + ); + } + + } /* namespace sse3 */ +} /* namespace lsp */ + + + + +#endif /* PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_ */ diff --git a/src/main/x86/sse.cpp b/src/main/x86/sse.cpp index 676aea07..7211c429 100644 --- a/src/main/x86/sse.cpp +++ b/src/main/x86/sse.cpp @@ -494,6 +494,7 @@ EXPORT1(convolve); EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(lin_inter_set); EXPORT1(lin_inter_mul2); diff --git a/src/main/x86/sse3.cpp b/src/main/x86/sse3.cpp index 5d59d951..78b9be9c 100644 --- a/src/main/x86/sse3.cpp +++ b/src/main/x86/sse3.cpp @@ -48,6 +48,7 @@ #include #include #include + #include #undef PRIVATE_DSP_ARCH_X86_SSE3_IMPL namespace lsp @@ -115,6 +116,8 @@ EXPORT1(split_triangle_raw); EXPORT1(cull_triangle_raw); + + EXPORT2_X64(corr_incr, x64_corr_incr); } #undef EXPORT2 diff --git a/src/test/ptest/corr_incr.cpp b/src/test/ptest/corr_incr.cpp new file mode 100644 index 00000000..4aa9f553 --- /dev/null +++ b/src/test/ptest/corr_incr.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 
10 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#include +#include +#include +#include +#include + +#define MIN_RANK 8 +#define MAX_RANK 15 + +namespace lsp +{ + namespace generic + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + + IF_ARCH_X86( + namespace sse + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + + IF_ARCH_X86_64( + namespace sse3 + { + void x64_corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + + typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); +} + +//----------------------------------------------------------------------------- +// Performance test for lanczos resampling +PTEST_BEGIN("dsp", corr_incr, 5, 10000) + + void call(const char *label, + float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count, corr_incr_t func) + { + if (!PTEST_SUPPORTED(func)) + return; + + char buf[80]; + sprintf(buf, "%s x %d", label, int(count)); + 
printf("Testing %s correlation ...\n", buf); + + dsp::correlation_t corr; + + PTEST_LOOP(buf, + corr.v = 0.0f; + corr.a = 0.0f; + corr.b = 0.0f; + + func(&corr, dst, a_head, b_head, a_tail, b_tail, count); + ); + } + + PTEST_MAIN + { + size_t buf_size = 1 << MAX_RANK; + uint8_t *data = NULL; + float *a_head = alloc_aligned(data, buf_size * 5, 64); + float *a_tail = &a_head[buf_size]; + float *b_head = &a_tail[buf_size]; + float *b_tail = &b_head[buf_size]; + float *dst = &b_tail[buf_size]; + lsp_finally { + free_aligned(data); + }; + + for (size_t i=0; i < buf_size*5; ++i) + a_head[i] = randf(-1.0f, 1.0f); + + #define CALL(func, count) \ + call(#func, dst, a_head, b_head, a_tail, b_tail, count, func) + + for (size_t i=MIN_RANK; i<=MAX_RANK; ++i) + { + const size_t count = 1 << i; + + CALL(generic::corr_incr, count); + IF_ARCH_X86(CALL(sse::corr_incr, count)); + IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, count)); + + PTEST_SEPARATOR; + } + } + +PTEST_END + + diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp index 9d019c24..100cb79c 100644 --- a/src/test/utest/corr_incr.cpp +++ b/src/test/utest/corr_incr.cpp @@ -35,6 +35,26 @@ namespace lsp size_t count); } + IF_ARCH_X86( + namespace sse + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + + IF_ARCH_X86_64( + namespace sse3 + { + void x64_corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + static void corr_incr(dsp::correlation_t *corr, float *dst, const float *a_head, const float *b_head, const float *a_tail, const float *b_tail, @@ -78,7 +98,7 @@ UTEST_BEGIN("dsp", corr_incr) for (size_t mask=0; mask <= 0x07; ++mask) { - UTEST_FOREACH(tail, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff) + UTEST_FOREACH(tail, 0x80, 0x1ff) { UTEST_FOREACH(count, 0, 1, 2, 3, 
4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff) { @@ -88,9 +108,7 @@ UTEST_BEGIN("dsp", corr_incr) FloatBuffer dst2(count, align, mask & 0x04); dsp::correlation_t corr_a, corr_b; - corr_a.v = randf(-1.0f, 1.0f); - corr_a.a = randf(0.0f, 1.0f); - corr_a.b = randf(0.0f, 1.0f); + dsp::corr_init(&corr_a, a, b, tail); corr_b = corr_a; printf("Tesing %s correlation tail=%d on buffer count=%d mask=0x%x\n", label, int(tail), int(count), int(mask)); @@ -109,19 +127,20 @@ UTEST_BEGIN("dsp", corr_incr) UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted"); // Compare buffers - if (!dst1.equals_relative(dst2, 1e-5)) + if (!dst1.equals_adaptive(dst2, 1e-4)) { a.dump("a "); b.dump("b "); dst1.dump("dst1"); dst2.dump("dst2"); - UTEST_FAIL_MSG("Output of functions for test '%s' differs", label); + UTEST_FAIL_MSG("Output of functions for test '%s' differs at index %d, value=%f vs %f", + label, int(dst1.last_diff()), dst1.get(dst1.last_diff()), dst2.get(dst1.last_diff())); } // Compare state - if ((!float_equals_adaptive(corr_a.v, corr_b.v)) || - (!float_equals_adaptive(corr_a.a, corr_b.a)) || - (!float_equals_adaptive(corr_a.b, corr_b.b))) + if ((!float_equals_adaptive(corr_a.v, corr_b.v, 1e-5)) || + (!float_equals_adaptive(corr_a.a, corr_b.a, 1e-5)) || + (!float_equals_adaptive(corr_a.b, corr_b.b, 1e-5))) { UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}", corr_a.v, corr_a.a, corr_a.b, @@ -138,6 +157,8 @@ UTEST_BEGIN("dsp", corr_incr) call(#func, align, func) CALL(generic::corr_incr, 16); + IF_ARCH_X86(CALL(sse::corr_incr, 16)); + IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, 16)); } UTEST_END; From a2de835f186c25f685c966b1284e8c7febe40385 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Sun, 10 Mar 2024 14:52:14 +0300 Subject: [PATCH 12/22] AVX optimizations of correlation function --- .../private/dsp/arch/x86/avx/correlation.h | 496 ++++++++++++++++++ .../private/dsp/arch/x86/sse3/correlation.h | 2 +- src/main/x86/avx.cpp | 2 + 
src/test/ptest/corr_incr.cpp | 15 + src/test/utest/corr_incr.cpp | 25 +- 5 files changed, 537 insertions(+), 3 deletions(-) diff --git a/include/private/dsp/arch/x86/avx/correlation.h b/include/private/dsp/arch/x86/avx/correlation.h index 03275df9..3aa8c2ff 100644 --- a/include/private/dsp/arch/x86/avx/correlation.h +++ b/include/private/dsp/arch/x86/avx/correlation.h @@ -247,6 +247,502 @@ namespace lsp ); } + static const float corr_const[] __lsp_aligned32 = + { + LSP_DSP_VEC8(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 8x blocks */ + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */ + __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */ + __ASM_EMIT("vmulps %%ymm4, %%ymm3, %%ymm5") /* ymm5 = at*bt */ + __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */ + __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */ + __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = at*at */ + __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = bt*bt */ + __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vsubps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vsubps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] 
*/ + __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ 
+ __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0") + 
__ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2") + + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */ + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, 
%%ymm1")/* ymm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x20, %[ptr]") + __ASM_EMIT64("add $0x20, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulps %%xmm4, %%xmm3, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vsubps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vsubps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vsubps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vmovlhps %%xmm2, 
%%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] 
xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x10, %[ptr]") + __ASM_EMIT64("add $0x10, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("4:") + + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("5:") + __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulss %%xmm4, %%xmm3, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("vmulss %%xmm4, %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("vsubss %%xmm5, %%xmm2, %%xmm5") /* xmm5 = DV = ah*bh - at*bt */ + __ASM_EMIT("vsubss %%xmm3, %%xmm0, %%xmm3") /* xmm3 = DA = ah*ah - at*at */ + __ASM_EMIT("vsubss %%xmm4, %%xmm1, %%xmm4") /* xmm4 = DB = bh*bh - bt*bt */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA */ + __ASM_EMIT("vaddss %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB */ + __ASM_EMIT("vaddss %%xmm5, %%xmm0, %%xmm0") /* 
xmm0 = T = xv+DV */ + __ASM_EMIT("vmulss %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm1, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm2, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm2, 0x08(%[corr])") + + __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivss %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 5b") + __ASM_EMIT("6:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + void corr_incr_fma3(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 8x blocks 
*/ + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */ + __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */ + __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */ + __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */ + __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm3, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%ymm3, %%ymm3, %%ymm0") /* ymm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 
DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] 
DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2") + + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] 
xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */ + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1")/* ymm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x20, %[ptr]") + __ASM_EMIT64("add $0x20, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] 
DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] 
xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x10, %[ptr]") + __ASM_EMIT64("add $0x10, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("4:") + + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("5:") + __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ss %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm3") + __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm4") + __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm5") + __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm3") + __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm4") + __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm5") + + __ASM_EMIT("vaddss %%xmm3, %%xmm2, %%xmm2") /* xmm2 = T = xv+DV */ + __ASM_EMIT("vaddss %%xmm4, %%xmm0, %%xmm0") /* xmm0 = BA = xa+DA */ + __ASM_EMIT("vaddss %%xmm5, %%xmm1, %%xmm1") /* xmm1 = BB = xb+DB */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm7") /* xmm7 = B = BA*BB */ + + __ASM_EMIT32("vmovss %%xmm2, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm0, 
0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm1, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm2, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm0, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm1, 0x08(%[corr])") + + __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivss %%xmm6, %%xmm2, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 5b") + __ASM_EMIT("6:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + } /* namespace avx */ } /* namespace lsp */ diff --git a/include/private/dsp/arch/x86/sse3/correlation.h b/include/private/dsp/arch/x86/sse3/correlation.h index dd90428d..14be34ed 100644 --- a/include/private/dsp/arch/x86/sse3/correlation.h +++ b/include/private/dsp/arch/x86/sse3/correlation.h @@ -134,7 +134,7 @@ namespace lsp __ASM_EMIT("addss %%xmm1, %%xmm10") /* xmm10 = BB = xb+DB */ 
__ASM_EMIT("movaps %%xmm9, %%xmm3") /* xmm3 = BA */ __ASM_EMIT("addss %%xmm2, %%xmm8") /* xmm8 = T = xv+DV */ - __ASM_EMIT("mulss %%xmm1, %%xmm3") /* xmm3 = B = BA*BB */ + __ASM_EMIT("mulss %%xmm10, %%xmm3") /* xmm3 = B = BA*BB */ __ASM_EMIT("movaps %%xmm8, %%xmm0") /* xmm0 = T */ __ASM_EMIT("sqrtss %%xmm3, %%xmm7") /* xmm7 = sqrtf(B) */ diff --git a/src/main/x86/avx.cpp b/src/main/x86/avx.cpp index eff3d99f..54e517a7 100644 --- a/src/main/x86/avx.cpp +++ b/src/main/x86/avx.cpp @@ -393,6 +393,7 @@ CEXPORT1(favx, convolve); CEXPORT1(favx, corr_init); + CEXPORT1(favx, corr_incr); CEXPORT1(favx, lin_inter_set); CEXPORT1(favx, lin_inter_mul2); @@ -486,6 +487,7 @@ CEXPORT2(favx, convolve, convolve_fma3); CEXPORT2(favx, corr_init, corr_init_fma3); + CEXPORT2(favx, corr_incr, corr_incr_fma3); CEXPORT2(favx, axis_apply_lin1, axis_apply_lin1_fma3); diff --git a/src/test/ptest/corr_incr.cpp b/src/test/ptest/corr_incr.cpp index 4aa9f553..8c3bf735 100644 --- a/src/test/ptest/corr_incr.cpp +++ b/src/test/ptest/corr_incr.cpp @@ -46,6 +46,19 @@ namespace lsp const float *a_tail, const float *b_tail, size_t count); } + + namespace avx + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + + void corr_incr_fma3(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } ) IF_ARCH_X86_64( @@ -118,6 +131,8 @@ PTEST_BEGIN("dsp", corr_incr, 5, 10000) CALL(generic::corr_incr, count); IF_ARCH_X86(CALL(sse::corr_incr, count)); IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, count)); + IF_ARCH_X86(CALL(avx::corr_incr, count)); + IF_ARCH_X86(CALL(avx::corr_incr_fma3, count)); PTEST_SEPARATOR; } diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp index 100cb79c..935494c6 100644 --- a/src/test/utest/corr_incr.cpp +++ b/src/test/utest/corr_incr.cpp @@ -43,6 +43,19 @@ namespace lsp 
const float *a_tail, const float *b_tail, size_t count); } + + namespace avx + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + + void corr_incr_fma3(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } ) IF_ARCH_X86_64( @@ -108,6 +121,9 @@ UTEST_BEGIN("dsp", corr_incr) FloatBuffer dst2(count, align, mask & 0x04); dsp::correlation_t corr_a, corr_b; + corr_a.v = 0.0f; + corr_a.a = 0.0f; + corr_a.b = 0.0f; dsp::corr_init(&corr_a, a, b, tail); corr_b = corr_a; @@ -133,8 +149,11 @@ UTEST_BEGIN("dsp", corr_incr) b.dump("b "); dst1.dump("dst1"); dst2.dump("dst2"); - UTEST_FAIL_MSG("Output of functions for test '%s' differs at index %d, value=%f vs %f", - label, int(dst1.last_diff()), dst1.get(dst1.last_diff()), dst2.get(dst1.last_diff())); + UTEST_FAIL_MSG("Output of functions for test '%s' differs at index %d, value=%f vs %f\n" + "correlation state a={%f, %f, %f}, b={%f, %f, %f}", + label, int(dst1.last_diff()), dst1.get(dst1.last_diff()), dst2.get(dst1.last_diff()), + corr_a.v, corr_a.a, corr_a.b, + corr_b.v, corr_b.a, corr_b.b); } // Compare state @@ -159,6 +178,8 @@ UTEST_BEGIN("dsp", corr_incr) CALL(generic::corr_incr, 16); IF_ARCH_X86(CALL(sse::corr_incr, 16)); IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, 16)); + IF_ARCH_X86(CALL(avx::corr_incr, 32)); + IF_ARCH_X86(CALL(avx::corr_incr_fma3, 32)); } UTEST_END; From 7ed2740cfe01368a65e95b1b163286ded40e9d0a Mon Sep 17 00:00:00 2001 From: sadko4u Date: Mon, 11 Mar 2024 01:06:49 +0300 Subject: [PATCH 13/22] AVX-512 implementation of corr_incr --- .../private/dsp/arch/x86/avx512/correlation.h | 494 ++++++++++++------ src/main/x86/avx512.cpp | 2 +- src/test/ptest/corr_incr.cpp | 9 + src/test/ptest/corr_init.cpp | 2 - src/test/utest/corr_incr.cpp | 9 + src/test/utest/corr_init.cpp | 2 - 6 files changed, 367 insertions(+), 
151 deletions(-) diff --git a/include/private/dsp/arch/x86/avx512/correlation.h b/include/private/dsp/arch/x86/avx512/correlation.h index c16146fa..3321bf26 100644 --- a/include/private/dsp/arch/x86/avx512/correlation.h +++ b/include/private/dsp/arch/x86/avx512/correlation.h @@ -30,158 +30,12 @@ namespace lsp { namespace avx512 { - void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) { IF_ARCH_X86( size_t off; ); - ARCH_X86_ASM - ( - __ASM_EMIT("xor %[off], %[off]") - __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") /* xv = 0 */ - __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") /* xa = 0 */ - __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") /* xb = 0 */ - /* 32x blocks */ - __ASM_EMIT("sub $32, %[count]") - __ASM_EMIT("jb 2f") - __ASM_EMIT("1:") - __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm3") /* zmm3 = a0 */ - __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm4") /* zmm4 = a1 */ - __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%zmm5") /* zmm5 = b0 */ - __ASM_EMIT("vmovups 0x40(%[b], %[off]), %%zmm6") /* zmm6 = b1 */ - __ASM_EMIT("vmulps %%zmm3, %%zmm5, %%zmm7") /* zmm7 = a0*b0 */ - __ASM_EMIT("vmulps %%zmm3, %%zmm3, %%zmm3") /* zmm3 = a0*a0 */ - __ASM_EMIT("vmulps %%zmm5, %%zmm5, %%zmm5") /* zmm5 = b0*b0 */ - __ASM_EMIT("vaddps %%zmm7, %%zmm0, %%zmm0") /* zmm0 = xv + a0*b0 */ - __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1") /* zmm1 = xa + a0*a0 */ - __ASM_EMIT("vaddps %%zmm5, %%zmm2, %%zmm2") /* zmm2 = xb + b0*b0 */ - __ASM_EMIT("vmulps %%zmm4, %%zmm6, %%zmm7") /* zmm7 = a0*b0 */ - __ASM_EMIT("vmulps %%zmm4, %%zmm4, %%zmm4") /* zmm4 = a0*a0 */ - __ASM_EMIT("vmulps %%zmm6, %%zmm6, %%zmm6") /* zmm6 = b0*b0 */ - __ASM_EMIT("vaddps %%zmm7, %%zmm0, %%zmm0") /* zmm0 = xv + a1*b1 */ - __ASM_EMIT("vaddps %%zmm4, %%zmm1, %%zmm1") /* zmm1 = xa + a1*a1 */ - __ASM_EMIT("vaddps %%zmm6, %%zmm2, %%zmm2") /* zmm2 = xb + b1*b1 */ - __ASM_EMIT("add $0x80, %[off]") /* ++off */ - __ASM_EMIT("sub $32, %[count]") - __ASM_EMIT("jae 1b") - __ASM_EMIT("vextractf64x4 $1, %%zmm0, 
%%ymm4") - __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm5") - __ASM_EMIT("vextractf64x4 $1, %%zmm2, %%ymm6") - __ASM_EMIT("vaddps %%ymm4, %%ymm0, %%ymm0") - __ASM_EMIT("vaddps %%ymm5, %%ymm1, %%ymm1") - __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") - __ASM_EMIT("2:") - /* 16x blocks */ - __ASM_EMIT("add $16, %[count]") - __ASM_EMIT("jl 4f") - __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ - __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ - __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ - __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ - __ASM_EMIT("vmulps %%ymm3, %%ymm5, %%ymm7") /* ymm7 = a0*b0 */ - __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = a0*a0 */ - __ASM_EMIT("vmulps %%ymm5, %%ymm5, %%ymm5") /* ymm5 = b0*b0 */ - __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a0*b0 */ - __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = xa + a0*a0 */ - __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = xb + b0*b0 */ - __ASM_EMIT("vmulps %%ymm4, %%ymm6, %%ymm7") /* ymm7 = a0*b0 */ - __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = a0*a0 */ - __ASM_EMIT("vmulps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = b0*b0 */ - __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a1*b1 */ - __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = xa + a1*a1 */ - __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = xb + b1*b1 */ - __ASM_EMIT("sub $16, %[count]") - __ASM_EMIT("add $0x40, %[off]") /* ++off */ - __ASM_EMIT("4:") - __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") - __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5") - __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") - __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") - __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") - __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") - /* 8x block */ - __ASM_EMIT("add $8, %[count]") - __ASM_EMIT("jl 6f") - __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ - __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = 
a1 */ - __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ - __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ - __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ - __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ - __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ - __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ - __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ - __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ - __ASM_EMIT("vmulps %%xmm4, %%xmm6, %%xmm7") /* xmm7 = a0*b0 */ - __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = a0*a0 */ - __ASM_EMIT("vmulps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = b0*b0 */ - __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a1*b1 */ - __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = xa + a1*a1 */ - __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = xb + b1*b1 */ - __ASM_EMIT("sub $8, %[count]") - __ASM_EMIT("add $0x20, %[off]") /* ++off */ - __ASM_EMIT("6:") - /* 4x block */ - __ASM_EMIT("add $4, %[count]") - __ASM_EMIT("jl 8f") - __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ - __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ - __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ - __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ - __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ - __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ - __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ - __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ - __ASM_EMIT("sub $4, %[count]") - __ASM_EMIT("add $0x10, %[off]") /* ++off */ - __ASM_EMIT("8:") - /* Do horizontal sum */ - __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ - __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ - __ASM_EMIT("vhaddps %%xmm2, 
%%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ - __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ - __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ - __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ - /* 1x blocks */ - __ASM_EMIT("add $3, %[count]") - __ASM_EMIT("jl 10f") - __ASM_EMIT("9:") - __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ - __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ - __ASM_EMIT("vmulss %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ - __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ - __ASM_EMIT("vmulss %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ - __ASM_EMIT("vaddss %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ - __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ - __ASM_EMIT("vaddss %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ - __ASM_EMIT("add $0x04, %[off]") /* ++off */ - __ASM_EMIT("dec %[count]") - __ASM_EMIT("jge 9b") - __ASM_EMIT("10:") - /* Store result */ - __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") - __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") - __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") - __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") - __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") - __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])") - - : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) - : [a] "r" (a), [b] "r" (b) - : "cc", "memory", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7" - ); - } - - void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count) - { - IF_ARCH_X86( - size_t off; - ); - ARCH_X86_ASM ( __ASM_EMIT("xor %[off], %[off]") @@ -297,6 +151,354 @@ namespace lsp ); } + static const float corr_const[] __lsp_aligned64 = + { + LSP_DSP_VEC16(1e-10f) + }; + + static const uint32_t corr_idx[] __lsp_aligned64 = + { + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 7, 7, 7, 7, // 0 0 A0 A1 + 0, 0, 0, 0, 
3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, // 0 A0 A1 A2 + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 + }; + + static const uint16_t corr_kmask[] = + { + 0xff00, + 0xfff0 + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 16x blocks */ + __ASM_EMIT32("subl $16, %[count]") + __ASM_EMIT64("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("kmovw 0x00 + %[CORR_KMASK], %%k1") + __ASM_EMIT("kmovw 0x02 + %[CORR_KMASK], %%k2") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%zmm0") /* zmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%zmm1") /* zmm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%zmm3") /* zmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%zmm4") /* zmm4 = bt */ + __ASM_EMIT("vmulps %%zmm1, %%zmm0, %%zmm2") /* zmm2 = ah*bh */ + __ASM_EMIT("vmulps %%zmm0, %%zmm0, %%zmm0") /* zmm0 = ah*ah */ + __ASM_EMIT("vmulps %%zmm1, %%zmm1, %%zmm1") /* zmm1 = bh*bh */ + __ASM_EMIT("vxorps %%zmm6, %%zmm6, %%zmm6") /* zmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%zmm4, %%zmm3, %%zmm2") /* zmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%zmm3, %%zmm3, %%zmm0") /* zmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%zmm4, %%zmm4, %%zmm1") /* zmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%zmm0, %%zmm6, %%zmm3") /* zmm3 = 0 0 DA[0] DA[1] ... */ + __ASM_EMIT("vshufps $0x44, %%zmm1, %%zmm6, %%zmm4") /* zmm4 = 0 0 DB[0] DB[1] ... */ + __ASM_EMIT("vshufps $0x44, %%zmm2, %%zmm6, %%zmm5") /* zmm5 = 0 0 DV[0] DV[1] ... */ + __ASM_EMIT("vaddps %%zmm3, %%zmm0, %%zmm0") /* zmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] ... */ + __ASM_EMIT("vaddps %%zmm4, %%zmm1, %%zmm1") /* zmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] ... */ + __ASM_EMIT("vaddps %%zmm5, %%zmm2, %%zmm2") /* zmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] ... 
*/ + __ASM_EMIT("vmovaps 0x00 + %[CORR_IDX], %%zmm6") /* zmm6 = permute mask */ + __ASM_EMIT("vmovaps 0x40 + %[CORR_IDX], %%zmm7") /* zmm7 = permute mask */ + __ASM_EMIT("vshufps $0x99, %%zmm0, %%zmm3, %%zmm3") /* zmm3 = 0 DA[0] DA[1] DA[0]+DA[2] ... */ + __ASM_EMIT("vshufps $0x99, %%zmm1, %%zmm4, %%zmm4") /* zmm4 = 0 DB[0] DB[1] DB[0]+DB[2] ... */ + __ASM_EMIT("vshufps $0x99, %%zmm2, %%zmm5, %%zmm5") /* zmm5 = 0 DV[0] DV[1] DV[0]+DV[2] ... */ + __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3") /* zmm3 = A = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] ... */ + __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4") /* zmm4 = B = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] ... */ + __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5") /* zmm5 = V = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] ... */ + __ASM_EMIT("vpermps %%zmm3, %%zmm6, %%zmm0") /* zmm0 = A0 A0 A0 A1 */ + __ASM_EMIT("vpermps %%zmm4, %%zmm6, %%zmm1") /* zmm1 = B0 B0 B0 B1 */ + __ASM_EMIT("vpermps %%zmm5, %%zmm6, %%zmm2") /* zmm2 = V0 V0 V0 V1 */ + __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3 %{%%k1%}") /* zmm3 = A0 A1 A0+A2 A1+A3 */ + __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4 %{%%k1%}") /* zmm4 = B0 B1 B0+B2 B1+B3 */ + __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5 %{%%k1%}") /* zmm5 = V0 V1 V0+V2 V1+V3 */ + __ASM_EMIT("vpermps %%zmm3, %%zmm7, %%zmm0") /* zmm0 = A0 A0 A1 A0+A2 */ + __ASM_EMIT("vpermps %%zmm4, %%zmm7, %%zmm1") /* zmm1 = B0 B0 B1 B0+B2 */ + __ASM_EMIT("vpermps %%zmm5, %%zmm7, %%zmm2") /* zmm2 = V0 V0 V1 V0+V2 */ + __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3 %{%%k2%}") /* zmm3 = A0 A0+A1 A0+A1+A2 A0+A1+A2+A3 */ + __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4 %{%%k2%}") /* zmm4 = B0 B0+B1 B0+B1+B2 B0+B1+B2+B3 */ + __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5 %{%%k2%}") /* zmm5 = V0 V0+V1 V0+V1+V2 V0+V1+V2+V3 */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%zmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%zmm1") + __ASM_EMIT32("vbroadcastss 
0x08(%[ptr]), %%zmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%zmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%zmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%zmm2") + + __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1") /* zmm1 = BA = xa + A */ + __ASM_EMIT("vaddps %%zmm4, %%zmm2, %%zmm2") /* zmm2 = BB = xb + B */ + __ASM_EMIT("vaddps %%zmm5, %%zmm0, %%zmm0") /* zmm0 = T = xv + V */ + __ASM_EMIT("vmovaps 0x80 + %[CORR_IDX], %%zmm6") /* zmm6 = permute mask */ + __ASM_EMIT("vmulps %%zmm2, %%zmm1, %%zmm7") /* zmm7 = B = BA*BB */ + __ASM_EMIT("vpermps %%zmm0, %%zmm6, %%zmm3") /* zmm3 = xv' */ + __ASM_EMIT("vpermps %%zmm1, %%zmm6, %%zmm4") /* zmm4 = xa' */ + __ASM_EMIT("vpermps %%zmm2, %%zmm6, %%zmm5") /* zmm5 = xb' */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%zmm7, %%zmm6") /* zmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $1, %[CORR_CC], %%zmm7, %%k3") /* k3 = B < 1e-10f */ + __ASM_EMIT("vdivps %%zmm6, %%zmm0, %%zmm0") /* zmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0 %{%%k3%}") /* zmm0 = (B < 1e-10f) ? 
0 : T/sqrtf(B) */ + __ASM_EMIT("add $0x40, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x40, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%zmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%zmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x40, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x40, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x40, %[ptr]") + __ASM_EMIT64("add $0x40, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $16, %[count]") + __ASM_EMIT64("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT32("addl $8, %[count]") + __ASM_EMIT64("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */ + __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */ + __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */ + __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */ + __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm3, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%ymm3, %%ymm3, %%ymm0") /* ymm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + 
__ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, 
%%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2") + + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] 
xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */ + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1") /* ymm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x20, %[ptr]") + __ASM_EMIT64("add $0x20, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] 
DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] 
xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1") /* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x10, %[ptr]") + __ASM_EMIT64("add $0x10, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("6:") + + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ss %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm3") + __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm4") + __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm5") + __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm3") + __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm4") + __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm5") + + __ASM_EMIT("vaddss %%xmm3, %%xmm2, %%xmm2") /* xmm2 = T = xv+DV */ + __ASM_EMIT("vaddss %%xmm4, %%xmm0, %%xmm0") /* xmm0 = BA = xa+DA */ + __ASM_EMIT("vaddss %%xmm5, %%xmm1, %%xmm1") /* xmm1 = BB = xb+DB */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm7") /* xmm7 = B = BA*BB */ + + __ASM_EMIT32("vmovss %%xmm2, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm0, 
0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm1, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm2, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm0, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm1, 0x08(%[corr])") + + __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1") /* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivss %%xmm6, %%xmm2, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 7b") + __ASM_EMIT("8:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const), + [CORR_IDX] "o" (corr_idx), + [CORR_KMASK] "o" (corr_kmask) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%k1", "%k2", "%k3" + ); + } + } /* namespace avx512 */ } /* namespace lsp */ diff --git a/src/main/x86/avx512.cpp b/src/main/x86/avx512.cpp index 524a14ef..9946c4ba 100644 --- a/src/main/x86/avx512.cpp +++ b/src/main/x86/avx512.cpp @@ -279,7 +279,7 @@ CEXPORT1(vl, dexpander_x1_curve); CEXPORT1(vl, corr_init); - CEXPORT2(vl, corr_init, 
corr_init_fma3); + CEXPORT1(vl, corr_incr); } } /* namespace avx2 */ } /* namespace lsp */ diff --git a/src/test/ptest/corr_incr.cpp b/src/test/ptest/corr_incr.cpp index 8c3bf735..d5d9967e 100644 --- a/src/test/ptest/corr_incr.cpp +++ b/src/test/ptest/corr_incr.cpp @@ -59,6 +59,14 @@ namespace lsp const float *a_tail, const float *b_tail, size_t count); } + + namespace avx512 + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } ) IF_ARCH_X86_64( @@ -133,6 +141,7 @@ PTEST_BEGIN("dsp", corr_incr, 5, 10000) IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, count)); IF_ARCH_X86(CALL(avx::corr_incr, count)); IF_ARCH_X86(CALL(avx::corr_incr_fma3, count)); + IF_ARCH_X86(CALL(avx512::corr_incr, count)); PTEST_SEPARATOR; } diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp index 582be71b..8bba049d 100644 --- a/src/test/ptest/corr_init.cpp +++ b/src/test/ptest/corr_init.cpp @@ -50,7 +50,6 @@ namespace lsp namespace avx512 { void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } ) @@ -120,7 +119,6 @@ PTEST_BEGIN("dsp", corr_init, 5, 10000) IF_ARCH_X86(CALL(avx::corr_init, count)); IF_ARCH_X86(CALL(avx::corr_init_fma3, count)); IF_ARCH_X86(CALL(avx512::corr_init, count)); - IF_ARCH_X86(CALL(avx512::corr_init_fma3, count)); IF_ARCH_ARM(CALL(neon_d32::corr_init, count)); IF_ARCH_AARCH64(CALL(asimd::corr_init, count)); diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp index 935494c6..973457f9 100644 --- a/src/test/utest/corr_incr.cpp +++ b/src/test/utest/corr_incr.cpp @@ -56,6 +56,14 @@ namespace lsp const float *a_tail, const float *b_tail, size_t count); } + + namespace avx512 + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, 
const float *b_tail, + size_t count); + } ) IF_ARCH_X86_64( @@ -180,6 +188,7 @@ UTEST_BEGIN("dsp", corr_incr) IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, 16)); IF_ARCH_X86(CALL(avx::corr_incr, 32)); IF_ARCH_X86(CALL(avx::corr_incr_fma3, 32)); + IF_ARCH_X86(CALL(avx512::corr_incr, 64)); } UTEST_END; diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp index 62ecb367..a1a3c325 100644 --- a/src/test/utest/corr_init.cpp +++ b/src/test/utest/corr_init.cpp @@ -47,7 +47,6 @@ namespace lsp namespace avx512 { void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count); - void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count); } ) @@ -138,7 +137,6 @@ UTEST_BEGIN("dsp", corr_init) IF_ARCH_X86(CALL(avx::corr_init, 32)); IF_ARCH_X86(CALL(avx::corr_init_fma3, 32)); IF_ARCH_X86(CALL(avx512::corr_init, 64)); - IF_ARCH_X86(CALL(avx512::corr_init_fma3, 64)); IF_ARCH_ARM(CALL(neon_d32::corr_init, 16)); IF_ARCH_AARCH64(CALL(asimd::corr_init, 16)); } From cbce8254c5f2e5bcfba3a29b47d72ea137284ae3 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Mon, 11 Mar 2024 23:53:09 +0300 Subject: [PATCH 14/22] ARM NEON implementation of corr_incr --- .../dsp/arch/arm/neon-d32/correlation.h | 202 ++++++++++++++++++ src/main/arm/neon-d32.cpp | 1 + src/test/ptest/corr_incr.cpp | 11 + src/test/utest/corr_incr.cpp | 11 + 4 files changed, 225 insertions(+) diff --git a/include/private/dsp/arch/arm/neon-d32/correlation.h b/include/private/dsp/arch/arm/neon-d32/correlation.h index e3cd0f98..0021fdb7 100644 --- a/include/private/dsp/arch/arm/neon-d32/correlation.h +++ b/include/private/dsp/arch/arm/neon-d32/correlation.h @@ -127,6 +127,208 @@ namespace lsp ); } + static const float corr_const[] __lsp_aligned16 = + { + LSP_DSP_VEC8(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + ARCH_ARM_ASM( + 
__ASM_EMIT("vld3.32 {d0[], d2[], d4[]}, [%[corr]]") + __ASM_EMIT("vld3.32 {d1[], d3[], d5[]}, [%[corr]]") /* q0 = xv, q1 = xa, q2 = xb */ + __ASM_EMIT("veor q3, q3, q3") /* q3 = 0 */ + /* 8x blocks */ + __ASM_EMIT("subs %[count], #8") + __ASM_EMIT("blo 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vldm %[a_head]!, {q4-q5}") /* q4 = ah0, q5 = ah1 */ + __ASM_EMIT("vldm %[b_head]!, {q6-q7}") /* q6 = bh0, q7 = bh1 */ + __ASM_EMIT("vldm %[a_tail]!, {q8-q9}") /* q8 = at0, q9 = at1 */ + __ASM_EMIT("vldm %[b_tail]!, {q10-q11}") /* q10 = bt0, q11 = bt1 */ + __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */ + __ASM_EMIT("vmul.f32 q13, q5, q7") + __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */ + __ASM_EMIT("vmul.f32 q5, q5, q5") + __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */ + __ASM_EMIT("vmul.f32 q7, q7, q7") + __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("vmls.f32 q13, q9, q11") + __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("vmls.f32 q5, q9, q9") + __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */ + __ASM_EMIT("vmls.f32 q7, q11, q11") + + __ASM_EMIT("vext.32 q14, q3, q12, #2") /* q14 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("vext.32 q15, q3, q13, #2") /* q15 = 0 0 DV[4] DV[5] */ + __ASM_EMIT("vadd.f32 q12, q12, q14") /* q12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("vadd.f32 q13, q13, q15") /* q13 = DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vext.32 q14, q3, q12, #3") /* q14 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("vext.32 q15, q3, q13, #3") /* q15 = 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vadd.f32 q12, q12, q14") /* q12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vadd.f32 q13, q13, q15") /* q13 = V[4..7] = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vext.32 q14, q3, q4, #2") /* q14 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("vext.32 q15, q3, q5, #2") /* q15 = 0 0 DA[4] DA[5] */ + 
__ASM_EMIT("vadd.f32 q4, q4, q14") /* q4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("vadd.f32 q5, q5, q15") /* q5 = DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vext.32 q14, q3, q4, #3") /* q14 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("vext.32 q15, q3, q5, #3") /* q15 = 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vadd.f32 q4, q4, q14") /* q4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vadd.f32 q5, q5, q15") /* q5 = A[4..7] = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vext.32 q14, q3, q6, #2") /* q14 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("vext.32 q15, q3, q7, #2") /* q15 = 0 0 DB[4] DB[5] */ + __ASM_EMIT("vadd.f32 q6, q6, q14") /* q6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("vadd.f32 q7, q7, q15") /* q7 = DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vext.32 q14, q3, q6, #3") /* q14 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("vext.32 q15, q3, q7, #3") /* q15 = 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vadd.f32 q6, q6, q14") /* q6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vadd.f32 q7, q7, q15") /* q7 = B[4..7] = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vdup.32 q8, d25[1]") /* q8 = V[3] V[3] V[3] V[3] */ + __ASM_EMIT("vdup.32 q9, d9[1]") /* q9 = A[3] A[3] A[3] A[3] */ + __ASM_EMIT("vdup.32 q10, d13[1]") /* q10 = B[3] B[3] B[3] B[3] */ + __ASM_EMIT("vadd.f32 q13, q13, q8") /* q13 = V[3]+V[4] V[3]+V[5] V[3]+V[6] V[3]+V[7] */ + __ASM_EMIT("vadd.f32 q5, q5, q9") /* q5 = A[3]+A[4] A[3]+A[5] A[3]+A[6] A[3]+A[7] */ + __ASM_EMIT("vadd.f32 q7, q7, q10") /* q7 = B[3]+B[4] B[3]+B[5] B[3]+B[6] B[3]+B[7] */ + + __ASM_EMIT("vadd.f32 q4, q4, q1") /* q4 = BA = xa + A */ + __ASM_EMIT("vadd.f32 q5, q5, q1") + __ASM_EMIT("vadd.f32 q6, q6, q2") /* q6 = BB = xb + B */ + __ASM_EMIT("vadd.f32 q7, q7, q2") + __ASM_EMIT("vadd.f32 q8, q12, q0") /* q8 = T = xv + V */ + __ASM_EMIT("vadd.f32 q9, 
q13, q0") + __ASM_EMIT("vmul.f32 q10, q4, q6") /* q10 = B = BA * BB */ + __ASM_EMIT("vmul.f32 q11, q5, q7") + __ASM_EMIT("vdup.32 q0, d19[1]") /* q0 = xv' = T[7] */ + __ASM_EMIT("vdup.32 q1, d11[1]") /* q1 = xa' = BA[7] */ + __ASM_EMIT("vdup.32 q2, d15[1]") /* q2 = xb' = BB[7] */ + __ASM_EMIT("vldm %[CORR_CC], {q14-q15}") /* q14 = 1e-10, q15 = 1e-10 */ + + __ASM_EMIT("vcge.f32 q14, q8, q14") /* q14 = T >= 1e-10 */ + __ASM_EMIT("vcge.f32 q15, q9, q15") + __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */ + __ASM_EMIT("vrsqrte.f32 q5, q11") + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */ + __ASM_EMIT("vmul.f32 q7, q5, q11") + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vrsqrts.f32 q13, q7, q5") + __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q5, q5, q13") + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */ + __ASM_EMIT("vmul.f32 q7, q5, q11") + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vrsqrts.f32 q13, q7, q5") + __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q11, q5, q13") + __ASM_EMIT("vmul.f32 q10, q8, q10") /* q10 = T/sqrtf(B) */ + __ASM_EMIT("vmul.f32 q11, q9, q11") + __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? 
T/sqrt(B) : 0 */ + __ASM_EMIT("vand q11, q11, q15") + __ASM_EMIT("subs %[count], #8") + __ASM_EMIT("vstm %[dst]!, {q10-q11}") + __ASM_EMIT("bhs 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("adds %[count], #4") + __ASM_EMIT("blt 4f") + __ASM_EMIT("vldm %[a_head]!, {q4}") /* q4 = ah0 */ + __ASM_EMIT("vldm %[b_head]!, {q6}") /* q6 = bh0 */ + __ASM_EMIT("vldm %[a_tail]!, {q8}") /* q8 = at0 */ + __ASM_EMIT("vldm %[b_tail]!, {q10}") /* q10 = bt0 */ + __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */ + __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */ + __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */ + __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */ + + __ASM_EMIT("vext.32 q13, q3, q12, #2") /* q13 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("vext.32 q5, q3, q4, #2") /* q5 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("vext.32 q7, q3, q6, #2") /* q7 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("vadd.f32 q12, q12, q13") /* q12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("vadd.f32 q4, q4, q5") /* q4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("vadd.f32 q6, q6, q7") /* q6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("vext.32 q13, q3, q12, #3") /* q13 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("vext.32 q5, q3, q4, #3") /* q5 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("vext.32 q7, q3, q6, #3") /* q7 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("vadd.f32 q12, q12, q13") /* q12 = V = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vadd.f32 q4, q4, q5") /* q4 = A = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vadd.f32 q6, q6, q7") /* q6 = B = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + + __ASM_EMIT("vadd.f32 q4, q4, q1") /* q4 = BA = xa + A */ + __ASM_EMIT("vadd.f32 q6, q6, q2") /* q6 = BB = xb + B */ + 
__ASM_EMIT("vadd.f32 q8, q12, q0") /* q8 = T = xv + V */ + __ASM_EMIT("vmul.f32 q10, q4, q6") /* q10 = B = BA * BB */ + __ASM_EMIT("vdup.32 q1, d9[1]") /* q1 = xa' = BA[3] */ + __ASM_EMIT("vdup.32 q2, d13[1]") /* q2 = xb' = BB[3] */ + __ASM_EMIT("vdup.32 q0, d17[1]") /* q0 = xv' = T[3] */ + __ASM_EMIT("vldm %[CORR_CC], {q14}") /* q14 = 1e-10 */ + + __ASM_EMIT("vcge.f32 q14, q8, q14") /* q14 = T >= 1e-10 */ + __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */ + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */ + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */ + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q10, q8, q10") /* q10 = T/sqrtf(B) */ + __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? 
T/sqrt(B) : 0 */ + __ASM_EMIT("sub %[count], #4") + __ASM_EMIT("vstm %[dst]!, {q10}") + __ASM_EMIT("4:") + /* 1x blocks */ + __ASM_EMIT("adds %[count], #3") + __ASM_EMIT("blt 6f") + __ASM_EMIT("vldm %[CORR_CC], {q3}") /* q3 = 1e-10 */ + __ASM_EMIT("5:") + __ASM_EMIT("vld1.32 {d8[], d9[]}, [%[a_head]]!") /* q4 = ah0 */ + __ASM_EMIT("vld1.32 {d12[], d13[]}, [%[b_head]]!") /* q6 = bh0 */ + __ASM_EMIT("vld1.32 {d16[], d17[]}, [%[a_tail]]!") /* q8 = at0 */ + __ASM_EMIT("vld1.32 {d20[], d21[]}, [%[b_tail]]!") /* q10 = bt0 */ + __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */ + __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */ + __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */ + __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */ + + __ASM_EMIT("vadd.f32 q1, q4, q1") /* q1 = BA = xa + DA */ + __ASM_EMIT("vadd.f32 q2, q6, q2") /* q2 = BB = xb + DB */ + __ASM_EMIT("vadd.f32 q0, q12, q0") /* q0 = T = xv + DV */ + __ASM_EMIT("vmul.f32 q10, q1, q2") /* q10 = B = BA * BB */ + + __ASM_EMIT("vcge.f32 q14, q0, q3") /* q14 = T >= 1e-10 */ + __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */ + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */ + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */ + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q10, q0, q10") /* q10 = T/sqrtf(B) */ + __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? 
T/sqrt(B) : 0 */ + __ASM_EMIT("subs %[count], #1") + __ASM_EMIT("vst1.32 {d20[0]}, [%[dst]]!") + __ASM_EMIT("bge 5b") + __ASM_EMIT("6:") + /* Store state */ + __ASM_EMIT("vst3.32 {d0[0], d2[0], d4[0]}, [%[corr]]") + + : [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + : [corr] "r" (corr), + [CORR_CC] "r" (&corr_const[0]) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + } + } /* namespace neon_d32 */ } /* namespace lsp */ diff --git a/src/main/arm/neon-d32.cpp b/src/main/arm/neon-d32.cpp index 29a3eead..b7ae9953 100644 --- a/src/main/arm/neon-d32.cpp +++ b/src/main/arm/neon-d32.cpp @@ -143,6 +143,7 @@ EXPORT1(convolve); EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(axis_apply_lin1); EXPORT1(axis_apply_log1); diff --git a/src/test/ptest/corr_incr.cpp b/src/test/ptest/corr_incr.cpp index d5d9967e..4fc32c1b 100644 --- a/src/test/ptest/corr_incr.cpp +++ b/src/test/ptest/corr_incr.cpp @@ -79,6 +79,16 @@ namespace lsp } ) + IF_ARCH_ARM( + namespace neon_d32 + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst, const float *a_head, const float *b_head, const float *a_tail, const float *b_tail, @@ -142,6 +152,7 @@ PTEST_BEGIN("dsp", corr_incr, 5, 10000) IF_ARCH_X86(CALL(avx::corr_incr, count)); IF_ARCH_X86(CALL(avx::corr_incr_fma3, count)); IF_ARCH_X86(CALL(avx512::corr_incr, count)); + IF_ARCH_ARM(CALL(neon_d32::corr_incr, count)); PTEST_SEPARATOR; } diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp index 973457f9..e876ea6a 100644 --- a/src/test/utest/corr_incr.cpp +++ b/src/test/utest/corr_incr.cpp @@ -76,6 +76,16 @@ namespace lsp } ) + IF_ARCH_ARM( + namespace neon_d32 + { + void 
corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + static void corr_incr(dsp::correlation_t *corr, float *dst, const float *a_head, const float *b_head, const float *a_tail, const float *b_tail, @@ -189,6 +199,7 @@ UTEST_BEGIN("dsp", corr_incr) IF_ARCH_X86(CALL(avx::corr_incr, 32)); IF_ARCH_X86(CALL(avx::corr_incr_fma3, 32)); IF_ARCH_X86(CALL(avx512::corr_incr, 64)); + IF_ARCH_ARM(CALL(neon_d32::corr_incr, 16)); } UTEST_END; From cb1434d3124df163b563fee4d0f431bb70c03919 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Tue, 12 Mar 2024 23:44:19 +0300 Subject: [PATCH 15/22] ASIMD implementation of corr_incr --- .../dsp/arch/aarch64/asimd/correlation.h | 216 ++++++++++++++++++ src/main/aarch64/asimd.cpp | 5 +- src/test/ptest/corr_incr.cpp | 11 + src/test/utest/corr_incr.cpp | 11 + 4 files changed, 241 insertions(+), 2 deletions(-) diff --git a/include/private/dsp/arch/aarch64/asimd/correlation.h b/include/private/dsp/arch/aarch64/asimd/correlation.h index c3d6e6b1..4debc2af 100644 --- a/include/private/dsp/arch/aarch64/asimd/correlation.h +++ b/include/private/dsp/arch/aarch64/asimd/correlation.h @@ -137,6 +137,222 @@ namespace lsp ); } + static const float corr_const[] __lsp_aligned16 = + { + LSP_DSP_VEC8(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + ARCH_AARCH64_ASM( + __ASM_EMIT("ld3r {v0.4s, v1.4s, v2.4s}, [%[corr]]") /* v0 = xv, v1 = xa, v2 = xb */ + __ASM_EMIT("eor v3.16b, v3.16b, v3.16b") /* v3 = 0 */ + /* 8x blocks */ + __ASM_EMIT("subs %[count], %[count], #8") + __ASM_EMIT("b.lo 2f") + __ASM_EMIT("1:") + __ASM_EMIT("ldp q4, q5, [%[a_head], 0x00]") /* v4 = ah0, v5 = ah1 */ + __ASM_EMIT("ldp q6, q7, [%[b_head], 0x00]") /* v6 = ah0, v7 = ah1 */ + __ASM_EMIT("ldp q8, q9, [%[a_tail], 0x00]") /* v8 = ah0, v9 = ah1 */ + 
__ASM_EMIT("ldp q10, q11, [%[b_tail], 0x00]") /* v10 = bt0, v11 = bt1 */ + __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */ + __ASM_EMIT("fmul v13.4s, v5.4s, v7.4s") + __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */ + __ASM_EMIT("fmul v5.4s, v5.4s, v5.4s") + __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */ + __ASM_EMIT("fmul v7.4s, v7.4s, v7.4s") + __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("fmls v13.4s, v9.4s, v11.4s") + __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("fmls v5.4s, v9.4s, v9.4s") + __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */ + __ASM_EMIT("fmls v7.4s, v11.4s, v11.4s") + + __ASM_EMIT("ext v14.16b, v3.16b, v12.16b, #8") /* v14 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("ext v15.16b, v3.16b, v13.16b, #8") /* v15 = 0 0 DV[4] DV[5] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v14.4s") /* v12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("fadd v13.4s, v13.4s, v15.4s") /* v13 = DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v12.16b, #12") /* v14 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("ext v15.16b, v3.16b, v13.16b, #12") /* v15 = 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v14.4s") /* v12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("fadd v13.4s, v13.4s, v15.4s") /* v13 = V[4..7] = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v4.16b, #8") /* v14 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("ext v15.16b, v3.16b, v5.16b, #8") /* v15 = 0 0 DA[4] DA[5] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v14.4s") /* v4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("fadd v5.4s, v5.4s, v15.4s") /* v5 = DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v4.16b, #12") /* v14 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("ext v15.16b, v3.16b, v5.16b, #12") /* v15 = 0 DA[4] DA[5] 
DA[4]+DA[6] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v14.4s") /* v4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("fadd v5.4s, v5.4s, v15.4s") /* v5 = A[4..7] = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v6.16b, #8") /* v14 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("ext v15.16b, v3.16b, v7.16b, #8") /* v15 = 0 0 DB[4] DB[5] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v14.4s") /* v6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("fadd v7.4s, v7.4s, v15.4s") /* v7 = DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v6.16b, #12") /* v14 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("ext v15.16b, v3.16b, v7.16b, #12") /* v15 = 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v14.4s") /* v6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("fadd v7.4s, v7.4s, v15.4s") /* v7 = B[4..7] = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("dup v8.4s, v12.s[3]") /* v8 = V[3] V[3] V[3] V[3] */ + __ASM_EMIT("dup v9.4s, v4.s[3]") /* v9 = A[3] A[3] A[3] A[3] */ + __ASM_EMIT("dup v10.4s, v6.s[3]") /* v10 = B[3] B[3] B[3] B[3] */ + __ASM_EMIT("fadd v13.4s, v13.4s, v8.4s") /* v13 = V[3]+V[4] V[3]+V[5] V[3]+V[6] V[3]+V[7] */ + __ASM_EMIT("fadd v5.4s, v5.4s, v9.4s") /* v5 = A[3]+A[4] A[3]+A[5] A[3]+A[6] A[3]+A[7] */ + __ASM_EMIT("fadd v7.4s, v7.4s, v10.4s") /* v7 = B[3]+B[4] B[3]+B[5] B[3]+B[6] B[3]+B[7] */ + + __ASM_EMIT("fadd v4.4s, v4.4s, v1.4s") /* v4 = BA = xa + A */ + __ASM_EMIT("fadd v5.4s, v5.4s, v1.4s") + __ASM_EMIT("fadd v6.4s, v6.4s, v2.4s") /* v6 = BB = xb + B */ + __ASM_EMIT("fadd v7.4s, v7.4s, v2.4s") + __ASM_EMIT("fadd v8.4s, v12.4s, v0.4s") /* v8 = T = xv + V */ + __ASM_EMIT("fadd v9.4s, v13.4s, v0.4s") + __ASM_EMIT("fmul v10.4s, v4.4s, v6.4s") /* v10 = B = BA * BB */ + __ASM_EMIT("fmul v11.4s, v5.4s, v7.4s") + __ASM_EMIT("dup v0.4s, v9.s[3]") /* v0 = xv' = T[7] */ + __ASM_EMIT("dup v1.4s, 
v5.s[3]") /* v1 = xa' = BA[7] */ + __ASM_EMIT("dup v2.4s, v7.s[3]") /* v2 = xb' = BB[7] */ + __ASM_EMIT("ldp q14, q15, [%[CORR_CC]]") /* v14 = 1e-10, v15 = 1e-10 */ + + __ASM_EMIT("fcmge v14.4s, v8.4s, v14.4s") /* v14 = T >= 1e-10 */ + __ASM_EMIT("fcmge v15.4s, v9.4s, v15.4s") + __ASM_EMIT("frsqrte v4.4s, v10.4s") /* v4 = x0 */ + __ASM_EMIT("frsqrte v5.4s, v11.4s") + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x0 */ + __ASM_EMIT("fmul v7.4s, v5.4s, v11.4s") + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("frsqrts v13.4s, v7.4s, v5.4s") + __ASM_EMIT("fmul v4.4s, v4.4s, v12.4s") /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("fmul v5.4s, v5.4s, v13.4s") + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x1 */ + __ASM_EMIT("fmul v7.4s, v5.4s, v11.4s") + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("frsqrts v13.4s, v7.4s, v5.4s") + __ASM_EMIT("fmul v10.4s, v4.4s, v12.4s") /* v10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("fmul v11.4s, v5.4s, v13.4s") + __ASM_EMIT("fmul v10.4s, v8.4s, v10.4s") /* v10 = T/sqrtf(B) */ + __ASM_EMIT("fmul v11.4s, v9.4s, v11.4s") + __ASM_EMIT("and v10.16b, v10.16b, v14.16b") /* v10 = (T >= 1e-10) ? 
T/sqrt(B) : 0 */ + __ASM_EMIT("and v11.16b, v11.16b, v15.16b") + __ASM_EMIT("add %[a_head], %[a_head], #0x20") + __ASM_EMIT("add %[b_head], %[b_head], #0x20") + __ASM_EMIT("subs %[count], %[count], #8") + __ASM_EMIT("stp q10, q11, [%[dst], 0x00]") + __ASM_EMIT("add %[a_tail], %[a_tail], #0x20") + __ASM_EMIT("add %[b_tail], %[b_tail], #0x20") + __ASM_EMIT("add %[dst], %[dst], #0x20") + __ASM_EMIT("b.hs 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("adds %[count], %[count], #4") + __ASM_EMIT("b.lt 4f") + __ASM_EMIT("ldr q4, [%[a_head], 0x00]") /* v4 = ah0 */ + __ASM_EMIT("ldr q6, [%[b_head], 0x00]") /* v6 = bh0 */ + __ASM_EMIT("ldr q8, [%[a_tail], 0x00]") /* v8 = at0 */ + __ASM_EMIT("ldr q10, [%[b_tail], 0x00]") /* v10 = bt0 */ + __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */ + __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */ + __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */ + __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */ + + __ASM_EMIT("ext v13.16b, v3.16b, v12.16b, #8") /* v13 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("ext v5.16b, v3.16b, v4.16b, #8") /* v5 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("ext v7.16b, v3.16b, v6.16b, #8") /* v7 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v13.4s") /* v12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v5.4s") /* v4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v7.4s") /* v6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("ext v13.16b, v3.16b, v12.16b, #12") /* v13 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("ext v5.16b, v3.16b, v4.16b, #12") /* v5 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("ext v7.16b, v3.16b, v6.16b, #12") /* v7 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v13.4s") /* v12 = V[0..3] = DV[0] DV[0]+DV[1] 
DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v5.4s") /* v4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v7.4s") /* v6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + + __ASM_EMIT("fadd v4.4s, v4.4s, v1.4s") /* v4 = BA = xa + A */ + __ASM_EMIT("fadd v6.4s, v6.4s, v2.4s") /* v6 = BB = xb + B */ + __ASM_EMIT("fadd v8.4s, v12.4s, v0.4s") /* v8 = T = xv + V */ + __ASM_EMIT("fmul v10.4s, v4.4s, v6.4s") /* v10 = B = BA * BB */ + __ASM_EMIT("dup v1.4s, v4.s[3]") /* v1 = xa' = BA[3] */ + __ASM_EMIT("dup v2.4s, v6.s[3]") /* v2 = xb' = BB[3] */ + __ASM_EMIT("dup v0.4s, v8.s[3]") /* v0 = xv' = T[3] */ + __ASM_EMIT("ldr q14, [%[CORR_CC]]") /* v14 = 1e-10 */ + + __ASM_EMIT("fcmge v14.4s, v8.4s, v14.4s") /* v14 = T >= 1e-10 */ + __ASM_EMIT("frsqrte v4.4s, v10.4s") /* v4 = x0 */ + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x0 */ + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("fmul v4.4s, v4.4s, v12.4s") /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x1 */ + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("fmul v10.4s, v4.4s, v12.4s") /* v10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("fmul v10.4s, v8.4s, v10.4s") /* v10 = T/sqrtf(B) */ + __ASM_EMIT("and v10.16b, v10.16b, v14.16b") /* v10 = (T >= 1e-10) ? 
T/svrt(B) : 0 */ + __ASM_EMIT("add %[a_head], %[a_head], #0x10") + __ASM_EMIT("add %[b_head], %[b_head], #0x10") + __ASM_EMIT("sub %[count], %[count], #4") + __ASM_EMIT("str q10, [%[dst], 0x00]") + __ASM_EMIT("add %[a_tail], %[a_tail], #0x10") + __ASM_EMIT("add %[b_tail], %[b_tail], #0x10") + __ASM_EMIT("add %[dst], %[dst], #0x10") + __ASM_EMIT("4:") + /* 1x blocks */ + __ASM_EMIT("adds %[count], %[count], #3") + __ASM_EMIT("blt 6f") + __ASM_EMIT("ldr q3, [%[CORR_CC]]") /* v3 = 1e-10 */ + __ASM_EMIT("5:") + __ASM_EMIT("ld1r {v4.4s}, [%[a_head]]") /* v4 = ah0 */ + __ASM_EMIT("ld1r {v6.4s}, [%[b_head]]") /* v6 = bh0 */ + __ASM_EMIT("ld1r {v8.4s}, [%[a_tail]]") /* v8 = at0 */ + __ASM_EMIT("ld1r {v10.4s}, [%[b_tail]]") /* v10 = bt0 */ + __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */ + __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */ + __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */ + __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */ + + __ASM_EMIT("fadd v1.4s, v4.4s, v1.4s") /* v1 = BA = xa + DA */ + __ASM_EMIT("fadd v2.4s, v6.4s, v2.4s") /* v2 = BB = xb + DB */ + __ASM_EMIT("fadd v0.4s, v12.4s, v0.4s") /* v0 = T = xv + DV */ + __ASM_EMIT("fmul v10.4s, v1.4s, v2.4s") /* v10 = B = BA * BB */ + + __ASM_EMIT("fcmge v14.4s, v0.4s, v3.4s") /* v14 = T >= 1e-10 */ + __ASM_EMIT("frsqrte v4.4s, v10.4s") /* v4 = x0 */ + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x0 */ + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("fmul v4.4s, v4.4s, v12.4s") /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x1 */ + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("fmul v10.4s, v4.4s, v12.4s") /* v10 = 1/svrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ 
+ __ASM_EMIT("fmul v10.4s, v0.4s, v10.4s") /* v10 = T/sqrtf(B) */ + __ASM_EMIT("and v10.16b, v10.16b, v14.16b") /* v10 = (T >= 1e-10) ? T/sqrt(B) : 0 */ + __ASM_EMIT("add %[a_head], %[a_head], #0x04") + __ASM_EMIT("add %[b_head], %[b_head], #0x04") + __ASM_EMIT("subs %[count], %[count], #1") + __ASM_EMIT("st1 {v10.s}[0], [%[dst]]") + __ASM_EMIT("add %[a_tail], %[a_tail], #0x04") + __ASM_EMIT("add %[b_tail], %[b_tail], #0x04") + __ASM_EMIT("add %[dst], %[dst], #0x04") + __ASM_EMIT("b.ge 5b") + __ASM_EMIT("6:") + /* Store state */ + __ASM_EMIT("st3 {v0.s, v1.s, v2.s}[0], [%[corr]]") + + : [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + : [corr] "r" (corr), + [CORR_CC] "r" (&corr_const[0]) + : "cc", "memory", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); + } + } /* namespace asimd */ } /* namespace lsp */ diff --git a/src/main/aarch64/asimd.cpp b/src/main/aarch64/asimd.cpp index f69b3b98..a5eac977 100644 --- a/src/main/aarch64/asimd.cpp +++ b/src/main/aarch64/asimd.cpp @@ -1,6 +1,6 @@ /* - * Copyright (C) 2023 Linux Studio Plugins Project - * (C) 2023 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. 
@@ -431,6 +431,7 @@ EXPORT1(convolve); EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(abgr32_to_bgrff32); EXPORT1(rgba32_to_bgra32); diff --git a/src/test/ptest/corr_incr.cpp b/src/test/ptest/corr_incr.cpp index 4fc32c1b..597c7ecf 100644 --- a/src/test/ptest/corr_incr.cpp +++ b/src/test/ptest/corr_incr.cpp @@ -89,6 +89,16 @@ namespace lsp } ) + IF_ARCH_AARCH64( + namespace asimd + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst, const float *a_head, const float *b_head, const float *a_tail, const float *b_tail, @@ -153,6 +163,7 @@ PTEST_BEGIN("dsp", corr_incr, 5, 10000) IF_ARCH_X86(CALL(avx::corr_incr_fma3, count)); IF_ARCH_X86(CALL(avx512::corr_incr, count)); IF_ARCH_ARM(CALL(neon_d32::corr_incr, count)); + IF_ARCH_AARCH64(CALL(asimd::corr_incr, count)); PTEST_SEPARATOR; } diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp index e876ea6a..34df2084 100644 --- a/src/test/utest/corr_incr.cpp +++ b/src/test/utest/corr_incr.cpp @@ -86,6 +86,16 @@ namespace lsp } ) + IF_ARCH_AARCH64( + namespace asimd + { + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count); + } + ) + static void corr_incr(dsp::correlation_t *corr, float *dst, const float *a_head, const float *b_head, const float *a_tail, const float *b_tail, @@ -200,6 +210,7 @@ UTEST_BEGIN("dsp", corr_incr) IF_ARCH_X86(CALL(avx::corr_incr_fma3, 32)); IF_ARCH_X86(CALL(avx512::corr_incr, 64)); IF_ARCH_ARM(CALL(neon_d32::corr_incr, 16)); + IF_ARCH_AARCH64(CALL(asimd::corr_incr, 16)); } UTEST_END; From b30a7104ba668aafd071170ad4fe51b72553f537 Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Thu, 14 Mar 2024 01:12:59 +0300 Subject: [PATCH 16/22] Updated CHANGELOG --- CHANGELOG | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 93de5d48..6b561345 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,7 +3,7 @@ ******************************************************************************* === 1.0.22 === - +* Implemented functions for computing correlation between two signals. === 1.0.21 === * Updated build scripts. From c01f1939675a5acdd1772f79654129c193da0e60 Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Sun, 17 Mar 2024 03:01:08 +0300 Subject: [PATCH 17/22] Updated build scripts --- CHANGELOG | 1 + make/tools.mk | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 6b561345..20d67740 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ === 1.0.22 === * Implemented functions for computing correlation between two signals. +* Updated build scripts. === 1.0.21 === * Updated build scripts. diff --git a/make/tools.mk b/make/tools.mk index e99498dd..ade5d571 100644 --- a/make/tools.mk +++ b/make/tools.mk @@ -76,6 +76,8 @@ ifeq ($(PLATFORM),Solaris) else ifeq ($(PLATFORM),Windows) FLAG_RELRO = FLAG_STDLIB = + CFLAGS_EXT += -DWINVER=0x600 -D_WIN32_WINNT=0x600 + CXXFLAGS_EXT += -DWINVER=0x600 -D_WIN32_WINNT=0x600 EXE_FLAGS_EXT += -static-libgcc -static-libstdc++ SO_FLAGS_EXT += -static-libgcc -static-libstdc++ LDFLAGS_EXT += -T $(CURDIR)/make/ld-windows.script From f78cd2770909b3b9fc4a2a5a768be2691a4827d7 Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Mon, 25 Mar 2024 02:05:28 +0300 Subject: [PATCH 18/22] Fixed error in AVX and AVX-512 implementation of lr_to_ms functions --- CHANGELOG | 1 + include/private/dsp/arch/x86/avx/msmatrix.h | 8 ++++---- include/private/dsp/arch/x86/avx512/msmatrix.h | 8 ++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 20d67740..7a56c6c4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ === 1.0.22 === * Implemented functions for computing correlation between two signals. 
+* Fixed error in AVX and AVX-512 implementation of lr_to_ms functions. * Updated build scripts. === 1.0.21 === diff --git a/include/private/dsp/arch/x86/avx/msmatrix.h b/include/private/dsp/arch/x86/avx/msmatrix.h index 5d158b82..bca9b985 100644 --- a/include/private/dsp/arch/x86/avx/msmatrix.h +++ b/include/private/dsp/arch/x86/avx/msmatrix.h @@ -1,6 +1,6 @@ /* - * Copyright (C) 2020 Linux Studio Plugins Project - * (C) 2020 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. @@ -91,8 +91,8 @@ namespace lsp __ASM_EMIT("vmovups %%xmm0, 0x00(%[mid], %[off])") __ASM_EMIT("vmovups %%xmm2, 0x00(%[side], %[off])") __ASM_EMIT("add $0x10, %[off]") - __ASM_EMIT32("subl $8, %[count]") - __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") __ASM_EMIT("6:") // 1x blocks __ASM_EMIT32("addl $3, %[count]") diff --git a/include/private/dsp/arch/x86/avx512/msmatrix.h b/include/private/dsp/arch/x86/avx512/msmatrix.h index c8798e4f..002fdd96 100644 --- a/include/private/dsp/arch/x86/avx512/msmatrix.h +++ b/include/private/dsp/arch/x86/avx512/msmatrix.h @@ -1,6 +1,6 @@ /* - * Copyright (C) 2023 Linux Studio Plugins Project - * (C) 2023 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 9 сент. 2023 г. 
@@ -107,8 +107,8 @@ namespace lsp __ASM_EMIT("vmovups %%xmm0, 0x00(%[mid], %[off])") __ASM_EMIT("vmovups %%xmm2, 0x00(%[side], %[off])") __ASM_EMIT("add $0x10, %[off]") - __ASM_EMIT32("subl $8, %[count]") - __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") __ASM_EMIT("8:") // 1x blocks __ASM_EMIT32("addl $3, %[count]") From 99ad5fb546bdf62da3ccc97dfcdff52184f94290 Mon Sep 17 00:00:00 2001 From: sadko4u Date: Tue, 26 Mar 2024 11:14:45 +0300 Subject: [PATCH 19/22] Yet more AVX-512 fixes for msmatrix --- include/private/dsp/arch/x86/avx512/msmatrix.h | 4 ++-- src/test/utest/msmatrix/conv2.cpp | 6 +++--- src/test/utest/msmatrix/conv2x1.cpp | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/private/dsp/arch/x86/avx512/msmatrix.h b/include/private/dsp/arch/x86/avx512/msmatrix.h index 002fdd96..cfb1be5c 100644 --- a/include/private/dsp/arch/x86/avx512/msmatrix.h +++ b/include/private/dsp/arch/x86/avx512/msmatrix.h @@ -273,8 +273,8 @@ namespace lsp ARCH_X86_ASM( __ASM_EMIT("xor %[off], %[off]") // 64x blocks - __ASM_EMIT32("subl $32, %[count]") - __ASM_EMIT64("sub $32, %[count]") + __ASM_EMIT32("subl $64, %[count]") + __ASM_EMIT64("sub $64, %[count]") __ASM_EMIT("jb 2f") __ASM_EMIT("1:") __ASM_EMIT("vmovups 0x00(%[mid], %[off]), %%zmm0") // zmm0 = m diff --git a/src/test/utest/msmatrix/conv2.cpp b/src/test/utest/msmatrix/conv2.cpp index f22e21e4..540e3c46 100644 --- a/src/test/utest/msmatrix/conv2.cpp +++ b/src/test/utest/msmatrix/conv2.cpp @@ -1,6 +1,6 @@ /* - * Copyright (C) 2023 Linux Studio Plugins Project - * (C) 2023 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. 
@@ -85,7 +85,7 @@ UTEST_BEGIN("dsp.msmatrix", conv2) return; UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 32, 64, 65, 100, 999, 0xfff) + 32, 33, 34, 35, 36, 37, 38, 39, 40, 63, 64, 65, 100, 999, 0xfff) { for (size_t mask=0; mask <= 0x0f; ++mask) { diff --git a/src/test/utest/msmatrix/conv2x1.cpp b/src/test/utest/msmatrix/conv2x1.cpp index ba75853e..889e8a74 100644 --- a/src/test/utest/msmatrix/conv2x1.cpp +++ b/src/test/utest/msmatrix/conv2x1.cpp @@ -1,6 +1,6 @@ /* - * Copyright (C) 2023 Linux Studio Plugins Project - * (C) 2023 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. @@ -97,7 +97,7 @@ UTEST_BEGIN("dsp.msmatrix", conv2x1) return; UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 32, 64, 65, 100, 999, 0xfff) + 32, 33, 34, 35, 36, 37, 38, 39, 40, 63, 64, 65, 100, 999, 0xfff) { for (size_t mask=0; mask <= 0x07; ++mask) { From 4836b7b016282fe64033ce7f1621bedd53e5dcba Mon Sep 17 00:00:00 2001 From: sadko4u Date: Tue, 16 Apr 2024 21:53:07 +0300 Subject: [PATCH 20/22] Updated header --- make/system.mk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/make/system.mk b/make/system.mk index 5796e895..66c7f58d 100644 --- a/make/system.mk +++ b/make/system.mk @@ -1,21 +1,21 @@ # -# Copyright (C) 2020 Linux Studio Plugins Project -# (C) 2020 Vladimir Sadovnikov +# Copyright (C) 2024 Linux Studio Plugins Project +# (C) 2024 Vladimir Sadovnikov # -# This file is part of lsp-plugins +# This file is part of lsp-dsp-lib # -# lsp-plugins is free software: you can redistribute it and/or modify +# lsp-dsp-lib is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation, either version 3 of the License, or # any later version. 
# -# lsp-plugins is distributed in the hope that it will be useful, +# lsp-dsp-lib is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License -# along with lsp-plugins. If not, see . +# along with lsp-dsp-lib. If not, see . # # Detect operating system From 342b82c14d061764be75e5cf03f68b2cadb1ec3c Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Sat, 27 Apr 2024 02:56:15 +0300 Subject: [PATCH 21/22] Updated module versions in dependencies --- CHANGELOG | 1 + modules.mk | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 7a56c6c4..aae74d38 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,7 @@ * Implemented functions for computing correlation between two signals. * Fixed error in AVX and AVX-512 implementation of lr_to_ms functions. * Updated build scripts. +* Updated module versions in dependencies. === 1.0.21 === * Updated build scripts. 
diff --git a/modules.mk b/modules.mk index 929f615c..8a17adde 100644 --- a/modules.mk +++ b/modules.mk @@ -19,13 +19,13 @@ # # Variables that describe dependencies -LSP_COMMON_LIB_VERSION := 1.0.34 +LSP_COMMON_LIB_VERSION := 1.0.35 LSP_COMMON_LIB_NAME := lsp-common-lib LSP_COMMON_LIB_TYPE := src LSP_COMMON_LIB_URL_RO := https://github.com/lsp-plugins/$(LSP_COMMON_LIB_NAME).git LSP_COMMON_LIB_URL_RW := git@github.com:lsp-plugins/$(LSP_COMMON_LIB_NAME).git -LSP_TEST_FW_VERSION := 1.0.24 +LSP_TEST_FW_VERSION := 1.0.25 LSP_TEST_FW_NAME := lsp-test-fw LSP_TEST_FW_TYPE := src LSP_TEST_FW_URL_RO := https://github.com/lsp-plugins/$(LSP_TEST_FW_NAME).git From be5243b0babfa70c71af00995f72f2de90803616 Mon Sep 17 00:00:00 2001 From: Vladimir Sadovnikov Date: Sat, 27 Apr 2024 02:56:15 +0300 Subject: [PATCH 22/22] Preparing for release --- project.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project.mk b/project.mk index 4952c1e0..3e183d1f 100644 --- a/project.mk +++ b/project.mk @@ -23,4 +23,4 @@ ARTIFACT_ID = LSP_DSP_LIB ARTIFACT_NAME = lsp-dsp-lib ARTIFACT_DESC = DSP library for digital signal processing ARTIFACT_HEADERS = lsp-plug.in -ARTIFACT_VERSION = 1.0.22-devel +ARTIFACT_VERSION = 1.0.22