Skip to content

Commit

Permalink
Split unit tests and performance tests, some AMD-related optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
sadko4u committed Aug 28, 2023
1 parent 0aec8be commit ee21d94
Show file tree
Hide file tree
Showing 11 changed files with 507 additions and 61 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
=== 1.0.15 ===
* Fixed syntax error in C interface, covered with tests.
* Bugfix in horizontal summing functions (invalid register clobber list).
* Some AMD-related optimizations.

=== 1.0.14 ===
* Implemented pcomplex_r2c instruction set.
Expand Down
15 changes: 11 additions & 4 deletions include/private/dsp/arch/x86/cpuid.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2020 Vladimir Sadovnikov <[email protected]>
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
Expand Down Expand Up @@ -116,12 +116,18 @@

//-------------------------------------------------------------------------
// Different processor families

#define INTEL_FAMILY_686_CORE 0x06

#define AMD_FAMILY_K8_HAMMER 0x0f
#define AMD_FAMILY_K10 0x10
#define AMD_FAMILY_BOBCAT 0x14
#define AMD_FAMILY_BULLDOZER 0x15
#define AMD_FAMILY_JAGUAR 0x16
#define AMD_FAMILY_ZEN_1_2 0x17
#define AMD_FAMILY_DHYANA 0x18
#define AMD_FAMILY_ZEN_3_4 0x19
#define AMD_FAMILY_ZEN_5 0x1a

#define AMD_MODEL_ZEN_2 0x31

Expand Down Expand Up @@ -191,8 +197,9 @@ namespace lsp
}

uint64_t read_xcr(umword_t xcr_id);
}
}

} /* namespace x86 */
} /* namespace lsp */


#endif /* PRIVATE_DSP_ARCH_X86_CPUID_H_ */
3 changes: 2 additions & 1 deletion include/private/dsp/arch/x86/features.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@
{
FEAT_FAST_MOVS, // Processor implements optimized MOVS instruction
FEAT_FAST_AVX, // Fast AVX implementation
FEAT_FAST_FMA3 // Fast FMA3 implementation
FEAT_FAST_FMA3, // Fast FMA3 implementation
FEAT_BELOW_ZEN3 // CPU has AMD architecture and is below Zen3
};

/**
Expand Down
14 changes: 9 additions & 5 deletions src/main/x86/avx.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2020 Vladimir Sadovnikov <[email protected]>
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
Expand Down Expand Up @@ -127,8 +127,9 @@

// This routine sucks on AMD Bulldozer processor family but is pretty great on Intel
// Not tested on AMD Processors above Bulldozer family
bool favx = feature_check(f, FEAT_FAST_AVX);
bool ffma = favx && feature_check(f, FEAT_FAST_FMA3);
bool favx = feature_check(f, FEAT_FAST_AVX);
bool ffma = favx && feature_check(f, FEAT_FAST_FMA3);
bool below_zen3 = feature_check(f, FEAT_BELOW_ZEN3);

CEXPORT2_X64(favx, reverse1, reverse1);
CEXPORT2_X64(favx, reverse2, reverse2);
Expand Down Expand Up @@ -448,7 +449,10 @@
CEXPORT2(favx, pcomplex_rdiv2, pcomplex_rdiv2_fma3);
CEXPORT2(favx, pcomplex_div3, pcomplex_div3_fma3);

CEXPORT2(favx, h_sqr_sum, h_sqr_sum_fma3);
if (!below_zen3)
{
CEXPORT2(favx, h_sqr_sum, h_sqr_sum_fma3);
}

CEXPORT2(favx, direct_fft, direct_fft_fma3);
CEXPORT2(favx, reverse_fft, reverse_fft_fma3);
Expand Down
32 changes: 28 additions & 4 deletions src/main/x86/x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,21 +516,45 @@
case FEAT_FAST_MOVS:
if (f->vendor == CPU_VENDOR_INTEL)
{
if ((f->family == 0x6) && (f->model >= 0x5e)) // Should be some Core i3 microarchitecture...
// Should be some Core i3 microarchitecture...
if ((f->family == INTEL_FAMILY_686_CORE) && (f->model >= 0x5e))
return true;
}
break;
case FEAT_FAST_AVX:
if (f->vendor == CPU_VENDOR_INTEL) // Any Intel CPU is good enough with AVX
return true;
// Only starting with ZEN 1 architecture AMD's implementation of AVX is fast enough
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON))
return (f->family >= AMD_FAMILY_ZEN_1_2); // Only starting with ZEN 1 architecture AMD's implementation of AVX is fast enough
{
if (f->family < AMD_FAMILY_ZEN_1_2)
return false;
if (f->family == AMD_FAMILY_DHYANA)
return false;
return true;
}
break;
case FEAT_FAST_FMA3:
if (f->vendor == CPU_VENDOR_INTEL) // Any Intel CPU is good enough with FMA3
return true;
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON)) // Starting with ZEN 2 FMA3 operations are fast enough on AMD
return (f->family >= AMD_FAMILY_ZEN_1_2) && (f->model >= AMD_MODEL_ZEN_2);
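// On AMD/Hygon, FMA3 is treated as fast for family AMD_FAMILY_ZEN_1_2 and above, except AMD_FAMILY_DHYANA.
// NOTE(review): the former AMD_MODEL_ZEN_2 model check is no longer applied here, so Zen 1 parts
// now qualify as well - confirm that this is intended ("Starting with ZEN 2" was the previous rule).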
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON))
{
if (f->family < AMD_FAMILY_ZEN_1_2)
return false;
if (f->family == AMD_FAMILY_DHYANA)
return false;
return true;
}
break;

case FEAT_BELOW_ZEN3: // Test that this is AMD and below Zen 3 architecture
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON))
{
if (f->family < AMD_FAMILY_ZEN_3_4)
return true;
return false;
}
break;
default:
break;
Expand Down
110 changes: 110 additions & 0 deletions src/test/ptest/hmath/h_abs_sum.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
*
* lsp-dsp-lib is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* lsp-dsp-lib is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
*/

#include <lsp-plug.in/dsp/dsp.h>
#include <lsp-plug.in/test-fw/ptest.h>
#include <lsp-plug.in/test-fw/helpers.h>
#include <lsp-plug.in/common/alloc.h>

#define MIN_RANK 8
#define MAX_RANK 16

// Forward declarations of the h_abs_sum implementations exercised by this
// performance test. Each variant lives in its own namespace and shares the same
// signature: a pointer to the source samples and the number of samples to
// process, returning a single float.
// NOTE(review): the IF_ARCH_* macros presumably emit their argument only when
// building for the matching architecture - confirm against their definition.
namespace lsp
{
// Variant declared unconditionally (not wrapped in any IF_ARCH_* macro)
namespace generic
{
float h_abs_sum(const float *src, size_t count);
}

// x86 variants: SSE and AVX
IF_ARCH_X86(
namespace sse
{
float h_abs_sum(const float *src, size_t count);
}

namespace avx
{
float h_abs_sum(const float *src, size_t count);
}
)

// ARM variant (neon_d32 namespace)
IF_ARCH_ARM(
namespace neon_d32
{
float h_abs_sum(const float *src, size_t count);
}
)

// AArch64 variant (asimd namespace)
IF_ARCH_AARCH64(
namespace asimd
{
float h_abs_sum(const float *src, size_t count);
}
)

// Common function pointer type used to pass any of the variants above to the test driver
typedef float (* h_sum_t)(const float *src, size_t count);
}

// Performance test for the h_abs_sum routine: every declared implementation is
// timed on buffers of 2^MIN_RANK .. 2^MAX_RANK samples.
// The test identifier is h_abs_sum so that it matches the routine under test and
// does not clash with the other per-routine tests of the "dsp.hmath" group.
PTEST_BEGIN("dsp.hmath", h_abs_sum, 5, 10000)

    /**
     * Measure one implementation on the first 'count' samples of 'src'.
     *
     * @param label textual name of the implementation, used in the report
     * @param src source buffer with samples
     * @param count number of samples to pass to the routine
     * @param func implementation to measure
     */
    void call(const char *label, float *src, size_t count, h_sum_t func)
    {
        // Skip implementations rejected by PTEST_SUPPORTED
        // NOTE(review): presumably this means "not available on this CPU/build" - confirm
        if (!PTEST_SUPPORTED(func))
            return;

        // Build the report key; snprintf bounds the write to the buffer size
        char buf[80];
        snprintf(buf, sizeof(buf), "%s x %d", label, int(count));
        printf("Testing %s numbers...\n", buf);

        // The result is intentionally discarded: only the execution time matters
        PTEST_LOOP(buf,
            func(src, count);
        );
    }

    PTEST_MAIN
    {
        // One buffer of the maximum size is shared by all measurements
        size_t buf_size = size_t(1) << MAX_RANK;
        uint8_t *data   = NULL;
        float *src      = alloc_aligned<float>(data, buf_size, 64);

        // Fill the buffer with random values generated by randf(0.0f, 1.0f)
        for (size_t i=0; i < buf_size; ++i)
            src[i] = randf(0.0f, 1.0f);

        // Pass the function name as the label together with the function itself
        #define CALL(func) \
            call(#func, src, count, func)

        for (size_t i=MIN_RANK; i <= MAX_RANK; ++i)
        {
            // Compute the sample count in size_t rather than int
            size_t count = size_t(1) << i;

            CALL(generic::h_abs_sum);
            IF_ARCH_X86(CALL(sse::h_abs_sum));
            IF_ARCH_X86(CALL(avx::h_abs_sum));
            IF_ARCH_ARM(CALL(neon_d32::h_abs_sum));
            IF_ARCH_AARCH64(CALL(asimd::h_abs_sum));
            PTEST_SEPARATOR;
        }

        // 'data' is the pointer filled in by alloc_aligned, so it is the one released
        free_aligned(data);
    }

PTEST_END

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2020 Vladimir Sadovnikov <[email protected]>
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
* (C) 2023 Vladimir Sadovnikov <[email protected]>
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
Expand Down Expand Up @@ -31,50 +31,40 @@ namespace lsp
{
namespace generic
{
float h_sum(const float *src, size_t count);
float h_sqr_sum(const float *src, size_t count);
float h_abs_sum(const float *src, size_t count);
}

IF_ARCH_X86(
namespace sse
{
float h_sum(const float *src, size_t count);
float h_sqr_sum(const float *src, size_t count);
float h_abs_sum(const float *src, size_t count);
}

namespace avx
{
float h_sum(const float *src, size_t count);
float h_sqr_sum(const float *src, size_t count);
float h_sqr_sum_fma3(const float *src, size_t count);
float h_abs_sum(const float *src, size_t count);
}
)

IF_ARCH_ARM(
namespace neon_d32
{
float h_sum(const float *src, size_t count);
float h_sqr_sum(const float *src, size_t count);
float h_abs_sum(const float *src, size_t count);
}
)

IF_ARCH_AARCH64(
namespace asimd
{
float h_sum(const float *src, size_t count);
float h_sqr_sum(const float *src, size_t count);
float h_abs_sum(const float *src, size_t count);
}
)

typedef float (* h_sum_t)(const float *src, size_t count);
}

PTEST_BEGIN("dsp.hmath", hsum, 5, 10000)
PTEST_BEGIN("dsp.hmath", h_sqr_sum, 5, 10000)

void call(const char *label, float *src, size_t count, h_sum_t func)
{
Expand Down Expand Up @@ -106,27 +96,13 @@ PTEST_BEGIN("dsp.hmath", hsum, 5, 10000)
{
size_t count = 1 << i;

CALL(generic::h_sum);
IF_ARCH_X86(CALL(sse::h_sum));
IF_ARCH_X86(CALL(avx::h_sum));
IF_ARCH_ARM(CALL(neon_d32::h_sum));
IF_ARCH_AARCH64(CALL(asimd::h_sum));
PTEST_SEPARATOR;

CALL(generic::h_sqr_sum);
IF_ARCH_X86(CALL(sse::h_sqr_sum));
IF_ARCH_X86(CALL(avx::h_sqr_sum));
IF_ARCH_X86(CALL(avx::h_sqr_sum_fma3));
IF_ARCH_ARM(CALL(neon_d32::h_sqr_sum));
IF_ARCH_AARCH64(CALL(asimd::h_sqr_sum));
PTEST_SEPARATOR;

CALL(generic::h_abs_sum);
IF_ARCH_X86(CALL(sse::h_abs_sum));
IF_ARCH_X86(CALL(avx::h_abs_sum));
IF_ARCH_ARM(CALL(neon_d32::h_abs_sum));
IF_ARCH_AARCH64(CALL(asimd::h_abs_sum));
PTEST_SEPARATOR2;
}

free_aligned(data);
Expand Down
Loading

0 comments on commit ee21d94

Please sign in to comment.