From 2c2c7e420cc3b9d5cec498427131e7e6a617eee3 Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Wed, 19 Feb 2020 11:54:20 -0700 Subject: [PATCH 01/12] AVX512_changes Signed-off-by: deeptiag1 --- Source/Lib/ASM_AVX2/CMakeLists.txt | 1 + Source/Lib/ASM_AVX2/EbTransforms_AVX2.h | 6 + .../ASM_AVX2/EbTransforms_Intrinsic_AVX512.c | 193 ++++++ Source/Lib/ASM_SSSE3/CMakeLists.txt | 8 +- Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c | 581 ++++++++++++++++++ Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c | 74 +-- 6 files changed, 824 insertions(+), 39 deletions(-) create mode 100644 Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c create mode 100644 Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c diff --git a/Source/Lib/ASM_AVX2/CMakeLists.txt b/Source/Lib/ASM_AVX2/CMakeLists.txt index 9705a8adc..30286cbbb 100644 --- a/Source/Lib/ASM_AVX2/CMakeLists.txt +++ b/Source/Lib/ASM_AVX2/CMakeLists.txt @@ -79,6 +79,7 @@ set(ASM_AVX2_SOURCE EbNoiseExtractAVX2.c EbPackUnPack_Intrinsic_AVX2.c EbPictureOperators_Intrinsic_AVX2.c + EbTransforms_Intrinsic_AVX512.c EbTransforms_Intrinsic_AVX2.c) if(COMPILE_AS_CPP) diff --git a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h index 30be3e941..6a3299549 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h +++ b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h @@ -11,6 +11,12 @@ extern "C" { #endif +#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN +#ifndef NON_AVX512_SUPPORT +#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX512_INTRIN +#endif + + void QuantizeInvQuantize8x8_AVX2_INTRIN( EB_S16 *coeff, const EB_U32 coeffStride, diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c new file mode 100644 index 000000000..6d5342d42 --- /dev/null +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c @@ -0,0 +1,193 @@ +#include "EbTransforms_AVX2.h" +#include "EbDefinitions.h" + +#include +#include + +#ifndef NON_AVX512_SUPPORT + +#ifdef __GNUC__ +__attribute__((aligned(16))) +#endif +static EB_ALIGN(32) const EB_S16 EbHevcCoeff_tbl_AVX2[48 * 16] = +{ + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18, + 90, 87, 87, 57, 80, 9, 70, -43, 90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25, 57, -80, 43, -90, 25, -70, 9, -25, + 80, 70, 9, -43, -70, -87, -87, 9, 80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57, -25, 90, 57, 25, 90, -80, 43, -57, + 57, 43, -80, -90, -25, 57, 90, 25, 57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80, -9, -87, -87, 70, 43, 9, 70, -80, + 25, 9, -70, -25, 90, 43, -80, -57, 25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90, 43, 70, 9, -80, -57, 87, 87, -90, + 90, 90, 90, 82, 88, 67, 85, 46, 90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54, 82, 22, 78, -4, 73, -31, 67, -54, + 61, -73, 54, -85, 46, -90, 38, -88, 61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13, 31, -78, 22, -61, 13, -38, 4, -13, + 88, 85, 67, 46, 31, -13, -13, 
-67, 88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38, -54, -90, -82, -73, -90, -22, -78, 38, + -46, 82, -4, 88, 38, 54, 73, -4, -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31, 90, -61, 85, -90, 61, -78, 22, -31, + 82, 78, 22, -4, -54, -82, -90, -73, 82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22, -61, 13, 13, 85, 78, 67, 85, -22, + 31, -88, -46, -61, -90, 31, -67, 90, 31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46, 4, 54, 73, -38, 88, -90, 38, -46, + 73, 67, -31, -54, -90, -78, -22, 38, 73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4, 78, 85, 67, -22, -38, -90, -90, 4, + -13, 90, 82, 13, 61, -88, -46, -31, -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61, -88, 82, -4, 46, 85, -73, 54, -61, + 61, 54, -73, -85, -46, -4, 82, 88, 61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13, 31, -46, -88, -61, -13, 82, 90, 13, + -4, -90, -90, 38, 22, 67, 85, -78, -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73, -38, -22, -78, 90, 54, -31, 67, -73, + 46, 38, -90, -88, 38, 73, 54, -4, 46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31, -90, -67, 31, 90, 61, -46, -88, -31, + 22, 85, 67, -78, -85, 13, 13, 61, 22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82, 73, -90, -82, 54, 4, 22, 78, -82, + 31, 22, -78, -61, 90, 85, -61, -90, 31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46, 4, 73, 54, -38, -88, -4, 82, 46, + -38, -78, -22, 90, 73, -82, -90, 54, -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88, 67, -13, -13, -31, -46, 67, 85, -88, + 13, 4, -38, -13, 61, 22, -78, -31, 13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61, 88, 38, -90, -46, 85, 54, -73, -61, + 54, 67, -31, -73, 4, 78, 22, -82, 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90, -46, 85, 67, -88, -82, 90, 90, -90 +}; + +extern void EbHevcTransform32_AVX512_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) +{ + EB_U32 i; + __m128i s0; + __m256i o0; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + + shift &= 0x0000FFFF; // Redundant code to fix Visual Studio 2012 AVX2 compiler error + s0 = _mm_cvtsi32_si128(shift); + o0 = _mm256_set1_epi32(1 << (shift - 1)); + + for (i = 0; i < 16; i++) + { + __m256i x0, x1, x2, x3,sox0,sox5,soxa,soxf,s1x0,s1x5,s1xa,s1xf; + __m256i y0, y1, y2, y3; + __m256i a0, a1, a2, a3, a4, a5, a6, a7; + __m256i b0, b1, b2, b3, b4, b5, b6, b7; + + x0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x00))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x00)), 0x1); + x1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x08))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x08)), 0x1); + x2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x10))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x10)), 0x1); + x3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x18))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x18)), 0x1); + + // 32-point butterfly + x2 = _mm256_shuffle_epi8(x2, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + x3 = _mm256_shuffle_epi8(x3, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 
3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + + y0 = _mm256_add_epi16(x0, x3); + y1 = _mm256_add_epi16(x1, x2); + + y2 = _mm256_sub_epi16(x0, x3); + y3 = _mm256_sub_epi16(x1, x2); + + // 16-point butterfly + y1 = _mm256_shuffle_epi8(y1, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + + x0 = _mm256_add_epi16(y0, y1); + x1 = _mm256_sub_epi16(y0, y1); + + x2 = y2; + x3 = y3; + + + sox0 = _mm256_shuffle_epi32(x0, 0x00); + sox5 = _mm256_shuffle_epi32(x0, 0x55); + soxa = _mm256_shuffle_epi32(x0, 0xaa); + soxf = _mm256_shuffle_epi32(x0, 0xff); + s1x0 = _mm256_shuffle_epi32(x1, 0x00); + s1x5 = _mm256_shuffle_epi32(x1, 0x55); + s1xa = _mm256_shuffle_epi32(x1, 0xaa); + s1xf = _mm256_shuffle_epi32(x1, 0xff); + + a0 = _mm256_madd_epi16(sox0, coeff32[0]); + a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(sox5, coeff32[2])); + a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxa, coeff32[4])); + a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxf, coeff32[6])); + + a1 = _mm256_madd_epi16(sox0, coeff32[1]); + a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(sox5, coeff32[3])); + a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxa, coeff32[5])); + a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxf, coeff32[7])); + + a2 = _mm256_madd_epi16(s1x0, coeff32[8]); + a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1x5, coeff32[10])); + a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xa, coeff32[12])); + a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xf, coeff32[14])); + + a3 = _mm256_madd_epi16(s1x0, coeff32[9]); + a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1x5, coeff32[11])); + a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xa, coeff32[13])); + a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xf, coeff32[15])); + + sox0 = _mm256_shuffle_epi32(x2, 0x00); + sox5 = _mm256_shuffle_epi32(x2, 0x55); + soxa = _mm256_shuffle_epi32(x2, 0xaa); + soxf = _mm256_shuffle_epi32(x2, 0xff); + s1x0 = _mm256_shuffle_epi32(x3, 0x00); + s1x5 = _mm256_shuffle_epi32(x3, 0x55); + s1xa = _mm256_shuffle_epi32(x3, 0xaa); + s1xf = _mm256_shuffle_epi32(x3, 0xff); + + a4 = _mm256_madd_epi16(sox0, coeff32[16]); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(sox5, coeff32[20])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxa, coeff32[24])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxf, coeff32[28])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x0, coeff32[32])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x5, coeff32[36])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1xa, coeff32[40])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1xf, coeff32[44])); + + a5 = _mm256_madd_epi16(sox0, coeff32[17]); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(sox5, coeff32[21])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxa, coeff32[25])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxf, coeff32[29])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x0, coeff32[33])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x5, coeff32[37])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xa, coeff32[41])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xf, coeff32[45])); + + a6 = _mm256_madd_epi16(sox0, coeff32[18]); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(sox5, coeff32[22])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxa, coeff32[26])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxf, coeff32[30])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x0, coeff32[34])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x5, coeff32[38])); + a6 = 
_mm256_add_epi32(a6, _mm256_madd_epi16(s1xa, coeff32[42])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1xf, coeff32[46])); + + a7 = _mm256_madd_epi16(sox0, coeff32[19]); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(sox5, coeff32[23])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxa, coeff32[27])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxf, coeff32[31])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x0, coeff32[35])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x5, coeff32[39])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xa, coeff32[43])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xf, coeff32[47])); + + b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0); + b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0); + b2 = _mm256_sra_epi32(_mm256_add_epi32(a2, o0), s0); + b3 = _mm256_sra_epi32(_mm256_add_epi32(a3, o0), s0); + b4 = _mm256_sra_epi32(_mm256_add_epi32(a4, o0), s0); + b5 = _mm256_sra_epi32(_mm256_add_epi32(a5, o0), s0); + b6 = _mm256_sra_epi32(_mm256_add_epi32(a6, o0), s0); + b7 = _mm256_sra_epi32(_mm256_add_epi32(a7, o0), s0); + + x0 = _mm256_packs_epi32(b0, b1); + x1 = _mm256_packs_epi32(b2, b3); + x2 = _mm256_packs_epi32(b4, b5); + x3 = _mm256_packs_epi32(b6, b7); + + y0 = _mm256_unpacklo_epi16(x0, x1); + y1 = _mm256_unpackhi_epi16(x0, x1); + y2 = x2; + y3 = x3; + x0 = _mm256_unpacklo_epi16(y0, y2); + x1 = _mm256_unpackhi_epi16(y0, y2); + x2 = _mm256_unpacklo_epi16(y1, y3); + x3 = _mm256_unpackhi_epi16(y1, y3); + + y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1); + y1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 0)), _mm256_extracti128_si256(x3, 0), 0x1); + y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1); + y3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 1)), _mm256_extracti128_si256(x3, 1), 0x1); + _mm256_storeu_si256((__m256i *)(dst + 0x00), y0); + _mm256_storeu_si256((__m256i *)(dst + 0x10), y1); + _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x00), y2); + _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x10), y3); + + src += 2 * src_stride; + dst += 2 * dst_stride; + } +} +#endif \ No newline at end of file diff --git a/Source/Lib/ASM_SSSE3/CMakeLists.txt b/Source/Lib/ASM_SSSE3/CMakeLists.txt index 74147970a..9ce313bbc 100644 --- a/Source/Lib/ASM_SSSE3/CMakeLists.txt +++ b/Source/Lib/ASM_SSSE3/CMakeLists.txt @@ -16,6 +16,9 @@ include_directories(${PROJECT_SOURCE_DIR}/Source/API/ set(flags_to_test -mssse3 + -mavx512bw + -mavx512vnni + -mavx512vl -msse4.1 -static-intel) @@ -40,9 +43,9 @@ foreach(cflag ${flags_to_test}) endforeach() if(CMAKE_C_COMPILER_ID STREQUAL "Intel") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w ") if(COMPILE_AS_CPP) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w ") endif() endif() @@ -52,6 +55,7 @@ set(ASM_SSSE3_SOURCE EbDeblockingFilter_Intrinsic_SSSE3.c EbIntraPrediction16bit_Intrinsic_SSSE3.c EbMcp_Intrinsic_SSSE3.c + EbMcp_Intrinsic_AVX512.c EbSaoApplication_Intrinsic_SSSE3.c EbTransforms_Intrinsic_SSSE3.c) diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c new file mode 100644 index 000000000..14fb4ce91 --- /dev/null +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c @@ -0,0 +1,581 @@ +#include "EbMcp_SSSE3.h" +#include "EbDefinitions.h" + +#include 
"emmintrin.h" + +#include "immintrin.h" + +#ifndef NON_AVX512_SUPPORT + + + +const EB_S16 EbHevcLumaFilterCoeff1[4][8] = +{ + { 0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 4,-10, 58, 17, -5, 1, 0}, + {-1, 4,-11, 40, 40,-11, 4, -1}, + { 0, 1, -5, 17, 58,-10, 4, -1} +}; + +static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = + { + { 0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 4,-10, 58, 17, -5, 1, 0}, + {-1, 4,-11, 40, 40,-11, 4, -1}, + { 1, -5, 17, 58,-10, 4, -1, 0} + }; + +void LumaInterpolationFilterOneDOutRawHorizontal_AVX512( + EB_BYTE refPic, + EB_U32 srcStride, + EB_S16 *dst, + EB_U32 puWidth, + EB_U32 puHeight, + EB_U32 fracPosx) +{ + EB_S32 rowCount, colCount; + __m128i c0, c1, c2, c3; // coeffs + __m128i a0, a1; + __m128i b0; + __m128i sum; + EB_BYTE ptr; + + refPic -= 3; + + PrefetchBlock(refPic, srcStride, (puWidth == 4) ? 16 : puWidth+8, (puWidth == 4) ? ((puHeight+1)&~1) : puHeight); + + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff1[fracPosx]); + c0 = _mm_packs_epi16(c0, c0); + __m128i ct = _mm_srli_epi64(c0, 32); + __m512i cc0 = _mm512_broadcastd_epi32(c0); + __m512i cc1 = _mm512_broadcastd_epi32(ct); + c0 = _mm_unpacklo_epi16(c0, c0); + c3 = _mm_shuffle_epi32(c0, 0xff); + c2 = _mm_shuffle_epi32(c0, 0xaa); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + __m512i b1 = _mm512_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); + __m512i b2 = _mm512_set_epi8(14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4); + + + if (puWidth & 4) + { + ptr = refPic; + rowCount = puHeight; + do + { + a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + a1 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + b0 = _mm_unpacklo_epi64(a0, a1); + sum = _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c0); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c1)); + b0 = _mm_unpacklo_epi64(_mm_srli_si128(a0, 4), _mm_srli_si128(a1, 4)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c2)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); + + sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); + + _mm_storeu_si128((__m128i *)dst, sum); + dst += 8; + + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + refPic += 4; + } + + colCount = puWidth; + int rowLoop = puHeight >>1 ;//divide by 2 + int evenRow = puHeight & 1; + do + { + ptr = refPic; + // rowCount = puHeight; + int rowCount = rowLoop ;//divide by 2 + do + { + __m512i a1 = _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr))); + __m256i b0 = _mm256_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr + srcStride))); ptr += 2 * srcStride; + __m512i s1 = _mm512_inserti64x4(a1, b0, 1); + __m512i sh2 = _mm512_shuffle_epi8(s1, b1); + __m512i sh3 = _mm512_shuffle_epi8(s1, b2); + __m512i sum00 = _mm512_setzero_si512(); + __m512i sum0 = _mm512_dpbusds_epi32(sum00, sh2, cc0); + __m512i sum1 = _mm512_dpbusds_epi32(sum0, sh3, cc1); + __m512i f1 = 
_mm512_packs_epi32(sum1,sum1);// + __m512i f2 = _mm512_permutexvar_epi64( _mm512_setr_epi64(0x0, 0x0000000000000002, 0x0000000000000004, 0x0000000000000006, 0x0, 0x0002000200020002, 0x0004000400040004, 0x0006000600060006), f1); + f2 = _mm512_sub_epi16(f2, _mm512_set1_epi16(128 * 64)); + _mm256_storeu_si256((__m256i*)dst, _mm512_castsi512_si256(f2)); + dst += 16; + rowCount = rowCount - 1; + } + while (rowCount > 0); + + if (evenRow) + { + a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + + sum = _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8)), c0); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10)), c1)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12)), c2)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); + + sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); + + _mm_storeu_si128((__m128i *)dst, sum); + dst += 8; + } + + refPic += 8; + colCount -= 8; + } + while (colCount > 0); +} + +void EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) +{ + EB_S32 rowCount, colCount; + __m128i c0, c1, c2; + __m128i a0, a1, a2, a3, a4, a5, a6; + __m128i sum0 , sum1; + __m128i b0l, b0h, b1l, b1h, b2l, b2h; + + EB_BYTE qtr; + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[fracPosy]); + c2 = _mm_shuffle_epi32(c0, 0xaa); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + + if (puWidth & 4) + { + rowCount = puHeight; + + qtr = dst; + + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a0 = _mm_sub_epi16(a0, a6); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + + + b0l = _mm_unpacklo_epi16(a0, a1); + b0h = _mm_unpackhi_epi16(a0, a1); + b1l = _mm_unpacklo_epi16(a2, a3); + b1h = _mm_unpackhi_epi16(a2, a3); + b2l = _mm_unpacklo_epi16(a4, a5); + b2h = _mm_unpackhi_epi16(a4, a5); + + sum0 = _mm_dpwssd_epi32(sum0, b0l, c0); + sum1 = _mm_dpwssd_epi32(sum1, b0h, c0); + sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); + sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); + sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); + sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 0); qtr += dstStride; + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 1); qtr += dstStride; + + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += (fracPosy == 2) ? 
32 : 24; + dst += 4; + } + + colCount = puWidth; + do + { + EB_BYTE qtr = dst; + + rowCount = puHeight; + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a0 = _mm_sub_epi16(a0, a6); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + + b0l = _mm_unpacklo_epi16(a0, a1); + b0h = _mm_unpackhi_epi16(a0, a1); + b1l = _mm_unpacklo_epi16(a2, a3); + b1h = _mm_unpackhi_epi16(a2, a3); + b2l = _mm_unpacklo_epi16(a4, a5); + b2h = _mm_unpackhi_epi16(a4, a5); + + sum0 = _mm_dpwssd_epi32(sum0, b0l, c0); + sum1 = _mm_dpwssd_epi32(sum1, b0h, c0); + sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); + sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); + sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); + sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dstStride; + + firstPassIFDst += 8; + rowCount--; + } + while (rowCount > 0); + + firstPassIFDst += (fracPosy == 2) ? 56 : 48; + dst += 8; + colCount -= 8; + } + while (colCount > 0); +} + +void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) +{ + EB_S32 rowCount, colCount; + __m128i a0, a1, a2, a3, a4, a5, a6; + __m128i c0, c1, c2; + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[fracPosy]); + c2 = _mm_shuffle_epi32(c0, 0xaa); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + if (puWidth & 4) + { + rowCount = puHeight; + + do + { + __m128i sum0, sum1; + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a0 = _mm_sub_epi16(a0, a6); + + sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); + + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a4, a5), c2); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a4, a5), c2); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); + sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += (fracPosy == 2) ? 
32 : 24; + } + + colCount = puWidth; + do + { + rowCount = puHeight; + do + { + __m128i b0l, b0h, b1l, b1h, b2l, b2h; + __m128i sum0, sum1; + + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a0 = _mm_sub_epi16(a0, a6); + + b0l = _mm_unpacklo_epi16(a0, a1); + b0h = _mm_unpackhi_epi16(a0, a1); + b1l = _mm_unpacklo_epi16(a2, a3); + b1h = _mm_unpackhi_epi16(a2, a3); + b2l = _mm_unpacklo_epi16(a4, a5); + b2h = _mm_unpackhi_epi16(a4, a5); + + sum0 = _mm_madd_epi16(b0l, c0); + sum1 = _mm_madd_epi16(b0h, c0); + + sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); + sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); + sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); + sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); + sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + + firstPassIFDst += 8; + rowCount--; + } + while (rowCount > 0); + + firstPassIFDst += (fracPosy == 2) ? 56 : 48; + colCount -= 8; + } + while (colCount > 0); +} + +void EbHevcLumaInterpolationFilterTwoDInRawM_AVX512(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight) +{ + EB_S32 rowCount, colCount; + + __m128i c0, c1; + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i sum0, sum1; + + EB_BYTE qtr; + + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[2]); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + + + if (puWidth & 4) + { + rowCount = puHeight; + qtr = dst; + + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*4)); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 0); qtr += dstStride; + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 1); qtr += dstStride; + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += 32; + dst += 4; + } + + colCount = puWidth; + do + { + qtr = dst; + + rowCount = puHeight; + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i 
*)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*8)); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dstStride; + firstPassIFDst += 8; + } + while (--rowCount > 0); + + firstPassIFDst += 56; + dst += 8; + colCount -= 8; + } + while (colCount > 0); +} + +void EbHevcLumaInterpolationFilterTwoDInRawOutRawM_AVX512(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight) +{ + EB_S32 rowCount, colCount; + + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i c0, c1; + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[2]); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + if (puWidth & 4) + { + rowCount = puHeight; + + do + { + __m128i sum0, sum1; + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*4)); + + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); + sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += 32; + } + + colCount = puWidth; + do + { + rowCount = puHeight; + do + { + __m128i sum0, sum1; + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*8)); + + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); 
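+ // Pack the two 32-bit accumulators, built with _mm_dpwssd_epi32 (VPDPWSSD, AVX512VL + AVX512-VNNI,
+ // which fuses a 16-bit multiply-add and a 32-bit add into one instruction), down to signed 16-bit raw output below.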
+ sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + firstPassIFDst += 8; + } + while (--rowCount > 0); + + firstPassIFDst += 56; + colCount -= 8; + } + while (colCount > 0); +} + +#endif diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c index e23e88a20..5ffc364bd 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c @@ -66,7 +66,7 @@ static void _mm_storeh_epi64(__m128i * p, __m128i x) _mm_storeh_pd((double *)p, _mm_castsi128_pd(x)); } -static void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) +void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) { #if PREFETCH EB_U32 rowCount = blkHeight; @@ -2313,8 +2313,8 @@ void LumaInterpolationFilterPose_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); - EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2329,8 +2329,8 @@ void LumaInterpolationFilterPosf_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); - EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } void LumaInterpolationFilterPosg_SSSE3( @@ -2342,8 +2342,8 @@ void LumaInterpolationFilterPosg_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); - EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } void LumaInterpolationFilterPosi_SSSE3( @@ -2355,8 +2355,8 @@ void LumaInterpolationFilterPosi_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); - EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2370,8 +2370,8 @@ void LumaInterpolationFilterPosj_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); - EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + 
EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } void LumaInterpolationFilterPosk_SSSE3( @@ -2383,8 +2383,8 @@ void LumaInterpolationFilterPosk_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); - EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); + EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } void LumaInterpolationFilterPosp_SSSE3( @@ -2396,8 +2396,8 @@ void LumaInterpolationFilterPosp_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); - EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } void LumaInterpolationFilterPosq_SSSE3( @@ -2411,8 +2411,8 @@ void LumaInterpolationFilterPosq_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); - EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } void LumaInterpolationFilterPosr_SSSE3( @@ -2424,8 +2424,8 @@ void LumaInterpolationFilterPosr_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); - EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2452,7 +2452,7 @@ void LumaInterpolationFilterPosaOutRaw_SSSE3( EB_S16 *firstPassIFDst) { (void)firstPassIFDst; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 1); } void LumaInterpolationFilterPosbOutRaw_SSSE3( @@ -2465,7 +2465,7 @@ void LumaInterpolationFilterPosbOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 2); } void LumaInterpolationFilterPoscOutRaw_SSSE3( @@ -2478,7 +2478,7 @@ void LumaInterpolationFilterPoscOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); + 
LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 3); } void LumaInterpolationFilterPosdOutRaw_SSSE3( @@ -2821,8 +2821,8 @@ void LumaInterpolationFilterPoseOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); - EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(firstPassIFDst, dst, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } void LumaInterpolationFilterPosfOutRaw_SSSE3( @@ -2835,8 +2835,8 @@ void LumaInterpolationFilterPosfOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); - EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(firstPassIFDst, dst, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } void LumaInterpolationFilterPosgOutRaw_SSSE3( @@ -2847,8 +2847,8 @@ void LumaInterpolationFilterPosgOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); - EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(firstPassIFDst, dst, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } void LumaInterpolationFilterPosiOutRaw_SSSE3( @@ -2859,7 +2859,7 @@ void LumaInterpolationFilterPosiOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2871,7 +2871,7 @@ void LumaInterpolationFilterPosjOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2883,7 +2883,7 @@ void LumaInterpolationFilterPoskOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2895,8 +2895,8 @@ void LumaInterpolationFilterPospOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); - 
EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(firstPassIFDst, dst, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } void LumaInterpolationFilterPosqOutRaw_SSSE3( @@ -2909,8 +2909,8 @@ void LumaInterpolationFilterPosqOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); - EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(firstPassIFDst, dst, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } void LumaInterpolationFilterPosrOutRaw_SSSE3( @@ -2921,6 +2921,6 @@ void LumaInterpolationFilterPosrOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); - EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(firstPassIFDst, dst, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } From 157cbe8d3cf81caba38fa2f7473dce05aec7911d Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Wed, 19 Feb 2020 13:37:54 -0700 Subject: [PATCH 02/12] AVX512_changes_P2 Signed-off-by: deeptiag1 --- Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h index ba5fce2e4..d58cb8591 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h +++ b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h @@ -12,6 +12,18 @@ extern "C" { #endif +#define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3 +#define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 +#define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3 +#define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3 +#ifndef NON_AVX512_SUPPORT +#define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_AVX512 +#define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512 +#define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512 +#define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_AVX512 +#endif + +void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight); // SSSE3 functions void ChromaInterpolationCopy_SSSE3(EB_BYTE refPic, EB_U32 srcStride, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_S16 *firstPassIFDst, EB_U32 fracPosx, EB_U32 fracPosy); void ChromaInterpolationFilterOneDHorizontal_SSSE3(EB_BYTE refPic, EB_U32 srcStride, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_S16 *firstPassIFDst, EB_U32 fracPosx, EB_U32 fracPosy); From 4093056a01175240c241f857475a3500d50ab4ef Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Wed, 19 Feb 2020 13:42:19 -0700 Subject: [PATCH 03/12] AVX512_changes_P3 Signed-off-by: 
deeptiag1 --- .../EbComputeSAD_SadLoopKernel_AVX512.c | 25 +++++++++++-------- .../ASM_AVX2/EbTransforms_Intrinsic_AVX2.c | 4 +-- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Source/Lib/ASM_AVX2/EbComputeSAD_SadLoopKernel_AVX512.c b/Source/Lib/ASM_AVX2/EbComputeSAD_SadLoopKernel_AVX512.c index 1d530b724..25cad707a 100644 --- a/Source/Lib/ASM_AVX2/EbComputeSAD_SadLoopKernel_AVX512.c +++ b/Source/Lib/ASM_AVX2/EbComputeSAD_SadLoopKernel_AVX512.c @@ -209,6 +209,8 @@ void SadLoopKernel_AVX512_HmeL0_INTRIN( case 16: { + __m512i x = _mm512_setr_epi64(0x0000000000000000, 0x0001000100010001, 0x0004000400040004, 0x0005000500050005, 0x0001000100010001, 0x0002000200020002, 0x0005000500050005, 0x0006000600060006); + __m512i x1 = _mm512_setr_epi64(0x0001000100010001, 0x0002000200020002, 0x0005000500050005, 0x0006000600060006, 0x0002000200020002, 0x0003000300030003, 0x0006000600060006, 0x0007000700070007); if (height <= 16 && searchAreaWidth <= 128) { for (i = 0; i Date: Wed, 19 Feb 2020 19:55:50 -0700 Subject: [PATCH 04/12] AVX512_changes_P4 Signed-off-by: deeptiag1 --- Source/Lib/ASM_AVX2/EbTransforms_AVX2.h | 4 +- .../ASM_AVX2/EbTransforms_Intrinsic_AVX2.c | 233 +++++++++--------- .../ASM_AVX2/EbTransforms_Intrinsic_AVX512.c | 84 ++++--- Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h | 3 +- 4 files changed, 168 insertions(+), 156 deletions(-) diff --git a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h index 6a3299549..6538c3d07 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h +++ b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h @@ -11,12 +11,14 @@ extern "C" { #endif +#ifdef NON_AVX512_SUPPORT #define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN -#ifndef NON_AVX512_SUPPORT +#else NON_AVX512_SUPPORT #define EbHevcTransform32_INTRIN EbHevcTransform32_AVX512_INTRIN #endif + void QuantizeInvQuantize8x8_AVX2_INTRIN( EB_S16 *coeff, const EB_U32 coeffStride, diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c index 8b4150c1c..09432f1f3 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c @@ -13,7 +13,7 @@ #ifdef __GNUC__ __attribute__((aligned(16))) #endif -EB_ALIGN(32) const EB_S16 EbHevcCoeff_tbl_AVX2[48 * 16] = +EB_ALIGN(32) const EB_S16 coeff_tbl_AVX2[48 * 16] = { 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, @@ -282,7 +282,7 @@ void QuantizeInvQuantize8x8_AVX2_INTRIN( // transpose 16x16 block of data -void EbHevcTranspose16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) +void transpose16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) { EB_U32 i; for (i = 0; i < 2; i++) @@ -346,7 +346,7 @@ void EbHevcTranspose16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, } // transpose 32x32 block of data -void EbHevcTranspose32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) +void transpose32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) { EB_U32 i, j; for (i = 0; i < 4; i++) @@ -553,7 +553,7 @@ void transform16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i 
*coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; for (i = 0; i < 16; i += 2) { @@ -610,12 +610,12 @@ void transform16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 } // 32-point forward transform (32 rows) -void EbHevcTransform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) +void transform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) { - EB_U32 i; + EB_U32 i; __m128i s0; __m256i o0; - const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; shift &= 0x0000FFFF; // Redundant code to fix Visual Studio 2012 AVX2 compiler error s0 = _mm_cvtsi32_si128(shift); @@ -626,6 +626,7 @@ void EbHevcTransform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, __m256i x0, x1, x2, x3; __m256i y0, y1, y2, y3; __m256i a0, a1, a2, a3, a4, a5, a6, a7; + __m256i aa4, aa5, aa6, aa7; __m256i b0, b1, b2, b3, b4, b5, b6, b7; x0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x00))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x00)), 0x1); @@ -653,60 +654,65 @@ void EbHevcTransform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, x3 = y3; a0 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[0]); - a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[2])); - a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[4])); - a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[6])); + a0 = _mm256_dpwssd_epi32(a0, _mm256_shuffle_epi32(x0, 0x55), coeff32[2]); + a0 = _mm256_dpwssd_epi32(a0, _mm256_shuffle_epi32(x0, 0xaa), coeff32[4]); + a0 = _mm256_dpwssd_epi32(a0, _mm256_shuffle_epi32(x0, 0xff), coeff32[6]); a1 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[1]); - a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[3])); - a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[5])); - a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[7])); + a1 = _mm256_dpwssd_epi32(a1, _mm256_shuffle_epi32(x0, 0x55), coeff32[3]); + a1 = _mm256_dpwssd_epi32(a1, _mm256_shuffle_epi32(x0, 0xaa), coeff32[5]); + a1 = _mm256_dpwssd_epi32(a1, _mm256_shuffle_epi32(x0, 0xff), coeff32[7]); a2 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[8]); - a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[10])); - a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[12])); - a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[14])); + a2 = _mm256_dpwssd_epi32(a2, _mm256_shuffle_epi32(x1, 0x55), coeff32[10]); + a2 = _mm256_dpwssd_epi32(a2, _mm256_shuffle_epi32(x1, 0xaa), coeff32[12]); + a2 = _mm256_dpwssd_epi32(a2, _mm256_shuffle_epi32(x1, 0xff), coeff32[14]); a3 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[9]); - a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[11])); - a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[13])); - a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[15])); + a3 = _mm256_dpwssd_epi32(a3, _mm256_shuffle_epi32(x1, 0x55), coeff32[11]); + a3 = _mm256_dpwssd_epi32(a3, _mm256_shuffle_epi32(x1, 0xaa), 
coeff32[13]); + a3 = _mm256_dpwssd_epi32(a3, _mm256_shuffle_epi32(x1, 0xff), coeff32[15]); a4 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[16]); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[20])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[24])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[28])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[32])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[36])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[40])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[44])); + a4 = _mm256_dpwssd_epi32(a4, _mm256_shuffle_epi32(x2, 0x55), coeff32[20]); + a4 = _mm256_dpwssd_epi32(a4, _mm256_shuffle_epi32(x2, 0xaa), coeff32[24]); + a4 = _mm256_dpwssd_epi32(a4, _mm256_shuffle_epi32(x2, 0xff), coeff32[28]); + aa4 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[32]); + aa4 = _mm256_dpwssd_epi32(aa4, _mm256_shuffle_epi32(x3, 0x55), coeff32[36]); + aa4 = _mm256_dpwssd_epi32(aa4, _mm256_shuffle_epi32(x3, 0xaa), coeff32[40]); + aa4 = _mm256_dpwssd_epi32(aa4, _mm256_shuffle_epi32(x3, 0xff), coeff32[44]); + a4 = _mm256_add_epi32( a4, aa4); a5 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[17]); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[21])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[25])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[29])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[33])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[37])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[41])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[45])); + a5 = _mm256_dpwssd_epi32(a5, _mm256_shuffle_epi32(x2, 0x55), coeff32[21]); + a5 = _mm256_dpwssd_epi32(a5, _mm256_shuffle_epi32(x2, 0xaa), coeff32[25]); + a5 = _mm256_dpwssd_epi32(a5, _mm256_shuffle_epi32(x2, 0xff), coeff32[29]); + aa5 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[33]); + aa5 = _mm256_dpwssd_epi32(aa5, _mm256_shuffle_epi32(x3, 0x55), coeff32[37]); + aa5 = _mm256_dpwssd_epi32(aa5, _mm256_shuffle_epi32(x3, 0xaa), coeff32[41]); + aa5 = _mm256_dpwssd_epi32(aa5, _mm256_shuffle_epi32(x3, 0xff), coeff32[45]); + a5 = _mm256_add_epi32( a5, aa5); a6 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[18]); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[22])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[26])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[30])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[34])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[38])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[42])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[46])); + a6 = _mm256_dpwssd_epi32(a6, _mm256_shuffle_epi32(x2, 0x55), coeff32[22]); + a6 = _mm256_dpwssd_epi32(a6, _mm256_shuffle_epi32(x2, 0xaa), coeff32[26]); 
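+ // _mm256_dpwssd_epi32 (VPDPWSSD, AVX512VL + AVX512-VNNI) accumulates each 16-bit dot product in place,
+ // replacing the previous _mm256_add_epi32(_mm256_madd_epi16(...)) pairs in this accumulation chain.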
+ a6 = _mm256_dpwssd_epi32(a6, _mm256_shuffle_epi32(x2, 0xff), coeff32[30]); + aa6 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[34]); + aa6 = _mm256_dpwssd_epi32(aa6, _mm256_shuffle_epi32(x3, 0x55), coeff32[38]); + aa6 = _mm256_dpwssd_epi32(aa6, _mm256_shuffle_epi32(x3, 0xaa), coeff32[42]); + aa6 = _mm256_dpwssd_epi32(aa6, _mm256_shuffle_epi32(x3, 0xff), coeff32[46]); + a6 = _mm256_add_epi32( a6, aa6); a7 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[19]); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[23])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[27])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[31])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[35])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[39])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[43])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[47])); + a7 = _mm256_dpwssd_epi32(a7, _mm256_shuffle_epi32(x2, 0x55), coeff32[23]); + a7 = _mm256_dpwssd_epi32(a7, _mm256_shuffle_epi32(x2, 0xaa), coeff32[27]); + a7 = _mm256_dpwssd_epi32(a7, _mm256_shuffle_epi32(x2, 0xff), coeff32[31]); + aa7 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[35]); + aa7 = _mm256_dpwssd_epi32(aa7, _mm256_shuffle_epi32(x3, 0x55), coeff32[39]); + aa7 = _mm256_dpwssd_epi32(aa7, _mm256_shuffle_epi32(x3, 0xaa), coeff32[43]); + aa7 = _mm256_dpwssd_epi32(aa7, _mm256_shuffle_epi32(x3, 0xff), coeff32[47]); + a7 = _mm256_add_epi32( a7, aa7); + b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0); b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0); @@ -755,7 +761,7 @@ void Pfreq1DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; for (i = 0; i < 32; i += 2) { @@ -889,7 +895,7 @@ void Pfreq2DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; for (i = 0; i < 16; i += 2) { @@ -1038,7 +1044,7 @@ void PfreqN41DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; for (i = 0; i < 32; i += 2) { @@ -1175,7 +1181,7 @@ void PfreqN42DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; //for (i = 0; i < 16; i += 2) for (i = 0; i < 8; i += 2) @@ -1291,8 +1297,8 @@ void PfreqN42DTransform32_AVX2_INTRIN( //x2 = _mm256_unpacklo_epi16(y1, y3); //x3 = _mm256_unpackhi_epi16(y1, y3); - //---// y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1); - //---//y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1); + //---// y0 = 
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1); + //---//y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1); //---// _mm256_storeu_si256((__m256i *)(dst + i*dst_stride), y0); //---// _mm256_storeu_si256((__m256i *)(dst + (i + 1)*dst_stride), y2); @@ -1458,20 +1464,20 @@ void PfreqN4Transform32x32_AVX2_INTRIN( EB_EXTERN void lowPrecisionTransform16x16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift) { transform16_AVX2_INTRIN(src, src_stride, intermediate, 16, (EB_S16)(4 + addshift)); - EbHevcTranspose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); + transpose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); transform16_AVX2_INTRIN(dst, dst_stride, intermediate, 16, 9); - EbHevcTranspose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); + transpose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); } // forward 32x32 transform EB_EXTERN void lowPrecisionTransform32x32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift) { - EbHevcTransform32_INTRIN(src, src_stride, intermediate, 32, 6 + addshift); - EbHevcTranspose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); - EbHevcTransform32_INTRIN(dst, dst_stride, intermediate, 32, 9); - EbHevcTranspose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); + transform32_AVX2_INTRIN(src, src_stride, intermediate, 32, 6 + addshift); + transpose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); + transform32_AVX2_INTRIN(dst, dst_stride, intermediate, 32, 9); + transpose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); } void MatMult4x4_OutBuff_AVX2_INTRIN( @@ -1480,10 +1486,10 @@ void MatMult4x4_OutBuff_AVX2_INTRIN( EB_S16* coeffOut, const EB_U32 coeffOutStride, const EB_U16 *maskingMatrix, - const EB_U32 maskingMatrixStride, - const EB_U32 computeSize, - const EB_S32 offset, - const EB_S32 shiftNum, + const EB_U32 maskingMatrixStride, + const EB_U32 computeSize, + const EB_S32 offset, + const EB_S32 shiftNum, EB_U32* nonzerocoeff) { @@ -1549,7 +1555,7 @@ void MatMult4x4_OutBuff_AVX2_INTRIN( void MatMult4x4_AVX2_INTRIN( EB_S16* coeff, const EB_U32 coeffStride, - const EB_U16 *maskingMatrix, + const EB_U16 *maskingMatrix, const EB_U32 maskingMatrixStride, //Matrix size const EB_U32 computeSize, //Computation area size const EB_S32 offset, //(PMP_MAX >> 1) @@ -1563,52 +1569,52 @@ void MatMult4x4_AVX2_INTRIN( (void)computeSize; coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = offsetREG = _mm256_setzero_si256(); - + // prepare Shift REG __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (EB_S16)shiftNum); //_mm_set1_epi16((EB_U16)shiftNum);//_mm_set1_epi32(shiftNum); - + //prepare the offset offsetREG = _mm256_set1_epi32(offset); - + //load maskingMatrix_new MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)maskingMatrix), _mm_loadl_epi64((__m128i*)(maskingMatrix + maskingMatrixStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(maskingMatrix + 2 * maskingMatrixStride)), _mm_loadl_epi64((__m128i*)(maskingMatrix + 3 * maskingMatrixStride))), 0x1); - + //load coefftemp a = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)coeff), _mm_loadl_epi64((__m128i*)(coeff + coeffStride))); // 1st and 2nd row of the 4x4 block b = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(coeff + 2 * coeffStride)), 
_mm_loadl_epi64((__m128i*)(coeff + 3 * coeffStride))); // 3rd and 4th row of the 4x4 block coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256( a),b,0x1); // the 4x4 block is now loaded - + coeffTempORG = coeffTemp; //Absolute val coeffTemp = _mm256_abs_epi16(coeffTemp); - + a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix); a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix); - - + + b0 = _mm256_unpacklo_epi16(a0, a1); b1 = _mm256_unpackhi_epi16(a0, a1); - + b0 = _mm256_add_epi32(b0, offsetREG); b1 = _mm256_add_epi32(b1, offsetREG); - + //Shift right by PMP_PRECISION_REG b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG); b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG); - + //coefftemp in c ymm_computed = _mm256_packs_epi32(b0, b1);//Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst. z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); //coeffTemp = (coeff[coeffLocation] < 0)? -coeffTemp : coeffTemp; ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG);// coeffTemp); - + a = _mm256_extracti128_si256(ymm_computed, 0); b = _mm256_extracti128_si256(ymm_computed, 1); _mm_storel_epi64((__m128i *)coeff, a); _mm_storel_epi64((__m128i *)(coeff + coeffStride), _mm_srli_si128(a, 8)); _mm_storel_epi64((__m128i *)(coeff + 2 * coeffStride), b); _mm_storel_epi64((__m128i *)(coeff + 3 * coeffStride), _mm_srli_si128(b, 8)); - + z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 8)); *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); @@ -1633,15 +1639,15 @@ void MatMult8x8_AVX2_INTRIN( // prepare Shift REG __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (EB_S16)shiftNum);//_mm_set1_epi32(shiftNum); - + //prepare the offset - __m256i offsetREG = _mm256_set1_epi32(offset); + __m256i offsetREG = _mm256_set1_epi32(offset); row = 0; do { - + //load maskingMatrix_new MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride*row))), _mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride*(row + 1))), 0x1); - + //load coefftemp coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(coeff + coeffStride*row))), _mm_loadu_si128((__m128i*)(coeff + coeffStride*(row + 1))), 0x1); @@ -1649,38 +1655,38 @@ void MatMult8x8_AVX2_INTRIN( coeffTempORG = coeffTemp; //Absolute val coeffTemp = _mm256_abs_epi16(coeffTemp); - + //Multiply a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix); a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix); - - + + b0 = _mm256_unpacklo_epi16(a0, a1); b1 = _mm256_unpackhi_epi16(a0, a1); - + //Add b0 = _mm256_add_epi32(b0, offsetREG); b1 = _mm256_add_epi32(b1, offsetREG); - + //Shift right by PMP_PRECISION_REG b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG); b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG); - + //coefftemp in c ymm_computed = _mm256_packs_epi32(b0, b1);//Convert packed 32-bit integers from b0 and b1 to packed 16-bit integers using signed saturation, and store the results in dst. z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); //coeffTemp = (coeff[coeffLocation] < 0)? 
-coeffTemp : coeffTemp; - + ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG);// coeffTemp); - + _mm_storeu_si128((__m128i *)(coeff + coeffStride*row), _mm256_extracti128_si256(ymm_computed, 0)); _mm_storeu_si128((__m128i *)(coeff + coeffStride*(row + 1)), _mm256_extracti128_si256(ymm_computed, 1)); - + row += 2; } while (row < computeSize); - + z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7)); *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); - + } /***************************************MatMultNxN_AVX2_INTRIN****************************************************/ void MatMultNxN_AVX2_INTRIN( @@ -1693,67 +1699,68 @@ void MatMultNxN_AVX2_INTRIN( const EB_S32 shiftNum, //PMP_PRECISION EB_U32* nonzerocoeff) { - + unsigned row,col; __m256i z = _mm256_setzero_si256(); //__m128i a, b; __m256i coeffTemp,a0,a1,b0,b1,ymm_computed,MaskingMatrix,coeffTempORG; coeffTemp = a0 = a1 = b0 = b1 = ymm_computed =MaskingMatrix = _mm256_setzero_si256(); - + // prepare Shift REG __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (EB_S16)shiftNum);//_mm_set1_epi32(shiftNum); - + //prepare the offset - __m256i offsetREG = _mm256_set1_epi32(offset); - + __m256i offsetREG = _mm256_set1_epi32(offset); + row = 0; do { col = 0; do { - + //load coefftemp coeffTemp = _mm256_loadu_si256((__m256i *)(coeff + coeffStride*row + col)); - + //load maskingMatrix_new MaskingMatrix = _mm256_loadu_si256((__m256i *) (maskingMatrix + maskingMatrixStride*row + col)); - + coeffTempORG = coeffTemp; - + //Absolute val coeffTemp = _mm256_abs_epi16(coeffTemp); - + //Multiply a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix); a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix); - - + + b0 = _mm256_unpacklo_epi16(a0, a1); b1 = _mm256_unpackhi_epi16(a0, a1); - + //Add b0 = _mm256_add_epi32(b0, offsetREG); b1 = _mm256_add_epi32(b1, offsetREG); - + //Shift right by PMP_PRECISION_REG b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG); b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG); - + //coefftemp in c ymm_computed = _mm256_packs_epi32(b0, b1);//Convert packed 32-bit integers from b0 and b1 to packed 16-bit integers using signed saturation, and store the results in dst. z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); //coeffTemp = (coeff[coeffLocation] < 0)? 
-coeffTemp : coeffTemp; - + ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG);// coeffTemp); - + _mm256_storeu_si256((__m256i *)(coeff + coeffStride*row + col), ymm_computed); - + col += 16; } while (col < computeSize); row++; } while (row < computeSize); - + z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7)); - *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); - + *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); + } + diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c index 6d5342d42..e7ec6815a 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c @@ -52,6 +52,7 @@ extern void EbHevcTransform32_AVX512_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S { __m256i x0, x1, x2, x3,sox0,sox5,soxa,soxf,s1x0,s1x5,s1xa,s1xf; __m256i y0, y1, y2, y3; + __m256i aa4, aa5, aa6, aa7; __m256i a0, a1, a2, a3, a4, a5, a6, a7; __m256i b0, b1, b2, b3, b4, b5, b6, b7; @@ -90,24 +91,25 @@ extern void EbHevcTransform32_AVX512_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S s1xf = _mm256_shuffle_epi32(x1, 0xff); a0 = _mm256_madd_epi16(sox0, coeff32[0]); - a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(sox5, coeff32[2])); - a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxa, coeff32[4])); - a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxf, coeff32[6])); + a0 = _mm256_dpwssd_epi32(a0, sox5, coeff32[2]); + a0 = _mm256_dpwssd_epi32(a0, soxa, coeff32[4]); + a0 = _mm256_dpwssd_epi32(a0, soxf, coeff32[6]); + a1 = _mm256_madd_epi16(sox0, coeff32[1]); - a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(sox5, coeff32[3])); - a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxa, coeff32[5])); - a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxf, coeff32[7])); + a1 = _mm256_dpwssd_epi32(a1, sox5, coeff32[3]); + a1 = _mm256_dpwssd_epi32(a1, soxa, coeff32[5]); + a1 = _mm256_dpwssd_epi32(a1, soxf, coeff32[7]); a2 = _mm256_madd_epi16(s1x0, coeff32[8]); - a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1x5, coeff32[10])); - a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xa, coeff32[12])); - a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xf, coeff32[14])); + a2 = _mm256_dpwssd_epi32(a2, s1x5, coeff32[10]); + a2 = _mm256_dpwssd_epi32(a2, s1xa, coeff32[12]); + a2 = _mm256_dpwssd_epi32(a2, s1xf, coeff32[14]); a3 = _mm256_madd_epi16(s1x0, coeff32[9]); - a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1x5, coeff32[11])); - a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xa, coeff32[13])); - a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xf, coeff32[15])); + a3 = _mm256_dpwssd_epi32(a3, s1x5, coeff32[11]); + a3 = _mm256_dpwssd_epi32(a3, s1xa, coeff32[13]); + a3 = _mm256_dpwssd_epi32(a3, s1xf, coeff32[15]); sox0 = _mm256_shuffle_epi32(x2, 0x00); sox5 = _mm256_shuffle_epi32(x2, 0x55); @@ -117,42 +119,42 @@ extern void EbHevcTransform32_AVX512_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S s1x5 = _mm256_shuffle_epi32(x3, 0x55); s1xa = _mm256_shuffle_epi32(x3, 0xaa); s1xf = _mm256_shuffle_epi32(x3, 0xff); - + a4 = _mm256_madd_epi16(sox0, coeff32[16]); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(sox5, coeff32[20])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxa, coeff32[24])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxf, coeff32[28])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x0, coeff32[32])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x5, coeff32[36])); - a4 = _mm256_add_epi32(a4, 
_mm256_madd_epi16(s1xa, coeff32[40])); - a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1xf, coeff32[44])); + a4 = _mm256_dpwssd_epi32(a4, sox5, coeff32[20]); + a4 = _mm256_dpwssd_epi32(a4, soxa, coeff32[24]); + a4 = _mm256_dpwssd_epi32(a4, soxf, coeff32[28]); + a4 = _mm256_dpwssd_epi32(a4, s1x0, coeff32[32]); + a4 = _mm256_dpwssd_epi32(a4, s1x5, coeff32[36]); + a4 = _mm256_dpwssd_epi32(a4, s1xa, coeff32[40]); + a4 = _mm256_dpwssd_epi32(a4, s1xf, coeff32[44]); a5 = _mm256_madd_epi16(sox0, coeff32[17]); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(sox5, coeff32[21])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxa, coeff32[25])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxf, coeff32[29])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x0, coeff32[33])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x5, coeff32[37])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xa, coeff32[41])); - a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xf, coeff32[45])); + a5 = _mm256_dpwssd_epi32(a5, sox5, coeff32[21]); + a5 = _mm256_dpwssd_epi32(a5, soxa, coeff32[25]); + a5 = _mm256_dpwssd_epi32(a5, soxf, coeff32[29]); + a5 = _mm256_dpwssd_epi32(a5, s1x0, coeff32[33]); + a5 = _mm256_dpwssd_epi32(a5, s1x5, coeff32[37]); + a5 = _mm256_dpwssd_epi32(a5, s1xa, coeff32[41]); + a5 = _mm256_dpwssd_epi32(a5, s1xf, coeff32[45]); a6 = _mm256_madd_epi16(sox0, coeff32[18]); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(sox5, coeff32[22])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxa, coeff32[26])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxf, coeff32[30])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x0, coeff32[34])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x5, coeff32[38])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1xa, coeff32[42])); - a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1xf, coeff32[46])); + a6 = _mm256_dpwssd_epi32(a6, sox5, coeff32[22]); + a6 = _mm256_dpwssd_epi32(a6, soxa, coeff32[26]); + a6 = _mm256_dpwssd_epi32(a6, soxf, coeff32[30]); + a6 = _mm256_dpwssd_epi32(a6, s1x0, coeff32[34]); + a6 = _mm256_dpwssd_epi32(a6, s1x5, coeff32[38]); + a6 = _mm256_dpwssd_epi32(a6, s1xa, coeff32[42]); + a6 = _mm256_dpwssd_epi32(a6, s1xf, coeff32[46]); a7 = _mm256_madd_epi16(sox0, coeff32[19]); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(sox5, coeff32[23])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxa, coeff32[27])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxf, coeff32[31])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x0, coeff32[35])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x5, coeff32[39])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xa, coeff32[43])); - a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xf, coeff32[47])); + a7 = _mm256_dpwssd_epi32(a7, sox5, coeff32[23]); + a7 = _mm256_dpwssd_epi32(a7, soxa, coeff32[27]); + a7 = _mm256_dpwssd_epi32(a7, soxf, coeff32[31]); + a7 = _mm256_dpwssd_epi32(a7, s1x0, coeff32[35]); + a7 = _mm256_dpwssd_epi32(a7, s1x5, coeff32[39]); + a7 = _mm256_dpwssd_epi32(a7, s1xa, coeff32[43]); + a7 = _mm256_dpwssd_epi32(a7, s1xf, coeff32[47]); b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0); b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0); diff --git a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h index d58cb8591..197a4a8bf 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h +++ b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h @@ -12,11 +12,12 @@ extern "C" { #endif +#ifdef NON_AVX512_SUPPORT #define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3 #define 
LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3 -#ifndef NON_AVX512_SUPPORT +#else #define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_AVX512 #define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512 #define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512 From b82761397523eb46f332f9b8705370dc7dd003ad Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Thu, 20 Feb 2020 10:32:50 -0700 Subject: [PATCH 05/12] AVX512_changes_P4 Signed-off-by: deeptiag1 --- Source/Lib/ASM_AVX2/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Source/Lib/ASM_AVX2/CMakeLists.txt b/Source/Lib/ASM_AVX2/CMakeLists.txt index 30286cbbb..d149972f4 100644 --- a/Source/Lib/ASM_AVX2/CMakeLists.txt +++ b/Source/Lib/ASM_AVX2/CMakeLists.txt @@ -17,7 +17,10 @@ link_directories(${PROJECT_SOURCE_DIR}/Source/Lib/ASM_SSSE3/) set(flags_to_test -mavx2 - -static-intel + -mavx512bw + -mavx512vnni + -mavx512vl + -static-intel /Qwd10148 /Qwd10010 /Qwd10157) From d14b73501637fd0f26289821cfc55fa638712bbe Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Thu, 20 Feb 2020 20:06:23 -0700 Subject: [PATCH 06/12] AVX512_changes_P4 Signed-off-by: deeptiag1 --- Source/Lib/ASM_AVX2/EbTransforms_AVX2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h index 6538c3d07..4a7a32665 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h +++ b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h @@ -13,7 +13,7 @@ extern "C" { #ifdef NON_AVX512_SUPPORT #define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN -#else NON_AVX512_SUPPORT +#else #define EbHevcTransform32_INTRIN EbHevcTransform32_AVX512_INTRIN #endif From 0bc40931c7ac0606ffe1e95ad2a436d080dfef12 Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Fri, 21 Feb 2020 09:19:17 -0700 Subject: [PATCH 07/12] AVX512_changes_P4 Signed-off-by: deeptiag1 --- .../ASM_AVX2/EbTransforms_Intrinsic_AVX2.c | 283 +++++++++--------- 1 file changed, 147 insertions(+), 136 deletions(-) diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c index 09432f1f3..661c29caa 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c @@ -13,7 +13,7 @@ #ifdef __GNUC__ __attribute__((aligned(16))) #endif -EB_ALIGN(32) const EB_S16 coeff_tbl_AVX2[48 * 16] = +EB_ALIGN(32) const EB_S16 EbHevcCoeff_tbl_AVX2[48 * 16] = { 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, @@ -282,7 +282,7 @@ void QuantizeInvQuantize8x8_AVX2_INTRIN( // transpose 16x16 block of data -void transpose16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) +void EbHevcTranspose16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) { EB_U32 i; for (i = 0; i < 2; i++) @@ -346,7 +346,7 @@ void transpose16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 } // transpose 32x32 
block of data -void transpose32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) +void EbHevcTranspose32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride) { EB_U32 i, j; for (i = 0; i < 4; i++) @@ -553,7 +553,7 @@ void transform16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; for (i = 0; i < 16; i += 2) { @@ -610,12 +610,12 @@ void transform16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 } // 32-point forward transform (32 rows) -void transform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) +void EbHevcTransform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) { - EB_U32 i; + EB_U32 i; __m128i s0; __m256i o0; - const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; shift &= 0x0000FFFF; // Redundant code to fix Visual Studio 2012 AVX2 compiler error s0 = _mm_cvtsi32_si128(shift); @@ -623,10 +623,9 @@ void transform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 for (i = 0; i < 16; i++) { - __m256i x0, x1, x2, x3; + __m256i x0, x1, x2, x3,sox0,sox5,soxa,soxf,s1x0,s1x5,s1xa,s1xf; __m256i y0, y1, y2, y3; __m256i a0, a1, a2, a3, a4, a5, a6, a7; - __m256i aa4, aa5, aa6, aa7; __m256i b0, b1, b2, b3, b4, b5, b6, b7; x0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x00))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x00)), 0x1); @@ -653,66 +652,79 @@ void transform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 x2 = y2; x3 = y3; - a0 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[0]); - - a0 = _mm256_dpwssd_epi32(a0, _mm256_shuffle_epi32(x0, 0x55), coeff32[2]); - a0 = _mm256_dpwssd_epi32(a0, _mm256_shuffle_epi32(x0, 0xaa), coeff32[4]); - a0 = _mm256_dpwssd_epi32(a0, _mm256_shuffle_epi32(x0, 0xff), coeff32[6]); - a1 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[1]); - - a1 = _mm256_dpwssd_epi32(a1, _mm256_shuffle_epi32(x0, 0x55), coeff32[3]); - a1 = _mm256_dpwssd_epi32(a1, _mm256_shuffle_epi32(x0, 0xaa), coeff32[5]); - a1 = _mm256_dpwssd_epi32(a1, _mm256_shuffle_epi32(x0, 0xff), coeff32[7]); - a2 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[8]); - - a2 = _mm256_dpwssd_epi32(a2, _mm256_shuffle_epi32(x1, 0x55), coeff32[10]); - a2 = _mm256_dpwssd_epi32(a2, _mm256_shuffle_epi32(x1, 0xaa), coeff32[12]); - a2 = _mm256_dpwssd_epi32(a2, _mm256_shuffle_epi32(x1, 0xff), coeff32[14]); - a3 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[9]); - - a3 = _mm256_dpwssd_epi32(a3, _mm256_shuffle_epi32(x1, 0x55), coeff32[11]); - a3 = _mm256_dpwssd_epi32(a3, _mm256_shuffle_epi32(x1, 0xaa), coeff32[13]); - a3 = _mm256_dpwssd_epi32(a3, _mm256_shuffle_epi32(x1, 0xff), coeff32[15]); - a4 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[16]); - - a4 = _mm256_dpwssd_epi32(a4, _mm256_shuffle_epi32(x2, 0x55), coeff32[20]); - a4 = _mm256_dpwssd_epi32(a4, _mm256_shuffle_epi32(x2, 0xaa), coeff32[24]); - a4 = _mm256_dpwssd_epi32(a4, _mm256_shuffle_epi32(x2, 0xff), coeff32[28]); - aa4 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[32]); - aa4 = _mm256_dpwssd_epi32(aa4, _mm256_shuffle_epi32(x3, 0x55), 
coeff32[36]); - aa4 = _mm256_dpwssd_epi32(aa4, _mm256_shuffle_epi32(x3, 0xaa), coeff32[40]); - aa4 = _mm256_dpwssd_epi32(aa4, _mm256_shuffle_epi32(x3, 0xff), coeff32[44]); - a4 = _mm256_add_epi32( a4, aa4); - a5 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[17]); - - a5 = _mm256_dpwssd_epi32(a5, _mm256_shuffle_epi32(x2, 0x55), coeff32[21]); - a5 = _mm256_dpwssd_epi32(a5, _mm256_shuffle_epi32(x2, 0xaa), coeff32[25]); - a5 = _mm256_dpwssd_epi32(a5, _mm256_shuffle_epi32(x2, 0xff), coeff32[29]); - aa5 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[33]); - aa5 = _mm256_dpwssd_epi32(aa5, _mm256_shuffle_epi32(x3, 0x55), coeff32[37]); - aa5 = _mm256_dpwssd_epi32(aa5, _mm256_shuffle_epi32(x3, 0xaa), coeff32[41]); - aa5 = _mm256_dpwssd_epi32(aa5, _mm256_shuffle_epi32(x3, 0xff), coeff32[45]); - a5 = _mm256_add_epi32( a5, aa5); - a6 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[18]); - a6 = _mm256_dpwssd_epi32(a6, _mm256_shuffle_epi32(x2, 0x55), coeff32[22]); - a6 = _mm256_dpwssd_epi32(a6, _mm256_shuffle_epi32(x2, 0xaa), coeff32[26]); - a6 = _mm256_dpwssd_epi32(a6, _mm256_shuffle_epi32(x2, 0xff), coeff32[30]); - aa6 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[34]); - aa6 = _mm256_dpwssd_epi32(aa6, _mm256_shuffle_epi32(x3, 0x55), coeff32[38]); - aa6 = _mm256_dpwssd_epi32(aa6, _mm256_shuffle_epi32(x3, 0xaa), coeff32[42]); - aa6 = _mm256_dpwssd_epi32(aa6, _mm256_shuffle_epi32(x3, 0xff), coeff32[46]); - a6 = _mm256_add_epi32( a6, aa6); - - a7 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[19]); - a7 = _mm256_dpwssd_epi32(a7, _mm256_shuffle_epi32(x2, 0x55), coeff32[23]); - a7 = _mm256_dpwssd_epi32(a7, _mm256_shuffle_epi32(x2, 0xaa), coeff32[27]); - a7 = _mm256_dpwssd_epi32(a7, _mm256_shuffle_epi32(x2, 0xff), coeff32[31]); - aa7 = _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[35]); - aa7 = _mm256_dpwssd_epi32(aa7, _mm256_shuffle_epi32(x3, 0x55), coeff32[39]); - aa7 = _mm256_dpwssd_epi32(aa7, _mm256_shuffle_epi32(x3, 0xaa), coeff32[43]); - aa7 = _mm256_dpwssd_epi32(aa7, _mm256_shuffle_epi32(x3, 0xff), coeff32[47]); - a7 = _mm256_add_epi32( a7, aa7); - + sox0 = _mm256_shuffle_epi32(x0, 0x00); + sox5 = _mm256_shuffle_epi32(x0, 0x55); + soxa = _mm256_shuffle_epi32(x0, 0xaa); + soxf = _mm256_shuffle_epi32(x0, 0xff); + s1x0 = _mm256_shuffle_epi32(x1, 0x00); + s1x5 = _mm256_shuffle_epi32(x1, 0x55); + s1xa = _mm256_shuffle_epi32(x1, 0xaa); + s1xf = _mm256_shuffle_epi32(x1, 0xff); + + a0 = _mm256_madd_epi16(sox0, coeff32[0]); + a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(sox5, coeff32[2])); + a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxa, coeff32[4])); + a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxf, coeff32[6])); + + a1 = _mm256_madd_epi16(sox0, coeff32[1]); + a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(sox5, coeff32[3])); + a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxa, coeff32[5])); + a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxf, coeff32[7])); + + a2 = _mm256_madd_epi16(s1x0, coeff32[8]); + a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1x5, coeff32[10])); + a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xa, coeff32[12])); + a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xf, coeff32[14])); + + a3 = _mm256_madd_epi16(s1x0, coeff32[9]); + a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1x5, coeff32[11])); + a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xa, coeff32[13])); + a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xf, coeff32[15])); + + sox0 = _mm256_shuffle_epi32(x2, 0x00); + sox5 = _mm256_shuffle_epi32(x2, 0x55); + soxa = 
_mm256_shuffle_epi32(x2, 0xaa); + soxf = _mm256_shuffle_epi32(x2, 0xff); + s1x0 = _mm256_shuffle_epi32(x3, 0x00); + s1x5 = _mm256_shuffle_epi32(x3, 0x55); + s1xa = _mm256_shuffle_epi32(x3, 0xaa); + s1xf = _mm256_shuffle_epi32(x3, 0xff); + + a4 = _mm256_madd_epi16(sox0, coeff32[16]); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(sox5, coeff32[20])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxa, coeff32[24])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxf, coeff32[28])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x0, coeff32[32])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x5, coeff32[36])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1xa, coeff32[40])); + a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1xf, coeff32[44])); + + a5 = _mm256_madd_epi16(sox0, coeff32[17]); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(sox5, coeff32[21])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxa, coeff32[25])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxf, coeff32[29])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x0, coeff32[33])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x5, coeff32[37])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xa, coeff32[41])); + a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xf, coeff32[45])); + + a6 = _mm256_madd_epi16(sox0, coeff32[18]); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(sox5, coeff32[22])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxa, coeff32[26])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxf, coeff32[30])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x0, coeff32[34])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x5, coeff32[38])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1xa, coeff32[42])); + a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1xf, coeff32[46])); + + a7 = _mm256_madd_epi16(sox0, coeff32[19]); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(sox5, coeff32[23])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxa, coeff32[27])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxf, coeff32[31])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x0, coeff32[35])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x5, coeff32[39])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xa, coeff32[43])); + a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xf, coeff32[47])); b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0); b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0); @@ -761,7 +773,7 @@ void Pfreq1DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; for (i = 0; i < 32; i += 2) { @@ -895,7 +907,7 @@ void Pfreq2DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; for (i = 0; i < 16; i += 2) { @@ -1044,7 +1056,7 @@ void PfreqN41DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; for (i = 0; i < 32; i += 2) { @@ -1181,7 +1193,7 @@ void PfreqN42DTransform32_AVX2_INTRIN( EB_U32 i; __m128i s0 = _mm_cvtsi32_si128(shift); __m256i o0 = _mm256_set1_epi32(1 << (shift - 1)); - const __m256i *coeff32 = (const __m256i 
*)coeff_tbl_AVX2; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; //for (i = 0; i < 16; i += 2) for (i = 0; i < 8; i += 2) @@ -1297,8 +1309,8 @@ void PfreqN42DTransform32_AVX2_INTRIN( //x2 = _mm256_unpacklo_epi16(y1, y3); //x3 = _mm256_unpackhi_epi16(y1, y3); - //---// y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1); - //---//y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1); + //---// y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1); + //---//y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1); //---// _mm256_storeu_si256((__m256i *)(dst + i*dst_stride), y0); //---// _mm256_storeu_si256((__m256i *)(dst + (i + 1)*dst_stride), y2); @@ -1464,20 +1476,20 @@ void PfreqN4Transform32x32_AVX2_INTRIN( EB_EXTERN void lowPrecisionTransform16x16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift) { transform16_AVX2_INTRIN(src, src_stride, intermediate, 16, (EB_S16)(4 + addshift)); - transpose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); + EbHevcTranspose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); transform16_AVX2_INTRIN(dst, dst_stride, intermediate, 16, 9); - transpose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); + EbHevcTranspose16_AVX2_INTRIN(intermediate, 16, dst, dst_stride); } // forward 32x32 transform EB_EXTERN void lowPrecisionTransform32x32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift) { - transform32_AVX2_INTRIN(src, src_stride, intermediate, 32, 6 + addshift); - transpose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); - transform32_AVX2_INTRIN(dst, dst_stride, intermediate, 32, 9); - transpose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); + EbHevcTransform32_INTRIN(src, src_stride, intermediate, 32, 6 + addshift); + EbHevcTranspose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); + EbHevcTransform32_INTRIN(dst, dst_stride, intermediate, 32, 9); + EbHevcTranspose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride); } void MatMult4x4_OutBuff_AVX2_INTRIN( @@ -1486,10 +1498,10 @@ void MatMult4x4_OutBuff_AVX2_INTRIN( EB_S16* coeffOut, const EB_U32 coeffOutStride, const EB_U16 *maskingMatrix, - const EB_U32 maskingMatrixStride, - const EB_U32 computeSize, - const EB_S32 offset, - const EB_S32 shiftNum, + const EB_U32 maskingMatrixStride, + const EB_U32 computeSize, + const EB_S32 offset, + const EB_S32 shiftNum, EB_U32* nonzerocoeff) { @@ -1555,7 +1567,7 @@ void MatMult4x4_OutBuff_AVX2_INTRIN( void MatMult4x4_AVX2_INTRIN( EB_S16* coeff, const EB_U32 coeffStride, - const EB_U16 *maskingMatrix, + const EB_U16 *maskingMatrix, const EB_U32 maskingMatrixStride, //Matrix size const EB_U32 computeSize, //Computation area size const EB_S32 offset, //(PMP_MAX >> 1) @@ -1569,52 +1581,52 @@ void MatMult4x4_AVX2_INTRIN( (void)computeSize; coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = offsetREG = _mm256_setzero_si256(); - + // prepare Shift REG __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (EB_S16)shiftNum); //_mm_set1_epi16((EB_U16)shiftNum);//_mm_set1_epi32(shiftNum); - + //prepare the offset offsetREG = _mm256_set1_epi32(offset); - + //load maskingMatrix_new MaskingMatrix = 
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)maskingMatrix), _mm_loadl_epi64((__m128i*)(maskingMatrix + maskingMatrixStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(maskingMatrix + 2 * maskingMatrixStride)), _mm_loadl_epi64((__m128i*)(maskingMatrix + 3 * maskingMatrixStride))), 0x1); - + //load coefftemp a = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)coeff), _mm_loadl_epi64((__m128i*)(coeff + coeffStride))); // 1st and 2nd row of the 4x4 block b = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(coeff + 2 * coeffStride)), _mm_loadl_epi64((__m128i*)(coeff + 3 * coeffStride))); // 3rd and 4th row of the 4x4 block coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256( a),b,0x1); // the 4x4 block is now loaded - + coeffTempORG = coeffTemp; //Absolute val coeffTemp = _mm256_abs_epi16(coeffTemp); - + a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix); a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix); - - + + b0 = _mm256_unpacklo_epi16(a0, a1); b1 = _mm256_unpackhi_epi16(a0, a1); - + b0 = _mm256_add_epi32(b0, offsetREG); b1 = _mm256_add_epi32(b1, offsetREG); - + //Shift right by PMP_PRECISION_REG b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG); b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG); - + //coefftemp in c ymm_computed = _mm256_packs_epi32(b0, b1);//Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst. z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); //coeffTemp = (coeff[coeffLocation] < 0)? -coeffTemp : coeffTemp; ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG);// coeffTemp); - + a = _mm256_extracti128_si256(ymm_computed, 0); b = _mm256_extracti128_si256(ymm_computed, 1); _mm_storel_epi64((__m128i *)coeff, a); _mm_storel_epi64((__m128i *)(coeff + coeffStride), _mm_srli_si128(a, 8)); _mm_storel_epi64((__m128i *)(coeff + 2 * coeffStride), b); _mm_storel_epi64((__m128i *)(coeff + 3 * coeffStride), _mm_srli_si128(b, 8)); - + z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 8)); *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); @@ -1639,15 +1651,15 @@ void MatMult8x8_AVX2_INTRIN( // prepare Shift REG __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (EB_S16)shiftNum);//_mm_set1_epi32(shiftNum); - + //prepare the offset - __m256i offsetREG = _mm256_set1_epi32(offset); + __m256i offsetREG = _mm256_set1_epi32(offset); row = 0; do { - + //load maskingMatrix_new MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride*row))), _mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride*(row + 1))), 0x1); - + //load coefftemp coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(coeff + coeffStride*row))), _mm_loadu_si128((__m128i*)(coeff + coeffStride*(row + 1))), 0x1); @@ -1655,38 +1667,38 @@ void MatMult8x8_AVX2_INTRIN( coeffTempORG = coeffTemp; //Absolute val coeffTemp = _mm256_abs_epi16(coeffTemp); - + //Multiply a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix); a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix); - - + + b0 = _mm256_unpacklo_epi16(a0, a1); b1 = _mm256_unpackhi_epi16(a0, a1); - + //Add b0 = _mm256_add_epi32(b0, offsetREG); b1 = _mm256_add_epi32(b1, offsetREG); - + //Shift right by PMP_PRECISION_REG b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG); b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG); - + //coefftemp in c ymm_computed = 
_mm256_packs_epi32(b0, b1);//Convert packed 32-bit integers from b0 and b1 to packed 16-bit integers using signed saturation, and store the results in dst. z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); //coeffTemp = (coeff[coeffLocation] < 0)? -coeffTemp : coeffTemp; - + ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG);// coeffTemp); - + _mm_storeu_si128((__m128i *)(coeff + coeffStride*row), _mm256_extracti128_si256(ymm_computed, 0)); _mm_storeu_si128((__m128i *)(coeff + coeffStride*(row + 1)), _mm256_extracti128_si256(ymm_computed, 1)); - + row += 2; } while (row < computeSize); - + z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7)); *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); - + } /***************************************MatMultNxN_AVX2_INTRIN****************************************************/ void MatMultNxN_AVX2_INTRIN( @@ -1699,68 +1711,67 @@ void MatMultNxN_AVX2_INTRIN( const EB_S32 shiftNum, //PMP_PRECISION EB_U32* nonzerocoeff) { - + unsigned row,col; __m256i z = _mm256_setzero_si256(); //__m128i a, b; __m256i coeffTemp,a0,a1,b0,b1,ymm_computed,MaskingMatrix,coeffTempORG; coeffTemp = a0 = a1 = b0 = b1 = ymm_computed =MaskingMatrix = _mm256_setzero_si256(); - + // prepare Shift REG __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (EB_S16)shiftNum);//_mm_set1_epi32(shiftNum); - + //prepare the offset - __m256i offsetREG = _mm256_set1_epi32(offset); - + __m256i offsetREG = _mm256_set1_epi32(offset); + row = 0; do { col = 0; do { - + //load coefftemp coeffTemp = _mm256_loadu_si256((__m256i *)(coeff + coeffStride*row + col)); - + //load maskingMatrix_new MaskingMatrix = _mm256_loadu_si256((__m256i *) (maskingMatrix + maskingMatrixStride*row + col)); - + coeffTempORG = coeffTemp; - + //Absolute val coeffTemp = _mm256_abs_epi16(coeffTemp); - + //Multiply a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix); a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix); - - + + b0 = _mm256_unpacklo_epi16(a0, a1); b1 = _mm256_unpackhi_epi16(a0, a1); - + //Add b0 = _mm256_add_epi32(b0, offsetREG); b1 = _mm256_add_epi32(b1, offsetREG); - + //Shift right by PMP_PRECISION_REG b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG); b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG); - + //coefftemp in c ymm_computed = _mm256_packs_epi32(b0, b1);//Convert packed 32-bit integers from b0 and b1 to packed 16-bit integers using signed saturation, and store the results in dst. z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); //coeffTemp = (coeff[coeffLocation] < 0)? 
-coeffTemp : coeffTemp; - + ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG);// coeffTemp); - + _mm256_storeu_si256((__m256i *)(coeff + coeffStride*row + col), ymm_computed); - + col += 16; } while (col < computeSize); row++; } while (row < computeSize); - + z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7)); - *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); - -} + *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); +} From 67ec8bf8a88f9dd38ca7c5f255f441e46a0307af Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Fri, 21 Feb 2020 12:22:08 -0700 Subject: [PATCH 08/12] AVX512_changes_Revert Signed-off-by: deeptiag1 --- Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c | 124 ------------------ Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c | 44 +++---- Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h | 3 - 3 files changed, 22 insertions(+), 149 deletions(-) diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c index 14fb4ce91..19e220c1a 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c @@ -7,16 +7,6 @@ #ifndef NON_AVX512_SUPPORT - - -const EB_S16 EbHevcLumaFilterCoeff1[4][8] = -{ - { 0, 0, 0, 64, 0, 0, 0, 0}, - {-1, 4,-10, 58, 17, -5, 1, 0}, - {-1, 4,-11, 40, 40,-11, 4, -1}, - { 0, 1, -5, 17, 58,-10, 4, -1} -}; - static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = { { 0, 0, 0, 64, 0, 0, 0, 0}, @@ -25,120 +15,6 @@ static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = { 1, -5, 17, 58,-10, 4, -1, 0} }; -void LumaInterpolationFilterOneDOutRawHorizontal_AVX512( - EB_BYTE refPic, - EB_U32 srcStride, - EB_S16 *dst, - EB_U32 puWidth, - EB_U32 puHeight, - EB_U32 fracPosx) -{ - EB_S32 rowCount, colCount; - __m128i c0, c1, c2, c3; // coeffs - __m128i a0, a1; - __m128i b0; - __m128i sum; - EB_BYTE ptr; - - refPic -= 3; - - PrefetchBlock(refPic, srcStride, (puWidth == 4) ? 16 : puWidth+8, (puWidth == 4) ? 
((puHeight+1)&~1) : puHeight); - - c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff1[fracPosx]); - c0 = _mm_packs_epi16(c0, c0); - __m128i ct = _mm_srli_epi64(c0, 32); - __m512i cc0 = _mm512_broadcastd_epi32(c0); - __m512i cc1 = _mm512_broadcastd_epi32(ct); - c0 = _mm_unpacklo_epi16(c0, c0); - c3 = _mm_shuffle_epi32(c0, 0xff); - c2 = _mm_shuffle_epi32(c0, 0xaa); - c1 = _mm_shuffle_epi32(c0, 0x55); - c0 = _mm_shuffle_epi32(c0, 0x00); - __m512i b1 = _mm512_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); - __m512i b2 = _mm512_set_epi8(14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4); - - - if (puWidth & 4) - { - ptr = refPic; - rowCount = puHeight; - do - { - a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; - a1 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; - b0 = _mm_unpacklo_epi64(a0, a1); - sum = _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c0); - sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c1)); - b0 = _mm_unpacklo_epi64(_mm_srli_si128(a0, 4), _mm_srli_si128(a1, 4)); - sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c2)); - sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); - - sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); - - _mm_storeu_si128((__m128i *)dst, sum); - dst += 8; - - rowCount -= 2; - } - while (rowCount > 0); - - puWidth -= 4; - if (puWidth == 0) - { - return; - } - - refPic += 4; - } - - colCount = puWidth; - int rowLoop = puHeight >>1 ;//divide by 2 - int evenRow = puHeight & 1; - do - { - ptr = refPic; - // rowCount = puHeight; - int rowCount = rowLoop ;//divide by 2 - do - { - __m512i a1 = _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr))); - __m256i b0 = _mm256_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr + srcStride))); ptr += 2 * srcStride; - __m512i s1 = _mm512_inserti64x4(a1, b0, 1); - __m512i sh2 = _mm512_shuffle_epi8(s1, b1); - __m512i sh3 = _mm512_shuffle_epi8(s1, b2); - __m512i sum00 = _mm512_setzero_si512(); - __m512i sum0 = _mm512_dpbusds_epi32(sum00, sh2, cc0); - __m512i sum1 = _mm512_dpbusds_epi32(sum0, sh3, cc1); - __m512i f1 = _mm512_packs_epi32(sum1,sum1);// - __m512i f2 = _mm512_permutexvar_epi64( _mm512_setr_epi64(0x0, 0x0000000000000002, 0x0000000000000004, 0x0000000000000006, 0x0, 0x0002000200020002, 0x0004000400040004, 0x0006000600060006), f1); - f2 = _mm512_sub_epi16(f2, _mm512_set1_epi16(128 * 64)); - _mm256_storeu_si256((__m256i*)dst, _mm512_castsi512_si256(f2)); - dst += 16; - rowCount = rowCount - 1; - } - while (rowCount > 0); - - if (evenRow) - { - a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; - - sum = _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8)), c0); - sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10)), c1)); - sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 
10, 11, 11, 12)), c2)); - sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); - - sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); - - _mm_storeu_si128((__m128i *)dst, sum); - dst += 8; - } - - refPic += 8; - colCount -= 8; - } - while (colCount > 0); -} - void EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) { EB_S32 rowCount, colCount; diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c index 5ffc364bd..bf650f06f 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c @@ -66,7 +66,7 @@ static void _mm_storeh_epi64(__m128i * p, __m128i x) _mm_storeh_pd((double *)p, _mm_castsi128_pd(x)); } -void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) +static void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) { #if PREFETCH EB_U32 rowCount = blkHeight; @@ -2313,7 +2313,7 @@ void LumaInterpolationFilterPose_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2329,7 +2329,7 @@ void LumaInterpolationFilterPosf_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2342,7 +2342,7 @@ void LumaInterpolationFilterPosg_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2355,7 +2355,7 @@ void LumaInterpolationFilterPosi_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2370,7 +2370,7 @@ void LumaInterpolationFilterPosj_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2383,7 +2383,7 @@ void LumaInterpolationFilterPosk_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, 
puHeight+7, 3); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2396,7 +2396,7 @@ void LumaInterpolationFilterPosp_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2411,7 +2411,7 @@ void LumaInterpolationFilterPosq_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2424,7 +2424,7 @@ void LumaInterpolationFilterPosr_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2452,7 +2452,7 @@ void LumaInterpolationFilterPosaOutRaw_SSSE3( EB_S16 *firstPassIFDst) { (void)firstPassIFDst; - LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 1); } void LumaInterpolationFilterPosbOutRaw_SSSE3( @@ -2465,7 +2465,7 @@ void LumaInterpolationFilterPosbOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); - LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 2); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); } void LumaInterpolationFilterPoscOutRaw_SSSE3( @@ -2478,7 +2478,7 @@ void LumaInterpolationFilterPoscOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); - LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); } void LumaInterpolationFilterPosdOutRaw_SSSE3( @@ -2821,7 +2821,7 @@ void LumaInterpolationFilterPoseOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2835,7 +2835,7 @@ void LumaInterpolationFilterPosfOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + 
LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2847,7 +2847,7 @@ void LumaInterpolationFilterPosgOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2859,7 +2859,7 @@ void LumaInterpolationFilterPosiOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2871,7 +2871,7 @@ void LumaInterpolationFilterPosjOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2883,7 +2883,7 @@ void LumaInterpolationFilterPoskOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2895,7 +2895,7 @@ void LumaInterpolationFilterPospOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } @@ -2909,7 +2909,7 @@ void LumaInterpolationFilterPosqOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } @@ -2921,6 +2921,6 @@ void LumaInterpolationFilterPosrOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } diff --git a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h index 197a4a8bf..6e18ee5c4 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h +++ b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h @@ -14,17 +14,14 
@@ extern "C" { #ifdef NON_AVX512_SUPPORT #define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3 -#define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3 #else -#define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_AVX512 #define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512 #define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512 #define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_AVX512 #endif -void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight); // SSSE3 functions void ChromaInterpolationCopy_SSSE3(EB_BYTE refPic, EB_U32 srcStride, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_S16 *firstPassIFDst, EB_U32 fracPosx, EB_U32 fracPosy); void ChromaInterpolationFilterOneDHorizontal_SSSE3(EB_BYTE refPic, EB_U32 srcStride, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_S16 *firstPassIFDst, EB_U32 fracPosx, EB_U32 fracPosy); From d92a7c47bda283f0eb82baad86e1e443b2d7439e Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Thu, 12 Mar 2020 08:24:38 -0700 Subject: [PATCH 09/12] changes_incorpoarted Signed-off-by: deeptiag1 --- Source/Lib/ASM_AVX2/CMakeLists.txt | 2 +- Source/Lib/ASM_AVX2/EbTransforms_AVX2.h | 6 +- ...AVX512.c => EbTransforms_Intrinsic_VNNI.c} | 5 +- Source/Lib/ASM_SSSE3/CMakeLists.txt | 2 +- ..._AVX512.c => EbMcp_Intrinsic_AVX512VNNI.c} | 137 +++++++++++++++++- Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c | 55 +++---- Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h | 16 +- Source/Lib/Codec/EbDefinitions.h | 3 +- 8 files changed, 177 insertions(+), 49 deletions(-) rename Source/Lib/ASM_AVX2/{EbTransforms_Intrinsic_AVX512.c => EbTransforms_Intrinsic_VNNI.c} (98%) rename Source/Lib/ASM_SSSE3/{EbMcp_Intrinsic_AVX512.c => EbMcp_Intrinsic_AVX512VNNI.c} (71%) diff --git a/Source/Lib/ASM_AVX2/CMakeLists.txt b/Source/Lib/ASM_AVX2/CMakeLists.txt index d149972f4..0897d7929 100644 --- a/Source/Lib/ASM_AVX2/CMakeLists.txt +++ b/Source/Lib/ASM_AVX2/CMakeLists.txt @@ -82,7 +82,7 @@ set(ASM_AVX2_SOURCE EbNoiseExtractAVX2.c EbPackUnPack_Intrinsic_AVX2.c EbPictureOperators_Intrinsic_AVX2.c - EbTransforms_Intrinsic_AVX512.c + EbTransforms_Intrinsic_VNNI.c EbTransforms_Intrinsic_AVX2.c) if(COMPILE_AS_CPP) diff --git a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h index 4a7a32665..e172f16af 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h +++ b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h @@ -11,10 +11,10 @@ extern "C" { #endif -#ifdef NON_AVX512_SUPPORT -#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN +#ifdef VNNI_SUPPORT +#define EbHevcTransform32_INTRIN EbHevcTransform32_VNNI_INTRIN #else -#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX512_INTRIN +#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN #endif diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c similarity index 98% rename from Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c rename to Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c index e7ec6815a..8afc854a8 100644 --- 
a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c @@ -4,7 +4,7 @@ #include #include -#ifndef NON_AVX512_SUPPORT +#ifdef VNNI_SUPPORT #ifdef __GNUC__ __attribute__((aligned(16))) @@ -37,13 +37,12 @@ static EB_ALIGN(32) const EB_S16 EbHevcCoeff_tbl_AVX2[48 * 16] = 54, 67, -31, -73, 4, 78, 22, -82, 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90, -46, 85, 67, -88, -82, 90, 90, -90 }; -extern void EbHevcTransform32_AVX512_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) +extern void EbHevcTransform32_VNNI_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) { EB_U32 i; __m128i s0; __m256i o0; const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; - shift &= 0x0000FFFF; // Redundant code to fix Visual Studio 2012 AVX2 compiler error s0 = _mm_cvtsi32_si128(shift); o0 = _mm256_set1_epi32(1 << (shift - 1)); diff --git a/Source/Lib/ASM_SSSE3/CMakeLists.txt b/Source/Lib/ASM_SSSE3/CMakeLists.txt index 9ce313bbc..7ce731694 100644 --- a/Source/Lib/ASM_SSSE3/CMakeLists.txt +++ b/Source/Lib/ASM_SSSE3/CMakeLists.txt @@ -55,7 +55,7 @@ set(ASM_SSSE3_SOURCE EbDeblockingFilter_Intrinsic_SSSE3.c EbIntraPrediction16bit_Intrinsic_SSSE3.c EbMcp_Intrinsic_SSSE3.c - EbMcp_Intrinsic_AVX512.c + EbMcp_Intrinsic_AVX512VNNI.c EbSaoApplication_Intrinsic_SSSE3.c EbTransforms_Intrinsic_SSSE3.c) diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c similarity index 71% rename from Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c rename to Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c index 19e220c1a..662e33f9f 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c @@ -1,11 +1,17 @@ #include "EbMcp_SSSE3.h" #include "EbDefinitions.h" -#include "emmintrin.h" - #include "immintrin.h" -#ifndef NON_AVX512_SUPPORT +#ifdef VNNI_SUPPORT + +const EB_S16 EbHevcLumaFilterCoeff1[4][8] = +{ + { 0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 4,-10, 58, 17, -5, 1, 0}, + {-1, 4,-11, 40, 40,-11, 4, -1}, + { 0, 1, -5, 17, 58,-10, 4, -1} +}; static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = { @@ -15,14 +21,128 @@ static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = { 1, -5, 17, 58,-10, 4, -1, 0} }; -void EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) +#ifndef NON_AVX512_SUPPORT +void LumaInterpolationFilterOneDOutRawHorizontal_AVX512( + EB_BYTE refPic, + EB_U32 srcStride, + EB_S16 *dst, + EB_U32 puWidth, + EB_U32 puHeight, + EB_U32 fracPosx) +{ + EB_S32 rowCount, colCount; + __m128i c0, c1, c2, c3; // coeffs + __m128i a0, a1; + __m128i b0; + __m128i sum; + EB_BYTE ptr; + + refPic -= 3; + + PrefetchBlock(refPic, srcStride, (puWidth == 4) ? 16 : puWidth+8, (puWidth == 4) ? 
((puHeight+1)&~1) : puHeight); + + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff1[fracPosx]); + c0 = _mm_packs_epi16(c0, c0); + __m128i ct = _mm_srli_epi64(c0, 32); + __m512i cc0 = _mm512_broadcastd_epi32(c0); + __m512i cc1 = _mm512_broadcastd_epi32(ct); + c0 = _mm_unpacklo_epi16(c0, c0); + c3 = _mm_shuffle_epi32(c0, 0xff); + c2 = _mm_shuffle_epi32(c0, 0xaa); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + __m512i b1 = _mm512_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); + __m512i b2 = _mm512_set_epi8(14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4); + + + if (puWidth & 4) + { + ptr = refPic; + rowCount = puHeight; + do + { + a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + a1 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + b0 = _mm_unpacklo_epi64(a0, a1); + sum = _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c0); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c1)); + b0 = _mm_unpacklo_epi64(_mm_srli_si128(a0, 4), _mm_srli_si128(a1, 4)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c2)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); + + sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); + + _mm_storeu_si128((__m128i *)dst, sum); + dst += 8; + + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + refPic += 4; + } + colCount = puWidth; + int rowLoop = puHeight >>1 ;//divide by 2 + int evenRow = puHeight & 1; + do + { + ptr = refPic; + // rowCount = puHeight; + int rowCount = rowLoop ;//divide by 2 + do + { + __m512i a1 = _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr))); + __m256i b0 = _mm256_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr + srcStride))); ptr += 2 * srcStride; + __m512i s1 = _mm512_inserti64x4(a1, b0, 1); + __m512i sh2 = _mm512_shuffle_epi8(s1, b1); + __m512i sh3 = _mm512_shuffle_epi8(s1, b2); + __m512i sum00 = _mm512_setzero_si512(); + __m512i sum0 = _mm512_dpbusds_epi32(sum00, sh2, cc0); + __m512i sum1 = _mm512_dpbusds_epi32(sum0, sh3, cc1); + __m512i f1 = _mm512_packs_epi32(sum1,sum1);// + __m512i f2 = _mm512_permutexvar_epi64( _mm512_setr_epi64(0x0, 0x0000000000000002, 0x0000000000000004, 0x0000000000000006, 0x0, 0x0002000200020002, 0x0004000400040004, 0x0006000600060006), f1); + f2 = _mm512_sub_epi16(f2, _mm512_set1_epi16(128 * 64)); + _mm256_storeu_si256((__m256i*)dst, _mm512_castsi512_si256(f2)); + dst += 16; + rowCount = rowCount - 1; + } + while (rowCount > 0); + + if (evenRow) + { + a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + + sum = _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8)), c0); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10)), c1)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 
10, 11, 11, 12)), c2)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); + + sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); + + _mm_storeu_si128((__m128i *)dst, sum); + dst += 8; + } + + refPic += 8; + colCount -= 8; + } + while (colCount > 0); +} +#endif + +void EbHevcLumaInterpolationFilterTwoDInRaw7_VNNI(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) { EB_S32 rowCount, colCount; __m128i c0, c1, c2; __m128i a0, a1, a2, a3, a4, a5, a6; __m128i sum0 , sum1; __m128i b0l, b0h, b1l, b1h, b2l, b2h; - EB_BYTE qtr; c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[fracPosy]); c2 = _mm_shuffle_epi32(c0, 0xaa); @@ -141,7 +261,7 @@ void EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512(EB_S16 *firstPassIFDst, EB_B while (colCount > 0); } -void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) +void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_VNNI(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) { EB_S32 rowCount, colCount; __m128i a0, a1, a2, a3, a4, a5, a6; @@ -245,9 +365,10 @@ void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512(EB_S16 *firstPassIFDst colCount -= 8; } while (colCount > 0); + } -void EbHevcLumaInterpolationFilterTwoDInRawM_AVX512(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight) +void EbHevcLumaInterpolationFilterTwoDInRawM_VNNI(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight) { EB_S32 rowCount, colCount; @@ -358,7 +479,7 @@ void EbHevcLumaInterpolationFilterTwoDInRawM_AVX512(EB_S16 *firstPassIFDst, EB_B while (colCount > 0); } -void EbHevcLumaInterpolationFilterTwoDInRawOutRawM_AVX512(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight) +void EbHevcLumaInterpolationFilterTwoDInRawOutRawM_VNNI(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight) { EB_S32 rowCount, colCount; diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c index bf650f06f..fbb807336 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c @@ -66,7 +66,7 @@ static void _mm_storeh_epi64(__m128i * p, __m128i x) _mm_storeh_pd((double *)p, _mm_castsi128_pd(x)); } -static void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) +void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) { #if PREFETCH EB_U32 rowCount = blkHeight; @@ -232,7 +232,7 @@ void LumaInterpolationCopy_SSSE3( } void EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) -{ +{ EB_S32 rowCount, colCount; __m128i c0, c1, c2; __m128i a0, a1, a2, a3, a4, a5, a6; @@ -347,7 +347,8 @@ void EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(EB_S16 *firstPassIFDst, EB_BY dst += 8; colCount -= 8; } - while (colCount > 0); + while (colCount > 0); + } void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) @@ -453,6 +454,7 @@ void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(EB_S16 *firstPassIFDst, colCount -= 8; } while (colCount > 0); + } void EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3(EB_S16 *firstPassIFDst, EB_BYTE 
dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight) @@ -1762,7 +1764,7 @@ void LumaInterpolationFilterOneDOutRawHorizontal_SSSE3( __m128i a0, a1; __m128i b0; __m128i sum; - EB_BYTE ptr; + EB_BYTE ptr; refPic -= 3; @@ -1811,7 +1813,6 @@ void LumaInterpolationFilterOneDOutRawHorizontal_SSSE3( refPic += 4; } - colCount = puWidth; do { @@ -1836,7 +1837,7 @@ void LumaInterpolationFilterOneDOutRawHorizontal_SSSE3( refPic += 8; colCount -= 8; } - while (colCount > 0); + while (colCount > 0); } void LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3( @@ -2313,7 +2314,7 @@ void LumaInterpolationFilterPose_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2329,7 +2330,7 @@ void LumaInterpolationFilterPosf_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2342,7 +2343,7 @@ void LumaInterpolationFilterPosg_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2355,7 +2356,7 @@ void LumaInterpolationFilterPosi_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2370,7 +2371,7 @@ void LumaInterpolationFilterPosj_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2383,7 +2384,7 @@ void LumaInterpolationFilterPosk_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2396,7 +2397,7 @@ void LumaInterpolationFilterPosp_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, 
puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2411,7 +2412,7 @@ void LumaInterpolationFilterPosq_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2424,7 +2425,7 @@ void LumaInterpolationFilterPosr_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2452,7 +2453,7 @@ void LumaInterpolationFilterPosaOutRaw_SSSE3( EB_S16 *firstPassIFDst) { (void)firstPassIFDst; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 1); } void LumaInterpolationFilterPosbOutRaw_SSSE3( @@ -2465,7 +2466,7 @@ void LumaInterpolationFilterPosbOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 2); } void LumaInterpolationFilterPoscOutRaw_SSSE3( @@ -2478,7 +2479,7 @@ void LumaInterpolationFilterPoscOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 3); } void LumaInterpolationFilterPosdOutRaw_SSSE3( @@ -2821,7 +2822,7 @@ void LumaInterpolationFilterPoseOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2835,7 +2836,7 @@ void LumaInterpolationFilterPosfOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2847,7 +2848,7 @@ void LumaInterpolationFilterPosgOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, 
puWidth, puHeight, 1); } @@ -2859,7 +2860,7 @@ void LumaInterpolationFilterPosiOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2871,7 +2872,7 @@ void LumaInterpolationFilterPosjOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2883,7 +2884,7 @@ void LumaInterpolationFilterPoskOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2895,7 +2896,7 @@ void LumaInterpolationFilterPospOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } @@ -2909,7 +2910,7 @@ void LumaInterpolationFilterPosqOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } @@ -2921,6 +2922,6 @@ void LumaInterpolationFilterPosrOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } diff --git a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h index 6e18ee5c4..b4e04a294 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h +++ b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h @@ -12,14 +12,20 @@ extern "C" { #endif -#ifdef NON_AVX512_SUPPORT +#ifdef VNNI_SUPPORT + #ifndef NON_AVX512_SUPPORT + #define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_AVX512 + #else + #define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 + #endif +#define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_VNNI +#define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_VNNI +#define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_VNNI 
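/* Illustrative summary, not from the patch: with VNNI_SUPPORT defined, the generic
 * two-D filter names above bind to the new _VNNI kernels, and the first-pass horizontal
 * filter binds to LumaInterpolationFilterOneDOutRawHorizontal_AVX512 unless
 * NON_AVX512_SUPPORT is also defined; without VNNI_SUPPORT everything falls back to the
 * _SSSE3 implementations selected in the #else branch below. */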
+#else #define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3 +#define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3 -#else -#define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512 -#define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512 -#define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_AVX512 #endif // SSSE3 functions diff --git a/Source/Lib/Codec/EbDefinitions.h b/Source/Lib/Codec/EbDefinitions.h index adf38d530..9af8b512a 100644 --- a/Source/Lib/Codec/EbDefinitions.h +++ b/Source/Lib/Codec/EbDefinitions.h @@ -18,7 +18,8 @@ extern "C" { #define LATENCY_PROFILE 0 //#define DEBUG_LIFE_CYCLE 0 // Internal Marcos -#define NON_AVX512_SUPPORT +//#define NON_AVX512_SUPPORT +//#define VNNI_SUPPORT #ifdef __cplusplus #define EB_EXTERN extern "C" From d983a3bdb43e2d2d8879c70ba23dc698d152b0d8 Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Thu, 12 Mar 2020 08:24:38 -0700 Subject: [PATCH 10/12] avx512-changes_v3 Signed-off-by: deeptiag1 --- README.md | 1 + Source/Lib/ASM_AVX2/CMakeLists.txt | 2 +- Source/Lib/ASM_AVX2/EbTransforms_AVX2.h | 6 +- .../ASM_AVX2/EbTransforms_Intrinsic_AVX2.c | 157 ++++++ ...AVX512.c => EbTransforms_Intrinsic_VNNI.c} | 5 +- Source/Lib/ASM_SSSE3/CMakeLists.txt | 2 +- Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c | 457 ---------------- Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c | 495 +++++++++++++++++- Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h | 12 +- Source/Lib/Codec/EbDefinitions.h | 1 + 10 files changed, 641 insertions(+), 497 deletions(-) rename Source/Lib/ASM_AVX2/{EbTransforms_Intrinsic_AVX512.c => EbTransforms_Intrinsic_VNNI.c} (98%) delete mode 100644 Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c diff --git a/README.md b/README.md index 05ed941ea..53a7c7ab0 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ In order to run the highest resolution supported by the encoder, at least 64GB o - Download the yasm exe from the following [link](http://www.tortall.net/projects/yasm/releases/yasm-1.3.0-win64.exe) - Rename yasm-1.3.0-win64.exe to yasm.exe - Copy yasm.exe into a location that is in the PATH environment variable + - Vnni requires gcc version greater then 9.2. 
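The compiler requirement above exists because these patches lean on the AVX512-VNNI dot-product instructions (_mm_dpwssd_epi32, _mm256_dpwssd_epi32, _mm512_dpbusds_epi32) in the motion-compensation and transform kernels, largely replacing the earlier madd/add pairs. As a scalar reference only (the helper names below are invented for illustration and are not part of the patch), each 32-bit lane behaves roughly as follows:

    #include <stdint.h>

    /* One 32-bit lane of VPDPWSSD (_mm_dpwssd_epi32 / _mm256_dpwssd_epi32):
       two adjacent signed 16-bit products accumulated into a 32-bit sum. */
    static int32_t dpwssd_lane(int32_t acc, const int16_t a[2], const int16_t b[2])
    {
        return acc + (int32_t)a[0] * b[0] + (int32_t)a[1] * b[1];
    }

    /* One 32-bit lane of VPDPBUSDS (_mm512_dpbusds_epi32): four unsigned-byte by
       signed-byte products accumulated with signed saturation. */
    static int32_t dpbusds_lane(int32_t acc, const uint8_t a[4], const int8_t b[4])
    {
        int64_t s = (int64_t)acc + a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
        if (s > INT32_MAX) s = INT32_MAX;   /* saturate as the instruction does */
        if (s < INT32_MIN) s = INT32_MIN;
        return (int32_t)s;
    }

Folding the multiply-add and accumulate into one dot-product step is the design choice behind the _VNNI and _AVX512 kernels in the hunks above and below; the new filters are intended to compute the same sums as the _SSSE3 versions.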
- __Build Instructions__ - Build the project by following the steps below in a windows command prompt: diff --git a/Source/Lib/ASM_AVX2/CMakeLists.txt b/Source/Lib/ASM_AVX2/CMakeLists.txt index d149972f4..0897d7929 100644 --- a/Source/Lib/ASM_AVX2/CMakeLists.txt +++ b/Source/Lib/ASM_AVX2/CMakeLists.txt @@ -82,7 +82,7 @@ set(ASM_AVX2_SOURCE EbNoiseExtractAVX2.c EbPackUnPack_Intrinsic_AVX2.c EbPictureOperators_Intrinsic_AVX2.c - EbTransforms_Intrinsic_AVX512.c + EbTransforms_Intrinsic_VNNI.c EbTransforms_Intrinsic_AVX2.c) if(COMPILE_AS_CPP) diff --git a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h index 4a7a32665..e172f16af 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h +++ b/Source/Lib/ASM_AVX2/EbTransforms_AVX2.h @@ -11,10 +11,10 @@ extern "C" { #endif -#ifdef NON_AVX512_SUPPORT -#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN +#ifdef VNNI_SUPPORT +#define EbHevcTransform32_INTRIN EbHevcTransform32_VNNI_INTRIN #else -#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX512_INTRIN +#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN #endif diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c index 661c29caa..b9497a71f 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c @@ -1775,3 +1775,160 @@ void MatMultNxN_AVX2_INTRIN( *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); } + +#ifdef VNNI_SUPPORT +void EbHevcTransform32_VNNI_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) +{ + EB_U32 i; + __m128i s0; + __m256i o0; + const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; + shift &= 0x0000FFFF; // Redundant code to fix Visual Studio 2012 AVX2 compiler error + s0 = _mm_cvtsi32_si128(shift); + o0 = _mm256_set1_epi32(1 << (shift - 1)); + + for (i = 0; i < 16; i++) + { + __m256i x0, x1, x2, x3,sox0,sox5,soxa,soxf,s1x0,s1x5,s1xa,s1xf; + __m256i y0, y1, y2, y3; + __m256i aa4, aa5, aa6, aa7; + __m256i a0, a1, a2, a3, a4, a5, a6, a7; + __m256i b0, b1, b2, b3, b4, b5, b6, b7; + + x0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x00))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x00)), 0x1); + x1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x08))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x08)), 0x1); + x2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x10))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x10)), 0x1); + x3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x18))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x18)), 0x1); + + // 32-point butterfly + x2 = _mm256_shuffle_epi8(x2, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + x3 = _mm256_shuffle_epi8(x3, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + + y0 = _mm256_add_epi16(x0, x3); + y1 = _mm256_add_epi16(x1, x2); + + y2 = _mm256_sub_epi16(x0, x3); + y3 = _mm256_sub_epi16(x1, x2); + + // 16-point butterfly + y1 = _mm256_shuffle_epi8(y1, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + + x0 
= _mm256_add_epi16(y0, y1); + x1 = _mm256_sub_epi16(y0, y1); + + x2 = y2; + x3 = y3; + + + sox0 = _mm256_shuffle_epi32(x0, 0x00); + sox5 = _mm256_shuffle_epi32(x0, 0x55); + soxa = _mm256_shuffle_epi32(x0, 0xaa); + soxf = _mm256_shuffle_epi32(x0, 0xff); + s1x0 = _mm256_shuffle_epi32(x1, 0x00); + s1x5 = _mm256_shuffle_epi32(x1, 0x55); + s1xa = _mm256_shuffle_epi32(x1, 0xaa); + s1xf = _mm256_shuffle_epi32(x1, 0xff); + + a0 = _mm256_madd_epi16(sox0, coeff32[0]); + + a0 = _mm256_dpwssd_epi32(a0, sox5, coeff32[2]); + a0 = _mm256_dpwssd_epi32(a0, soxa, coeff32[4]); + a0 = _mm256_dpwssd_epi32(a0, soxf, coeff32[6]); + + a1 = _mm256_madd_epi16(sox0, coeff32[1]); + a1 = _mm256_dpwssd_epi32(a1, sox5, coeff32[3]); + a1 = _mm256_dpwssd_epi32(a1, soxa, coeff32[5]); + a1 = _mm256_dpwssd_epi32(a1, soxf, coeff32[7]); + + a2 = _mm256_madd_epi16(s1x0, coeff32[8]); + a2 = _mm256_dpwssd_epi32(a2, s1x5, coeff32[10]); + a2 = _mm256_dpwssd_epi32(a2, s1xa, coeff32[12]); + a2 = _mm256_dpwssd_epi32(a2, s1xf, coeff32[14]); + + a3 = _mm256_madd_epi16(s1x0, coeff32[9]); + a3 = _mm256_dpwssd_epi32(a3, s1x5, coeff32[11]); + a3 = _mm256_dpwssd_epi32(a3, s1xa, coeff32[13]); + a3 = _mm256_dpwssd_epi32(a3, s1xf, coeff32[15]); + + sox0 = _mm256_shuffle_epi32(x2, 0x00); + sox5 = _mm256_shuffle_epi32(x2, 0x55); + soxa = _mm256_shuffle_epi32(x2, 0xaa); + soxf = _mm256_shuffle_epi32(x2, 0xff); + s1x0 = _mm256_shuffle_epi32(x3, 0x00); + s1x5 = _mm256_shuffle_epi32(x3, 0x55); + s1xa = _mm256_shuffle_epi32(x3, 0xaa); + s1xf = _mm256_shuffle_epi32(x3, 0xff); + + a4 = _mm256_madd_epi16(sox0, coeff32[16]); + a4 = _mm256_dpwssd_epi32(a4, sox5, coeff32[20]); + a4 = _mm256_dpwssd_epi32(a4, soxa, coeff32[24]); + a4 = _mm256_dpwssd_epi32(a4, soxf, coeff32[28]); + a4 = _mm256_dpwssd_epi32(a4, s1x0, coeff32[32]); + a4 = _mm256_dpwssd_epi32(a4, s1x5, coeff32[36]); + a4 = _mm256_dpwssd_epi32(a4, s1xa, coeff32[40]); + a4 = _mm256_dpwssd_epi32(a4, s1xf, coeff32[44]); + + a5 = _mm256_madd_epi16(sox0, coeff32[17]); + a5 = _mm256_dpwssd_epi32(a5, sox5, coeff32[21]); + a5 = _mm256_dpwssd_epi32(a5, soxa, coeff32[25]); + a5 = _mm256_dpwssd_epi32(a5, soxf, coeff32[29]); + a5 = _mm256_dpwssd_epi32(a5, s1x0, coeff32[33]); + a5 = _mm256_dpwssd_epi32(a5, s1x5, coeff32[37]); + a5 = _mm256_dpwssd_epi32(a5, s1xa, coeff32[41]); + a5 = _mm256_dpwssd_epi32(a5, s1xf, coeff32[45]); + + a6 = _mm256_madd_epi16(sox0, coeff32[18]); + a6 = _mm256_dpwssd_epi32(a6, sox5, coeff32[22]); + a6 = _mm256_dpwssd_epi32(a6, soxa, coeff32[26]); + a6 = _mm256_dpwssd_epi32(a6, soxf, coeff32[30]); + a6 = _mm256_dpwssd_epi32(a6, s1x0, coeff32[34]); + a6 = _mm256_dpwssd_epi32(a6, s1x5, coeff32[38]); + a6 = _mm256_dpwssd_epi32(a6, s1xa, coeff32[42]); + a6 = _mm256_dpwssd_epi32(a6, s1xf, coeff32[46]); + + a7 = _mm256_madd_epi16(sox0, coeff32[19]); + a7 = _mm256_dpwssd_epi32(a7, sox5, coeff32[23]); + a7 = _mm256_dpwssd_epi32(a7, soxa, coeff32[27]); + a7 = _mm256_dpwssd_epi32(a7, soxf, coeff32[31]); + a7 = _mm256_dpwssd_epi32(a7, s1x0, coeff32[35]); + a7 = _mm256_dpwssd_epi32(a7, s1x5, coeff32[39]); + a7 = _mm256_dpwssd_epi32(a7, s1xa, coeff32[43]); + a7 = _mm256_dpwssd_epi32(a7, s1xf, coeff32[47]); + + b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0); + b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0); + b2 = _mm256_sra_epi32(_mm256_add_epi32(a2, o0), s0); + b3 = _mm256_sra_epi32(_mm256_add_epi32(a3, o0), s0); + b4 = _mm256_sra_epi32(_mm256_add_epi32(a4, o0), s0); + b5 = _mm256_sra_epi32(_mm256_add_epi32(a5, o0), s0); + b6 = _mm256_sra_epi32(_mm256_add_epi32(a6, o0), s0); + b7 = 
_mm256_sra_epi32(_mm256_add_epi32(a7, o0), s0); + + x0 = _mm256_packs_epi32(b0, b1); + x1 = _mm256_packs_epi32(b2, b3); + x2 = _mm256_packs_epi32(b4, b5); + x3 = _mm256_packs_epi32(b6, b7); + + y0 = _mm256_unpacklo_epi16(x0, x1); + y1 = _mm256_unpackhi_epi16(x0, x1); + y2 = x2; + y3 = x3; + x0 = _mm256_unpacklo_epi16(y0, y2); + x1 = _mm256_unpackhi_epi16(y0, y2); + x2 = _mm256_unpacklo_epi16(y1, y3); + x3 = _mm256_unpackhi_epi16(y1, y3); + + y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1); + y1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 0)), _mm256_extracti128_si256(x3, 0), 0x1); + y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1); + y3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 1)), _mm256_extracti128_si256(x3, 1), 0x1); + _mm256_storeu_si256((__m256i *)(dst + 0x00), y0); + _mm256_storeu_si256((__m256i *)(dst + 0x10), y1); + _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x00), y2); + _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x10), y3); + + src += 2 * src_stride; + dst += 2 * dst_stride; + } +} +#endif diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c similarity index 98% rename from Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c rename to Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c index e7ec6815a..8afc854a8 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX512.c +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c @@ -4,7 +4,7 @@ #include #include -#ifndef NON_AVX512_SUPPORT +#ifdef VNNI_SUPPORT #ifdef __GNUC__ __attribute__((aligned(16))) @@ -37,13 +37,12 @@ static EB_ALIGN(32) const EB_S16 EbHevcCoeff_tbl_AVX2[48 * 16] = 54, 67, -31, -73, 4, 78, 22, -82, 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90, -46, 85, 67, -88, -82, 90, 90, -90 }; -extern void EbHevcTransform32_AVX512_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) +extern void EbHevcTransform32_VNNI_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) { EB_U32 i; __m128i s0; __m256i o0; const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; - shift &= 0x0000FFFF; // Redundant code to fix Visual Studio 2012 AVX2 compiler error s0 = _mm_cvtsi32_si128(shift); o0 = _mm256_set1_epi32(1 << (shift - 1)); diff --git a/Source/Lib/ASM_SSSE3/CMakeLists.txt b/Source/Lib/ASM_SSSE3/CMakeLists.txt index 9ce313bbc..7ce731694 100644 --- a/Source/Lib/ASM_SSSE3/CMakeLists.txt +++ b/Source/Lib/ASM_SSSE3/CMakeLists.txt @@ -55,7 +55,7 @@ set(ASM_SSSE3_SOURCE EbDeblockingFilter_Intrinsic_SSSE3.c EbIntraPrediction16bit_Intrinsic_SSSE3.c EbMcp_Intrinsic_SSSE3.c - EbMcp_Intrinsic_AVX512.c + EbMcp_Intrinsic_AVX512VNNI.c EbSaoApplication_Intrinsic_SSSE3.c EbTransforms_Intrinsic_SSSE3.c) diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c deleted file mode 100644 index 19e220c1a..000000000 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512.c +++ /dev/null @@ -1,457 +0,0 @@ -#include "EbMcp_SSSE3.h" -#include "EbDefinitions.h" - -#include "emmintrin.h" - -#include "immintrin.h" - -#ifndef NON_AVX512_SUPPORT - -static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = - { - { 0, 0, 0, 64, 0, 0, 0, 0}, - {-1, 4,-10, 58, 17, -5, 1, 0}, - {-1, 4,-11, 40, 40,-11, 4, -1}, - { 1, -5, 17, 58,-10, 4, 
-1, 0} - }; - -void EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) -{ - EB_S32 rowCount, colCount; - __m128i c0, c1, c2; - __m128i a0, a1, a2, a3, a4, a5, a6; - __m128i sum0 , sum1; - __m128i b0l, b0h, b1l, b1h, b2l, b2h; - - EB_BYTE qtr; - c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[fracPosy]); - c2 = _mm_shuffle_epi32(c0, 0xaa); - c1 = _mm_shuffle_epi32(c0, 0x55); - c0 = _mm_shuffle_epi32(c0, 0x00); - - - if (puWidth & 4) - { - rowCount = puHeight; - - qtr = dst; - - do - { - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); - a0 = _mm_sub_epi16(a0, a6); - - sum0 = _mm_set1_epi32(257<<11); - sum1 = _mm_set1_epi32(257<<11); - - - b0l = _mm_unpacklo_epi16(a0, a1); - b0h = _mm_unpackhi_epi16(a0, a1); - b1l = _mm_unpacklo_epi16(a2, a3); - b1h = _mm_unpackhi_epi16(a2, a3); - b2l = _mm_unpacklo_epi16(a4, a5); - b2h = _mm_unpackhi_epi16(a4, a5); - - sum0 = _mm_dpwssd_epi32(sum0, b0l, c0); - sum1 = _mm_dpwssd_epi32(sum1, b0h, c0); - sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); - sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); - sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); - sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); - - sum0 = _mm_srai_epi32(sum0, 12); - sum1 = _mm_srai_epi32(sum1, 12); - sum0 = _mm_packs_epi32(sum0, sum1); - sum0 = _mm_packus_epi16(sum0, sum0); - - *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 0); qtr += dstStride; - *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 1); qtr += dstStride; - - firstPassIFDst += 8; - rowCount -= 2; - } - while (rowCount > 0); - - puWidth -= 4; - if (puWidth == 0) - { - return; - } - - firstPassIFDst += (fracPosy == 2) ? 32 : 24; - dst += 4; - } - - colCount = puWidth; - do - { - EB_BYTE qtr = dst; - - rowCount = puHeight; - do - { - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); - a0 = _mm_sub_epi16(a0, a6); - - sum0 = _mm_set1_epi32(257<<11); - sum1 = _mm_set1_epi32(257<<11); - - b0l = _mm_unpacklo_epi16(a0, a1); - b0h = _mm_unpackhi_epi16(a0, a1); - b1l = _mm_unpacklo_epi16(a2, a3); - b1h = _mm_unpackhi_epi16(a2, a3); - b2l = _mm_unpacklo_epi16(a4, a5); - b2h = _mm_unpackhi_epi16(a4, a5); - - sum0 = _mm_dpwssd_epi32(sum0, b0l, c0); - sum1 = _mm_dpwssd_epi32(sum1, b0h, c0); - sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); - sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); - sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); - sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); - - sum0 = _mm_srai_epi32(sum0, 12); - sum1 = _mm_srai_epi32(sum1, 12); - sum0 = _mm_packs_epi32(sum0, sum1); - sum0 = _mm_packus_epi16(sum0, sum0); - - _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dstStride; - - firstPassIFDst += 8; - rowCount--; - } - while (rowCount > 0); - - firstPassIFDst += (fracPosy == 2) ? 
56 : 48; - dst += 8; - colCount -= 8; - } - while (colCount > 0); -} - -void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) -{ - EB_S32 rowCount, colCount; - __m128i a0, a1, a2, a3, a4, a5, a6; - __m128i c0, c1, c2; - c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[fracPosy]); - c2 = _mm_shuffle_epi32(c0, 0xaa); - c1 = _mm_shuffle_epi32(c0, 0x55); - c0 = _mm_shuffle_epi32(c0, 0x00); - - if (puWidth & 4) - { - rowCount = puHeight; - - do - { - __m128i sum0, sum1; - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); - a0 = _mm_sub_epi16(a0, a6); - - sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); - sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); - - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a4, a5), c2); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a4, a5), c2); - - sum0 = _mm_srai_epi32(sum0, 6); - sum1 = _mm_srai_epi32(sum1, 6); - sum0 = _mm_packs_epi32(sum0, sum1); - - _mm_storeu_si128((__m128i *)dst, sum0); - dst += 8; - - firstPassIFDst += 8; - rowCount -= 2; - } - while (rowCount > 0); - - puWidth -= 4; - if (puWidth == 0) - { - return; - } - - firstPassIFDst += (fracPosy == 2) ? 32 : 24; - } - - colCount = puWidth; - do - { - rowCount = puHeight; - do - { - __m128i b0l, b0h, b1l, b1h, b2l, b2h; - __m128i sum0, sum1; - - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); - a0 = _mm_sub_epi16(a0, a6); - - b0l = _mm_unpacklo_epi16(a0, a1); - b0h = _mm_unpackhi_epi16(a0, a1); - b1l = _mm_unpacklo_epi16(a2, a3); - b1h = _mm_unpackhi_epi16(a2, a3); - b2l = _mm_unpacklo_epi16(a4, a5); - b2h = _mm_unpackhi_epi16(a4, a5); - - sum0 = _mm_madd_epi16(b0l, c0); - sum1 = _mm_madd_epi16(b0h, c0); - - sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); - sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); - sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); - sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); - - sum0 = _mm_srai_epi32(sum0, 6); - sum1 = _mm_srai_epi32(sum1, 6); - sum0 = _mm_packs_epi32(sum0, sum1); - - _mm_storeu_si128((__m128i *)dst, sum0); - dst += 8; - - firstPassIFDst += 8; - rowCount--; - } - while (rowCount > 0); - - firstPassIFDst += (fracPosy == 2) ? 
56 : 48; - colCount -= 8; - } - while (colCount > 0); -} - -void EbHevcLumaInterpolationFilterTwoDInRawM_AVX512(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight) -{ - EB_S32 rowCount, colCount; - - __m128i c0, c1; - __m128i a0, a1, a2, a3, a4, a5, a6, a7; - __m128i sum0, sum1; - - EB_BYTE qtr; - - c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[2]); - c1 = _mm_shuffle_epi32(c0, 0x55); - c0 = _mm_shuffle_epi32(c0, 0x00); - - - - if (puWidth & 4) - { - rowCount = puHeight; - qtr = dst; - - do - { - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); - a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*4)); - - sum0 = _mm_set1_epi32(257<<11); - sum1 = _mm_set1_epi32(257<<11); - - a0 = _mm_add_epi16(a0, a7); - a1 = _mm_add_epi16(a1, a6); - a2 = _mm_add_epi16(a2, a5); - a3 = _mm_add_epi16(a3, a4); - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a0, a1), c0); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a0, a1), c0); - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); - - sum0 = _mm_srai_epi32(sum0, 12); - sum1 = _mm_srai_epi32(sum1, 12); - sum0 = _mm_packs_epi32(sum0, sum1); - sum0 = _mm_packus_epi16(sum0, sum0); - - *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 0); qtr += dstStride; - *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 1); qtr += dstStride; - firstPassIFDst += 8; - rowCount -= 2; - } - while (rowCount > 0); - - puWidth -= 4; - if (puWidth == 0) - { - return; - } - - firstPassIFDst += 32; - dst += 4; - } - - colCount = puWidth; - do - { - qtr = dst; - - rowCount = puHeight; - do - { - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); - a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*8)); - - sum0 = _mm_set1_epi32(257<<11); - sum1 = _mm_set1_epi32(257<<11); - a0 = _mm_add_epi16(a0, a7); - a1 = _mm_add_epi16(a1, a6); - a2 = _mm_add_epi16(a2, a5); - a3 = _mm_add_epi16(a3, a4); - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a0, a1), c0); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a0, a1), c0); - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); - - sum0 = _mm_srai_epi32(sum0, 12); - sum1 = _mm_srai_epi32(sum1, 12); - sum0 = _mm_packs_epi32(sum0, sum1); - sum0 = _mm_packus_epi16(sum0, sum0); - - _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dstStride; - firstPassIFDst += 8; - } - while (--rowCount > 0); - - firstPassIFDst += 56; - dst += 8; - colCount -= 8; - } - while (colCount > 0); -} - -void EbHevcLumaInterpolationFilterTwoDInRawOutRawM_AVX512(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight) -{ - EB_S32 rowCount, colCount; - - __m128i a0, a1, a2, a3, a4, a5, a6, a7; - __m128i c0, c1; - c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[2]); - c1 = _mm_shuffle_epi32(c0, 
0x55); - c0 = _mm_shuffle_epi32(c0, 0x00); - - if (puWidth & 4) - { - rowCount = puHeight; - - do - { - __m128i sum0, sum1; - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); - a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*4)); - - a0 = _mm_add_epi16(a0, a7); - a1 = _mm_add_epi16(a1, a6); - a2 = _mm_add_epi16(a2, a5); - a3 = _mm_add_epi16(a3, a4); - sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); - sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); - - sum0 = _mm_srai_epi32(sum0, 6); - sum1 = _mm_srai_epi32(sum1, 6); - sum0 = _mm_packs_epi32(sum0, sum1); - - _mm_storeu_si128((__m128i *)dst, sum0); - dst += 8; - firstPassIFDst += 8; - rowCount -= 2; - } - while (rowCount > 0); - - puWidth -= 4; - if (puWidth == 0) - { - return; - } - - firstPassIFDst += 32; - } - - colCount = puWidth; - do - { - rowCount = puHeight; - do - { - __m128i sum0, sum1; - a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); - a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); - a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); - a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); - a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); - a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); - a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); - a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*8)); - - a0 = _mm_add_epi16(a0, a7); - a1 = _mm_add_epi16(a1, a6); - a2 = _mm_add_epi16(a2, a5); - a3 = _mm_add_epi16(a3, a4); - sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); - sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); - sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); - sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); - - sum0 = _mm_srai_epi32(sum0, 6); - sum1 = _mm_srai_epi32(sum1, 6); - sum0 = _mm_packs_epi32(sum0, sum1); - - _mm_storeu_si128((__m128i *)dst, sum0); - dst += 8; - firstPassIFDst += 8; - } - while (--rowCount > 0); - - firstPassIFDst += 56; - colCount -= 8; - } - while (colCount > 0); -} - -#endif diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c index bf650f06f..0a1051797 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c @@ -66,7 +66,7 @@ static void _mm_storeh_epi64(__m128i * p, __m128i x) _mm_storeh_pd((double *)p, _mm_castsi128_pd(x)); } -static void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) +void PrefetchBlock(EB_U8 *src, EB_U32 srcStride, EB_U32 blkWidth, EB_U32 blkHeight) { #if PREFETCH EB_U32 rowCount = blkHeight; @@ -232,7 +232,7 @@ void LumaInterpolationCopy_SSSE3( } void EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) -{ +{ EB_S32 rowCount, colCount; __m128i c0, c1, c2; __m128i a0, a1, a2, a3, a4, a5, a6; @@ -347,7 +347,8 @@ void EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3(EB_S16 *firstPassIFDst, EB_BY dst += 8; colCount -= 8; } - while (colCount > 0); + while (colCount > 0); + } void 
EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) @@ -453,6 +454,7 @@ void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(EB_S16 *firstPassIFDst, colCount -= 8; } while (colCount > 0); + } void EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight) @@ -1762,7 +1764,7 @@ void LumaInterpolationFilterOneDOutRawHorizontal_SSSE3( __m128i a0, a1; __m128i b0; __m128i sum; - EB_BYTE ptr; + EB_BYTE ptr; refPic -= 3; @@ -1811,7 +1813,6 @@ void LumaInterpolationFilterOneDOutRawHorizontal_SSSE3( refPic += 4; } - colCount = puWidth; do { @@ -1836,7 +1837,7 @@ void LumaInterpolationFilterOneDOutRawHorizontal_SSSE3( refPic += 8; colCount -= 8; } - while (colCount > 0); + while (colCount > 0); } void LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3( @@ -2313,7 +2314,7 @@ void LumaInterpolationFilterPose_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2329,7 +2330,7 @@ void LumaInterpolationFilterPosf_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2342,7 +2343,7 @@ void LumaInterpolationFilterPosg_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 1); } @@ -2355,7 +2356,7 @@ void LumaInterpolationFilterPosi_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2370,7 +2371,7 @@ void LumaInterpolationFilterPosj_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, dstStride, puWidth, puHeight); } @@ -2383,7 +2384,7 @@ void LumaInterpolationFilterPosk_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); EbHevcLumaInterpolationFilterTwoDInRawM(firstPassIFDst, dst, 
dstStride, puWidth, puHeight); } @@ -2396,7 +2397,7 @@ void LumaInterpolationFilterPosp_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2411,7 +2412,7 @@ void LumaInterpolationFilterPosq_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2424,7 +2425,7 @@ void LumaInterpolationFilterPosr_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRaw7(firstPassIFDst, dst, dstStride, puWidth, puHeight, 3); } @@ -2452,7 +2453,7 @@ void LumaInterpolationFilterPosaOutRaw_SSSE3( EB_S16 *firstPassIFDst) { (void)firstPassIFDst; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 1); } void LumaInterpolationFilterPosbOutRaw_SSSE3( @@ -2465,7 +2466,7 @@ void LumaInterpolationFilterPosbOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 2); } void LumaInterpolationFilterPoscOutRaw_SSSE3( @@ -2478,7 +2479,7 @@ void LumaInterpolationFilterPoscOutRaw_SSSE3( { (void)firstPassIFDst; //LumaInterpolationFilterOneDOutRawHorizontalOut_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic, srcStride, dst, puWidth, puHeight, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic, srcStride, dst, puWidth, puHeight, 3); } void LumaInterpolationFilterPosdOutRaw_SSSE3( @@ -2799,6 +2800,446 @@ void BiPredClippingOnTheFly_SSSE3( } while (colCount != 0); } +#ifdef VNNI_SUPPORT +void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_VNNI(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) +{ + EB_S32 rowCount, colCount; + __m128i a0, a1, a2, a3, a4, a5, a6; + __m128i c0, c1, c2; + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[fracPosy]); + c2 = _mm_shuffle_epi32(c0, 0xaa); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + if (puWidth & 4) + { + rowCount = puHeight; + + do + { + __m128i sum0, sum1; + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = 
_mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a0 = _mm_sub_epi16(a0, a6); + + sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); + + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a4, a5), c2); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a4, a5), c2); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); + sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += (fracPosy == 2) ? 32 : 24; + } + + colCount = puWidth; + do + { + rowCount = puHeight; + do + { + __m128i b0l, b0h, b1l, b1h, b2l, b2h; + __m128i sum0, sum1; + + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a0 = _mm_sub_epi16(a0, a6); + + b0l = _mm_unpacklo_epi16(a0, a1); + b0h = _mm_unpackhi_epi16(a0, a1); + b1l = _mm_unpacklo_epi16(a2, a3); + b1h = _mm_unpackhi_epi16(a2, a3); + b2l = _mm_unpacklo_epi16(a4, a5); + b2h = _mm_unpackhi_epi16(a4, a5); + + sum0 = _mm_madd_epi16(b0l, c0); + sum1 = _mm_madd_epi16(b0h, c0); + + sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); + sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); + sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); + sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); + sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + + firstPassIFDst += 8; + rowCount--; + } + while (rowCount > 0); + + firstPassIFDst += (fracPosy == 2) ? 
56 : 48; + colCount -= 8; + } + while (colCount > 0); + +} + +void EbHevcLumaInterpolationFilterTwoDInRawM_VNNI(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight) +{ + EB_S32 rowCount, colCount; + + __m128i c0, c1; + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i sum0, sum1; + + EB_BYTE qtr; + + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[2]); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + + + if (puWidth & 4) + { + rowCount = puHeight; + qtr = dst; + + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*4)); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 0); qtr += dstStride; + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 1); qtr += dstStride; + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += 32; + dst += 4; + } + + colCount = puWidth; + do + { + qtr = dst; + + rowCount = puHeight; + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*8)); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dstStride; + firstPassIFDst += 8; + } + while (--rowCount > 0); + + firstPassIFDst += 56; + dst += 8; + colCount -= 8; + } + while (colCount > 0); +} + +void EbHevcLumaInterpolationFilterTwoDInRawOutRawM_VNNI(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight) +{ + EB_S32 rowCount, colCount; + + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i c0, c1; + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[2]); + c1 = _mm_shuffle_epi32(c0, 
0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + if (puWidth & 4) + { + rowCount = puHeight; + + do + { + __m128i sum0, sum1; + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*4)); + + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); + sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += 32; + } + + colCount = puWidth; + do + { + rowCount = puHeight; + do + { + __m128i sum0, sum1; + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a7 = _mm_loadu_si128((__m128i *)(firstPassIFDst+7*8)); + + a0 = _mm_add_epi16(a0, a7); + a1 = _mm_add_epi16(a1, a6); + a2 = _mm_add_epi16(a2, a5); + a3 = _mm_add_epi16(a3, a4); + sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0); + sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0); + sum0 = _mm_dpwssd_epi32(sum0, _mm_unpacklo_epi16(a2, a3), c1); + sum1 = _mm_dpwssd_epi32(sum1, _mm_unpackhi_epi16(a2, a3), c1); + + sum0 = _mm_srai_epi32(sum0, 6); + sum1 = _mm_srai_epi32(sum1, 6); + sum0 = _mm_packs_epi32(sum0, sum1); + + _mm_storeu_si128((__m128i *)dst, sum0); + dst += 8; + firstPassIFDst += 8; + } + while (--rowCount > 0); + + firstPassIFDst += 56; + colCount -= 8; + } + while (colCount > 0); +} + +void EbHevcLumaInterpolationFilterTwoDInRaw7_VNNI(EB_S16 *firstPassIFDst, EB_BYTE dst, EB_U32 dstStride, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) +{ + EB_S32 rowCount, colCount; + __m128i c0, c1, c2; + __m128i a0, a1, a2, a3, a4, a5, a6; + __m128i sum0 , sum1; + __m128i b0l, b0h, b1l, b1h, b2l, b2h; + EB_BYTE qtr; + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff7[fracPosy]); + c2 = _mm_shuffle_epi32(c0, 0xaa); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + + + if (puWidth & 4) + { + rowCount = puHeight; + + qtr = dst; + + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*4)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*4)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*4)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*4)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*4)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*4)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*4)); + a0 = _mm_sub_epi16(a0, a6); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + + + b0l 
= _mm_unpacklo_epi16(a0, a1); + b0h = _mm_unpackhi_epi16(a0, a1); + b1l = _mm_unpacklo_epi16(a2, a3); + b1h = _mm_unpackhi_epi16(a2, a3); + b2l = _mm_unpacklo_epi16(a4, a5); + b2h = _mm_unpackhi_epi16(a4, a5); + + sum0 = _mm_dpwssd_epi32(sum0, b0l, c0); + sum1 = _mm_dpwssd_epi32(sum1, b0h, c0); + sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); + sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); + sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); + sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 0); qtr += dstStride; + *(EB_U32 *)qtr = _mm_extract_epi32(sum0, 1); qtr += dstStride; + + firstPassIFDst += 8; + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + firstPassIFDst += (fracPosy == 2) ? 32 : 24; + dst += 4; + } + + colCount = puWidth; + do + { + EB_BYTE qtr = dst; + + rowCount = puHeight; + do + { + a0 = _mm_loadu_si128((__m128i *)(firstPassIFDst+0*8)); + a1 = _mm_loadu_si128((__m128i *)(firstPassIFDst+1*8)); + a2 = _mm_loadu_si128((__m128i *)(firstPassIFDst+2*8)); + a3 = _mm_loadu_si128((__m128i *)(firstPassIFDst+3*8)); + a4 = _mm_loadu_si128((__m128i *)(firstPassIFDst+4*8)); + a5 = _mm_loadu_si128((__m128i *)(firstPassIFDst+5*8)); + a6 = _mm_loadu_si128((__m128i *)(firstPassIFDst+6*8)); + a0 = _mm_sub_epi16(a0, a6); + + sum0 = _mm_set1_epi32(257<<11); + sum1 = _mm_set1_epi32(257<<11); + + b0l = _mm_unpacklo_epi16(a0, a1); + b0h = _mm_unpackhi_epi16(a0, a1); + b1l = _mm_unpacklo_epi16(a2, a3); + b1h = _mm_unpackhi_epi16(a2, a3); + b2l = _mm_unpacklo_epi16(a4, a5); + b2h = _mm_unpackhi_epi16(a4, a5); + + sum0 = _mm_dpwssd_epi32(sum0, b0l, c0); + sum1 = _mm_dpwssd_epi32(sum1, b0h, c0); + sum0 = _mm_dpwssd_epi32(sum0, b1l, c1); + sum1 = _mm_dpwssd_epi32(sum1, b1h, c1); + sum0 = _mm_dpwssd_epi32(sum0, b2l, c2); + sum1 = _mm_dpwssd_epi32(sum1, b2h, c2); + + sum0 = _mm_srai_epi32(sum0, 12); + sum1 = _mm_srai_epi32(sum1, 12); + sum0 = _mm_packs_epi32(sum0, sum1); + sum0 = _mm_packus_epi16(sum0, sum0); + + _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dstStride; + + firstPassIFDst += 8; + rowCount--; + } + while (rowCount > 0); + + firstPassIFDst += (fracPosy == 2) ? 
56 : 48; + dst += 8; + colCount -= 8; + } + while (colCount > 0); +} +#endif void LumaInterpolationFilterPosnOutRaw_SSSE3( EB_BYTE refPic, @@ -2821,7 +3262,7 @@ void LumaInterpolationFilterPoseOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2835,7 +3276,7 @@ void LumaInterpolationFilterPosfOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 3 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2847,7 +3288,7 @@ void LumaInterpolationFilterPosgOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 1); } @@ -2859,7 +3300,7 @@ void LumaInterpolationFilterPosiOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2871,7 +3312,7 @@ void LumaInterpolationFilterPosjOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2883,7 +3324,7 @@ void LumaInterpolationFilterPoskOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-3*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+7, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRawM_SSSE3(firstPassIFDst, dst, puWidth, puHeight); } @@ -2895,7 +3336,7 @@ void LumaInterpolationFilterPospOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 1); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } @@ -2909,7 +3350,7 @@ void LumaInterpolationFilterPosqOutRaw_SSSE3( { EB_U32 puHeight1 = puHeight + 6; EB_BYTE refPic1 = refPic - 2 * srcStride; - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); + 
LumaInterpolationFilterOneDOutRawHorizontal(refPic1, srcStride, firstPassIFDst, puWidth, puHeight1, 2); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } @@ -2921,6 +3362,6 @@ void LumaInterpolationFilterPosrOutRaw_SSSE3( EB_U32 puHeight, EB_S16 *firstPassIFDst) { - LumaInterpolationFilterOneDOutRawHorizontal_SSSE3(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); + LumaInterpolationFilterOneDOutRawHorizontal(refPic-2*srcStride, srcStride, firstPassIFDst, puWidth, puHeight+6, 3); EbHevcLumaInterpolationFilterTwoDInRawOutRaw7(firstPassIFDst, dst, puWidth, puHeight, 3); } diff --git a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h index 6e18ee5c4..456689084 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h +++ b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h @@ -12,14 +12,16 @@ extern "C" { #endif -#ifdef NON_AVX512_SUPPORT +#ifdef VNNI_SUPPORT +#define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 +#define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_VNNI +#define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_VNNI +#define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_VNNI +#else #define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_SSSE3 +#define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_SSSE3 #define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_SSSE3 -#else -#define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_AVX512 -#define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_AVX512 -#define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_AVX512 #endif // SSSE3 functions diff --git a/Source/Lib/Codec/EbDefinitions.h b/Source/Lib/Codec/EbDefinitions.h index adf38d530..eb78d1e4d 100644 --- a/Source/Lib/Codec/EbDefinitions.h +++ b/Source/Lib/Codec/EbDefinitions.h @@ -19,6 +19,7 @@ extern "C" { //#define DEBUG_LIFE_CYCLE 0 // Internal Marcos #define NON_AVX512_SUPPORT +//#define VNNI_SUPPORT #ifdef __cplusplus #define EB_EXTERN extern "C" From 0f5ce17a400809c4ea2891b767c383c4aebce749 Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Wed, 1 Apr 2020 12:41:53 -0700 Subject: [PATCH 11/12] avx512-changes_v3 Signed-off-by: deeptiag1 --- Source/Lib/ASM_AVX2/CMakeLists.txt | 1 - .../ASM_AVX2/EbTransforms_Intrinsic_AVX2.c | 2 +- .../ASM_AVX2/EbTransforms_Intrinsic_VNNI.c | 194 ------------------ .../ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c | 138 +++++++++++++ Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c | 2 +- Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h | 8 - Source/Lib/Codec/EbDefinitions.h | 1 - 7 files changed, 140 insertions(+), 206 deletions(-) delete mode 100644 Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c create mode 100644 Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c diff --git a/Source/Lib/ASM_AVX2/CMakeLists.txt b/Source/Lib/ASM_AVX2/CMakeLists.txt index 0897d7929..28389ee58 100644 --- a/Source/Lib/ASM_AVX2/CMakeLists.txt +++ b/Source/Lib/ASM_AVX2/CMakeLists.txt @@ -82,7 +82,6 @@ set(ASM_AVX2_SOURCE EbNoiseExtractAVX2.c EbPackUnPack_Intrinsic_AVX2.c EbPictureOperators_Intrinsic_AVX2.c - EbTransforms_Intrinsic_VNNI.c 
EbTransforms_Intrinsic_AVX2.c) if(COMPILE_AS_CPP) diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c index b9497a71f..d4662b08b 100644 --- a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c +++ b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c @@ -1775,7 +1775,7 @@ void MatMultNxN_AVX2_INTRIN( *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1))); } - +//VNNI code #ifdef VNNI_SUPPORT void EbHevcTransform32_VNNI_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) { diff --git a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c b/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c deleted file mode 100644 index 8afc854a8..000000000 --- a/Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_VNNI.c +++ /dev/null @@ -1,194 +0,0 @@ -#include "EbTransforms_AVX2.h" -#include "EbDefinitions.h" - -#include -#include - -#ifdef VNNI_SUPPORT - -#ifdef __GNUC__ -__attribute__((aligned(16))) -#endif -static EB_ALIGN(32) const EB_S16 EbHevcCoeff_tbl_AVX2[48 * 16] = -{ - 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, - 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, - 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, - 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18, - 90, 87, 87, 57, 80, 9, 70, -43, 90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25, 57, -80, 43, -90, 25, -70, 9, -25, - 80, 70, 9, -43, -70, -87, -87, 9, 80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57, -25, 90, 57, 25, 90, -80, 43, -57, - 57, 43, -80, -90, -25, 57, 90, 25, 57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80, -9, -87, -87, 70, 43, 9, 70, -80, - 25, 9, -70, -25, 90, 43, -80, -57, 25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90, 43, 70, 9, -80, -57, 87, 87, -90, - 90, 90, 90, 82, 88, 67, 85, 46, 90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54, 82, 22, 78, -4, 73, -31, 67, -54, - 61, -73, 54, -85, 46, -90, 38, -88, 61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13, 31, -78, 22, -61, 13, -38, 4, -13, - 88, 85, 67, 46, 31, -13, -13, -67, 88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38, -54, -90, -82, -73, -90, -22, -78, 38, - -46, 82, -4, 88, 38, 54, 73, -4, -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31, 90, -61, 85, -90, 61, -78, 22, -31, - 82, 78, 22, -4, -54, -82, -90, -73, 82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22, -61, 13, 13, 85, 78, 67, 85, -22, - 31, -88, -46, -61, -90, 31, -67, 90, 31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46, 4, 54, 73, -38, 88, -90, 38, -46, - 73, 67, -31, -54, -90, -78, -22, 38, 73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4, 78, 85, 67, -22, -38, -90, -90, 4, - -13, 90, 82, 13, 61, -88, -46, -31, -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61, -88, 82, -4, 46, 85, -73, 54, -61, - 61, 54, -73, -85, -46, -4, 82, 88, 61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13, 31, -46, -88, -61, -13, 82, 90, 13, - -4, -90, -90, 38, 
22, 67, 85, -78, -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73, -38, -22, -78, 90, 54, -31, 67, -73, - 46, 38, -90, -88, 38, 73, 54, -4, 46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31, -90, -67, 31, 90, 61, -46, -88, -31, - 22, 85, 67, -78, -85, 13, 13, 61, 22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82, 73, -90, -82, 54, 4, 22, 78, -82, - 31, 22, -78, -61, 90, 85, -61, -90, 31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46, 4, 73, 54, -38, -88, -4, 82, 46, - -38, -78, -22, 90, 73, -82, -90, 54, -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88, 67, -13, -13, -31, -46, 67, 85, -88, - 13, 4, -38, -13, 61, 22, -78, -31, 13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61, 88, 38, -90, -46, 85, 54, -73, -61, - 54, 67, -31, -73, 4, 78, 22, -82, 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90, -46, 85, 67, -88, -82, 90, 90, -90 -}; - -extern void EbHevcTransform32_VNNI_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift) -{ - EB_U32 i; - __m128i s0; - __m256i o0; - const __m256i *coeff32 = (const __m256i *)EbHevcCoeff_tbl_AVX2; - shift &= 0x0000FFFF; // Redundant code to fix Visual Studio 2012 AVX2 compiler error - s0 = _mm_cvtsi32_si128(shift); - o0 = _mm256_set1_epi32(1 << (shift - 1)); - - for (i = 0; i < 16; i++) - { - __m256i x0, x1, x2, x3,sox0,sox5,soxa,soxf,s1x0,s1x5,s1xa,s1xf; - __m256i y0, y1, y2, y3; - __m256i aa4, aa5, aa6, aa7; - __m256i a0, a1, a2, a3, a4, a5, a6, a7; - __m256i b0, b1, b2, b3, b4, b5, b6, b7; - - x0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x00))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x00)), 0x1); - x1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x08))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x08)), 0x1); - x2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x10))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x10)), 0x1); - x3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x18))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x18)), 0x1); - - // 32-point butterfly - x2 = _mm256_shuffle_epi8(x2, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - x3 = _mm256_shuffle_epi8(x3, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - - y0 = _mm256_add_epi16(x0, x3); - y1 = _mm256_add_epi16(x1, x2); - - y2 = _mm256_sub_epi16(x0, x3); - y3 = _mm256_sub_epi16(x1, x2); - - // 16-point butterfly - y1 = _mm256_shuffle_epi8(y1, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - - x0 = _mm256_add_epi16(y0, y1); - x1 = _mm256_sub_epi16(y0, y1); - - x2 = y2; - x3 = y3; - - - sox0 = _mm256_shuffle_epi32(x0, 0x00); - sox5 = _mm256_shuffle_epi32(x0, 0x55); - soxa = _mm256_shuffle_epi32(x0, 0xaa); - soxf = _mm256_shuffle_epi32(x0, 0xff); - s1x0 = _mm256_shuffle_epi32(x1, 0x00); - s1x5 = _mm256_shuffle_epi32(x1, 0x55); - s1xa = _mm256_shuffle_epi32(x1, 0xaa); - s1xf = _mm256_shuffle_epi32(x1, 0xff); - - a0 = _mm256_madd_epi16(sox0, coeff32[0]); - - a0 = _mm256_dpwssd_epi32(a0, sox5, coeff32[2]); - a0 = _mm256_dpwssd_epi32(a0, soxa, coeff32[4]); - a0 = 
_mm256_dpwssd_epi32(a0, soxf, coeff32[6]); - - a1 = _mm256_madd_epi16(sox0, coeff32[1]); - a1 = _mm256_dpwssd_epi32(a1, sox5, coeff32[3]); - a1 = _mm256_dpwssd_epi32(a1, soxa, coeff32[5]); - a1 = _mm256_dpwssd_epi32(a1, soxf, coeff32[7]); - - a2 = _mm256_madd_epi16(s1x0, coeff32[8]); - a2 = _mm256_dpwssd_epi32(a2, s1x5, coeff32[10]); - a2 = _mm256_dpwssd_epi32(a2, s1xa, coeff32[12]); - a2 = _mm256_dpwssd_epi32(a2, s1xf, coeff32[14]); - - a3 = _mm256_madd_epi16(s1x0, coeff32[9]); - a3 = _mm256_dpwssd_epi32(a3, s1x5, coeff32[11]); - a3 = _mm256_dpwssd_epi32(a3, s1xa, coeff32[13]); - a3 = _mm256_dpwssd_epi32(a3, s1xf, coeff32[15]); - - sox0 = _mm256_shuffle_epi32(x2, 0x00); - sox5 = _mm256_shuffle_epi32(x2, 0x55); - soxa = _mm256_shuffle_epi32(x2, 0xaa); - soxf = _mm256_shuffle_epi32(x2, 0xff); - s1x0 = _mm256_shuffle_epi32(x3, 0x00); - s1x5 = _mm256_shuffle_epi32(x3, 0x55); - s1xa = _mm256_shuffle_epi32(x3, 0xaa); - s1xf = _mm256_shuffle_epi32(x3, 0xff); - - a4 = _mm256_madd_epi16(sox0, coeff32[16]); - a4 = _mm256_dpwssd_epi32(a4, sox5, coeff32[20]); - a4 = _mm256_dpwssd_epi32(a4, soxa, coeff32[24]); - a4 = _mm256_dpwssd_epi32(a4, soxf, coeff32[28]); - a4 = _mm256_dpwssd_epi32(a4, s1x0, coeff32[32]); - a4 = _mm256_dpwssd_epi32(a4, s1x5, coeff32[36]); - a4 = _mm256_dpwssd_epi32(a4, s1xa, coeff32[40]); - a4 = _mm256_dpwssd_epi32(a4, s1xf, coeff32[44]); - - a5 = _mm256_madd_epi16(sox0, coeff32[17]); - a5 = _mm256_dpwssd_epi32(a5, sox5, coeff32[21]); - a5 = _mm256_dpwssd_epi32(a5, soxa, coeff32[25]); - a5 = _mm256_dpwssd_epi32(a5, soxf, coeff32[29]); - a5 = _mm256_dpwssd_epi32(a5, s1x0, coeff32[33]); - a5 = _mm256_dpwssd_epi32(a5, s1x5, coeff32[37]); - a5 = _mm256_dpwssd_epi32(a5, s1xa, coeff32[41]); - a5 = _mm256_dpwssd_epi32(a5, s1xf, coeff32[45]); - - a6 = _mm256_madd_epi16(sox0, coeff32[18]); - a6 = _mm256_dpwssd_epi32(a6, sox5, coeff32[22]); - a6 = _mm256_dpwssd_epi32(a6, soxa, coeff32[26]); - a6 = _mm256_dpwssd_epi32(a6, soxf, coeff32[30]); - a6 = _mm256_dpwssd_epi32(a6, s1x0, coeff32[34]); - a6 = _mm256_dpwssd_epi32(a6, s1x5, coeff32[38]); - a6 = _mm256_dpwssd_epi32(a6, s1xa, coeff32[42]); - a6 = _mm256_dpwssd_epi32(a6, s1xf, coeff32[46]); - - a7 = _mm256_madd_epi16(sox0, coeff32[19]); - a7 = _mm256_dpwssd_epi32(a7, sox5, coeff32[23]); - a7 = _mm256_dpwssd_epi32(a7, soxa, coeff32[27]); - a7 = _mm256_dpwssd_epi32(a7, soxf, coeff32[31]); - a7 = _mm256_dpwssd_epi32(a7, s1x0, coeff32[35]); - a7 = _mm256_dpwssd_epi32(a7, s1x5, coeff32[39]); - a7 = _mm256_dpwssd_epi32(a7, s1xa, coeff32[43]); - a7 = _mm256_dpwssd_epi32(a7, s1xf, coeff32[47]); - - b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0); - b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0); - b2 = _mm256_sra_epi32(_mm256_add_epi32(a2, o0), s0); - b3 = _mm256_sra_epi32(_mm256_add_epi32(a3, o0), s0); - b4 = _mm256_sra_epi32(_mm256_add_epi32(a4, o0), s0); - b5 = _mm256_sra_epi32(_mm256_add_epi32(a5, o0), s0); - b6 = _mm256_sra_epi32(_mm256_add_epi32(a6, o0), s0); - b7 = _mm256_sra_epi32(_mm256_add_epi32(a7, o0), s0); - - x0 = _mm256_packs_epi32(b0, b1); - x1 = _mm256_packs_epi32(b2, b3); - x2 = _mm256_packs_epi32(b4, b5); - x3 = _mm256_packs_epi32(b6, b7); - - y0 = _mm256_unpacklo_epi16(x0, x1); - y1 = _mm256_unpackhi_epi16(x0, x1); - y2 = x2; - y3 = x3; - x0 = _mm256_unpacklo_epi16(y0, y2); - x1 = _mm256_unpackhi_epi16(y0, y2); - x2 = _mm256_unpacklo_epi16(y1, y3); - x3 = _mm256_unpackhi_epi16(y1, y3); - - y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1); - y1 = 
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 0)), _mm256_extracti128_si256(x3, 0), 0x1); - y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1); - y3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 1)), _mm256_extracti128_si256(x3, 1), 0x1); - _mm256_storeu_si256((__m256i *)(dst + 0x00), y0); - _mm256_storeu_si256((__m256i *)(dst + 0x10), y1); - _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x00), y2); - _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x10), y3); - - src += 2 * src_stride; - dst += 2 * dst_stride; - } -} -#endif \ No newline at end of file diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c new file mode 100644 index 000000000..29bded488 --- /dev/null +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c @@ -0,0 +1,138 @@ +#include "EbMcp_SSSE3.h" +#include "EbDefinitions.h" + +#include "immintrin.h" + +#ifdef VNNI_SUPPORT + +const EB_S16 EbHevcLumaFilterCoeff1[4][8] = +{ + { 0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 4,-10, 58, 17, -5, 1, 0}, + {-1, 4,-11, 40, 40,-11, 4, -1}, + { 0, 1, -5, 17, 58,-10, 4, -1} +}; + +static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = + { + { 0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 4,-10, 58, 17, -5, 1, 0}, + {-1, 4,-11, 40, 40,-11, 4, -1}, + { 1, -5, 17, 58,-10, 4, -1, 0} + }; + +#ifndef NON_AVX512_SUPPORT +void LumaInterpolationFilterOneDOutRawHorizontal_AVX512( + EB_BYTE refPic, + EB_U32 srcStride, + EB_S16 *dst, + EB_U32 puWidth, + EB_U32 puHeight, + EB_U32 fracPosx) +{ + EB_S32 rowCount, colCount; + __m128i c0, c1, c2, c3; // coeffs + __m128i a0, a1; + __m128i b0; + __m128i sum; + EB_BYTE ptr; + + refPic -= 3; + + PrefetchBlock(refPic, srcStride, (puWidth == 4) ? 16 : puWidth+8, (puWidth == 4) ? 
((puHeight+1)&~1) : puHeight); + + c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff1[fracPosx]); + c0 = _mm_packs_epi16(c0, c0); + __m128i ct = _mm_srli_epi64(c0, 32); + __m512i cc0 = _mm512_broadcastd_epi32(c0); + __m512i cc1 = _mm512_broadcastd_epi32(ct); + c0 = _mm_unpacklo_epi16(c0, c0); + c3 = _mm_shuffle_epi32(c0, 0xff); + c2 = _mm_shuffle_epi32(c0, 0xaa); + c1 = _mm_shuffle_epi32(c0, 0x55); + c0 = _mm_shuffle_epi32(c0, 0x00); + __m512i b1 = _mm512_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); + __m512i b2 = _mm512_set_epi8(14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4); + + + if (puWidth & 4) + { + ptr = refPic; + rowCount = puHeight; + do + { + a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + a1 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + b0 = _mm_unpacklo_epi64(a0, a1); + sum = _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c0); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c1)); + b0 = _mm_unpacklo_epi64(_mm_srli_si128(a0, 4), _mm_srli_si128(a1, 4)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c2)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); + + sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); + + _mm_storeu_si128((__m128i *)dst, sum); + dst += 8; + + rowCount -= 2; + } + while (rowCount > 0); + + puWidth -= 4; + if (puWidth == 0) + { + return; + } + + refPic += 4; + } + colCount = puWidth; + int rowLoop = puHeight >>1 ;//divide by 2 + int evenRow = puHeight & 1; + do + { + ptr = refPic; + // rowCount = puHeight; + int rowCount = rowLoop ;//divide by 2 + do + { + __m512i a1 = _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr))); + __m256i b0 = _mm256_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr + srcStride))); ptr += 2 * srcStride; + __m512i s1 = _mm512_inserti64x4(a1, b0, 1); + __m512i sh2 = _mm512_shuffle_epi8(s1, b1); + __m512i sh3 = _mm512_shuffle_epi8(s1, b2); + __m512i sum00 = _mm512_setzero_si512(); + __m512i sum0 = _mm512_dpbusds_epi32(sum00, sh2, cc0); + __m512i sum1 = _mm512_dpbusds_epi32(sum0, sh3, cc1); + __m512i f1 = _mm512_packs_epi32(sum1,sum1);// + __m512i f2 = _mm512_permutexvar_epi64( _mm512_setr_epi64(0x0, 0x0000000000000002, 0x0000000000000004, 0x0000000000000006, 0x0, 0x0002000200020002, 0x0004000400040004, 0x0006000600060006), f1); + f2 = _mm512_sub_epi16(f2, _mm512_set1_epi16(128 * 64)); + _mm256_storeu_si256((__m256i*)dst, _mm512_castsi512_si256(f2)); + dst += 16; + rowCount = rowCount - 1; + } + while (rowCount > 0); + + if (evenRow) + { + a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; + + sum = _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8)), c0); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10)), c1)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 
10, 11, 11, 12)), c2)); + sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); + + sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); + + _mm_storeu_si128((__m128i *)dst, sum); + dst += 8; + } + + refPic += 8; + colCount -= 8; + } + while (colCount > 0); +} +#endif +#endif diff --git a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c index 0a1051797..c4cd1ef23 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c +++ b/Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_SSSE3.c @@ -2799,7 +2799,7 @@ void BiPredClippingOnTheFly_SSSE3( dst += 8; } while (colCount != 0); } - +//Vnni code #ifdef VNNI_SUPPORT void EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_VNNI(EB_S16 *firstPassIFDst, EB_S16 *dst, EB_U32 puWidth, EB_U32 puHeight, EB_U32 fracPosy) { diff --git a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h index 2021a532d..98538ede3 100644 --- a/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h +++ b/Source/Lib/ASM_SSSE3/EbMcp_SSSE3.h @@ -13,15 +13,7 @@ extern "C" { #endif #ifdef VNNI_SUPPORT - #define LumaInterpolationFilterOneDOutRawHorizontal LumaInterpolationFilterOneDOutRawHorizontal_SSSE3 - - - - - - - #define EbHevcLumaInterpolationFilterTwoDInRaw7 EbHevcLumaInterpolationFilterTwoDInRaw7_VNNI #define EbHevcLumaInterpolationFilterTwoDInRawOutRaw7 EbHevcLumaInterpolationFilterTwoDInRawOutRaw7_VNNI #define EbHevcLumaInterpolationFilterTwoDInRawM EbHevcLumaInterpolationFilterTwoDInRawM_VNNI diff --git a/Source/Lib/Codec/EbDefinitions.h b/Source/Lib/Codec/EbDefinitions.h index 0e65784ad..7c921c83c 100644 --- a/Source/Lib/Codec/EbDefinitions.h +++ b/Source/Lib/Codec/EbDefinitions.h @@ -20,7 +20,6 @@ extern "C" { // Internal Marcos #define NON_AVX512_SUPPORT -======= //#define NON_AVX512_SUPPORT //#define VNNI_SUPPORT From a23033fe7a1ffba0a538a7ef9be530e643967aa6 Mon Sep 17 00:00:00 2001 From: deeptiag1 Date: Wed, 1 Apr 2020 12:45:07 -0700 Subject: [PATCH 12/12] avx512-changes_v3 Signed-off-by: deeptiag1 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 53a7c7ab0..f7dd9bd24 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ In order to run the highest resolution supported by the encoder, at least 64GB o - Download the yasm exe from the following [link](http://www.tortall.net/projects/yasm/releases/yasm-1.3.0-win64.exe) - Rename yasm-1.3.0-win64.exe to yasm.exe - Copy yasm.exe into a location that is in the PATH environment variable - - Vnni requires gcc version greater then 9.2. + - Vnni requires gcc version >= 9.2. - __Build Instructions__ - Build the project by following the steps below in a windows command prompt:
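
Note on the VNNI paths added above: the second-pass (vertical) filters replace the SSSE3 pattern "_mm_add_epi32(sum, _mm_madd_epi16(pix, coef))" with the fused AVX512-VNNI form "_mm_dpwssd_epi32(sum, pix, coef)" (vpdpwssd), which performs the 16-bit pair-wise multiplies and the 32-bit accumulate in a single instruction; the first-pass horizontal filter likewise uses the 512-bit byte-level variant _mm512_dpbusds_epi32. The fragment below is a minimal, self-contained sketch of that substitution only, assuming the patch's VNNI_SUPPORT macro and suitable compiler flags (-mavx512vnni -mavx512vl); the function name filter_step and its arguments are illustrative and are not part of the source tree.

    #include <immintrin.h>

    /* One multiply-accumulate step of the vertical luma filter.
     * 'acc' holds four 32-bit partial sums, 'pix' holds four interleaved
     * 16-bit sample pairs and 'coef' the matching 16-bit coefficient pairs. */
    static inline __m128i filter_step(__m128i acc, __m128i pix, __m128i coef)
    {
    #ifdef VNNI_SUPPORT
        /* vpdpwssd: acc[i] += pix[2i]*coef[2i] + pix[2i+1]*coef[2i+1] */
        return _mm_dpwssd_epi32(acc, pix, coef);
    #else
        /* SSSE3/SSE2 equivalent: pmaddwd followed by a separate paddd */
        return _mm_add_epi32(acc, _mm_madd_epi16(pix, coef));
    #endif
    }

The 257<<11 value used to seed sum0/sum1 in the *_VNNI two-dimensional filters appears to fold two constants into the accumulator so that no separate add is needed afterwards: the rounding offset 1<<11 for the final arithmetic shift right by 12, plus 256<<11 (= 128*64*64), which undoes the 128*64 offset the first horizontal pass subtracts from each intermediate sample before the result is packed to unsigned bytes.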