From 5d5de794cc68c8442d73d7de57b21b422c4dfd4f Mon Sep 17 00:00:00 2001 From: Tobias Decking Date: Tue, 22 Oct 2024 19:54:20 +0200 Subject: [PATCH] Implement LLVM x86 vpclmulqdq intrinsics --- src/shims/x86/mod.rs | 87 ++++--- .../shims/x86/intrinsics-x86-vpclmulqdq.rs | 223 ++++++++++++++++++ 2 files changed, 273 insertions(+), 37 deletions(-) create mode 100644 tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs diff --git a/src/shims/x86/mod.rs b/src/shims/x86/mod.rs index 9339d301ae..b67be41ced 100644 --- a/src/shims/x86/mod.rs +++ b/src/shims/x86/mod.rs @@ -95,11 +95,22 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { } } - "pclmulqdq" => { + "pclmulqdq" | "pclmulqdq.256" | "pclmulqdq.512" => { + let mut width = 2; + this.expect_target_feature_for_intrinsic(link_name, "pclmulqdq")?; + if unprefixed_name.ends_with(".256") { + this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?; + width = 4; + } else if unprefixed_name.ends_with(".512") { + this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?; + this.expect_target_feature_for_intrinsic(link_name, "avx512f")?; + width = 8; + } + let [left, right, imm] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - pclmulqdq(this, left, right, imm, dest)?; + pclmulqdq(this, left, right, imm, dest, width)?; } name if name.starts_with("bmi.") => { @@ -1144,51 +1155,53 @@ fn pclmulqdq<'tcx>( right: &OpTy<'tcx>, imm8: &OpTy<'tcx>, dest: &MPlaceTy<'tcx>, + width: u64, ) -> InterpResult<'tcx, ()> { assert_eq!(left.layout, right.layout); assert_eq!(left.layout.size, dest.layout.size); + assert!([2u64, 4, 8].contains(&width)); - // Transmute to `[u64; 2]` + // Transmute the input into arrays of `u64`. + // Transmute the output into an array of `u128`. - let array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?; - let left = left.transmute(array_layout, this)?; - let right = right.transmute(array_layout, this)?; - let dest = dest.transmute(array_layout, this)?; + let src_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, width))?; + let dest_layout = + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u128, width / 2))?; + + let left = left.transmute(src_layout, this)?; + let right = right.transmute(src_layout, this)?; + let dest = dest.transmute(dest_layout, this)?; let imm8 = this.read_scalar(imm8)?.to_u8()?; - // select the 64-bit integer from left that the user specified (low or high) - let index = if (imm8 & 0x01) == 0 { 0 } else { 1 }; - let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?; - - // select the 64-bit integer from right that the user specified (low or high) - let index = if (imm8 & 0x10) == 0 { 0 } else { 1 }; - let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?; - - // Perform carry-less multiplication - // - // This operation is like long multiplication, but ignores all carries. - // That idea corresponds to the xor operator, which is used in the implementation. - // - // Wikipedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example - let mut result: u128 = 0; - - for i in 0..64 { - // if the i-th bit in right is set - if (right & (1 << i)) != 0 { - // xor result with `left` shifted to the left by i positions - result ^= u128::from(left) << i; + for i in 0..(width / 2) { + // select the 64-bit integer from left that the user specified (low or high) + let index = 2 * i + if (imm8 & 0x01) == 0 { 0 } else { 1 }; + let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?; + + // select the 64-bit integer from right that the user specified (low or high) + let index = 2 * i + if (imm8 & 0x10) == 0 { 0 } else { 1 }; + let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?; + + // Perform carry-less multiplication. + // + // This operation is like long multiplication, but ignores all carries. + // That idea corresponds to the xor operator, which is used in the implementation. + // + // Wikipedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example + let mut result: u128 = 0; + + for i in 0..64 { + // if the i-th bit in right is set + if (right & (1 << i)) != 0 { + // xor result with `left` shifted to the left by i positions + result ^= u128::from(left) << i; + } } - } - - let result_low = (result & 0xFFFF_FFFF_FFFF_FFFF) as u64; - let result_high = (result >> 64) as u64; - - let dest_low = this.project_index(&dest, 0)?; - this.write_scalar(Scalar::from_u64(result_low), &dest_low)?; - let dest_high = this.project_index(&dest, 1)?; - this.write_scalar(Scalar::from_u64(result_high), &dest_high)?; + let dest = this.project_index(&dest, i)?; + this.write_scalar(Scalar::from_u128(result), &dest)?; + } interp_ok(()) } diff --git a/tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs b/tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs new file mode 100644 index 0000000000..db60c269be --- /dev/null +++ b/tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs @@ -0,0 +1,223 @@ +// We're testing x86 target specific features +//@only-target: x86_64 i686 +//@compile-flags: -C target-feature=+vpclmulqdq,+avx512f + +// The constants in the tests below are just bit patterns. They should not +// be interpreted as integers; signedness does not make sense for them, but +// __mXXXi happens to be defined in terms of signed integers. +#![allow(overflowing_literals)] +#![feature(avx512_target_feature)] +#![feature(stdarch_x86_avx512)] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::mem::transmute; + +fn main() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs + + assert!(is_x86_feature_detected!("pclmulqdq")); + assert!(is_x86_feature_detected!("vpclmulqdq")); + assert!(is_x86_feature_detected!("avx512f")); + + unsafe { + test_mm256_clmulepi64_epi128(); + test_mm512_clmulepi64_epi128(); + } +} + +macro_rules! verify_kat_pclmul { + ($broadcast:ident, $clmul:ident, $assert:ident) => { + // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf + let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d); + let a = $broadcast(a); + let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d); + let b = $broadcast(b); + let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451); + let r00 = $broadcast(r00); + let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315); + let r01 = $broadcast(r01); + let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); + let r10 = $broadcast(r10); + let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); + let r11 = $broadcast(r11); + + $assert($clmul::<0x00>(a, b), r00); + $assert($clmul::<0x10>(a, b), r01); + $assert($clmul::<0x01>(a, b), r10); + $assert($clmul::<0x11>(a, b), r11); + + let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); + let a0 = $broadcast(a0); + let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); + let r = $broadcast(r); + $assert($clmul::<0x00>(a0, a0), r); + } +} + +macro_rules! unroll { + ($target:ident[4] = $op:ident::<4>($source:ident);) => { + $target[3] = $op::<3>($source); + $target[2] = $op::<2>($source); + unroll! {$target[2] = $op::<2>($source);} + }; + ($target:ident[2] = $op:ident::<2>($source:ident);) => { + $target[1] = $op::<1>($source); + $target[0] = $op::<0>($source); + }; + (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => { + assert_eq_m128i($op::<3>($vec_res), $lin_res[3]); + assert_eq_m128i($op::<2>($vec_res), $lin_res[2]); + unroll! {assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);} + }; + (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => { + assert_eq_m128i($op::<1>($vec_res), $lin_res[1]); + assert_eq_m128i($op::<0>($vec_res), $lin_res[0]); + }; +} + +// this function tests one of the possible 4 instances +// with different inputs across lanes for the 512-bit version +#[target_feature(enable = "vpclmulqdq,avx512f")] +unsafe fn verify_512_helper( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m512i, __m512i) -> __m512i, +) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let b = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + + let mut a_decomp = [_mm_setzero_si128(); 4]; + unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);} + let mut b_decomp = [_mm_setzero_si128(); 4]; + unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);} + + let r = vectorized(a, b); + let mut e_decomp = [_mm_setzero_si128(); 4]; + for i in 0..4 { + e_decomp[i] = linear(a_decomp[i], b_decomp[i]); + } + unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);} +} + +// this function tests one of the possible 4 instances +// with different inputs across lanes for the 256-bit version +#[target_feature(enable = "vpclmulqdq")] +unsafe fn verify_256_helper( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m256i, __m256i) -> __m256i, +) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let b = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + + let mut a_decomp = [_mm_setzero_si128(); 2]; + unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);} + let mut b_decomp = [_mm_setzero_si128(); 2]; + unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);} + + let r = vectorized(_mm512_extracti64x4_epi64::<0>(a), _mm512_extracti64x4_epi64::<0>(b)); + let mut e_decomp = [_mm_setzero_si128(); 2]; + for i in 0..2 { + e_decomp[i] = linear(a_decomp[i], b_decomp[i]); + } + unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);} +} + +#[target_feature(enable = "vpclmulqdq,avx512f")] +unsafe fn test_mm512_clmulepi64_epi128() { + verify_kat_pclmul!(_mm512_broadcast_i32x4, _mm512_clmulepi64_epi128, assert_eq_m512i); + + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b), + ); +} + +#[target_feature(enable = "vpclmulqdq")] +unsafe fn test_mm256_clmulepi64_epi128() { + verify_kat_pclmul!(_mm256_broadcastsi128_si256, _mm256_clmulepi64_epi128, assert_eq_m256i); + + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b), + ); +} + +#[track_caller] +#[target_feature(enable = "avx512f")] +unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { + assert_eq!(transmute::<_, [u64; 8]>(a), transmute::<_, [u64; 8]>(b)) +} + +#[track_caller] +#[target_feature(enable = "avx")] +unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) +} + +#[track_caller] +#[target_feature(enable = "sse2")] +unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +}