From 5d5de794cc68c8442d73d7de57b21b422c4dfd4f Mon Sep 17 00:00:00 2001
From: Tobias Decking <Tobias.Decking@gmail.com>
Date: Tue, 22 Oct 2024 19:54:20 +0200
Subject: [PATCH] Implement LLVM x86 vpclmulqdq intrinsics

---
 src/shims/x86/mod.rs                          |  87 ++++---
 .../shims/x86/intrinsics-x86-vpclmulqdq.rs    | 223 ++++++++++++++++++
 2 files changed, 273 insertions(+), 37 deletions(-)
 create mode 100644 tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs

diff --git a/src/shims/x86/mod.rs b/src/shims/x86/mod.rs
index 9339d301ae..b67be41ced 100644
--- a/src/shims/x86/mod.rs
+++ b/src/shims/x86/mod.rs
@@ -95,11 +95,22 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 }
             }
 
-            "pclmulqdq" => {
+            "pclmulqdq" | "pclmulqdq.256" | "pclmulqdq.512" => {
+                let mut width = 2;
+                this.expect_target_feature_for_intrinsic(link_name, "pclmulqdq")?;
+                if unprefixed_name.ends_with(".256") {
+                    this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?;
+                    width = 4;
+                } else if unprefixed_name.ends_with(".512") {
+                    this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?;
+                    this.expect_target_feature_for_intrinsic(link_name, "avx512f")?;
+                    width = 8;
+                }
+
                 let [left, right, imm] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                pclmulqdq(this, left, right, imm, dest)?;
+                pclmulqdq(this, left, right, imm, dest, width)?;
             }
 
             name if name.starts_with("bmi.") => {
@@ -1144,51 +1155,53 @@ fn pclmulqdq<'tcx>(
     right: &OpTy<'tcx>,
     imm8: &OpTy<'tcx>,
     dest: &MPlaceTy<'tcx>,
+    width: u64,
 ) -> InterpResult<'tcx, ()> {
     assert_eq!(left.layout, right.layout);
     assert_eq!(left.layout.size, dest.layout.size);
+    assert!([2u64, 4, 8].contains(&width));
 
-    // Transmute to `[u64; 2]`
+    // Transmute the input into arrays of `u64`.
+    // Transmute the output into an array of `u128`.
 
-    let array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
-    let left = left.transmute(array_layout, this)?;
-    let right = right.transmute(array_layout, this)?;
-    let dest = dest.transmute(array_layout, this)?;
+    let src_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, width))?;
+    let dest_layout =
+        this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u128, width / 2))?;
+
+    let left = left.transmute(src_layout, this)?;
+    let right = right.transmute(src_layout, this)?;
+    let dest = dest.transmute(dest_layout, this)?;
 
     let imm8 = this.read_scalar(imm8)?.to_u8()?;
 
-    // select the 64-bit integer from left that the user specified (low or high)
-    let index = if (imm8 & 0x01) == 0 { 0 } else { 1 };
-    let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?;
-
-    // select the 64-bit integer from right that the user specified (low or high)
-    let index = if (imm8 & 0x10) == 0 { 0 } else { 1 };
-    let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?;
-
-    // Perform carry-less multiplication
-    //
-    // This operation is like long multiplication, but ignores all carries.
-    // That idea corresponds to the xor operator, which is used in the implementation.
-    //
-    // Wikipedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example
-    let mut result: u128 = 0;
-
-    for i in 0..64 {
-        // if the i-th bit in right is set
-        if (right & (1 << i)) != 0 {
-            // xor result with `left` shifted to the left by i positions
-            result ^= u128::from(left) << i;
+    for i in 0..(width / 2) {
+        // select the 64-bit integer from left that the user specified (low or high)
+        let index = 2 * i + if (imm8 & 0x01) == 0 { 0 } else { 1 };
+        let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?;
+
+        // select the 64-bit integer from right that the user specified (low or high)
+        let index = 2 * i + if (imm8 & 0x10) == 0 { 0 } else { 1 };
+        let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?;
+
+        // Perform carry-less multiplication.
+        //
+        // This operation is like long multiplication, but ignores all carries.
+        // That idea corresponds to the xor operator, which is used in the implementation.
+        //
+        // Wikipedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example
+        let mut result: u128 = 0;
+
+        for i in 0..64 {
+            // if the i-th bit in right is set
+            if (right & (1 << i)) != 0 {
+                // xor result with `left` shifted to the left by i positions
+                result ^= u128::from(left) << i;
+            }
         }
-    }
-
-    let result_low = (result & 0xFFFF_FFFF_FFFF_FFFF) as u64;
-    let result_high = (result >> 64) as u64;
-
-    let dest_low = this.project_index(&dest, 0)?;
-    this.write_scalar(Scalar::from_u64(result_low), &dest_low)?;
 
-    let dest_high = this.project_index(&dest, 1)?;
-    this.write_scalar(Scalar::from_u64(result_high), &dest_high)?;
+        let dest = this.project_index(&dest, i)?;
+        this.write_scalar(Scalar::from_u128(result), &dest)?;
+    }
 
     interp_ok(())
 }
diff --git a/tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs b/tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs
new file mode 100644
index 0000000000..db60c269be
--- /dev/null
+++ b/tests/pass/shims/x86/intrinsics-x86-vpclmulqdq.rs
@@ -0,0 +1,223 @@
+// We're testing x86 target specific features
+//@only-target: x86_64 i686
+//@compile-flags: -C target-feature=+vpclmulqdq,+avx512f
+
+// The constants in the tests below are just bit patterns. They should not
+// be interpreted as integers; signedness does not make sense for them, but
+// __mXXXi happens to be defined in terms of signed integers.
+#![allow(overflowing_literals)]
+#![feature(avx512_target_feature)]
+#![feature(stdarch_x86_avx512)]
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+use std::mem::transmute;
+
+fn main() {
+    // Mostly copied from library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs
+
+    assert!(is_x86_feature_detected!("pclmulqdq"));
+    assert!(is_x86_feature_detected!("vpclmulqdq"));
+    assert!(is_x86_feature_detected!("avx512f"));
+
+    unsafe {
+        test_mm256_clmulepi64_epi128();
+        test_mm512_clmulepi64_epi128();
+    }
+}
+
+macro_rules! verify_kat_pclmul {
+    ($broadcast:ident, $clmul:ident, $assert:ident) => {
+        // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
+        let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
+        let a = $broadcast(a);
+        let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
+        let b = $broadcast(b);
+        let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
+        let r00 = $broadcast(r00);
+        let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
+        let r01 = $broadcast(r01);
+        let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
+        let r10 = $broadcast(r10);
+        let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
+        let r11 = $broadcast(r11);
+
+        $assert($clmul::<0x00>(a, b), r00);
+        $assert($clmul::<0x10>(a, b), r01);
+        $assert($clmul::<0x01>(a, b), r10);
+        $assert($clmul::<0x11>(a, b), r11);
+
+        let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
+        let a0 = $broadcast(a0);
+        let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
+        let r = $broadcast(r);
+        $assert($clmul::<0x00>(a0, a0), r);
+    }
+}
+
+macro_rules! unroll {
+    ($target:ident[4] = $op:ident::<4>($source:ident);) => {
+        $target[3] = $op::<3>($source);
+        $target[2] = $op::<2>($source);
+        unroll! {$target[2] = $op::<2>($source);}
+    };
+    ($target:ident[2] = $op:ident::<2>($source:ident);) => {
+        $target[1] = $op::<1>($source);
+        $target[0] = $op::<0>($source);
+    };
+    (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => {
+        assert_eq_m128i($op::<3>($vec_res), $lin_res[3]);
+        assert_eq_m128i($op::<2>($vec_res), $lin_res[2]);
+        unroll! {assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);}
+    };
+    (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => {
+        assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
+        assert_eq_m128i($op::<0>($vec_res), $lin_res[0]);
+    };
+}
+
+// this function tests one of the possible 4 instances
+// with different inputs across lanes for the 512-bit version
+#[target_feature(enable = "vpclmulqdq,avx512f")]
+unsafe fn verify_512_helper(
+    linear: unsafe fn(__m128i, __m128i) -> __m128i,
+    vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
+) {
+    let a = _mm512_set_epi64(
+        0xDCB4DB3657BF0B7D,
+        0x18DB0601068EDD9F,
+        0xB76B908233200DC5,
+        0xE478235FA8E22D5E,
+        0xAB05CFFA2621154C,
+        0x1171B47A186174C9,
+        0x8C6B6C0E7595CEC9,
+        0xBE3E7D4934E961BD,
+    );
+    let b = _mm512_set_epi64(
+        0x672F6F105A94CEA7,
+        0x8298B8FFCA5F829C,
+        0xA3927047B3FB61D8,
+        0x978093862CDE7187,
+        0xB1927AB22F31D0EC,
+        0xA9A5DA619BE4D7AF,
+        0xCA2590F56884FDC6,
+        0x19BE9F660038BDB5,
+    );
+
+    let mut a_decomp = [_mm_setzero_si128(); 4];
+    unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);}
+    let mut b_decomp = [_mm_setzero_si128(); 4];
+    unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);}
+
+    let r = vectorized(a, b);
+    let mut e_decomp = [_mm_setzero_si128(); 4];
+    for i in 0..4 {
+        e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
+    }
+    unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);}
+}
+
+// this function tests one of the possible 4 instances
+// with different inputs across lanes for the 256-bit version
+#[target_feature(enable = "vpclmulqdq")]
+unsafe fn verify_256_helper(
+    linear: unsafe fn(__m128i, __m128i) -> __m128i,
+    vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
+) {
+    let a = _mm512_set_epi64(
+        0xDCB4DB3657BF0B7D,
+        0x18DB0601068EDD9F,
+        0xB76B908233200DC5,
+        0xE478235FA8E22D5E,
+        0xAB05CFFA2621154C,
+        0x1171B47A186174C9,
+        0x8C6B6C0E7595CEC9,
+        0xBE3E7D4934E961BD,
+    );
+    let b = _mm512_set_epi64(
+        0x672F6F105A94CEA7,
+        0x8298B8FFCA5F829C,
+        0xA3927047B3FB61D8,
+        0x978093862CDE7187,
+        0xB1927AB22F31D0EC,
+        0xA9A5DA619BE4D7AF,
+        0xCA2590F56884FDC6,
+        0x19BE9F660038BDB5,
+    );
+
+    let mut a_decomp = [_mm_setzero_si128(); 2];
+    unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);}
+    let mut b_decomp = [_mm_setzero_si128(); 2];
+    unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);}
+
+    let r = vectorized(_mm512_extracti64x4_epi64::<0>(a), _mm512_extracti64x4_epi64::<0>(b));
+    let mut e_decomp = [_mm_setzero_si128(); 2];
+    for i in 0..2 {
+        e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
+    }
+    unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);}
+}
+
+#[target_feature(enable = "vpclmulqdq,avx512f")]
+unsafe fn test_mm512_clmulepi64_epi128() {
+    verify_kat_pclmul!(_mm512_broadcast_i32x4, _mm512_clmulepi64_epi128, assert_eq_m512i);
+
+    verify_512_helper(
+        |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
+        |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b),
+    );
+    verify_512_helper(
+        |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
+        |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b),
+    );
+    verify_512_helper(
+        |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
+        |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b),
+    );
+    verify_512_helper(
+        |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
+        |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b),
+    );
+}
+
+#[target_feature(enable = "vpclmulqdq")]
+unsafe fn test_mm256_clmulepi64_epi128() {
+    verify_kat_pclmul!(_mm256_broadcastsi128_si256, _mm256_clmulepi64_epi128, assert_eq_m256i);
+
+    verify_256_helper(
+        |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
+        |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b),
+    );
+    verify_256_helper(
+        |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
+        |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b),
+    );
+    verify_256_helper(
+        |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
+        |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b),
+    );
+    verify_256_helper(
+        |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
+        |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b),
+    );
+}
+
+#[track_caller]
+#[target_feature(enable = "avx512f")]
+unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
+    assert_eq!(transmute::<_, [u64; 8]>(a), transmute::<_, [u64; 8]>(b))
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+    assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}