From ff2bb22be5b970eb5cd023f1f8c4ee6143166825 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Fri, 1 Dec 2023 15:50:33 -0700
Subject: [PATCH 1/6] [WIP] Bernstein-Yang modular inversion algorithm

Imported from: https://github.com/privacy-scaling-explorations/halo2curves/pull/83

Original code is Apache 2.0+MIT. Attribution has been added to the top
of the module.
---
 src/modular.rs                |   2 +
 src/modular/bernstein_yang.rs | 435 ++++++++++++++++++++++++++++++++++
 2 files changed, 437 insertions(+)
 create mode 100644 src/modular/bernstein_yang.rs
diff --git a/src/modular.rs b/src/modular.rs
index add5bb82..5ae3d92f 100644
--- a/src/modular.rs
+++ b/src/modular.rs
@@ -21,6 +21,7 @@ mod reduction;
 mod residue;
 
 mod add;
+mod bernstein_yang;
 mod div_by_2;
 mod inv;
 mod mul;
@@ -31,6 +32,7 @@ mod sub;
 pub(crate) mod boxed_residue;
 
 pub use self::{
+    bernstein_yang::BYInverter,
     dyn_residue::{DynResidue, DynResidueParams},
     reduction::montgomery_reduction,
     residue::{Residue, ResidueParams},
diff --git a/src/modular/bernstein_yang.rs b/src/modular/bernstein_yang.rs
new file mode 100644
index 00000000..b2959d52
--- /dev/null
+++ b/src/modular/bernstein_yang.rs
@@ -0,0 +1,435 @@
+//! Implementation of Bernstein-Yang modular inversion and GCD algorithm as described in:
+//! <https://eprint.iacr.org/2019/266>.
+//!
+//! Adapted from the Apache 2.0+MIT licensed implementation originally from:
+//! <https://github.com/privacy-scaling-explorations/halo2curves/pull/83>
+//!
+//! Copyright (c) 2023 Privacy Scaling Explorations Team
+
+use core::{
+    cmp::PartialEq,
+    ops::{Add, Mul, Neg, Sub},
+};
+
+/// Big signed (B * L)-bit integer type, whose variables store
+/// numbers in the two's complement code as arrays of B-bit chunks.
+/// The ordering of the chunks in these arrays is little-endian.
+/// The arithmetic operations for this type are wrapping ones.
+#[derive(Clone, Debug)]
+struct CInt<const B: usize, const L: usize>(pub [u64; L]);
+
+impl<const B: usize, const L: usize> CInt<B, L> {
+    /// Mask, in which the B lowest bits are 1 and only they
+    pub const MASK: u64 = u64::MAX >> (64 - B);
+
+    /// Representation of -1
+    pub const MINUS_ONE: Self = Self([Self::MASK; L]);
+
+    /// Representation of 0
+    pub const ZERO: Self = Self([0; L]);
+
+    /// Representation of 1
+    pub const ONE: Self = {
+        let mut data = [0; L];
+        data[0] = 1;
+        Self(data)
+    };
+
+    /// Returns the result of applying B-bit right
+    /// arithmetical shift to the current number
+    pub fn shift(&self) -> Self {
+        let mut data = [0; L];
+        if self.is_negative() {
+            data[L - 1] = Self::MASK;
+        }
+        data[..L - 1].copy_from_slice(&self.0[1..]);
+        Self(data)
+    }
+
+    /// Returns the lowest B bits of the current number
+    pub fn lowest(&self) -> u64 {
+        self.0[0]
+    }
+
+    /// Returns "true" iff the current number is negative
+    pub fn is_negative(&self) -> bool {
+        self.0[L - 1] > (Self::MASK >> 1)
+    }
+}
+
+impl<const B: usize, const L: usize> PartialEq for CInt<B, L> {
+    fn eq(&self, other: &Self) -> bool {
+        self.0 == other.0
+    }
+}
+
+impl<const B: usize, const L: usize> Add for &CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn add(self, other: Self) -> Self::Output {
+        let (mut data, mut carry) = ([0; L], 0);
+        for i in 0..L {
+            let sum = self.0[i] + other.0[i] + carry;
+            data[i] = sum & CInt::<B, L>::MASK;
+            carry = sum >> B;
+        }
+        Self::Output { 0: data }
+    }
+}
+
+impl<const B: usize, const L: usize> Add<&CInt<B, L>> for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn add(self, other: &Self) -> Self::Output {
+        &self + other
+    }
+}
+
+impl<const B: usize, const L: usize> Add for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn add(self, other: Self) -> Self::Output {
+        &self + &other
+    }
+}
+
+impl<const B: usize, const L: usize> Sub for &CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn sub(self, other: Self) -> Self::Output {
+        // For the two's complement code the additive negation is the result of
+        // adding 1 to the bitwise inverted argument's representation. Thus, for
+        // any encoded integers x and y we have x - y = x + !y + 1, where "!" is
+        // the bitwise inversion and addition is done according to the rules of
+        // the code. The algorithm below uses this formula and is the modified
+        // addition algorithm, where the carry flag is initialized with 1 and
+        // the chunks of the second argument are bitwise inverted
+        let (mut data, mut carry) = ([0; L], 1);
+        for i in 0..L {
+            let sum = self.0[i] + (other.0[i] ^ CInt::<B, L>::MASK) + carry;
+            data[i] = sum & CInt::<B, L>::MASK;
+            carry = sum >> B;
+        }
+        Self::Output { 0: data }
+    }
+}
+
+impl<const B: usize, const L: usize> Sub<&CInt<B, L>> for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn sub(self, other: &Self) -> Self::Output {
+        &self - other
+    }
+}
+
+impl<const B: usize, const L: usize> Sub for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn sub(self, other: Self) -> Self::Output {
+        &self - &other
+    }
+}
+
+impl<const B: usize, const L: usize> Neg for &CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn neg(self) -> Self::Output {
+        // For the two's complement code the additive negation is the result
+        // of adding 1 to the bitwise inverted argument's representation
+        let (mut data, mut carry) = ([0; L], 1);
+        for i in 0..L {
+            let sum = (self.0[i] ^ CInt::<B, L>::MASK) + carry;
+            data[i] = sum & CInt::<B, L>::MASK;
+            carry = sum >> B;
+        }
+        Self::Output { 0: data }
+    }
+}
+
+impl<const B: usize, const L: usize> Neg for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn neg(self) -> Self::Output {
+        -&self
+    }
+}
+
+impl<const B: usize, const L: usize> Mul for &CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn mul(self, other: Self) -> Self::Output {
+        let mut data = [0; L];
+        for i in 0..L {
+            let mut carry = 0;
+            for k in 0..(L - i) {
+                let sum = (data[i + k] as u128)
+                    + (carry as u128)
+                    + (self.0[i] as u128) * (other.0[k] as u128);
+                data[i + k] = sum as u64 & CInt::<B, L>::MASK;
+                carry = (sum >> B) as u64;
+            }
+        }
+        Self::Output { 0: data }
+    }
+}
+
+impl<const B: usize, const L: usize> Mul<&CInt<B, L>> for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn mul(self, other: &Self) -> Self::Output {
+        &self * other
+    }
+}
+
+impl<const B: usize, const L: usize> Mul for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn mul(self, other: Self) -> Self::Output {
+        &self * &other
+    }
+}
+
+impl<const B: usize, const L: usize> Mul<i64> for &CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn mul(self, other: i64) -> Self::Output {
+        let mut data = [0; L];
+        // If the short multiplicand is non-negative, the standard multiplication
+        // algorithm is performed. Otherwise, the product of the additively negated
+        // multiplicands is found as follows. Since for the two's complement code
+        // the additive negation is the result of adding 1 to the bitwise inverted
+        // argument's representation, for any encoded integers x and y we have
+        // x * y = (-x) * (-y) = (!x + 1) * (-y) = !x * (-y) + (-y),  where "!" is
+        // the bitwise inversion and arithmetic operations are performed according
+        // to the rules of the code. If the short multiplicand is negative, the
+        // algorithm below uses this formula by substituting the short multiplicand
+        // for y and turns into the modified standard multiplication algorithm,
+        // where the carry flag is initialized with the additively negated short
+        // multiplicand and the chunks of the long multiplicand are bitwise inverted
+        let (other, mut carry, mask) = if other < 0 {
+            (-other, -other as u64, CInt::<B, L>::MASK)
+        } else {
+            (other, 0, 0)
+        };
+        for i in 0..L {
+            let sum = (carry as u128) + ((self.0[i] ^ mask) as u128) * (other as u128);
+            data[i] = sum as u64 & CInt::<B, L>::MASK;
+            carry = (sum >> B) as u64;
+        }
+        Self::Output { 0: data }
+    }
+}
+
+impl<const B: usize, const L: usize> Mul<i64> for CInt<B, L> {
+    type Output = CInt<B, L>;
+    fn mul(self, other: i64) -> Self::Output {
+        &self * other
+    }
+}
+
+impl<const B: usize, const L: usize> Mul<&CInt<B, L>> for i64 {
+    type Output = CInt<B, L>;
+    fn mul(self, other: &CInt<B, L>) -> Self::Output {
+        other * self
+    }
+}
+
+impl<const B: usize, const L: usize> Mul<CInt<B, L>> for i64 {
+    type Output = CInt<B, L>;
+    fn mul(self, other: CInt<B, L>) -> Self::Output {
+        other * self
+    }
+}
+
+/// Type of the modular multiplicative inverter based on the Bernstein-Yang method.
+/// The inverter can be created for a specified modulus M and adjusting parameter A
+/// to compute the adjusted multiplicative inverses of positive integers, i.e. for
+/// computing (1 / x) * A (mod M) for a positive integer x.
+///
+/// The adjusting parameter allows computing the multiplicative inverses in the case of
+/// using the Montgomery representation for the input or the expected output. If R is
+/// the Montgomery factor, the multiplicative inverses in the appropriate representation
+/// can be computed provided that the value of A is chosen as follows:
+/// - A = 1, if both the input and the expected output are in the standard form
+/// - A = R^2 mod M, if both the input and the expected output are in the Montgomery form
+/// - A = R mod M, if either the input or the expected output is in the Montgomery form,
+/// but not both of them
+///
+/// The public methods of this type receive and return unsigned big integers as arrays of
+/// 64-bit chunks, the ordering of which is little-endian. Both the modulus and the integer
+/// to be inverted should not exceed 2 ^ (62 * L - 64)
+///
+/// For better understanding the implementation, the following resources are recommended:
+/// - D. Bernstein, B.-Y. Yang, "Fast constant-time gcd computation and modular inversion",
+/// https://gcd.cr.yp.to/safegcd-20190413.pdf
+/// - P. Wuille, "The safegcd implementation in libsecp256k1 explained",
+/// https://github.com/bitcoin-core/secp256k1/blob/master/doc/safegcd_implementation.md
+#[derive(Debug)]
+pub struct BYInverter<const L: usize> {
+    /// Modulus
+    modulus: CInt<62, L>,
+
+    /// Adjusting parameter
+    adjuster: CInt<62, L>,
+
+    /// Multiplicative inverse of the modulus modulo 2^62
+    inverse: i64,
+}
+
+/// Type of the Bernstein-Yang transition matrix multiplied by 2^62
+type Matrix = [[i64; 2]; 2];
+
+impl<const L: usize> BYInverter<L> {
+    /// Returns the Bernstein-Yang transition matrix multiplied by 2^62 and the new value
+    /// of the delta variable for the 62 basic steps of the Bernstein-Yang method, which
+    /// are to be performed sequentially for specified initial values of f, g and delta
+    fn jump(f: &CInt<62, L>, g: &CInt<62, L>, mut delta: i64) -> (i64, Matrix) {
+        let (mut steps, mut f, mut g) = (62, f.lowest() as i64, g.lowest() as i128);
+        let mut t: Matrix = [[1, 0], [0, 1]];
+
+        loop {
+            let zeros = steps.min(g.trailing_zeros() as i64);
+            (steps, delta, g) = (steps - zeros, delta + zeros, g >> zeros);
+            t[0] = [t[0][0] << zeros, t[0][1] << zeros];
+
+            if steps == 0 {
+                break;
+            }
+            if delta > 0 {
+                (delta, f, g) = (-delta, g as i64, -f as i128);
+                (t[0], t[1]) = (t[1], [-t[0][0], -t[0][1]]);
+            }
+
+            // The formula (3 * x) xor 28 = -1 / x (mod 32) for an odd integer x
+            // in the two's complement code has been derived from the formula
+            // (3 * x) xor 2 = 1 / x (mod 32) attributed to Peter Montgomery
+            let mask = (1 << steps.min(1 - delta).min(5)) - 1;
+            let w = (g as i64).wrapping_mul(f.wrapping_mul(3) ^ 28) & mask;
+
+            t[1] = [t[0][0] * w + t[1][0], t[0][1] * w + t[1][1]];
+            g += w as i128 * f as i128;
+        }
+
+        (delta, t)
+    }
+
+    /// Returns the updated values of the variables f and g for specified initial ones and Bernstein-Yang transition
+    /// matrix multiplied by 2^62. The returned vector is "matrix * (f, g)' / 2^62", where "'" is the transpose operator
+    fn fg(f: CInt<62, L>, g: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
+        (
+            (t[0][0] * &f + t[0][1] * &g).shift(),
+            (t[1][0] * &f + t[1][1] * &g).shift(),
+        )
+    }
+
+    /// Returns the updated values of the variables d and e for specified initial ones and Bernstein-Yang transition
+    /// matrix multiplied by 2^62. The returned vector is congruent modulo M to "matrix * (d, e)' / 2^62 (mod M)",
+    /// where M is the modulus the inverter was created for and "'" stands for the transpose operator. Both the input
+    /// and output values lie in the interval (-2 * M, M)
+    fn de(&self, d: CInt<62, L>, e: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
+        let mask = CInt::<62, L>::MASK as i64;
+        let mut md = t[0][0] * d.is_negative() as i64 + t[0][1] * e.is_negative() as i64;
+        let mut me = t[1][0] * d.is_negative() as i64 + t[1][1] * e.is_negative() as i64;
+
+        let cd = t[0][0]
+            .wrapping_mul(d.lowest() as i64)
+            .wrapping_add(t[0][1].wrapping_mul(e.lowest() as i64))
+            & mask;
+        let ce = t[1][0]
+            .wrapping_mul(d.lowest() as i64)
+            .wrapping_add(t[1][1].wrapping_mul(e.lowest() as i64))
+            & mask;
+
+        md -= (self.inverse.wrapping_mul(cd).wrapping_add(md)) & mask;
+        me -= (self.inverse.wrapping_mul(ce).wrapping_add(me)) & mask;
+
+        let cd = t[0][0] * &d + t[0][1] * &e + md * &self.modulus;
+        let ce = t[1][0] * &d + t[1][1] * &e + me * &self.modulus;
+
+        (cd.shift(), ce.shift())
+    }
+
+    /// Returns either "value (mod M)" or "-value (mod M)", where M is the modulus the
+    /// inverter was created for, depending on "negate", which determines the presence
+    /// of "-" in the used formula. The input integer lies in the interval (-2 * M, M)
+    fn norm(&self, mut value: CInt<62, L>, negate: bool) -> CInt<62, L> {
+        if value.is_negative() {
+            value = value + &self.modulus;
+        }
+
+        if negate {
+            value = -value;
+        }
+
+        if value.is_negative() {
+            value = value + &self.modulus;
+        }
+
+        value
+    }
+
+    /// Returns a big unsigned integer as an array of O-bit chunks, which is equal modulo
+    /// 2 ^ (O * S) to the input big unsigned integer stored as an array of I-bit chunks.
+    /// The ordering of the chunks in these arrays is little-endian
+    const fn convert<const I: usize, const O: usize, const S: usize>(input: &[u64]) -> [u64; S] {
+        // This function is defined because the method "min" of the usize type is not constant
+        const fn min(a: usize, b: usize) -> usize {
+            if a > b {
+                b
+            } else {
+                a
+            }
+        }
+
+        let (total, mut output, mut bits) = (min(input.len() * I, S * O), [0; S], 0);
+
+        while bits < total {
+            let (i, o) = (bits % I, bits % O);
+            output[bits / O] |= (input[bits / I] >> i) << o;
+            bits += min(I - i, O - o);
+        }
+
+        let mask = u64::MAX >> (64 - O);
+        let mut filled = total / O + if total % O > 0 { 1 } else { 0 };
+
+        while filled > 0 {
+            filled -= 1;
+            output[filled] &= mask;
+        }
+
+        output
+    }
+
+    /// Returns the multiplicative inverse of the argument modulo 2^62. The implementation is based
+    /// on the Hurchalla's method for computing the multiplicative inverse modulo a power of two.
+    /// For better understanding the implementation, the following paper is recommended:
+    /// J. Hurchalla, "An Improved Integer Multiplicative Inverse (modulo 2^w)",
+    /// https://arxiv.org/pdf/2204.04342.pdf
+    const fn inv(value: u64) -> i64 {
+        let x = value.wrapping_mul(3) ^ 2;
+        let y = 1u64.wrapping_sub(x.wrapping_mul(value));
+        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+        (x.wrapping_mul(y.wrapping_add(1)) & CInt::<62, L>::MASK) as i64
+    }
+
+    /// Creates the inverter for specified modulus and adjusting parameter
+    pub const fn new(modulus: &[u64], adjuster: &[u64]) -> Self {
+        Self {
+            modulus: CInt::<62, L>(Self::convert::<64, 62, L>(modulus)),
+            adjuster: CInt::<62, L>(Self::convert::<64, 62, L>(adjuster)),
+            inverse: Self::inv(modulus[0]),
+        }
+    }
+
+    /// Returns either the adjusted modular multiplicative inverse for the argument or None
+    /// depending on invertibility of the argument, i.e. its coprimality with the modulus
+    pub fn invert<const S: usize>(&self, value: &[u64]) -> Option<[u64; S]> {
+        let (mut d, mut e) = (CInt::ZERO, self.adjuster.clone());
+        let mut g = CInt::<62, L>(Self::convert::<64, 62, L>(value));
+        let (mut delta, mut f) = (1, self.modulus.clone());
+        let mut matrix;
+        while g != CInt::ZERO {
+            (delta, matrix) = Self::jump(&f, &g, delta);
+            (f, g) = Self::fg(f, g, matrix);
+            (d, e) = self.de(d, e, matrix);
+        }
+        // At this point the absolute value of "f" equals the greatest common divisor
+        // of the integer to be inverted and the modulus the inverter was created for.
+        // Thus, if "f" is neither 1 nor -1, then the sought inverse does not exist
+        let antiunit = f == CInt::MINUS_ONE;
+        if (f != CInt::ONE) && !antiunit {
+            return None;
+        }
+        Some(Self::convert::<62, 64, S>(&self.norm(d, antiunit).0))
+    }
+}

From d4ee9aea6a78720e2938e913c883cde80b481e82 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Fri, 1 Dec 2023 16:39:28 -0700
Subject: [PATCH 2/6] Rename struct to `BernsteinYangInverter`; add proptests

---
 src/modular.rs                    |   2 +-
 src/modular/bernstein_yang.rs     | 410 +++++++++++++++---------------
 tests/bernstein_yang_proptests.rs |  47 ++++
 3 files changed, 253 insertions(+), 206 deletions(-)
 create mode 100644 tests/bernstein_yang_proptests.rs

diff --git a/src/modular.rs b/src/modular.rs
index 5ae3d92f..eb2854be 100644
--- a/src/modular.rs
+++ b/src/modular.rs
@@ -32,7 +32,7 @@ mod sub;
 pub(crate) mod boxed_residue;
 
 pub use self::{
-    bernstein_yang::BYInverter,
+    bernstein_yang::BernsteinYangInverter,
     dyn_residue::{DynResidue, DynResidueParams},
     reduction::montgomery_reduction,
     residue::{Residue, ResidueParams},
diff --git a/src/modular/bernstein_yang.rs b/src/modular/bernstein_yang.rs
index b2959d52..9851b56b 100644
--- a/src/modular/bernstein_yang.rs
+++ b/src/modular/bernstein_yang.rs
@@ -11,6 +11,211 @@ use core::{
     ops::{Add, Mul, Neg, Sub},
 };
 
+/// Type of the modular multiplicative inverter based on the Bernstein-Yang method.
+/// The inverter can be created for a specified modulus M and adjusting parameter A
+/// to compute the adjusted multiplicative inverses of positive integers, i.e. for
+/// computing (1 / x) * A (mod M) for a positive integer x.
+///
+/// The adjusting parameter allows computing the multiplicative inverses in the case of
+/// using the Montgomery representation for the input or the expected output. If R is
+/// the Montgomery factor, the multiplicative inverses in the appropriate representation
+/// can be computed provided that the value of A is chosen as follows:
+/// - A = 1, if both the input and the expected output are in the standard form
+/// - A = R^2 mod M, if both the input and the expected output are in the Montgomery form
+/// - A = R mod M, if either the input or the expected output is in the Montgomery form,
+/// but not both of them
+///
+/// The public methods of this type receive and return unsigned big integers as arrays of
+/// 64-bit chunks, the ordering of which is little-endian. Both the modulus and the integer
+/// to be inverted should not exceed 2 ^ (62 * L - 64)
+///
+/// For better understanding the implementation, the following resources are recommended:
+/// - D. Bernstein, B.-Y. Yang, "Fast constant-time gcd computation and modular inversion",
+/// https://gcd.cr.yp.to/safegcd-20190413.pdf
+/// - P. Wuille, "The safegcd implementation in libsecp256k1 explained",
+/// https://github.com/bitcoin-core/secp256k1/blob/master/doc/safegcd_implementation.md
+#[derive(Debug)]
+pub struct BernsteinYangInverter<const L: usize> {
+    /// Modulus
+    modulus: CInt<62, L>,
+
+    /// Adjusting parameter
+    adjuster: CInt<62, L>,
+
+    /// Multiplicative inverse of the modulus modulo 2^62
+    inverse: i64,
+}
+
+/// Type of the Bernstein-Yang transition matrix multiplied by 2^62
+type Matrix = [[i64; 2]; 2];
+
+impl<const L: usize> BernsteinYangInverter<L> {
+    /// Creates the inverter for specified modulus and adjusting parameter
+    pub const fn new(modulus: &[u64], adjuster: &[u64]) -> Self {
+        Self {
+            modulus: CInt::<62, L>(Self::convert::<64, 62, L>(modulus)),
+            adjuster: CInt::<62, L>(Self::convert::<64, 62, L>(adjuster)),
+            inverse: Self::inv(modulus[0]),
+        }
+    }
+
+    /// Returns either the adjusted modular multiplicative inverse for the argument or None
+    /// depending on invertibility of the argument, i.e. its coprimality with the modulus
+    pub fn invert<const S: usize>(&self, value: &[u64]) -> Option<[u64; S]> {
+        let (mut d, mut e) = (CInt::ZERO, self.adjuster.clone());
+        let mut g = CInt::<62, L>(Self::convert::<64, 62, L>(value));
+        let (mut delta, mut f) = (1, self.modulus.clone());
+        let mut matrix;
+        while g != CInt::ZERO {
+            (delta, matrix) = Self::jump(&f, &g, delta);
+            (f, g) = Self::fg(f, g, matrix);
+            (d, e) = self.de(d, e, matrix);
+        }
+        // At this point the absolute value of "f" equals the greatest common divisor
+        // of the integer to be inverted and the modulus the inverter was created for.
+        // Thus, if "f" is neither 1 nor -1, then the sought inverse does not exist
+        let antiunit = f == CInt::MINUS_ONE;
+        if (f != CInt::ONE) && !antiunit {
+            return None;
+        }
+        Some(Self::convert::<62, 64, S>(&self.norm(d, antiunit).0))
+    }
+
+    /// Returns the Bernstein-Yang transition matrix multiplied by 2^62 and the new value
+    /// of the delta variable for the 62 basic steps of the Bernstein-Yang method, which
+    /// are to be performed sequentially for specified initial values of f, g and delta
+    fn jump(f: &CInt<62, L>, g: &CInt<62, L>, mut delta: i64) -> (i64, Matrix) {
+        let (mut steps, mut f, mut g) = (62, f.lowest() as i64, g.lowest() as i128);
+        let mut t: Matrix = [[1, 0], [0, 1]];
+
+        loop {
+            let zeros = steps.min(g.trailing_zeros() as i64);
+            (steps, delta, g) = (steps - zeros, delta + zeros, g >> zeros);
+            t[0] = [t[0][0] << zeros, t[0][1] << zeros];
+
+            if steps == 0 {
+                break;
+            }
+            if delta > 0 {
+                (delta, f, g) = (-delta, g as i64, -f as i128);
+                (t[0], t[1]) = (t[1], [-t[0][0], -t[0][1]]);
+            }
+
+            // The formula (3 * x) xor 28 = -1 / x (mod 32) for an odd integer x
+            // in the two's complement code has been derived from the formula
+            // (3 * x) xor 2 = 1 / x (mod 32) attributed to Peter Montgomery
+            let mask = (1 << steps.min(1 - delta).min(5)) - 1;
+            let w = (g as i64).wrapping_mul(f.wrapping_mul(3) ^ 28) & mask;
+
+            t[1] = [t[0][0] * w + t[1][0], t[0][1] * w + t[1][1]];
+            g += w as i128 * f as i128;
+        }
+
+        (delta, t)
+    }
+
+    /// Returns the updated values of the variables f and g for specified initial ones and Bernstein-Yang transition
+    /// matrix multiplied by 2^62. The returned vector is "matrix * (f, g)' / 2^62", where "'" is the transpose operator
+    fn fg(f: CInt<62, L>, g: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
+        (
+            (t[0][0] * &f + t[0][1] * &g).shift(),
+            (t[1][0] * &f + t[1][1] * &g).shift(),
+        )
+    }
+
+    /// Returns the updated values of the variables d and e for specified initial ones and Bernstein-Yang transition
+    /// matrix multiplied by 2^62. The returned vector is congruent modulo M to "matrix * (d, e)' / 2^62 (mod M)",
+    /// where M is the modulus the inverter was created for and "'" stands for the transpose operator. Both the input
+    /// and output values lie in the interval (-2 * M, M)
+    fn de(&self, d: CInt<62, L>, e: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
+        let mask = CInt::<62, L>::MASK as i64;
+        let mut md = t[0][0] * d.is_negative() as i64 + t[0][1] * e.is_negative() as i64;
+        let mut me = t[1][0] * d.is_negative() as i64 + t[1][1] * e.is_negative() as i64;
+
+        let cd = t[0][0]
+            .wrapping_mul(d.lowest() as i64)
+            .wrapping_add(t[0][1].wrapping_mul(e.lowest() as i64))
+            & mask;
+        let ce = t[1][0]
+            .wrapping_mul(d.lowest() as i64)
+            .wrapping_add(t[1][1].wrapping_mul(e.lowest() as i64))
+            & mask;
+
+        md -= (self.inverse.wrapping_mul(cd).wrapping_add(md)) & mask;
+        me -= (self.inverse.wrapping_mul(ce).wrapping_add(me)) & mask;
+
+        let cd = t[0][0] * &d + t[0][1] * &e + md * &self.modulus;
+        let ce = t[1][0] * &d + t[1][1] * &e + me * &self.modulus;
+
+        (cd.shift(), ce.shift())
+    }
+
+    /// Returns either "value (mod M)" or "-value (mod M)", where M is the modulus the
+    /// inverter was created for, depending on "negate", which determines the presence
+    /// of "-" in the used formula. The input integer lies in the interval (-2 * M, M)
+    fn norm(&self, mut value: CInt<62, L>, negate: bool) -> CInt<62, L> {
+        if value.is_negative() {
+            value = value + &self.modulus;
+        }
+
+        if negate {
+            value = -value;
+        }
+
+        if value.is_negative() {
+            value = value + &self.modulus;
+        }
+
+        value
+    }
+
+    /// Returns a big unsigned integer as an array of O-bit chunks, which is equal modulo
+    /// 2 ^ (O * S) to the input big unsigned integer stored as an array of I-bit chunks.
+    /// The ordering of the chunks in these arrays is little-endian
+    const fn convert<const I: usize, const O: usize, const S: usize>(input: &[u64]) -> [u64; S] {
+        // This function is defined because the method "min" of the usize type is not constant
+        const fn min(a: usize, b: usize) -> usize {
+            if a > b {
+                b
+            } else {
+                a
+            }
+        }
+
+        let (total, mut output, mut bits) = (min(input.len() * I, S * O), [0; S], 0);
+
+        while bits < total {
+            let (i, o) = (bits % I, bits % O);
+            output[bits / O] |= (input[bits / I] >> i) << o;
+            bits += min(I - i, O - o);
+        }
+
+        let mask = u64::MAX >> (64 - O);
+        let mut filled = total / O + if total % O > 0 { 1 } else { 0 };
+
+        while filled > 0 {
+            filled -= 1;
+            output[filled] &= mask;
+        }
+
+        output
+    }
+
+    /// Returns the multiplicative inverse of the argument modulo 2^62. The implementation is based
+    /// on the Hurchalla's method for computing the multiplicative inverse modulo a power of two.
+    /// For better understanding the implementation, the following paper is recommended:
+    /// J. Hurchalla, "An Improved Integer Multiplicative Inverse (modulo 2^w)",
+    /// https://arxiv.org/pdf/2204.04342.pdf
+    const fn inv(value: u64) -> i64 {
+        let x = value.wrapping_mul(3) ^ 2;
+        let y = 1u64.wrapping_sub(x.wrapping_mul(value));
+        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+        (x.wrapping_mul(y.wrapping_add(1)) & CInt::<62, L>::MASK) as i64
+    }
+}
+
 /// Big signed (B * L)-bit integer type, whose variables store
 /// numbers in the two's complement code as arrays of B-bit chunks.
 /// The ordering of the chunks in these arrays is little-endian.
@@ -228,208 +433,3 @@ impl<const B: usize, const L: usize> Mul<CInt<B, L>> for i64 {
         other * self
     }
 }
-
-/// Type of the modular multiplicative inverter based on the Bernstein-Yang method.
-/// The inverter can be created for a specified modulus M and adjusting parameter A
-/// to compute the adjusted multiplicative inverses of positive integers, i.e. for
-/// computing (1 / x) * A (mod M) for a positive integer x.
-///
-/// The adjusting parameter allows computing the multiplicative inverses in the case of
-/// using the Montgomery representation for the input or the expected output. If R is
-/// the Montgomery factor, the multiplicative inverses in the appropriate representation
-/// can be computed provided that the value of A is chosen as follows:
-/// - A = 1, if both the input and the expected output are in the standard form
-/// - A = R^2 mod M, if both the input and the expected output are in the Montgomery form
-/// - A = R mod M, if either the input or the expected output is in the Montgomery form,
-/// but not both of them
-///
-/// The public methods of this type receive and return unsigned big integers as arrays of
-/// 64-bit chunks, the ordering of which is little-endian. Both the modulus and the integer
-/// to be inverted should not exceed 2 ^ (62 * L - 64)
-///
-/// For better understanding the implementation, the following resources are recommended:
-/// - D. Bernstein, B.-Y. Yang, "Fast constant-time gcd computation and modular inversion",
-/// https://gcd.cr.yp.to/safegcd-20190413.pdf
-/// - P. Wuille, "The safegcd implementation in libsecp256k1 explained",
-/// https://github.com/bitcoin-core/secp256k1/blob/master/doc/safegcd_implementation.md
-#[derive(Debug)]
-pub struct BYInverter<const L: usize> {
-    /// Modulus
-    modulus: CInt<62, L>,
-
-    /// Adjusting parameter
-    adjuster: CInt<62, L>,
-
-    /// Multiplicative inverse of the modulus modulo 2^62
-    inverse: i64,
-}
-
-/// Type of the Bernstein-Yang transition matrix multiplied by 2^62
-type Matrix = [[i64; 2]; 2];
-
-impl<const L: usize> BYInverter<L> {
-    /// Returns the Bernstein-Yang transition matrix multiplied by 2^62 and the new value
-    /// of the delta variable for the 62 basic steps of the Bernstein-Yang method, which
-    /// are to be performed sequentially for specified initial values of f, g and delta
-    fn jump(f: &CInt<62, L>, g: &CInt<62, L>, mut delta: i64) -> (i64, Matrix) {
-        let (mut steps, mut f, mut g) = (62, f.lowest() as i64, g.lowest() as i128);
-        let mut t: Matrix = [[1, 0], [0, 1]];
-
-        loop {
-            let zeros = steps.min(g.trailing_zeros() as i64);
-            (steps, delta, g) = (steps - zeros, delta + zeros, g >> zeros);
-            t[0] = [t[0][0] << zeros, t[0][1] << zeros];
-
-            if steps == 0 {
-                break;
-            }
-            if delta > 0 {
-                (delta, f, g) = (-delta, g as i64, -f as i128);
-                (t[0], t[1]) = (t[1], [-t[0][0], -t[0][1]]);
-            }
-
-            // The formula (3 * x) xor 28 = -1 / x (mod 32) for an odd integer x
-            // in the two's complement code has been derived from the formula
-            // (3 * x) xor 2 = 1 / x (mod 32) attributed to Peter Montgomery
-            let mask = (1 << steps.min(1 - delta).min(5)) - 1;
-            let w = (g as i64).wrapping_mul(f.wrapping_mul(3) ^ 28) & mask;
-
-            t[1] = [t[0][0] * w + t[1][0], t[0][1] * w + t[1][1]];
-            g += w as i128 * f as i128;
-        }
-
-        (delta, t)
-    }
-
-    /// Returns the updated values of the variables f and g for specified initial ones and Bernstein-Yang transition
-    /// matrix multiplied by 2^62. The returned vector is "matrix * (f, g)' / 2^62", where "'" is the transpose operator
-    fn fg(f: CInt<62, L>, g: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
-        (
-            (t[0][0] * &f + t[0][1] * &g).shift(),
-            (t[1][0] * &f + t[1][1] * &g).shift(),
-        )
-    }
-
-    /// Returns the updated values of the variables d and e for specified initial ones and Bernstein-Yang transition
-    /// matrix multiplied by 2^62. The returned vector is congruent modulo M to "matrix * (d, e)' / 2^62 (mod M)",
-    /// where M is the modulus the inverter was created for and "'" stands for the transpose operator. Both the input
-    /// and output values lie in the interval (-2 * M, M)
-    fn de(&self, d: CInt<62, L>, e: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
-        let mask = CInt::<62, L>::MASK as i64;
-        let mut md = t[0][0] * d.is_negative() as i64 + t[0][1] * e.is_negative() as i64;
-        let mut me = t[1][0] * d.is_negative() as i64 + t[1][1] * e.is_negative() as i64;
-
-        let cd = t[0][0]
-            .wrapping_mul(d.lowest() as i64)
-            .wrapping_add(t[0][1].wrapping_mul(e.lowest() as i64))
-            & mask;
-        let ce = t[1][0]
-            .wrapping_mul(d.lowest() as i64)
-            .wrapping_add(t[1][1].wrapping_mul(e.lowest() as i64))
-            & mask;
-
-        md -= (self.inverse.wrapping_mul(cd).wrapping_add(md)) & mask;
-        me -= (self.inverse.wrapping_mul(ce).wrapping_add(me)) & mask;
-
-        let cd = t[0][0] * &d + t[0][1] * &e + md * &self.modulus;
-        let ce = t[1][0] * &d + t[1][1] * &e + me * &self.modulus;
-
-        (cd.shift(), ce.shift())
-    }
-
-    /// Returns either "value (mod M)" or "-value (mod M)", where M is the modulus the
-    /// inverter was created for, depending on "negate", which determines the presence
-    /// of "-" in the used formula. The input integer lies in the interval (-2 * M, M)
-    fn norm(&self, mut value: CInt<62, L>, negate: bool) -> CInt<62, L> {
-        if value.is_negative() {
-            value = value + &self.modulus;
-        }
-
-        if negate {
-            value = -value;
-        }
-
-        if value.is_negative() {
-            value = value + &self.modulus;
-        }
-
-        value
-    }
-
-    /// Returns a big unsigned integer as an array of O-bit chunks, which is equal modulo
-    /// 2 ^ (O * S) to the input big unsigned integer stored as an array of I-bit chunks.
-    /// The ordering of the chunks in these arrays is little-endian
-    const fn convert<const I: usize, const O: usize, const S: usize>(input: &[u64]) -> [u64; S] {
-        // This function is defined because the method "min" of the usize type is not constant
-        const fn min(a: usize, b: usize) -> usize {
-            if a > b {
-                b
-            } else {
-                a
-            }
-        }
-
-        let (total, mut output, mut bits) = (min(input.len() * I, S * O), [0; S], 0);
-
-        while bits < total {
-            let (i, o) = (bits % I, bits % O);
-            output[bits / O] |= (input[bits / I] >> i) << o;
-            bits += min(I - i, O - o);
-        }
-
-        let mask = u64::MAX >> (64 - O);
-        let mut filled = total / O + if total % O > 0 { 1 } else { 0 };
-
-        while filled > 0 {
-            filled -= 1;
-            output[filled] &= mask;
-        }
-
-        output
-    }
-
-    /// Returns the multiplicative inverse of the argument modulo 2^62. The implementation is based
-    /// on the Hurchalla's method for computing the multiplicative inverse modulo a power of two.
-    /// For better understanding the implementation, the following paper is recommended:
-    /// J. Hurchalla, "An Improved Integer Multiplicative Inverse (modulo 2^w)",
-    /// https://arxiv.org/pdf/2204.04342.pdf
-    const fn inv(value: u64) -> i64 {
-        let x = value.wrapping_mul(3) ^ 2;
-        let y = 1u64.wrapping_sub(x.wrapping_mul(value));
-        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
-        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
-        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
-        (x.wrapping_mul(y.wrapping_add(1)) & CInt::<62, L>::MASK) as i64
-    }
-
-    /// Creates the inverter for specified modulus and adjusting parameter
-    pub const fn new(modulus: &[u64], adjuster: &[u64]) -> Self {
-        Self {
-            modulus: CInt::<62, L>(Self::convert::<64, 62, L>(modulus)),
-            adjuster: CInt::<62, L>(Self::convert::<64, 62, L>(adjuster)),
-            inverse: Self::inv(modulus[0]),
-        }
-    }
-
-    /// Returns either the adjusted modular multiplicative inverse for the argument or None
-    /// depending on invertibility of the argument, i.e. its coprimality with the modulus
-    pub fn invert<const S: usize>(&self, value: &[u64]) -> Option<[u64; S]> {
-        let (mut d, mut e) = (CInt::ZERO, self.adjuster.clone());
-        let mut g = CInt::<62, L>(Self::convert::<64, 62, L>(value));
-        let (mut delta, mut f) = (1, self.modulus.clone());
-        let mut matrix;
-        while g != CInt::ZERO {
-            (delta, matrix) = Self::jump(&f, &g, delta);
-            (f, g) = Self::fg(f, g, matrix);
-            (d, e) = self.de(d, e, matrix);
-        }
-        // At this point the absolute value of "f" equals the greatest common divisor
-        // of the integer to be inverted and the modulus the inverter was created for.
-        // Thus, if "f" is neither 1 nor -1, then the sought inverse does not exist
-        let antiunit = f == CInt::MINUS_ONE;
-        if (f != CInt::ONE) && !antiunit {
-            return None;
-        }
-        Some(Self::convert::<62, 64, S>(&self.norm(d, antiunit).0))
-    }
-}
diff --git a/tests/bernstein_yang_proptests.rs b/tests/bernstein_yang_proptests.rs
new file mode 100644
index 00000000..341e4069
--- /dev/null
+++ b/tests/bernstein_yang_proptests.rs
@@ -0,0 +1,47 @@
+//! Equivalence tests for Bernstein-Yang inversions.
+
+use crypto_bigint::{modular::BernsteinYangInverter, Encoding, U256};
+use num_bigint::BigUint;
+use num_integer::Integer;
+use num_traits::One;
+use proptest::prelude::*;
+
+/// Example prime number (NIST P-256 curve order)
+const P: U256 =
+    U256::from_be_hex("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551");
+
+fn to_biguint(uint: &U256) -> BigUint {
+    BigUint::from_bytes_le(uint.to_le_bytes().as_ref())
+}
+
+prop_compose! {
+    fn uint()(bytes in any::<[u8; 32]>()) -> U256 {
+        U256::from_le_slice(&bytes)
+    }
+}
+prop_compose! {
+    fn uint_mod_p(p: U256)(a in uint()) -> U256 {
+        a.wrapping_rem(&p)
+    }
+}
+
+proptest! {
+    #[test]
+    fn inv_mod(x in uint()) {
+        let x_bi = to_biguint(&x);
+        let p_bi = to_biguint(&P);
+
+        let expected_is_some = x_bi.gcd(&p_bi) == BigUint::one();
+
+        let inverter = BernsteinYangInverter::<6>::new(P.as_words(), &[1]);
+        let actual = inverter.invert::<{U256::LIMBS}>(x.as_words());
+
+        prop_assert_eq!(bool::from(expected_is_some), actual.is_some());
+
+        if let Some(actual) = actual {
+            let inv_bi = to_biguint(&U256::from(actual));
+            let res = (inv_bi * x_bi) % p_bi;
+            prop_assert_eq!(res, BigUint::one());
+        }
+    }
+}

From b5ba6dc4e2245d88709dff6e8e40bf117348df05 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Fri, 1 Dec 2023 17:28:57 -0700
Subject: [PATCH 3/6] clippy

---
 src/modular/bernstein_yang.rs | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/modular/bernstein_yang.rs b/src/modular/bernstein_yang.rs
index 9851b56b..8b6fc8f0 100644
--- a/src/modular/bernstein_yang.rs
+++ b/src/modular/bernstein_yang.rs
@@ -6,6 +6,8 @@
 //!
 //! Copyright (c) 2023 Privacy Scaling Explorations Team
 
+#![allow(clippy::needless_range_loop)]
+
 use core::{
     cmp::PartialEq,
     ops::{Add, Mul, Neg, Sub},
@@ -277,7 +279,7 @@ impl<const B: usize, const L: usize> Add for &CInt<B, L> {
             data[i] = sum & CInt::<B, L>::MASK;
             carry = sum >> B;
         }
-        Self::Output { 0: data }
+        CInt(data)
     }
 }
 
@@ -311,7 +313,7 @@ impl<const B: usize, const L: usize> Sub for &CInt<B, L> {
             data[i] = sum & CInt::<B, L>::MASK;
             carry = sum >> B;
         }
-        Self::Output { 0: data }
+        CInt(data)
     }
 }
 
@@ -340,7 +342,7 @@ impl<const B: usize, const L: usize> Neg for &CInt<B, L> {
             data[i] = sum & CInt::<B, L>::MASK;
             carry = sum >> B;
         }
-        Self::Output { 0: data }
+        CInt(data)
     }
 }
 
@@ -365,7 +367,7 @@ impl<const B: usize, const L: usize> Mul for &CInt<B, L> {
                 carry = (sum >> B) as u64;
             }
         }
-        Self::Output { 0: data }
+        CInt(data)
     }
 }
 
@@ -409,7 +411,7 @@ impl<const B: usize, const L: usize> Mul<i64> for &CInt<B, L> {
             data[i] = sum as u64 & CInt::<B, L>::MASK;
             carry = (sum >> B) as u64;
         }
-        Self::Output { 0: data }
+        CInt(data)
     }
 }
 

From f0cbca48650d3e3b60720f5c3700a08e6d7ca491 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Fri, 1 Dec 2023 17:30:32 -0700
Subject: [PATCH 4/6] Fix rustdoc

---
 src/modular/bernstein_yang.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/modular/bernstein_yang.rs b/src/modular/bernstein_yang.rs
index 8b6fc8f0..cd082365 100644
--- a/src/modular/bernstein_yang.rs
+++ b/src/modular/bernstein_yang.rs
@@ -33,9 +33,9 @@ use core::{
 ///
 /// For better understanding the implementation, the following resources are recommended:
 /// - D. Bernstein, B.-Y. Yang, "Fast constant-time gcd computation and modular inversion",
-/// https://gcd.cr.yp.to/safegcd-20190413.pdf
+/// <https://gcd.cr.yp.to/safegcd-20190413.pdf>
 /// - P. Wuille, "The safegcd implementation in libsecp256k1 explained",
-/// https://github.com/bitcoin-core/secp256k1/blob/master/doc/safegcd_implementation.md
+/// <https://github.com/bitcoin-core/secp256k1/blob/master/doc/safegcd_implementation.md>
 #[derive(Debug)]
 pub struct BernsteinYangInverter<const L: usize> {
     /// Modulus

From b78e96668a12bee6c8d90df49b6a677f27547225 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Sun, 3 Dec 2023 09:34:11 -0700
Subject: [PATCH 5/6] Make `BernsteinYangInverter::invert` into a `const fn`

---
 src/modular/bernstein_yang.rs | 238 ++++++++++++----------------------
 1 file changed, 80 insertions(+), 158 deletions(-)

diff --git a/src/modular/bernstein_yang.rs b/src/modular/bernstein_yang.rs
index cd082365..e0f2034b 100644
--- a/src/modular/bernstein_yang.rs
+++ b/src/modular/bernstein_yang.rs
@@ -8,11 +8,6 @@
 
 #![allow(clippy::needless_range_loop)]
 
-use core::{
-    cmp::PartialEq,
-    ops::{Add, Mul, Neg, Sub},
-};
-
 /// Type of the modular multiplicative inverter based on the Bernstein-Yang method.
 /// The inverter can be created for a specified modulus M and adjusting parameter A
 /// to compute the adjusted multiplicative inverses of positive integers, i.e. for
@@ -63,12 +58,13 @@ impl<const L: usize> BernsteinYangInverter<L> {
 
     /// Returns either the adjusted modular multiplicative inverse for the argument or None
     /// depending on invertibility of the argument, i.e. its coprimality with the modulus
-    pub fn invert<const S: usize>(&self, value: &[u64]) -> Option<[u64; S]> {
-        let (mut d, mut e) = (CInt::ZERO, self.adjuster.clone());
+    pub const fn invert<const S: usize>(&self, value: &[u64]) -> Option<[u64; S]> {
+        let (mut d, mut e) = (CInt::ZERO, self.adjuster);
         let mut g = CInt::<62, L>(Self::convert::<64, 62, L>(value));
-        let (mut delta, mut f) = (1, self.modulus.clone());
+        let (mut delta, mut f) = (1, self.modulus);
         let mut matrix;
-        while g != CInt::ZERO {
+
+        while !g.eq(&CInt::ZERO) {
             (delta, matrix) = Self::jump(&f, &g, delta);
             (f, g) = Self::fg(f, g, matrix);
             (d, e) = self.de(d, e, matrix);
@@ -76,8 +72,8 @@ impl<const L: usize> BernsteinYangInverter<L> {
         // At this point the absolute value of "f" equals the greatest common divisor
         // of the integer to be inverted and the modulus the inverter was created for.
         // Thus, if "f" is neither 1 nor -1, then the sought inverse does not exist
-        let antiunit = f == CInt::MINUS_ONE;
-        if (f != CInt::ONE) && !antiunit {
+        let antiunit = f.eq(&CInt::MINUS_ONE);
+        if !f.eq(&CInt::ONE) && !antiunit {
             return None;
         }
         Some(Self::convert::<62, 64, S>(&self.norm(d, antiunit).0))
@@ -86,12 +82,21 @@ impl<const L: usize> BernsteinYangInverter<L> {
     /// Returns the Bernstein-Yang transition matrix multiplied by 2^62 and the new value
     /// of the delta variable for the 62 basic steps of the Bernstein-Yang method, which
     /// are to be performed sequentially for specified initial values of f, g and delta
-    fn jump(f: &CInt<62, L>, g: &CInt<62, L>, mut delta: i64) -> (i64, Matrix) {
+    const fn jump(f: &CInt<62, L>, g: &CInt<62, L>, mut delta: i64) -> (i64, Matrix) {
+        // This function is defined because the method "min" of the i64 type is not constant
+        const fn min(a: i64, b: i64) -> i64 {
+            if a > b {
+                b
+            } else {
+                a
+            }
+        }
+
         let (mut steps, mut f, mut g) = (62, f.lowest() as i64, g.lowest() as i128);
         let mut t: Matrix = [[1, 0], [0, 1]];
 
         loop {
-            let zeros = steps.min(g.trailing_zeros() as i64);
+            let zeros = min(steps, g.trailing_zeros() as i64);
             (steps, delta, g) = (steps - zeros, delta + zeros, g >> zeros);
             t[0] = [t[0][0] << zeros, t[0][1] << zeros];
 
@@ -106,7 +111,7 @@ impl<const L: usize> BernsteinYangInverter<L> {
             // The formula (3 * x) xor 28 = -1 / x (mod 32) for an odd integer x
             // in the two's complement code has been derived from the formula
             // (3 * x) xor 2 = 1 / x (mod 32) attributed to Peter Montgomery
-            let mask = (1 << steps.min(1 - delta).min(5)) - 1;
+            let mask = (1 << min(min(steps, 1 - delta), 5)) - 1;
             let w = (g as i64).wrapping_mul(f.wrapping_mul(3) ^ 28) & mask;
 
             t[1] = [t[0][0] * w + t[1][0], t[0][1] * w + t[1][1]];
@@ -118,10 +123,10 @@ impl<const L: usize> BernsteinYangInverter<L> {
 
     /// Returns the updated values of the variables f and g for specified initial ones and Bernstein-Yang transition
     /// matrix multiplied by 2^62. The returned vector is "matrix * (f, g)' / 2^62", where "'" is the transpose operator
-    fn fg(f: CInt<62, L>, g: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
+    const fn fg(f: CInt<62, L>, g: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
         (
-            (t[0][0] * &f + t[0][1] * &g).shift(),
-            (t[1][0] * &f + t[1][1] * &g).shift(),
+            f.mul(t[0][0]).add(&g.mul(t[0][1])).shift(),
+            f.mul(t[1][0]).add(&g.mul(t[1][1])).shift(),
         )
     }
 
@@ -129,7 +134,7 @@ impl<const L: usize> BernsteinYangInverter<L> {
     /// matrix multiplied by 2^62. The returned vector is congruent modulo M to "matrix * (d, e)' / 2^62 (mod M)",
     /// where M is the modulus the inverter was created for and "'" stands for the transpose operator. Both the input
     /// and output values lie in the interval (-2 * M, M)
-    fn de(&self, d: CInt<62, L>, e: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
+    const fn de(&self, d: CInt<62, L>, e: CInt<62, L>, t: Matrix) -> (CInt<62, L>, CInt<62, L>) {
         let mask = CInt::<62, L>::MASK as i64;
         let mut md = t[0][0] * d.is_negative() as i64 + t[0][1] * e.is_negative() as i64;
         let mut me = t[1][0] * d.is_negative() as i64 + t[1][1] * e.is_negative() as i64;
@@ -146,8 +151,14 @@ impl<const L: usize> BernsteinYangInverter<L> {
         md -= (self.inverse.wrapping_mul(cd).wrapping_add(md)) & mask;
         me -= (self.inverse.wrapping_mul(ce).wrapping_add(me)) & mask;
 
-        let cd = t[0][0] * &d + t[0][1] * &e + md * &self.modulus;
-        let ce = t[1][0] * &d + t[1][1] * &e + me * &self.modulus;
+        let cd = d
+            .mul(t[0][0])
+            .add(&e.mul(t[0][1]))
+            .add(&self.modulus.mul(md));
+        let ce = d
+            .mul(t[1][0])
+            .add(&e.mul(t[1][1]))
+            .add(&self.modulus.mul(me));
 
         (cd.shift(), ce.shift())
     }
@@ -155,17 +166,17 @@ impl<const L: usize> BernsteinYangInverter<L> {
     /// Returns either "value (mod M)" or "-value (mod M)", where M is the modulus the
     /// inverter was created for, depending on "negate", which determines the presence
     /// of "-" in the used formula. The input integer lies in the interval (-2 * M, M)
-    fn norm(&self, mut value: CInt<62, L>, negate: bool) -> CInt<62, L> {
+    const fn norm(&self, mut value: CInt<62, L>, negate: bool) -> CInt<62, L> {
         if value.is_negative() {
-            value = value + &self.modulus;
+            value = value.add(&self.modulus);
         }
 
         if negate {
-            value = -value;
+            value = value.neg();
         }
 
         if value.is_negative() {
-            value = value + &self.modulus;
+            value = value.add(&self.modulus);
         }
 
         value
@@ -222,7 +233,7 @@ impl<const L: usize> BernsteinYangInverter<L> {
 /// numbers in the two's complement code as arrays of B-bit chunks.
 /// The ordering of the chunks in these arrays is little-endian.
 /// The arithmetic operations for this type are wrapping ones.
-#[derive(Clone, Debug)]
+#[derive(Clone, Copy, Debug)]
 struct CInt<const B: usize, const L: usize>(pub [u64; L]);
 
 impl<const B: usize, const L: usize> CInt<B, L> {
@@ -244,150 +255,78 @@ impl<const B: usize, const L: usize> CInt<B, L> {
 
     /// Returns the result of applying B-bit right
     /// arithmetical shift to the current number
-    pub fn shift(&self) -> Self {
+    pub const fn shift(&self) -> Self {
         let mut data = [0; L];
         if self.is_negative() {
             data[L - 1] = Self::MASK;
         }
-        data[..L - 1].copy_from_slice(&self.0[1..]);
+
+        let mut i = 0;
+        while i < L - 1 {
+            data[i] = self.0[i + 1];
+            i += 1;
+        }
+
         Self(data)
     }
 
     /// Returns the lowest B bits of the current number
-    pub fn lowest(&self) -> u64 {
+    pub const fn lowest(&self) -> u64 {
         self.0[0]
     }
 
     /// Returns "true" iff the current number is negative
-    pub fn is_negative(&self) -> bool {
+    pub const fn is_negative(&self) -> bool {
         self.0[L - 1] > (Self::MASK >> 1)
     }
-}
 
-impl<const B: usize, const L: usize> PartialEq for CInt<B, L> {
-    fn eq(&self, other: &Self) -> bool {
-        self.0 == other.0
-    }
-}
+    /// Const fn equivalent for `PartialEq::eq`
+    pub const fn eq(&self, other: &Self) -> bool {
+        let mut ret = true;
+        let mut i = 0;
 
-impl<const B: usize, const L: usize> Add for &CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn add(self, other: Self) -> Self::Output {
-        let (mut data, mut carry) = ([0; L], 0);
-        for i in 0..L {
-            let sum = self.0[i] + other.0[i] + carry;
-            data[i] = sum & CInt::<B, L>::MASK;
-            carry = sum >> B;
+        while i < L {
+            ret &= self.0[i] == other.0[i];
+            i += 1;
         }
-        CInt(data)
-    }
-}
 
-impl<const B: usize, const L: usize> Add<&CInt<B, L>> for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn add(self, other: &Self) -> Self::Output {
-        &self + other
+        ret
     }
-}
 
-impl<const B: usize, const L: usize> Add for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn add(self, other: Self) -> Self::Output {
-        &self + &other
-    }
-}
+    /// Const fn equivalent for `Add::add`
+    pub const fn add(&self, other: &Self) -> Self {
+        let (mut data, mut carry) = ([0; L], 0);
+        let mut i = 0;
 
-impl<const B: usize, const L: usize> Sub for &CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn sub(self, other: Self) -> Self::Output {
-        // For the two's complement code the additive negation is the result of
-        // adding 1 to the bitwise inverted argument's representation. Thus, for
-        // any encoded integers x and y we have x - y = x + !y + 1, where "!" is
-        // the bitwise inversion and addition is done according to the rules of
-        // the code. The algorithm below uses this formula and is the modified
-        // addition algorithm, where the carry flag is initialized with 1 and
-        // the chunks of the second argument are bitwise inverted
-        let (mut data, mut carry) = ([0; L], 1);
-        for i in 0..L {
-            let sum = self.0[i] + (other.0[i] ^ CInt::<B, L>::MASK) + carry;
+        while i < L {
+            let sum = self.0[i] + other.0[i] + carry;
             data[i] = sum & CInt::<B, L>::MASK;
             carry = sum >> B;
+            i += 1;
         }
-        CInt(data)
-    }
-}
-
-impl<const B: usize, const L: usize> Sub<&CInt<B, L>> for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn sub(self, other: &Self) -> Self::Output {
-        &self - other
-    }
-}
 
-impl<const B: usize, const L: usize> Sub for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn sub(self, other: Self) -> Self::Output {
-        &self - &other
+        Self(data)
     }
-}
 
-impl<const B: usize, const L: usize> Neg for &CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn neg(self) -> Self::Output {
+    /// Const fn equivalent for `Neg::neg`
+    pub const fn neg(&self) -> Self {
         // For the two's complement code the additive negation is the result
         // of adding 1 to the bitwise inverted argument's representation
         let (mut data, mut carry) = ([0; L], 1);
-        for i in 0..L {
-            let sum = (self.0[i] ^ CInt::<B, L>::MASK) + carry;
-            data[i] = sum & CInt::<B, L>::MASK;
-            carry = sum >> B;
-        }
-        CInt(data)
-    }
-}
+        let mut i = 0;
 
-impl<const B: usize, const L: usize> Neg for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn neg(self) -> Self::Output {
-        -&self
-    }
-}
-
-impl<const B: usize, const L: usize> Mul for &CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn mul(self, other: Self) -> Self::Output {
-        let mut data = [0; L];
-        for i in 0..L {
-            let mut carry = 0;
-            for k in 0..(L - i) {
-                let sum = (data[i + k] as u128)
-                    + (carry as u128)
-                    + (self.0[i] as u128) * (other.0[k] as u128);
-                data[i + k] = sum as u64 & CInt::<B, L>::MASK;
-                carry = (sum >> B) as u64;
-            }
+        while i < L {
+            let sum = (self.0[i] ^ Self::MASK) + carry;
+            data[i] = sum & Self::MASK;
+            carry = sum >> B;
+            i += 1;
         }
-        CInt(data)
-    }
-}
 
-impl<const B: usize, const L: usize> Mul<&CInt<B, L>> for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn mul(self, other: &Self) -> Self::Output {
-        &self * other
-    }
-}
-
-impl<const B: usize, const L: usize> Mul for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn mul(self, other: Self) -> Self::Output {
-        &self * &other
+        Self(data)
     }
-}
 
-impl<const B: usize, const L: usize> Mul<i64> for &CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn mul(self, other: i64) -> Self::Output {
+    /// Const fn equivalent for `Mul::<i64>::mul`
+    pub const fn mul(&self, other: i64) -> Self {
         let mut data = [0; L];
         // If the short multiplicand is non-negative, the standard multiplication
         // algorithm is performed. Otherwise, the product of the additively negated
@@ -402,36 +341,19 @@ impl<const B: usize, const L: usize> Mul<i64> for &CInt<B, L> {
         // where the carry flag is initialized with the additively negated short
         // multiplicand and the chunks of the long multiplicand are bitwise inverted
         let (other, mut carry, mask) = if other < 0 {
-            (-other, -other as u64, CInt::<B, L>::MASK)
+            (-other, -other as u64, Self::MASK)
         } else {
             (other, 0, 0)
         };
-        for i in 0..L {
+
+        let mut i = 0;
+        while i < L {
             let sum = (carry as u128) + ((self.0[i] ^ mask) as u128) * (other as u128);
-            data[i] = sum as u64 & CInt::<B, L>::MASK;
+            data[i] = sum as u64 & Self::MASK;
             carry = (sum >> B) as u64;
+            i += 1;
         }
-        CInt(data)
-    }
-}
 
-impl<const B: usize, const L: usize> Mul<i64> for CInt<B, L> {
-    type Output = CInt<B, L>;
-    fn mul(self, other: i64) -> Self::Output {
-        &self * other
-    }
-}
-
-impl<const B: usize, const L: usize> Mul<&CInt<B, L>> for i64 {
-    type Output = CInt<B, L>;
-    fn mul(self, other: &CInt<B, L>) -> Self::Output {
-        other * self
-    }
-}
-
-impl<const B: usize, const L: usize> Mul<CInt<B, L>> for i64 {
-    type Output = CInt<B, L>;
-    fn mul(self, other: CInt<B, L>) -> Self::Output {
-        other * self
+        Self(data)
     }
 }

From a4dd48933e63b8e1849f9112b59694fee3e3994d Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Sun, 3 Dec 2023 13:16:25 -0700
Subject: [PATCH 6/6] 32-bit support

---
 src/modular/bernstein_yang.rs | 94 ++++++++++++++++++++++++-----------
 1 file changed, 66 insertions(+), 28 deletions(-)

diff --git a/src/modular/bernstein_yang.rs b/src/modular/bernstein_yang.rs
index e0f2034b..a55aa7cc 100644
--- a/src/modular/bernstein_yang.rs
+++ b/src/modular/bernstein_yang.rs
@@ -8,6 +8,8 @@
 
 #![allow(clippy::needless_range_loop)]
 
+use crate::Word;
+
 /// Type of the modular multiplicative inverter based on the Bernstein-Yang method.
 /// The inverter can be created for a specified modulus M and adjusting parameter A
 /// to compute the adjusted multiplicative inverses of positive integers, i.e. for
@@ -48,19 +50,20 @@ type Matrix = [[i64; 2]; 2];
 
 impl<const L: usize> BernsteinYangInverter<L> {
     /// Creates the inverter for specified modulus and adjusting parameter
-    pub const fn new(modulus: &[u64], adjuster: &[u64]) -> Self {
+    #[allow(trivial_numeric_casts)]
+    pub const fn new(modulus: &[Word], adjuster: &[Word]) -> Self {
         Self {
-            modulus: CInt::<62, L>(Self::convert::<64, 62, L>(modulus)),
-            adjuster: CInt::<62, L>(Self::convert::<64, 62, L>(adjuster)),
-            inverse: Self::inv(modulus[0]),
+            modulus: CInt::<62, L>(convert_in::<{ Word::BITS as usize }, 62, L>(modulus)),
+            adjuster: CInt::<62, L>(convert_in::<{ Word::BITS as usize }, 62, L>(adjuster)),
+            inverse: inv_mod62(modulus),
         }
     }
 
     /// Returns either the adjusted modular multiplicative inverse for the argument or None
     /// depending on invertibility of the argument, i.e. its coprimality with the modulus
-    pub const fn invert<const S: usize>(&self, value: &[u64]) -> Option<[u64; S]> {
+    pub const fn invert<const S: usize>(&self, value: &[Word]) -> Option<[Word; S]> {
         let (mut d, mut e) = (CInt::ZERO, self.adjuster);
-        let mut g = CInt::<62, L>(Self::convert::<64, 62, L>(value));
+        let mut g = CInt::<62, L>(convert_in::<{ Word::BITS as usize }, 62, L>(value));
         let (mut delta, mut f) = (1, self.modulus);
         let mut matrix;
 
@@ -76,7 +79,9 @@ impl<const L: usize> BernsteinYangInverter<L> {
         if !f.eq(&CInt::ONE) && !antiunit {
             return None;
         }
-        Some(Self::convert::<62, 64, S>(&self.norm(d, antiunit).0))
+        Some(convert_out::<62, { Word::BITS as usize }, S>(
+            &self.norm(d, antiunit).0,
+        ))
     }
 
     /// Returns the Bernstein-Yang transition matrix multiplied by 2^62 and the new value
@@ -181,11 +186,40 @@ impl<const L: usize> BernsteinYangInverter<L> {
 
         value
     }
+}
+
+/// Returns the multiplicative inverse of the argument modulo 2^62. The implementation is based
+/// on the Hurchalla's method for computing the multiplicative inverse modulo a power of two.
+/// For better understanding the implementation, the following paper is recommended:
+/// J. Hurchalla, "An Improved Integer Multiplicative Inverse (modulo 2^w)",
+/// https://arxiv.org/pdf/2204.04342.pdf
+const fn inv_mod62(value: &[Word]) -> i64 {
+    let value = {
+        #[cfg(target_pointer_width = "32")]
+        {
+            debug_assert!(value.len() >= 2);
+            value[0] as u64 | (value[1] as u64) << 32
+        }
 
-    /// Returns a big unsigned integer as an array of O-bit chunks, which is equal modulo
-    /// 2 ^ (O * S) to the input big unsigned integer stored as an array of I-bit chunks.
-    /// The ordering of the chunks in these arrays is little-endian
-    const fn convert<const I: usize, const O: usize, const S: usize>(input: &[u64]) -> [u64; S] {
+        #[cfg(target_pointer_width = "64")]
+        {
+            value[0]
+        }
+    };
+
+    let x = value.wrapping_mul(3) ^ 2;
+    let y = 1u64.wrapping_sub(x.wrapping_mul(value));
+    let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+    let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+    let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
+    (x.wrapping_mul(y.wrapping_add(1)) & (u64::MAX >> 2)) as i64
+}
+
+/// Write an impl of a `convert_*` function.
+///
+/// Workaround for making this function generic while still allowing it to be `const fn`.
+macro_rules! impl_convert {
+    ($input_type:ty, $output_type:ty, $input:expr) => {{
         // This function is defined because the method "min" of the usize type is not constant
         const fn min(a: usize, b: usize) -> usize {
             if a > b {
@@ -195,15 +229,17 @@ impl<const L: usize> BernsteinYangInverter<L> {
             }
         }
 
-        let (total, mut output, mut bits) = (min(input.len() * I, S * O), [0; S], 0);
+        let total = min($input.len() * I, S * O);
+        let mut output = [0 as $output_type; S];
+        let mut bits = 0;
 
         while bits < total {
             let (i, o) = (bits % I, bits % O);
-            output[bits / O] |= (input[bits / I] >> i) << o;
+            output[bits / O] |= ($input[bits / I] >> i) as $output_type << o;
             bits += min(I - i, O - o);
         }
 
-        let mask = u64::MAX >> (64 - O);
+        let mask = (<$output_type>::MAX as $output_type) >> (<$output_type>::BITS as usize - O);
         let mut filled = total / O + if total % O > 0 { 1 } else { 0 };
 
         while filled > 0 {
@@ -212,21 +248,23 @@ impl<const L: usize> BernsteinYangInverter<L> {
         }
 
         output
-    }
+    }};
+}
 
-    /// Returns the multiplicative inverse of the argument modulo 2^62. The implementation is based
-    /// on the Hurchalla's method for computing the multiplicative inverse modulo a power of two.
-    /// For better understanding the implementation, the following paper is recommended:
-    /// J. Hurchalla, "An Improved Integer Multiplicative Inverse (modulo 2^w)",
-    /// https://arxiv.org/pdf/2204.04342.pdf
-    const fn inv(value: u64) -> i64 {
-        let x = value.wrapping_mul(3) ^ 2;
-        let y = 1u64.wrapping_sub(x.wrapping_mul(value));
-        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
-        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
-        let (x, y) = (x.wrapping_mul(y.wrapping_add(1)), y.wrapping_mul(y));
-        (x.wrapping_mul(y.wrapping_add(1)) & CInt::<62, L>::MASK) as i64
-    }
+/// Returns a big unsigned integer as an array of O-bit chunks, which is equal modulo
+/// 2 ^ (O * S) to the input big unsigned integer stored as an array of I-bit chunks.
+/// The ordering of the chunks in these arrays is little-endian
+#[allow(trivial_numeric_casts)]
+const fn convert_in<const I: usize, const O: usize, const S: usize>(input: &[Word]) -> [u64; S] {
+    impl_convert!(Word, u64, input)
+}
+
+/// Returns a big unsigned integer as an array of O-bit chunks, which is equal modulo
+/// 2 ^ (O * S) to the input big unsigned integer stored as an array of I-bit chunks.
+/// The ordering of the chunks in these arrays is little-endian
+#[allow(trivial_numeric_casts)]
+const fn convert_out<const I: usize, const O: usize, const S: usize>(input: &[u64]) -> [Word; S] {
+    impl_convert!(u64, Word, input)
 }
 
 /// Big signed (B * L)-bit integer type, whose variables store