Skip to content

Commit

Permalink
Insert MSM and FFT code and their benchmarks. (#86)
Browse files Browse the repository at this point in the history
* Insert MSM and FFT code and their benchmarks.

Resolves taikoxyz/zkevm-circuits#150.

* feedback

* Add instructions

* feeback

* Implement feedback:  Actually supply the correct arguments to `best_multiexp`.

Split into `singlecore` and `multicore` benchmarks so Criterion's result
caching and comparison over multiple runs makes sense.

Rewrite point and scalar generation.

* Use slicing and parallelism to to decrease running time.

Laptop measurements:
k=22: 109 sec
k=16:   1 sec

* Refactor msm

* Refactor fft

* Update module comments

* Fix formatting

* Implement suggestion for fixing CI
  • Loading branch information
einar-taiko authored Sep 22, 2023
1 parent 2f3e388 commit ee7cb86
Show file tree
Hide file tree
Showing 7 changed files with 491 additions and 1 deletion.
13 changes: 12 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@ serde = { version = "1.0", default-features = false, optional = true }
serde_arrays = { version = "0.1.0", optional = true }
hex = { version = "0.4", optional = true, default-features = false, features = ["alloc", "serde"] }
blake2b_simd = "1"
maybe-rayon = { version = "0.1.0", default-features = false }

[features]
default = ["reexport", "bits"]
default = ["reexport", "bits", "multicore"]
multicore = ["maybe-rayon/threads"]
asm = []
bits = ["ff/bits"]
bn256-table = []
Expand Down Expand Up @@ -69,3 +71,12 @@ harness = false
[[bench]]
name = "hash_to_curve"
harness = false

[[bench]]
name = "fft"
harness = false

[[bench]]
name = "msm"
harness = false
required-features = ["multicore"]
57 changes: 57 additions & 0 deletions benches/fft.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
//! This benchmarks Fast-Fourier Transform (FFT).
//! Since it is over a finite field, it is actually the Number Theoretical
//! Transform (NNT). It uses the `Fr` scalar field from the BN256 curve.
//!
//! To run this benchmark:
//!
//! cargo bench -- fft
//!
//! Caveat: The multicore benchmark assumes:
//! 1. a multi-core system
//! 2. that the `multicore` feature is enabled. It is by default.

#[macro_use]
extern crate criterion;

use criterion::{BenchmarkId, Criterion};
use group::ff::Field;
use halo2curves::bn256::Fr as Scalar;
use halo2curves::fft::best_fft;
use rand_core::OsRng;
use std::ops::Range;
use std::time::SystemTime;

const RANGE: Range<u32> = 3..19;

fn generate_data(k: u32) -> Vec<Scalar> {
let n = 1 << k;
let timer = SystemTime::now();
println!("\n\nGenerating 2^{k} = {n} values..",);
let data: Vec<Scalar> = (0..n).map(|_| Scalar::random(OsRng)).collect();
let end = timer.elapsed().unwrap();
println!(
"Generating 2^{k} = {n} values took: {} sec.\n\n",
end.as_secs()
);
data
}

fn fft(c: &mut Criterion) {
let max_k = RANGE.max().unwrap_or(16);
let mut data = generate_data(max_k);
let omega = Scalar::random(OsRng);
let mut group = c.benchmark_group("fft");
for k in RANGE {
group.bench_function(BenchmarkId::new("k", k), |b| {
let n = 1 << k;
assert!(n <= data.len());
b.iter(|| {
best_fft(&mut data[..n], omega, k);
});
});
}
group.finish();
}

criterion_group!(benches, fft);
criterion_main!(benches);
116 changes: 116 additions & 0 deletions benches/msm.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
//! This benchmarks Multi Scalar Multiplication (MSM).
//! It measures `G1` from the BN256 curve.
//!
//! To run this benchmark:
//!
//! cargo bench -- msm
//!
//! Caveat: The multicore benchmark assumes:
//! 1. a multi-core system
//! 2. that the `multicore` feature is enabled. It is by default.

#[macro_use]
extern crate criterion;

use criterion::{BenchmarkId, Criterion};
use ff::Field;
use group::prime::PrimeCurveAffine;
use halo2curves::bn256::{Fr as Scalar, G1Affine as Point};
use halo2curves::msm::{best_multiexp, multiexp_serial};
use maybe_rayon::current_thread_index;
use maybe_rayon::prelude::{IntoParallelIterator, ParallelIterator};
use rand_core::SeedableRng;
use rand_xorshift::XorShiftRng;
use std::time::SystemTime;

const SAMPLE_SIZE: usize = 10;
const SINGLECORE_RANGE: [u8; 6] = [3, 8, 10, 12, 14, 16];
const MULTICORE_RANGE: [u8; 9] = [3, 8, 10, 12, 14, 16, 18, 20, 22];
const SEED: [u8; 16] = [
0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, 0xe5,
];

fn generate_coefficients_and_curvepoints(k: u8) -> (Vec<Scalar>, Vec<Point>) {
let n: u64 = {
assert!(k < 64);
1 << k
};

println!("\n\nGenerating 2^{k} = {n} coefficients and curve points..",);
let timer = SystemTime::now();
let coeffs = (0..n)
.into_par_iter()
.map_init(
|| {
let mut thread_seed = SEED;
let uniq = current_thread_index().unwrap().to_ne_bytes();
assert!(std::mem::size_of::<usize>() == 8);
for i in 0..uniq.len() {
thread_seed[i] += uniq[i];
thread_seed[i + 8] += uniq[i];
}
XorShiftRng::from_seed(thread_seed)
},
|rng, _| Scalar::random(rng),
)
.collect();
let bases = (0..n)
.into_par_iter()
.map_init(
|| {
let mut thread_seed = SEED;
let uniq = current_thread_index().unwrap().to_ne_bytes();
assert!(std::mem::size_of::<usize>() == 8);
for i in 0..uniq.len() {
thread_seed[i] += uniq[i];
thread_seed[i + 8] += uniq[i];
}
XorShiftRng::from_seed(thread_seed)
},
|rng, _| Point::random(rng),
)
.collect();
let end = timer.elapsed().unwrap();
println!(
"Generating 2^{k} = {n} coefficients and curve points took: {} sec.\n\n",
end.as_secs()
);

(coeffs, bases)
}

fn msm(c: &mut Criterion) {
let mut group = c.benchmark_group("msm");
let max_k = *SINGLECORE_RANGE
.iter()
.chain(MULTICORE_RANGE.iter())
.max()
.unwrap_or(&16);
let (coeffs, bases) = generate_coefficients_and_curvepoints(max_k);

for k in SINGLECORE_RANGE {
group
.bench_function(BenchmarkId::new("singlecore", k), |b| {
assert!(k < 64);
let n: usize = 1 << k;
let mut acc = Point::identity().into();
b.iter(|| multiexp_serial(&coeffs[..n], &bases[..n], &mut acc));
})
.sample_size(10);
}
for k in MULTICORE_RANGE {
group
.bench_function(BenchmarkId::new("multicore", k), |b| {
assert!(k < 64);
let n: usize = 1 << k;
b.iter(|| {
best_multiexp(&coeffs[..n], &bases[..n]);
})
})
.sample_size(SAMPLE_SIZE);
}
group.finish();
}

criterion_group!(benches, msm);
criterion_main!(benches);
134 changes: 134 additions & 0 deletions src/fft.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
use crate::multicore;
pub use crate::{CurveAffine, CurveExt};
use ff::Field;
use group::{GroupOpsOwned, ScalarMulOwned};

/// This represents an element of a group with basic operations that can be
/// performed. This allows an FFT implementation (for example) to operate
/// generically over either a field or elliptic curve group.
pub trait FftGroup<Scalar: Field>:
Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar>
{
}

impl<T, Scalar> FftGroup<Scalar> for T
where
Scalar: Field,
T: Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar>,
{
}

/// Performs a radix-$2$ Fast-Fourier Transformation (FFT) on a vector of size
/// $n = 2^k$, when provided `log_n` = $k$ and an element of multiplicative
/// order $n$ called `omega` ($\omega$). The result is that the vector `a`, when
/// interpreted as the coefficients of a polynomial of degree $n - 1$, is
/// transformed into the evaluations of this polynomial at each of the $n$
/// distinct powers of $\omega$. This transformation is invertible by providing
/// $\omega^{-1}$ in place of $\omega$ and dividing each resulting field element
/// by $n$.
///
/// This will use multithreading if beneficial.
pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(a: &mut [G], omega: Scalar, log_n: u32) {
fn bitreverse(mut n: usize, l: usize) -> usize {
let mut r = 0;
for _ in 0..l {
r = (r << 1) | (n & 1);
n >>= 1;
}
r
}

let threads = multicore::current_num_threads();
let log_threads = threads.ilog2();
let n = a.len();
assert_eq!(n, 1 << log_n);

for k in 0..n {
let rk = bitreverse(k, log_n as usize);
if k < rk {
a.swap(rk, k);
}
}

// precompute twiddle factors
let twiddles: Vec<_> = (0..(n / 2))
.scan(Scalar::ONE, |w, _| {
let tw = *w;
*w *= &omega;
Some(tw)
})
.collect();

if log_n <= log_threads {
let mut chunk = 2_usize;
let mut twiddle_chunk = n / 2;
for _ in 0..log_n {
a.chunks_mut(chunk).for_each(|coeffs| {
let (left, right) = coeffs.split_at_mut(chunk / 2);

// case when twiddle factor is one
let (a, left) = left.split_at_mut(1);
let (b, right) = right.split_at_mut(1);
let t = b[0];
b[0] = a[0];
a[0] += &t;
b[0] -= &t;

left.iter_mut()
.zip(right.iter_mut())
.enumerate()
.for_each(|(i, (a, b))| {
let mut t = *b;
t *= &twiddles[(i + 1) * twiddle_chunk];
*b = *a;
*a += &t;
*b -= &t;
});
});
chunk *= 2;
twiddle_chunk /= 2;
}
} else {
recursive_butterfly_arithmetic(a, n, 1, &twiddles)
}
}

/// This perform recursive butterfly arithmetic
pub fn recursive_butterfly_arithmetic<Scalar: Field, G: FftGroup<Scalar>>(
a: &mut [G],
n: usize,
twiddle_chunk: usize,
twiddles: &[Scalar],
) {
if n == 2 {
let t = a[1];
a[1] = a[0];
a[0] += &t;
a[1] -= &t;
} else {
let (left, right) = a.split_at_mut(n / 2);
multicore::join(
|| recursive_butterfly_arithmetic(left, n / 2, twiddle_chunk * 2, twiddles),
|| recursive_butterfly_arithmetic(right, n / 2, twiddle_chunk * 2, twiddles),
);

// case when twiddle factor is one
let (a, left) = left.split_at_mut(1);
let (b, right) = right.split_at_mut(1);
let t = b[0];
b[0] = a[0];
a[0] += &t;
b[0] -= &t;

left.iter_mut()
.zip(right.iter_mut())
.enumerate()
.for_each(|(i, (a, b))| {
let mut t = *b;
t *= &twiddles[(i + 1) * twiddle_chunk];
*b = *a;
*a += &t;
*b -= &t;
});
}
}
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
mod arithmetic;
pub mod fft;
pub mod hash_to_curve;
pub mod msm;
pub mod multicore;
#[macro_use]
pub mod legendre;
pub mod serde;
Expand Down
Loading

0 comments on commit ee7cb86

Please sign in to comment.