diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs
index 414ce23e6..b0a52b0b7 100644
--- a/crates/encoding/src/path.rs
+++ b/crates/encoding/src/path.rs
@@ -227,8 +227,11 @@ pub struct Path {
 pub struct Tile {
     /// Accumulated backdrop at the left edge of the tile.
     pub backdrop: i32,
-    /// Index of first path segment.
-    pub segments: u32,
+    /// Either the count of path segments in this tile, or, with all
+    /// bits inverted, an index to the beginning of an allocated slice
+    /// of `PathSegment` objects. The count form is replaced by the
+    /// inverted index once the segment slice has been allocated.
+    pub segment_count_or_ix: u32,
 }
 
 /// Encoder for path segments.
diff --git a/shader/draw_leaf.wgsl b/shader/draw_leaf.wgsl
index 6154b9256..827825974 100644
--- a/shader/draw_leaf.wgsl
+++ b/shader/draw_leaf.wgsl
@@ -108,7 +108,6 @@ fn main(
     // let x1 = f32(bbox.x1);
     // let y1 = f32(bbox.y1);
     // let bbox_f = vec4(x0, y0, x1, y1);
-    let fill_mode = u32(bbox.linewidth >= 0.0);
     var transform = Transform();
     var linewidth = bbox.linewidth;
     if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT ||
diff --git a/src/cpu_dispatch.rs b/src/cpu_dispatch.rs
index 0b8bbc86b..2c3409c16 100644
--- a/src/cpu_dispatch.rs
+++ b/src/cpu_dispatch.rs
@@ -4,10 +4,12 @@
 //! Support for CPU implementations of compute shaders.
 
 use std::{
-    cell::{RefCell, RefMut},
-    ops::Deref,
+    cell::{Ref, RefCell, RefMut},
+    ops::{Deref, DerefMut},
 };
 
+use bytemuck::Pod;
+
 #[derive(Clone, Copy)]
 pub enum CpuBinding<'a> {
     Buffer(&'a [u8]),
@@ -16,39 +18,88 @@ pub enum CpuBinding<'a> {
     Texture(&'a CpuTexture),
 }
 
-pub enum CpuBufGuard<'a> {
-    Slice(&'a [u8]),
-    Interior(RefMut<'a, Vec<u8>>),
+pub enum TypedBufGuard<'a, T: ?Sized> {
+    Slice(&'a T),
+    Interior(Ref<'a, T>),
+}
+
+pub enum TypedBufGuardMut<'a, T: ?Sized> {
+    Slice(&'a mut T),
+    Interior(RefMut<'a, T>),
 }
 
-impl<'a> Deref for CpuBufGuard<'a> {
-    type Target = [u8];
+impl<'a, T: ?Sized> Deref for TypedBufGuard<'a, T> {
+    type Target = T;
 
     fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Slice(s) => s,
-            CpuBufGuard::Interior(r) => r,
+            TypedBufGuard::Slice(s) => s,
+            TypedBufGuard::Interior(r) => r,
         }
     }
 }
 
-impl<'a> CpuBufGuard<'a> {
-    /// Get a mutable reference to the buffer.
-    ///
-    /// Panics if the underlying resource is read-only.
-    pub fn as_mut(&mut self) -> &mut [u8] {
+impl<'a, T: ?Sized> Deref for TypedBufGuardMut<'a, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Interior(r) => &mut *r,
-            _ => panic!("tried to borrow immutable buffer as mutable"),
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
+        }
+    }
+}
+
+impl<'a, T: ?Sized> DerefMut for TypedBufGuardMut<'a, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match self {
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
         }
     }
 }
 
 impl<'a> CpuBinding<'a> {
-    pub fn as_buf(&self) -> CpuBufGuard {
+    pub fn as_typed<T: Pod>(&self) -> TypedBufGuard<T> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::from_bytes(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::from_bytes(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_typed_mut<T: Pod>(&self) -> TypedBufGuardMut<T> {
+        match self {
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::from_bytes_mut(buf)
+                }))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice<T: Pod>(&self) -> TypedBufGuard<[T]> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::cast_slice(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::cast_slice(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice_mut<T: Pod>(&self) -> TypedBufGuardMut<[T]> {
         match self {
-            CpuBinding::Buffer(b) => CpuBufGuard::Slice(b),
-            CpuBinding::BufferRW(b) => CpuBufGuard::Interior(b.borrow_mut()),
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::cast_slice_mut(buf)
+                }))
+            }
             _ => panic!("resource type mismatch"),
         }
     }
 }
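
The typed guards above are the pattern every CPU stage below relies on:
`as_typed` views a binding as a single `Pod` struct, `as_slice` as a slice of
`Pod` elements, and the `_mut` variants require the interior-mutable
`BufferRW` binding. A minimal sketch of a stage entry point in this style
(the stage itself is hypothetical; only the guard API is from this patch):

    use vello_encoding::{ConfigUniform, Tile};

    fn example_stage(_n_wg: u32, resources: &[CpuBinding]) {
        let config = resources[0].as_typed::<ConfigUniform>();
        let mut tiles = resources[1].as_slice_mut::<Tile>();
        // Zero the backdrop of every tile in the viewport, reading the
        // tile count from the config uniform.
        let n = (config.width_in_tiles * config.height_in_tiles) as usize;
        for tile in tiles.iter_mut().take(n) {
            tile.backdrop = 0;
        }
    }
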
diff --git a/src/cpu_shader/backdrop.rs b/src/cpu_shader/backdrop.rs
new file mode 100644
index 000000000..746efdcde
--- /dev/null
+++ b/src/cpu_shader/backdrop.rs
@@ -0,0 +1,30 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, Path, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+fn backdrop_main(config: &ConfigUniform, paths: &[Path], tiles: &mut [Tile]) {
+    for drawobj_ix in 0..config.layout.n_draw_objects {
+        let path = paths[drawobj_ix as usize];
+        let width = path.bbox[2] - path.bbox[0];
+        let height = path.bbox[3] - path.bbox[1];
+        let base = path.tiles;
+        for y in 0..height {
+            let mut sum = 0;
+            for x in 0..width {
+                let tile = &mut tiles[(base + y * width + x) as usize];
+                sum += tile.backdrop;
+                tile.backdrop = sum;
+            }
+        }
+    }
+}
+
+pub fn backdrop(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let paths = resources[1].as_slice();
+    let mut tiles = resources[2].as_slice_mut();
+    backdrop_main(&config, &paths, &mut tiles);
+}
diff --git a/src/cpu_shader/bbox_clear.rs b/src/cpu_shader/bbox_clear.rs
new file mode 100644
index 000000000..1e02127d0
--- /dev/null
+++ b/src/cpu_shader/bbox_clear.rs
@@ -0,0 +1,21 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+fn bbox_clear_main(config: &ConfigUniform, path_bboxes: &mut [PathBbox]) {
+    for i in 
0..(config.layout.n_paths as usize) { + path_bboxes[i].x0 = 0x7fff_ffff; + path_bboxes[i].y0 = 0x7fff_ffff; + path_bboxes[i].x1 = -0x8000_0000; + path_bboxes[i].y1 = -0x8000_0000; + } +} + +pub fn bbox_clear(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let mut path_bboxes = resources[1].as_slice_mut(); + bbox_clear_main(&config, &mut path_bboxes); +} diff --git a/src/cpu_shader/binning.rs b/src/cpu_shader/binning.rs new file mode 100644 index 000000000..5ace850c3 --- /dev/null +++ b/src/cpu_shader/binning.rs @@ -0,0 +1,128 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, PathBbox}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; +const TILE_WIDTH: usize = 16; +const TILE_HEIGHT: usize = 16; +const N_TILE_X: usize = 16; +const N_TILE_Y: usize = 16; +const SX: f32 = 1.0 / ((N_TILE_X * TILE_WIDTH) as f32); +const SY: f32 = 1.0 / ((N_TILE_Y * TILE_HEIGHT) as f32); + +fn bbox_intersect(a: [f32; 4], b: [f32; 4]) -> [f32; 4] { + [ + a[0].max(b[0]), + a[1].max(b[1]), + a[2].min(b[2]), + a[3].min(b[3]), + ] +} + +fn binning_main( + n_wg: u32, + config: &ConfigUniform, + draw_monoids: &[DrawMonoid], + path_bbox_buf: &[PathBbox], + clip_bbox_buf: &[[f32; 4]], + intersected_bbox: &mut [[f32; 4]], + bump: &mut BumpAllocators, + bin_data: &mut [u32], + bin_header: &mut [BinHeader], +) { + for wg in 0..n_wg as usize { + let mut counts = [0; WG_SIZE]; + let mut bboxes = [[0, 0, 0, 0]; WG_SIZE]; + let width_in_bins = + ((config.width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32) as i32; + let height_in_bins = + ((config.height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32) as i32; + for local_ix in 0..WG_SIZE { + let element_ix = wg * WG_SIZE + local_ix; + let mut x0 = 0; + let mut y0 = 0; + let mut x1 = 0; + let mut y1 = 0; + if element_ix < config.layout.n_draw_objects as usize { + let draw_monoid = draw_monoids[element_ix]; + let mut clip_bbox = [-1e9, -1e9, 1e9, 1e9]; + if draw_monoid.clip_ix > 0 { + assert!(draw_monoid.clip_ix - 1 < config.layout.n_clips); + clip_bbox = clip_bbox_buf[draw_monoid.clip_ix as usize - 1]; + } + let path_bbox = path_bbox_buf[draw_monoid.path_ix as usize]; + let pb = [ + path_bbox.x0 as f32, + path_bbox.y0 as f32, + path_bbox.x1 as f32, + path_bbox.y1 as f32, + ]; + let bbox = bbox_intersect(clip_bbox, pb); + intersected_bbox[element_ix] = bbox; + if bbox[0] < bbox[2] && bbox[1] < bbox[3] { + x0 = (bbox[0] * SX).floor() as i32; + y0 = (bbox[1] * SY).floor() as i32; + x1 = (bbox[2] * SX).ceil() as i32; + y1 = (bbox[3] * SY).ceil() as i32; + } + } + x0 = x0.clamp(0, width_in_bins); + y0 = y0.clamp(0, height_in_bins); + x1 = x1.clamp(0, width_in_bins); + y1 = y1.clamp(0, height_in_bins); + for y in y0..y1 { + for x in x0..x1 { + counts[(y * width_in_bins + x) as usize] += 1; + } + } + bboxes[local_ix] = [x0, y0, x1, y1]; + } + let mut chunk_offset = [0; WG_SIZE]; + for local_ix in 0..WG_SIZE { + let global_ix = wg * WG_SIZE + local_ix; + chunk_offset[local_ix] = bump.binning; + bump.binning += counts[local_ix]; + bin_header[global_ix] = BinHeader { + element_count: counts[local_ix], + chunk_offset: chunk_offset[local_ix], + }; + } + for local_ix in 0..WG_SIZE { + let element_ix = wg * WG_SIZE + local_ix; + let bbox = bboxes[local_ix]; + for y in bbox[1]..bbox[3] { + for x in bbox[0]..bbox[2] { + let bin_ix = (y * width_in_bins + x) as usize; + let ix = config.layout.bin_data_start + 
chunk_offset[bin_ix];
+                    bin_data[ix as usize] = element_ix as u32;
+                    chunk_offset[bin_ix] += 1;
+                }
+            }
+        }
+    }
+}
+
+pub fn binning(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let draw_monoids = resources[1].as_slice();
+    let path_bbox_buf = resources[2].as_slice();
+    let clip_bbox_buf = resources[3].as_slice();
+    let mut intersected_bbox = resources[4].as_slice_mut();
+    let mut bump = resources[5].as_typed_mut();
+    let mut bin_data = resources[6].as_slice_mut();
+    let mut bin_header = resources[7].as_slice_mut();
+    binning_main(
+        n_wg,
+        &config,
+        &draw_monoids,
+        &path_bbox_buf,
+        &clip_bbox_buf,
+        &mut intersected_bbox,
+        &mut bump,
+        &mut bin_data,
+        &mut bin_header,
+    );
+}
diff --git a/src/cpu_shader/clip_leaf.rs b/src/cpu_shader/clip_leaf.rs
new file mode 100644
index 000000000..0f5fc6106
--- /dev/null
+++ b/src/cpu_shader/clip_leaf.rs
@@ -0,0 +1,86 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{Clip, ConfigUniform, DrawMonoid, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+struct ClipStackElement {
+    // index of draw object
+    parent_ix: u32,
+    path_ix: u32,
+    bbox: [f32; 4],
+}
+
+const BIG_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9];
+
+// Note: this implementation doesn't rigorously follow the
+// WGSL original. In particular, it just computes the clips
+// sequentially rather than using the partition reductions.
+fn clip_leaf_main(
+    config: &ConfigUniform,
+    clip_inp: &[Clip],
+    path_bboxes: &[PathBbox],
+    draw_monoids: &mut [DrawMonoid],
+    clip_bboxes: &mut [[f32; 4]],
+) {
+    let mut stack: Vec<ClipStackElement> = Vec::new();
+    for global_ix in 0..config.layout.n_clips {
+        let clip_el = clip_inp[global_ix as usize];
+        if clip_el.path_ix >= 0 {
+            // begin clip
+            let path_ix = clip_el.path_ix as u32;
+            let path_bbox = path_bboxes[path_ix as usize];
+            let p_bbox = [
+                path_bbox.x0 as f32,
+                path_bbox.y0 as f32,
+                path_bbox.x1 as f32,
+                path_bbox.y1 as f32,
+            ];
+            let bbox = if let Some(last) = stack.last() {
+                [
+                    p_bbox[0].max(last.bbox[0]),
+                    p_bbox[1].max(last.bbox[1]),
+                    p_bbox[2].min(last.bbox[2]),
+                    p_bbox[3].min(last.bbox[3]),
+                ]
+            } else {
+                p_bbox
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            let parent_ix = clip_el.ix;
+            stack.push(ClipStackElement {
+                parent_ix,
+                path_ix,
+                bbox,
+            });
+        } else {
+            // end clip
+            let tos = stack.pop().unwrap();
+            let bbox = if let Some(nos) = stack.last() {
+                nos.bbox
+            } else {
+                BIG_BBOX
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            draw_monoids[clip_el.ix as usize].path_ix = tos.path_ix;
+            draw_monoids[clip_el.ix as usize].scene_offset =
+                draw_monoids[tos.parent_ix as usize].scene_offset;
+        }
+    }
+}
+
+pub fn clip_leaf(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let clip_inp = resources[1].as_slice();
+    let path_bboxes = resources[2].as_slice();
+    let mut draw_monoids = resources[5].as_slice_mut();
+    let mut clip_bboxes = resources[6].as_slice_mut();
+    clip_leaf_main(
+        &config,
+        &clip_inp,
+        &path_bboxes,
+        &mut draw_monoids,
+        &mut clip_bboxes,
+    );
+}
diff --git a/src/cpu_shader/clip_reduce.rs b/src/cpu_shader/clip_reduce.rs
new file mode 100644
index 000000000..fc30661f1
--- /dev/null
+++ b/src/cpu_shader/clip_reduce.rs
@@ -0,0 +1,56 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{Clip, ClipBic, ClipElement, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn 
clip_reduce_main( + n_wg: u32, + clip_inp: &[Clip], + path_bboxes: &[PathBbox], + reduced: &mut [ClipBic], + clip_out: &mut [ClipElement], +) { + let mut scratch = Vec::with_capacity(WG_SIZE); + for wg_ix in 0..n_wg { + scratch.clear(); + let mut bic_reduced = ClipBic::default(); + // reverse scan + for local_ix in (0..WG_SIZE).rev() { + let global_ix = wg_ix as usize * WG_SIZE + local_ix; + let inp = clip_inp[global_ix].path_ix; + let is_push = inp >= 0; + let bic = ClipBic::new(1 - is_push as u32, is_push as u32); + bic_reduced = bic.combine(bic_reduced); + if is_push && bic_reduced.a == 0 { + scratch.push(global_ix as u32); + } + } + reduced[wg_ix as usize] = bic_reduced; + for (i, parent_ix) in scratch.iter().rev().enumerate() { + let mut clip_el = ClipElement::default(); + clip_el.parent_ix = *parent_ix; + let path_ix = clip_inp[*parent_ix as usize].path_ix; + let path_bbox = path_bboxes[path_ix as usize]; + clip_el.bbox = [ + path_bbox.x0 as f32, + path_bbox.y0 as f32, + path_bbox.x1 as f32, + path_bbox.y1 as f32, + ]; + let global_ix = wg_ix as usize * WG_SIZE + i; + clip_out[global_ix] = clip_el; + } + } +} + +pub fn clip_reduce(n_wg: u32, resources: &[CpuBinding]) { + let clip_inp = resources[0].as_slice(); + let path_bboxes = resources[1].as_slice(); + let mut reduced = resources[2].as_slice_mut(); + let mut clip_out = resources[3].as_slice_mut(); + clip_reduce_main(n_wg, &clip_inp, &path_bboxes, &mut reduced, &mut clip_out); +} diff --git a/src/cpu_shader/coarse.rs b/src/cpu_shader/coarse.rs new file mode 100644 index 000000000..390df7f74 --- /dev/null +++ b/src/cpu_shader/coarse.rs @@ -0,0 +1,344 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, DrawTag, Path, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::{ + CMD_BEGIN_CLIP, CMD_COLOR, CMD_END, CMD_END_CLIP, CMD_FILL, CMD_IMAGE, CMD_JUMP, CMD_LIN_GRAD, + CMD_RAD_GRAD, CMD_SOLID, PTCL_INITIAL_ALLOC, +}; + +const N_TILE_X: usize = 16; +const N_TILE_Y: usize = 16; +const N_TILE: usize = N_TILE_X * N_TILE_Y; + +const PTCL_INCREMENT: u32 = 256; +const PTCL_HEADROOM: u32 = 2; + +// Modeled in the WGSL as private-scoped variables +struct TileState { + cmd_offset: u32, + cmd_limit: u32, +} + +impl TileState { + fn new(tile_ix: u32) -> TileState { + let cmd_offset = tile_ix * PTCL_INITIAL_ALLOC; + let cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM); + TileState { + cmd_offset, + cmd_limit, + } + } + + fn alloc_cmd( + &mut self, + size: u32, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ) { + if self.cmd_offset + size >= self.cmd_limit { + let ptcl_dyn_start = + config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC; + let chunk_size = PTCL_INCREMENT.max(size + PTCL_HEADROOM); + let new_cmd = ptcl_dyn_start + bump.ptcl; + bump.ptcl += chunk_size; + ptcl[self.cmd_offset as usize] = CMD_JUMP; + ptcl[self.cmd_offset as usize + 1] = new_cmd; + self.cmd_offset = new_cmd; + self.cmd_limit = new_cmd + (PTCL_INCREMENT - PTCL_HEADROOM); + } + } + + fn write(&mut self, ptcl: &mut [u32], offset: u32, value: u32) { + ptcl[(self.cmd_offset + offset) as usize] = value; + } + + fn write_path( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + tile: &mut Tile, + ) { + let n_segs = tile.segment_count_or_ix; + if n_segs != 0 { + let seg_ix = bump.segments; + tile.segment_count_or_ix = !seg_ix; + bump.segments += n_segs; 
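+            // At this point `segment_count_or_ix` switches meaning: path_count
+            // left a segment count here, and storing `!seg_ix` replaces it with
+            // the inverted start index of the slice just reserved from the bump
+            // allocator (e.g. 3 segments starting at index 17 encode as `!17`).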
+ self.alloc_cmd(4, config, bump, ptcl); + self.write(ptcl, 0, CMD_FILL); + let even_odd = false; // TODO + let size_and_rule = (n_segs << 1) | (even_odd as u32); + self.write(ptcl, 1, size_and_rule); + self.write(ptcl, 2, seg_ix); + self.write(ptcl, 3, tile.backdrop as u32); + self.cmd_offset += 4; + } else { + self.alloc_cmd(1, config, bump, ptcl); + self.write(ptcl, 0, CMD_SOLID); + self.cmd_offset += 1; + } + } + + fn write_color( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + rgba_color: u32, + ) { + self.alloc_cmd(2, config, bump, ptcl); + self.write(ptcl, 0, CMD_COLOR); + self.write(ptcl, 1, rgba_color); + self.cmd_offset += 2; + } + + fn write_image( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + info_offset: u32, + ) { + self.alloc_cmd(2, config, bump, ptcl); + self.write(ptcl, 0, CMD_IMAGE); + self.write(ptcl, 1, info_offset); + self.cmd_offset += 2; + } + + fn write_grad( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ty: u32, + index: u32, + info_offset: u32, + ) { + self.alloc_cmd(3, config, bump, ptcl); + self.write(ptcl, 0, ty); + self.write(ptcl, 1, index); + self.write(ptcl, 2, info_offset); + self.cmd_offset += 3; + } + + fn write_begin_clip( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ) { + self.alloc_cmd(1, config, bump, ptcl); + self.write(ptcl, 0, CMD_BEGIN_CLIP); + self.cmd_offset += 1; + } + + fn write_end_clip( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + blend: u32, + alpha: f32, + ) { + self.alloc_cmd(3, config, bump, ptcl); + self.write(ptcl, 0, CMD_END_CLIP); + self.write(ptcl, 1, blend); + self.write(ptcl, 2, f32::to_bits(alpha)); + self.cmd_offset += 3; + } +} + +fn coarse_main( + config: &ConfigUniform, + scene: &[u32], + draw_monoids: &[DrawMonoid], + bin_headers: &[BinHeader], + info_bin_data: &[u32], + paths: &[Path], + tiles: &mut [Tile], + bump: &mut BumpAllocators, + ptcl: &mut [u32], +) { + let width_in_tiles = config.width_in_tiles; + let height_in_tiles = config.height_in_tiles; + let width_in_bins = (width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32; + let height_in_bins = (height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32; + let n_bins = width_in_bins * height_in_bins; + let bin_data_start = config.layout.bin_data_start; + let drawtag_base = config.layout.draw_tag_base; + let mut compacted = vec![vec![]; N_TILE]; + let n_partitions = (config.layout.n_draw_objects + N_TILE as u32 - 1) / N_TILE as u32; + for bin in 0..n_bins { + for v in &mut compacted { + v.clear(); + } + let bin_x = bin % width_in_bins; + let bin_y = bin / width_in_bins; + let bin_tile_x = N_TILE_X as u32 * bin_x; + let bin_tile_y = N_TILE_Y as u32 * bin_y; + for part in 0..n_partitions { + let in_ix = part * N_TILE as u32 + bin; + let bin_header = bin_headers[in_ix as usize]; + let start = bin_data_start + bin_header.chunk_offset; + for i in 0..bin_header.element_count { + let drawobj_ix = info_bin_data[(start + i) as usize]; + let tag = scene[(drawtag_base + drawobj_ix) as usize]; + if DrawTag(tag) != DrawTag::NOP { + let draw_monoid = draw_monoids[drawobj_ix as usize]; + let path_ix = draw_monoid.path_ix; + let path = paths[path_ix as usize]; + let dx = path.bbox[0] as i32 - bin_tile_x as i32; + let dy = path.bbox[1] as i32 - bin_tile_y as i32; + let x0 = dx.clamp(0, N_TILE_X as i32); + let y0 = dy.clamp(0, N_TILE_Y as i32); + let x1 = (path.bbox[2] 
as i32 - bin_tile_x as i32).clamp(0, N_TILE_X as i32); + let y1 = (path.bbox[3] as i32 - bin_tile_y as i32).clamp(0, N_TILE_Y as i32); + for y in y0..y1 { + for x in x0..x1 { + compacted[(y * N_TILE_X as i32 + x) as usize].push(drawobj_ix); + } + } + } + } + } + // compacted now has the list of draw objects for each tile. + // While the WGSL source does at most 256 draw objects at a time, + // this version does all the draw objects in a tile. + for tile_ix in 0..N_TILE { + let tile_x = (tile_ix % N_TILE_X) as u32; + let tile_y = (tile_ix / N_TILE_X) as u32; + let this_tile_ix = (bin_tile_y + tile_y) * width_in_tiles + bin_tile_x + tile_x; + let mut tile_state = TileState::new(this_tile_ix); + let blend_offset = tile_state.cmd_offset; + tile_state.cmd_offset += 1; + let mut clip_depth = 0; + let mut clip_zero_depth = 0; + for drawobj_ix in &compacted[tile_ix] { + let drawtag = scene[(drawtag_base + drawobj_ix) as usize]; + if clip_zero_depth == 0 { + let draw_monoid = draw_monoids[*drawobj_ix as usize]; + let path_ix = draw_monoid.path_ix; + let path = paths[path_ix as usize]; + let bbox = path.bbox; + let stride = bbox[2] - bbox[0]; + let x = bin_tile_x + tile_x - bbox[0]; + let y = bin_tile_y + tile_y - bbox[1]; + let tile = &mut tiles[(path.tiles + y * stride + x) as usize]; + let is_clip = (drawtag & 1) != 0; + let mut is_blend = false; + let dd = config.layout.draw_data_base + draw_monoid.scene_offset; + let di = draw_monoid.info_offset; + if is_clip { + const BLEND_CLIP: u32 = (128 << 8) | 3; + let blend = scene[dd as usize]; + is_blend = blend != BLEND_CLIP; + } + let n_segs = tile.segment_count_or_ix; + let include_tile = n_segs != 0 || (tile.backdrop == 0) == is_clip || is_blend; + if include_tile { + // TODO: get drawinfo (linewidth for fills) + match DrawTag(drawtag) { + DrawTag::COLOR => { + tile_state.write_path(config, bump, ptcl, tile); + let rgba_color = scene[dd as usize]; + tile_state.write_color(config, bump, ptcl, rgba_color); + } + DrawTag::IMAGE => { + tile_state.write_path(config, bump, ptcl, tile); + tile_state.write_image(config, bump, ptcl, di + 1); + } + DrawTag::LINEAR_GRADIENT => { + tile_state.write_path(config, bump, ptcl, tile); + let index = scene[dd as usize]; + tile_state.write_grad( + config, + bump, + ptcl, + CMD_LIN_GRAD, + index, + di + 1, + ); + } + DrawTag::RADIAL_GRADIENT => { + tile_state.write_path(config, bump, ptcl, tile); + let index = scene[dd as usize]; + tile_state.write_grad( + config, + bump, + ptcl, + CMD_RAD_GRAD, + index, + di + 1, + ); + } + DrawTag::BEGIN_CLIP => { + if tile.segment_count_or_ix == 0 && tile.backdrop == 0 { + clip_zero_depth = clip_depth + 1; + } else { + tile_state.write_begin_clip(config, bump, ptcl); + // TODO: update blend depth + } + clip_depth += 1; + } + DrawTag::END_CLIP => { + clip_depth -= 1; + tile_state.write_path(config, bump, ptcl, tile); + let blend = scene[dd as usize]; + let alpha = f32::from_bits(scene[dd as usize + 1]); + tile_state.write_end_clip(config, bump, ptcl, blend, alpha); + } + _ => todo!(), + } + } + } else { + // In "clip zero" state, suppress all drawing + match DrawTag(drawtag) { + DrawTag::BEGIN_CLIP => clip_depth += 1, + DrawTag::END_CLIP => { + if clip_depth == clip_zero_depth { + clip_zero_depth = 0; + } + clip_depth -= 1; + } + _ => (), + } + } + } + + if bin_tile_x + tile_x < width_in_tiles && bin_tile_y + tile_y < height_in_tiles { + ptcl[tile_state.cmd_offset as usize] = CMD_END; + let scratch_size = 0; // TODO: actually compute blend depth + ptcl[blend_offset as usize] = 
bump.blend; + bump.blend += scratch_size; + } + } + } +} + +pub fn coarse(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let draw_monoids = resources[2].as_slice(); + let bin_headers = resources[3].as_slice(); + let info_bin_data = resources[4].as_slice(); + let paths = resources[5].as_slice(); + let mut tiles = resources[6].as_slice_mut(); + let mut bump = resources[7].as_typed_mut(); + let mut ptcl = resources[8].as_slice_mut(); + coarse_main( + &config, + &scene, + &draw_monoids, + &bin_headers, + &info_bin_data, + &paths, + &mut tiles, + &mut bump, + &mut ptcl, + ); +} diff --git a/src/cpu_shader/draw_leaf.rs b/src/cpu_shader/draw_leaf.rs new file mode 100644 index 000000000..0aa779e5c --- /dev/null +++ b/src/cpu_shader/draw_leaf.rs @@ -0,0 +1,168 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{Clip, ConfigUniform, DrawMonoid, DrawTag, Monoid, PathBbox}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{read_draw_tag_from_scene, Transform, Vec2}; + +const WG_SIZE: usize = 256; + +fn draw_leaf_main( + n_wg: u32, + config: &ConfigUniform, + scene: &[u32], + reduced: &[DrawMonoid], + path_bbox: &[PathBbox], + draw_monoid: &mut [DrawMonoid], + info: &mut [u32], + clip_inp: &mut [Clip], +) { + let mut prefix = DrawMonoid::default(); + for i in 0..n_wg { + let mut m = prefix; + for j in 0..WG_SIZE { + let ix = i * WG_SIZE as u32 + j as u32; + let tag_raw = read_draw_tag_from_scene(config, scene, ix); + let tag_word = DrawTag(tag_raw); + // store exclusive prefix sum + if ix < config.layout.n_draw_objects { + draw_monoid[ix as usize] = m; + } + let m_next = m.combine(&DrawMonoid::new(tag_word)); + let dd = config.layout.draw_data_base + m.scene_offset; + let di = m.info_offset as usize; + if tag_word == DrawTag::COLOR + || tag_word == DrawTag::LINEAR_GRADIENT + || tag_word == DrawTag::RADIAL_GRADIENT + || tag_word == DrawTag::IMAGE + || tag_word == DrawTag::BEGIN_CLIP + { + let bbox = path_bbox[m.path_ix as usize]; + let transform = Transform::read(config.layout.transform_base, bbox.trans_ix, scene); + let linewidth = bbox.linewidth; + match tag_word { + DrawTag::COLOR => { + info[di] = f32::to_bits(linewidth); + } + DrawTag::LINEAR_GRADIENT => { + info[di] = f32::to_bits(linewidth); + let p0 = Vec2::new( + f32::from_bits(scene[dd as usize + 1]), + f32::from_bits(scene[dd as usize + 2]), + ); + let p1 = Vec2::new( + f32::from_bits(scene[dd as usize + 3]), + f32::from_bits(scene[dd as usize + 4]), + ); + let p0 = transform.apply(p0); + let p1 = transform.apply(p1); + let dxy = p1 - p0; + let scale = 1.0 / dxy.dot(dxy); + let line_xy = dxy * scale; + let line_c = -p0.dot(line_xy); + info[di + 1] = f32::to_bits(line_xy.x); + info[di + 2] = f32::to_bits(line_xy.y); + info[di + 3] = f32::to_bits(line_c); + } + DrawTag::RADIAL_GRADIENT => { + info[di] = f32::to_bits(linewidth); + let p0 = Vec2::new( + f32::from_bits(scene[dd as usize + 1]), + f32::from_bits(scene[dd as usize + 2]), + ); + let p1 = Vec2::new( + f32::from_bits(scene[dd as usize + 3]), + f32::from_bits(scene[dd as usize + 4]), + ); + let r0 = f32::from_bits(scene[dd as usize + 5]); + let r1 = f32::from_bits(scene[dd as usize + 6]); + let z = transform.0; + let inv_det = (z[0] * z[3] - z[1] * z[2]).recip(); + let inv_mat = [ + z[3] * inv_det, + -z[1] * inv_det, + -z[2] * inv_det, + z[0] * inv_det, + ]; + let inv_tr = [ + -(inv_mat[0] * z[4] + inv_mat[2] * z[5]) - p0.x, + 
-(inv_mat[1] * z[4] + inv_mat[3] * z[5]) - p0.y,
+                        ];
+                        let center1 = p1 - p0;
+                        let rr = r1 / (r1 - r0);
+                        let ra_inv = rr / (r1 * r1 - center1.dot(center1));
+                        let c1 = center1 * ra_inv;
+                        let ra = rr * ra_inv;
+                        let roff = rr - 1.0;
+                        info[di + 1] = f32::to_bits(inv_mat[0]);
+                        info[di + 2] = f32::to_bits(inv_mat[1]);
+                        info[di + 3] = f32::to_bits(inv_mat[2]);
+                        info[di + 4] = f32::to_bits(inv_mat[3]);
+                        info[di + 5] = f32::to_bits(inv_tr[0]);
+                        info[di + 6] = f32::to_bits(inv_tr[1]);
+                        info[di + 7] = f32::to_bits(c1.x);
+                        info[di + 8] = f32::to_bits(c1.y);
+                        info[di + 9] = f32::to_bits(ra);
+                        info[di + 10] = f32::to_bits(roff);
+                    }
+                    DrawTag::IMAGE => {
+                        info[di] = f32::to_bits(linewidth);
+                        let z = transform.0;
+                        let inv_det = (z[0] * z[3] - z[1] * z[2]).recip();
+                        let inv_mat = [
+                            z[3] * inv_det,
+                            -z[1] * inv_det,
+                            -z[2] * inv_det,
+                            z[0] * inv_det,
+                        ];
+                        let inv_tr = [
+                            -(inv_mat[0] * z[4] + inv_mat[2] * z[5]),
+                            -(inv_mat[1] * z[4] + inv_mat[3] * z[5]),
+                        ];
+                        info[di + 1] = f32::to_bits(inv_mat[0]);
+                        info[di + 2] = f32::to_bits(inv_mat[1]);
+                        info[di + 3] = f32::to_bits(inv_mat[2]);
+                        info[di + 4] = f32::to_bits(inv_mat[3]);
+                        info[di + 5] = f32::to_bits(inv_tr[0]);
+                        info[di + 6] = f32::to_bits(inv_tr[1]);
+                        info[di + 7] = scene[dd as usize];
+                        info[di + 8] = scene[dd as usize + 1];
+                    }
+                    DrawTag::BEGIN_CLIP => (),
+                    _ => todo!("unhandled draw tag {:x}", tag_word.0),
+                }
+            }
+            if tag_word == DrawTag::BEGIN_CLIP {
+                let path_ix = m.path_ix as i32;
+                clip_inp[m.clip_ix as usize] = Clip { ix, path_ix };
+            } else if tag_word == DrawTag::END_CLIP {
+                let path_ix = !ix as i32;
+                clip_inp[m.clip_ix as usize] = Clip { ix, path_ix };
+            }
+            m = m_next;
+        }
+        prefix = prefix.combine(&reduced[i as usize]);
+    }
+}
+
+pub fn draw_leaf(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let reduced = resources[2].as_slice();
+    let path_bbox = resources[3].as_slice();
+    let mut draw_monoid = resources[4].as_slice_mut();
+    let mut info = resources[5].as_slice_mut();
+    let mut clip_inp = resources[6].as_slice_mut();
+    draw_leaf_main(
+        n_wg,
+        &config,
+        &scene,
+        &reduced,
+        &path_bbox,
+        &mut draw_monoid,
+        &mut info,
+        &mut clip_inp,
+    );
+}
diff --git a/src/cpu_shader/draw_reduce.rs b/src/cpu_shader/draw_reduce.rs
new file mode 100644
index 000000000..61c338c71
--- /dev/null
+++ b/src/cpu_shader/draw_reduce.rs
@@ -0,0 +1,29 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, DrawMonoid, DrawTag, Monoid};
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::read_draw_tag_from_scene;
+
+const WG_SIZE: usize = 256;
+
+fn draw_reduce_main(n_wg: u32, config: &ConfigUniform, scene: &[u32], reduced: &mut [DrawMonoid]) {
+    for i in 0..n_wg {
+        let mut m = DrawMonoid::default();
+        for j in 0..WG_SIZE {
+            let ix = i * WG_SIZE as u32 + j as u32;
+            let tag = read_draw_tag_from_scene(config, scene, ix);
+            m = m.combine(&DrawMonoid::new(DrawTag(tag)));
+        }
+        reduced[i as usize] = m;
+    }
+}
+
+pub fn draw_reduce(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    draw_reduce_main(n_wg, &config, &scene, &mut reduced);
+}
diff --git a/src/cpu_shader/fine.rs b/src/cpu_shader/fine.rs
new file mode 100644
index 000000000..c64c87627
--- /dev/null
+++ b/src/cpu_shader/fine.rs
@@ -0,0 +1,188 @@
+// Copyright 2023 The Vello authors
+// 
SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{ConfigUniform, PathSegment, Tile}; + +use crate::cpu_dispatch::CpuTexture; + +use super::{CMD_COLOR, CMD_END, CMD_FILL, CMD_JUMP, CMD_SOLID, PTCL_INITIAL_ALLOC}; + +// These should also move into a common area +const TILE_WIDTH: usize = 16; +const TILE_HEIGHT: usize = 16; +const TILE_SIZE: usize = TILE_WIDTH * TILE_HEIGHT; + +fn read_color(ptcl: &[u32], offset: u32) -> u32 { + ptcl[(offset + 1) as usize] +} + +struct CmdFill { + size_and_rule: u32, + seg_data: u32, + backdrop: i32, +} + +fn read_fill(ptcl: &[u32], offset: u32) -> CmdFill { + let size_and_rule = ptcl[(offset + 1) as usize]; + let seg_data = ptcl[(offset + 2) as usize]; + let backdrop = ptcl[(offset + 3) as usize] as i32; + CmdFill { + size_and_rule, + seg_data, + backdrop, + } +} + +fn unpack4x8unorm(x: u32) -> [f32; 4] { + let mut result = [0.0; 4]; + for i in 0..4 { + result[i] = ((x >> (i * 8)) & 0xff) as f32 * (1.0 / 255.0); + } + result +} + +fn pack4x8unorm(x: [f32; 4]) -> u32 { + let mut result = 0; + for i in 0..4 { + let byte = (x[i].clamp(0.0, 1.0) * 255.0).round() as u32; + result |= byte << (i * 8); + } + result +} + +fn fill_path(area: &mut [f32], segments: &[PathSegment], fill: &CmdFill, x_tile: f32, y_tile: f32) { + let n_segs = fill.size_and_rule >> 1; + let even_odd = (fill.size_and_rule & 1) != 0; + let backdrop_f = fill.backdrop as f32; + for a in area.iter_mut() { + *a = backdrop_f; + } + for segment in &segments[fill.seg_data as usize..][..n_segs as usize] { + for yi in 0..TILE_HEIGHT { + let y = segment.origin[1] - (y_tile + yi as f32); + let y0 = y.clamp(0.0, 1.0); + let y1 = (y + segment.delta[1]).clamp(0.0, 1.0); + let dy = y0 - y1; + let y_edge = segment.delta[0].signum() + * (y_tile + yi as f32 - segment.y_edge + 1.0).clamp(0.0, 1.0); + if dy != 0.0 { + let vec_y_recip = segment.delta[1].recip(); + let t0 = (y0 - y) * vec_y_recip; + let t1 = (y1 - y) * vec_y_recip; + let startx = segment.origin[0] - x_tile; + let x0 = startx + t0 * segment.delta[0]; + let x1 = startx + t1 * segment.delta[0]; + let xmin0 = x0.min(x1); + let xmax0 = x0.max(x1); + for i in 0..TILE_WIDTH { + let i_f = i as f32; + let xmin = (xmin0 - i_f).min(1.0) - 1.0e-6; + let xmax = xmax0 - i_f; + let b = xmax.min(1.0); + let c = b.max(0.0); + let d = xmin.max(0.0); + let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); + area[yi * TILE_WIDTH + i] += y_edge + a * dy; + } + } else if y_edge != 0.0 { + for i in 0..TILE_WIDTH { + area[yi * TILE_WIDTH + i] += y_edge; + } + } + } + } + if even_odd { + for a in area.iter_mut() { + { + *a = (*a - 2.0 * (0.5 * *a).round()).abs(); + } + } + } else { + for a in area.iter_mut() { + { + *a = a.abs().min(1.0); + } + } + } +} + +// Note: this is a draft. Texture resources are not yet wired up, so it +// has not yet been tested. +#[allow(unused)] +fn fine_main( + config: &ConfigUniform, + tiles: &[Tile], + segments: &[PathSegment], + output: &mut CpuTexture, + ptcl: &[u32], + info: &[u32], + // TODO: image texture resources + // TODO: masks? 
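+    // Compositing below is in premultiplied alpha: each CMD_COLOR source is
+    // scaled by the per-pixel coverage accumulated in `area`, then blended
+    // with the over operator, dst = dst * (1 - src_alpha) + src.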
+) {
+    let width_in_tiles = config.width_in_tiles;
+    let height_in_tiles = config.height_in_tiles;
+    let n_tiles = width_in_tiles * height_in_tiles;
+    let mut area = vec![0.0f32; TILE_SIZE];
+    let mut rgba = vec![[0.0f32; 4]; TILE_SIZE];
+    for tile_ix in 0..n_tiles {
+        for x in &mut rgba {
+            *x = [0.0; 4];
+        }
+        for a in &mut area {
+            *a = 0.0;
+        }
+        let tile_x = tile_ix % width_in_tiles;
+        let tile_y = tile_ix / width_in_tiles;
+        let mut cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
+        // skip over blend stack allocation
+        cmd_ix += 1;
+        loop {
+            let tag = ptcl[cmd_ix as usize];
+            if tag == CMD_END {
+                break;
+            }
+            match tag {
+                CMD_FILL => {
+                    let fill = read_fill(ptcl, cmd_ix);
+                    // x0 and y0 will go away when we do tile-relative coords
+                    let x0 = (tile_x as usize * TILE_WIDTH) as f32;
+                    let y0 = (tile_y as usize * TILE_HEIGHT) as f32;
+                    fill_path(&mut area, segments, &fill, x0, y0);
+                    cmd_ix += 4;
+                }
+                CMD_SOLID => {
+                    for a in &mut area {
+                        *a = 1.0;
+                    }
+                    // CMD_SOLID is a single-word command
+                    cmd_ix += 1;
+                }
+                CMD_COLOR => {
+                    let color = read_color(ptcl, cmd_ix);
+                    let fg = unpack4x8unorm(color);
+                    let fg = [fg[3], fg[2], fg[1], fg[0]];
+                    for i in 0..TILE_SIZE {
+                        let ai = area[i];
+                        let fg_i = [fg[0] * ai, fg[1] * ai, fg[2] * ai, fg[3] * ai];
+                        for j in 0..4 {
+                            rgba[i][j] = rgba[i][j] * (1.0 - fg_i[3]) + fg_i[j];
+                        }
+                    }
+                    cmd_ix += 2;
+                }
+                CMD_JUMP => {
+                    cmd_ix = ptcl[(cmd_ix + 1) as usize];
+                }
+                _ => todo!("unhandled ptcl command {tag}"),
+            }
+        }
+        // Write tile (in rgba)
+        for y in 0..TILE_HEIGHT {
+            let base =
+                output.width * (tile_y as usize * TILE_HEIGHT + y) + tile_x as usize * TILE_WIDTH;
+            for x in 0..TILE_WIDTH {
+                let rgba32 = pack4x8unorm(rgba[y * TILE_WIDTH + x]);
+                output.pixels[base + x] = rgba32;
+            }
+        }
+    }
+}
diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs
new file mode 100644
index 000000000..2cdf7256b
--- /dev/null
+++ b/src/cpu_shader/flatten.rs
@@ -0,0 +1,299 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::{Transform, Vec2};
+use vello_encoding::{BumpAllocators, ConfigUniform, LineSoup, Monoid, PathBbox, PathMonoid};
+
+fn to_minus_one_quarter(x: f32) -> f32 {
+    // could also be written x.powf(-0.25)
+    x.sqrt().sqrt().recip()
+}
+
+const D: f32 = 0.67;
+fn approx_parabola_integral(x: f32) -> f32 {
+    x * to_minus_one_quarter(1.0 - D + (D * D * D * D + 0.25 * x * x))
+}
+
+const B: f32 = 0.39;
+fn approx_parabola_inv_integral(x: f32) -> f32 {
+    x * (1.0 - B + (B * B + 0.5 * x * x)).sqrt()
+}
+
+#[derive(Clone, Copy, Default)]
+struct SubdivResult {
+    val: f32,
+    a0: f32,
+    a2: f32,
+}
+
+fn estimate_subdiv(p0: Vec2, p1: Vec2, p2: Vec2, sqrt_tol: f32) -> SubdivResult {
+    let d01 = p1 - p0;
+    let d12 = p2 - p1;
+    let dd = d01 - d12;
+    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    let cross_inv = if cross.abs() < 1.0e-9 {
+        1.0e9
+    } else {
+        cross.recip()
+    };
+    let x0 = d01.dot(dd) * cross_inv;
+    let x2 = d12.dot(dd) * cross_inv;
+    let scale = (cross / (dd.length() * (x2 - x0))).abs();
+    let a0 = approx_parabola_integral(x0);
+    let a2 = approx_parabola_integral(x2);
+    let mut val = 0.0;
+    if scale < 1e9 {
+        let da = (a2 - a0).abs();
+        let sqrt_scale = scale.sqrt();
+        if x0.signum() == x2.signum() {
+            val = sqrt_scale;
+        } else {
+            let xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol / approx_parabola_integral(xmin);
+        }
+        val *= da;
+    }
+    SubdivResult { val, a0, a2 }
+}
+
+fn eval_quad(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 {
+    let mt = 1.0 - t;
+    p0 * 
(mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t +} + +fn eval_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> Vec2 { + let mt = 1.0 - t; + p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t +} + +const MAX_QUADS: u32 = 16; + +struct Cubic { + p0: Vec2, + p1: Vec2, + p2: Vec2, + p3: Vec2, + path_ix: u32, +} + +fn flatten_cubic(cubic: Cubic, line_ix: &mut usize, lines: &mut [LineSoup]) { + let p0 = cubic.p0; + let p1 = cubic.p1; + let p2 = cubic.p2; + let p3 = cubic.p3; + let err_v = (p2 - p1) * 3.0 + p0 - p3; + let err = err_v.dot(err_v); + const ACCURACY: f32 = 0.25; + const Q_ACCURACY: f32 = ACCURACY * 0.1; + const REM_ACCURACY: f32 = ACCURACY - Q_ACCURACY; + const MAX_HYPOT2: f32 = 432.0 * Q_ACCURACY * Q_ACCURACY; + let mut n_quads = ((err * (1.0 / MAX_HYPOT2)).powf(1.0 / 6.0).ceil() as u32).max(1); + n_quads = n_quads.min(MAX_QUADS); + let mut keep_params = [SubdivResult::default(); MAX_QUADS as usize]; + let mut val = 0.0; + let mut qp0 = p0; + let step = (n_quads as f32).recip(); + for i in 0..n_quads { + let t = (i + 1) as f32 * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; + let params = estimate_subdiv(qp0, qp1, qp2, REM_ACCURACY.sqrt()); + keep_params[i as usize] = params; + val += params.val; + qp0 = qp2; + } + let n = ((val * (0.5 / REM_ACCURACY.sqrt())).ceil() as u32).max(1); + let mut lp0 = p0; + qp0 = p0; + let v_step = val / (n as f32); + let mut n_out = 1; + let mut val_sum = 0.0; + for i in 0..n_quads { + let t = (i + 1) as f32 * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; + let params = keep_params[i as usize]; + let u0 = approx_parabola_inv_integral(params.a0); + let u2 = approx_parabola_inv_integral(params.a2); + let uscale = (u2 - u0).recip(); + let mut val_target = (n_out as f32) * v_step; + while n_out == n || val_target < val_sum + params.val { + let lp1 = if n_out == n { + p3 + } else { + let u = (val_target - val_sum) / params.val; + let a = params.a0 + (params.a2 - params.a0) * u; + let au = approx_parabola_inv_integral(a); + let t = (au - u0) * uscale; + eval_quad(qp0, qp1, qp2, t) + }; + let ls = LineSoup { + path_ix: cubic.path_ix, + _padding: Default::default(), + p0: lp0.to_array(), + p1: lp1.to_array(), + }; + lines[*line_ix] = ls; + *line_ix += 1; + n_out += 1; + val_target += v_step; + lp0 = lp1; + } + val_sum += params.val; + qp0 = qp2; + } +} + +fn read_f32_point(ix: u32, pathdata: &[u32]) -> Vec2 { + let x = f32::from_bits(pathdata[ix as usize]); + let y = f32::from_bits(pathdata[ix as usize + 1]); + Vec2 { x, y } +} + +struct IntBbox { + x0: i32, + y0: i32, + x1: i32, + y1: i32, +} + +impl Default for IntBbox { + fn default() -> Self { + IntBbox { + x0: 0x7fff_ffff, + y0: 0x7fff_ffff, + x1: -0x8000_0000, + y1: -0x8000_0000, + } + } +} + +impl IntBbox { + fn add_pt(&mut self, pt: Vec2) { + self.x0 = self.x0.min(pt.x.floor() as i32); + self.y0 = self.y0.min(pt.y.floor() as i32); + self.x1 = self.x1.max(pt.x.ceil() as i32); + self.y1 = self.y1.max(pt.y.ceil() as i32); + } +} + +// TODO: we're skipping i16 point reading as it's not present in our scenes + +const WG_SIZE: usize = 256; + +const PATH_TAG_SEG_TYPE: u8 = 3; +const PATH_TAG_PATH: u8 = 0x10; +const PATH_TAG_LINETO: u8 = 1; +const PATH_TAG_QUADTO: u8 = 2; +const PATH_TAG_CUBICTO: u8 = 3; +const PATH_TAG_F32: u8 = 8; + +fn flatten_main( + n_wg: u32, + config: 
&ConfigUniform, + scene: &[u32], + tag_monoids: &[PathMonoid], + path_bboxes: &mut [PathBbox], + bump: &mut BumpAllocators, + lines: &mut [LineSoup], +) { + let mut line_ix = 0; + let mut bbox = IntBbox::default(); + for ix in 0..n_wg as usize * WG_SIZE { + let tag_word = scene[config.layout.path_tag_base as usize + (ix >> 2)]; + let shift = (ix & 3) * 8; + let mut tm = PathMonoid::new(tag_word & ((1 << shift) - 1)); + let tag_byte = (tag_word >> shift) as u8; + if tag_byte != 0 { + tm = tag_monoids[ix >> 2].combine(&tm); + } + let linewidth = + f32::from_bits(scene[(config.layout.linewidth_base + tm.linewidth_ix) as usize]); + if (tag_byte & PATH_TAG_PATH) != 0 { + let out = &mut path_bboxes[tm.path_ix as usize]; + out.linewidth = linewidth; + out.trans_ix = tm.trans_ix; + } + let seg_type = tag_byte & PATH_TAG_SEG_TYPE; + let pathdata = &scene[config.layout.path_data_base as usize..]; + if seg_type != 0 { + let mut p0; + let mut p1; + let mut p2 = Vec2::default(); + let mut p3 = Vec2::default(); + if (tag_byte & PATH_TAG_F32) != 0 { + p0 = read_f32_point(tm.pathseg_offset, pathdata); + p1 = read_f32_point(tm.pathseg_offset + 2, pathdata); + if seg_type >= PATH_TAG_QUADTO { + p2 = read_f32_point(tm.pathseg_offset + 4, pathdata); + if seg_type == PATH_TAG_CUBICTO { + p3 = read_f32_point(tm.pathseg_offset + 6, pathdata); + } + } + } else { + todo!("i16 path data not supported yet"); + } + let transform = Transform::read(config.layout.transform_base, tm.trans_ix, scene); + p0 = transform.apply(p0); + bbox.add_pt(p0); + p1 = transform.apply(p1); + bbox.add_pt(p1); + if seg_type == PATH_TAG_LINETO { + p3 = p1; + p2 = p3.mix(p0, 1.0 / 3.0); + p1 = p0.mix(p3, 1.0 / 3.0); + } else if seg_type >= PATH_TAG_QUADTO { + p2 = transform.apply(p2); + bbox.add_pt(p2); + if seg_type == PATH_TAG_CUBICTO { + p3 = transform.apply(p3); + bbox.add_pt(p3); + } else { + p3 = p2; + p2 = p1.mix(p2, 1.0 / 3.0); + p1 = p1.mix(p0, 1.0 / 3.0); + } + } + let path_ix = tm.path_ix; + let cubic = Cubic { + p0, + p1, + p2, + p3, + path_ix, + }; + flatten_cubic(cubic, &mut line_ix, lines); + } + if (tag_byte & PATH_TAG_PATH) != 0 { + let out = &mut path_bboxes[tm.path_ix as usize]; + out.x0 = bbox.x0; + out.y0 = bbox.y0; + out.x1 = bbox.x1; + out.y1 = bbox.y1; + bbox = IntBbox::default(); + } + } + bump.lines = line_ix as u32; +} + +pub fn flatten(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let tag_monoids = resources[2].as_slice(); + let mut path_bboxes = resources[3].as_slice_mut(); + let mut bump = resources[4].as_typed_mut(); + let mut lines = resources[5].as_slice_mut(); + flatten_main( + n_wg, + &config, + &scene, + &tag_monoids, + &mut path_bboxes, + &mut bump, + &mut lines, + ); +} diff --git a/src/cpu_shader/mod.rs b/src/cpu_shader/mod.rs index fed341c75..16d261f65 100644 --- a/src/cpu_shader/mod.rs +++ b/src/cpu_shader/mod.rs @@ -1,8 +1,61 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense //! CPU implementations of shader stages. 
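+//
+// Each stage mirrors its WGSL entry point: it takes the workgroup count and
+// the stage's resources, bound in the same order as the GPU bindings. A
+// sketch of driving one stage by hand (buffer sizes and contents here are
+// illustrative only):
+//
+//     let config_buf: Vec<u8> = todo!("bytes of a ConfigUniform");
+//     let scene_buf: Vec<u8> = todo!("encoded scene data");
+//     let reduced = std::cell::RefCell::new(vec![0u8; 16 * 256]);
+//     let resources = [
+//         crate::cpu_dispatch::CpuBinding::Buffer(&config_buf),
+//         crate::cpu_dispatch::CpuBinding::Buffer(&scene_buf),
+//         crate::cpu_dispatch::CpuBinding::BufferRW(&reduced),
+//     ];
+//     pathtag_reduce(n_wg, &resources);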
+// Allow un-idiomatic Rust to more closely match shaders +#![allow(clippy::needless_range_loop)] +#![allow(clippy::too_many_arguments)] + +mod backdrop; +mod bbox_clear; +mod binning; +mod clip_leaf; +mod clip_reduce; +mod coarse; +mod draw_leaf; +mod draw_reduce; +mod fine; +mod flatten; +mod path_count; +mod path_count_setup; +mod path_tiling; +mod path_tiling_setup; mod pathtag_reduce; +mod pathtag_scan; +mod tile_alloc; +mod util; +pub use backdrop::backdrop; +pub use bbox_clear::bbox_clear; +pub use binning::binning; +pub use clip_leaf::clip_leaf; +pub use clip_reduce::clip_reduce; +pub use coarse::coarse; +pub use draw_leaf::draw_leaf; +pub use draw_reduce::draw_reduce; +pub use flatten::flatten; +pub use path_count::path_count; +pub use path_count_setup::path_count_setup; +pub use path_tiling::path_tiling; +pub use path_tiling_setup::path_tiling_setup; pub use pathtag_reduce::pathtag_reduce; +pub use pathtag_scan::pathtag_scan; +pub use tile_alloc::tile_alloc; + +// Common definitions + +const PTCL_INITIAL_ALLOC: u32 = 64; + +// Tags for PTCL commands +const CMD_END: u32 = 0; +const CMD_FILL: u32 = 1; +//const CMD_STROKE: u32 = 2; +const CMD_SOLID: u32 = 3; +const CMD_COLOR: u32 = 5; +const CMD_LIN_GRAD: u32 = 6; +const CMD_RAD_GRAD: u32 = 7; +const CMD_IMAGE: u32 = 8; +const CMD_BEGIN_CLIP: u32 = 9; +const CMD_END_CLIP: u32 = 10; +const CMD_JUMP: u32 = 11; diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs new file mode 100644 index 000000000..2cee5b815 --- /dev/null +++ b/src/cpu_shader/path_count.rs @@ -0,0 +1,157 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BumpAllocators, LineSoup, Path, SegmentCount, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{span, Vec2}; + +const TILE_SCALE: f32 = 1.0 / 16.0; + +fn path_count_main( + bump: &mut BumpAllocators, + lines: &[LineSoup], + paths: &[Path], + tile: &mut [Tile], + seg_counts: &mut [SegmentCount], +) { + for line_ix in 0..bump.lines { + let line = lines[line_ix as usize]; + let p0 = Vec2::from_array(line.p0); + let p1 = Vec2::from_array(line.p1); + let is_down = p1.y >= p0.y; + let (xy0, xy1) = if is_down { (p0, p1) } else { (p1, p0) }; + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1; + + let dx = (s1.x - s0.x).abs(); + let dy = s1.y - s0.y; + if dx + dy == 0.0 { + continue; + } + if dy == 0.0 && s0.y.floor() == s0.y { + continue; + } + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = if is_positive_slope { 1.0 } else { -1.0 }; + let xt0 = (s0.x * sign).floor(); + let c = s0.x * sign - xt0; + let y0 = s0.y.floor(); + let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; + + let path = paths[line.path_ix as usize]; + let bbox = path.bbox; + let bbox = [ + bbox[0] as i32, + bbox[1] as i32, + bbox[2] as i32, + bbox[3] as i32, + ]; + let xmin = s0.x.min(s1.x); + let stride = bbox[2] - bbox[0]; + if s0.y >= bbox[3] as f32 || s1.y < bbox[1] as f32 || xmin >= bbox[2] as f32 || stride == 0 + { + continue; + } + // Clip line to bounding box. Clipping is done in "i" space. 
+ let mut imin = 0; + if s0.y < bbox[1] as f32 { + let mut iminf = ((bbox[1] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0; + if y0 + iminf - (a * iminf + b).floor() < bbox[1] as f32 { + iminf += 1.0; + } + imin = iminf as u32; + } + let mut imax = count; + if s1.y > bbox[3] as f32 { + let mut imaxf = ((bbox[3] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0; + if y0 + imaxf - (a * imaxf + b).floor() < bbox[3] as f32 { + imaxf += 1.0; + } + imax = imaxf as u32; + } + let delta = if is_down { -1 } else { 1 }; + let mut ymin = 0; + let mut ymax = 0; + if s0.x.max(s1.x) < bbox[0] as f32 { + ymin = s0.y.ceil() as i32; + ymax = s1.y.ceil() as i32; + imax = imin; + } else { + let fudge = if is_positive_slope { 0.0 } else { 1.0 }; + if xmin < bbox[0] as f32 { + let mut f = ((sign * (bbox[0] as f32 - x0) - b + fudge) / a).round(); + if (x0 + sign * (a * f + b).floor() < bbox[0] as f32) == is_positive_slope { + f += 1.0; + } + let ynext = (y0 + f - (a * f + b).floor() + 1.0) as i32; + if is_positive_slope { + if f as u32 > imin { + ymin = (y0 + if y0 == s0.y { 0.0 } else { 1.0 }) as i32; + ymax = ynext; + imin = f as u32; + } + } else if (f as u32) < imax { + ymin = ynext; + ymax = s1.y.ceil() as i32; + imax = f as u32; + } + } + if s0.x.max(s1.x) > bbox[2] as f32 { + let mut f = ((sign * (bbox[2] as f32 - x0) - b + fudge) / a).round(); + if (x0 + sign * (a * f + b).floor() < bbox[2] as f32) == is_positive_slope { + f += 1.0; + } + if is_positive_slope { + imax = imax.min(f as u32); + } else { + imin = imin.max(f as u32); + } + } + } + imax = imin.max(imax); + ymin = ymin.max(bbox[1]); + ymax = ymax.min(bbox[3]); + for y in ymin..ymax { + let base = path.tiles as i32 + (y - bbox[1]) * stride; + tile[base as usize].backdrop += delta; + } + let mut last_z = (a * (imin as f32 - 1.0) + b).floor(); + let seg_base = bump.seg_counts; + bump.seg_counts += imax - imin; + for i in imin..imax { + let zf = a * i as f32 + b; + let z = zf.floor(); + let y = (y0 + i as f32 - z) as i32; + let x = (x0 + sign * z) as i32; + let base = path.tiles as i32 + (y - bbox[1]) * stride - bbox[0]; + let top_edge = if i == 0 { y0 == s0.y } else { last_z == z }; + if top_edge && x + 1 < bbox[2] { + let x_bump = (x + 1).max(bbox[0]); + tile[(base + x_bump) as usize].backdrop += delta; + } + // .segments is another name for the .count field; it's overloaded + let seg_within_slice = tile[(base + x) as usize].segment_count_or_ix; + tile[(base + x) as usize].segment_count_or_ix += 1; + let counts = (seg_within_slice << 16) | i; + let seg_count = SegmentCount { line_ix, counts }; + seg_counts[(seg_base + i - imin) as usize] = seg_count; + last_z = z; + } + } +} + +pub fn path_count(_n_wg: u32, resources: &[CpuBinding]) { + let mut bump = resources[1].as_typed_mut(); + let lines = resources[2].as_slice(); + let paths = resources[3].as_slice(); + let mut tile = resources[4].as_slice_mut(); + let mut seg_counts = resources[5].as_slice_mut(); + path_count_main(&mut bump, &lines, &paths, &mut tile, &mut seg_counts); +} diff --git a/src/cpu_shader/path_count_setup.rs b/src/cpu_shader/path_count_setup.rs new file mode 100644 index 000000000..6336cfd47 --- /dev/null +++ b/src/cpu_shader/path_count_setup.rs @@ -0,0 +1,21 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BumpAllocators, IndirectCount}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; + +fn path_count_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { + let lines = 
bump.lines; + indirect.count_x = (lines + (WG_SIZE as u32 - 1)) / WG_SIZE as u32; + indirect.count_y = 1; + indirect.count_z = 1; +} + +pub fn path_count_setup(_n_wg: u32, resources: &[CpuBinding]) { + let bump = resources[0].as_typed(); + let mut indirect = resources[1].as_typed_mut(); + path_count_setup_main(&bump, &mut indirect); +} diff --git a/src/cpu_shader/path_tiling.rs b/src/cpu_shader/path_tiling.rs new file mode 100644 index 000000000..41549bb54 --- /dev/null +++ b/src/cpu_shader/path_tiling.rs @@ -0,0 +1,152 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BumpAllocators, LineSoup, Path, PathSegment, SegmentCount, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{span, Vec2}; + +const TILE_WIDTH: u32 = 16; +const TILE_HEIGHT: u32 = 16; +const TILE_SCALE: f32 = 1.0 / 16.0; + +fn path_tiling_main( + bump: &mut BumpAllocators, + seg_counts: &[SegmentCount], + lines: &[LineSoup], + paths: &[Path], + tiles: &[Tile], + segments: &mut [PathSegment], +) { + for seg_ix in 0..bump.seg_counts { + let seg_count = seg_counts[seg_ix as usize]; + let line = lines[seg_count.line_ix as usize]; + let counts = seg_count.counts; + let seg_within_slice = counts >> 16; + let seg_within_line = counts & 0xffff; + + // coarse rasterization logic + let p0 = Vec2::from_array(line.p0); + let p1 = Vec2::from_array(line.p1); + let is_down = p1.y >= p0.y; + let (mut xy0, mut xy1) = if is_down { (p0, p1) } else { (p1, p0) }; + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1; + + let dx = (s1.x - s0.x).abs(); + let dy = s1.y - s0.y; + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = if is_positive_slope { 1.0 } else { -1.0 }; + let xt0 = (s0.x * sign).floor(); + let c = s0.x * sign - xt0; + let y0 = s0.y.floor(); + let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; + let z = (a * seg_within_line as f32 + b).floor(); + let x = x0 as i32 + (sign * z) as i32; + let y = (y0 + seg_within_line as f32 - z) as i32; + + let path = paths[line.path_ix as usize]; + let bbox = path.bbox; + let bbox = [ + bbox[0] as i32, + bbox[1] as i32, + bbox[2] as i32, + bbox[3] as i32, + ]; + let stride = bbox[2] - bbox[0]; + let tile_ix = path.tiles as i32 + (y - bbox[1]) * stride + x - bbox[0]; + let tile = tiles[tile_ix as usize]; + let seg_start = !tile.segment_count_or_ix; + if (seg_start as i32) < 0 { + continue; + } + let tile_xy = Vec2::new(x as f32 * TILE_WIDTH as f32, y as f32 * TILE_HEIGHT as f32); + let tile_xy1 = tile_xy + Vec2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32); + + if seg_within_line > 0 { + let z_prev = (a * (seg_within_line as f32 - 1.0) + b).floor(); + if z == z_prev { + // Top edge is clipped + let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y); + xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x); + xy0 = Vec2::new(xt, tile_xy.y); + } else { + // If is_positive_slope, left edge is clipped, otherwise right + let x_clip = if is_positive_slope { + tile_xy.x + } else { + tile_xy1.x + }; + let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y); + xy0 = Vec2::new(x_clip, yt); + } + } + if seg_within_line < count - 1 { + let z_next = (a * (seg_within_line as f32 + 1.0) + b).floor(); + if z == z_next 
{
+                // Bottom edge is clipped
+                let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y);
+                xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x);
+                xy1 = Vec2::new(xt, tile_xy1.y);
+            } else {
+                // If is_positive_slope, right edge is clipped, otherwise left
+                let x_clip = if is_positive_slope {
+                    tile_xy1.x
+                } else {
+                    tile_xy.x
+                };
+                let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y);
+                xy1 = Vec2::new(x_clip, yt);
+            }
+        }
+        if !is_down {
+            (xy0, xy1) = (xy1, xy0);
+        }
+        // TODO: figure out what to do if both xy0 and xy1 are at the left edge
+        // Also TODO (part of move to 8 byte encoding for segments): don't store y_edge at all,
+        // resolve this in fine.
+        let y_edge = if xy0.x == tile_xy.x {
+            xy0.y
+        } else if xy1.x == tile_xy.x {
+            xy1.y
+        } else {
+            1e9
+        };
+        let segment = PathSegment {
+            origin: xy0.to_array(),
+            delta: (xy1 - xy0).to_array(),
+            y_edge,
+            _padding: Default::default(),
+        };
+        assert!(xy0.x >= tile_xy.x && xy0.x <= tile_xy1.x);
+        assert!(xy0.y >= tile_xy.y && xy0.y <= tile_xy1.y);
+        assert!(xy1.x >= tile_xy.x && xy1.x <= tile_xy1.x);
+        assert!(xy1.y >= tile_xy.y && xy1.y <= tile_xy1.y);
+        segments[(seg_start + seg_within_slice) as usize] = segment;
+    }
+}
+
+pub fn path_tiling(_n_wg: u32, resources: &[CpuBinding]) {
+    let mut bump = resources[0].as_typed_mut();
+    let seg_counts = resources[1].as_slice();
+    let lines = resources[2].as_slice();
+    let paths = resources[3].as_slice();
+    let tiles = resources[4].as_slice();
+    let mut segments = resources[5].as_slice_mut();
+    path_tiling_main(
+        &mut bump,
+        &seg_counts,
+        &lines,
+        &paths,
+        &tiles,
+        &mut segments,
+    );
+}
diff --git a/src/cpu_shader/path_tiling_setup.rs b/src/cpu_shader/path_tiling_setup.rs
new file mode 100644
index 000000000..32e08f9ae
--- /dev/null
+++ b/src/cpu_shader/path_tiling_setup.rs
@@ -0,0 +1,21 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BumpAllocators, IndirectCount};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) {
+    let segments = bump.seg_counts;
+    indirect.count_x = (segments + (WG_SIZE as u32 - 1)) / WG_SIZE as u32;
+    indirect.count_y = 1;
+    indirect.count_z = 1;
+}
+
+pub fn path_tiling_setup(_n_wg: u32, resources: &[CpuBinding]) {
+    let bump = resources[0].as_typed();
+    let mut indirect = resources[1].as_typed_mut();
+    path_tiling_setup_main(&bump, &mut indirect);
+}
diff --git a/src/cpu_shader/pathtag_reduce.rs b/src/cpu_shader/pathtag_reduce.rs
index 38ee55c18..58eb36c17 100644
--- a/src/cpu_shader/pathtag_reduce.rs
+++ b/src/cpu_shader/pathtag_reduce.rs
@@ -1,5 +1,5 @@
 // Copyright 2023 The Vello authors
-// SPDX-License-Identifier: Apache-2.0 OR MIT
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
 use vello_encoding::{ConfigUniform, Monoid, PathMonoid};
 
@@ -25,11 +25,8 @@ fn pathtag_reduce_main(
 }
 
 pub fn pathtag_reduce(n_wg: u32, resources: &[CpuBinding]) {
-    let r0 = resources[0].as_buf();
-    let r1 = resources[1].as_buf();
-    let mut r2 = resources[2].as_buf();
-    let config = bytemuck::from_bytes(&r0);
-    let scene = bytemuck::cast_slice(&r1);
-    let reduced = bytemuck::cast_slice_mut(r2.as_mut());
-    pathtag_reduce_main(n_wg, config, scene, reduced);
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    pathtag_reduce_main(n_wg, &config, 
+        &scene, &mut reduced);
 }
diff --git a/src/cpu_shader/pathtag_scan.rs b/src/cpu_shader/pathtag_scan.rs
new file mode 100644
index 000000000..8a8aa609a
--- /dev/null
+++ b/src/cpu_shader/pathtag_scan.rs
@@ -0,0 +1,37 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{ConfigUniform, Monoid, PathMonoid};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn pathtag_scan_main(
+    n_wg: u32,
+    config: &ConfigUniform,
+    scene: &[u32],
+    reduced: &[PathMonoid],
+    tag_monoids: &mut [PathMonoid],
+) {
+    let pathtag_base = config.layout.path_tag_base;
+    let mut prefix = PathMonoid::default();
+    for i in 0..n_wg {
+        let mut m = prefix;
+        for j in 0..WG_SIZE {
+            let ix = (i * WG_SIZE as u32) as usize + j;
+            tag_monoids[ix] = m;
+            let tag = scene[pathtag_base as usize + ix];
+            m = m.combine(&PathMonoid::new(tag));
+        }
+        prefix = prefix.combine(&reduced[i as usize]);
+    }
+}
+
+pub fn pathtag_scan(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let reduced = resources[2].as_slice();
+    let mut tag_monoids = resources[3].as_slice_mut();
+    pathtag_scan_main(n_wg, &config, &scene, &reduced, &mut tag_monoids);
+}
diff --git a/src/cpu_shader/tile_alloc.rs b/src/cpu_shader/tile_alloc.rs
new file mode 100644
index 000000000..367f28df2
--- /dev/null
+++ b/src/cpu_shader/tile_alloc.rs
@@ -0,0 +1,72 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{BumpAllocators, ConfigUniform, DrawTag, Path, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const TILE_WIDTH: usize = 16;
+const TILE_HEIGHT: usize = 16;
+const SX: f32 = 1.0 / (TILE_WIDTH as f32);
+const SY: f32 = 1.0 / (TILE_HEIGHT as f32);
+
+fn tile_alloc_main(
+    config: &ConfigUniform,
+    scene: &[u32],
+    draw_bboxes: &[[f32; 4]],
+    bump: &mut BumpAllocators,
+    paths: &mut [Path],
+    tiles: &mut [Tile],
+) {
+    let drawtag_base = config.layout.draw_tag_base;
+    let width_in_tiles = config.width_in_tiles as i32;
+    let height_in_tiles = config.height_in_tiles as i32;
+    for drawobj_ix in 0..config.layout.n_draw_objects {
+        let drawtag = DrawTag(scene[(drawtag_base + drawobj_ix) as usize]);
+        let mut x0 = 0;
+        let mut y0 = 0;
+        let mut x1 = 0;
+        let mut y1 = 0;
+        if drawtag != DrawTag::NOP && drawtag != DrawTag::END_CLIP {
+            let bbox = draw_bboxes[drawobj_ix as usize];
+            if bbox[0] < bbox[2] && bbox[1] < bbox[3] {
+                x0 = (bbox[0] * SX).floor() as i32;
+                y0 = (bbox[1] * SY).floor() as i32;
+                x1 = (bbox[2] * SX).ceil() as i32;
+                y1 = (bbox[3] * SY).ceil() as i32;
+            }
+        }
+        let ux0 = x0.clamp(0, width_in_tiles) as u32;
+        let uy0 = y0.clamp(0, height_in_tiles) as u32;
+        let ux1 = x1.clamp(0, width_in_tiles) as u32;
+        let uy1 = y1.clamp(0, height_in_tiles) as u32;
+        let tile_count = (ux1 - ux0) * (uy1 - uy0);
+        let offset = bump.tile;
+        bump.tile += tile_count;
+        // We construct it this way because padding is private.
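+        // (`Path::default()` zeroes the private padding field; the public
+        // `bbox` and `tiles` fields are then filled in individually below.)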
+        let mut path = Path::default();
+        path.bbox = [ux0, uy0, ux1, uy1];
+        path.tiles = offset;
+        paths[drawobj_ix as usize] = path;
+        for i in 0..tile_count {
+            tiles[(offset + i) as usize] = Tile::default();
+        }
+    }
+}
+
+pub fn tile_alloc(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let draw_bboxes = resources[2].as_slice();
+    let mut bump = resources[3].as_typed_mut();
+    let mut paths = resources[4].as_slice_mut();
+    let mut tiles = resources[5].as_slice_mut();
+    tile_alloc_main(
+        &config,
+        &scene,
+        &draw_bboxes,
+        &mut bump,
+        &mut paths,
+        &mut tiles,
+    );
+}
diff --git a/src/cpu_shader/util.rs b/src/cpu_shader/util.rs
new file mode 100644
index 000000000..2bb3279aa
--- /dev/null
+++ b/src/cpu_shader/util.rs
@@ -0,0 +1,113 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Utility types
+
+use vello_encoding::ConfigUniform;
+
+#[derive(Clone, Copy, Default, Debug)]
+#[repr(C)]
+pub struct Vec2 {
+    pub x: f32,
+    pub y: f32,
+}
+
+impl std::ops::Add for Vec2 {
+    type Output = Vec2;
+
+    fn add(self, rhs: Self) -> Self {
+        Vec2 {
+            x: self.x + rhs.x,
+            y: self.y + rhs.y,
+        }
+    }
+}
+
+impl std::ops::Sub for Vec2 {
+    type Output = Vec2;
+
+    fn sub(self, rhs: Self) -> Self {
+        Vec2 {
+            x: self.x - rhs.x,
+            y: self.y - rhs.y,
+        }
+    }
+}
+
+// Scalar multiplication; `Mul` must name `f32` as the right-hand side type,
+// since the default `Rhs = Self` would require a `Vec2` operand.
+impl std::ops::Mul<f32> for Vec2 {
+    type Output = Vec2;
+
+    fn mul(self, rhs: f32) -> Self {
+        Vec2 {
+            x: self.x * rhs,
+            y: self.y * rhs,
+        }
+    }
+}
+
+impl Vec2 {
+    pub fn new(x: f32, y: f32) -> Self {
+        Vec2 { x, y }
+    }
+
+    pub fn dot(self, other: Vec2) -> f32 {
+        self.x * other.x + self.y * other.y
+    }
+
+    pub fn length(self) -> f32 {
+        self.x.hypot(self.y)
+    }
+
+    pub fn to_array(self) -> [f32; 2] {
+        [self.x, self.y]
+    }
+
+    pub fn from_array(a: [f32; 2]) -> Self {
+        Vec2 { x: a[0], y: a[1] }
+    }
+
+    pub fn mix(self, other: Vec2, t: f32) -> Self {
+        let x = self.x + (other.x - self.x) * t;
+        let y = self.y + (other.y - self.y) * t;
+        Vec2 { x, y }
+    }
+}
+
+pub struct Transform(pub [f32; 6]);
+
+impl Transform {
+    pub fn apply(&self, p: Vec2) -> Vec2 {
+        let z = self.0;
+        let x = z[0] * p.x + z[2] * p.y + z[4];
+        let y = z[1] * p.x + z[3] * p.y + z[5];
+        Vec2 { x, y }
+    }
+
+    pub fn read(transform_base: u32, ix: u32, data: &[u32]) -> Transform {
+        let mut z = [0.0; 6];
+        let base = (transform_base + ix * 6) as usize;
+        for i in 0..6 {
+            z[i] = f32::from_bits(data[base + i]);
+        }
+        Transform(z)
+    }
+}
+
+/// Number of unit cells spanned by the interval from `a` to `b`, at least 1.
+pub fn span(a: f32, b: f32) -> u32 {
+    (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32
+}
+
+const DRAWTAG_NOP: u32 = 0;
+
+/// Read draw tag, guarded by number of draw objects.
+///
+/// The `ix` argument is allowed to exceed the number of draw objects,
+/// in which case a NOP is returned.
+pub fn read_draw_tag_from_scene(config: &ConfigUniform, scene: &[u32], ix: u32) -> u32 {
+    if ix < config.layout.n_draw_objects {
+        let tag_ix = config.layout.draw_tag_base + ix;
+        scene[tag_ix as usize]
+    } else {
+        DRAWTAG_NOP
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 55147acd8..28e5bf7ff 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -72,6 +72,8 @@ pub struct Renderer {
     profiler: GpuProfiler,
     #[cfg(feature = "wgpu-profiler")]
     pub profile_result: Option<Vec<wgpu_profiler::GpuTimerScopeResult>>,
+    #[cfg(feature = "hot_reload")]
+    use_cpu: bool,
 }
 
 /// Parameters used in a single render that are configurable by the client.
@@ -101,7 +103,10 @@ impl Renderer {
     /// Creates a new renderer for the specified device.
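     ///
     /// A minimal usage sketch (here `device` and a fully populated
     /// `options: RendererOptions` are assumed to already be in hand):
     ///
     /// ```ignore
     /// let mut renderer = Renderer::new(&device, &options)?;
     /// ```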
     pub fn new(device: &Device, render_options: &RendererOptions) -> Result<Self> {
         let mut engine = WgpuEngine::new();
-        let shaders = shaders::full_shaders(device, &mut engine, render_options.use_cpu)?;
+        let mut shaders = shaders::full_shaders(device, &mut engine)?;
+        if render_options.use_cpu {
+            shaders.install_cpu_shaders(&mut engine);
+        }
         let blit = render_options
             .surface_format
             .map(|surface_format| BlitPipeline::new(device, surface_format));
@@ -115,6 +120,8 @@ impl Renderer {
             profiler: GpuProfiler::new(3, render_options.timestamp_period, device.features()),
             #[cfg(feature = "wgpu-profiler")]
             profile_result: None,
+            #[cfg(feature = "hot_reload")]
+            use_cpu: render_options.use_cpu,
         })
     }
 
@@ -220,7 +227,10 @@ impl Renderer {
     pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> {
         device.push_error_scope(wgpu::ErrorFilter::Validation);
         let mut engine = WgpuEngine::new();
-        let shaders = shaders::full_shaders(device, &mut engine, false)?;
+        let mut shaders = shaders::full_shaders(device, &mut engine)?;
+        if self.use_cpu {
+            shaders.install_cpu_shaders(&mut engine);
+        }
         let error = device.pop_error_scope().await;
         if let Some(error) = error {
             return Err(error.into());
diff --git a/src/render.rs b/src/render.rs
index 462563633..268007faa 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -139,7 +139,8 @@ impl Render {
         );
         let mut pathtag_parent = reduced_buf;
         let mut large_pathtag_bufs = None;
-        if wg_counts.use_large_path_scan {
+        let use_large_path_scan = wg_counts.use_large_path_scan && !shaders.pathtag_is_cpu;
+        if use_large_path_scan {
             let reduced2_buf = ResourceProxy::new_buf(
                 buffer_sizes.path_reduced2.size_in_bytes().into(),
                 "reduced2_buf",
@@ -166,7 +167,7 @@ impl Render {
             buffer_sizes.path_monoids.size_in_bytes().into(),
             "tagmonoid_buf",
         );
-        let pathtag_scan = if wg_counts.use_large_path_scan {
+        let pathtag_scan = if use_large_path_scan {
             shaders.pathtag_scan_large
         } else {
             shaders.pathtag_scan
diff --git a/src/shaders.rs b/src/shaders.rs
index 23a3950f3..86e6ed7bd 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -79,14 +79,13 @@ pub struct FullShaders {
     pub path_tiling_setup: ShaderId,
     pub path_tiling: ShaderId,
     pub fine: ShaderId,
+    // 2-level dispatch works for CPU pathtag scan even for large
+    // inputs; 3-level is not yet implemented.
+    pub pathtag_is_cpu: bool,
 }
 
 #[cfg(feature = "wgpu")]
-pub fn full_shaders(
-    device: &Device,
-    engine: &mut WgpuEngine,
-    use_cpu: bool,
-) -> Result<FullShaders> {
+pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShaders> {
     let imports = SHARED_SHADERS
         .iter()
         .copied()
@@ -103,9 +102,6 @@ pub fn full_shaders(
         preprocess::preprocess(shader!("pathtag_reduce"), &full_config, &imports).into(),
         &[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
     )?;
-    if use_cpu {
-        engine.set_cpu_shader(pathtag_reduce, cpu_shader::pathtag_reduce);
-    }
     let pathtag_reduce2 = engine.add_shader(
         device,
         "pathtag_reduce2",
@@ -331,9 +327,42 @@ pub fn full_shaders(
         path_tiling_setup,
         path_tiling,
         fine,
+        pathtag_is_cpu: false,
     })
 }
 
+#[cfg(feature = "wgpu")]
+impl FullShaders {
+    /// Install the CPU shaders.
+    ///
+    /// There are a couple of things to note here. First, the granularity
+    /// provided by this method is coarse; it installs all the shaders. There
+    /// are many use cases (including debugging) where a mix is desired, or
+    /// where the choice between GPU and CPU dispatch might be dynamic.
+    ///
+    /// Second, the actual mapping to CPU shaders is not really specific to
+    /// the engine, and should be split out into a back-end agnostic struct.
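+    ///
+    /// A sketch of the intended call sequence, mirroring what `Renderer::new`
+    /// above does when `use_cpu` is set:
+    ///
+    /// ```ignore
+    /// let mut engine = WgpuEngine::new();
+    /// let mut shaders = shaders::full_shaders(device, &mut engine)?;
+    /// shaders.install_cpu_shaders(&mut engine);
+    /// ```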
+    pub fn install_cpu_shaders(&mut self, engine: &mut WgpuEngine) {
+        engine.set_cpu_shader(self.pathtag_reduce, cpu_shader::pathtag_reduce);
+        engine.set_cpu_shader(self.pathtag_scan, cpu_shader::pathtag_scan);
+        engine.set_cpu_shader(self.bbox_clear, cpu_shader::bbox_clear);
+        engine.set_cpu_shader(self.flatten, cpu_shader::flatten);
+        engine.set_cpu_shader(self.draw_reduce, cpu_shader::draw_reduce);
+        engine.set_cpu_shader(self.draw_leaf, cpu_shader::draw_leaf);
+        engine.set_cpu_shader(self.clip_reduce, cpu_shader::clip_reduce);
+        engine.set_cpu_shader(self.clip_leaf, cpu_shader::clip_leaf);
+        engine.set_cpu_shader(self.binning, cpu_shader::binning);
+        engine.set_cpu_shader(self.tile_alloc, cpu_shader::tile_alloc);
+        engine.set_cpu_shader(self.path_count_setup, cpu_shader::path_count_setup);
+        engine.set_cpu_shader(self.path_count, cpu_shader::path_count);
+        engine.set_cpu_shader(self.backdrop, cpu_shader::backdrop);
+        engine.set_cpu_shader(self.coarse, cpu_shader::coarse);
+        engine.set_cpu_shader(self.path_tiling_setup, cpu_shader::path_tiling_setup);
+        engine.set_cpu_shader(self.path_tiling, cpu_shader::path_tiling);
+        self.pathtag_is_cpu = true;
+    }
+}
+
 macro_rules! shared_shader {
     ($name:expr) => {
         (
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index 12380e32c..c5359c1bb 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -19,6 +19,7 @@ use crate::{
     BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId,
 };
 
+#[derive(Default)]
 pub struct WgpuEngine {
     shaders: Vec<Shader>,
     pool: ResourcePool,
@@ -90,12 +91,7 @@ enum TransientBuf<'a> {
 
 impl WgpuEngine {
     pub fn new() -> WgpuEngine {
-        WgpuEngine {
-            shaders: vec![],
-            pool: Default::default(),
-            bind_map: Default::default(),
-            downloads: Default::default(),
-        }
+        Default::default()
     }
 
     /// Add a shader.