From d2c2a267f4adebddfc889d5ba6d02e52f2428b61 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 19 Sep 2023 10:57:47 -0700 Subject: [PATCH 1/3] Add CPU shaders This commit adds a full pipeline of CPU shaders for the compute stage of the pipeline (not counting fine rasterization). It's based on what's in the multi branch, but also ports bounding box fixes and other cleanups. Currently the selection of shaders is all or nothing (based on the use-cpu flag), but will probably become more fine-grained. Going forward, the intent is to keep CPU and GPU shaders in sync. It has been quite valuable in diagnosing problems in the pipeline. --- crates/encoding/src/path.rs | 6 +- shader/draw_leaf.wgsl | 1 - src/cpu_dispatch.rs | 89 +++++-- src/cpu_shader/backdrop.rs | 30 +++ src/cpu_shader/bbox_clear.rs | 21 ++ src/cpu_shader/binning.rs | 127 ++++++++++ src/cpu_shader/clip_leaf.rs | 86 +++++++ src/cpu_shader/clip_reduce.rs | 57 +++++ src/cpu_shader/coarse.rs | 346 ++++++++++++++++++++++++++++ src/cpu_shader/draw_leaf.rs | 172 ++++++++++++++ src/cpu_shader/draw_reduce.rs | 32 +++ src/cpu_shader/fine.rs | 188 +++++++++++++++ src/cpu_shader/flatten.rs | 299 ++++++++++++++++++++++++ src/cpu_shader/mod.rs | 53 +++++ src/cpu_shader/path_count.rs | 157 +++++++++++++ src/cpu_shader/path_count_setup.rs | 21 ++ src/cpu_shader/path_tiling.rs | 152 ++++++++++++ src/cpu_shader/path_tiling_setup.rs | 21 ++ src/cpu_shader/pathtag_reduce.rs | 11 +- src/cpu_shader/pathtag_scan.rs | 37 +++ src/cpu_shader/tile_alloc.rs | 72 ++++++ src/cpu_shader/util.rs | 96 ++++++++ src/lib.rs | 14 +- src/shaders.rs | 40 +++- src/wgpu_engine.rs | 8 +- 25 files changed, 2091 insertions(+), 45 deletions(-) create mode 100644 src/cpu_shader/backdrop.rs create mode 100644 src/cpu_shader/bbox_clear.rs create mode 100644 src/cpu_shader/binning.rs create mode 100644 src/cpu_shader/clip_leaf.rs create mode 100644 src/cpu_shader/clip_reduce.rs create mode 100644 src/cpu_shader/coarse.rs create mode 100644 src/cpu_shader/draw_leaf.rs create mode 100644 src/cpu_shader/draw_reduce.rs create mode 100644 src/cpu_shader/fine.rs create mode 100644 src/cpu_shader/flatten.rs create mode 100644 src/cpu_shader/path_count.rs create mode 100644 src/cpu_shader/path_count_setup.rs create mode 100644 src/cpu_shader/path_tiling.rs create mode 100644 src/cpu_shader/path_tiling_setup.rs create mode 100644 src/cpu_shader/pathtag_scan.rs create mode 100644 src/cpu_shader/tile_alloc.rs create mode 100644 src/cpu_shader/util.rs diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs index 414ce23e6..b1d161525 100644 --- a/crates/encoding/src/path.rs +++ b/crates/encoding/src/path.rs @@ -227,8 +227,10 @@ pub struct Path { pub struct Tile { /// Accumulated backdrop at the left edge of the tile. pub backdrop: i32, - /// Index of first path segment. - pub segments: u32, + /// An enum that can hold either a count or an index to the + /// beginning of an allocated slice. In the latter case, the + /// bits are inverted. + pub segment_count_or_ix: u32, } /// Encoder for path segments. 
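The two states of the new `segment_count_or_ix` field are told apart by the inversion: `coarse` rewrites a tile's count to `!seg_ix` when it allocates the tile's segment slice, and `path_tiling` recovers the index with a second `!`. A minimal sketch of the convention (the enum and helper below are illustrative only, not code from this patch):

    enum SegmentsRef {
        Count(u32), // not yet allocated: number of segments crossing the tile
        Index(u32), // allocated: start index of the tile's segment slice
    }

    fn decode(v: u32) -> SegmentsRef {
        // Inverting a small index sets the high bit, so the sign bit
        // distinguishes the two cases and `!v` undoes the inversion.
        if (v as i32) < 0 {
            SegmentsRef::Index(!v)
        } else {
            SegmentsRef::Count(v)
        }
    }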
diff --git a/shader/draw_leaf.wgsl b/shader/draw_leaf.wgsl
index 6154b9256..827825974 100644
--- a/shader/draw_leaf.wgsl
+++ b/shader/draw_leaf.wgsl
@@ -108,7 +108,6 @@ fn main(
     // let x1 = f32(bbox.x1);
     // let y1 = f32(bbox.y1);
     // let bbox_f = vec4(x0, y0, x1, y1);
-    let fill_mode = u32(bbox.linewidth >= 0.0);
     var transform = Transform();
     var linewidth = bbox.linewidth;
     if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT ||
diff --git a/src/cpu_dispatch.rs b/src/cpu_dispatch.rs
index 0b8bbc86b..2c3409c16 100644
--- a/src/cpu_dispatch.rs
+++ b/src/cpu_dispatch.rs
@@ -4,10 +4,12 @@
 //! Support for CPU implementations of compute shaders.
 
 use std::{
-    cell::{RefCell, RefMut},
-    ops::Deref,
+    cell::{Ref, RefCell, RefMut},
+    ops::{Deref, DerefMut},
 };
 
+use bytemuck::Pod;
+
 #[derive(Clone, Copy)]
 pub enum CpuBinding<'a> {
     Buffer(&'a [u8]),
@@ -16,39 +18,88 @@ pub enum CpuBinding<'a> {
     Texture(&'a CpuTexture),
 }
 
-pub enum CpuBufGuard<'a> {
-    Slice(&'a [u8]),
-    Interior(RefMut<'a, Vec<u8>>),
+pub enum TypedBufGuard<'a, T: ?Sized> {
+    Slice(&'a T),
+    Interior(Ref<'a, T>),
+}
+
+pub enum TypedBufGuardMut<'a, T: ?Sized> {
+    Slice(&'a mut T),
+    Interior(RefMut<'a, T>),
 }
 
-impl<'a> Deref for CpuBufGuard<'a> {
-    type Target = [u8];
+impl<'a, T: ?Sized> Deref for TypedBufGuard<'a, T> {
+    type Target = T;
 
     fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Slice(s) => s,
-            CpuBufGuard::Interior(r) => r,
+            TypedBufGuard::Slice(s) => s,
+            TypedBufGuard::Interior(r) => r,
         }
     }
 }
 
-impl<'a> CpuBufGuard<'a> {
-    /// Get a mutable reference to the buffer.
-    ///
-    /// Panics if the underlying resource is read-only.
-    pub fn as_mut(&mut self) -> &mut [u8] {
+impl<'a, T: ?Sized> Deref for TypedBufGuardMut<'a, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Interior(r) => &mut *r,
-            _ => panic!("tried to borrow immutable buffer as mutable"),
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
+        }
+    }
+}
+
+impl<'a, T: ?Sized> DerefMut for TypedBufGuardMut<'a, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match self {
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
         }
     }
 }
 
 impl<'a> CpuBinding<'a> {
-    pub fn as_buf(&self) -> CpuBufGuard {
+    pub fn as_typed<T: Pod>(&self) -> TypedBufGuard<T> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::from_bytes(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::from_bytes(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_typed_mut<T: Pod>(&self) -> TypedBufGuardMut<T> {
+        match self {
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::from_bytes_mut(buf)
+                }))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice<T: Pod>(&self) -> TypedBufGuard<[T]> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::cast_slice(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::cast_slice(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice_mut<T: Pod>(&self) -> TypedBufGuardMut<[T]> {
         match self {
-            CpuBinding::Buffer(b) => CpuBufGuard::Slice(b),
-            CpuBinding::BufferRW(b) => CpuBufGuard::Interior(b.borrow_mut()),
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+
TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| { + bytemuck::cast_slice_mut(buf) + })) + } _ => panic!("resource type mismatch"), } } diff --git a/src/cpu_shader/backdrop.rs b/src/cpu_shader/backdrop.rs new file mode 100644 index 000000000..2a19fd843 --- /dev/null +++ b/src/cpu_shader/backdrop.rs @@ -0,0 +1,30 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{ConfigUniform, Path, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +fn backdrop_main(config: &ConfigUniform, paths: &[Path], tiles: &mut [Tile]) { + for drawobj_ix in 0..config.layout.n_draw_objects { + let path = paths[drawobj_ix as usize]; + let width = path.bbox[2] - path.bbox[0]; + let height = path.bbox[3] - path.bbox[1]; + let base = path.tiles; + for y in 0..height { + let mut sum = 0; + for x in 0..width { + let tile = &mut tiles[(base + y * width + x) as usize]; + sum += tile.backdrop; + tile.backdrop = sum; + } + } + } +} + +pub fn backdrop(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let paths = resources[1].as_slice(); + let mut tiles = resources[2].as_slice_mut(); + backdrop_main(&config, &paths, &mut tiles); +} diff --git a/src/cpu_shader/bbox_clear.rs b/src/cpu_shader/bbox_clear.rs new file mode 100644 index 000000000..014b905f5 --- /dev/null +++ b/src/cpu_shader/bbox_clear.rs @@ -0,0 +1,21 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{ConfigUniform, PathBbox}; + +use crate::cpu_dispatch::CpuBinding; + +fn bbox_clear_main(config: &ConfigUniform, path_bboxes: &mut [PathBbox]) { + for i in 0..(config.layout.n_paths as usize) { + path_bboxes[i].x0 = 0x7fff_ffff; + path_bboxes[i].y0 = 0x7fff_ffff; + path_bboxes[i].x1 = -0x8000_0000; + path_bboxes[i].y1 = -0x8000_0000; + } +} + +pub fn bbox_clear(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let mut path_bboxes = resources[1].as_slice_mut(); + bbox_clear_main(&config, &mut path_bboxes); +} diff --git a/src/cpu_shader/binning.rs b/src/cpu_shader/binning.rs new file mode 100644 index 000000000..8c2a79583 --- /dev/null +++ b/src/cpu_shader/binning.rs @@ -0,0 +1,127 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, PathBbox}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; +const TILE_WIDTH: usize = 16; +const TILE_HEIGHT: usize = 16; +const N_TILE_X: usize = 16; +const N_TILE_Y: usize = 16; +const SX: f32 = 1.0 / ((N_TILE_X * TILE_WIDTH) as f32); +const SY: f32 = 1.0 / ((N_TILE_Y * TILE_HEIGHT) as f32); + +fn bbox_intersect(a: [f32; 4], b: [f32; 4]) -> [f32; 4] { + [ + a[0].max(b[0]), + a[1].max(b[1]), + a[2].min(b[2]), + a[3].min(b[3]), + ] +} + +fn binning_main( + n_wg: u32, + config: &ConfigUniform, + draw_monoids: &[DrawMonoid], + path_bbox_buf: &[PathBbox], + clip_bbox_buf: &[[f32; 4]], + intersected_bbox: &mut [[f32; 4]], + bump: &mut BumpAllocators, + bin_data: &mut [u32], + bin_header: &mut [BinHeader], +) { + for wg in 0..n_wg as usize { + let mut counts = [0; WG_SIZE]; + let mut bboxes = [[0, 0, 0, 0]; WG_SIZE]; + let width_in_bins = + ((config.width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32) as i32; + let height_in_bins = + ((config.height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32) as i32; + for local_ix in 0..WG_SIZE { + let element_ix = wg * WG_SIZE + local_ix; + let mut x0 = 0; + let mut y0 = 0; + let 
mut x1 = 0; + let mut y1 = 0; + if element_ix < config.layout.n_draw_objects as usize { + let draw_monoid = draw_monoids[element_ix]; + let mut clip_bbox = [-1e9, -1e9, 1e9, 1e9]; + if draw_monoid.clip_ix > 0 { + clip_bbox = clip_bbox_buf[draw_monoid.clip_ix as usize - 1]; + } + let path_bbox = path_bbox_buf[draw_monoid.path_ix as usize]; + let pb = [ + path_bbox.x0 as f32, + path_bbox.y0 as f32, + path_bbox.x1 as f32, + path_bbox.y1 as f32, + ]; + let bbox = bbox_intersect(clip_bbox, pb); + intersected_bbox[element_ix] = bbox; + if bbox[0] < bbox[2] && bbox[1] < bbox[3] { + x0 = (bbox[0] * SX).floor() as i32; + y0 = (bbox[1] * SY).floor() as i32; + x1 = (bbox[2] * SX).ceil() as i32; + y1 = (bbox[3] * SY).ceil() as i32; + } + } + x0 = x0.clamp(0, width_in_bins); + y0 = y0.clamp(0, height_in_bins); + x1 = x1.clamp(0, width_in_bins); + y1 = y1.clamp(0, height_in_bins); + for y in y0..y1 { + for x in x0..x1 { + counts[(y * width_in_bins + x) as usize] += 1; + } + } + bboxes[local_ix] = [x0, y0, x1, y1]; + } + let mut chunk_offset = [0; WG_SIZE]; + for local_ix in 0..WG_SIZE { + let global_ix = wg * WG_SIZE + local_ix; + chunk_offset[local_ix] = bump.binning; + bump.binning += counts[local_ix]; + bin_header[global_ix] = BinHeader { + element_count: counts[local_ix], + chunk_offset: chunk_offset[local_ix], + }; + } + for local_ix in 0..WG_SIZE { + let element_ix = wg * WG_SIZE + local_ix; + let bbox = bboxes[local_ix]; + for y in bbox[1]..bbox[3] { + for x in bbox[0]..bbox[2] { + let bin_ix = (y * width_in_bins + x) as usize; + let ix = config.layout.bin_data_start + chunk_offset[bin_ix]; + bin_data[ix as usize] = element_ix as u32; + chunk_offset[bin_ix] += 1; + } + } + } + } +} + +pub fn binning(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let draw_monoids = resources[1].as_slice(); + let path_bbox_buf = resources[2].as_slice(); + let clip_bbox_buf = resources[3].as_slice(); + let mut intersected_bbox = resources[4].as_slice_mut(); + let mut bump = resources[5].as_typed_mut(); + let mut bin_data = resources[6].as_slice_mut(); + let mut bin_header = resources[7].as_slice_mut(); + binning_main( + n_wg, + &config, + &draw_monoids, + &path_bbox_buf, + &clip_bbox_buf, + &mut intersected_bbox, + &mut bump, + &mut bin_data, + &mut bin_header, + ); +} diff --git a/src/cpu_shader/clip_leaf.rs b/src/cpu_shader/clip_leaf.rs new file mode 100644 index 000000000..63b528cf1 --- /dev/null +++ b/src/cpu_shader/clip_leaf.rs @@ -0,0 +1,86 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{Clip, ConfigUniform, DrawMonoid, PathBbox}; + +use crate::cpu_dispatch::CpuBinding; + +struct ClipStackElement { + // index of draw object + parent_ix: u32, + path_ix: u32, + bbox: [f32; 4], +} + +const BIG_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9]; + +// Note: this implementation doesn't rigorously follow the +// WGSL original. In particular, it just computes the clips +// sequentially rather than using the partition reductions. 
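+// Instead, a single sequential pass keeps an explicit stack of open
+// clips: a begin clip pushes its intersected bbox, and the matching end
+// clip pops it and back-fills its own draw monoid from the popped entry.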
+fn clip_leaf_main(
+    config: &ConfigUniform,
+    clip_inp: &[Clip],
+    path_bboxes: &[PathBbox],
+    draw_monoids: &mut [DrawMonoid],
+    clip_bboxes: &mut [[f32; 4]],
+) {
+    let mut stack: Vec<ClipStackElement> = Vec::new();
+    for global_ix in 0..config.layout.n_clips {
+        let clip_el = clip_inp[global_ix as usize];
+        if clip_el.path_ix >= 0 {
+            // begin clip
+            let path_ix = clip_el.path_ix as u32;
+            let path_bbox = path_bboxes[path_ix as usize];
+            let p_bbox = [
+                path_bbox.x0 as f32,
+                path_bbox.y0 as f32,
+                path_bbox.x1 as f32,
+                path_bbox.y1 as f32,
+            ];
+            let bbox = if let Some(last) = stack.last() {
+                [
+                    p_bbox[0].max(last.bbox[0]),
+                    p_bbox[1].max(last.bbox[1]),
+                    p_bbox[2].min(last.bbox[2]),
+                    p_bbox[3].min(last.bbox[3]),
+                ]
+            } else {
+                p_bbox
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            let parent_ix = clip_el.ix;
+            stack.push(ClipStackElement {
+                parent_ix,
+                path_ix,
+                bbox,
+            });
+        } else {
+            // end clip
+            let tos = stack.pop().unwrap();
+            let bbox = if let Some(nos) = stack.last() {
+                nos.bbox
+            } else {
+                BIG_BBOX
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            draw_monoids[clip_el.ix as usize].path_ix = tos.path_ix;
+            draw_monoids[clip_el.ix as usize].scene_offset =
+                draw_monoids[tos.parent_ix as usize].scene_offset;
+        }
+    }
+}
+
+pub fn clip_leaf(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let clip_inp = resources[1].as_slice();
+    let path_bboxes = resources[2].as_slice();
+    let mut draw_monoids = resources[5].as_slice_mut();
+    let mut clip_bboxes = resources[6].as_slice_mut();
+    clip_leaf_main(
+        &config,
+        &clip_inp,
+        &path_bboxes,
+        &mut draw_monoids,
+        &mut clip_bboxes,
+    );
+}
diff --git a/src/cpu_shader/clip_reduce.rs b/src/cpu_shader/clip_reduce.rs
new file mode 100644
index 000000000..a8433bdee
--- /dev/null
+++ b/src/cpu_shader/clip_reduce.rs
@@ -0,0 +1,57 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{Clip, ClipBic, ClipElement, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn clip_reduce_main(
+    n_wg: u32,
+    clip_inp: &[Clip],
+    path_bboxes: &[PathBbox],
+    reduced: &mut [ClipBic],
+    clip_out: &mut [ClipElement],
+) {
+    let mut scratch = Vec::with_capacity(WG_SIZE);
+    for wg_ix in 0..n_wg {
+        scratch.clear();
+        let mut bic_reduced = ClipBic::default();
+        // reverse scan
+        for local_ix in (0..WG_SIZE).rev() {
+            let global_ix = wg_ix as usize * WG_SIZE + local_ix;
+            let inp = clip_inp[global_ix].path_ix;
+            let is_push = inp >= 0;
+            let bic = ClipBic::new(1 - is_push as u32, is_push as u32);
+            bic_reduced = bic.combine(bic_reduced);
+            if is_push && bic_reduced.a == 0 {
+                scratch.push(global_ix as u32);
+            }
+        }
+        reduced[wg_ix as usize] = bic_reduced;
+        for (i, parent_ix) in scratch.iter().rev().enumerate() {
+            let mut clip_el = ClipElement::default();
+            clip_el.parent_ix = *parent_ix;
+            let path_ix = clip_inp[*parent_ix as usize].path_ix;
+            let path_bbox = path_bboxes[path_ix as usize];
+            clip_el.bbox = [
+                path_bbox.x0 as f32,
+                path_bbox.y0 as f32,
+                path_bbox.x1 as f32,
+                path_bbox.y1 as f32,
+            ];
+            let global_ix = wg_ix as usize * WG_SIZE + i;
+            clip_out[global_ix] = clip_el;
+        }
+    }
+}
+
+pub fn clip_reduce(n_wg: u32, resources: &[CpuBinding]) {
+    // TODO: probably remove config, it's not needed
+    let clip_inp = resources[0].as_slice();
+    let path_bboxes = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    let mut clip_out = resources[3].as_slice_mut();
+    clip_reduce_main(n_wg, &clip_inp, &path_bboxes, &mut
reduced, &mut clip_out); +} diff --git a/src/cpu_shader/coarse.rs b/src/cpu_shader/coarse.rs new file mode 100644 index 000000000..69e5d5113 --- /dev/null +++ b/src/cpu_shader/coarse.rs @@ -0,0 +1,346 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, DrawTag, Path, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::{ + CMD_BEGIN_CLIP, CMD_COLOR, CMD_END, CMD_END_CLIP, CMD_FILL, CMD_IMAGE, CMD_JUMP, CMD_LIN_GRAD, + CMD_RAD_GRAD, CMD_SOLID, PTCL_INITIAL_ALLOC, +}; + +const N_TILE_X: usize = 16; +const N_TILE_Y: usize = 16; +const N_TILE: usize = N_TILE_X * N_TILE_Y; + +const PTCL_INCREMENT: u32 = 256; +const PTCL_HEADROOM: u32 = 2; + +// Modeled in the WGSL as private-scoped variables +struct TileState { + cmd_offset: u32, + cmd_limit: u32, +} + +impl TileState { + fn new(tile_ix: u32) -> TileState { + let cmd_offset = tile_ix * PTCL_INITIAL_ALLOC; + let cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM); + TileState { + cmd_offset, + cmd_limit, + } + } + + fn alloc_cmd( + &mut self, + size: u32, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ) { + if self.cmd_offset + size >= self.cmd_limit { + let ptcl_dyn_start = + config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC; + let chunk_size = PTCL_INCREMENT.max(size + PTCL_HEADROOM); + let new_cmd = ptcl_dyn_start + bump.ptcl; + bump.ptcl += chunk_size; + ptcl[self.cmd_offset as usize] = CMD_JUMP; + ptcl[self.cmd_offset as usize + 1] = new_cmd; + self.cmd_offset = new_cmd; + self.cmd_limit = new_cmd + (PTCL_INCREMENT - PTCL_HEADROOM); + } + } + + fn write(&mut self, ptcl: &mut [u32], offset: u32, value: u32) { + ptcl[(self.cmd_offset + offset) as usize] = value; + } + + // TODO: handle even/odd winding rule + fn write_path( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + tile: &mut Tile, + ) { + let n_segs = tile.segment_count_or_ix; + if n_segs != 0 { + let seg_ix = bump.segments; + tile.segment_count_or_ix = !seg_ix; + bump.segments += n_segs; + self.alloc_cmd(4, config, bump, ptcl); + self.write(ptcl, 0, CMD_FILL); + let even_odd = false; // TODO + let size_and_rule = (n_segs << 1) | (even_odd as u32); + self.write(ptcl, 1, size_and_rule); + self.write(ptcl, 2, seg_ix); + self.write(ptcl, 3, tile.backdrop as u32); + self.cmd_offset += 4; + } else { + self.alloc_cmd(1, config, bump, ptcl); + self.write(ptcl, 0, CMD_SOLID); + self.cmd_offset += 1; + } + } + + fn write_color( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + rgba_color: u32, + ) { + self.alloc_cmd(2, config, bump, ptcl); + self.write(ptcl, 0, CMD_COLOR); + self.write(ptcl, 1, rgba_color); + self.cmd_offset += 2; + } + + fn write_image( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + info_offset: u32, + ) { + self.alloc_cmd(2, config, bump, ptcl); + self.write(ptcl, 0, CMD_IMAGE); + self.write(ptcl, 1, info_offset); + self.cmd_offset += 2; + } + + fn write_grad( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ty: u32, + index: u32, + info_offset: u32, + ) { + self.alloc_cmd(3, config, bump, ptcl); + self.write(ptcl, 0, ty); + self.write(ptcl, 1, index); + self.write(ptcl, 2, info_offset); + self.cmd_offset += 3; + } + + fn write_begin_clip( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ) { + 
self.alloc_cmd(1, config, bump, ptcl); + self.write(ptcl, 0, CMD_BEGIN_CLIP); + self.cmd_offset += 1; + } + + fn write_end_clip( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + blend: u32, + alpha: f32, + ) { + self.alloc_cmd(3, config, bump, ptcl); + self.write(ptcl, 0, CMD_END_CLIP); + self.write(ptcl, 1, blend); + self.write(ptcl, 2, f32::to_bits(alpha)); + self.cmd_offset += 3; + } +} + +fn coarse_main( + config: &ConfigUniform, + scene: &[u32], + draw_monoids: &[DrawMonoid], + bin_headers: &[BinHeader], + info_bin_data: &[u32], + paths: &[Path], + tiles: &mut [Tile], + bump: &mut BumpAllocators, + ptcl: &mut [u32], +) { + let width_in_tiles = config.width_in_tiles; + let height_in_tiles = config.height_in_tiles; + let width_in_bins = (width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32; + let height_in_bins = (height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32; + let n_bins = width_in_bins * height_in_bins; + let bin_data_start = config.layout.bin_data_start; + let drawtag_base = config.layout.draw_tag_base; + let mut compacted = vec![vec![]; N_TILE]; + let n_partitions = (config.layout.n_draw_objects + N_TILE as u32 - 1) / N_TILE as u32; + for bin in 0..n_bins { + for v in &mut compacted { + v.clear(); + } + let bin_x = bin % width_in_bins; + let bin_y = bin / width_in_bins; + let bin_tile_x = N_TILE_X as u32 * bin_x; + let bin_tile_y = N_TILE_Y as u32 * bin_y; + for part in 0..n_partitions { + let in_ix = part * N_TILE as u32 + bin; + let bin_header = bin_headers[in_ix as usize]; + let start = bin_data_start + bin_header.chunk_offset; + for i in 0..bin_header.element_count { + let drawobj_ix = info_bin_data[(start + i) as usize]; + let tag = scene[(drawtag_base + drawobj_ix) as usize]; + if DrawTag(tag) != DrawTag::NOP { + let draw_monoid = draw_monoids[drawobj_ix as usize]; + let path_ix = draw_monoid.path_ix; + let path = paths[path_ix as usize]; + let dx = path.bbox[0] as i32 - bin_tile_x as i32; + let dy = path.bbox[1] as i32 - bin_tile_y as i32; + let x0 = dx.clamp(0, N_TILE_X as i32); + let y0 = dy.clamp(0, N_TILE_Y as i32); + let x1 = (path.bbox[2] as i32 - bin_tile_x as i32).clamp(0, N_TILE_X as i32); + let y1 = (path.bbox[3] as i32 - bin_tile_y as i32).clamp(0, N_TILE_Y as i32); + for y in y0..y1 { + for x in x0..x1 { + compacted[(y * N_TILE_X as i32 + x) as usize].push(drawobj_ix); + } + } + } + } + } + // compacted now has the list of draw objects for each tile. + // While the WGSL source does at most 256 draw objects at a time, + // this version does all the draw objects in a tile. + for tile_ix in 0..N_TILE { + let tile_x = (tile_ix % N_TILE_X) as u32; + let tile_y = (tile_ix / N_TILE_X) as u32; + let this_tile_ix = (bin_tile_y + tile_y) * width_in_tiles + bin_tile_x + tile_x; + let mut tile_state = TileState::new(this_tile_ix); + let blend_offset = tile_state.cmd_offset; + tile_state.cmd_offset += 1; + // Discussion question: do these belong in tile state? 
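+            // clip_depth counts the currently open clips; clip_zero_depth is
+            // nonzero while drawing is suppressed inside an empty clip and
+            // records the depth at which suppression began.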
+ let mut clip_depth = 0; + let mut clip_zero_depth = 0; + for drawobj_ix in &compacted[tile_ix] { + let drawtag = scene[(drawtag_base + drawobj_ix) as usize]; + if clip_zero_depth == 0 { + let draw_monoid = draw_monoids[*drawobj_ix as usize]; + let path_ix = draw_monoid.path_ix; + let path = paths[path_ix as usize]; + let bbox = path.bbox; + let stride = bbox[2] - bbox[0]; + let x = bin_tile_x + tile_x - bbox[0]; + let y = bin_tile_y + tile_y - bbox[1]; + let tile = &mut tiles[(path.tiles + y * stride + x) as usize]; + let is_clip = (drawtag & 1) != 0; + let mut is_blend = false; + let dd = config.layout.draw_data_base + draw_monoid.scene_offset; + let di = draw_monoid.info_offset; + if is_clip { + const BLEND_CLIP: u32 = (128 << 8) | 3; + let blend = scene[dd as usize]; + is_blend = blend != BLEND_CLIP; + } + let n_segs = tile.segment_count_or_ix; + let include_tile = n_segs != 0 || (tile.backdrop == 0) == is_clip || is_blend; + if include_tile { + // TODO: get drawinfo (linewidth for fills) + match DrawTag(drawtag) { + DrawTag::COLOR => { + tile_state.write_path(config, bump, ptcl, tile); + let rgba_color = scene[dd as usize]; + tile_state.write_color(config, bump, ptcl, rgba_color); + } + DrawTag::IMAGE => { + tile_state.write_path(config, bump, ptcl, tile); + tile_state.write_image(config, bump, ptcl, di + 1); + } + DrawTag::LINEAR_GRADIENT => { + tile_state.write_path(config, bump, ptcl, tile); + let index = scene[dd as usize]; + tile_state.write_grad( + config, + bump, + ptcl, + CMD_LIN_GRAD, + index, + di + 1, + ); + } + DrawTag::RADIAL_GRADIENT => { + tile_state.write_path(config, bump, ptcl, tile); + let index = scene[dd as usize]; + tile_state.write_grad( + config, + bump, + ptcl, + CMD_RAD_GRAD, + index, + di + 1, + ); + } + DrawTag::BEGIN_CLIP => { + if tile.segment_count_or_ix == 0 && tile.backdrop == 0 { + clip_zero_depth = clip_depth + 1; + } else { + tile_state.write_begin_clip(config, bump, ptcl); + // TODO: update blend depth + } + clip_depth += 1; + } + DrawTag::END_CLIP => { + clip_depth -= 1; + tile_state.write_path(config, bump, ptcl, tile); + let blend = scene[dd as usize]; + let alpha = f32::from_bits(scene[dd as usize + 1]); + tile_state.write_end_clip(config, bump, ptcl, blend, alpha); + } + _ => todo!(), + } + } + } else { + // In "clip zero" state, suppress all drawing + match DrawTag(drawtag) { + DrawTag::BEGIN_CLIP => clip_depth += 1, + DrawTag::END_CLIP => { + if clip_depth == clip_zero_depth { + clip_zero_depth = 0; + } + clip_depth -= 1; + } + _ => (), + } + } + } + + if bin_tile_x + tile_x < width_in_tiles && bin_tile_y + tile_y < height_in_tiles { + ptcl[tile_state.cmd_offset as usize] = CMD_END; + let scratch_size = 0; // TODO: actually compute + ptcl[blend_offset as usize] = bump.blend; + bump.blend += scratch_size; + } + } + } +} + +pub fn coarse(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let draw_monoids = resources[2].as_slice(); + let bin_headers = resources[3].as_slice(); + let info_bin_data = resources[4].as_slice(); + let paths = resources[5].as_slice(); + let mut tiles = resources[6].as_slice_mut(); + let mut bump = resources[7].as_typed_mut(); + let mut ptcl = resources[8].as_slice_mut(); + coarse_main( + &config, + &scene, + &draw_monoids, + &bin_headers, + &info_bin_data, + &paths, + &mut tiles, + &mut bump, + &mut ptcl, + ); +} diff --git a/src/cpu_shader/draw_leaf.rs b/src/cpu_shader/draw_leaf.rs new file mode 100644 index 000000000..4837d024e --- /dev/null +++ 
b/src/cpu_shader/draw_leaf.rs
@@ -0,0 +1,172 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{Clip, ConfigUniform, DrawMonoid, DrawTag, Monoid, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::{Transform, Vec2};
+
+const WG_SIZE: usize = 256;
+
+fn draw_leaf_main(
+    n_wg: u32,
+    config: &ConfigUniform,
+    scene: &[u32],
+    reduced: &[DrawMonoid],
+    path_bbox: &[PathBbox],
+    draw_monoid: &mut [DrawMonoid],
+    info: &mut [u32],
+    clip_inp: &mut [Clip],
+) {
+    let drawtag_base = config.layout.draw_tag_base;
+    let mut prefix = DrawMonoid::default();
+    for i in 0..n_wg {
+        let mut m = prefix;
+        for j in 0..WG_SIZE {
+            let ix = i * WG_SIZE as u32 + j as u32;
+            let tag_raw = if ix < config.layout.n_draw_objects {
+                scene[(drawtag_base + ix) as usize]
+            } else {
+                0
+            };
+            let tag_word = DrawTag(tag_raw);
+            // store exclusive prefix sum
+            if ix < config.layout.n_draw_objects {
+                draw_monoid[ix as usize] = m;
+            }
+            let dd = config.layout.draw_data_base + m.scene_offset;
+            let di = m.info_offset as usize;
+            if tag_word == DrawTag::COLOR
+                || tag_word == DrawTag::LINEAR_GRADIENT
+                || tag_word == DrawTag::RADIAL_GRADIENT
+                || tag_word == DrawTag::IMAGE
+                || tag_word == DrawTag::BEGIN_CLIP
+            {
+                let bbox = path_bbox[m.path_ix as usize];
+                let transform = Transform::read(config.layout.transform_base, bbox.trans_ix, scene);
+                let linewidth = bbox.linewidth;
+                match tag_word {
+                    DrawTag::COLOR => {
+                        info[di] = f32::to_bits(linewidth);
+                    }
+                    DrawTag::LINEAR_GRADIENT => {
+                        info[di] = f32::to_bits(linewidth);
+                        let p0 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 1]),
+                            f32::from_bits(scene[dd as usize + 2]),
+                        );
+                        let p1 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 3]),
+                            f32::from_bits(scene[dd as usize + 4]),
+                        );
+                        let p0 = transform.apply(p0);
+                        let p1 = transform.apply(p1);
+                        let dxy = p1 - p0;
+                        let scale = 1.0 / dxy.dot(dxy);
+                        let line_xy = dxy * scale;
+                        let line_c = -p0.dot(line_xy);
+                        info[di + 1] = f32::to_bits(line_xy.x);
+                        info[di + 2] = f32::to_bits(line_xy.y);
+                        info[di + 3] = f32::to_bits(line_c);
+                    }
+                    DrawTag::RADIAL_GRADIENT => {
+                        info[di] = f32::to_bits(linewidth);
+                        let p0 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 1]),
+                            f32::from_bits(scene[dd as usize + 2]),
+                        );
+                        let p1 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 3]),
+                            f32::from_bits(scene[dd as usize + 4]),
+                        );
+                        let r0 = f32::from_bits(scene[dd as usize + 5]);
+                        let r1 = f32::from_bits(scene[dd as usize + 6]);
+                        let z = transform.0;
+                        let inv_det = (z[0] * z[3] - z[1] * z[2]).recip();
+                        let inv_mat = [
+                            z[3] * inv_det,
+                            -z[1] * inv_det,
+                            -z[2] * inv_det,
+                            z[0] * inv_det,
+                        ];
+                        let inv_tr = [
+                            -(inv_mat[0] * z[4] + inv_mat[2] * z[5]) - p0.x,
+                            -(inv_mat[1] * z[4] + inv_mat[3] * z[5]) - p0.y,
+                        ];
+                        let center1 = p1 - p0;
+                        let rr = r1 / (r1 - r0);
+                        let ra_inv = rr / (r1 * r1 - center1.dot(center1));
+                        let c1 = center1 * ra_inv;
+                        let ra = rr * ra_inv;
+                        let roff = rr - 1.0;
+                        info[di + 1] = f32::to_bits(inv_mat[0]);
+                        info[di + 2] = f32::to_bits(inv_mat[1]);
+                        info[di + 3] = f32::to_bits(inv_mat[2]);
+                        info[di + 4] = f32::to_bits(inv_mat[3]);
+                        info[di + 5] = f32::to_bits(inv_tr[0]);
+                        info[di + 6] = f32::to_bits(inv_tr[1]);
+                        info[di + 7] = f32::to_bits(c1.x);
+                        info[di + 8] = f32::to_bits(c1.y);
+                        info[di + 9] = f32::to_bits(ra);
+                        info[di + 10] = f32::to_bits(roff);
+                    }
+                    DrawTag::IMAGE => {
+                        info[di] = f32::to_bits(linewidth);
+                        let z = transform.0;
+                        let inv_det = (z[0] * z[3] - z[1] * z[2]).recip();
+                        let
inv_mat = [ + z[3] * inv_det, + -z[1] * inv_det, + -z[2] * inv_det, + z[0] * inv_det, + ]; + let inv_tr = [ + -(inv_mat[0] * z[4] + inv_mat[2] * z[5]), + -(inv_mat[1] * z[4] + inv_mat[3] * z[5]), + ]; + info[di + 1] = f32::to_bits(inv_mat[0]); + info[di + 2] = f32::to_bits(inv_mat[1]); + info[di + 3] = f32::to_bits(inv_mat[2]); + info[di + 4] = f32::to_bits(inv_mat[3]); + info[di + 5] = f32::to_bits(inv_tr[0]); + info[di + 6] = f32::to_bits(inv_tr[1]); + info[di + 7] = scene[dd as usize]; + info[di + 8] = scene[dd as usize + 1]; + } + DrawTag::BEGIN_CLIP => (), + _ => todo!("unhandled draw tag {:x}", tag_word.0), + } + } + if tag_word == DrawTag::BEGIN_CLIP { + let path_ix = m.path_ix as i32; + clip_inp[m.clip_ix as usize] = Clip { ix, path_ix }; + } else if tag_word == DrawTag::END_CLIP { + let path_ix = !ix as i32; + clip_inp[m.clip_ix as usize] = Clip { ix, path_ix }; + } + m = m.combine(&DrawMonoid::new(tag_word)); + } + prefix = prefix.combine(&reduced[i as usize]); + } +} + +pub fn draw_leaf(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let reduced = resources[2].as_slice(); + let path_bbox = resources[3].as_slice(); + let mut draw_monoid = resources[4].as_slice_mut(); + let mut info = resources[5].as_slice_mut(); + let mut clip_inp = resources[6].as_slice_mut(); + draw_leaf_main( + n_wg, + &config, + &scene, + &reduced, + &path_bbox, + &mut draw_monoid, + &mut info, + &mut clip_inp, + ); +} diff --git a/src/cpu_shader/draw_reduce.rs b/src/cpu_shader/draw_reduce.rs new file mode 100644 index 000000000..9ec876e96 --- /dev/null +++ b/src/cpu_shader/draw_reduce.rs @@ -0,0 +1,32 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{ConfigUniform, DrawMonoid, DrawTag, Monoid}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; + +fn draw_reduce_main(n_wg: u32, config: &ConfigUniform, scene: &[u32], reduced: &mut [DrawMonoid]) { + let drawtag_base = config.layout.draw_tag_base; + for i in 0..n_wg { + let mut m = DrawMonoid::default(); + for j in 0..WG_SIZE { + let ix = i * WG_SIZE as u32 + j as u32; + let tag = if ix < config.layout.n_draw_objects { + scene[(drawtag_base + ix) as usize] + } else { + 0 + }; + m = m.combine(&DrawMonoid::new(DrawTag(tag))); + } + reduced[i as usize] = m; + } +} + +pub fn draw_reduce(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let mut reduced = resources[2].as_slice_mut(); + draw_reduce_main(n_wg, &config, &scene, &mut reduced); +} diff --git a/src/cpu_shader/fine.rs b/src/cpu_shader/fine.rs new file mode 100644 index 000000000..a28cfe697 --- /dev/null +++ b/src/cpu_shader/fine.rs @@ -0,0 +1,188 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{ConfigUniform, PathSegment, Tile}; + +use crate::cpu_dispatch::CpuTexture; + +use super::{CMD_COLOR, CMD_END, CMD_FILL, CMD_JUMP, CMD_SOLID, PTCL_INITIAL_ALLOC}; + +// These should also move into a common area +const TILE_WIDTH: usize = 16; +const TILE_HEIGHT: usize = 16; +const TILE_SIZE: usize = TILE_WIDTH * TILE_HEIGHT; + +fn read_color(ptcl: &[u32], offset: u32) -> u32 { + ptcl[(offset + 1) as usize] +} + +struct CmdFill { + size_and_rule: u32, + seg_data: u32, + backdrop: i32, +} + +fn read_fill(ptcl: &[u32], offset: u32) -> CmdFill { + let size_and_rule = ptcl[(offset + 1) as usize]; + let seg_data = ptcl[(offset + 2) as usize]; + 
let backdrop = ptcl[(offset + 3) as usize] as i32; + CmdFill { + size_and_rule, + seg_data, + backdrop, + } +} + +fn unpack4x8unorm(x: u32) -> [f32; 4] { + let mut result = [0.0; 4]; + for i in 0..4 { + result[i] = ((x >> (i * 8)) & 0xff) as f32 * (1.0 / 255.0); + } + result +} + +fn pack4x8unorm(x: [f32; 4]) -> u32 { + let mut result = 0; + for i in 0..4 { + let byte = (x[i].clamp(0.0, 1.0) * 255.0).round() as u32; + result |= byte << (i * 8); + } + result +} + +fn fill_path(area: &mut [f32], segments: &[PathSegment], fill: &CmdFill, x_tile: f32, y_tile: f32) { + let n_segs = fill.size_and_rule >> 1; + let even_odd = (fill.size_and_rule & 1) != 0; + let backdrop_f = fill.backdrop as f32; + for a in area.iter_mut() { + *a = backdrop_f; + } + for segment in &segments[fill.seg_data as usize..][..n_segs as usize] { + for yi in 0..TILE_HEIGHT { + let y = segment.origin[1] - (y_tile + yi as f32); + let y0 = y.clamp(0.0, 1.0); + let y1 = (y + segment.delta[1]).clamp(0.0, 1.0); + let dy = y0 - y1; + let y_edge = segment.delta[0].signum() + * (y_tile + yi as f32 - segment.y_edge + 1.0).clamp(0.0, 1.0); + if dy != 0.0 { + let vec_y_recip = segment.delta[1].recip(); + let t0 = (y0 - y) * vec_y_recip; + let t1 = (y1 - y) * vec_y_recip; + let startx = segment.origin[0] - x_tile; + let x0 = startx + t0 * segment.delta[0]; + let x1 = startx + t1 * segment.delta[0]; + let xmin0 = x0.min(x1); + let xmax0 = x0.max(x1); + for i in 0..TILE_WIDTH { + let i_f = i as f32; + let xmin = (xmin0 - i_f).min(1.0) - 1.0e-6; + let xmax = xmax0 - i_f; + let b = xmax.min(1.0); + let c = b.max(0.0); + let d = xmin.max(0.0); + let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); + area[yi * TILE_WIDTH + i] += y_edge + a * dy; + } + } else if y_edge != 0.0 { + for i in 0..TILE_WIDTH { + area[yi * TILE_WIDTH + i] += y_edge; + } + } + } + } + if even_odd { + for a in area.iter_mut() { + { + *a = (*a - 2.0 * (0.5 * *a).round()).abs(); + } + } + } else { + for a in area.iter_mut() { + { + *a = a.abs().min(1.0); + } + } + } +} + +// Note: this is a draft. Texture resources are not yet wired up, so it +// has not yet been tested. +#[allow(unused)] +fn fine_main( + config: &ConfigUniform, + tiles: &[Tile], + segments: &[PathSegment], + output: &mut CpuTexture, + ptcl: &[u32], + info: &[u32], + // TODO: image texture resources + // TODO: masks? 
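+    // Note: each tile walks its PTCL command list from its fixed slot,
+    // accumulating coverage in `area` and compositing into `rgba`.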
+) {
+    let width_in_tiles = config.width_in_tiles;
+    let height_in_tiles = config.height_in_tiles;
+    let n_tiles = width_in_tiles * height_in_tiles;
+    let mut area = vec![0.0f32; TILE_SIZE];
+    let mut rgba = vec![[0.0f32; 4]; TILE_SIZE];
+    for tile_ix in 0..n_tiles {
+        for x in &mut rgba {
+            *x = [0.0; 4];
+        }
+        for a in &mut area {
+            *a = 0.0;
+        }
+        let tile_x = tile_ix % width_in_tiles;
+        let tile_y = tile_ix / width_in_tiles;
+        let mut cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
+        // skip over blend stack allocation
+        cmd_ix += 1;
+        loop {
+            let tag = ptcl[cmd_ix as usize];
+            if tag == CMD_END {
+                break;
+            }
+            match tag {
+                CMD_FILL => {
+                    let fill = read_fill(ptcl, cmd_ix);
+                    // x0 and y0 will go away when we do tile-relative coords
+                    let x0 = (tile_x as usize * TILE_WIDTH) as f32;
+                    let y0 = (tile_y as usize * TILE_HEIGHT) as f32;
+                    fill_path(&mut area, segments, &fill, x0, y0);
+                    cmd_ix += 4;
+                }
+                CMD_SOLID => {
+                    for a in &mut area {
+                        *a = 1.0;
+                    }
+                    // CMD_SOLID is a single word (see write_path in coarse)
+                    cmd_ix += 1;
+                }
+                CMD_COLOR => {
+                    let color = read_color(ptcl, cmd_ix);
+                    let fg = unpack4x8unorm(color);
+                    let fg = [fg[3], fg[2], fg[1], fg[0]];
+                    for i in 0..TILE_SIZE {
+                        let ai = area[i];
+                        let fg_i = [fg[0] * ai, fg[1] * ai, fg[2] * ai, fg[3] * ai];
+                        for j in 0..4 {
+                            rgba[i][j] = rgba[i][j] * (1.0 - fg_i[3]) + fg_i[j];
+                        }
+                    }
+                    cmd_ix += 2;
+                }
+                CMD_JUMP => {
+                    cmd_ix = ptcl[(cmd_ix + 1) as usize];
+                }
+                _ => todo!("unhandled ptcl command {tag}"),
+            }
+        }
+        // Write tile (in rgba)
+        for y in 0..TILE_HEIGHT {
+            let base =
+                output.width * (tile_y as usize * TILE_HEIGHT + y) + tile_x as usize * TILE_WIDTH;
+            for x in 0..TILE_WIDTH {
+                let rgba32 = pack4x8unorm(rgba[y * TILE_WIDTH + x]);
+                output.pixels[base + x] = rgba32;
+            }
+        }
+    }
+}
diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs
new file mode 100644
index 000000000..41cc949d5
--- /dev/null
+++ b/src/cpu_shader/flatten.rs
@@ -0,0 +1,299 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::{Transform, Vec2};
+use vello_encoding::{BumpAllocators, ConfigUniform, LineSoup, Monoid, PathBbox, PathMonoid};
+
+fn to_minus_one_quarter(x: f32) -> f32 {
+    // could also be written x.powf(-0.25)
+    x.sqrt().sqrt().recip()
+}
+
+const D: f32 = 0.67;
+fn approx_parabola_integral(x: f32) -> f32 {
+    x * to_minus_one_quarter(1.0 - D + (D * D * D * D + 0.25 * x * x))
+}
+
+const B: f32 = 0.39;
+fn approx_parabola_inv_integral(x: f32) -> f32 {
+    x * (1.0 - B + (B * B + 0.5 * x * x)).sqrt()
+}
+
+#[derive(Clone, Copy, Default)]
+struct SubdivResult {
+    val: f32,
+    a0: f32,
+    a2: f32,
+}
+
+fn estimate_subdiv(p0: Vec2, p1: Vec2, p2: Vec2, sqrt_tol: f32) -> SubdivResult {
+    let d01 = p1 - p0;
+    let d12 = p2 - p1;
+    let dd = d01 - d12;
+    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    let cross_inv = if cross.abs() < 1.0e-9 {
+        1.0e9
+    } else {
+        cross.recip()
+    };
+    let x0 = d01.dot(dd) * cross_inv;
+    let x2 = d12.dot(dd) * cross_inv;
+    let scale = (cross / (dd.length() * (x2 - x0))).abs();
+    let a0 = approx_parabola_integral(x0);
+    let a2 = approx_parabola_integral(x2);
+    let mut val = 0.0;
+    if scale < 1e9 {
+        let da = (a2 - a0).abs();
+        let sqrt_scale = scale.sqrt();
+        if x0.signum() == x2.signum() {
+            val = sqrt_scale;
+        } else {
+            let xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol / approx_parabola_integral(xmin);
+        }
+        val *= da;
+    }
+    SubdivResult { val, a0, a2 }
+}
+
+fn eval_quad(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 {
+    let mt = 1.0 - t;
+    p0 * (mt * mt) + (p1
* (mt * 2.0) + p2 * t) * t +} + +fn eval_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> Vec2 { + let mt = 1.0 - t; + p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t +} + +const MAX_QUADS: u32 = 16; + +struct Cubic { + p0: Vec2, + p1: Vec2, + p2: Vec2, + p3: Vec2, + path_ix: u32, +} + +fn flatten_cubic(cubic: Cubic, line_ix: &mut usize, lines: &mut [LineSoup]) { + let p0 = cubic.p0; + let p1 = cubic.p1; + let p2 = cubic.p2; + let p3 = cubic.p3; + let err_v = (p2 - p1) * 3.0 + p0 - p3; + let err = err_v.dot(err_v); + const ACCURACY: f32 = 0.25; + const Q_ACCURACY: f32 = ACCURACY * 0.1; + const REM_ACCURACY: f32 = ACCURACY - Q_ACCURACY; + const MAX_HYPOT2: f32 = 432.0 * Q_ACCURACY * Q_ACCURACY; + let mut n_quads = ((err * (1.0 / MAX_HYPOT2)).powf(1.0 / 6.0).ceil() as u32).max(1); + n_quads = n_quads.min(MAX_QUADS); + let mut keep_params = [SubdivResult::default(); MAX_QUADS as usize]; + let mut val = 0.0; + let mut qp0 = p0; + let step = (n_quads as f32).recip(); + for i in 0..n_quads { + let t = (i + 1) as f32 * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; + let params = estimate_subdiv(qp0, qp1, qp2, REM_ACCURACY.sqrt()); + keep_params[i as usize] = params; + val += params.val; + qp0 = qp2; + } + let n = ((val * (0.5 / REM_ACCURACY.sqrt())).ceil() as u32).max(1); + let mut lp0 = p0; + qp0 = p0; + let v_step = val / (n as f32); + let mut n_out = 1; + let mut val_sum = 0.0; + for i in 0..n_quads { + let t = (i + 1) as f32 * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; + let params = keep_params[i as usize]; + let u0 = approx_parabola_inv_integral(params.a0); + let u2 = approx_parabola_inv_integral(params.a2); + let uscale = (u2 - u0).recip(); + let mut val_target = (n_out as f32) * v_step; + while n_out == n || val_target < val_sum + params.val { + let lp1 = if n_out == n { + p3 + } else { + let u = (val_target - val_sum) / params.val; + let a = params.a0 + (params.a2 - params.a0) * u; + let au = approx_parabola_inv_integral(a); + let t = (au - u0) * uscale; + eval_quad(qp0, qp1, qp2, t) + }; + let ls = LineSoup { + path_ix: cubic.path_ix, + _padding: Default::default(), + p0: lp0.to_array(), + p1: lp1.to_array(), + }; + lines[*line_ix] = ls; + *line_ix += 1; + n_out += 1; + val_target += v_step; + lp0 = lp1; + } + val_sum += params.val; + qp0 = qp2; + } +} + +fn read_f32_point(ix: u32, pathdata: &[u32]) -> Vec2 { + let x = f32::from_bits(pathdata[ix as usize]); + let y = f32::from_bits(pathdata[ix as usize + 1]); + Vec2 { x, y } +} + +struct IntBbox { + x0: i32, + y0: i32, + x1: i32, + y1: i32, +} + +impl Default for IntBbox { + fn default() -> Self { + IntBbox { + x0: 0x7fff_ffff, + y0: 0x7fff_ffff, + x1: -0x8000_0000, + y1: -0x8000_0000, + } + } +} + +impl IntBbox { + fn add_pt(&mut self, pt: Vec2) { + self.x0 = self.x0.min(pt.x.floor() as i32); + self.y0 = self.y0.min(pt.y.floor() as i32); + self.x1 = self.x1.max(pt.x.ceil() as i32); + self.y1 = self.y1.max(pt.y.ceil() as i32); + } +} + +// TODO: we're skipping i16 point reading as it's not present in our scenes + +const WG_SIZE: usize = 256; + +const PATH_TAG_SEG_TYPE: u8 = 3; +const PATH_TAG_PATH: u8 = 0x10; +const PATH_TAG_LINETO: u8 = 1; +const PATH_TAG_QUADTO: u8 = 2; +const PATH_TAG_CUBICTO: u8 = 3; +const PATH_TAG_F32: u8 = 8; + +fn flatten_main( + n_wg: u32, + config: &ConfigUniform, + 
scene: &[u32], + tag_monoids: &[PathMonoid], + path_bboxes: &mut [PathBbox], + bump: &mut BumpAllocators, + lines: &mut [LineSoup], +) { + let mut line_ix = 0; + let mut bbox = IntBbox::default(); + for ix in 0..n_wg as usize * WG_SIZE { + let tag_word = scene[config.layout.path_tag_base as usize + (ix >> 2)]; + let shift = (ix & 3) * 8; + let mut tm = PathMonoid::new(tag_word & ((1 << shift) - 1)); + let tag_byte = (tag_word >> shift) as u8; + if tag_byte != 0 { + tm = tag_monoids[ix >> 2].combine(&tm); + } + let linewidth = + f32::from_bits(scene[(config.layout.linewidth_base + tm.linewidth_ix) as usize]); + if (tag_byte & PATH_TAG_PATH) != 0 { + let out = &mut path_bboxes[tm.path_ix as usize]; + out.linewidth = linewidth; + out.trans_ix = tm.trans_ix; + } + let seg_type = tag_byte & PATH_TAG_SEG_TYPE; + let pathdata = &scene[config.layout.path_data_base as usize..]; + if seg_type != 0 { + let mut p0; + let mut p1; + let mut p2 = Vec2::default(); + let mut p3 = Vec2::default(); + if (tag_byte & PATH_TAG_F32) != 0 { + p0 = read_f32_point(tm.pathseg_offset, pathdata); + p1 = read_f32_point(tm.pathseg_offset + 2, pathdata); + if seg_type >= PATH_TAG_QUADTO { + p2 = read_f32_point(tm.pathseg_offset + 4, pathdata); + if seg_type == PATH_TAG_CUBICTO { + p3 = read_f32_point(tm.pathseg_offset + 6, pathdata); + } + } + } else { + todo!("i16 path data not supported yet"); + } + let transform = Transform::read(config.layout.transform_base, tm.trans_ix, scene); + p0 = transform.apply(p0); + bbox.add_pt(p0); + p1 = transform.apply(p1); + bbox.add_pt(p1); + if seg_type == PATH_TAG_LINETO { + p3 = p1; + p2 = p3.mix(p0, 1.0 / 3.0); + p1 = p0.mix(p3, 1.0 / 3.0); + } else if seg_type >= PATH_TAG_QUADTO { + p2 = transform.apply(p2); + bbox.add_pt(p2); + if seg_type == PATH_TAG_CUBICTO { + p3 = transform.apply(p3); + bbox.add_pt(p3); + } else { + p3 = p2; + p2 = p1.mix(p2, 1.0 / 3.0); + p1 = p1.mix(p0, 1.0 / 3.0); + } + } + let path_ix = tm.path_ix; + let cubic = Cubic { + p0, + p1, + p2, + p3, + path_ix, + }; + flatten_cubic(cubic, &mut line_ix, lines); + } + if (tag_byte & PATH_TAG_PATH) != 0 { + let out = &mut path_bboxes[tm.path_ix as usize]; + out.x0 = bbox.x0; + out.y0 = bbox.y0; + out.x1 = bbox.x1; + out.y1 = bbox.y1; + bbox = IntBbox::default(); + } + } + bump.lines = line_ix as u32; +} + +pub fn flatten(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let tag_monoids = resources[2].as_slice(); + let mut path_bboxes = resources[3].as_slice_mut(); + let mut bump = resources[4].as_typed_mut(); + let mut lines = resources[5].as_slice_mut(); + flatten_main( + n_wg, + &config, + &scene, + &tag_monoids, + &mut path_bboxes, + &mut bump, + &mut lines, + ); +} diff --git a/src/cpu_shader/mod.rs b/src/cpu_shader/mod.rs index fed341c75..6257aaf07 100644 --- a/src/cpu_shader/mod.rs +++ b/src/cpu_shader/mod.rs @@ -3,6 +3,59 @@ //! CPU implementations of shader stages. 
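+//! Each entry point takes the dispatched workgroup count plus the shader's
+//! resources, bound in the same order as the bindings of the corresponding
+//! WGSL shader, so the engine can dispatch either implementation.
+//!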
+// Allow un-idiomatic Rust to more closely match shaders +#![allow(clippy::needless_range_loop)] +#![allow(clippy::too_many_arguments)] + +mod backdrop; +mod bbox_clear; +mod binning; +mod clip_leaf; +mod clip_reduce; +mod coarse; +mod draw_leaf; +mod draw_reduce; +mod fine; +mod flatten; +mod path_count; +mod path_count_setup; +mod path_tiling; +mod path_tiling_setup; mod pathtag_reduce; +mod pathtag_scan; +mod tile_alloc; +mod util; +pub use backdrop::backdrop; +pub use bbox_clear::bbox_clear; +pub use binning::binning; +pub use clip_leaf::clip_leaf; +pub use clip_reduce::clip_reduce; +pub use coarse::coarse; +pub use draw_leaf::draw_leaf; +pub use draw_reduce::draw_reduce; +pub use flatten::flatten; +pub use path_count::path_count; +pub use path_count_setup::path_count_setup; +pub use path_tiling::path_tiling; +pub use path_tiling_setup::path_tiling_setup; pub use pathtag_reduce::pathtag_reduce; +pub use pathtag_scan::pathtag_scan; +pub use tile_alloc::tile_alloc; + +// Common definitions + +const PTCL_INITIAL_ALLOC: u32 = 64; + +// Tags for PTCL commands +const CMD_END: u32 = 0; +const CMD_FILL: u32 = 1; +//const CMD_STROKE: u32 = 2; +const CMD_SOLID: u32 = 3; +const CMD_COLOR: u32 = 5; +const CMD_LIN_GRAD: u32 = 6; +const CMD_RAD_GRAD: u32 = 7; +const CMD_IMAGE: u32 = 8; +const CMD_BEGIN_CLIP: u32 = 9; +const CMD_END_CLIP: u32 = 10; +const CMD_JUMP: u32 = 11; diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs new file mode 100644 index 000000000..cc5f79f5a --- /dev/null +++ b/src/cpu_shader/path_count.rs @@ -0,0 +1,157 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{BumpAllocators, LineSoup, Path, SegmentCount, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{span, Vec2}; + +const TILE_SCALE: f32 = 1.0 / 16.0; + +fn path_count_main( + bump: &mut BumpAllocators, + lines: &[LineSoup], + paths: &[Path], + tile: &mut [Tile], + seg_counts: &mut [SegmentCount], +) { + for line_ix in 0..bump.lines { + let line = lines[line_ix as usize]; + let p0 = Vec2::from_array(line.p0); + let p1 = Vec2::from_array(line.p1); + let is_down = p1.y >= p0.y; + let (xy0, xy1) = if is_down { (p0, p1) } else { (p1, p0) }; + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1; + + let dx = (s1.x - s0.x).abs(); + let dy = s1.y - s0.y; + if dx + dy == 0.0 { + continue; + } + if dy == 0.0 && s0.y.floor() == s0.y { + continue; + } + let dy_dxdy = dy / (dx + dy); + let a = 1.0 - dy_dxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = if is_positive_slope { 1.0 } else { -1.0 }; + let xt0 = (s0.x * sign).floor(); + let c = s0.x * sign - xt0; + let y0 = s0.y.floor(); + let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; + let b = dy_dxdy * c + a * (ytop - s0.y); + let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; + + let path = paths[line.path_ix as usize]; + let bbox = path.bbox; + let bbox = [ + bbox[0] as i32, + bbox[1] as i32, + bbox[2] as i32, + bbox[3] as i32, + ]; + let xmin = s0.x.min(s1.x); + let stride = bbox[2] - bbox[0]; + if s0.y >= bbox[3] as f32 || s1.y < bbox[1] as f32 || xmin >= bbox[2] as f32 || stride == 0 + { + continue; + } + // Clip line to bounding box. Clipping is done in "i" space. 
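+        // Here i indexes the tile boundary crossings along the line, so
+        // [imin, imax) selects the crossings that fall inside the path bbox.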
+ let mut imin = 0; + if s0.y < bbox[1] as f32 { + let mut iminf = ((bbox[1] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0; + if y0 + iminf - (a * iminf + b).floor() < bbox[1] as f32 { + iminf += 1.0; + } + imin = iminf as u32; + } + let mut imax = count; + if s1.y > bbox[3] as f32 { + let mut imaxf = ((bbox[3] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0; + if y0 + imaxf - (a * imaxf + b).floor() < bbox[3] as f32 { + imaxf += 1.0; + } + imax = imaxf as u32; + } + let delta = if is_down { -1 } else { 1 }; + let mut ymin = 0; + let mut ymax = 0; + if s0.x.max(s1.x) < bbox[0] as f32 { + ymin = s0.y.ceil() as i32; + ymax = s1.y.ceil() as i32; + imax = imin; + } else { + let fudge = if is_positive_slope { 0.0 } else { 1.0 }; + if xmin < bbox[0] as f32 { + let mut f = ((sign * (bbox[0] as f32 - x0) - b + fudge) / a).round(); + if (x0 + sign * (a * f + b).floor() < bbox[0] as f32) == is_positive_slope { + f += 1.0; + } + let ynext = (y0 + f - (a * f + b).floor() + 1.0) as i32; + if is_positive_slope { + if f as u32 > imin { + ymin = (y0 + if y0 == s0.y { 0.0 } else { 1.0 }) as i32; + ymax = ynext; + imin = f as u32; + } + } else if (f as u32) < imax { + ymin = ynext; + ymax = s1.y.ceil() as i32; + imax = f as u32; + } + } + if s0.x.max(s1.x) > bbox[2] as f32 { + let mut f = ((sign * (bbox[2] as f32 - x0) - b + fudge) / a).round(); + if (x0 + sign * (a * f + b).floor() < bbox[2] as f32) == is_positive_slope { + f += 1.0; + } + if is_positive_slope { + imax = imax.min(f as u32); + } else { + imin = imin.max(f as u32); + } + } + } + imax = imin.max(imax); + ymin = ymin.max(bbox[1]); + ymax = ymax.min(bbox[3]); + for y in ymin..ymax { + let base = path.tiles as i32 + (y - bbox[1]) * stride; + tile[base as usize].backdrop += delta; + } + let mut last_z = (a * (imin as f32 - 1.0) + b).floor(); + let seg_base = bump.seg_counts; + bump.seg_counts += imax - imin; + for i in imin..imax { + let zf = a * i as f32 + b; + let z = zf.floor(); + let y = (y0 + i as f32 - z) as i32; + let x = (x0 + sign * z) as i32; + let base = path.tiles as i32 + (y - bbox[1]) * stride - bbox[0]; + let top_edge = if i == 0 { y0 == s0.y } else { last_z == z }; + if top_edge && x + 1 < bbox[2] { + let x_bump = (x + 1).max(bbox[0]); + tile[(base + x_bump) as usize].backdrop += delta; + } + // .segments is another name for the .count field; it's overloaded + let seg_within_slice = tile[(base + x) as usize].segment_count_or_ix; + tile[(base + x) as usize].segment_count_or_ix += 1; + let counts = (seg_within_slice << 16) | i; + let seg_count = SegmentCount { line_ix, counts }; + seg_counts[(seg_base + i - imin) as usize] = seg_count; + last_z = z; + } + } +} + +pub fn path_count(_n_wg: u32, resources: &[CpuBinding]) { + let mut bump = resources[1].as_typed_mut(); + let lines = resources[2].as_slice(); + let paths = resources[3].as_slice(); + let mut tile = resources[4].as_slice_mut(); + let mut seg_counts = resources[5].as_slice_mut(); + path_count_main(&mut bump, &lines, &paths, &mut tile, &mut seg_counts); +} diff --git a/src/cpu_shader/path_count_setup.rs b/src/cpu_shader/path_count_setup.rs new file mode 100644 index 000000000..1327d708d --- /dev/null +++ b/src/cpu_shader/path_count_setup.rs @@ -0,0 +1,21 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{BumpAllocators, IndirectCount}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; + +fn path_count_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { + let lines = bump.lines; + 
indirect.count_x = (lines + (WG_SIZE as u32 - 1)) / WG_SIZE as u32; + indirect.count_y = 1; + indirect.count_z = 1; +} + +pub fn path_count_setup(_n_wg: u32, resources: &[CpuBinding]) { + let bump = resources[0].as_typed(); + let mut indirect = resources[1].as_typed_mut(); + path_count_setup_main(&bump, &mut indirect); +} diff --git a/src/cpu_shader/path_tiling.rs b/src/cpu_shader/path_tiling.rs new file mode 100644 index 000000000..fdd2d97db --- /dev/null +++ b/src/cpu_shader/path_tiling.rs @@ -0,0 +1,152 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{BumpAllocators, LineSoup, Path, PathSegment, SegmentCount, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{span, Vec2}; + +const TILE_WIDTH: u32 = 16; +const TILE_HEIGHT: u32 = 16; +const TILE_SCALE: f32 = 1.0 / 16.0; + +fn path_tiling_main( + bump: &mut BumpAllocators, + seg_counts: &[SegmentCount], + lines: &[LineSoup], + paths: &[Path], + tiles: &[Tile], + segments: &mut [PathSegment], +) { + for seg_ix in 0..bump.seg_counts { + let seg_count = seg_counts[seg_ix as usize]; + let line = lines[seg_count.line_ix as usize]; + let counts = seg_count.counts; + let seg_within_slice = counts >> 16; + let seg_within_line = counts & 0xffff; + + // coarse rasterization logic + let p0 = Vec2::from_array(line.p0); + let p1 = Vec2::from_array(line.p1); + let is_down = p1.y >= p0.y; + let (mut xy0, mut xy1) = if is_down { (p0, p1) } else { (p1, p0) }; + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1; + + let dx = (s1.x - s0.x).abs(); + let dy = s1.y - s0.y; + let dy_dxdy = dy / (dx + dy); + let a = 1.0 - dy_dxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = if is_positive_slope { 1.0 } else { -1.0 }; + let xt0 = (s0.x * sign).floor(); + let c = s0.x * sign - xt0; + let y0 = s0.y.floor(); + let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; + let b = dy_dxdy * c + a * (ytop - s0.y); + let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; + let z = (a * seg_within_line as f32 + b).floor(); + let x = x0 as i32 + (sign * z) as i32; + let y = (y0 + seg_within_line as f32 - z) as i32; + + let path = paths[line.path_ix as usize]; + let bbox = path.bbox; + let bbox = [ + bbox[0] as i32, + bbox[1] as i32, + bbox[2] as i32, + bbox[3] as i32, + ]; + let stride = bbox[2] - bbox[0]; + let tile_ix = path.tiles as i32 + (y - bbox[1]) * stride + x - bbox[0]; + let tile = tiles[tile_ix as usize]; + let seg_start = !tile.segment_count_or_ix; + if (seg_start as i32) < 0 { + continue; + } + let tile_xy = Vec2::new(x as f32 * TILE_WIDTH as f32, y as f32 * TILE_HEIGHT as f32); + let tile_xy1 = tile_xy + Vec2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32); + + if seg_within_line > 0 { + let z_prev = (a * (seg_within_line as f32 - 1.0) + b).floor(); + if z == z_prev { + // Top edge is clipped + let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y); + xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x); + xy0 = Vec2::new(xt, tile_xy.y); + } else { + // If is_positive_slope, left edge is clipped, otherwise right + let x_clip = if is_positive_slope { + tile_xy.x + } else { + tile_xy1.x + }; + let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y); + xy0 = Vec2::new(x_clip, yt); + } + } + if seg_within_line < count - 1 { + let z_next = (a * (seg_within_line as f32 + 1.0) + b).floor(); + if z == z_next { + // Bottom edge is 
clipped + let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y); + xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x); + xy1 = Vec2::new(xt, tile_xy1.y); + } else { + // If is_positive_slope, right edge is clipped, otherwise left + let x_clip = if is_positive_slope { + tile_xy1.x + } else { + tile_xy.x + }; + let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y); + xy1 = Vec2::new(x_clip, yt); + } + } + if !is_down { + (xy0, xy1) = (xy1, xy0); + } + // TODO: figure out what to if both xy0 and xy1 are at left edge + // Also TODO (part of move to 8 byte encoding for segments): don't store y_edge at all, + // resolve this in fine. + let y_edge = if xy0.x == tile_xy.x { + xy0.y + } else if xy1.x == tile_xy.x { + xy1.y + } else { + 1e9 + }; + let segment = PathSegment { + origin: xy0.to_array(), + delta: (xy1 - xy0).to_array(), + y_edge, + _padding: Default::default(), + }; + assert!(xy0.x >= tile_xy.x && xy0.x <= tile_xy1.x); + assert!(xy0.y >= tile_xy.y && xy0.y <= tile_xy1.y); + assert!(xy1.x >= tile_xy.x && xy1.x <= tile_xy1.x); + assert!(xy1.y >= tile_xy.y && xy1.y <= tile_xy1.y); + segments[(seg_start + seg_within_slice) as usize] = segment; + } +} + +pub fn path_tiling(_n_wg: u32, resources: &[CpuBinding]) { + let mut bump = resources[0].as_typed_mut(); + let seg_counts = resources[1].as_slice(); + let lines = resources[2].as_slice(); + let paths = resources[3].as_slice(); + let tiles = resources[4].as_slice(); + let mut segments = resources[5].as_slice_mut(); + path_tiling_main( + &mut bump, + &seg_counts, + &lines, + &paths, + &tiles, + &mut segments, + ); +} diff --git a/src/cpu_shader/path_tiling_setup.rs b/src/cpu_shader/path_tiling_setup.rs new file mode 100644 index 000000000..8efee31b0 --- /dev/null +++ b/src/cpu_shader/path_tiling_setup.rs @@ -0,0 +1,21 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{BumpAllocators, IndirectCount}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; + +fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { + let segments = bump.seg_counts; + indirect.count_x = (segments + (WG_SIZE as u32 - 1)) / WG_SIZE as u32; + indirect.count_y = 1; + indirect.count_z = 1; +} + +pub fn path_tiling_setup(_n_wg: u32, resources: &[CpuBinding]) { + let bump = resources[0].as_typed(); + let mut indirect = resources[1].as_typed_mut(); + path_tiling_setup_main(&bump, &mut indirect); +} diff --git a/src/cpu_shader/pathtag_reduce.rs b/src/cpu_shader/pathtag_reduce.rs index 38ee55c18..31979e8ed 100644 --- a/src/cpu_shader/pathtag_reduce.rs +++ b/src/cpu_shader/pathtag_reduce.rs @@ -25,11 +25,8 @@ fn pathtag_reduce_main( } pub fn pathtag_reduce(n_wg: u32, resources: &[CpuBinding]) { - let r0 = resources[0].as_buf(); - let r1 = resources[1].as_buf(); - let mut r2 = resources[2].as_buf(); - let config = bytemuck::from_bytes(&r0); - let scene = bytemuck::cast_slice(&r1); - let reduced = bytemuck::cast_slice_mut(r2.as_mut()); - pathtag_reduce_main(n_wg, config, scene, reduced); + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let mut reduced = resources[2].as_slice_mut(); + pathtag_reduce_main(n_wg, &config, &scene, &mut reduced); } diff --git a/src/cpu_shader/pathtag_scan.rs b/src/cpu_shader/pathtag_scan.rs new file mode 100644 index 000000000..8a8aa609a --- /dev/null +++ b/src/cpu_shader/pathtag_scan.rs @@ -0,0 +1,37 @@ +// Copyright 2023 The Vello 
authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{ConfigUniform, Monoid, PathMonoid}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; + +fn pathtag_scan_main( + n_wg: u32, + config: &ConfigUniform, + scene: &[u32], + reduced: &[PathMonoid], + tag_monoids: &mut [PathMonoid], +) { + let pathtag_base = config.layout.path_tag_base; + let mut prefix = PathMonoid::default(); + for i in 0..n_wg { + let mut m = prefix; + for j in 0..WG_SIZE { + let ix = (i * WG_SIZE as u32) as usize + j; + tag_monoids[ix] = m; + let tag = scene[pathtag_base as usize + ix]; + m = m.combine(&PathMonoid::new(tag)); + } + prefix = prefix.combine(&reduced[i as usize]); + } +} + +pub fn pathtag_scan(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let reduced = resources[2].as_slice(); + let mut tag_monoids = resources[3].as_slice_mut(); + pathtag_scan_main(n_wg, &config, &scene, &reduced, &mut tag_monoids); +} diff --git a/src/cpu_shader/tile_alloc.rs b/src/cpu_shader/tile_alloc.rs new file mode 100644 index 000000000..367f28df2 --- /dev/null +++ b/src/cpu_shader/tile_alloc.rs @@ -0,0 +1,72 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_encoding::{BumpAllocators, ConfigUniform, DrawTag, Path, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +const TILE_WIDTH: usize = 16; +const TILE_HEIGHT: usize = 16; +const SX: f32 = 1.0 / (TILE_WIDTH as f32); +const SY: f32 = 1.0 / (TILE_HEIGHT as f32); + +fn tile_alloc_main( + config: &ConfigUniform, + scene: &[u32], + draw_bboxes: &[[f32; 4]], + bump: &mut BumpAllocators, + paths: &mut [Path], + tiles: &mut [Tile], +) { + let drawtag_base = config.layout.draw_tag_base; + let width_in_tiles = config.width_in_tiles as i32; + let height_in_tiles = config.height_in_tiles as i32; + for drawobj_ix in 0..config.layout.n_draw_objects { + let drawtag = DrawTag(scene[(drawtag_base + drawobj_ix) as usize]); + let mut x0 = 0; + let mut y0 = 0; + let mut x1 = 0; + let mut y1 = 0; + if drawtag != DrawTag::NOP && drawtag != DrawTag::END_CLIP { + let bbox = draw_bboxes[drawobj_ix as usize]; + if bbox[0] < bbox[2] && bbox[1] < bbox[3] { + x0 = (bbox[0] * SX).floor() as i32; + y0 = (bbox[1] * SY).floor() as i32; + x1 = (bbox[2] * SX).ceil() as i32; + y1 = (bbox[3] * SY).ceil() as i32; + } + } + let ux0 = x0.clamp(0, width_in_tiles) as u32; + let uy0 = y0.clamp(0, height_in_tiles) as u32; + let ux1 = x1.clamp(0, width_in_tiles) as u32; + let uy1 = y1.clamp(0, height_in_tiles) as u32; + let tile_count = (ux1 - ux0) * (uy1 - uy0); + let offset = bump.tile; + bump.tile += tile_count; + // We construct it this way because padding is private. 
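+ // (Specifically, the padding field of Path is not public outside the + // encoding crate, so a struct literal can't be used; fill in the + // public fields on a default value instead.)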
+ let mut path = Path::default(); + path.bbox = [ux0, uy0, ux1, uy1]; + path.tiles = offset; + paths[drawobj_ix as usize] = path; + for i in 0..tile_count { + tiles[(offset + i) as usize] = Tile::default(); + } + } +} + +pub fn tile_alloc(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let draw_bboxes = resources[2].as_slice(); + let mut bump = resources[3].as_typed_mut(); + let mut paths = resources[4].as_slice_mut(); + let mut tiles = resources[5].as_slice_mut(); + tile_alloc_main( + &config, + &scene, + &draw_bboxes, + &mut bump, + &mut paths, + &mut tiles, + ); +} diff --git a/src/cpu_shader/util.rs b/src/cpu_shader/util.rs new file mode 100644 index 000000000..7c32cddb6 --- /dev/null +++ b/src/cpu_shader/util.rs @@ -0,0 +1,96 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! Utility types + +#[derive(Clone, Copy, Default, Debug)] +#[repr(C)] +pub struct Vec2 { + pub x: f32, + pub y: f32, +} + +impl std::ops::Add for Vec2 { + type Output = Vec2; + + fn add(self, rhs: Self) -> Self { + Vec2 { + x: self.x + rhs.x, + y: self.y + rhs.y, + } + } +} + +impl std::ops::Sub for Vec2 { + type Output = Vec2; + + fn sub(self, rhs: Self) -> Self { + Vec2 { + x: self.x - rhs.x, + y: self.y - rhs.y, + } + } +} + +impl std::ops::Mul<f32> for Vec2 { + type Output = Vec2; + + fn mul(self, rhs: f32) -> Self { + Vec2 { + x: self.x * rhs, + y: self.y * rhs, + } + } +} + +impl Vec2 { + pub fn new(x: f32, y: f32) -> Self { + Vec2 { x, y } + } + + pub fn dot(self, other: Vec2) -> f32 { + self.x * other.x + self.y * other.y + } + + pub fn length(self) -> f32 { + self.x.hypot(self.y) + } + + pub fn to_array(self) -> [f32; 2] { + [self.x, self.y] + } + + pub fn from_array(a: [f32; 2]) -> Self { + Vec2 { x: a[0], y: a[1] } + } + + pub fn mix(self, other: Vec2, t: f32) -> Self { + let x = self.x + (other.x - self.x) * t; + let y = self.y + (other.y - self.y) * t; + Vec2 { x, y } + } +} + +pub struct Transform(pub [f32; 6]); + +impl Transform { + pub fn apply(&self, p: Vec2) -> Vec2 { + let z = self.0; + let x = z[0] * p.x + z[2] * p.y + z[4]; + let y = z[1] * p.x + z[3] * p.y + z[5]; + Vec2 { x, y } + } + + pub fn read(transform_base: u32, ix: u32, data: &[u32]) -> Transform { + let mut z = [0.0; 6]; + let base = (transform_base + ix * 6) as usize; + for i in 0..6 { + z[i] = f32::from_bits(data[base + i]); + } + Transform(z) + } +} + +pub fn span(a: f32, b: f32) -> u32 { (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32 } diff --git a/src/lib.rs b/src/lib.rs index 55147acd8..28e5bf7ff 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -72,6 +72,8 @@ pub struct Renderer { profiler: GpuProfiler, #[cfg(feature = "wgpu-profiler")] pub profile_result: Option<Vec<wgpu_profiler::GpuTimerScopeResult>>, + #[cfg(feature = "hot_reload")] + use_cpu: bool, } /// Parameters used in a single render that are configurable by the client. @@ -101,7 +103,10 @@ impl Renderer { /// Creates a new renderer for the specified device.
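+ /// + /// When `render_options.use_cpu` is set, the compute stages are bound + /// to their CPU shader implementations via + /// `FullShaders::install_cpu_shaders`; fine rasterization is not + /// switched over and stays on the GPU.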
pub fn new(device: &Device, render_options: &RendererOptions) -> Result<Self> { let mut engine = WgpuEngine::new(); - let shaders = shaders::full_shaders(device, &mut engine, render_options.use_cpu)?; + let mut shaders = shaders::full_shaders(device, &mut engine)?; + if render_options.use_cpu { + shaders.install_cpu_shaders(&mut engine); + } let blit = render_options .surface_format .map(|surface_format| BlitPipeline::new(device, surface_format)); @@ -115,6 +120,8 @@ impl Renderer { profiler: GpuProfiler::new(3, render_options.timestamp_period, device.features()), #[cfg(feature = "wgpu-profiler")] profile_result: None, + #[cfg(feature = "hot_reload")] + use_cpu: render_options.use_cpu, }) } @@ -220,7 +227,10 @@ impl Renderer { pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> { device.push_error_scope(wgpu::ErrorFilter::Validation); let mut engine = WgpuEngine::new(); - let shaders = shaders::full_shaders(device, &mut engine, false)?; + let mut shaders = shaders::full_shaders(device, &mut engine)?; + if self.use_cpu { + shaders.install_cpu_shaders(&mut engine); + } let error = device.pop_error_scope().await; if let Some(error) = error { return Err(error.into()); diff --git a/src/shaders.rs b/src/shaders.rs index 23a3950f3..0a73d2366 100644 --- a/src/shaders.rs +++ b/src/shaders.rs @@ -82,11 +82,7 @@ pub struct FullShaders { } #[cfg(feature = "wgpu")] -pub fn full_shaders( - device: &Device, - engine: &mut WgpuEngine, - use_cpu: bool, -) -> Result<FullShaders> { +pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShaders> { let imports = SHARED_SHADERS .iter() .copied() @@ -103,9 +99,6 @@ pub fn full_shaders( preprocess::preprocess(shader!("pathtag_reduce"), &full_config, &imports).into(), &[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer], )?; - if use_cpu { - engine.set_cpu_shader(pathtag_reduce, cpu_shader::pathtag_reduce); - } let pathtag_reduce2 = engine.add_shader( device, "pathtag_reduce2", @@ -334,6 +327,37 @@ pub fn full_shaders( }) } +#[cfg(feature = "wgpu")] +impl FullShaders { + /// Install the CPU shaders. + /// + /// There are a couple of things to note here. First, the granularity + /// provided by this method is coarse: it installs all of the shaders, + /// while there are many use cases (including debugging) where a mix is + /// desired, or where the choice between GPU and CPU dispatch might be + /// dynamic. + /// + /// Second, the actual mapping to CPU shaders is not really specific to + /// the engine, and should be split out into a back-end agnostic struct.
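+ /// + /// Expected usage follows the pattern in `Renderer::new` above: build + /// the full shader set, then install the CPU variants behind a flag: + /// + /// ```ignore + /// let mut shaders = shaders::full_shaders(device, &mut engine)?; + /// if use_cpu { + /// shaders.install_cpu_shaders(&mut engine); + /// } + /// ```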
+ pub fn install_cpu_shaders(&mut self, engine: &mut WgpuEngine) { + engine.set_cpu_shader(self.pathtag_reduce, cpu_shader::pathtag_reduce); + engine.set_cpu_shader(self.pathtag_scan, cpu_shader::pathtag_scan); + engine.set_cpu_shader(self.bbox_clear, cpu_shader::bbox_clear); + engine.set_cpu_shader(self.flatten, cpu_shader::flatten); + engine.set_cpu_shader(self.draw_reduce, cpu_shader::draw_reduce); + engine.set_cpu_shader(self.draw_leaf, cpu_shader::draw_leaf); + engine.set_cpu_shader(self.clip_reduce, cpu_shader::clip_reduce); + engine.set_cpu_shader(self.clip_leaf, cpu_shader::clip_leaf); + engine.set_cpu_shader(self.binning, cpu_shader::binning); + engine.set_cpu_shader(self.tile_alloc, cpu_shader::tile_alloc); + engine.set_cpu_shader(self.path_count_setup, cpu_shader::path_count_setup); + engine.set_cpu_shader(self.path_count, cpu_shader::path_count); + engine.set_cpu_shader(self.backdrop, cpu_shader::backdrop); + engine.set_cpu_shader(self.coarse, cpu_shader::coarse); + engine.set_cpu_shader(self.path_tiling_setup, cpu_shader::path_tiling_setup); + engine.set_cpu_shader(self.path_tiling, cpu_shader::path_tiling); + } +} + macro_rules! shared_shader { ($name:expr) => { ( diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs index 12380e32c..c5359c1bb 100644 --- a/src/wgpu_engine.rs +++ b/src/wgpu_engine.rs @@ -19,6 +19,7 @@ use crate::{ BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId, }; +#[derive(Default)] pub struct WgpuEngine { shaders: Vec<Shader>, pool: ResourcePool, @@ -90,12 +91,7 @@ enum TransientBuf<'a> { impl WgpuEngine { pub fn new() -> WgpuEngine { - WgpuEngine { - shaders: vec![], - pool: Default::default(), - bind_map: Default::default(), - downloads: Default::default(), - } + Default::default() } /// Add a shader. From 5080af015b58b88379fda0ccbc8e06607cdacf8d Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 10 Oct 2023 16:10:28 -0700 Subject: [PATCH 2/3] Fix pathtag limit, add unlicense Adds Unlicense to the SPDX line, and also relaxes the 64k (or any) limit on the number of pathtags in the CPU case. The CPU shaders don't exactly match the GPU in the "use_large_path_scan" case, but they do work. --- crates/encoding/src/path.rs | 7 ++++--- src/cpu_shader/backdrop.rs | 2 +- src/cpu_shader/bbox_clear.rs | 2 +- src/cpu_shader/binning.rs | 2 +- src/cpu_shader/clip_leaf.rs | 2 +- src/cpu_shader/clip_reduce.rs | 2 +- src/cpu_shader/coarse.rs | 2 +- src/cpu_shader/draw_leaf.rs | 2 +- src/cpu_shader/draw_reduce.rs | 2 +- src/cpu_shader/fine.rs | 2 +- src/cpu_shader/flatten.rs | 2 +- src/cpu_shader/mod.rs | 2 +- src/cpu_shader/path_count.rs | 2 +- src/cpu_shader/path_count_setup.rs | 2 +- src/cpu_shader/path_tiling.rs | 2 +- src/cpu_shader/path_tiling_setup.rs | 2 +- src/cpu_shader/pathtag_reduce.rs | 2 +- src/render.rs | 5 +++-- src/shaders.rs | 5 +++++ 19 files changed, 28 insertions(+), 21 deletions(-) diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs index b1d161525..b0a52b0b7 100644 --- a/crates/encoding/src/path.rs +++ b/crates/encoding/src/path.rs @@ -227,9 +227,10 @@ pub struct Path { pub struct Tile { /// Accumulated backdrop at the left edge of the tile. pub backdrop: i32, - /// An enum that can hold either a count or an index to the - /// beginning of an allocated slice. In the latter case, the - /// bits are inverted. + /// An enum that holds either the number of path + /// segments in this tile, or an index to the beginning of an + /// allocated slice of `PathSegment` objects.
In the latter case, + /// the bits are inverted. pub segment_count_or_ix: u32, } diff --git a/src/cpu_shader/backdrop.rs b/src/cpu_shader/backdrop.rs index 2a19fd843..746efdcde 100644 --- a/src/cpu_shader/backdrop.rs +++ b/src/cpu_shader/backdrop.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{ConfigUniform, Path, Tile}; diff --git a/src/cpu_shader/bbox_clear.rs b/src/cpu_shader/bbox_clear.rs index 014b905f5..1e02127d0 100644 --- a/src/cpu_shader/bbox_clear.rs +++ b/src/cpu_shader/bbox_clear.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{ConfigUniform, PathBbox}; diff --git a/src/cpu_shader/binning.rs b/src/cpu_shader/binning.rs index 8c2a79583..136e333c7 100644 --- a/src/cpu_shader/binning.rs +++ b/src/cpu_shader/binning.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, PathBbox}; diff --git a/src/cpu_shader/clip_leaf.rs b/src/cpu_shader/clip_leaf.rs index 63b528cf1..0f5fc6106 100644 --- a/src/cpu_shader/clip_leaf.rs +++ b/src/cpu_shader/clip_leaf.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{Clip, ConfigUniform, DrawMonoid, PathBbox}; diff --git a/src/cpu_shader/clip_reduce.rs b/src/cpu_shader/clip_reduce.rs index a8433bdee..96bc3582e 100644 --- a/src/cpu_shader/clip_reduce.rs +++ b/src/cpu_shader/clip_reduce.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{Clip, ClipBic, ClipElement, PathBbox}; diff --git a/src/cpu_shader/coarse.rs b/src/cpu_shader/coarse.rs index 69e5d5113..c54aa1076 100644 --- a/src/cpu_shader/coarse.rs +++ b/src/cpu_shader/coarse.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, DrawTag, Path, Tile}; diff --git a/src/cpu_shader/draw_leaf.rs b/src/cpu_shader/draw_leaf.rs index 4837d024e..1d699307b 100644 --- a/src/cpu_shader/draw_leaf.rs +++ b/src/cpu_shader/draw_leaf.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{Clip, ConfigUniform, DrawMonoid, DrawTag, Monoid, PathBbox}; diff --git a/src/cpu_shader/draw_reduce.rs b/src/cpu_shader/draw_reduce.rs index 9ec876e96..019b9416c 100644 --- a/src/cpu_shader/draw_reduce.rs +++ b/src/cpu_shader/draw_reduce.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{ConfigUniform, DrawMonoid, DrawTag, Monoid}; diff --git a/src/cpu_shader/fine.rs b/src/cpu_shader/fine.rs index a28cfe697..c64c87627 100644 --- a/src/cpu_shader/fine.rs +++ b/src/cpu_shader/fine.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// 
SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{ConfigUniform, PathSegment, Tile}; diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 41cc949d5..2cdf7256b 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use crate::cpu_dispatch::CpuBinding; diff --git a/src/cpu_shader/mod.rs b/src/cpu_shader/mod.rs index 6257aaf07..16d261f65 100644 --- a/src/cpu_shader/mod.rs +++ b/src/cpu_shader/mod.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense //! CPU implementations of shader stages. diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs index cc5f79f5a..b55cd1239 100644 --- a/src/cpu_shader/path_count.rs +++ b/src/cpu_shader/path_count.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{BumpAllocators, LineSoup, Path, SegmentCount, Tile}; diff --git a/src/cpu_shader/path_count_setup.rs b/src/cpu_shader/path_count_setup.rs index 1327d708d..6336cfd47 100644 --- a/src/cpu_shader/path_count_setup.rs +++ b/src/cpu_shader/path_count_setup.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{BumpAllocators, IndirectCount}; diff --git a/src/cpu_shader/path_tiling.rs b/src/cpu_shader/path_tiling.rs index fdd2d97db..53f5cd970 100644 --- a/src/cpu_shader/path_tiling.rs +++ b/src/cpu_shader/path_tiling.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{BumpAllocators, LineSoup, Path, PathSegment, SegmentCount, Tile}; diff --git a/src/cpu_shader/path_tiling_setup.rs b/src/cpu_shader/path_tiling_setup.rs index 8efee31b0..32e08f9ae 100644 --- a/src/cpu_shader/path_tiling_setup.rs +++ b/src/cpu_shader/path_tiling_setup.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{BumpAllocators, IndirectCount}; diff --git a/src/cpu_shader/pathtag_reduce.rs b/src/cpu_shader/pathtag_reduce.rs index 31979e8ed..58eb36c17 100644 --- a/src/cpu_shader/pathtag_reduce.rs +++ b/src/cpu_shader/pathtag_reduce.rs @@ -1,5 +1,5 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use vello_encoding::{ConfigUniform, Monoid, PathMonoid}; diff --git a/src/render.rs b/src/render.rs index 462563633..268007faa 100644 --- a/src/render.rs +++ b/src/render.rs @@ -139,7 +139,8 @@ impl Render { ); let mut pathtag_parent = reduced_buf; let mut large_pathtag_bufs = None; - if wg_counts.use_large_path_scan { + let use_large_path_scan = wg_counts.use_large_path_scan && !shaders.pathtag_is_cpu; + if use_large_path_scan { let reduced2_buf = ResourceProxy::new_buf( buffer_sizes.path_reduced2.size_in_bytes().into(), "reduced2_buf", @@ -166,7 +167,7 @@ impl Render { buffer_sizes.path_monoids.size_in_bytes().into(), "tagmonoid_buf", ); - let pathtag_scan = if wg_counts.use_large_path_scan { + let 
pathtag_scan = if use_large_path_scan { shaders.pathtag_scan_large } else { shaders.pathtag_scan diff --git a/src/shaders.rs b/src/shaders.rs index 0a73d2366..86e6ed7bd 100644 --- a/src/shaders.rs +++ b/src/shaders.rs @@ -79,6 +79,9 @@ pub struct FullShaders { pub path_tiling_setup: ShaderId, pub path_tiling: ShaderId, pub fine: ShaderId, + // 2-level dispatch works for CPU pathtag scan even for large + // inputs; 3-level is not yet implemented. + pub pathtag_is_cpu: bool, } #[cfg(feature = "wgpu")] @@ -324,6 +327,7 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShaders> { path_tiling_setup, path_tiling, fine, + pathtag_is_cpu: false, }) } @@ -345,6 +349,7 @@ impl FullShaders { engine.set_cpu_shader(self.path_tiling_setup, cpu_shader::path_tiling_setup); engine.set_cpu_shader(self.path_tiling, cpu_shader::path_tiling); + self.pathtag_is_cpu = true; } } From: Raph Levien Date: Wed, 11 Oct 2023 11:42:54 -0700 Subject: [PATCH 3/3] Address review feedback --- src/cpu_shader/binning.rs | 1 + src/cpu_shader/clip_reduce.rs | 1 - src/cpu_shader/coarse.rs | 4 +--- src/cpu_shader/draw_leaf.rs | 12 ++++-------- src/cpu_shader/draw_reduce.rs | 9 +++------ src/cpu_shader/path_count.rs | 6 +++--- src/cpu_shader/path_tiling.rs | 6 +++--- src/cpu_shader/util.rs | 17 +++++++++++++++++ 8 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/cpu_shader/binning.rs b/src/cpu_shader/binning.rs index 136e333c7..5ace850c3 100644 --- a/src/cpu_shader/binning.rs +++ b/src/cpu_shader/binning.rs @@ -50,6 +50,7 @@ fn binning_main( let draw_monoid = draw_monoids[element_ix]; let mut clip_bbox = [-1e9, -1e9, 1e9, 1e9]; if draw_monoid.clip_ix > 0 { + assert!(draw_monoid.clip_ix - 1 < config.layout.n_clips); clip_bbox = clip_bbox_buf[draw_monoid.clip_ix as usize - 1]; } let path_bbox = path_bbox_buf[draw_monoid.path_ix as usize]; diff --git a/src/cpu_shader/clip_reduce.rs b/src/cpu_shader/clip_reduce.rs index 96bc3582e..fc30661f1 100644 --- a/src/cpu_shader/clip_reduce.rs +++ b/src/cpu_shader/clip_reduce.rs @@ -48,7 +48,6 @@ fn clip_reduce_main( } pub fn clip_reduce(n_wg: u32, resources: &[CpuBinding]) { - // TODO: probably remove config, it's not needed let clip_inp = resources[0].as_slice(); let path_bboxes = resources[1].as_slice(); let mut reduced = resources[2].as_slice_mut(); diff --git a/src/cpu_shader/coarse.rs b/src/cpu_shader/coarse.rs index c54aa1076..390df7f74 100644 --- a/src/cpu_shader/coarse.rs +++ b/src/cpu_shader/coarse.rs @@ -57,7 +57,6 @@ impl TileState { ptcl[(self.cmd_offset + offset) as usize] = value; } - // TODO: handle even/odd winding rule fn write_path( &mut self, config: &ConfigUniform, @@ -217,7 +216,6 @@ fn coarse_main( let mut tile_state = TileState::new(this_tile_ix); let blend_offset = tile_state.cmd_offset; tile_state.cmd_offset += 1; - // Discussion question: do these belong in tile state?
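+ // clip_depth tracks the current clip nesting level; clip_zero_depth is + // the level at which the clip state became all-zero for this tile, so + // that draw objects can be skipped until the matching end clip.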
let mut clip_depth = 0; let mut clip_zero_depth = 0; for drawobj_ix in &compacted[tile_ix] { @@ -314,7 +312,7 @@ fn coarse_main( if bin_tile_x + tile_x < width_in_tiles && bin_tile_y + tile_y < height_in_tiles { ptcl[tile_state.cmd_offset as usize] = CMD_END; - let scratch_size = 0; // TODO: actually compute + let scratch_size = 0; // TODO: actually compute blend depth ptcl[blend_offset as usize] = bump.blend; bump.blend += scratch_size; } diff --git a/src/cpu_shader/draw_leaf.rs b/src/cpu_shader/draw_leaf.rs index 1d699307b..0aa779e5c 100644 --- a/src/cpu_shader/draw_leaf.rs +++ b/src/cpu_shader/draw_leaf.rs @@ -5,7 +5,7 @@ use vello_encoding::{Clip, ConfigUniform, DrawMonoid, DrawTag, Monoid, PathBbox} use crate::cpu_dispatch::CpuBinding; -use super::util::{Transform, Vec2}; +use super::util::{read_draw_tag_from_scene, Transform, Vec2}; const WG_SIZE: usize = 256; @@ -19,22 +19,18 @@ fn draw_leaf_main( info: &mut [u32], clip_inp: &mut [Clip], ) { - let drawtag_base = config.layout.draw_tag_base; let mut prefix = DrawMonoid::default(); for i in 0..n_wg { let mut m = prefix; for j in 0..WG_SIZE { let ix = i * WG_SIZE as u32 + j as u32; - let tag_raw = if ix < config.layout.n_draw_objects { - scene[(drawtag_base + ix) as usize] - } else { - 0 - }; + let tag_raw = read_draw_tag_from_scene(config, scene, ix); let tag_word = DrawTag(tag_raw); // store exclusive prefix sum if ix < config.layout.n_draw_objects { draw_monoid[ix as usize] = m; } + let m_next = m.combine(&DrawMonoid::new(tag_word)); let dd = config.layout.draw_data_base + m.scene_offset; let di = m.info_offset as usize; if tag_word == DrawTag::COLOR @@ -145,7 +141,7 @@ fn draw_leaf_main( let path_ix = !ix as i32; clip_inp[m.clip_ix as usize] = Clip { ix, path_ix }; } - m = m.combine(&DrawMonoid::new(tag_word)); + m = m_next; } prefix = prefix.combine(&reduced[i as usize]); } diff --git a/src/cpu_shader/draw_reduce.rs b/src/cpu_shader/draw_reduce.rs index 019b9416c..61c338c71 100644 --- a/src/cpu_shader/draw_reduce.rs +++ b/src/cpu_shader/draw_reduce.rs @@ -5,19 +5,16 @@ use vello_encoding::{ConfigUniform, DrawMonoid, DrawTag, Monoid}; use crate::cpu_dispatch::CpuBinding; +use super::util::read_draw_tag_from_scene; + const WG_SIZE: usize = 256; fn draw_reduce_main(n_wg: u32, config: &ConfigUniform, scene: &[u32], reduced: &mut [DrawMonoid]) { - let drawtag_base = config.layout.draw_tag_base; for i in 0..n_wg { let mut m = DrawMonoid::default(); for j in 0..WG_SIZE { let ix = i * WG_SIZE as u32 + j as u32; - let tag = if ix < config.layout.n_draw_objects { - scene[(drawtag_base + ix) as usize] - } else { - 0 - }; + let tag = read_draw_tag_from_scene(config, scene, ix); m = m.combine(&DrawMonoid::new(DrawTag(tag))); } reduced[i as usize] = m; diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs index b55cd1239..2cee5b815 100644 --- a/src/cpu_shader/path_count.rs +++ b/src/cpu_shader/path_count.rs @@ -34,15 +34,15 @@ fn path_count_main( if dy == 0.0 && s0.y.floor() == s0.y { continue; } - let dy_dxdy = dy / (dx + dy); - let a = 1.0 - dy_dxdy; + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; let is_positive_slope = s1.x >= s0.x; let sign = if is_positive_slope { 1.0 } else { -1.0 }; let xt0 = (s0.x * sign).floor(); let c = s0.x * sign - xt0; let y0 = s0.y.floor(); let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; - let b = dy_dxdy * c + a * (ytop - s0.y); + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; let path = paths[line.path_ix as 
usize]; diff --git a/src/cpu_shader/path_tiling.rs b/src/cpu_shader/path_tiling.rs index 53f5cd970..41549bb54 100644 --- a/src/cpu_shader/path_tiling.rs +++ b/src/cpu_shader/path_tiling.rs @@ -37,15 +37,15 @@ fn path_tiling_main( let dx = (s1.x - s0.x).abs(); let dy = s1.y - s0.y; - let dy_dxdy = dy / (dx + dy); - let a = 1.0 - dy_dxdy; + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; let is_positive_slope = s1.x >= s0.x; let sign = if is_positive_slope { 1.0 } else { -1.0 }; let xt0 = (s0.x * sign).floor(); let c = s0.x * sign - xt0; let y0 = s0.y.floor(); let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; - let b = dy_dxdy * c + a * (ytop - s0.y); + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; let z = (a * seg_within_line as f32 + b).floor(); let x = x0 as i32 + (sign * z) as i32; diff --git a/src/cpu_shader/util.rs b/src/cpu_shader/util.rs index 7c32cddb6..2bb3279aa 100644 --- a/src/cpu_shader/util.rs +++ b/src/cpu_shader/util.rs @@ -3,6 +3,8 @@ //! Utility types +use vello_encoding::ConfigUniform; + #[derive(Clone, Copy, Default, Debug)] #[repr(C)] pub struct Vec2 { @@ -94,3 +96,18 @@ impl Transform { pub fn span(a: f32, b: f32) -> u32 { (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32 } + +const DRAWTAG_NOP: u32 = 0; + +/// Read draw tag, guarded by number of draw objects. +/// +/// The `ix` argument is allowed to exceed the number of draw objects, +/// in which case a NOP is returned. +pub fn read_draw_tag_from_scene(config: &ConfigUniform, scene: &[u32], ix: u32) -> u32 { + if ix < config.layout.n_draw_objects { + let tag_ix = config.layout.draw_tag_base + ix; + scene[tag_ix as usize] + } else { + DRAWTAG_NOP + } +}
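As a usage sketch (names as in draw_reduce.rs above; the guard means the ragged final workgroup needs no special casing, since DRAWTAG_NOP maps to the identity DrawMonoid):

    for j in 0..WG_SIZE {
        let ix = i * WG_SIZE as u32 + j as u32;
        // Past n_draw_objects this yields DRAWTAG_NOP, a no-op contribution.
        let tag = read_draw_tag_from_scene(config, scene, ix);
        m = m.combine(&DrawMonoid::new(DrawTag(tag)));
    }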