From 7220ce6817a3854b4ea14cf8bdfa60d273252ba3 Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Thu, 5 Oct 2023 06:40:08 -0700
Subject: [PATCH 1/3] Contiguous storage for path segments

This is part of the larger multisampled path rendering work, under
stroke rework (#303). It refactors the GPU pipeline so that the path
segments available to fine rasterization are stored as a contiguous
slice rather than a linked list as before. Numerous parts of the
pipeline are refactored.

In the old pipeline, path segment decoding generated cubic segments and
also estimated a bounding box (somewhat imprecise), and the combination
of flattening those cubics and tiling was in a separate stage
(path_coarse) quite a bit later in the pipeline. In the new pipeline,
path decoding is fused with flattening, generating a `LineSoup`
structure (line segments associated with paths, otherwise unordered),
with the path bounding box computed as a side effect, and tiling is
spread over multiple stages, later in the pipeline.

The first tiling stage (path_count) counts the number of tiles that
will be generated. Then coarse rasterization allocates contiguous
slices based on those counts. The second tiling stage (path_tiling)
does a scattered write of the resulting segments. Both of these stages
rely on indirect dispatch, as the number of lines and the number of
segments (respectively) are not known at encode time.

These changes only make sense for filled paths, so they rely on stroke
expansion being done earlier, currently on the CPU.
---
 crates/encoding/src/clip.rs           |  11 +
 crates/encoding/src/config.rs         |  35 ++-
 crates/encoding/src/lib.rs            |   7 +-
 crates/encoding/src/path.rs           |  26 ++-
 shader/coarse.wgsl                    |  59 +++--
 shader/fine.wgsl                      |  67 ++----
 shader/{pathseg.wgsl => flatten.wgsl} | 184 +++++++++++----
 shader/path_coarse.wgsl               | 309 --------------------------
 shader/path_coarse_full.wgsl          | 273 -----------------------
 shader/path_count.wgsl                | 189 ++++++++++++++++
 shader/path_count_setup.wgsl          |  22 ++
 shader/path_tiling.wgsl               | 127 +++++++++++
 shader/path_tiling_setup.wgsl         |  22 ++
 shader/shared/bump.wgsl               |  10 +-
 shader/shared/config.wgsl             |   3 +
 shader/shared/ptcl.wgsl               |   3 +-
 shader/shared/segment.wgsl            |  27 ++-
 src/lib.rs                            |   4 +-
 src/render.rs                         |  66 ++++--
 src/shaders.rs                        |  58 +++--
 20 files changed, 756 insertions(+), 746 deletions(-)
 rename shader/{pathseg.wgsl => flatten.wgsl} (52%)
 delete mode 100644 shader/path_coarse.wgsl
 delete mode 100644 shader/path_coarse_full.wgsl
 create mode 100644 shader/path_count.wgsl
 create mode 100644 shader/path_count_setup.wgsl
 create mode 100644 shader/path_tiling.wgsl
 create mode 100644 shader/path_tiling_setup.wgsl

diff --git a/crates/encoding/src/clip.rs b/crates/encoding/src/clip.rs
index 5a4e8f074..025b86bc5 100644
--- a/crates/encoding/src/clip.rs
+++ b/crates/encoding/src/clip.rs
@@ -41,3 +41,14 @@ pub struct Clip {
 pub struct ClipBbox {
     pub bbox: [f32; 4],
 }
+
+impl ClipBic {
+    pub fn new(a: u32, b: u32) -> Self {
+        ClipBic { a, b }
+    }
+
+    pub fn combine(self, other: ClipBic) -> Self {
+        let m = self.b.min(other.a);
+        ClipBic::new(self.a + other.a - m, self.b + other.b - m)
+    }
+}
diff --git a/crates/encoding/src/config.rs b/crates/encoding/src/config.rs
index 93ffe80c6..d853a0808 100644
--- a/crates/encoding/src/config.rs
+++ b/crates/encoding/src/config.rs
@@ -1,9 +1,11 @@
 // Copyright 2023 The Vello authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
+use crate::SegmentCount;
+
 use super::{
-    BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, Path,
-    PathBbox, PathMonoid, PathSegment, Tile,
+
BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, LineSoup, + Path, PathBbox, PathMonoid, PathSegment, Tile, }; use bytemuck::{Pod, Zeroable}; use std::mem; @@ -29,8 +31,24 @@ pub struct BumpAllocators { pub binning: u32, pub ptcl: u32, pub tile: u32, + pub seg_counts: u32, pub segments: u32, pub blend: u32, + pub lines: u32, +} + +/// Storage of indirect dispatch size values. +/// +/// The original plan was to reuse BumpAllocators, but the WebGPU compatible +/// usage list rules forbid that being used as indirect counts while also +/// bound as writable. +#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] +#[repr(C)] +pub struct IndirectCount { + pub count_x: u32, + pub count_y: u32, + pub count_z: u32, + pub pad0: u32, } /// Uniform render configuration data used by all GPU stages. @@ -114,7 +132,7 @@ pub struct WorkgroupCounts { pub path_scan1: WorkgroupSize, pub path_scan: WorkgroupSize, pub bbox_clear: WorkgroupSize, - pub path_seg: WorkgroupSize, + pub flatten: WorkgroupSize, pub draw_reduce: WorkgroupSize, pub draw_leaf: WorkgroupSize, pub clip_reduce: WorkgroupSize, @@ -159,7 +177,7 @@ impl WorkgroupCounts { path_scan1: (reduced_size / PATH_REDUCE_WG, 1, 1), path_scan: (path_tag_wgs, 1, 1), bbox_clear: (draw_object_wgs, 1, 1), - path_seg: (path_coarse_wgs, 1, 1), + flatten: (path_coarse_wgs, 1, 1), draw_reduce: (draw_object_wgs, 1, 1), draw_leaf: (draw_object_wgs, 1, 1), clip_reduce: (clip_reduce_wgs, 1, 1), @@ -248,11 +266,14 @@ pub struct BufferSizes { pub clip_bboxes: BufferSize, pub draw_bboxes: BufferSize, pub bump_alloc: BufferSize, + pub indirect_count: BufferSize, pub bin_headers: BufferSize, pub paths: BufferSize, // Bump allocated buffers + pub lines: BufferSize, pub bin_data: BufferSize, pub tiles: BufferSize, + pub seg_counts: BufferSize, pub segments: BufferSize, pub ptcl: BufferSize, } @@ -284,6 +305,7 @@ impl BufferSizes { let clip_bboxes = BufferSize::new(n_clips); let draw_bboxes = BufferSize::new(n_paths); let bump_alloc = BufferSize::new(1); + let indirect_count = BufferSize::new(1); let bin_headers = BufferSize::new(draw_object_wgs * 256); let n_paths_aligned = align_up(n_paths, 256); let paths = BufferSize::new(n_paths_aligned); @@ -293,6 +315,8 @@ impl BufferSizes { // reasonable heuristics. 
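+        // Overflow of a bump-allocated buffer is reported through the
+        // `failed` bitmask in `BumpAllocators` rather than being prevented
+        // here (checking the `lines` allocation is still a TODO in flatten).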
        let bin_data = BufferSize::new(1 << 18);
         let tiles = BufferSize::new(1 << 21);
+        let lines = BufferSize::new(1 << 21);
+        let seg_counts = BufferSize::new(1 << 21);
         let segments = BufferSize::new(1 << 21);
         let ptcl = BufferSize::new(1 << 23);
         Self {
@@ -311,10 +335,13 @@ impl BufferSizes {
             clip_bboxes,
             draw_bboxes,
             bump_alloc,
+            indirect_count,
+            lines,
             bin_headers,
             paths,
             bin_data,
             tiles,
+            seg_counts,
             segments,
             ptcl,
         }
diff --git a/crates/encoding/src/lib.rs b/crates/encoding/src/lib.rs
index 62a7d6c8e..3d24c192f 100644
--- a/crates/encoding/src/lib.rs
+++ b/crates/encoding/src/lib.rs
@@ -24,8 +24,8 @@ mod resolve;
 pub use binning::BinHeader;
 pub use clip::{Clip, ClipBbox, ClipBic, ClipElement};
 pub use config::{
-    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, RenderConfig, WorkgroupCounts,
-    WorkgroupSize,
+    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, IndirectCount, RenderConfig,
+    WorkgroupCounts, WorkgroupSize,
 };
 pub use draw::{
     DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid,
@@ -35,7 +35,8 @@ pub use encoding::{Encoding, StreamOffsets};
 pub use math::Transform;
 pub use monoid::Monoid;
 pub use path::{
-    Cubic, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType, PathTag, Tile,
+    Cubic, LineSoup, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType,
+    PathTag, SegmentCount, Tile,
 };
 pub use resolve::{resolve_solid_paths_only, Layout};
diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs
index 10e7499dd..414ce23e6 100644
--- a/crates/encoding/src/path.rs
+++ b/crates/encoding/src/path.rs
@@ -6,6 +6,28 @@ use peniko::kurbo::Shape;
 
 use super::Monoid;
 
+/// Line segment (after flattening, before tiling).
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct LineSoup {
+    pub path_ix: u32,
+    pub _padding: u32,
+    pub p0: [f32; 2],
+    pub p1: [f32; 2],
+}
+
+/// Intermediate structure for sorting tile segments (after counting,
+/// before tiling).
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct SegmentCount {
+    pub line_ix: u32,
+    // This could more accurately be modeled as:
+    // segment_within_line: u16,
+    // segment_within_slice: u16,
+    // However, here we mirror the way it's written in WGSL
+    pub counts: u32,
+}
+
 /// Path segment.
 #[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
 #[repr(C)]
@@ -13,7 +35,7 @@ pub struct PathSegment {
     pub origin: [f32; 2],
     pub delta: [f32; 2],
     pub y_edge: f32,
-    pub next: u32,
+    pub _padding: u32,
 }
 
 /// Path segment type.
@@ -193,7 +215,7 @@ pub struct PathBbox {
 #[repr(C)]
 pub struct Path {
     /// Bounding box in tiles.
-    pub bbox: [f32; 4],
+    pub bbox: [u32; 4],
     /// Offset (in u32s) to tile rectangle.
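    /// A path's tiles form a contiguous `(bbox.z - bbox.x) * (bbox.w - bbox.y)`
    /// rectangle in the global tile buffer, starting at this offset.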
pub tiles: u32, _padding: [u32; 3], diff --git a/shader/coarse.wgsl b/shader/coarse.wgsl index e497c85b4..2d324c798 100644 --- a/shader/coarse.wgsl +++ b/shader/coarse.wgsl @@ -33,7 +33,7 @@ var info_bin_data: array; var paths: array; @group(0) @binding(6) -var tiles: array; +var tiles: array; @group(0) @binding(7) var bump: BumpAllocators; @@ -82,31 +82,30 @@ fn alloc_cmd(size: u32) { } } -fn write_path(tile: Tile, linewidth: f32) -> bool { - // TODO: take flags - alloc_cmd(3u); - if linewidth < 0.0 { - let even_odd = linewidth < -1.0; - if tile.segments != 0u { - let fill = CmdFill(tile.segments, tile.backdrop); - ptcl[cmd_offset] = CMD_FILL; - let segments_and_rule = select(fill.tile << 1u, (fill.tile << 1u) | 1u, even_odd); - ptcl[cmd_offset + 1u] = segments_and_rule; - ptcl[cmd_offset + 2u] = u32(fill.backdrop); - cmd_offset += 3u; - } else { - if even_odd && (abs(tile.backdrop) & 1) == 0 { - return false; - } - ptcl[cmd_offset] = CMD_SOLID; - cmd_offset += 1u; - } +fn write_path(tile: Tile, tile_ix: u32, linewidth: f32) -> bool { + let even_odd = linewidth < -1.0; + // We overload the "segments" field to store both count (written by + // path_count stage) and segment allocation (used by path_tiling and + // fine). + let n_segs = tile.segments; + if n_segs != 0u { + var seg_ix = atomicAdd(&bump.segments, n_segs); + tiles[tile_ix].segments = ~seg_ix; + alloc_cmd(4u); + ptcl[cmd_offset] = CMD_FILL; + let size_and_rule = (n_segs << 1u) | u32(even_odd); + let fill = CmdFill(size_and_rule, seg_ix, tile.backdrop); + ptcl[cmd_offset + 1u] = fill.size_and_rule; + ptcl[cmd_offset + 2u] = fill.seg_data; + ptcl[cmd_offset + 3u] = u32(fill.backdrop); + cmd_offset += 4u; } else { - let stroke = CmdStroke(tile.segments, 0.5 * linewidth); - ptcl[cmd_offset] = CMD_STROKE; - ptcl[cmd_offset + 1u] = stroke.tile; - ptcl[cmd_offset + 2u] = bitcast(stroke.half_width); - cmd_offset += 3u; + if even_odd && (abs(tile.backdrop) & 1) == 0 { + return false; + } + alloc_cmd(1u); + ptcl[cmd_offset] = CMD_SOLID; + cmd_offset += 1u; } return true; } @@ -352,7 +351,7 @@ fn main( // DRAWTAG_FILL_COLOR case 0x44u: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { let rgba_color = scene[dd]; write_color(CmdColor(rgba_color)); } @@ -360,7 +359,7 @@ fn main( // DRAWTAG_FILL_LIN_GRADIENT case 0x114u: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { let index = scene[dd]; let info_offset = di + 1u; write_grad(CMD_LIN_GRAD, index, info_offset); @@ -369,7 +368,7 @@ fn main( // DRAWTAG_FILL_RAD_GRADIENT case 0x29cu: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { let index = scene[dd]; let info_offset = di + 1u; write_grad(CMD_RAD_GRAD, index, info_offset); @@ -378,7 +377,7 @@ fn main( // DRAWTAG_FILL_IMAGE case 0x248u: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { write_image(di + 1u); } } @@ -396,7 +395,7 @@ fn main( // DRAWTAG_END_CLIP case 0x21u: { clip_depth -= 1u; - write_path(tile, -1.0); + write_path(tile, tile_ix, -1.0); let blend = scene[dd]; let alpha = bitcast(scene[dd + 1u]); write_end_clip(CmdEndClip(blend, alpha)); diff --git a/shader/fine.wgsl b/shader/fine.wgsl index f64f0159b..79984596f 100644 --- a/shader/fine.wgsl +++ b/shader/fine.wgsl @@ -41,15 +41,10 @@ var gradients: texture_2d; var image_atlas: 
texture_2d; fn read_fill(cmd_ix: u32) -> CmdFill { - let tile = ptcl[cmd_ix + 1u]; - let backdrop = i32(ptcl[cmd_ix + 2u]); - return CmdFill(tile, backdrop); -} - -fn read_stroke(cmd_ix: u32) -> CmdStroke { - let tile = ptcl[cmd_ix + 1u]; - let half_width = bitcast(ptcl[cmd_ix + 2u]); - return CmdStroke(tile, half_width); + let size_and_rule = ptcl[cmd_ix + 1u]; + let seg_data = ptcl[cmd_ix + 2u]; + let backdrop = i32(ptcl[cmd_ix + 3u]); + return CmdFill(size_and_rule, seg_data, backdrop); } fn read_color(cmd_ix: u32) -> CmdColor { @@ -140,15 +135,15 @@ var output: texture_storage_2d; let PIXELS_PER_THREAD = 4u; -fn fill_path(tile: Tile, xy: vec2, even_odd: bool) -> array { +fn fill_path(seg_data: u32, n_segs: u32, backdrop: i32, xy: vec2, even_odd: bool) -> array { var area: array; - let backdrop_f = f32(tile.backdrop); + let backdrop_f = f32(backdrop); for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { area[i] = backdrop_f; } - var segment_ix = tile.segments; - while segment_ix != 0u { - let segment = segments[segment_ix]; + for (var i = 0u; i < n_segs; i++) { + let seg_off = seg_data + i; + let segment = segments[seg_off]; let y = segment.origin.y - xy.y; let y0 = clamp(y, 0.0, 1.0); let y1 = clamp(y + segment.delta.y, 0.0, 1.0); @@ -177,7 +172,6 @@ fn fill_path(tile: Tile, xy: vec2, even_odd: bool) -> array, even_odd: bool) -> array) -> array { - var df: array; - for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { - df[i] = 1e9; - } - var segment_ix = seg; - while segment_ix != 0u { - let segment = segments[segment_ix]; - let delta = segment.delta; - let dpos0 = xy + vec2(0.5, 0.5) - segment.origin; - let scale = 1.0 / dot(delta, delta); - for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { - let dpos = vec2(dpos0.x + f32(i), dpos0.y); - let t = clamp(dot(dpos, delta) * scale, 0.0, 1.0); - // performance idea: hoist sqrt out of loop - df[i] = min(df[i], length(delta * t - dpos)); - } - segment_ix = segment.next; - } - for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { - // reuse array; return alpha rather than distance - df[i] = clamp(half_width + 0.5 - df[i], 0.0, 1.0); - } - return df; -} - // The X size should be 16 / PIXELS_PER_THREAD @compute @workgroup_size(4, 16) fn main( @@ -250,16 +218,19 @@ fn main( // CMD_FILL case 1u: { let fill = read_fill(cmd_ix); - let segments = fill.tile >> 1u; - let even_odd = (fill.tile & 1u) != 0u; - let tile = Tile(fill.backdrop, segments); - area = fill_path(tile, xy, even_odd); - cmd_ix += 3u; + let n_segs = fill.size_and_rule >> 1u; + let even_odd = (fill.size_and_rule & 1u) != 0u; + area = fill_path(fill.seg_data, n_segs, fill.backdrop, xy, even_odd); + cmd_ix += 4u; } // CMD_STROKE case 2u: { - let stroke = read_stroke(cmd_ix); - area = stroke_path(stroke.tile, stroke.half_width, xy); + // Stroking in fine rasterization is disabled, as strokes will be expanded + // to fills earlier in the pipeline. This implementation is a stub, just to + // keep the shader from crashing. + for (var i = 0u; i < PIXELS_PER_THREAD; i++) { + area[i] = 0.0; + } cmd_ix += 3u; } // CMD_SOLID diff --git a/shader/pathseg.wgsl b/shader/flatten.wgsl similarity index 52% rename from shader/pathseg.wgsl rename to shader/flatten.wgsl index bf02419bc..b4197bce6 100644 --- a/shader/pathseg.wgsl +++ b/shader/flatten.wgsl @@ -1,20 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense -// Path segment decoding for the full case. - -// In the simple case, path segments are decoded as part of the coarse -// path rendering stage. 
In the full case, they are separated, as the -// decoding process also generates bounding boxes, and those in turn are -// used for tile allocation and clipping; actual coarse path rasterization -// can't proceed until those are complete. - -// There's some duplication of the decoding code but we won't worry about -// that just now. Perhaps it could be factored more nicely later. +// Flatten curves to lines #import config #import pathtag +#import segment #import cubic -#import transform +#import bump @group(0) @binding(0) var config: Config; @@ -38,37 +30,133 @@ struct AtomicPathBbox { var path_bboxes: array; @group(0) @binding(4) -var cubics: array; - -// Monoid is yagni, for future optimization - -// struct BboxMonoid { -// bbox: vec4, -// flags: u32, -// } - -// let FLAG_RESET_BBOX = 1u; -// let FLAG_SET_BBOX = 2u; - -// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid { -// var c: BboxMonoid; -// c.bbox = b.bbox; -// // TODO: previous-me thought this should be gated on b & SET_BBOX == false also -// if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y { -// c.bbox = a.bbox; -// } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u || -// (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y) -// { -// c.bbox = vec4(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw)); -// } -// c.flags = (a.flags & FLAG_SET_BBOX) | b.flags; -// c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u; -// return c; -// } - -// fn bbox_monoid_identity() -> BboxMonoid { -// return BboxMonoid(); -// } +var bump: BumpAllocators; + +@group(0) @binding(5) +var lines: array; + +struct SubdivResult { + val: f32, + a0: f32, + a2: f32, +} + +let D = 0.67; +fn approx_parabola_integral(x: f32) -> f32 { + return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x))); +} + +let B = 0.39; +fn approx_parabola_inv_integral(x: f32) -> f32 { + return x * sqrt(1.0 - B + (B * B + 0.5 * x * x)); +} + +fn estimate_subdiv(p0: vec2, p1: vec2, p2: vec2, sqrt_tol: f32) -> SubdivResult { + let d01 = p1 - p0; + let d12 = p2 - p1; + let dd = d01 - d12; + let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; + let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9); + let x0 = dot(d01, dd) * cross_inv; + let x2 = dot(d12, dd) * cross_inv; + let scale = abs(cross / (length(dd) * (x2 - x0))); + + let a0 = approx_parabola_integral(x0); + let a2 = approx_parabola_integral(x2); + var val = 0.0; + if scale < 1e9 { + let da = abs(a2 - a0); + let sqrt_scale = sqrt(scale); + if sign(x0) == sign(x2) { + val = sqrt_scale; + } else { + let xmin = sqrt_tol / sqrt_scale; + val = sqrt_tol / approx_parabola_integral(xmin); + } + val *= da; + } + return SubdivResult(val, a0, a2); +} + +fn eval_quad(p0: vec2, p1: vec2, p2: vec2, t: f32) -> vec2 { + let mt = 1.0 - t; + return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; +} + +fn eval_cubic(p0: vec2, p1: vec2, p2: vec2, p3: vec2, t: f32) -> vec2 { + let mt = 1.0 - t; + return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; +} + +let MAX_QUADS = 16u; + +fn flatten_cubic(cubic: Cubic) { + let p0 = cubic.p0; + let p1 = cubic.p1; + let p2 = cubic.p2; + let p3 = cubic.p3; + let err_v = 3.0 * (p2 - p1) + p0 - p3; + let err = dot(err_v, err_v); + let ACCURACY = 0.25; + let Q_ACCURACY = ACCURACY * 0.1; + let REM_ACCURACY = ACCURACY - Q_ACCURACY; + let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY; + var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u); + n_quads = min(n_quads, 
MAX_QUADS); + var keep_params: array; + var val = 0.0; + var qp0 = p0; + let step = 1.0 / f32(n_quads); + for (var i = 0u; i < n_quads; i += 1u) { + let t = f32(i + 1u) * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); + let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u); + var lp0 = p0; + qp0 = p0; + let v_step = val / f32(n); + var n_out = 1u; + var val_sum = 0.0; + for (var i = 0u; i < n_quads; i += 1u) { + let t = f32(i + 1u) * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); + let params = keep_params[i]; + let u0 = approx_parabola_inv_integral(params.a0); + let u2 = approx_parabola_inv_integral(params.a2); + let uscale = 1.0 / (u2 - u0); + var val_target = f32(n_out) * v_step; + while n_out == n || val_target < val_sum + params.val { + var lp1: vec2; + if n_out == n { + lp1 = p3; + } else { + let u = (val_target - val_sum) / params.val; + let a = mix(params.a0, params.a2, u); + let au = approx_parabola_inv_integral(a); + let t = (au - u0) * uscale; + lp1 = eval_quad(qp0, qp1, qp2, t); + } + + // Output line segment lp0..lp1 + let line_ix = atomicAdd(&bump.lines, 1u); + // TODO: check failure + lines[line_ix] = LineSoup(cubic.path_ix, lp0, lp1); + n_out += 1u; + val_target += v_step; + lp0 = lp1; + } + val_sum += params.val; + qp0 = qp2; + } +} var pathdata_base: u32; @@ -85,6 +173,11 @@ fn read_i16_point(ix: u32) -> vec2 { return vec2(x, y); } +struct Transform { + matrx: vec4, + translate: vec2, +} + fn read_transform(transform_base: u32, ix: u32) -> Transform { let base = transform_base + ix * 6u; let c0 = bitcast(scene[base]); @@ -98,6 +191,10 @@ fn read_transform(transform_base: u32, ix: u32) -> Transform { return Transform(matrx, translate); } +fn transform_apply(transform: Transform, p: vec2) -> vec2 { + return transform.matrx.xy * p.x + transform.matrx.zw * p.y + transform.translate; +} + fn round_down(x: f32) -> i32 { return i32(floor(x)); } @@ -116,6 +213,7 @@ fn main( pathdata_base = config.pathdata_base; let shift = (ix & 3u) * 8u; var tm = reduce_tag(tag_word & ((1u << shift) - 1u)); + // TODO: this can be a read buf overflow. Conditionalize by tag byte? tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm); var tag_byte = (tag_word >> shift) & 0xffu; @@ -181,7 +279,7 @@ fn main( bbox += vec4(-stroke, stroke); } let flags = u32(linewidth >= 0.0); - cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags); + flatten_cubic(Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags)); // Update bounding box using atomics only. Computing a monoid is a // potential future optimization. 
if bbox.z > bbox.x || bbox.w > bbox.y { diff --git a/shader/path_coarse.wgsl b/shader/path_coarse.wgsl deleted file mode 100644 index 5f168a63c..000000000 --- a/shader/path_coarse.wgsl +++ /dev/null @@ -1,309 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -#import config -#import pathtag - -@group(0) @binding(0) -var config: Config; - -@group(0) @binding(1) -var scene: array; - -@group(0) @binding(2) -var tag_monoids: array; - -#ifdef cubics_out -@group(0) @binding(3) -var output: array>; -#else -// We don't get this from import as it's the atomic version -struct AtomicTile { - backdrop: atomic, - segments: atomic, -} - -#import segment - -@group(0) @binding(3) -var tiles: array; - -@group(0) @binding(4) -var segments: array; -#endif - -var pathdata_base: u32; - -fn read_f32_point(ix: u32) -> vec2 { - let x = bitcast(scene[pathdata_base + ix]); - let y = bitcast(scene[pathdata_base + ix + 1u]); - return vec2(x, y); -} - -fn read_i16_point(ix: u32) -> vec2 { - let raw = scene[pathdata_base + ix]; - let x = f32(i32(raw << 16u) >> 16u); - let y = f32(i32(raw) >> 16u); - return vec2(x, y); -} - -#ifndef cubics_out -struct SubdivResult { - val: f32, - a0: f32, - a2: f32, -} - -let D = 0.67; -fn approx_parabola_integral(x: f32) -> f32 { - return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x))); -} - -let B = 0.39; -fn approx_parabola_inv_integral(x: f32) -> f32 { - return x * sqrt(1.0 - B + (B * B + 0.5 * x * x)); -} - -fn estimate_subdiv(p0: vec2, p1: vec2, p2: vec2, sqrt_tol: f32) -> SubdivResult { - let d01 = p1 - p0; - let d12 = p2 - p1; - let dd = d01 - d12; - let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; - let cross_inv = 1.0 / cross; - let x0 = dot(d01, dd) * cross_inv; - let x2 = dot(d12, dd) * cross_inv; - let scale = abs(cross / (length(dd) * (x2 - x0))); - - let a0 = approx_parabola_integral(x0); - let a2 = approx_parabola_integral(x2); - var val = 0.0; - if scale < 1e9 { - let da = abs(a2 - a0); - let sqrt_scale = sqrt(scale); - if sign(x0) == sign(x2) { - val = sqrt_scale; - } else { - let xmin = sqrt_tol / sqrt_scale; - val = sqrt_tol / approx_parabola_integral(xmin); - } - val *= da; - } - return SubdivResult(val, a0, a2); -} - -fn eval_quad(p0: vec2, p1: vec2, p2: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; -} - -fn eval_cubic(p0: vec2, p1: vec2, p2: vec2, p3: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; -} - -fn alloc_segment() -> u32 { - // Use 0-index segment (address is sentinel) as counter - // TODO: separate small buffer binding for this? - return atomicAdd(&tiles[4096].segments, 1u) + 1u; -} -#endif - -let MAX_QUADS = 16u; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, -) { - // Obtain exclusive prefix sum of tag monoid - let ix = global_id.x; - let tag_word = scene[config.pathtag_base + (ix >> 2u)]; - pathdata_base = config.pathdata_base; - let shift = (ix & 3u) * 8u; - var tm = reduce_tag(tag_word & ((1u << shift) - 1u)); - tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm); - var tag_byte = (tag_word >> shift) & 0xffu; - // should be extractBits(tag_word, shift, 8)? 
- - // Decode path data - let seg_type = tag_byte & PATH_TAG_SEG_TYPE; - if seg_type != 0u { - var p0: vec2; - var p1: vec2; - var p2: vec2; - var p3: vec2; - if (tag_byte & PATH_TAG_F32) != 0u { - p0 = read_f32_point(tm.pathseg_offset); - p1 = read_f32_point(tm.pathseg_offset + 2u); - if seg_type >= PATH_TAG_QUADTO { - p2 = read_f32_point(tm.pathseg_offset + 4u); - if seg_type == PATH_TAG_CUBICTO { - p3 = read_f32_point(tm.pathseg_offset + 6u); - } - } - } else { - p0 = read_i16_point(tm.pathseg_offset); - p1 = read_i16_point(tm.pathseg_offset + 1u); - if seg_type >= PATH_TAG_QUADTO { - p2 = read_i16_point(tm.pathseg_offset + 2u); - if seg_type == PATH_TAG_CUBICTO { - p3 = read_i16_point(tm.pathseg_offset + 3u); - } - } - } - // TODO: transform goes here - // Degree-raise - if seg_type == PATH_TAG_LINETO { - p3 = p1; - p2 = mix(p3, p0, 1.0 / 3.0); - p1 = mix(p0, p3, 1.0 / 3.0); - } else if seg_type == PATH_TAG_QUADTO { - p3 = p2; - p2 = mix(p1, p2, 1.0 / 3.0); - p1 = mix(p1, p0, 1.0 / 3.0); - } -#ifdef cubics_out - let out_ix = ix * 4u; - output[out_ix] = p0; - output[out_ix + 1u] = p1; - output[out_ix + 2u] = p2; - output[out_ix + 3u] = p3; -#else - let err_v = 3.0 * (p2 - p1) + p0 - p3; - let err = dot(err_v, err_v); - let ACCURACY = 0.25; - let Q_ACCURACY = ACCURACY * 0.1; - let REM_ACCURACY = (ACCURACY - Q_ACCURACY); - let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY; - var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u); - n_quads = min(n_quads, MAX_QUADS); - var keep_params: array; - var val = 0.0; - var qp0 = p0; - let step = 1.0 / f32(n_quads); - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); - keep_params[i] = params; - val += params.val; - qp0 = qp2; - } - let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u); - var lp0 = p0; - qp0 = p0; - let v_step = val / f32(n); - var n_out = 1u; - var val_sum = 0.0; - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = keep_params[i]; - let u0 = approx_parabola_inv_integral(params.a0); - let u2 = approx_parabola_inv_integral(params.a2); - let uscale = 1.0 / (u2 - u0); - var val_target = f32(n_out) * v_step; - while n_out == n || val_target < val_sum + params.val { - var lp1: vec2; - if n_out == n { - lp1 = p3; - } else { - let u = (val_target - val_sum) / params.val; - let a = mix(params.a0, params.a2, u); - let au = approx_parabola_inv_integral(a); - let t = (au - u0) * uscale; - lp1 = eval_quad(qp0, qp1, qp2, t); - } - - // Output line segment lp0..lp1 - let xymin = min(lp0, lp1); - let xymax = max(lp0, lp1); - let dp = lp1 - lp0; - let recip_dx = 1.0 / dp.x; - let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9); - let c = 0.5 * abs(invslope); - let b = invslope; - let SX = 1.0 / f32(TILE_WIDTH); - let SY = 1.0 / f32(TILE_HEIGHT); - let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX; - var x0 = i32(floor(xymin.x * SX)); - var x1 = i32(floor(xymax.x * SX) + 1.0); - var y0 = i32(floor(xymin.y * SY)); - var y1 = i32(floor(xymax.y * SY) + 1.0); - x0 = clamp(x0, 0, i32(config.width_in_tiles)); - x1 = clamp(x1, 0, i32(config.width_in_tiles)); - y0 = clamp(y0, 0, i32(config.height_in_tiles)); - y1 = 
clamp(y1, 0, i32(config.height_in_tiles)); - var xc = a + b * f32(y0); - var xray = i32(floor(lp0.x * SX)); - var last_xray = i32(floor(lp1.x * SX)); - if dp.y < 0.0 { - let tmp = xray; - xray = last_xray; - last_xray = tmp; - } - for (var y = y0; y < y1; y += 1) { - let tile_y0 = f32(y) * f32(TILE_HEIGHT); - let xbackdrop = max(xray + 1, 0); - if xymin.y < tile_y0 && xbackdrop < i32(config.width_in_tiles) { - let backdrop = select(-1, 1, dp.y < 0.0); - let tile_ix = y * i32(config.width_in_tiles) + xbackdrop; - atomicAdd(&tiles[tile_ix].backdrop, backdrop); - } - var next_xray = last_xray; - if y + 1 < y1 { - let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT); - let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope; - next_xray = i32(floor(x_edge * SX)); - } - let min_xray = min(xray, next_xray); - let max_xray = max(xray, next_xray); - var xx0 = min(i32(floor(xc - c)), min_xray); - var xx1 = max(i32(ceil(xc + c)), max_xray + 1); - xx0 = clamp(xx0, x0, x1); - xx1 = clamp(xx1, x0, x1); - var tile_seg: Segment; - for (var x = xx0; x < xx1; x += 1) { - let tile_x0 = f32(x) * f32(TILE_WIDTH); - let tile_ix = y * i32(config.width_in_tiles) + x; - // allocate segment, insert linked list - let seg_ix = alloc_segment(); - let old = atomicExchange(&tiles[tile_ix].segments, seg_ix); - tile_seg.origin = lp0; - tile_seg.delta = dp; - var y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx); - if xymin.x < tile_x0 { - let p = vec2(tile_x0, y_edge); - if dp.x < 0.0 { - tile_seg.delta = p - lp0; - } else { - tile_seg.origin = p; - tile_seg.delta = lp1 - p; - } - if tile_seg.delta.x == 0.0 { - tile_seg.delta.x = sign(dp.x) * 1e-9; - } - } - if x <= min_xray || max_xray < x { - y_edge = 1e9; - } - tile_seg.y_edge = y_edge; - tile_seg.next = old; - segments[seg_ix] = tile_seg; - } - xc += b; - xray = next_xray; - } - n_out += 1u; - val_target += v_step; - lp0 = lp1; - } - val_sum += params.val; - qp0 = qp2; - } -#endif - } -} diff --git a/shader/path_coarse_full.wgsl b/shader/path_coarse_full.wgsl deleted file mode 100644 index cf03301b6..000000000 --- a/shader/path_coarse_full.wgsl +++ /dev/null @@ -1,273 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -// Path coarse rasterization for the full implementation. 
- -#import config -#import pathtag -#import tile -#import segment -#import cubic -#import bump - -@group(0) @binding(0) -var config: Config; - -@group(0) @binding(1) -var scene: array; - -@group(0) @binding(2) -var cubics: array; - -@group(0) @binding(3) -var paths: array; - -// We don't get this from import as it's the atomic version -struct AtomicTile { - backdrop: atomic, - segments: atomic, -} - -@group(0) @binding(4) -var bump: BumpAllocators; - -@group(0) @binding(5) -var tiles: array; - -@group(0) @binding(6) -var segments: array; - -struct SubdivResult { - val: f32, - a0: f32, - a2: f32, -} - -let D = 0.67; -fn approx_parabola_integral(x: f32) -> f32 { - return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x))); -} - -let B = 0.39; -fn approx_parabola_inv_integral(x: f32) -> f32 { - return x * sqrt(1.0 - B + (B * B + 0.5 * x * x)); -} - -fn estimate_subdiv(p0: vec2, p1: vec2, p2: vec2, sqrt_tol: f32) -> SubdivResult { - let d01 = p1 - p0; - let d12 = p2 - p1; - let dd = d01 - d12; - let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; - let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9); - let x0 = dot(d01, dd) * cross_inv; - let x2 = dot(d12, dd) * cross_inv; - let scale = abs(cross / (length(dd) * (x2 - x0))); - - let a0 = approx_parabola_integral(x0); - let a2 = approx_parabola_integral(x2); - var val = 0.0; - if scale < 1e9 { - let da = abs(a2 - a0); - let sqrt_scale = sqrt(scale); - if sign(x0) == sign(x2) { - val = sqrt_scale; - } else { - let xmin = sqrt_tol / sqrt_scale; - val = sqrt_tol / approx_parabola_integral(xmin); - } - val *= da; - } - return SubdivResult(val, a0, a2); -} - -fn eval_quad(p0: vec2, p1: vec2, p2: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; -} - -fn eval_cubic(p0: vec2, p1: vec2, p2: vec2, p3: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; -} - -fn alloc_segment() -> u32 { - var offset = atomicAdd(&bump.segments, 1u) + 1u; - if offset + 1u > config.segments_size { - offset = 0u; - atomicOr(&bump.failed, STAGE_PATH_COARSE); - } - return offset; -} - -let MAX_QUADS = 16u; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, -) { - // Exit early if prior stages failed, as we can't run this stage. - // We need to check only prior stages, as if this stage has failed in another workgroup, - // we still want to know this workgroup's memory requirement. - if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u { - return; - } - let ix = global_id.x; - let tag_word = scene[config.pathtag_base + (ix >> 2u)]; - let shift = (ix & 3u) * 8u; - var tag_byte = (tag_word >> shift) & 0xffu; - - if (tag_byte & PATH_TAG_SEG_TYPE) != 0u { - // Discussion question: it might actually be cheaper to do the path segment - // decoding & transform again rather than store the result in a buffer; - // classic memory vs ALU tradeoff. 
- let cubic = cubics[global_id.x]; - let path = paths[cubic.path_ix]; - let is_stroke = (cubic.flags & CUBIC_IS_STROKE) != 0u; - let bbox = vec4(path.bbox); - let p0 = cubic.p0; - let p1 = cubic.p1; - let p2 = cubic.p2; - let p3 = cubic.p3; - let err_v = 3.0 * (p2 - p1) + p0 - p3; - let err = dot(err_v, err_v); - let ACCURACY = 0.25; - let Q_ACCURACY = ACCURACY * 0.1; - let REM_ACCURACY = (ACCURACY - Q_ACCURACY); - let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY; - var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u); - n_quads = min(n_quads, MAX_QUADS); - var keep_params: array; - var val = 0.0; - var qp0 = p0; - let step = 1.0 / f32(n_quads); - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); - keep_params[i] = params; - val += params.val; - qp0 = qp2; - } - let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u); - var lp0 = p0; - qp0 = p0; - let v_step = val / f32(n); - var n_out = 1u; - var val_sum = 0.0; - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = keep_params[i]; - let u0 = approx_parabola_inv_integral(params.a0); - let u2 = approx_parabola_inv_integral(params.a2); - let uscale = 1.0 / (u2 - u0); - var val_target = f32(n_out) * v_step; - while n_out == n || val_target < val_sum + params.val { - var lp1: vec2; - if n_out == n { - lp1 = p3; - } else { - let u = (val_target - val_sum) / params.val; - let a = mix(params.a0, params.a2, u); - let au = approx_parabola_inv_integral(a); - let t = (au - u0) * uscale; - lp1 = eval_quad(qp0, qp1, qp2, t); - } - - // Output line segment lp0..lp1 - let xymin = min(lp0, lp1) - cubic.stroke; - let xymax = max(lp0, lp1) + cubic.stroke; - let dp = lp1 - lp0; - let recip_dx = 1.0 / dp.x; - let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9); - let SX = 1.0 / f32(TILE_WIDTH); - let SY = 1.0 / f32(TILE_HEIGHT); - let c = (cubic.stroke.x + abs(invslope) * (0.5 * f32(TILE_HEIGHT) + cubic.stroke.y)) * SX; - let b = invslope; - let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX; - var x0 = i32(floor(xymin.x * SX)); - var x1 = i32(floor(xymax.x * SX) + 1.0); - var y0 = i32(floor(xymin.y * SY)); - var y1 = i32(floor(xymax.y * SY) + 1.0); - x0 = clamp(x0, bbox.x, bbox.z); - x1 = clamp(x1, bbox.x, bbox.z); - y0 = clamp(y0, bbox.y, bbox.w); - y1 = clamp(y1, bbox.y, bbox.w); - var xc = a + b * f32(y0); - let stride = bbox.z - bbox.x; - var base = i32(path.tiles) + (y0 - bbox.y) * stride - bbox.x; - var xray = i32(floor(lp0.x * SX)); - var last_xray = i32(floor(lp1.x * SX)); - if dp.y < 0.0 { - let tmp = xray; - xray = last_xray; - last_xray = tmp; - } - for (var y = y0; y < y1; y += 1) { - let tile_y0 = f32(y) * f32(TILE_HEIGHT); - let xbackdrop = max(xray + 1, bbox.x); - if !is_stroke && xymin.y < tile_y0 && xbackdrop < bbox.z { - let backdrop = select(-1, 1, dp.y < 0.0); - let tile_ix = base + xbackdrop; - atomicAdd(&tiles[tile_ix].backdrop, backdrop); - } - var next_xray = last_xray; - if y + 1 < y1 { - let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT); - let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope; - next_xray = i32(floor(x_edge * SX)); - } - let min_xray = min(xray, next_xray); - let max_xray = max(xray, 
next_xray); - var xx0 = min(i32(floor(xc - c)), min_xray); - var xx1 = max(i32(ceil(xc + c)), max_xray + 1); - xx0 = clamp(xx0, x0, x1); - xx1 = clamp(xx1, x0, x1); - var tile_seg: Segment; - for (var x = xx0; x < xx1; x += 1) { - let tile_x0 = f32(x) * f32(TILE_WIDTH); - let tile_ix = base + x; - // allocate segment, insert linked list - let seg_ix = alloc_segment(); - let old = atomicExchange(&tiles[tile_ix].segments, seg_ix); - tile_seg.origin = lp0; - tile_seg.delta = dp; - var y_edge = 0.0; - if !is_stroke { - y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx); - if xymin.x < tile_x0 { - let p = vec2(tile_x0, y_edge); - if dp.x < 0.0 { - tile_seg.delta = p - lp0; - } else { - tile_seg.origin = p; - tile_seg.delta = lp1 - p; - } - if tile_seg.delta.x == 0.0 { - tile_seg.delta.x = sign(dp.x) * 1e-9; - } - } - if x <= min_xray || max_xray < x { - y_edge = 1e9; - } - } - tile_seg.y_edge = y_edge; - tile_seg.next = old; - segments[seg_ix] = tile_seg; - } - xc += b; - base += stride; - xray = next_xray; - } - n_out += 1u; - val_target += v_step; - lp0 = lp1; - } - val_sum += params.val; - qp0 = qp2; - } - } -} diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl new file mode 100644 index 000000000..3cf4e4c02 --- /dev/null +++ b/shader/path_count.wgsl @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Stage to compute counts of number of segments in each tile + +#import bump +#import config +#import segment +#import tile + +@group(0) @binding(0) +var config: Config; + +// TODO: this is cut'n'pasted from path_coarse. +struct AtomicTile { + backdrop: atomic, + // This is "segments" but we're renaming + count: atomic, +} + +@group(0) @binding(1) +var bump: BumpAllocators; + +@group(0) @binding(2) +var lines: array; + +@group(0) @binding(3) +var paths: array; + +@group(0) @binding(4) +var tile: array; + +@group(0) @binding(5) +var seg_counts: array; + +// number of integer cells spanned by interval defined by a, b +fn span(a: f32, b: f32) -> u32 { + return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0)); +} + +// Note regarding clipping to bounding box: +// +// We have to do the backdrop bumps for all tiles to the left of the bbox. +// This should probably be a separate loop. This also causes a numerical +// robustness issue. + +// This shader is dispatched with one thread for each line. +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, +) { + let n_lines = atomicLoad(&bump.lines); + var count = 0u; + if global_id.x < n_lines { + let line = lines[global_id.x]; + // coarse rasterization logic to count number of tiles touched by line + let is_down = line.p1.y >= line.p0.y; + let xy0 = select(line.p1, line.p0, is_down); + let xy1 = select(line.p0, line.p1, is_down); + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + // TODO: clip line to bounding box + count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u; + let line_ix = global_id.x; + + let dx = abs(s1.x - s0.x); + let dy = s1.y - s0.y; + if dx + dy == 0.0 { + // Zero-length segment, drop it. Note, this should be culled earlier. 
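+            // (Dropping it here also protects path_tiling, which divides by
+            // dx + dy when it recomputes the line parameters.)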
+ return; + } + if dy == 0.0 && floor(s0.y) == s0.y { + return; + } + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = select(-1.0, 1.0, is_positive_slope); + let xt0 = floor(s0.x * sign); + let c = s0.x * sign - xt0; + let y0 = floor(s0.y); + let ytop = select(y0 + 1.0, ceil(s0.y), s0.y == s1.y); + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0 = xt0 * sign + select(-1.0, 0.0, is_positive_slope); + + let path = paths[line.path_ix]; + let bbox = vec4(path.bbox); + let xmin = min(s0.x, s1.x); + // If line is to left of bbox, we may still need to do backdrop + let stride = bbox.z - bbox.x; + if s0.y >= f32(bbox.w) || s1.y <= f32(bbox.y) || xmin >= f32(bbox.z) || stride == 0 { + return; + } + // Clip line to bounding box. Clipping is done in "i" space. + var imin = 0u; + if s0.y < f32(bbox.y) { + var iminf = round((f32(bbox.y) - y0 + b - a) / (1.0 - a)) - 1.0; + // Numerical robustness: goal is to find the first i value for which + // the following predicate is false. Above formula is designed to + // undershoot by 0.5. + if y0 + iminf - floor(a * iminf + b) < f32(bbox.y) { + iminf += 1.0; + } + imin = u32(iminf); + } + var imax = count; + if s1.y > f32(bbox.w) { + var imaxf = round((f32(bbox.w) - y0 + b - a) / (1.0 - a)) - 1.0; + if y0 + imaxf - floor(a * imaxf + b) < f32(bbox.w) { + imaxf += 1.0; + } + imax = u32(imaxf); + } + let delta = select(1, -1, is_down); + var ymin = 0; + var ymax = 0; + if max(s0.x, s1.x) <= f32(bbox.x) { + ymin = i32(ceil(s0.y)); + ymax = i32(ceil(s1.y)); + imax = imin; + } else { + let fudge = select(1.0, 0.0, is_positive_slope); + if xmin < f32(bbox.x) { + var f = round((sign * (f32(bbox.x) - x0) - b + fudge) / a); + if (x0 + sign * floor(a * f + b) < f32(bbox.x)) == is_positive_slope { + f += 1.0; + } + let ynext = i32(y0 + f - floor(a * f + b) + 1.0); + if is_positive_slope { + if u32(f) > imin { + ymin = i32(y0 + select(1.0, 0.0, y0 == s0.y)); + ymax = ynext; + imin = u32(f); + } + } else { + if u32(f) < imax { + ymin = ynext; + ymax = i32(ceil(s1.y)); + imax = u32(f); + } + } + } + if max(s0.x, s1.x) > f32(bbox.z) { + var f = round((sign * (f32(bbox.z) - x0) - b + fudge) / a); + if (x0 + sign * floor(a * f + b) < f32(bbox.z)) == is_positive_slope { + f += 1.0; + } + if is_positive_slope { + imax = min(imax, u32(f)); + } else { + imin = max(imin, u32(f)); + } + } + } + imax = max(imin, imax); + // Apply backdrop for part of line left of bbox + ymin = max(ymin, bbox.y); + ymax = min(ymax, bbox.w); + for (var y = ymin; y < ymax; y++) { + let base = i32(path.tiles) + (y - bbox.y) * stride; + atomicAdd(&tile[base].backdrop, delta); + } + var last_z = floor(a * (f32(imin) - 1.0) + b); + let seg_base = atomicAdd(&bump.seg_counts, imax - imin); + for (var i = imin; i < imax; i++) { + let subix = i; + // coarse rasterization logic + // Note: we hope fast-math doesn't strength reduce this. 
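+            // (An iterated z += a would accumulate rounding error;
+            // path_tiling evaluates floor(a * i + b) for the same i, and the
+            // two computations must agree exactly for segments to land in
+            // the right tiles.)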
+ let zf = a * f32(subix) + b; + let z = floor(zf); + // x, y are tile coordinates relative to render target + let y = i32(y0 + f32(subix) - z); + let x = i32(x0 + sign * z); + let base = i32(path.tiles) + (y - bbox.y) * stride - bbox.x; + let top_edge = select(last_z == z, y0 == s0.y, subix == 0u); + if top_edge && x + 1 < bbox.z { + let x_bump = max(x + 1, bbox.x); + atomicAdd(&tile[base + x_bump].backdrop, delta); + } + let seg_within_slice = atomicAdd(&tile[base + x].count, 1u); + // Pack two count values into a single u32 + let counts = (seg_within_slice << 16u) | subix; + let seg_count = SegmentCount(line_ix, counts); + seg_counts[seg_base + i - imin] = seg_count; + // Note: since we're iterating, we have a reliable value for + // last_z. + last_z = z; + } + } +} diff --git a/shader/path_count_setup.wgsl b/shader/path_count_setup.wgsl new file mode 100644 index 000000000..95af29c38 --- /dev/null +++ b/shader/path_count_setup.wgsl @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Set up dispatch size for path count stage. + +#import bump + +@group(0) @binding(0) +var bump: BumpAllocators; + +@group(0) @binding(1) +var indirect: IndirectCount; + +// Partition size for path count stage +let WG_SIZE = 256u; + +@compute @workgroup_size(1) +fn main() { + let lines = atomicLoad(&bump.lines); + indirect.count_x = (lines + (WG_SIZE - 1u)) / WG_SIZE; + indirect.count_y = 1u; + indirect.count_z = 1u; +} diff --git a/shader/path_tiling.wgsl b/shader/path_tiling.wgsl new file mode 100644 index 000000000..444974e6a --- /dev/null +++ b/shader/path_tiling.wgsl @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Write path segments + +#import bump +#import config +#import segment +#import tile + +@group(0) @binding(0) +var bump: BumpAllocators; + +@group(0) @binding(1) +var seg_counts: array; + +@group(0) @binding(2) +var lines: array; + +@group(0) @binding(3) +var paths: array; + +@group(0) @binding(4) +var tiles: array; + +@group(0) @binding(5) +var segments: array; + +fn span(a: f32, b: f32) -> u32 { + return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0)); +} + +// One invocation for each tile that is to be written. 
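+// (More precisely, one invocation per SegmentCount entry recorded by
+// path_count; a tile crossed by several lines gets one invocation, and one
+// output segment, per crossing.)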
+// Total number of invocations = bump.seg_counts +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, +) { + let n_segments = atomicLoad(&bump.seg_counts); + if global_id.x < n_segments { + let seg_count = seg_counts[global_id.x]; + let line = lines[seg_count.line_ix]; + let counts = seg_count.counts; + let seg_within_slice = counts >> 16u; + let seg_within_line = counts & 0xffffu; + + // coarse rasterization logic + let is_down = line.p1.y >= line.p0.y; + var xy0 = select(line.p1, line.p0, is_down); + var xy1 = select(line.p0, line.p1, is_down); + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u; + let dx = abs(s1.x - s0.x); + let dy = s1.y - s0.y; + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = select(-1.0, 1.0, is_positive_slope); + let xt0 = floor(s0.x * sign); + let c = s0.x * sign - xt0; + let y0i = floor(s0.y); + let ytop = select(y0i + 1.0, ceil(s0.y), s0.y == s1.y); + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0)); + let z = floor(a * f32(seg_within_line) + b); + let x = x0i + i32(sign * z); + let y = i32(y0i + f32(seg_within_line) - z); + + let path = paths[line.path_ix]; + let bbox = vec4(path.bbox); + let stride = bbox.z - bbox.x; + let tile_ix = i32(path.tiles) + (y - bbox.y) * stride + x - bbox.x; + let tile = tiles[tile_ix]; + let seg_start = ~tile.segments; + if i32(seg_start) < 0 { + return; + } + let tile_xy = vec2(f32(x) * f32(TILE_WIDTH), f32(y) * f32(TILE_HEIGHT)); + let tile_xy1 = tile_xy + vec2(f32(TILE_WIDTH), f32(TILE_HEIGHT)); + + if seg_within_line > 0u { + let z_prev = floor(a * (f32(seg_within_line) - 1.0) + b); + if z == z_prev { + // Top edge is clipped + var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y); + // TODO: we want to switch to tile-relative coordinates + xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x); + xy0 = vec2(xt, tile_xy.y); + } else { + // If is_positive_slope, left edge is clipped, otherwise right + let x_clip = select(tile_xy1.x, tile_xy.x, is_positive_slope); + var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y); + xy0 = vec2(x_clip, yt); + } + } + if seg_within_line < count - 1u { + let z_next = floor(a * (f32(seg_within_line) + 1.0) + b); + if z == z_next { + // Bottom edge is clipped + var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y); + xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x); + xy1 = vec2(xt, tile_xy1.y); + } else { + // If is_positive_slope, right edge is clipped, otherwise left + let x_clip = select(tile_xy.x, tile_xy1.x, is_positive_slope); + var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y); + xy1 = vec2(x_clip, yt); + } + } + // See comments in CPU version of shader + var y_edge = 1e9; + if xy0.x == tile_xy.x { + y_edge = xy0.y; + } else if xy1.x == tile_xy.x { + y_edge = xy1.y; + } + if !is_down { + let tmp = xy0; + xy0 = xy1; + xy1 = tmp; + } + let segment = Segment(xy0, xy1 - xy0, y_edge); + segments[seg_start + seg_within_slice] = segment; + } +} diff --git a/shader/path_tiling_setup.wgsl b/shader/path_tiling_setup.wgsl new file mode 100644 index 000000000..722be1914 --- /dev/null +++ b/shader/path_tiling_setup.wgsl @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Set up dispatch size for path tiling stage. 
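+//
+// Like path_count_setup, this runs as a single thread because the element
+// count (bump.seg_counts) is known only on the GPU; it is rounded up to
+// workgroup granularity and written to an IndirectCount buffer consumed by
+// the indirect dispatch.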
+ +#import bump + +@group(0) @binding(0) +var bump: BumpAllocators; + +@group(0) @binding(1) +var indirect: IndirectCount; + +// Partition size for path tiling stage +let WG_SIZE = 256u; + +@compute @workgroup_size(1) +fn main() { + let segments = atomicLoad(&bump.seg_counts); + indirect.count_x = (segments + (WG_SIZE - 1u)) / WG_SIZE; + indirect.count_y = 1u; + indirect.count_z = 1u; +} diff --git a/shader/shared/bump.wgsl b/shader/shared/bump.wgsl index 4864f3139..6c5789f15 100644 --- a/shader/shared/bump.wgsl +++ b/shader/shared/bump.wgsl @@ -6,13 +6,21 @@ let STAGE_TILE_ALLOC: u32 = 0x2u; let STAGE_PATH_COARSE: u32 = 0x4u; let STAGE_COARSE: u32 = 0x8u; -// This must be kept in sync with the struct in src/render.rs +// This must be kept in sync with the struct in config.rs in the encoding crate. struct BumpAllocators { // Bitmask of stages that have failed allocation. failed: atomic, binning: atomic, ptcl: atomic, tile: atomic, + seg_counts: atomic, segments: atomic, blend: atomic, + lines: atomic, +} + +struct IndirectCount { + count_x: u32, + count_y: u32, + count_z: u32, } diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl index 4ac718c3e..1e553f9ae 100644 --- a/shader/shared/config.wgsl +++ b/shader/shared/config.wgsl @@ -48,6 +48,9 @@ let N_TILE_Y = 16u; //let N_TILE = N_TILE_X * N_TILE_Y; let N_TILE = 256u; +// Not currently supporting non-square tiles +let TILE_SCALE = 0.0625; + let BLEND_STACK_SPLIT = 4u; // The following are computed in draw_leaf from the generic gradient parameters diff --git a/shader/shared/ptcl.wgsl b/shader/shared/ptcl.wgsl index d5fc5d4fd..cf759a5b4 100644 --- a/shader/shared/ptcl.wgsl +++ b/shader/shared/ptcl.wgsl @@ -25,7 +25,8 @@ let CMD_JUMP = 11u; // hand in the relevant shaders struct CmdFill { - tile: u32, + size_and_rule: u32, + seg_data: u32, backdrop: i32, } diff --git a/shader/shared/segment.wgsl b/shader/shared/segment.wgsl index 7d0359262..9fe134478 100644 --- a/shader/shared/segment.wgsl +++ b/shader/shared/segment.wgsl @@ -1,8 +1,33 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense +// Segments laid out for contiguous storage struct Segment { origin: vec2, delta: vec2, y_edge: f32, - next: u32, +} + +// A line segment produced by flattening and ready for rasterization. +// +// The name is perhaps too playful, but reflects the fact that these +// lines are completely unordered. They will flow through coarse path +// rasterization, then the per-tile segments will be scatter-written into +// segment storage so that each (tile, path) tuple gets a contiguous +// slice of segments. +struct LineSoup { + path_ix: u32, + // Note: this creates an alignment gap. Don't worry about + // this now, but maybe move to scalars. + p0: vec2, + p1: vec2, +} + +// An intermediate data structure for sorting tile segments. 
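+//
+// One entry is written by path_count for each (line, tile) crossing, and
+// path_tiling later consumes each entry, recomputing the crossing geometry
+// and writing the clipped segment into its tile's contiguous slice.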
+struct SegmentCount { + // Reference to element of LineSoup array + line_ix: u32, + // Two count values packed into a single u32 + // Lower 16 bits: index of segment within line + // Upper 16 bits: index of segment within segment slice + counts: u32, } diff --git a/src/lib.rs b/src/lib.rs index f45bdcf2b..55147acd8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -251,7 +251,9 @@ impl Renderer { ) -> Result> { let mut render = Render::new(); let encoding = scene.data(); - let recording = render.render_encoding_coarse(encoding, &self.shaders, params, true); + // TODO: turn this on; the download feature interacts with CPU dispatch + let robust = false; + let recording = render.render_encoding_coarse(encoding, &self.shaders, params, robust); let target = render.out_image(); let bump_buf = render.bump_buf(); self.engine.run_recording( diff --git a/src/render.rs b/src/render.rs index 03e648446..462563633 100644 --- a/src/render.rs +++ b/src/render.rs @@ -190,17 +190,21 @@ impl Render { wg_counts.bbox_clear, [config_buf, path_bbox_buf], ); - let cubic_buf = - ResourceProxy::new_buf(buffer_sizes.cubics.size_in_bytes().into(), "cubic_buf"); + let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); + recording.clear_all(bump_buf); + let bump_buf = ResourceProxy::Buf(bump_buf); + let lines_buf = + ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf"); recording.dispatch( - shaders.pathseg, - wg_counts.path_seg, + shaders.flatten, + wg_counts.flatten, [ config_buf, scene_buf, tagmonoid_buf, path_bbox_buf, - cubic_buf, + bump_buf, + lines_buf, ], ); let draw_reduced_buf = ResourceProxy::new_buf( @@ -273,13 +277,10 @@ impl Render { buffer_sizes.draw_bboxes.size_in_bytes().into(), "draw_bbox_buf", ); - let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); let bin_header_buf = ResourceProxy::new_buf( buffer_sizes.bin_headers.size_in_bytes().into(), "bin_header_buf", ); - recording.clear_all(bump_buf); - let bump_buf = ResourceProxy::Buf(bump_buf); recording.dispatch( shaders.binning, wg_counts.binning, @@ -314,21 +315,33 @@ impl Render { ], ); recording.free_resource(draw_bbox_buf); + recording.free_resource(tagmonoid_buf); + let indirect_count_buf = BufProxy::new( + buffer_sizes.indirect_count.size_in_bytes().into(), + "indirect_count", + ); recording.dispatch( - shaders.path_coarse, - wg_counts.path_coarse, + shaders.path_count_setup, + (1, 1, 1), + [bump_buf, indirect_count_buf.into()], + ); + let seg_counts_buf = ResourceProxy::new_buf( + buffer_sizes.seg_counts.size_in_bytes().into(), + "seg_counts_buf", + ); + recording.dispatch_indirect( + shaders.path_count, + indirect_count_buf, + 0, [ config_buf, - scene_buf, - cubic_buf, - path_buf, bump_buf, + lines_buf, + path_buf, tile_buf, - segments_buf, + seg_counts_buf, ], ); - recording.free_resource(tagmonoid_buf); - recording.free_resource(cubic_buf); recording.dispatch( shaders.backdrop, wg_counts.backdrop, @@ -349,6 +362,27 @@ impl Render { ptcl_buf, ], ); + recording.dispatch( + shaders.path_tiling_setup, + (1, 1, 1), + [bump_buf, indirect_count_buf.into()], + ); + recording.dispatch_indirect( + shaders.path_tiling, + indirect_count_buf, + 0, + [ + bump_buf, + seg_counts_buf, + lines_buf, + path_buf, + tile_buf, + segments_buf, + ], + ); + recording.free_buf(indirect_count_buf); + recording.free_resource(seg_counts_buf); + recording.free_resource(lines_buf); recording.free_resource(scene_buf); recording.free_resource(draw_monoid_buf); 
recording.free_resource(bin_header_buf); diff --git a/src/shaders.rs b/src/shaders.rs index b55f158c1..23a3950f3 100644 --- a/src/shaders.rs +++ b/src/shaders.rs @@ -65,16 +65,19 @@ pub struct FullShaders { pub pathtag_scan: ShaderId, pub pathtag_scan_large: ShaderId, pub bbox_clear: ShaderId, - pub pathseg: ShaderId, + pub flatten: ShaderId, pub draw_reduce: ShaderId, pub draw_leaf: ShaderId, pub clip_reduce: ShaderId, pub clip_leaf: ShaderId, pub binning: ShaderId, pub tile_alloc: ShaderId, - pub path_coarse: ShaderId, pub backdrop: ShaderId, + pub path_count_setup: ShaderId, + pub path_count: ShaderId, pub coarse: ShaderId, + pub path_tiling_setup: ShaderId, + pub path_tiling: ShaderId, pub fine: ShaderId, } @@ -147,16 +150,17 @@ pub fn full_shaders( preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(), &[BindType::Uniform, BindType::Buffer], )?; - let pathseg = engine.add_shader( + let flatten = engine.add_shader( device, - "pathseg", - preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(), + "flatten", + preprocess::preprocess(shader!("flatten"), &full_config, &imports).into(), &[ BindType::Uniform, BindType::BufReadOnly, BindType::BufReadOnly, BindType::Buffer, BindType::Buffer, + BindType::Buffer, ], )?; let draw_reduce = engine.add_shader( @@ -232,17 +236,21 @@ pub fn full_shaders( BindType::Buffer, ], )?; - - let path_coarse = engine.add_shader( + let path_count_setup = engine.add_shader( + device, + "path_count_setup", + preprocess::preprocess(shader!("path_count_setup"), &empty, &imports).into(), + &[BindType::BufReadOnly, BindType::Buffer], + )?; + let path_count = engine.add_shader( device, - "path_coarse_full", - preprocess::preprocess(shader!("path_coarse_full"), &full_config, &imports).into(), + "path_count", + preprocess::preprocess(shader!("path_count"), &full_config, &imports).into(), &[ BindType::Uniform, + BindType::Buffer, BindType::BufReadOnly, BindType::BufReadOnly, - BindType::BufReadOnly, - BindType::Buffer, BindType::Buffer, BindType::Buffer, ], @@ -264,9 +272,28 @@ pub fn full_shaders( BindType::BufReadOnly, BindType::BufReadOnly, BindType::BufReadOnly, - BindType::BufReadOnly, BindType::Buffer, BindType::Buffer, + BindType::Buffer, + ], + )?; + let path_tiling_setup = engine.add_shader( + device, + "path_tiling_setup", + preprocess::preprocess(shader!("path_tiling_setup"), &empty, &imports).into(), + &[BindType::BufReadOnly, BindType::Buffer], + )?; + let path_tiling = engine.add_shader( + device, + "path_tiling", + preprocess::preprocess(shader!("path_tiling"), &empty, &imports).into(), + &[ + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Buffer, ], )?; let fine = engine.add_shader( @@ -290,16 +317,19 @@ pub fn full_shaders( pathtag_scan1, pathtag_scan_large, bbox_clear, - pathseg, + flatten, draw_reduce, draw_leaf, clip_reduce, clip_leaf, binning, tile_alloc, - path_coarse, + path_count_setup, + path_count, backdrop, coarse, + path_tiling_setup, + path_tiling, fine, }) } From 79d4fd92c5a8834d0167cefd9b48607d9ba4b805 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 6 Oct 2023 15:21:49 -0700 Subject: [PATCH 2/3] Address review feedback Mostly comments, but also a little cleanup of function signatures in fine. 
---
 crates/encoding/src/config.rs |  8 ++++----
 shader/coarse.wgsl            |  8 ++++----
 shader/fine.wgsl              | 12 ++++++------
 shader/flatten.wgsl           | 10 +++++-----
 shader/path_count.wgsl        | 10 +++++-----
 shader/path_tiling.wgsl       |  4 +++-
 shader/shared/tile.wgsl       |  7 ++++++-
 7 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/crates/encoding/src/config.rs b/crates/encoding/src/config.rs
index d853a0808..6f24925b8 100644
--- a/crates/encoding/src/config.rs
+++ b/crates/encoding/src/config.rs
@@ -16,7 +16,7 @@ const TILE_HEIGHT: u32 = 16;
 // TODO: Obtain these from the vello_shaders crate
 pub(crate) const PATH_REDUCE_WG: u32 = 256;
 const PATH_BBOX_WG: u32 = 256;
-const PATH_COARSE_WG: u32 = 256;
+const FLATTEN_WG: u32 = 256;
 const CLIP_REDUCE_WG: u32 = 256;
 
 /// Counters for tracking dynamic allocation on the GPU.
@@ -164,7 +164,7 @@ impl WorkgroupCounts {
             path_tag_wgs
         };
         let draw_object_wgs = (n_draw_objects + PATH_BBOX_WG - 1) / PATH_BBOX_WG;
-        let path_coarse_wgs = (n_path_tags + PATH_COARSE_WG - 1) / PATH_COARSE_WG;
+        let flatten_wgs = (n_path_tags + FLATTEN_WG - 1) / FLATTEN_WG;
         let clip_reduce_wgs = n_clips.saturating_sub(1) / CLIP_REDUCE_WG;
         let clip_wgs = (n_clips + CLIP_REDUCE_WG - 1) / CLIP_REDUCE_WG;
         let path_wgs = (n_paths + PATH_BBOX_WG - 1) / PATH_BBOX_WG;
@@ -177,14 +177,14 @@
             path_scan1: (reduced_size / PATH_REDUCE_WG, 1, 1),
             path_scan: (path_tag_wgs, 1, 1),
             bbox_clear: (draw_object_wgs, 1, 1),
-            flatten: (path_coarse_wgs, 1, 1),
+            flatten: (flatten_wgs, 1, 1),
             draw_reduce: (draw_object_wgs, 1, 1),
             draw_leaf: (draw_object_wgs, 1, 1),
             clip_reduce: (clip_reduce_wgs, 1, 1),
             clip_leaf: (clip_wgs, 1, 1),
             binning: (draw_object_wgs, 1, 1),
             tile_alloc: (path_wgs, 1, 1),
-            path_coarse: (path_coarse_wgs, 1, 1),
+            path_coarse: (flatten_wgs, 1, 1),
             backdrop: (path_wgs, 1, 1),
             coarse: (width_in_bins, height_in_bins, 1),
             fine: (width_in_tiles, height_in_tiles, 1),
diff --git a/shader/coarse.wgsl b/shader/coarse.wgsl
index 2d324c798..0635443fe 100644
--- a/shader/coarse.wgsl
+++ b/shader/coarse.wgsl
@@ -87,10 +87,10 @@ fn write_path(tile: Tile, tile_ix: u32, linewidth: f32) -> bool {
     // We overload the "segments" field to store both count (written by
     // path_count stage) and segment allocation (used by path_tiling and
     // fine).
-    let n_segs = tile.segments;
+    let n_segs = tile.segment_count_or_ix;
     if n_segs != 0u {
         var seg_ix = atomicAdd(&bump.segments, n_segs);
-        tiles[tile_ix].segments = ~seg_ix;
+        tiles[tile_ix].segment_count_or_ix = ~seg_ix;
         alloc_cmd(4u);
         ptcl[cmd_offset] = CMD_FILL;
         let size_and_rule = (n_segs << 1u) | u32(even_odd);
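
The overloading described in the comment above amounts to an enum packed into a u32. A Rust sketch of the convention (illustrative only; the shaders manipulate the raw bits directly):

    // Before coarse rasterization the field holds a segment count; coarse
    // replaces it with the bitwise inverse of the tile's start index in
    // the contiguous segment slice. Because counts are small, the
    // inversion sets the high bit, which is how path_tiling tells the two
    // variants apart: it re-inverts and treats a negative i32 result as
    // "no slice was allocated for this tile".
    enum SegmentCountOrIx {
        Count(u32),
        SliceStart(u32),
    }

    fn pack(value: SegmentCountOrIx) -> u32 {
        match value {
            SegmentCountOrIx::Count(n) => n,
            SegmentCountOrIx::SliceStart(ix) => !ix,
        }
    }
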
@@ -310,7 +310,7 @@ fn main(
                     let blend = scene[dd];
                     is_blend = blend != BLEND_CLIP;
                 }
-                let include_tile = tile.segments != 0u || (tile.backdrop == 0) == is_clip || is_blend;
+                let include_tile = tile.segment_count_or_ix != 0u || (tile.backdrop == 0) == is_clip || is_blend;
                 if include_tile {
                     let el_slice = el_ix / 32u;
                     let el_mask = 1u << (el_ix & 31u);
@@ -383,7 +383,7 @@ fn main(
                         }
                         // DRAWTAG_BEGIN_CLIP
                         case 0x9u: {
-                            if tile.segments == 0u && tile.backdrop == 0 {
+                            if tile.segment_count_or_ix == 0u && tile.backdrop == 0 {
                                 clip_zero_depth = clip_depth + 1u;
                             } else {
                                 write_begin_clip();
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index 79984596f..108c88cba 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -135,14 +135,16 @@ var output: texture_storage_2d<rgba8unorm, write>;
 
 let PIXELS_PER_THREAD = 4u;
 
-fn fill_path(seg_data: u32, n_segs: u32, backdrop: i32, xy: vec2<f32>, even_odd: bool) -> array<f32, PIXELS_PER_THREAD> {
+fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
+    let n_segs = fill.size_and_rule >> 1u;
+    let even_odd = (fill.size_and_rule & 1u) != 0u;
     var area: array<f32, PIXELS_PER_THREAD>;
-    let backdrop_f = f32(backdrop);
+    let backdrop_f = f32(fill.backdrop);
     for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
         area[i] = backdrop_f;
     }
     for (var i = 0u; i < n_segs; i++) {
-        let seg_off = seg_data + i;
+        let seg_off = fill.seg_data + i;
         let segment = segments[seg_off];
         let y = segment.origin.y - xy.y;
         let y0 = clamp(y, 0.0, 1.0);
@@ -218,9 +220,7 @@ fn main(
             // CMD_FILL
             case 1u: {
                 let fill = read_fill(cmd_ix);
-                let n_segs = fill.size_and_rule >> 1u;
-                let even_odd = (fill.size_and_rule & 1u) != 0u;
-                area = fill_path(fill.seg_data, n_segs, fill.backdrop, xy, even_odd);
+                area = fill_path(fill, xy);
                 cmd_ix += 4u;
             }
             // CMD_STROKE
diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl
index b4197bce6..e27bb6ed1 100644
--- a/shader/flatten.wgsl
+++ b/shader/flatten.wgsl
@@ -174,7 +174,7 @@ fn read_i16_point(ix: u32) -> vec2<f32> {
 }
 
 struct Transform {
-    matrx: vec4<f32>,
+    mat: vec4<f32>,
     translate: vec2<f32>,
 }
 
@@ -186,13 +186,13 @@ fn read_transform(transform_base: u32, ix: u32) -> Transform {
     let c3 = bitcast<f32>(scene[base + 3u]);
     let c4 = bitcast<f32>(scene[base + 4u]);
     let c5 = bitcast<f32>(scene[base + 5u]);
-    let matrx = vec4(c0, c1, c2, c3);
+    let mat = vec4(c0, c1, c2, c3);
     let translate = vec2(c4, c5);
-    return Transform(matrx, translate);
+    return Transform(mat, translate);
 }
 
 fn transform_apply(transform: Transform, p: vec2<f32>) -> vec2<f32> {
-    return transform.matrx.xy * p.x + transform.matrx.zw * p.y + transform.translate;
+    return transform.mat.xy * p.x + transform.mat.zw * p.y + transform.translate;
 }
 
 fn round_down(x: f32) -> i32 {
@@ -275,7 +275,7 @@ fn main(
             // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
             // This is the correct bounding box, but we're not handling rendering
             // in the isotropic case, so it may mismatch.
-            stroke = 0.5 * linewidth * vec2(length(transform.matrx.xz), length(transform.matrx.yw));
+            stroke = 0.5 * linewidth * vec2(length(transform.mat.xz), length(transform.mat.yw));
             bbox += vec4(-stroke, stroke);
         }
         let flags = u32(linewidth >= 0.0);
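
As `transform_apply` shows, the renamed `mat` field stores the affine matrix as (a, b, c, d) with columns (a, b) and (c, d), so `mat.xz` and `mat.yw` are its rows. A Rust mirror of the stroke padding computed above, for illustration (the function name is hypothetical): the x extent of a transformed circle of diameter `linewidth` is half the line width times the norm of the first row, and likewise for y.

    fn stroke_bbox_padding(mat: [f32; 4], linewidth: f32) -> (f32, f32) {
        let [a, b, c, d] = mat;
        // mat.xz = (a, c) is the first row; mat.yw = (b, d) the second.
        let pad_x = 0.5 * linewidth * (a * a + c * c).sqrt();
        let pad_y = 0.5 * linewidth * (b * b + d * d).sqrt();
        (pad_x, pad_y)
    }
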
diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl
index 3cf4e4c02..e34ff7a99 100644
--- a/shader/path_count.wgsl
+++ b/shader/path_count.wgsl
@@ -13,8 +13,7 @@ var<uniform> config: Config;
 // TODO: this is cut'n'pasted from path_coarse.
 struct AtomicTile {
     backdrop: atomic<i32>,
-    // This is "segments" but we're renaming
-    count: atomic<u32>,
+    segment_count_or_ix: atomic<u32>,
 }
 
 @group(0) @binding(1)
@@ -58,14 +57,15 @@ fn main(
     let xy1 = select(line.p0, line.p1, is_down);
     let s0 = xy0 * TILE_SCALE;
     let s1 = xy1 * TILE_SCALE;
-    // TODO: clip line to bounding box
     count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
     let line_ix = global_id.x;
 
     let dx = abs(s1.x - s0.x);
     let dy = s1.y - s0.y;
     if dx + dy == 0.0 {
-        // Zero-length segment, drop it. Note, this should be culled earlier.
+        // Zero-length segment, drop it. Note, this could be culled in the
+        // flattening stage, but eliding the test here would be fragile, as
+        // it would be pretty bad to let it slip through.
         return;
     }
     if dy == 0.0 && floor(s0.y) == s0.y {
@@ -176,7 +176,7 @@ fn main(
         let x_bump = max(x + 1, bbox.x);
         atomicAdd(&tile[base + x_bump].backdrop, delta);
     }
-    let seg_within_slice = atomicAdd(&tile[base + x].count, 1u);
+    let seg_within_slice = atomicAdd(&tile[base + x].segment_count_or_ix, 1u);
     // Pack two count values into a single u32
     let counts = (seg_within_slice << 16u) | subix;
     let seg_count = SegmentCount(line_ix, counts);
diff --git a/shader/path_tiling.wgsl b/shader/path_tiling.wgsl
index 444974e6a..ace01ab97 100644
--- a/shader/path_tiling.wgsl
+++ b/shader/path_tiling.wgsl
@@ -52,6 +52,8 @@ fn main(
     let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
     let dx = abs(s1.x - s0.x);
     let dy = s1.y - s0.y;
+    // Division by zero can't happen because zero-length lines
+    // have already been discarded in the path_count stage.
     let idxdy = 1.0 / (dx + dy);
     let a = dx * idxdy;
     let is_positive_slope = s1.x >= s0.x;
@@ -71,7 +73,7 @@ fn main(
     let stride = bbox.z - bbox.x;
     let tile_ix = i32(path.tiles) + (y - bbox.y) * stride + x - bbox.x;
     let tile = tiles[tile_ix];
-    let seg_start = ~tile.segments;
+    let seg_start = ~tile.segment_count_or_ix;
     if i32(seg_start) < 0 {
         return;
     }
diff --git a/shader/shared/tile.wgsl b/shader/shared/tile.wgsl
index 7af8a6256..063a5bc16 100644
--- a/shader/shared/tile.wgsl
+++ b/shader/shared/tile.wgsl
@@ -11,5 +11,10 @@ struct Path {
 
 struct Tile {
     backdrop: i32,
-    segments: u32,
+    // This is used for the count of the number of segments in the
+    // tile up to coarse rasterization, and the index afterwards.
+    // In the latter variant, the bits are inverted so that tiling
+    // can detect whether the tile was allocated; it's best to
+    // consider this an enum packed into a u32.
+    segment_count_or_ix: u32,
 }
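
Both path_count and path_tiling compute `count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u`. The `span` helper is not shown in this patch series; the sketch below is an assumption based on its usage here: it counts the integer-grid cells touched by an interval, so a line crossing nx tile columns and ny tile rows yields nx + ny - 1 tile segments, since each row or column boundary crossing starts a new segment.

    // Number of grid cells touched by the interval [a, b], at least 1.
    fn span(a: f32, b: f32) -> u32 {
        (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32
    }

    // E.g. a line from (0.5, 0.5) to (2.5, 0.5) in tile units touches
    // 3 columns and 1 row: 3 + 1 - 1 = 3 tile segments.
    fn tile_segment_count(s0: [f32; 2], s1: [f32; 2]) -> u32 {
        span(s0[0], s1[0]) + span(s0[1], s1[1]) - 1
    }
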
+///
+/// This is the bicyclic semigroup, a monoid useful for representing
+/// stack depth. There is considerably more detail in the draft paper
+/// [Fast GPU bounding boxes on tree-structured scenes].
+///
+/// [Fast GPU bounding boxes on tree-structured scenes]: https://arxiv.org/abs/2205.11659
 #[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
 #[repr(C)]
 pub struct ClipBic {
+    /// When interpreted as a stack operation, the number of pop operations.
     pub a: u32,
+    /// When interpreted as a stack operation, the number of push operations.
     pub b: u32,
 }
 
@@ -47,6 +55,12 @@ impl ClipBic {
         ClipBic { a, b }
     }
 
+    /// The bicyclic semigroup operation.
+    ///
+    /// This operation is associative. When interpreted as a stack
+    /// operation, it represents doing the pops of `self`, the pushes of
+    /// `self`, the pops of `other`, and the pushes of `other`. The middle
+    /// two can cancel each other out.
     pub fn combine(self, other: ClipBic) -> Self {
         let m = self.b.min(other.a);
         ClipBic::new(self.a + other.a - m, self.b + other.b - m)
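
To make the stack interpretation concrete, a small usage sketch, assuming the `ClipBic` type and methods documented above (the BEGIN_CLIP/END_CLIP association is illustrative):

    fn main() {
        let push = ClipBic::new(0, 1); // e.g. a BEGIN_CLIP element
        let pop = ClipBic::new(1, 0);  // e.g. an END_CLIP element
        // A push followed by a matching pop cancels to the identity...
        let balanced = push.combine(pop);
        assert!(balanced.a == 0 && balanced.b == 0);
        // ...while a pop followed by a push does not cancel: the net
        // effect is one pop, then one push.
        let unbalanced = pop.combine(push);
        assert!(unbalanced.a == 1 && unbalanced.b == 1);
    }
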