
Add CPU shaders #374

Merged · 3 commits · Oct 11, 2023
Changes from 1 commit
6 changes: 4 additions & 2 deletions crates/encoding/src/path.rs
@@ -227,8 +227,10 @@ pub struct Path {
 pub struct Tile {
     /// Accumulated backdrop at the left edge of the tile.
     pub backdrop: i32,
-    /// Index of first path segment.
-    pub segments: u32,
+    /// An enum that can hold either a count or an index to the
+    /// beginning of an allocated slice. In the latter case, the
+    /// bits are inverted.
+    pub segment_count_or_ix: u32,
 }

 /// Encoder for path segments.
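
As an aside, the count-or-inverted-index convention described in the new doc comment can be sketched as follows. This is illustration only, not code from the PR, and it assumes the two cases are told apart by the high bit (an inverted index has it set for any index below 0x8000_0000):

    // Hypothetical helpers, not part of this PR, illustrating the
    // "count or inverted index" convention on Tile::segment_count_or_ix.
    enum SegmentsOrIx {
        /// Still counting segments; no slice allocated yet.
        Count(u32),
        /// Start index of the allocated segment slice.
        AllocatedIx(u32),
    }

    fn pack_allocated_ix(ix: u32) -> u32 {
        !ix // an allocated index is stored with its bits inverted
    }

    fn unpack(segment_count_or_ix: u32) -> SegmentsOrIx {
        // Assumption: inverted indices always have the high bit set,
        // so they cannot be confused with a plain count.
        if segment_count_or_ix & 0x8000_0000 != 0 {
            SegmentsOrIx::AllocatedIx(!segment_count_or_ix)
        } else {
            SegmentsOrIx::Count(segment_count_or_ix)
        }
    }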
1 change: 0 additions & 1 deletion shader/draw_leaf.wgsl
@@ -108,7 +108,6 @@ fn main(
     // let x1 = f32(bbox.x1);
     // let y1 = f32(bbox.y1);
     // let bbox_f = vec4(x0, y0, x1, y1);
-    let fill_mode = u32(bbox.linewidth >= 0.0);
     var transform = Transform();
     var linewidth = bbox.linewidth;
     if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT ||
89 changes: 70 additions & 19 deletions src/cpu_dispatch.rs
@@ -4,10 +4,12 @@
 //! Support for CPU implementations of compute shaders.

 use std::{
-    cell::{RefCell, RefMut},
-    ops::Deref,
+    cell::{Ref, RefCell, RefMut},
+    ops::{Deref, DerefMut},
 };

+use bytemuck::Pod;
+
 #[derive(Clone, Copy)]
 pub enum CpuBinding<'a> {
     Buffer(&'a [u8]),
@@ -16,39 +16,88 @@ pub enum CpuBinding<'a> {
     Texture(&'a CpuTexture),
 }

-pub enum CpuBufGuard<'a> {
-    Slice(&'a [u8]),
-    Interior(RefMut<'a, Vec<u8>>),
+pub enum TypedBufGuard<'a, T: ?Sized> {
+    Slice(&'a T),
+    Interior(Ref<'a, T>),
 }

+pub enum TypedBufGuardMut<'a, T: ?Sized> {
+    Slice(&'a mut T),

[Member] Is this variant ever constructed?

[Contributor Author] I don't think so, I'd have to check.

+    Interior(RefMut<'a, T>),
+}

-impl<'a> Deref for CpuBufGuard<'a> {
-    type Target = [u8];
+impl<'a, T: ?Sized> Deref for TypedBufGuard<'a, T> {
+    type Target = T;

     fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Slice(s) => s,
-            CpuBufGuard::Interior(r) => r,
+            TypedBufGuard::Slice(s) => s,
+            TypedBufGuard::Interior(r) => r,
         }
     }
 }

-impl<'a> CpuBufGuard<'a> {
-    /// Get a mutable reference to the buffer.
-    ///
-    /// Panics if the underlying resource is read-only.
-    pub fn as_mut(&mut self) -> &mut [u8] {
+impl<'a, T: ?Sized> Deref for TypedBufGuardMut<'a, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Interior(r) => &mut *r,
-            _ => panic!("tried to borrow immutable buffer as mutable"),
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
         }
     }
 }

+impl<'a, T: ?Sized> DerefMut for TypedBufGuardMut<'a, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match self {
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
+        }
+    }
+}

 impl<'a> CpuBinding<'a> {
-    pub fn as_buf(&self) -> CpuBufGuard {
+    pub fn as_typed<T: Pod>(&self) -> TypedBufGuard<T> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::from_bytes(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::from_bytes(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_typed_mut<T: Pod>(&self) -> TypedBufGuardMut<T> {
+        match self {
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::from_bytes_mut(buf)
+                }))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice<T: Pod>(&self) -> TypedBufGuard<[T]> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::cast_slice(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::cast_slice(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice_mut<T: Pod>(&self) -> TypedBufGuardMut<[T]> {
         match self {
-            CpuBinding::Buffer(b) => CpuBufGuard::Slice(b),
-            CpuBinding::BufferRW(b) => CpuBufGuard::Interior(b.borrow_mut()),
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::cast_slice_mut(buf)
+                }))
+            }
             _ => panic!("resource type mismatch"),
         }
     }
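
Taken together, these accessors let a CPU shader view raw byte buffers as typed data. A minimal usage sketch, not from the PR, assuming `BufferRW` wraps a `&RefCell<Vec<u8>>` as the `borrow()`/`borrow_mut()` calls above suggest:

    use std::cell::RefCell;

    fn example() {
        // A 16-byte buffer viewed as four u32 words. bytemuck::cast_slice
        // checks alignment at runtime; a freshly allocated Vec<u8> is
        // sufficiently aligned for u32 in practice.
        let storage = RefCell::new(vec![0u8; 16]);
        let binding = CpuBinding::BufferRW(&storage);
        {
            let mut words = binding.as_slice_mut::<u32>();
            words[0] = 42; // mutate through the RefMut-backed guard
        } // guard drops here, releasing the RefCell borrow
        let words = binding.as_slice::<u32>();
        assert_eq!(words[0], 42);
    }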
30 changes: 30 additions & 0 deletions src/cpu_shader/backdrop.rs
@@ -0,0 +1,30 @@
// Copyright 2023 The Vello authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

use vello_encoding::{ConfigUniform, Path, Tile};

use crate::cpu_dispatch::CpuBinding;

fn backdrop_main(config: &ConfigUniform, paths: &[Path], tiles: &mut [Tile]) {
    for drawobj_ix in 0..config.layout.n_draw_objects {
        let path = paths[drawobj_ix as usize];
        let width = path.bbox[2] - path.bbox[0];
        let height = path.bbox[3] - path.bbox[1];
        let base = path.tiles;
        for y in 0..height {
            let mut sum = 0;
            for x in 0..width {
                let tile = &mut tiles[(base + y * width + x) as usize];
                sum += tile.backdrop;
                tile.backdrop = sum;
            }
        }
    }
}

pub fn backdrop(_n_wg: u32, resources: &[CpuBinding]) {
    let config = resources[0].as_typed();
    let paths = resources[1].as_slice();
    let mut tiles = resources[2].as_slice_mut();
    backdrop_main(&config, &paths, &mut tiles);
}
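
The inner loop is a per-row prefix sum: each tile's backdrop becomes the running total of the deltas to its left. A standalone illustration, not from the PR:

    fn main() {
        let mut row = [1i32, 0, -1, 0]; // per-tile backdrop deltas for one row
        let mut sum = 0;
        for backdrop in row.iter_mut() {
            sum += *backdrop;
            *backdrop = sum;
        }
        assert_eq!(row, [1, 1, 0, 0]); // accumulated backdrop per tile
    }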
21 changes: 21 additions & 0 deletions src/cpu_shader/bbox_clear.rs
@@ -0,0 +1,21 @@
// Copyright 2023 The Vello authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

use vello_encoding::{ConfigUniform, PathBbox};

use crate::cpu_dispatch::CpuBinding;

fn bbox_clear_main(config: &ConfigUniform, path_bboxes: &mut [PathBbox]) {
    for i in 0..(config.layout.n_paths as usize) {
        path_bboxes[i].x0 = 0x7fff_ffff;
        path_bboxes[i].y0 = 0x7fff_ffff;
        path_bboxes[i].x1 = -0x8000_0000;
        path_bboxes[i].y1 = -0x8000_0000;
    }
}

pub fn bbox_clear(_n_wg: u32, resources: &[CpuBinding]) {
    let config = resources[0].as_typed();
    let mut path_bboxes = resources[1].as_slice_mut();
    bbox_clear_main(&config, &mut path_bboxes);
}
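
The constants are i32::MAX and i32::MIN, i.e. an "inside out" empty bbox, so later stages can grow it toward the geometry with plain min/max. A small illustration, not from the PR:

    fn main() {
        let (mut x0, mut x1) = (i32::MAX, i32::MIN); // 0x7fff_ffff and -0x8000_0000
        for x in [5, 2, 9] {
            x0 = x0.min(x); // left edge moves toward the geometry
            x1 = x1.max(x); // right edge moves toward the geometry
        }
        assert_eq!((x0, x1), (2, 9));
    }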
127 changes: 127 additions & 0 deletions src/cpu_shader/binning.rs
@@ -0,0 +1,127 @@
// Copyright 2023 The Vello authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, PathBbox};

use crate::cpu_dispatch::CpuBinding;

const WG_SIZE: usize = 256;
const TILE_WIDTH: usize = 16;
const TILE_HEIGHT: usize = 16;
const N_TILE_X: usize = 16;
const N_TILE_Y: usize = 16;
const SX: f32 = 1.0 / ((N_TILE_X * TILE_WIDTH) as f32);
const SY: f32 = 1.0 / ((N_TILE_Y * TILE_HEIGHT) as f32);

fn bbox_intersect(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [
        a[0].max(b[0]),
        a[1].max(b[1]),
        a[2].min(b[2]),
        a[3].min(b[3]),
    ]
}

fn binning_main(
    n_wg: u32,
    config: &ConfigUniform,
    draw_monoids: &[DrawMonoid],
    path_bbox_buf: &[PathBbox],
    clip_bbox_buf: &[[f32; 4]],
    intersected_bbox: &mut [[f32; 4]],
    bump: &mut BumpAllocators,
    bin_data: &mut [u32],
    bin_header: &mut [BinHeader],
) {
    for wg in 0..n_wg as usize {
        let mut counts = [0; WG_SIZE];
        let mut bboxes = [[0, 0, 0, 0]; WG_SIZE];
        let width_in_bins =
            ((config.width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32) as i32;
        let height_in_bins =
            ((config.height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32) as i32;
        for local_ix in 0..WG_SIZE {
            let element_ix = wg * WG_SIZE + local_ix;
            let mut x0 = 0;
            let mut y0 = 0;
            let mut x1 = 0;
            let mut y1 = 0;
            if element_ix < config.layout.n_draw_objects as usize {
                let draw_monoid = draw_monoids[element_ix];
                let mut clip_bbox = [-1e9, -1e9, 1e9, 1e9];
                if draw_monoid.clip_ix > 0 {
                    clip_bbox = clip_bbox_buf[draw_monoid.clip_ix as usize - 1];
[Collaborator] Might as well address the TODO in binning.wgsl and add this assertion here?: assert!(draw_monoid.clip_ix - 1 < config.layout.n_clip).

[Member] Could we also lower the length of the slice to that value?

[Contributor Author] That latter question is a deep one. It is clearly more idiomatic Rust, but I think the goal here is to match the GPU as much as possible, which to me reads as putting in the assert.

                }
                let path_bbox = path_bbox_buf[draw_monoid.path_ix as usize];
                let pb = [
                    path_bbox.x0 as f32,
                    path_bbox.y0 as f32,
                    path_bbox.x1 as f32,
                    path_bbox.y1 as f32,
                ];
                let bbox = bbox_intersect(clip_bbox, pb);
                intersected_bbox[element_ix] = bbox;
                if bbox[0] < bbox[2] && bbox[1] < bbox[3] {
                    x0 = (bbox[0] * SX).floor() as i32;
                    y0 = (bbox[1] * SY).floor() as i32;
                    x1 = (bbox[2] * SX).ceil() as i32;
                    y1 = (bbox[3] * SY).ceil() as i32;
                }
            }
            x0 = x0.clamp(0, width_in_bins);
            y0 = y0.clamp(0, height_in_bins);
            x1 = x1.clamp(0, width_in_bins);
            y1 = y1.clamp(0, height_in_bins);
            for y in y0..y1 {
                for x in x0..x1 {
                    counts[(y * width_in_bins + x) as usize] += 1;
                }
            }
            bboxes[local_ix] = [x0, y0, x1, y1];
        }
        let mut chunk_offset = [0; WG_SIZE];
        for local_ix in 0..WG_SIZE {
            let global_ix = wg * WG_SIZE + local_ix;
            chunk_offset[local_ix] = bump.binning;
            bump.binning += counts[local_ix];
            bin_header[global_ix] = BinHeader {
                element_count: counts[local_ix],
                chunk_offset: chunk_offset[local_ix],
            };
        }
        for local_ix in 0..WG_SIZE {
            let element_ix = wg * WG_SIZE + local_ix;
            let bbox = bboxes[local_ix];
            for y in bbox[1]..bbox[3] {
                for x in bbox[0]..bbox[2] {
                    let bin_ix = (y * width_in_bins + x) as usize;
                    let ix = config.layout.bin_data_start + chunk_offset[bin_ix];
                    bin_data[ix as usize] = element_ix as u32;
                    chunk_offset[bin_ix] += 1;
                }
            }
        }
    }
}

pub fn binning(n_wg: u32, resources: &[CpuBinding]) {
    let config = resources[0].as_typed();
    let draw_monoids = resources[1].as_slice();
    let path_bbox_buf = resources[2].as_slice();
    let clip_bbox_buf = resources[3].as_slice();
    let mut intersected_bbox = resources[4].as_slice_mut();
    let mut bump = resources[5].as_typed_mut();
    let mut bin_data = resources[6].as_slice_mut();
    let mut bin_header = resources[7].as_slice_mut();
    binning_main(
        n_wg,
        &config,
        &draw_monoids,
        &path_bbox_buf,
        &clip_bbox_buf,
        &mut intersected_bbox,
        &mut bump,
        &mut bin_data,
        &mut bin_header,
    );
}
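
Since TILE_WIDTH * N_TILE_X = 256, SX and SY scale pixel coordinates down to 256x256-pixel bins, and the floor/ceil pair expands the range to cover every bin the bbox touches. A worked example, not from the PR:

    fn main() {
        const SX: f32 = 1.0 / 256.0; // same values as in binning.rs
        const SY: f32 = 1.0 / 256.0;
        let bbox = [100.0f32, 200.0, 700.0, 300.0]; // x0, y0, x1, y1 in pixels
        let x0 = (bbox[0] * SX).floor() as i32; // 0
        let y0 = (bbox[1] * SY).floor() as i32; // 0
        let x1 = (bbox[2] * SX).ceil() as i32; // 3: bins 0, 1, 2 in x
        let y1 = (bbox[3] * SY).ceil() as i32; // 2: bin rows 0, 1 in y
        assert_eq!([x0, y0, x1, y1], [0, 0, 3, 2]);
    }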