diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs
index 414ce23e6..b0a52b0b7 100644
--- a/crates/encoding/src/path.rs
+++ b/crates/encoding/src/path.rs
@@ -227,8 +227,11 @@ pub struct Path {
 pub struct Tile {
     /// Accumulated backdrop at the left edge of the tile.
     pub backdrop: i32,
-    /// Index of first path segment.
-    pub segments: u32,
+    /// Either the count of path segments in this tile, or, with all
+    /// bits inverted, an index to the beginning of an allocated slice
+    /// of `PathSegment` objects. The count form is replaced by the
+    /// inverted index once the segment slice has been allocated.
+    pub segment_count_or_ix: u32,
 }
 
 /// Encoder for path segments.
diff --git a/shader/draw_leaf.wgsl b/shader/draw_leaf.wgsl
index 6154b9256..827825974 100644
--- a/shader/draw_leaf.wgsl
+++ b/shader/draw_leaf.wgsl
@@ -108,7 +108,6 @@ fn main(
     // let x1 = f32(bbox.x1);
     // let y1 = f32(bbox.y1);
     // let bbox_f = vec4(x0, y0, x1, y1);
-    let fill_mode = u32(bbox.linewidth >= 0.0);
     var transform = Transform();
     var linewidth = bbox.linewidth;
     if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT ||
diff --git a/src/cpu_dispatch.rs b/src/cpu_dispatch.rs
index 0b8bbc86b..2c3409c16 100644
--- a/src/cpu_dispatch.rs
+++ b/src/cpu_dispatch.rs
@@ -4,10 +4,12 @@
 //! Support for CPU implementations of compute shaders.
 
 use std::{
-    cell::{RefCell, RefMut},
-    ops::Deref,
+    cell::{Ref, RefCell, RefMut},
+    ops::{Deref, DerefMut},
 };
 
+use bytemuck::Pod;
+
 #[derive(Clone, Copy)]
 pub enum CpuBinding<'a> {
     Buffer(&'a [u8]),
@@ -16,39 +18,88 @@ pub enum CpuBinding<'a> {
     Texture(&'a CpuTexture),
 }
 
-pub enum CpuBufGuard<'a> {
-    Slice(&'a [u8]),
-    Interior(RefMut<'a, Vec<u8>>),
+pub enum TypedBufGuard<'a, T: ?Sized> {
+    Slice(&'a T),
+    Interior(Ref<'a, T>),
+}
+
+pub enum TypedBufGuardMut<'a, T: ?Sized> {
+    Slice(&'a mut T),
+    Interior(RefMut<'a, T>),
 }
 
-impl<'a> Deref for CpuBufGuard<'a> {
-    type Target = [u8];
+impl<'a, T: ?Sized> Deref for TypedBufGuard<'a, T> {
+    type Target = T;
 
     fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Slice(s) => s,
-            CpuBufGuard::Interior(r) => r,
+            TypedBufGuard::Slice(s) => s,
+            TypedBufGuard::Interior(r) => r,
         }
     }
 }
 
-impl<'a> CpuBufGuard<'a> {
-    /// Get a mutable reference to the buffer.
-    ///
-    /// Panics if the underlying resource is read-only.
-    pub fn as_mut(&mut self) -> &mut [u8] {
+impl<'a, T: ?Sized> Deref for TypedBufGuardMut<'a, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Interior(r) => &mut *r,
-            _ => panic!("tried to borrow immutable buffer as mutable"),
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
+        }
+    }
+}
+
+impl<'a, T: ?Sized> DerefMut for TypedBufGuardMut<'a, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match self {
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
         }
     }
 }
 
 impl<'a> CpuBinding<'a> {
-    pub fn as_buf(&self) -> CpuBufGuard {
+    pub fn as_typed<T: Pod>(&self) -> TypedBufGuard<T> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::from_bytes(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::from_bytes(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_typed_mut<T: Pod>(&self) -> TypedBufGuardMut<T> {
+        match self {
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::from_bytes_mut(buf)
+                }))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice<T: Pod>(&self) -> TypedBufGuard<[T]> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::cast_slice(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::cast_slice(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice_mut<T: Pod>(&self) -> TypedBufGuardMut<[T]> {
         match self {
-            CpuBinding::Buffer(b) => CpuBufGuard::Slice(b),
-            CpuBinding::BufferRW(b) => CpuBufGuard::Interior(b.borrow_mut()),
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::cast_slice_mut(buf)
+                }))
+            }
             _ => panic!("resource type mismatch"),
         }
     }
 }
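
The typed guards above are the pattern every CPU stage below relies on:
`as_typed` views a binding as a single `Pod` struct, `as_slice` as a slice of
`Pod` elements, and the `_mut` variants require the interior-mutable
`BufferRW` binding. A minimal sketch of a stage entry point in this style
(the stage itself is hypothetical; only the guard API is from this patch):

    use vello_encoding::{ConfigUniform, Tile};

    fn example_stage(_n_wg: u32, resources: &[CpuBinding]) {
        let config = resources[0].as_typed::<ConfigUniform>();
        let mut tiles = resources[1].as_slice_mut::<Tile>();
        // Zero the backdrop of every tile in the viewport, reading the
        // tile count from the config uniform.
        let n = (config.width_in_tiles * config.height_in_tiles) as usize;
        for tile in tiles.iter_mut().take(n) {
            tile.backdrop = 0;
        }
    }
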
diff --git a/src/cpu_shader/backdrop.rs b/src/cpu_shader/backdrop.rs
new file mode 100644
index 000000000..746efdcde
--- /dev/null
+++ b/src/cpu_shader/backdrop.rs
@@ -0,0 +1,30 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, Path, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+fn backdrop_main(config: &ConfigUniform, paths: &[Path], tiles: &mut [Tile]) {
+    for drawobj_ix in 0..config.layout.n_draw_objects {
+        let path = paths[drawobj_ix as usize];
+        let width = path.bbox[2] - path.bbox[0];
+        let height = path.bbox[3] - path.bbox[1];
+        let base = path.tiles;
+        for y in 0..height {
+            let mut sum = 0;
+            for x in 0..width {
+                let tile = &mut tiles[(base + y * width + x) as usize];
+                sum += tile.backdrop;
+                tile.backdrop = sum;
+            }
+        }
+    }
+}
+
+pub fn backdrop(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let paths = resources[1].as_slice();
+    let mut tiles = resources[2].as_slice_mut();
+    backdrop_main(&config, &paths, &mut tiles);
+}
diff --git a/src/cpu_shader/bbox_clear.rs b/src/cpu_shader/bbox_clear.rs
new file mode 100644
index 000000000..1e02127d0
--- /dev/null
+++ b/src/cpu_shader/bbox_clear.rs
@@ -0,0 +1,21 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+fn bbox_clear_main(config: &ConfigUniform, path_bboxes: &mut [PathBbox]) {
+    for i in 
0..(config.layout.n_paths as usize) { + path_bboxes[i].x0 = 0x7fff_ffff; + path_bboxes[i].y0 = 0x7fff_ffff; + path_bboxes[i].x1 = -0x8000_0000; + path_bboxes[i].y1 = -0x8000_0000; + } +} + +pub fn bbox_clear(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let mut path_bboxes = resources[1].as_slice_mut(); + bbox_clear_main(&config, &mut path_bboxes); +} diff --git a/src/cpu_shader/binning.rs b/src/cpu_shader/binning.rs new file mode 100644 index 000000000..5ace850c3 --- /dev/null +++ b/src/cpu_shader/binning.rs @@ -0,0 +1,128 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, PathBbox}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; +const TILE_WIDTH: usize = 16; +const TILE_HEIGHT: usize = 16; +const N_TILE_X: usize = 16; +const N_TILE_Y: usize = 16; +const SX: f32 = 1.0 / ((N_TILE_X * TILE_WIDTH) as f32); +const SY: f32 = 1.0 / ((N_TILE_Y * TILE_HEIGHT) as f32); + +fn bbox_intersect(a: [f32; 4], b: [f32; 4]) -> [f32; 4] { + [ + a[0].max(b[0]), + a[1].max(b[1]), + a[2].min(b[2]), + a[3].min(b[3]), + ] +} + +fn binning_main( + n_wg: u32, + config: &ConfigUniform, + draw_monoids: &[DrawMonoid], + path_bbox_buf: &[PathBbox], + clip_bbox_buf: &[[f32; 4]], + intersected_bbox: &mut [[f32; 4]], + bump: &mut BumpAllocators, + bin_data: &mut [u32], + bin_header: &mut [BinHeader], +) { + for wg in 0..n_wg as usize { + let mut counts = [0; WG_SIZE]; + let mut bboxes = [[0, 0, 0, 0]; WG_SIZE]; + let width_in_bins = + ((config.width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32) as i32; + let height_in_bins = + ((config.height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32) as i32; + for local_ix in 0..WG_SIZE { + let element_ix = wg * WG_SIZE + local_ix; + let mut x0 = 0; + let mut y0 = 0; + let mut x1 = 0; + let mut y1 = 0; + if element_ix < config.layout.n_draw_objects as usize { + let draw_monoid = draw_monoids[element_ix]; + let mut clip_bbox = [-1e9, -1e9, 1e9, 1e9]; + if draw_monoid.clip_ix > 0 { + assert!(draw_monoid.clip_ix - 1 < config.layout.n_clips); + clip_bbox = clip_bbox_buf[draw_monoid.clip_ix as usize - 1]; + } + let path_bbox = path_bbox_buf[draw_monoid.path_ix as usize]; + let pb = [ + path_bbox.x0 as f32, + path_bbox.y0 as f32, + path_bbox.x1 as f32, + path_bbox.y1 as f32, + ]; + let bbox = bbox_intersect(clip_bbox, pb); + intersected_bbox[element_ix] = bbox; + if bbox[0] < bbox[2] && bbox[1] < bbox[3] { + x0 = (bbox[0] * SX).floor() as i32; + y0 = (bbox[1] * SY).floor() as i32; + x1 = (bbox[2] * SX).ceil() as i32; + y1 = (bbox[3] * SY).ceil() as i32; + } + } + x0 = x0.clamp(0, width_in_bins); + y0 = y0.clamp(0, height_in_bins); + x1 = x1.clamp(0, width_in_bins); + y1 = y1.clamp(0, height_in_bins); + for y in y0..y1 { + for x in x0..x1 { + counts[(y * width_in_bins + x) as usize] += 1; + } + } + bboxes[local_ix] = [x0, y0, x1, y1]; + } + let mut chunk_offset = [0; WG_SIZE]; + for local_ix in 0..WG_SIZE { + let global_ix = wg * WG_SIZE + local_ix; + chunk_offset[local_ix] = bump.binning; + bump.binning += counts[local_ix]; + bin_header[global_ix] = BinHeader { + element_count: counts[local_ix], + chunk_offset: chunk_offset[local_ix], + }; + } + for local_ix in 0..WG_SIZE { + let element_ix = wg * WG_SIZE + local_ix; + let bbox = bboxes[local_ix]; + for y in bbox[1]..bbox[3] { + for x in bbox[0]..bbox[2] { + let bin_ix = (y * width_in_bins + x) as usize; + let ix = config.layout.bin_data_start + 
chunk_offset[bin_ix];
+                    bin_data[ix as usize] = element_ix as u32;
+                    chunk_offset[bin_ix] += 1;
+                }
+            }
+        }
+    }
+}
+
+pub fn binning(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let draw_monoids = resources[1].as_slice();
+    let path_bbox_buf = resources[2].as_slice();
+    let clip_bbox_buf = resources[3].as_slice();
+    let mut intersected_bbox = resources[4].as_slice_mut();
+    let mut bump = resources[5].as_typed_mut();
+    let mut bin_data = resources[6].as_slice_mut();
+    let mut bin_header = resources[7].as_slice_mut();
+    binning_main(
+        n_wg,
+        &config,
+        &draw_monoids,
+        &path_bbox_buf,
+        &clip_bbox_buf,
+        &mut intersected_bbox,
+        &mut bump,
+        &mut bin_data,
+        &mut bin_header,
+    );
+}
diff --git a/src/cpu_shader/clip_leaf.rs b/src/cpu_shader/clip_leaf.rs
new file mode 100644
index 000000000..0f5fc6106
--- /dev/null
+++ b/src/cpu_shader/clip_leaf.rs
@@ -0,0 +1,86 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{Clip, ConfigUniform, DrawMonoid, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+struct ClipStackElement {
+    // index of draw object
+    parent_ix: u32,
+    path_ix: u32,
+    bbox: [f32; 4],
+}
+
+const BIG_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9];
+
+// Note: this implementation doesn't rigorously follow the
+// WGSL original. In particular, it just computes the clips
+// sequentially rather than using the partition reductions.
+fn clip_leaf_main(
+    config: &ConfigUniform,
+    clip_inp: &[Clip],
+    path_bboxes: &[PathBbox],
+    draw_monoids: &mut [DrawMonoid],
+    clip_bboxes: &mut [[f32; 4]],
+) {
+    let mut stack: Vec<ClipStackElement> = Vec::new();
+    for global_ix in 0..config.layout.n_clips {
+        let clip_el = clip_inp[global_ix as usize];
+        if clip_el.path_ix >= 0 {
+            // begin clip
+            let path_ix = clip_el.path_ix as u32;
+            let path_bbox = path_bboxes[path_ix as usize];
+            let p_bbox = [
+                path_bbox.x0 as f32,
+                path_bbox.y0 as f32,
+                path_bbox.x1 as f32,
+                path_bbox.y1 as f32,
+            ];
+            let bbox = if let Some(last) = stack.last() {
+                [
+                    p_bbox[0].max(last.bbox[0]),
+                    p_bbox[1].max(last.bbox[1]),
+                    p_bbox[2].min(last.bbox[2]),
+                    p_bbox[3].min(last.bbox[3]),
+                ]
+            } else {
+                p_bbox
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            let parent_ix = clip_el.ix;
+            stack.push(ClipStackElement {
+                parent_ix,
+                path_ix,
+                bbox,
+            });
+        } else {
+            // end clip
+            let tos = stack.pop().unwrap();
+            let bbox = if let Some(nos) = stack.last() {
+                nos.bbox
+            } else {
+                BIG_BBOX
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            draw_monoids[clip_el.ix as usize].path_ix = tos.path_ix;
+            draw_monoids[clip_el.ix as usize].scene_offset =
+                draw_monoids[tos.parent_ix as usize].scene_offset;
+        }
+    }
+}
+
+pub fn clip_leaf(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let clip_inp = resources[1].as_slice();
+    let path_bboxes = resources[2].as_slice();
+    let mut draw_monoids = resources[5].as_slice_mut();
+    let mut clip_bboxes = resources[6].as_slice_mut();
+    clip_leaf_main(
+        &config,
+        &clip_inp,
+        &path_bboxes,
+        &mut draw_monoids,
+        &mut clip_bboxes,
+    );
+}
diff --git a/src/cpu_shader/clip_reduce.rs b/src/cpu_shader/clip_reduce.rs
new file mode 100644
index 000000000..fc30661f1
--- /dev/null
+++ b/src/cpu_shader/clip_reduce.rs
@@ -0,0 +1,56 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{Clip, ClipBic, ClipElement, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn 
clip_reduce_main( + n_wg: u32, + clip_inp: &[Clip], + path_bboxes: &[PathBbox], + reduced: &mut [ClipBic], + clip_out: &mut [ClipElement], +) { + let mut scratch = Vec::with_capacity(WG_SIZE); + for wg_ix in 0..n_wg { + scratch.clear(); + let mut bic_reduced = ClipBic::default(); + // reverse scan + for local_ix in (0..WG_SIZE).rev() { + let global_ix = wg_ix as usize * WG_SIZE + local_ix; + let inp = clip_inp[global_ix].path_ix; + let is_push = inp >= 0; + let bic = ClipBic::new(1 - is_push as u32, is_push as u32); + bic_reduced = bic.combine(bic_reduced); + if is_push && bic_reduced.a == 0 { + scratch.push(global_ix as u32); + } + } + reduced[wg_ix as usize] = bic_reduced; + for (i, parent_ix) in scratch.iter().rev().enumerate() { + let mut clip_el = ClipElement::default(); + clip_el.parent_ix = *parent_ix; + let path_ix = clip_inp[*parent_ix as usize].path_ix; + let path_bbox = path_bboxes[path_ix as usize]; + clip_el.bbox = [ + path_bbox.x0 as f32, + path_bbox.y0 as f32, + path_bbox.x1 as f32, + path_bbox.y1 as f32, + ]; + let global_ix = wg_ix as usize * WG_SIZE + i; + clip_out[global_ix] = clip_el; + } + } +} + +pub fn clip_reduce(n_wg: u32, resources: &[CpuBinding]) { + let clip_inp = resources[0].as_slice(); + let path_bboxes = resources[1].as_slice(); + let mut reduced = resources[2].as_slice_mut(); + let mut clip_out = resources[3].as_slice_mut(); + clip_reduce_main(n_wg, &clip_inp, &path_bboxes, &mut reduced, &mut clip_out); +} diff --git a/src/cpu_shader/coarse.rs b/src/cpu_shader/coarse.rs new file mode 100644 index 000000000..390df7f74 --- /dev/null +++ b/src/cpu_shader/coarse.rs @@ -0,0 +1,344 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, DrawTag, Path, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::{ + CMD_BEGIN_CLIP, CMD_COLOR, CMD_END, CMD_END_CLIP, CMD_FILL, CMD_IMAGE, CMD_JUMP, CMD_LIN_GRAD, + CMD_RAD_GRAD, CMD_SOLID, PTCL_INITIAL_ALLOC, +}; + +const N_TILE_X: usize = 16; +const N_TILE_Y: usize = 16; +const N_TILE: usize = N_TILE_X * N_TILE_Y; + +const PTCL_INCREMENT: u32 = 256; +const PTCL_HEADROOM: u32 = 2; + +// Modeled in the WGSL as private-scoped variables +struct TileState { + cmd_offset: u32, + cmd_limit: u32, +} + +impl TileState { + fn new(tile_ix: u32) -> TileState { + let cmd_offset = tile_ix * PTCL_INITIAL_ALLOC; + let cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM); + TileState { + cmd_offset, + cmd_limit, + } + } + + fn alloc_cmd( + &mut self, + size: u32, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ) { + if self.cmd_offset + size >= self.cmd_limit { + let ptcl_dyn_start = + config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC; + let chunk_size = PTCL_INCREMENT.max(size + PTCL_HEADROOM); + let new_cmd = ptcl_dyn_start + bump.ptcl; + bump.ptcl += chunk_size; + ptcl[self.cmd_offset as usize] = CMD_JUMP; + ptcl[self.cmd_offset as usize + 1] = new_cmd; + self.cmd_offset = new_cmd; + self.cmd_limit = new_cmd + (PTCL_INCREMENT - PTCL_HEADROOM); + } + } + + fn write(&mut self, ptcl: &mut [u32], offset: u32, value: u32) { + ptcl[(self.cmd_offset + offset) as usize] = value; + } + + fn write_path( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + tile: &mut Tile, + ) { + let n_segs = tile.segment_count_or_ix; + if n_segs != 0 { + let seg_ix = bump.segments; + tile.segment_count_or_ix = !seg_ix; + bump.segments += n_segs; 
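+            // At this point `segment_count_or_ix` switches meaning: path_count
+            // left a segment count here, and storing `!seg_ix` replaces it with
+            // the inverted start index of the slice just reserved from the bump
+            // allocator (e.g. 3 segments starting at index 17 encode as `!17`).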
+ self.alloc_cmd(4, config, bump, ptcl); + self.write(ptcl, 0, CMD_FILL); + let even_odd = false; // TODO + let size_and_rule = (n_segs << 1) | (even_odd as u32); + self.write(ptcl, 1, size_and_rule); + self.write(ptcl, 2, seg_ix); + self.write(ptcl, 3, tile.backdrop as u32); + self.cmd_offset += 4; + } else { + self.alloc_cmd(1, config, bump, ptcl); + self.write(ptcl, 0, CMD_SOLID); + self.cmd_offset += 1; + } + } + + fn write_color( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + rgba_color: u32, + ) { + self.alloc_cmd(2, config, bump, ptcl); + self.write(ptcl, 0, CMD_COLOR); + self.write(ptcl, 1, rgba_color); + self.cmd_offset += 2; + } + + fn write_image( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + info_offset: u32, + ) { + self.alloc_cmd(2, config, bump, ptcl); + self.write(ptcl, 0, CMD_IMAGE); + self.write(ptcl, 1, info_offset); + self.cmd_offset += 2; + } + + fn write_grad( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ty: u32, + index: u32, + info_offset: u32, + ) { + self.alloc_cmd(3, config, bump, ptcl); + self.write(ptcl, 0, ty); + self.write(ptcl, 1, index); + self.write(ptcl, 2, info_offset); + self.cmd_offset += 3; + } + + fn write_begin_clip( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + ) { + self.alloc_cmd(1, config, bump, ptcl); + self.write(ptcl, 0, CMD_BEGIN_CLIP); + self.cmd_offset += 1; + } + + fn write_end_clip( + &mut self, + config: &ConfigUniform, + bump: &mut BumpAllocators, + ptcl: &mut [u32], + blend: u32, + alpha: f32, + ) { + self.alloc_cmd(3, config, bump, ptcl); + self.write(ptcl, 0, CMD_END_CLIP); + self.write(ptcl, 1, blend); + self.write(ptcl, 2, f32::to_bits(alpha)); + self.cmd_offset += 3; + } +} + +fn coarse_main( + config: &ConfigUniform, + scene: &[u32], + draw_monoids: &[DrawMonoid], + bin_headers: &[BinHeader], + info_bin_data: &[u32], + paths: &[Path], + tiles: &mut [Tile], + bump: &mut BumpAllocators, + ptcl: &mut [u32], +) { + let width_in_tiles = config.width_in_tiles; + let height_in_tiles = config.height_in_tiles; + let width_in_bins = (width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32; + let height_in_bins = (height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32; + let n_bins = width_in_bins * height_in_bins; + let bin_data_start = config.layout.bin_data_start; + let drawtag_base = config.layout.draw_tag_base; + let mut compacted = vec![vec![]; N_TILE]; + let n_partitions = (config.layout.n_draw_objects + N_TILE as u32 - 1) / N_TILE as u32; + for bin in 0..n_bins { + for v in &mut compacted { + v.clear(); + } + let bin_x = bin % width_in_bins; + let bin_y = bin / width_in_bins; + let bin_tile_x = N_TILE_X as u32 * bin_x; + let bin_tile_y = N_TILE_Y as u32 * bin_y; + for part in 0..n_partitions { + let in_ix = part * N_TILE as u32 + bin; + let bin_header = bin_headers[in_ix as usize]; + let start = bin_data_start + bin_header.chunk_offset; + for i in 0..bin_header.element_count { + let drawobj_ix = info_bin_data[(start + i) as usize]; + let tag = scene[(drawtag_base + drawobj_ix) as usize]; + if DrawTag(tag) != DrawTag::NOP { + let draw_monoid = draw_monoids[drawobj_ix as usize]; + let path_ix = draw_monoid.path_ix; + let path = paths[path_ix as usize]; + let dx = path.bbox[0] as i32 - bin_tile_x as i32; + let dy = path.bbox[1] as i32 - bin_tile_y as i32; + let x0 = dx.clamp(0, N_TILE_X as i32); + let y0 = dy.clamp(0, N_TILE_Y as i32); + let x1 = (path.bbox[2] 
as i32 - bin_tile_x as i32).clamp(0, N_TILE_X as i32); + let y1 = (path.bbox[3] as i32 - bin_tile_y as i32).clamp(0, N_TILE_Y as i32); + for y in y0..y1 { + for x in x0..x1 { + compacted[(y * N_TILE_X as i32 + x) as usize].push(drawobj_ix); + } + } + } + } + } + // compacted now has the list of draw objects for each tile. + // While the WGSL source does at most 256 draw objects at a time, + // this version does all the draw objects in a tile. + for tile_ix in 0..N_TILE { + let tile_x = (tile_ix % N_TILE_X) as u32; + let tile_y = (tile_ix / N_TILE_X) as u32; + let this_tile_ix = (bin_tile_y + tile_y) * width_in_tiles + bin_tile_x + tile_x; + let mut tile_state = TileState::new(this_tile_ix); + let blend_offset = tile_state.cmd_offset; + tile_state.cmd_offset += 1; + let mut clip_depth = 0; + let mut clip_zero_depth = 0; + for drawobj_ix in &compacted[tile_ix] { + let drawtag = scene[(drawtag_base + drawobj_ix) as usize]; + if clip_zero_depth == 0 { + let draw_monoid = draw_monoids[*drawobj_ix as usize]; + let path_ix = draw_monoid.path_ix; + let path = paths[path_ix as usize]; + let bbox = path.bbox; + let stride = bbox[2] - bbox[0]; + let x = bin_tile_x + tile_x - bbox[0]; + let y = bin_tile_y + tile_y - bbox[1]; + let tile = &mut tiles[(path.tiles + y * stride + x) as usize]; + let is_clip = (drawtag & 1) != 0; + let mut is_blend = false; + let dd = config.layout.draw_data_base + draw_monoid.scene_offset; + let di = draw_monoid.info_offset; + if is_clip { + const BLEND_CLIP: u32 = (128 << 8) | 3; + let blend = scene[dd as usize]; + is_blend = blend != BLEND_CLIP; + } + let n_segs = tile.segment_count_or_ix; + let include_tile = n_segs != 0 || (tile.backdrop == 0) == is_clip || is_blend; + if include_tile { + // TODO: get drawinfo (linewidth for fills) + match DrawTag(drawtag) { + DrawTag::COLOR => { + tile_state.write_path(config, bump, ptcl, tile); + let rgba_color = scene[dd as usize]; + tile_state.write_color(config, bump, ptcl, rgba_color); + } + DrawTag::IMAGE => { + tile_state.write_path(config, bump, ptcl, tile); + tile_state.write_image(config, bump, ptcl, di + 1); + } + DrawTag::LINEAR_GRADIENT => { + tile_state.write_path(config, bump, ptcl, tile); + let index = scene[dd as usize]; + tile_state.write_grad( + config, + bump, + ptcl, + CMD_LIN_GRAD, + index, + di + 1, + ); + } + DrawTag::RADIAL_GRADIENT => { + tile_state.write_path(config, bump, ptcl, tile); + let index = scene[dd as usize]; + tile_state.write_grad( + config, + bump, + ptcl, + CMD_RAD_GRAD, + index, + di + 1, + ); + } + DrawTag::BEGIN_CLIP => { + if tile.segment_count_or_ix == 0 && tile.backdrop == 0 { + clip_zero_depth = clip_depth + 1; + } else { + tile_state.write_begin_clip(config, bump, ptcl); + // TODO: update blend depth + } + clip_depth += 1; + } + DrawTag::END_CLIP => { + clip_depth -= 1; + tile_state.write_path(config, bump, ptcl, tile); + let blend = scene[dd as usize]; + let alpha = f32::from_bits(scene[dd as usize + 1]); + tile_state.write_end_clip(config, bump, ptcl, blend, alpha); + } + _ => todo!(), + } + } + } else { + // In "clip zero" state, suppress all drawing + match DrawTag(drawtag) { + DrawTag::BEGIN_CLIP => clip_depth += 1, + DrawTag::END_CLIP => { + if clip_depth == clip_zero_depth { + clip_zero_depth = 0; + } + clip_depth -= 1; + } + _ => (), + } + } + } + + if bin_tile_x + tile_x < width_in_tiles && bin_tile_y + tile_y < height_in_tiles { + ptcl[tile_state.cmd_offset as usize] = CMD_END; + let scratch_size = 0; // TODO: actually compute blend depth + ptcl[blend_offset as usize] = 
bump.blend; + bump.blend += scratch_size; + } + } + } +} + +pub fn coarse(_n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let draw_monoids = resources[2].as_slice(); + let bin_headers = resources[3].as_slice(); + let info_bin_data = resources[4].as_slice(); + let paths = resources[5].as_slice(); + let mut tiles = resources[6].as_slice_mut(); + let mut bump = resources[7].as_typed_mut(); + let mut ptcl = resources[8].as_slice_mut(); + coarse_main( + &config, + &scene, + &draw_monoids, + &bin_headers, + &info_bin_data, + &paths, + &mut tiles, + &mut bump, + &mut ptcl, + ); +} diff --git a/src/cpu_shader/draw_leaf.rs b/src/cpu_shader/draw_leaf.rs new file mode 100644 index 000000000..0aa779e5c --- /dev/null +++ b/src/cpu_shader/draw_leaf.rs @@ -0,0 +1,168 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{Clip, ConfigUniform, DrawMonoid, DrawTag, Monoid, PathBbox}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{read_draw_tag_from_scene, Transform, Vec2}; + +const WG_SIZE: usize = 256; + +fn draw_leaf_main( + n_wg: u32, + config: &ConfigUniform, + scene: &[u32], + reduced: &[DrawMonoid], + path_bbox: &[PathBbox], + draw_monoid: &mut [DrawMonoid], + info: &mut [u32], + clip_inp: &mut [Clip], +) { + let mut prefix = DrawMonoid::default(); + for i in 0..n_wg { + let mut m = prefix; + for j in 0..WG_SIZE { + let ix = i * WG_SIZE as u32 + j as u32; + let tag_raw = read_draw_tag_from_scene(config, scene, ix); + let tag_word = DrawTag(tag_raw); + // store exclusive prefix sum + if ix < config.layout.n_draw_objects { + draw_monoid[ix as usize] = m; + } + let m_next = m.combine(&DrawMonoid::new(tag_word)); + let dd = config.layout.draw_data_base + m.scene_offset; + let di = m.info_offset as usize; + if tag_word == DrawTag::COLOR + || tag_word == DrawTag::LINEAR_GRADIENT + || tag_word == DrawTag::RADIAL_GRADIENT + || tag_word == DrawTag::IMAGE + || tag_word == DrawTag::BEGIN_CLIP + { + let bbox = path_bbox[m.path_ix as usize]; + let transform = Transform::read(config.layout.transform_base, bbox.trans_ix, scene); + let linewidth = bbox.linewidth; + match tag_word { + DrawTag::COLOR => { + info[di] = f32::to_bits(linewidth); + } + DrawTag::LINEAR_GRADIENT => { + info[di] = f32::to_bits(linewidth); + let p0 = Vec2::new( + f32::from_bits(scene[dd as usize + 1]), + f32::from_bits(scene[dd as usize + 2]), + ); + let p1 = Vec2::new( + f32::from_bits(scene[dd as usize + 3]), + f32::from_bits(scene[dd as usize + 4]), + ); + let p0 = transform.apply(p0); + let p1 = transform.apply(p1); + let dxy = p1 - p0; + let scale = 1.0 / dxy.dot(dxy); + let line_xy = dxy * scale; + let line_c = -p0.dot(line_xy); + info[di + 1] = f32::to_bits(line_xy.x); + info[di + 2] = f32::to_bits(line_xy.y); + info[di + 3] = f32::to_bits(line_c); + } + DrawTag::RADIAL_GRADIENT => { + info[di] = f32::to_bits(linewidth); + let p0 = Vec2::new( + f32::from_bits(scene[dd as usize + 1]), + f32::from_bits(scene[dd as usize + 2]), + ); + let p1 = Vec2::new( + f32::from_bits(scene[dd as usize + 3]), + f32::from_bits(scene[dd as usize + 4]), + ); + let r0 = f32::from_bits(scene[dd as usize + 5]); + let r1 = f32::from_bits(scene[dd as usize + 6]); + let z = transform.0; + let inv_det = (z[0] * z[3] - z[1] * z[2]).recip(); + let inv_mat = [ + z[3] * inv_det, + -z[1] * inv_det, + -z[2] * inv_det, + z[0] * inv_det, + ]; + let inv_tr = [ + -(inv_mat[0] * z[4] + inv_mat[2] * z[5]) - p0.x, + 
-(inv_mat[1] * z[4] + inv_mat[3] * z[5]) - p0.y,
+                        ];
+                        let center1 = p1 - p0;
+                        let rr = r1 / (r1 - r0);
+                        let ra_inv = rr / (r1 * r1 - center1.dot(center1));
+                        let c1 = center1 * ra_inv;
+                        let ra = rr * ra_inv;
+                        let roff = rr - 1.0;
+                        info[di + 1] = f32::to_bits(inv_mat[0]);
+                        info[di + 2] = f32::to_bits(inv_mat[1]);
+                        info[di + 3] = f32::to_bits(inv_mat[2]);
+                        info[di + 4] = f32::to_bits(inv_mat[3]);
+                        info[di + 5] = f32::to_bits(inv_tr[0]);
+                        info[di + 6] = f32::to_bits(inv_tr[1]);
+                        info[di + 7] = f32::to_bits(c1.x);
+                        info[di + 8] = f32::to_bits(c1.y);
+                        info[di + 9] = f32::to_bits(ra);
+                        info[di + 10] = f32::to_bits(roff);
+                    }
+                    DrawTag::IMAGE => {
+                        info[di] = f32::to_bits(linewidth);
+                        let z = transform.0;
+                        let inv_det = (z[0] * z[3] - z[1] * z[2]).recip();
+                        let inv_mat = [
+                            z[3] * inv_det,
+                            -z[1] * inv_det,
+                            -z[2] * inv_det,
+                            z[0] * inv_det,
+                        ];
+                        let inv_tr = [
+                            -(inv_mat[0] * z[4] + inv_mat[2] * z[5]),
+                            -(inv_mat[1] * z[4] + inv_mat[3] * z[5]),
+                        ];
+                        info[di + 1] = f32::to_bits(inv_mat[0]);
+                        info[di + 2] = f32::to_bits(inv_mat[1]);
+                        info[di + 3] = f32::to_bits(inv_mat[2]);
+                        info[di + 4] = f32::to_bits(inv_mat[3]);
+                        info[di + 5] = f32::to_bits(inv_tr[0]);
+                        info[di + 6] = f32::to_bits(inv_tr[1]);
+                        info[di + 7] = scene[dd as usize];
+                        info[di + 8] = scene[dd as usize + 1];
+                    }
+                    DrawTag::BEGIN_CLIP => (),
+                    _ => todo!("unhandled draw tag {:x}", tag_word.0),
+                }
+            }
+            if tag_word == DrawTag::BEGIN_CLIP {
+                let path_ix = m.path_ix as i32;
+                clip_inp[m.clip_ix as usize] = Clip { ix, path_ix };
+            } else if tag_word == DrawTag::END_CLIP {
+                let path_ix = !ix as i32;
+                clip_inp[m.clip_ix as usize] = Clip { ix, path_ix };
+            }
+            m = m_next;
+        }
+        prefix = prefix.combine(&reduced[i as usize]);
+    }
+}
+
+pub fn draw_leaf(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let reduced = resources[2].as_slice();
+    let path_bbox = resources[3].as_slice();
+    let mut draw_monoid = resources[4].as_slice_mut();
+    let mut info = resources[5].as_slice_mut();
+    let mut clip_inp = resources[6].as_slice_mut();
+    draw_leaf_main(
+        n_wg,
+        &config,
+        &scene,
+        &reduced,
+        &path_bbox,
+        &mut draw_monoid,
+        &mut info,
+        &mut clip_inp,
+    );
+}
diff --git a/src/cpu_shader/draw_reduce.rs b/src/cpu_shader/draw_reduce.rs
new file mode 100644
index 000000000..61c338c71
--- /dev/null
+++ b/src/cpu_shader/draw_reduce.rs
@@ -0,0 +1,29 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, DrawMonoid, DrawTag, Monoid};
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::read_draw_tag_from_scene;
+
+const WG_SIZE: usize = 256;
+
+fn draw_reduce_main(n_wg: u32, config: &ConfigUniform, scene: &[u32], reduced: &mut [DrawMonoid]) {
+    for i in 0..n_wg {
+        let mut m = DrawMonoid::default();
+        for j in 0..WG_SIZE {
+            let ix = i * WG_SIZE as u32 + j as u32;
+            let tag = read_draw_tag_from_scene(config, scene, ix);
+            m = m.combine(&DrawMonoid::new(DrawTag(tag)));
+        }
+        reduced[i as usize] = m;
+    }
+}
+
+pub fn draw_reduce(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    draw_reduce_main(n_wg, &config, &scene, &mut reduced);
+}
diff --git a/src/cpu_shader/fine.rs b/src/cpu_shader/fine.rs
new file mode 100644
index 000000000..c64c87627
--- /dev/null
+++ b/src/cpu_shader/fine.rs
@@ -0,0 +1,188 @@
+// Copyright 2023 The Vello authors
+// 
SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{ConfigUniform, PathSegment, Tile}; + +use crate::cpu_dispatch::CpuTexture; + +use super::{CMD_COLOR, CMD_END, CMD_FILL, CMD_JUMP, CMD_SOLID, PTCL_INITIAL_ALLOC}; + +// These should also move into a common area +const TILE_WIDTH: usize = 16; +const TILE_HEIGHT: usize = 16; +const TILE_SIZE: usize = TILE_WIDTH * TILE_HEIGHT; + +fn read_color(ptcl: &[u32], offset: u32) -> u32 { + ptcl[(offset + 1) as usize] +} + +struct CmdFill { + size_and_rule: u32, + seg_data: u32, + backdrop: i32, +} + +fn read_fill(ptcl: &[u32], offset: u32) -> CmdFill { + let size_and_rule = ptcl[(offset + 1) as usize]; + let seg_data = ptcl[(offset + 2) as usize]; + let backdrop = ptcl[(offset + 3) as usize] as i32; + CmdFill { + size_and_rule, + seg_data, + backdrop, + } +} + +fn unpack4x8unorm(x: u32) -> [f32; 4] { + let mut result = [0.0; 4]; + for i in 0..4 { + result[i] = ((x >> (i * 8)) & 0xff) as f32 * (1.0 / 255.0); + } + result +} + +fn pack4x8unorm(x: [f32; 4]) -> u32 { + let mut result = 0; + for i in 0..4 { + let byte = (x[i].clamp(0.0, 1.0) * 255.0).round() as u32; + result |= byte << (i * 8); + } + result +} + +fn fill_path(area: &mut [f32], segments: &[PathSegment], fill: &CmdFill, x_tile: f32, y_tile: f32) { + let n_segs = fill.size_and_rule >> 1; + let even_odd = (fill.size_and_rule & 1) != 0; + let backdrop_f = fill.backdrop as f32; + for a in area.iter_mut() { + *a = backdrop_f; + } + for segment in &segments[fill.seg_data as usize..][..n_segs as usize] { + for yi in 0..TILE_HEIGHT { + let y = segment.origin[1] - (y_tile + yi as f32); + let y0 = y.clamp(0.0, 1.0); + let y1 = (y + segment.delta[1]).clamp(0.0, 1.0); + let dy = y0 - y1; + let y_edge = segment.delta[0].signum() + * (y_tile + yi as f32 - segment.y_edge + 1.0).clamp(0.0, 1.0); + if dy != 0.0 { + let vec_y_recip = segment.delta[1].recip(); + let t0 = (y0 - y) * vec_y_recip; + let t1 = (y1 - y) * vec_y_recip; + let startx = segment.origin[0] - x_tile; + let x0 = startx + t0 * segment.delta[0]; + let x1 = startx + t1 * segment.delta[0]; + let xmin0 = x0.min(x1); + let xmax0 = x0.max(x1); + for i in 0..TILE_WIDTH { + let i_f = i as f32; + let xmin = (xmin0 - i_f).min(1.0) - 1.0e-6; + let xmax = xmax0 - i_f; + let b = xmax.min(1.0); + let c = b.max(0.0); + let d = xmin.max(0.0); + let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); + area[yi * TILE_WIDTH + i] += y_edge + a * dy; + } + } else if y_edge != 0.0 { + for i in 0..TILE_WIDTH { + area[yi * TILE_WIDTH + i] += y_edge; + } + } + } + } + if even_odd { + for a in area.iter_mut() { + { + *a = (*a - 2.0 * (0.5 * *a).round()).abs(); + } + } + } else { + for a in area.iter_mut() { + { + *a = a.abs().min(1.0); + } + } + } +} + +// Note: this is a draft. Texture resources are not yet wired up, so it +// has not yet been tested. +#[allow(unused)] +fn fine_main( + config: &ConfigUniform, + tiles: &[Tile], + segments: &[PathSegment], + output: &mut CpuTexture, + ptcl: &[u32], + info: &[u32], + // TODO: image texture resources + // TODO: masks? 
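+    // Compositing below is in premultiplied alpha: each CMD_COLOR source is
+    // scaled by the per-pixel coverage accumulated in `area`, then blended
+    // with the over operator, dst = dst * (1 - src_alpha) + src.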
+) {
+    let width_in_tiles = config.width_in_tiles;
+    let height_in_tiles = config.height_in_tiles;
+    let n_tiles = width_in_tiles * height_in_tiles;
+    let mut area = vec![0.0f32; TILE_SIZE];
+    let mut rgba = vec![[0.0f32; 4]; TILE_SIZE];
+    for tile_ix in 0..n_tiles {
+        for x in &mut rgba {
+            *x = [0.0; 4];
+        }
+        for a in &mut area {
+            *a = 0.0;
+        }
+        let tile_x = tile_ix % width_in_tiles;
+        let tile_y = tile_ix / width_in_tiles;
+        let mut cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
+        // skip over blend stack allocation
+        cmd_ix += 1;
+        loop {
+            let tag = ptcl[cmd_ix as usize];
+            if tag == CMD_END {
+                break;
+            }
+            match tag {
+                CMD_FILL => {
+                    let fill = read_fill(ptcl, cmd_ix);
+                    // x0 and y0 will go away when we do tile-relative coords
+                    let x0 = (tile_x as usize * TILE_WIDTH) as f32;
+                    let y0 = (tile_y as usize * TILE_HEIGHT) as f32;
+                    fill_path(&mut area, segments, &fill, x0, y0);
+                    cmd_ix += 4;
+                }
+                CMD_SOLID => {
+                    for a in &mut area {
+                        *a = 1.0;
+                    }
+                    // CMD_SOLID is a single-word command
+                    cmd_ix += 1;
+                }
+                CMD_COLOR => {
+                    let color = read_color(ptcl, cmd_ix);
+                    let fg = unpack4x8unorm(color);
+                    let fg = [fg[3], fg[2], fg[1], fg[0]];
+                    for i in 0..TILE_SIZE {
+                        let ai = area[i];
+                        let fg_i = [fg[0] * ai, fg[1] * ai, fg[2] * ai, fg[3] * ai];
+                        for j in 0..4 {
+                            rgba[i][j] = rgba[i][j] * (1.0 - fg_i[3]) + fg_i[j];
+                        }
+                    }
+                    cmd_ix += 2;
+                }
+                CMD_JUMP => {
+                    cmd_ix = ptcl[(cmd_ix + 1) as usize];
+                }
+                _ => todo!("unhandled ptcl command {tag}"),
+            }
+        }
+        // Write tile (in rgba)
+        for y in 0..TILE_HEIGHT {
+            let base =
+                output.width * (tile_y as usize * TILE_HEIGHT + y) + tile_x as usize * TILE_WIDTH;
+            for x in 0..TILE_WIDTH {
+                let rgba32 = pack4x8unorm(rgba[y * TILE_WIDTH + x]);
+                output.pixels[base + x] = rgba32;
+            }
+        }
+    }
+}
diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs
new file mode 100644
index 000000000..2cdf7256b
--- /dev/null
+++ b/src/cpu_shader/flatten.rs
@@ -0,0 +1,299 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::{Transform, Vec2};
+use vello_encoding::{BumpAllocators, ConfigUniform, LineSoup, Monoid, PathBbox, PathMonoid};
+
+fn to_minus_one_quarter(x: f32) -> f32 {
+    // could also be written x.powf(-0.25)
+    x.sqrt().sqrt().recip()
+}
+
+const D: f32 = 0.67;
+fn approx_parabola_integral(x: f32) -> f32 {
+    x * to_minus_one_quarter(1.0 - D + (D * D * D * D + 0.25 * x * x))
+}
+
+const B: f32 = 0.39;
+fn approx_parabola_inv_integral(x: f32) -> f32 {
+    x * (1.0 - B + (B * B + 0.5 * x * x)).sqrt()
+}
+
+#[derive(Clone, Copy, Default)]
+struct SubdivResult {
+    val: f32,
+    a0: f32,
+    a2: f32,
+}
+
+fn estimate_subdiv(p0: Vec2, p1: Vec2, p2: Vec2, sqrt_tol: f32) -> SubdivResult {
+    let d01 = p1 - p0;
+    let d12 = p2 - p1;
+    let dd = d01 - d12;
+    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    let cross_inv = if cross.abs() < 1.0e-9 {
+        1.0e9
+    } else {
+        cross.recip()
+    };
+    let x0 = d01.dot(dd) * cross_inv;
+    let x2 = d12.dot(dd) * cross_inv;
+    let scale = (cross / (dd.length() * (x2 - x0))).abs();
+    let a0 = approx_parabola_integral(x0);
+    let a2 = approx_parabola_integral(x2);
+    let mut val = 0.0;
+    if scale < 1e9 {
+        let da = (a2 - a0).abs();
+        let sqrt_scale = scale.sqrt();
+        if x0.signum() == x2.signum() {
+            val = sqrt_scale;
+        } else {
+            let xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol / approx_parabola_integral(xmin);
+        }
+        val *= da;
+    }
+    SubdivResult { val, a0, a2 }
+}
+
+fn eval_quad(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 {
+    let mt = 1.0 - t;
+    p0 * 
(mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t +} + +fn eval_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> Vec2 { + let mt = 1.0 - t; + p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t +} + +const MAX_QUADS: u32 = 16; + +struct Cubic { + p0: Vec2, + p1: Vec2, + p2: Vec2, + p3: Vec2, + path_ix: u32, +} + +fn flatten_cubic(cubic: Cubic, line_ix: &mut usize, lines: &mut [LineSoup]) { + let p0 = cubic.p0; + let p1 = cubic.p1; + let p2 = cubic.p2; + let p3 = cubic.p3; + let err_v = (p2 - p1) * 3.0 + p0 - p3; + let err = err_v.dot(err_v); + const ACCURACY: f32 = 0.25; + const Q_ACCURACY: f32 = ACCURACY * 0.1; + const REM_ACCURACY: f32 = ACCURACY - Q_ACCURACY; + const MAX_HYPOT2: f32 = 432.0 * Q_ACCURACY * Q_ACCURACY; + let mut n_quads = ((err * (1.0 / MAX_HYPOT2)).powf(1.0 / 6.0).ceil() as u32).max(1); + n_quads = n_quads.min(MAX_QUADS); + let mut keep_params = [SubdivResult::default(); MAX_QUADS as usize]; + let mut val = 0.0; + let mut qp0 = p0; + let step = (n_quads as f32).recip(); + for i in 0..n_quads { + let t = (i + 1) as f32 * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; + let params = estimate_subdiv(qp0, qp1, qp2, REM_ACCURACY.sqrt()); + keep_params[i as usize] = params; + val += params.val; + qp0 = qp2; + } + let n = ((val * (0.5 / REM_ACCURACY.sqrt())).ceil() as u32).max(1); + let mut lp0 = p0; + qp0 = p0; + let v_step = val / (n as f32); + let mut n_out = 1; + let mut val_sum = 0.0; + for i in 0..n_quads { + let t = (i + 1) as f32 * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; + let params = keep_params[i as usize]; + let u0 = approx_parabola_inv_integral(params.a0); + let u2 = approx_parabola_inv_integral(params.a2); + let uscale = (u2 - u0).recip(); + let mut val_target = (n_out as f32) * v_step; + while n_out == n || val_target < val_sum + params.val { + let lp1 = if n_out == n { + p3 + } else { + let u = (val_target - val_sum) / params.val; + let a = params.a0 + (params.a2 - params.a0) * u; + let au = approx_parabola_inv_integral(a); + let t = (au - u0) * uscale; + eval_quad(qp0, qp1, qp2, t) + }; + let ls = LineSoup { + path_ix: cubic.path_ix, + _padding: Default::default(), + p0: lp0.to_array(), + p1: lp1.to_array(), + }; + lines[*line_ix] = ls; + *line_ix += 1; + n_out += 1; + val_target += v_step; + lp0 = lp1; + } + val_sum += params.val; + qp0 = qp2; + } +} + +fn read_f32_point(ix: u32, pathdata: &[u32]) -> Vec2 { + let x = f32::from_bits(pathdata[ix as usize]); + let y = f32::from_bits(pathdata[ix as usize + 1]); + Vec2 { x, y } +} + +struct IntBbox { + x0: i32, + y0: i32, + x1: i32, + y1: i32, +} + +impl Default for IntBbox { + fn default() -> Self { + IntBbox { + x0: 0x7fff_ffff, + y0: 0x7fff_ffff, + x1: -0x8000_0000, + y1: -0x8000_0000, + } + } +} + +impl IntBbox { + fn add_pt(&mut self, pt: Vec2) { + self.x0 = self.x0.min(pt.x.floor() as i32); + self.y0 = self.y0.min(pt.y.floor() as i32); + self.x1 = self.x1.max(pt.x.ceil() as i32); + self.y1 = self.y1.max(pt.y.ceil() as i32); + } +} + +// TODO: we're skipping i16 point reading as it's not present in our scenes + +const WG_SIZE: usize = 256; + +const PATH_TAG_SEG_TYPE: u8 = 3; +const PATH_TAG_PATH: u8 = 0x10; +const PATH_TAG_LINETO: u8 = 1; +const PATH_TAG_QUADTO: u8 = 2; +const PATH_TAG_CUBICTO: u8 = 3; +const PATH_TAG_F32: u8 = 8; + +fn flatten_main( + n_wg: u32, + config: 
&ConfigUniform, + scene: &[u32], + tag_monoids: &[PathMonoid], + path_bboxes: &mut [PathBbox], + bump: &mut BumpAllocators, + lines: &mut [LineSoup], +) { + let mut line_ix = 0; + let mut bbox = IntBbox::default(); + for ix in 0..n_wg as usize * WG_SIZE { + let tag_word = scene[config.layout.path_tag_base as usize + (ix >> 2)]; + let shift = (ix & 3) * 8; + let mut tm = PathMonoid::new(tag_word & ((1 << shift) - 1)); + let tag_byte = (tag_word >> shift) as u8; + if tag_byte != 0 { + tm = tag_monoids[ix >> 2].combine(&tm); + } + let linewidth = + f32::from_bits(scene[(config.layout.linewidth_base + tm.linewidth_ix) as usize]); + if (tag_byte & PATH_TAG_PATH) != 0 { + let out = &mut path_bboxes[tm.path_ix as usize]; + out.linewidth = linewidth; + out.trans_ix = tm.trans_ix; + } + let seg_type = tag_byte & PATH_TAG_SEG_TYPE; + let pathdata = &scene[config.layout.path_data_base as usize..]; + if seg_type != 0 { + let mut p0; + let mut p1; + let mut p2 = Vec2::default(); + let mut p3 = Vec2::default(); + if (tag_byte & PATH_TAG_F32) != 0 { + p0 = read_f32_point(tm.pathseg_offset, pathdata); + p1 = read_f32_point(tm.pathseg_offset + 2, pathdata); + if seg_type >= PATH_TAG_QUADTO { + p2 = read_f32_point(tm.pathseg_offset + 4, pathdata); + if seg_type == PATH_TAG_CUBICTO { + p3 = read_f32_point(tm.pathseg_offset + 6, pathdata); + } + } + } else { + todo!("i16 path data not supported yet"); + } + let transform = Transform::read(config.layout.transform_base, tm.trans_ix, scene); + p0 = transform.apply(p0); + bbox.add_pt(p0); + p1 = transform.apply(p1); + bbox.add_pt(p1); + if seg_type == PATH_TAG_LINETO { + p3 = p1; + p2 = p3.mix(p0, 1.0 / 3.0); + p1 = p0.mix(p3, 1.0 / 3.0); + } else if seg_type >= PATH_TAG_QUADTO { + p2 = transform.apply(p2); + bbox.add_pt(p2); + if seg_type == PATH_TAG_CUBICTO { + p3 = transform.apply(p3); + bbox.add_pt(p3); + } else { + p3 = p2; + p2 = p1.mix(p2, 1.0 / 3.0); + p1 = p1.mix(p0, 1.0 / 3.0); + } + } + let path_ix = tm.path_ix; + let cubic = Cubic { + p0, + p1, + p2, + p3, + path_ix, + }; + flatten_cubic(cubic, &mut line_ix, lines); + } + if (tag_byte & PATH_TAG_PATH) != 0 { + let out = &mut path_bboxes[tm.path_ix as usize]; + out.x0 = bbox.x0; + out.y0 = bbox.y0; + out.x1 = bbox.x1; + out.y1 = bbox.y1; + bbox = IntBbox::default(); + } + } + bump.lines = line_ix as u32; +} + +pub fn flatten(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let tag_monoids = resources[2].as_slice(); + let mut path_bboxes = resources[3].as_slice_mut(); + let mut bump = resources[4].as_typed_mut(); + let mut lines = resources[5].as_slice_mut(); + flatten_main( + n_wg, + &config, + &scene, + &tag_monoids, + &mut path_bboxes, + &mut bump, + &mut lines, + ); +} diff --git a/src/cpu_shader/mod.rs b/src/cpu_shader/mod.rs index fed341c75..16d261f65 100644 --- a/src/cpu_shader/mod.rs +++ b/src/cpu_shader/mod.rs @@ -1,8 +1,61 @@ // Copyright 2023 The Vello authors -// SPDX-License-Identifier: Apache-2.0 OR MIT +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense //! CPU implementations of shader stages. 
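+//
+// Each stage mirrors its WGSL entry point: it takes the workgroup count and
+// the stage's resources, bound in the same order as the GPU bindings. A
+// sketch of driving one stage by hand (buffer sizes and contents here are
+// illustrative only):
+//
+//     let config_buf: Vec<u8> = todo!("bytes of a ConfigUniform");
+//     let scene_buf: Vec<u8> = todo!("encoded scene data");
+//     let reduced = std::cell::RefCell::new(vec![0u8; 16 * 256]);
+//     let resources = [
+//         crate::cpu_dispatch::CpuBinding::Buffer(&config_buf),
+//         crate::cpu_dispatch::CpuBinding::Buffer(&scene_buf),
+//         crate::cpu_dispatch::CpuBinding::BufferRW(&reduced),
+//     ];
+//     pathtag_reduce(n_wg, &resources);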
+// Allow un-idiomatic Rust to more closely match shaders +#![allow(clippy::needless_range_loop)] +#![allow(clippy::too_many_arguments)] + +mod backdrop; +mod bbox_clear; +mod binning; +mod clip_leaf; +mod clip_reduce; +mod coarse; +mod draw_leaf; +mod draw_reduce; +mod fine; +mod flatten; +mod path_count; +mod path_count_setup; +mod path_tiling; +mod path_tiling_setup; mod pathtag_reduce; +mod pathtag_scan; +mod tile_alloc; +mod util; +pub use backdrop::backdrop; +pub use bbox_clear::bbox_clear; +pub use binning::binning; +pub use clip_leaf::clip_leaf; +pub use clip_reduce::clip_reduce; +pub use coarse::coarse; +pub use draw_leaf::draw_leaf; +pub use draw_reduce::draw_reduce; +pub use flatten::flatten; +pub use path_count::path_count; +pub use path_count_setup::path_count_setup; +pub use path_tiling::path_tiling; +pub use path_tiling_setup::path_tiling_setup; pub use pathtag_reduce::pathtag_reduce; +pub use pathtag_scan::pathtag_scan; +pub use tile_alloc::tile_alloc; + +// Common definitions + +const PTCL_INITIAL_ALLOC: u32 = 64; + +// Tags for PTCL commands +const CMD_END: u32 = 0; +const CMD_FILL: u32 = 1; +//const CMD_STROKE: u32 = 2; +const CMD_SOLID: u32 = 3; +const CMD_COLOR: u32 = 5; +const CMD_LIN_GRAD: u32 = 6; +const CMD_RAD_GRAD: u32 = 7; +const CMD_IMAGE: u32 = 8; +const CMD_BEGIN_CLIP: u32 = 9; +const CMD_END_CLIP: u32 = 10; +const CMD_JUMP: u32 = 11; diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs new file mode 100644 index 000000000..2cee5b815 --- /dev/null +++ b/src/cpu_shader/path_count.rs @@ -0,0 +1,157 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BumpAllocators, LineSoup, Path, SegmentCount, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{span, Vec2}; + +const TILE_SCALE: f32 = 1.0 / 16.0; + +fn path_count_main( + bump: &mut BumpAllocators, + lines: &[LineSoup], + paths: &[Path], + tile: &mut [Tile], + seg_counts: &mut [SegmentCount], +) { + for line_ix in 0..bump.lines { + let line = lines[line_ix as usize]; + let p0 = Vec2::from_array(line.p0); + let p1 = Vec2::from_array(line.p1); + let is_down = p1.y >= p0.y; + let (xy0, xy1) = if is_down { (p0, p1) } else { (p1, p0) }; + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1; + + let dx = (s1.x - s0.x).abs(); + let dy = s1.y - s0.y; + if dx + dy == 0.0 { + continue; + } + if dy == 0.0 && s0.y.floor() == s0.y { + continue; + } + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = if is_positive_slope { 1.0 } else { -1.0 }; + let xt0 = (s0.x * sign).floor(); + let c = s0.x * sign - xt0; + let y0 = s0.y.floor(); + let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; + + let path = paths[line.path_ix as usize]; + let bbox = path.bbox; + let bbox = [ + bbox[0] as i32, + bbox[1] as i32, + bbox[2] as i32, + bbox[3] as i32, + ]; + let xmin = s0.x.min(s1.x); + let stride = bbox[2] - bbox[0]; + if s0.y >= bbox[3] as f32 || s1.y < bbox[1] as f32 || xmin >= bbox[2] as f32 || stride == 0 + { + continue; + } + // Clip line to bounding box. Clipping is done in "i" space. 
+ let mut imin = 0; + if s0.y < bbox[1] as f32 { + let mut iminf = ((bbox[1] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0; + if y0 + iminf - (a * iminf + b).floor() < bbox[1] as f32 { + iminf += 1.0; + } + imin = iminf as u32; + } + let mut imax = count; + if s1.y > bbox[3] as f32 { + let mut imaxf = ((bbox[3] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0; + if y0 + imaxf - (a * imaxf + b).floor() < bbox[3] as f32 { + imaxf += 1.0; + } + imax = imaxf as u32; + } + let delta = if is_down { -1 } else { 1 }; + let mut ymin = 0; + let mut ymax = 0; + if s0.x.max(s1.x) < bbox[0] as f32 { + ymin = s0.y.ceil() as i32; + ymax = s1.y.ceil() as i32; + imax = imin; + } else { + let fudge = if is_positive_slope { 0.0 } else { 1.0 }; + if xmin < bbox[0] as f32 { + let mut f = ((sign * (bbox[0] as f32 - x0) - b + fudge) / a).round(); + if (x0 + sign * (a * f + b).floor() < bbox[0] as f32) == is_positive_slope { + f += 1.0; + } + let ynext = (y0 + f - (a * f + b).floor() + 1.0) as i32; + if is_positive_slope { + if f as u32 > imin { + ymin = (y0 + if y0 == s0.y { 0.0 } else { 1.0 }) as i32; + ymax = ynext; + imin = f as u32; + } + } else if (f as u32) < imax { + ymin = ynext; + ymax = s1.y.ceil() as i32; + imax = f as u32; + } + } + if s0.x.max(s1.x) > bbox[2] as f32 { + let mut f = ((sign * (bbox[2] as f32 - x0) - b + fudge) / a).round(); + if (x0 + sign * (a * f + b).floor() < bbox[2] as f32) == is_positive_slope { + f += 1.0; + } + if is_positive_slope { + imax = imax.min(f as u32); + } else { + imin = imin.max(f as u32); + } + } + } + imax = imin.max(imax); + ymin = ymin.max(bbox[1]); + ymax = ymax.min(bbox[3]); + for y in ymin..ymax { + let base = path.tiles as i32 + (y - bbox[1]) * stride; + tile[base as usize].backdrop += delta; + } + let mut last_z = (a * (imin as f32 - 1.0) + b).floor(); + let seg_base = bump.seg_counts; + bump.seg_counts += imax - imin; + for i in imin..imax { + let zf = a * i as f32 + b; + let z = zf.floor(); + let y = (y0 + i as f32 - z) as i32; + let x = (x0 + sign * z) as i32; + let base = path.tiles as i32 + (y - bbox[1]) * stride - bbox[0]; + let top_edge = if i == 0 { y0 == s0.y } else { last_z == z }; + if top_edge && x + 1 < bbox[2] { + let x_bump = (x + 1).max(bbox[0]); + tile[(base + x_bump) as usize].backdrop += delta; + } + // .segments is another name for the .count field; it's overloaded + let seg_within_slice = tile[(base + x) as usize].segment_count_or_ix; + tile[(base + x) as usize].segment_count_or_ix += 1; + let counts = (seg_within_slice << 16) | i; + let seg_count = SegmentCount { line_ix, counts }; + seg_counts[(seg_base + i - imin) as usize] = seg_count; + last_z = z; + } + } +} + +pub fn path_count(_n_wg: u32, resources: &[CpuBinding]) { + let mut bump = resources[1].as_typed_mut(); + let lines = resources[2].as_slice(); + let paths = resources[3].as_slice(); + let mut tile = resources[4].as_slice_mut(); + let mut seg_counts = resources[5].as_slice_mut(); + path_count_main(&mut bump, &lines, &paths, &mut tile, &mut seg_counts); +} diff --git a/src/cpu_shader/path_count_setup.rs b/src/cpu_shader/path_count_setup.rs new file mode 100644 index 000000000..6336cfd47 --- /dev/null +++ b/src/cpu_shader/path_count_setup.rs @@ -0,0 +1,21 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BumpAllocators, IndirectCount}; + +use crate::cpu_dispatch::CpuBinding; + +const WG_SIZE: usize = 256; + +fn path_count_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { + let lines = 
bump.lines; + indirect.count_x = (lines + (WG_SIZE as u32 - 1)) / WG_SIZE as u32; + indirect.count_y = 1; + indirect.count_z = 1; +} + +pub fn path_count_setup(_n_wg: u32, resources: &[CpuBinding]) { + let bump = resources[0].as_typed(); + let mut indirect = resources[1].as_typed_mut(); + path_count_setup_main(&bump, &mut indirect); +} diff --git a/src/cpu_shader/path_tiling.rs b/src/cpu_shader/path_tiling.rs new file mode 100644 index 000000000..41549bb54 --- /dev/null +++ b/src/cpu_shader/path_tiling.rs @@ -0,0 +1,152 @@ +// Copyright 2023 The Vello authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{BumpAllocators, LineSoup, Path, PathSegment, SegmentCount, Tile}; + +use crate::cpu_dispatch::CpuBinding; + +use super::util::{span, Vec2}; + +const TILE_WIDTH: u32 = 16; +const TILE_HEIGHT: u32 = 16; +const TILE_SCALE: f32 = 1.0 / 16.0; + +fn path_tiling_main( + bump: &mut BumpAllocators, + seg_counts: &[SegmentCount], + lines: &[LineSoup], + paths: &[Path], + tiles: &[Tile], + segments: &mut [PathSegment], +) { + for seg_ix in 0..bump.seg_counts { + let seg_count = seg_counts[seg_ix as usize]; + let line = lines[seg_count.line_ix as usize]; + let counts = seg_count.counts; + let seg_within_slice = counts >> 16; + let seg_within_line = counts & 0xffff; + + // coarse rasterization logic + let p0 = Vec2::from_array(line.p0); + let p1 = Vec2::from_array(line.p1); + let is_down = p1.y >= p0.y; + let (mut xy0, mut xy1) = if is_down { (p0, p1) } else { (p1, p0) }; + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1; + + let dx = (s1.x - s0.x).abs(); + let dy = s1.y - s0.y; + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = if is_positive_slope { 1.0 } else { -1.0 }; + let xt0 = (s0.x * sign).floor(); + let c = s0.x * sign - xt0; + let y0 = s0.y.floor(); + let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 }; + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 }; + let z = (a * seg_within_line as f32 + b).floor(); + let x = x0 as i32 + (sign * z) as i32; + let y = (y0 + seg_within_line as f32 - z) as i32; + + let path = paths[line.path_ix as usize]; + let bbox = path.bbox; + let bbox = [ + bbox[0] as i32, + bbox[1] as i32, + bbox[2] as i32, + bbox[3] as i32, + ]; + let stride = bbox[2] - bbox[0]; + let tile_ix = path.tiles as i32 + (y - bbox[1]) * stride + x - bbox[0]; + let tile = tiles[tile_ix as usize]; + let seg_start = !tile.segment_count_or_ix; + if (seg_start as i32) < 0 { + continue; + } + let tile_xy = Vec2::new(x as f32 * TILE_WIDTH as f32, y as f32 * TILE_HEIGHT as f32); + let tile_xy1 = tile_xy + Vec2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32); + + if seg_within_line > 0 { + let z_prev = (a * (seg_within_line as f32 - 1.0) + b).floor(); + if z == z_prev { + // Top edge is clipped + let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y); + xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x); + xy0 = Vec2::new(xt, tile_xy.y); + } else { + // If is_positive_slope, left edge is clipped, otherwise right + let x_clip = if is_positive_slope { + tile_xy.x + } else { + tile_xy1.x + }; + let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y); + xy0 = Vec2::new(x_clip, yt); + } + } + if seg_within_line < count - 1 { + let z_next = (a * (seg_within_line as f32 + 1.0) + b).floor(); + if z == z_next 
{
+                // Bottom edge is clipped
+                let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y);
+                xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x);
+                xy1 = Vec2::new(xt, tile_xy1.y);
+            } else {
+                // If is_positive_slope, right edge is clipped, otherwise left
+                let x_clip = if is_positive_slope {
+                    tile_xy1.x
+                } else {
+                    tile_xy.x
+                };
+                let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y);
+                xy1 = Vec2::new(x_clip, yt);
+            }
+        }
+        if !is_down {
+            (xy0, xy1) = (xy1, xy0);
+        }
+        // TODO: figure out what to do if both xy0 and xy1 are at the left edge
+        // Also TODO (part of move to 8 byte encoding for segments): don't store y_edge at all,
+        // resolve this in fine.
+        let y_edge = if xy0.x == tile_xy.x {
+            xy0.y
+        } else if xy1.x == tile_xy.x {
+            xy1.y
+        } else {
+            1e9
+        };
+        let segment = PathSegment {
+            origin: xy0.to_array(),
+            delta: (xy1 - xy0).to_array(),
+            y_edge,
+            _padding: Default::default(),
+        };
+        assert!(xy0.x >= tile_xy.x && xy0.x <= tile_xy1.x);
+        assert!(xy0.y >= tile_xy.y && xy0.y <= tile_xy1.y);
+        assert!(xy1.x >= tile_xy.x && xy1.x <= tile_xy1.x);
+        assert!(xy1.y >= tile_xy.y && xy1.y <= tile_xy1.y);
+        segments[(seg_start + seg_within_slice) as usize] = segment;
+    }
+}
+
+pub fn path_tiling(_n_wg: u32, resources: &[CpuBinding]) {
+    let mut bump = resources[0].as_typed_mut();
+    let seg_counts = resources[1].as_slice();
+    let lines = resources[2].as_slice();
+    let paths = resources[3].as_slice();
+    let tiles = resources[4].as_slice();
+    let mut segments = resources[5].as_slice_mut();
+    path_tiling_main(
+        &mut bump,
+        &seg_counts,
+        &lines,
+        &paths,
+        &tiles,
+        &mut segments,
+    );
+}
diff --git a/src/cpu_shader/path_tiling_setup.rs b/src/cpu_shader/path_tiling_setup.rs
new file mode 100644
index 000000000..32e08f9ae
--- /dev/null
+++ b/src/cpu_shader/path_tiling_setup.rs
@@ -0,0 +1,21 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BumpAllocators, IndirectCount};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) {
+    let segments = bump.seg_counts;
+    indirect.count_x = (segments + (WG_SIZE as u32 - 1)) / WG_SIZE as u32;
+    indirect.count_y = 1;
+    indirect.count_z = 1;
+}
+
+pub fn path_tiling_setup(_n_wg: u32, resources: &[CpuBinding]) {
+    let bump = resources[0].as_typed();
+    let mut indirect = resources[1].as_typed_mut();
+    path_tiling_setup_main(&bump, &mut indirect);
+}
diff --git a/src/cpu_shader/pathtag_reduce.rs b/src/cpu_shader/pathtag_reduce.rs
index 38ee55c18..58eb36c17 100644
--- a/src/cpu_shader/pathtag_reduce.rs
+++ b/src/cpu_shader/pathtag_reduce.rs
@@ -1,5 +1,5 @@
 // Copyright 2023 The Vello authors
-// SPDX-License-Identifier: Apache-2.0 OR MIT
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
 use vello_encoding::{ConfigUniform, Monoid, PathMonoid};
 
@@ -25,11 +25,8 @@ fn pathtag_reduce_main(
 }
 
 pub fn pathtag_reduce(n_wg: u32, resources: &[CpuBinding]) {
-    let r0 = resources[0].as_buf();
-    let r1 = resources[1].as_buf();
-    let mut r2 = resources[2].as_buf();
-    let config = bytemuck::from_bytes(&r0);
-    let scene = bytemuck::cast_slice(&r1);
-    let reduced = bytemuck::cast_slice_mut(r2.as_mut());
-    pathtag_reduce_main(n_wg, config, scene, reduced);
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    pathtag_reduce_main(n_wg, &config, 
+        &scene, &mut reduced);
 }
diff --git a/src/cpu_shader/pathtag_scan.rs b/src/cpu_shader/pathtag_scan.rs
new file mode 100644
index 000000000..8a8aa609a
--- /dev/null
+++ b/src/cpu_shader/pathtag_scan.rs
@@ -0,0 +1,37 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{ConfigUniform, Monoid, PathMonoid};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn pathtag_scan_main(
+    n_wg: u32,
+    config: &ConfigUniform,
+    scene: &[u32],
+    reduced: &[PathMonoid],
+    tag_monoids: &mut [PathMonoid],
+) {
+    let pathtag_base = config.layout.path_tag_base;
+    let mut prefix = PathMonoid::default();
+    for i in 0..n_wg {
+        let mut m = prefix;
+        for j in 0..WG_SIZE {
+            let ix = (i * WG_SIZE as u32) as usize + j;
+            tag_monoids[ix] = m;
+            let tag = scene[pathtag_base as usize + ix];
+            m = m.combine(&PathMonoid::new(tag));
+        }
+        prefix = prefix.combine(&reduced[i as usize]);
+    }
+}
+
+pub fn pathtag_scan(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let reduced = resources[2].as_slice();
+    let mut tag_monoids = resources[3].as_slice_mut();
+    pathtag_scan_main(n_wg, &config, &scene, &reduced, &mut tag_monoids);
+}
diff --git a/src/cpu_shader/tile_alloc.rs b/src/cpu_shader/tile_alloc.rs
new file mode 100644
index 000000000..367f28df2
--- /dev/null
+++ b/src/cpu_shader/tile_alloc.rs
@@ -0,0 +1,72 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{BumpAllocators, ConfigUniform, DrawTag, Path, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const TILE_WIDTH: usize = 16;
+const TILE_HEIGHT: usize = 16;
+const SX: f32 = 1.0 / (TILE_WIDTH as f32);
+const SY: f32 = 1.0 / (TILE_HEIGHT as f32);
+
+fn tile_alloc_main(
+    config: &ConfigUniform,
+    scene: &[u32],
+    draw_bboxes: &[[f32; 4]],
+    bump: &mut BumpAllocators,
+    paths: &mut [Path],
+    tiles: &mut [Tile],
+) {
+    let drawtag_base = config.layout.draw_tag_base;
+    let width_in_tiles = config.width_in_tiles as i32;
+    let height_in_tiles = config.height_in_tiles as i32;
+    for drawobj_ix in 0..config.layout.n_draw_objects {
+        let drawtag = DrawTag(scene[(drawtag_base + drawobj_ix) as usize]);
+        let mut x0 = 0;
+        let mut y0 = 0;
+        let mut x1 = 0;
+        let mut y1 = 0;
+        if drawtag != DrawTag::NOP && drawtag != DrawTag::END_CLIP {
+            let bbox = draw_bboxes[drawobj_ix as usize];
+            if bbox[0] < bbox[2] && bbox[1] < bbox[3] {
+                x0 = (bbox[0] * SX).floor() as i32;
+                y0 = (bbox[1] * SY).floor() as i32;
+                x1 = (bbox[2] * SX).ceil() as i32;
+                y1 = (bbox[3] * SY).ceil() as i32;
+            }
+        }
+        let ux0 = x0.clamp(0, width_in_tiles) as u32;
+        let uy0 = y0.clamp(0, height_in_tiles) as u32;
+        let ux1 = x1.clamp(0, width_in_tiles) as u32;
+        let uy1 = y1.clamp(0, height_in_tiles) as u32;
+        let tile_count = (ux1 - ux0) * (uy1 - uy0);
+        let offset = bump.tile;
+        bump.tile += tile_count;
+        // We construct it this way because padding is private.
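+        // (`Path::default()` zeroes the private padding field; the public
+        // `bbox` and `tiles` fields are then filled in individually below.)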
+        let mut path = Path::default();
+        path.bbox = [ux0, uy0, ux1, uy1];
+        path.tiles = offset;
+        paths[drawobj_ix as usize] = path;
+        for i in 0..tile_count {
+            tiles[(offset + i) as usize] = Tile::default();
+        }
+    }
+}
+
+pub fn tile_alloc(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let draw_bboxes = resources[2].as_slice();
+    let mut bump = resources[3].as_typed_mut();
+    let mut paths = resources[4].as_slice_mut();
+    let mut tiles = resources[5].as_slice_mut();
+    tile_alloc_main(
+        &config,
+        &scene,
+        &draw_bboxes,
+        &mut bump,
+        &mut paths,
+        &mut tiles,
+    );
+}
diff --git a/src/cpu_shader/util.rs b/src/cpu_shader/util.rs
new file mode 100644
index 000000000..2bb3279aa
--- /dev/null
+++ b/src/cpu_shader/util.rs
@@ -0,0 +1,113 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Utility types
+
+use vello_encoding::ConfigUniform;
+
+#[derive(Clone, Copy, Default, Debug)]
+#[repr(C)]
+pub struct Vec2 {
+    pub x: f32,
+    pub y: f32,
+}
+
+impl std::ops::Add for Vec2 {
+    type Output = Vec2;
+
+    fn add(self, rhs: Self) -> Self {
+        Vec2 {
+            x: self.x + rhs.x,
+            y: self.y + rhs.y,
+        }
+    }
+}
+
+impl std::ops::Sub for Vec2 {
+    type Output = Vec2;
+
+    fn sub(self, rhs: Self) -> Self {
+        Vec2 {
+            x: self.x - rhs.x,
+            y: self.y - rhs.y,
+        }
+    }
+}
+
+// Scalar multiplication; `Mul` must name `f32` as the right-hand side type,
+// since the default `Rhs = Self` would require a `Vec2` operand.
+impl std::ops::Mul<f32> for Vec2 {
+    type Output = Vec2;
+
+    fn mul(self, rhs: f32) -> Self {
+        Vec2 {
+            x: self.x * rhs,
+            y: self.y * rhs,
+        }
+    }
+}
+
+impl Vec2 {
+    pub fn new(x: f32, y: f32) -> Self {
+        Vec2 { x, y }
+    }
+
+    pub fn dot(self, other: Vec2) -> f32 {
+        self.x * other.x + self.y * other.y
+    }
+
+    pub fn length(self) -> f32 {
+        self.x.hypot(self.y)
+    }
+
+    pub fn to_array(self) -> [f32; 2] {
+        [self.x, self.y]
+    }
+
+    pub fn from_array(a: [f32; 2]) -> Self {
+        Vec2 { x: a[0], y: a[1] }
+    }
+
+    pub fn mix(self, other: Vec2, t: f32) -> Self {
+        let x = self.x + (other.x - self.x) * t;
+        let y = self.y + (other.y - self.y) * t;
+        Vec2 { x, y }
+    }
+}
+
+pub struct Transform(pub [f32; 6]);
+
+impl Transform {
+    pub fn apply(&self, p: Vec2) -> Vec2 {
+        let z = self.0;
+        let x = z[0] * p.x + z[2] * p.y + z[4];
+        let y = z[1] * p.x + z[3] * p.y + z[5];
+        Vec2 { x, y }
+    }
+
+    pub fn read(transform_base: u32, ix: u32, data: &[u32]) -> Transform {
+        let mut z = [0.0; 6];
+        let base = (transform_base + ix * 6) as usize;
+        for i in 0..6 {
+            z[i] = f32::from_bits(data[base + i]);
+        }
+        Transform(z)
+    }
+}
+
+/// Number of unit cells spanned by the interval from `a` to `b`, at least 1.
+pub fn span(a: f32, b: f32) -> u32 {
+    (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32
+}
+
+const DRAWTAG_NOP: u32 = 0;
+
+/// Read draw tag, guarded by number of draw objects.
+///
+/// The `ix` argument is allowed to exceed the number of draw objects,
+/// in which case a NOP is returned.
+pub fn read_draw_tag_from_scene(config: &ConfigUniform, scene: &[u32], ix: u32) -> u32 {
+    if ix < config.layout.n_draw_objects {
+        let tag_ix = config.layout.draw_tag_base + ix;
+        scene[tag_ix as usize]
+    } else {
+        DRAWTAG_NOP
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 55147acd8..28e5bf7ff 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -72,6 +72,8 @@ pub struct Renderer {
     profiler: GpuProfiler,
     #[cfg(feature = "wgpu-profiler")]
     pub profile_result: Option<Vec<wgpu_profiler::GpuTimerScopeResult>>,
+    #[cfg(feature = "hot_reload")]
+    use_cpu: bool,
 }
 
 /// Parameters used in a single render that are configurable by the client.
@@ -101,7 +103,10 @@ impl Renderer {
     /// Creates a new renderer for the specified device.
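     ///
     /// A minimal usage sketch (here `device` and a fully populated
     /// `options: RendererOptions` are assumed to already be in hand):
     ///
     /// ```ignore
     /// let mut renderer = Renderer::new(&device, &options)?;
     /// ```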
     pub fn new(device: &Device, render_options: &RendererOptions) -> Result<Self> {
         let mut engine = WgpuEngine::new();
-        let shaders = shaders::full_shaders(device, &mut engine, render_options.use_cpu)?;
+        let mut shaders = shaders::full_shaders(device, &mut engine)?;
+        if render_options.use_cpu {
+            shaders.install_cpu_shaders(&mut engine);
+        }
         let blit = render_options
             .surface_format
             .map(|surface_format| BlitPipeline::new(device, surface_format));
@@ -115,6 +120,8 @@ impl Renderer {
             profiler: GpuProfiler::new(3, render_options.timestamp_period, device.features()),
             #[cfg(feature = "wgpu-profiler")]
             profile_result: None,
+            #[cfg(feature = "hot_reload")]
+            use_cpu: render_options.use_cpu,
         })
     }
 
@@ -220,7 +227,10 @@ impl Renderer {
     pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> {
         device.push_error_scope(wgpu::ErrorFilter::Validation);
         let mut engine = WgpuEngine::new();
-        let shaders = shaders::full_shaders(device, &mut engine, false)?;
+        let mut shaders = shaders::full_shaders(device, &mut engine)?;
+        if self.use_cpu {
+            shaders.install_cpu_shaders(&mut engine);
+        }
         let error = device.pop_error_scope().await;
         if let Some(error) = error {
             return Err(error.into());
diff --git a/src/render.rs b/src/render.rs
index 462563633..268007faa 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -139,7 +139,8 @@ impl Render {
         );
         let mut pathtag_parent = reduced_buf;
         let mut large_pathtag_bufs = None;
-        if wg_counts.use_large_path_scan {
+        let use_large_path_scan = wg_counts.use_large_path_scan && !shaders.pathtag_is_cpu;
+        if use_large_path_scan {
             let reduced2_buf = ResourceProxy::new_buf(
                 buffer_sizes.path_reduced2.size_in_bytes().into(),
                 "reduced2_buf",
@@ -166,7 +167,7 @@ impl Render {
             buffer_sizes.path_monoids.size_in_bytes().into(),
             "tagmonoid_buf",
         );
-        let pathtag_scan = if wg_counts.use_large_path_scan {
+        let pathtag_scan = if use_large_path_scan {
             shaders.pathtag_scan_large
         } else {
             shaders.pathtag_scan
diff --git a/src/shaders.rs b/src/shaders.rs
index 23a3950f3..86e6ed7bd 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -79,14 +79,13 @@ pub struct FullShaders {
     pub path_tiling_setup: ShaderId,
     pub path_tiling: ShaderId,
     pub fine: ShaderId,
+    // 2-level dispatch works for CPU pathtag scan even for large
+    // inputs; 3-level is not yet implemented.
+    pub pathtag_is_cpu: bool,
 }
 
 #[cfg(feature = "wgpu")]
-pub fn full_shaders(
-    device: &Device,
-    engine: &mut WgpuEngine,
-    use_cpu: bool,
-) -> Result<FullShaders> {
+pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShaders> {
     let imports = SHARED_SHADERS
         .iter()
         .copied()
@@ -103,9 +102,6 @@ pub fn full_shaders(
         preprocess::preprocess(shader!("pathtag_reduce"), &full_config, &imports).into(),
         &[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
     )?;
-    if use_cpu {
-        engine.set_cpu_shader(pathtag_reduce, cpu_shader::pathtag_reduce);
-    }
     let pathtag_reduce2 = engine.add_shader(
         device,
         "pathtag_reduce2",
@@ -331,9 +327,42 @@ pub fn full_shaders(
         path_tiling_setup,
         path_tiling,
         fine,
+        pathtag_is_cpu: false,
     })
 }
 
+#[cfg(feature = "wgpu")]
+impl FullShaders {
+    /// Install the CPU shaders.
+    ///
+    /// There are a couple of things to note here. First, the granularity
+    /// provided by this method is coarse; it installs all the shaders. There
+    /// are many use cases (including debugging) where a mix is desired, or
+    /// where the choice between GPU and CPU dispatch might be dynamic.
+    ///
+    /// Second, the actual mapping to CPU shaders is not really specific to
+    /// the engine, and should be split out into a back-end agnostic struct.
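+    ///
+    /// A sketch of the intended call sequence, mirroring what `Renderer::new`
+    /// above does when `use_cpu` is set:
+    ///
+    /// ```ignore
+    /// let mut engine = WgpuEngine::new();
+    /// let mut shaders = shaders::full_shaders(device, &mut engine)?;
+    /// shaders.install_cpu_shaders(&mut engine);
+    /// ```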
+    pub fn install_cpu_shaders(&mut self, engine: &mut WgpuEngine) {
+        engine.set_cpu_shader(self.pathtag_reduce, cpu_shader::pathtag_reduce);
+        engine.set_cpu_shader(self.pathtag_scan, cpu_shader::pathtag_scan);
+        engine.set_cpu_shader(self.bbox_clear, cpu_shader::bbox_clear);
+        engine.set_cpu_shader(self.flatten, cpu_shader::flatten);
+        engine.set_cpu_shader(self.draw_reduce, cpu_shader::draw_reduce);
+        engine.set_cpu_shader(self.draw_leaf, cpu_shader::draw_leaf);
+        engine.set_cpu_shader(self.clip_reduce, cpu_shader::clip_reduce);
+        engine.set_cpu_shader(self.clip_leaf, cpu_shader::clip_leaf);
+        engine.set_cpu_shader(self.binning, cpu_shader::binning);
+        engine.set_cpu_shader(self.tile_alloc, cpu_shader::tile_alloc);
+        engine.set_cpu_shader(self.path_count_setup, cpu_shader::path_count_setup);
+        engine.set_cpu_shader(self.path_count, cpu_shader::path_count);
+        engine.set_cpu_shader(self.backdrop, cpu_shader::backdrop);
+        engine.set_cpu_shader(self.coarse, cpu_shader::coarse);
+        engine.set_cpu_shader(self.path_tiling_setup, cpu_shader::path_tiling_setup);
+        engine.set_cpu_shader(self.path_tiling, cpu_shader::path_tiling);
+        self.pathtag_is_cpu = true;
+    }
+}
+
 macro_rules! shared_shader {
     ($name:expr) => {
         (
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index 12380e32c..c5359c1bb 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -19,6 +19,7 @@ use crate::{
     BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId,
 };
 
+#[derive(Default)]
 pub struct WgpuEngine {
     shaders: Vec<Shader>,
     pool: ResourcePool,
@@ -90,12 +91,7 @@ enum TransientBuf<'a> {
 
 impl WgpuEngine {
     pub fn new() -> WgpuEngine {
-        WgpuEngine {
-            shaders: vec![],
-            pool: Default::default(),
-            bind_map: Default::default(),
-            downloads: Default::default(),
-        }
+        Default::default()
     }
 
     /// Add a shader.