From 28db39ac78078c6cccd181d889a5d92eec917c13 Mon Sep 17 00:00:00 2001
From: Andreas Reich <r_andreas2@web.de>
Date: Sat, 9 Dec 2023 12:51:30 +0100
Subject: [PATCH] split lib.rs into a number of separate modules

---
 src/lib.rs                       | 1069 +-----------------------------
 src/profiler.rs                  |  879 ++++++++++++++++++++++++
 src/profiler_command_recorder.rs |   32 +
 src/profiler_query.rs            |  103 +++
 src/profiler_settings.rs         |   57 ++
 src/tracy.rs                     |    8 +-
 6 files changed, 1083 insertions(+), 1065 deletions(-)
 create mode 100644 src/profiler.rs
 create mode 100644 src/profiler_command_recorder.rs
 create mode 100644 src/profiler_query.rs
 create mode 100644 src/profiler_settings.rs
diff --git a/src/lib.rs b/src/lib.rs
index 0a0caa8..a93169c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -100,1070 +100,17 @@ On [`GpuProfiler::end_frame`], we memorize the total size of all `QueryPool`s in
 
 pub mod chrometrace;
 mod errors;
+mod profiler;
+mod profiler_command_recorder;
+mod profiler_query;
+mod profiler_settings;
 mod scope;
 #[cfg(feature = "tracy")]
 mod tracy;
 
 pub use errors::{CreationError, EndFrameError, SettingsError};
+pub use profiler::GpuProfiler;
+pub use profiler_command_recorder::ProfilerCommandRecorder;
+pub use profiler_query::{GpuProfilerQuery, GpuTimerQueryResult};
+pub use profiler_settings::GpuProfilerSettings;
 pub use scope::{ManualOwningScope, OwningScope, Scope};
-
-// ---------------
-
-use std::{
-    collections::HashMap,
-    ops::Range,
-    sync::{
-        atomic::{AtomicU32, Ordering},
-        Arc,
-    },
-    thread::ThreadId,
-};
-
-use parking_lot::{Mutex, RwLock};
-
-/// The result of a gpu timer scope.
-#[derive(Debug, Clone)]
-pub struct GpuTimerQueryResult {
-    /// Label that was specified when opening the scope.
-    pub label: String,
-
-    /// The process id of the process that opened this scope.
-    pub pid: u32,
-
-    /// The thread id of the thread that opened this scope.
-    pub tid: ThreadId,
-
-    /// Time range of this scope in seconds.
-    ///
-    /// Meaning of absolute value is not defined.
-    pub time: Range<f64>,
-
-    /// Scopes that were opened while this scope was open.
-    pub nested_queries: Vec<GpuTimerQueryResult>,
-}
-
-/// An inflight query for the profiler.
-///
-/// If timer queries are enabled, this represents a reserved timer query pair on
-/// one of the profiler's query sets.
-/// *Must* be closed by calling [`GpuProfiler::end_query`].
-///
-/// Emitted by [`GpuProfiler::begin_query`]/[`GpuProfiler::begin_pass_query`] and consumed by [`GpuProfiler::end_query`].
-pub struct GpuProfilerQuery {
-    /// The label assigned to this query.
-    /// Will be moved into [`GpuProfilerQuery::label`] once the query is fully processed.
-    pub label: String,
-
-    /// The process id of the process that opened this query.
-    pub pid: u32,
-
-    /// The thread id of the thread that opened this query.
-    pub tid: ThreadId,
-
-    /// The actual query on a query pool if any (none if disabled for this type of query).
-    timer_query_pair: Option<ReservedTimerQueryPair>,
-
-    /// Handle which identifies this query, used for building the tree of queries.
-    handle: GpuTimerQueryTreeHandle,
-
-    /// Which query this query is a child of.
-    parent_handle: GpuTimerQueryTreeHandle,
-
-    /// Whether a debug group was opened for this scope.
-    has_debug_group: bool,
-
-    #[cfg(feature = "tracy")]
-    tracy_scope: Option<tracy_client::GpuSpan>,
-}
-
-impl GpuProfilerQuery {
-    /// Use the reserved query for render pass timestamp writes if any.
-    ///
-    /// Use this only for a single render/compute pass, otherwise results will be overwritten.
-    /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`].
-    pub fn render_pass_timestamp_writes(&self) -> Option<wgpu::RenderPassTimestampWrites> {
-        self.timer_query_pair.as_ref().and_then(|query| {
-            (query.usage_state == QueryPairUsageState::ReservedForPassTimestampWrites).then(|| {
-                wgpu::RenderPassTimestampWrites {
-                    query_set: &query.pool.query_set,
-                    beginning_of_pass_write_index: Some(query.start_query_idx),
-                    end_of_pass_write_index: Some(query.start_query_idx + 1),
-                }
-            })
-        })
-    }
-
-    /// Use the reserved query for compute pass timestamp writes if any.
-    ///
-    /// Use this only for a single render/compute pass, otherwise results will be overwritten.
-    /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`].
-    pub fn compute_pass_timestamp_writes(&self) -> Option<wgpu::ComputePassTimestampWrites> {
-        self.timer_query_pair.as_ref().and_then(|query| {
-            (query.usage_state == QueryPairUsageState::ReservedForPassTimestampWrites).then(|| {
-                wgpu::ComputePassTimestampWrites {
-                    query_set: &query.pool.query_set,
-                    beginning_of_pass_write_index: Some(query.start_query_idx),
-                    end_of_pass_write_index: Some(query.start_query_idx + 1),
-                }
-            })
-        })
-    }
-
-    /// Makes this scope a child of the passed scope.
-    #[inline]
-    pub fn with_parent(self, parent: Option<&GpuProfilerQuery>) -> Self {
-        Self {
-            parent_handle: parent.map_or(ROOT_QUERY_HANDLE, |p| p.handle),
-            ..self
-        }
-    }
-}
-
-/// Settings passed on initialization of [`GpuProfiler`].
-#[derive(Debug, Clone)]
-pub struct GpuProfilerSettings {
-    /// Enables/disables gpu timer queries.
-    ///
-    /// If false, the profiler will not emit any timer queries, making most operations on [`GpuProfiler`] no-ops.
-    ///
-    /// Since all resource creation is done lazily, this provides an effective way of disabling the profiler at runtime
-    /// without the need of special build configurations or code to handle enabled/disabled profiling.
-    pub enable_timer_queries: bool,
-
-    /// Enables/disables debug markers for all scopes on the respective encoder or pass.
-    ///
-    /// This is useful for debugging with tools like RenderDoc.
-    /// Debug markers will be emitted even if the device does not support timer queries or disables them via
-    /// [`GpuProfilerSettings::enable_timer_queries`].
-    pub enable_debug_groups: bool,
-
-    /// The profiler queues up to `max_num_pending_frames` "profiler-frames" at a time.
-    ///
-    /// A profiler-frame is regarded as in-flight until its queries have been successfully
-    /// resolved using [`GpuProfiler::process_finished_frame`].
-    /// How long this takes to happen, depends on how fast buffer mappings return successfully
-    /// which in turn primarily depends on how fast the device is able to finish work queued to the [`wgpu::Queue`].
-    ///
-    /// If this threshold is exceeded, [`GpuProfiler::end_frame`] will silently drop frames.
-    /// *Newer* frames will be dropped first in order to get results back eventually.
-    /// (If the profiler were to drop the oldest frame, one may end up in a situation where there is never
-    /// frame that is fully processed and thus never any results to be retrieved).
-    ///
-    /// Good values for `max_num_pending_frames` are 2-4 but may depend on your application workload
-    /// and GPU-CPU syncing strategy.
-    /// Must be greater than 0.
-    pub max_num_pending_frames: usize,
-}
-
-impl Default for GpuProfilerSettings {
-    fn default() -> Self {
-        Self {
-            enable_timer_queries: true,
-            enable_debug_groups: true,
-            max_num_pending_frames: 3,
-        }
-    }
-}
-
-impl GpuProfilerSettings {
-    pub fn validate(&self) -> Result<(), SettingsError> {
-        if self.max_num_pending_frames == 0 {
-            Err(SettingsError::InvalidMaxNumPendingFrames)
-        } else {
-            Ok(())
-        }
-    }
-}
-
-/// Profiler instance.
-///
-/// You can have an arbitrary number of independent profiler instances per application/adapter.
-/// Manages all the necessary [`wgpu::QuerySet`] and [`wgpu::Buffer`] behind the scenes.
-///
-/// Any query creation method may allocate a new [`wgpu::QuerySet`] and [`wgpu::Buffer`] internally if necessary.
-///
-/// After the first call that passes [`wgpu::Device`], the same device must be used with all subsequent
-/// calls to [`GpuProfiler`] and all passed references to wgpu objects must originate from that device.
-pub struct GpuProfiler {
-    unused_pools: Vec<QueryPool>,
-
-    active_frame: ActiveFrame,
-    pending_frames: Vec<PendingFrame>,
-
-    num_open_queries: AtomicU32,
-    next_query_handle: AtomicU32,
-
-    size_for_new_query_pools: u32,
-
-    settings: GpuProfilerSettings,
-
-    #[cfg(feature = "tracy")]
-    tracy_context: Option<tracy_client::GpuContext>,
-}
-
-// Public interface
-impl GpuProfiler {
-    /// Combination of all timer query features [`GpuProfiler`] can leverage.
-    pub const ALL_WGPU_TIMER_FEATURES: wgpu::Features =
-        wgpu::Features::TIMESTAMP_QUERY.union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES);
-
-    /// Combination of all timer query features [`GpuProfiler`] can leverage.
-    #[deprecated(since = "0.9.0", note = "Use ALL_WGPU_TIMER_FEATURES instead")]
-    pub const REQUIRED_WGPU_FEATURES: wgpu::Features = GpuProfiler::ALL_WGPU_TIMER_FEATURES;
-
-    /// Creates a new Profiler object.
-    ///
-    /// There is nothing preventing the use of several independent profiler objects.
-    pub fn new(settings: GpuProfilerSettings) -> Result<Self, CreationError> {
-        settings.validate()?;
-
-        let (closed_scope_sender, closed_scope_receiver) = std::sync::mpsc::channel();
-
-        Ok(GpuProfiler {
-            unused_pools: Vec::new(),
-
-            pending_frames: Vec::with_capacity(settings.max_num_pending_frames),
-            active_frame: ActiveFrame {
-                query_pools: RwLock::new(PendingFramePools::default()),
-                closed_query_sender: closed_scope_sender,
-                closed_query_receiver: Mutex::new(closed_scope_receiver),
-            },
-
-            num_open_queries: AtomicU32::new(0),
-            next_query_handle: AtomicU32::new(0),
-
-            size_for_new_query_pools: QueryPool::MIN_CAPACITY,
-
-            settings,
-
-            #[cfg(feature = "tracy")]
-            tracy_context: None,
-        })
-    }
-
-    /// Creates a new profiler and connects to a running Tracy client.
-    #[cfg(feature = "tracy")]
-    pub fn new_with_tracy_client(
-        settings: GpuProfilerSettings,
-        backend: wgpu::Backend,
-        device: &wgpu::Device,
-        queue: &wgpu::Queue,
-    ) -> Result<Self, CreationError> {
-        let mut profiler = Self::new(settings)?;
-        profiler.tracy_context = Some(tracy::create_tracy_gpu_client(backend, device, queue)?);
-        Ok(profiler)
-    }
-
-    /// Changes the settings of an existing profiler.
-    ///
-    /// If timer scopes are disabled by setting [GpuProfilerSettings::enable_timer_queries] to false,
-    /// any timer queries that are in flight will still be processed,
-    /// but unused query sets and buffers will be deallocated during [`Self::process_finished_frame`].
-    /// Similarly, any opened debugging scope will still be closed if debug groups are disabled by setting
-    /// [GpuProfilerSettings::enable_debug_groups] to false.
-    pub fn change_settings(&mut self, settings: GpuProfilerSettings) -> Result<(), SettingsError> {
-        settings.validate()?;
-        if !settings.enable_timer_queries {
-            self.unused_pools.clear();
-        }
-        self.settings = settings;
-
-        Ok(())
-    }
-
-    /// Starts a new auto-closing profiler scope.
-    ///
-    /// To nest scopes inside this scope, call [`Scope::scope`] on the returned scope.
-    ///
-    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
-    /// not show up in the final results.
-    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
-    ///
-    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
-    ///
-    /// Scope is automatically closed on drop.
-    #[must_use]
-    #[track_caller]
-    #[inline]
-    pub fn scope<'a, Recorder: ProfilerCommandRecorder>(
-        &'a self,
-        label: impl Into<String>,
-        encoder_or_pass: &'a mut Recorder,
-        device: &wgpu::Device,
-    ) -> Scope<'a, Recorder> {
-        let scope = self.begin_query(label, encoder_or_pass, device);
-        Scope {
-            profiler: self,
-            recorder: encoder_or_pass,
-            scope: Some(scope),
-        }
-    }
-
-    /// Starts a new auto-closing profiler scope that takes ownership of the passed encoder or rendering/compute pass.
-    ///
-    /// To nest scopes inside this scope, call [`OwningScope::scope`] on the returned scope.
-    ///
-    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
-    /// not show up in the final results.
-    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
-    ///
-    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
-    ///
-    /// Scope is automatically closed on drop.
-    #[must_use]
-    #[track_caller]
-    #[inline]
-    pub fn owning_scope<'a, Recorder: ProfilerCommandRecorder>(
-        &'a self,
-        label: impl Into<String>,
-        mut encoder_or_pass: Recorder,
-        device: &wgpu::Device,
-    ) -> OwningScope<'a, Recorder> {
-        let scope = self.begin_query(label, &mut encoder_or_pass, device);
-        OwningScope {
-            profiler: self,
-            recorder: encoder_or_pass,
-            scope: Some(scope),
-        }
-    }
-
-    /// Starts a new **manually closed** profiler scope that takes ownership of the passed encoder or rendering/compute pass.
-    ///
-    /// Does NOT call [`GpuProfiler::end_query()`] on drop.
-    /// This construct is just for completeness in cases where working with scopes is preferred but one can't rely on the Drop call in the right place.
-    /// This is useful when the owned value needs to be recovered after the end of the scope.
-    /// In particular, to submit a [`wgpu::CommandEncoder`] to a queue, ownership of the encoder is necessary.
-    ///
-    /// To nest scopes inside this scope, call [`ManualOwningScope::scope`] on the returned scope.
-    ///
-    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
-    /// not show up in the final results.
-    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
-    ///
-    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
-    #[must_use]
-    #[track_caller]
-    #[inline]
-    pub fn manual_owning_scope<'a, Recorder: ProfilerCommandRecorder>(
-        &'a self,
-        label: impl Into<String>,
-        mut encoder_or_pass: Recorder,
-        device: &wgpu::Device,
-    ) -> ManualOwningScope<'a, Recorder> {
-        let scope = self.begin_query(label, &mut encoder_or_pass, device);
-        ManualOwningScope {
-            profiler: self,
-            recorder: encoder_or_pass,
-            scope: Some(scope),
-        }
-    }
-
-    /// Starts a new profiler query on the given encoder or rendering/compute pass (if enabled).
-    ///
-    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`] with the same encoder/pass,
-    /// even if timer queries are disabled.
-    /// To do this automatically, use [`GpuProfiler::scope`]/[`GpuProfiler::owning_scope`] instead.
-    ///
-    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
-    /// not show up in the final results.
-    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
-    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no timer queries will be allocated.
-    ///
-    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
-    #[track_caller]
-    #[must_use]
-    pub fn begin_query<Recorder: ProfilerCommandRecorder>(
-        &self,
-        label: impl Into<String>,
-        encoder_or_pass: &mut Recorder,
-        device: &wgpu::Device,
-    ) -> GpuProfilerQuery {
-        let mut query = self.begin_query_internal(label.into(), encoder_or_pass, device);
-        if let Some(timer_query) = &mut query.timer_query_pair {
-            encoder_or_pass
-                .write_timestamp(&timer_query.pool.query_set, timer_query.start_query_idx);
-            timer_query.usage_state = QueryPairUsageState::OnlyStartWritten;
-        };
-
-        if self.settings.enable_debug_groups {
-            encoder_or_pass.push_debug_group(&query.label);
-            query.has_debug_group = true;
-        }
-        query
-    }
-
-    /// Starts a new profiler query to be used for render/compute pass timestamp writes.
-    ///
-    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`], even if timer queries are disabled.
-    /// To do this automatically, use [`Scope::scoped_render_pass`]/[`Scope::scoped_compute_pass`] instead.
-    ///
-    /// Call [`GpuProfilerQuery::render_pass_timestamp_writes`] or [`GpuProfilerQuery::compute_pass_timestamp_writes`]
-    /// to acquire the corresponding `wgpu::RenderPassTimestampWrites`/`wgpu::ComputePassTimestampWrites` object.
-    ///
-    /// If the [`wgpu::Device`] does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be reserved.
-    ///
-    /// Unlike [`GpuProfiler::begin_query`] this will not create a debug scope,
-    /// in order to not force passing of the same encoder/pass to [`GpuProfiler::end_query`].
-    /// (this is needed to relax resource tracking requirements a bit, making it easier to implement the automatic scopes)
-    pub fn begin_pass_query(
-        &self,
-        label: impl Into<String>,
-        encoder: &mut wgpu::CommandEncoder,
-        device: &wgpu::Device,
-    ) -> GpuProfilerQuery {
-        let mut query = self.begin_query_internal(label.into(), encoder, device);
-        if let Some(timer_query) = &mut query.timer_query_pair {
-            timer_query.usage_state = QueryPairUsageState::ReservedForPassTimestampWrites;
-        }
-        query
-    }
-
-    /// Ends passed query.
-    ///
-    /// If the passed query was opened with [`GpuProfiler::begin_query`], the passed encoder or pass must be the same
-    /// as when the query was opened.
-    pub fn end_query<Recorder: ProfilerCommandRecorder>(
-        &self,
-        encoder_or_pass: &mut Recorder,
-        mut query: GpuProfilerQuery,
-    ) {
-        if let Some(timer_query) = &mut query.timer_query_pair {
-            match timer_query.usage_state {
-                QueryPairUsageState::Reserved => {
-                    unreachable!("Query pair has been reserved but isn't used for anything!")
-                }
-                QueryPairUsageState::ReservedForPassTimestampWrites => {
-                    // No need to do a timestamp write, this is handled by wgpu.
-                }
-                QueryPairUsageState::OnlyStartWritten => {
-                    encoder_or_pass.write_timestamp(
-                        &timer_query.pool.query_set,
-                        timer_query.start_query_idx + 1,
-                    );
-                    timer_query.usage_state = QueryPairUsageState::BothStartAndEndWritten;
-                }
-                QueryPairUsageState::BothStartAndEndWritten => {
-                    unreachable!("Query pair has already been used!")
-                }
-            }
-        }
-
-        #[cfg(feature = "tracy")]
-        if let Some(ref mut tracy_scope) = query.tracy_scope {
-            tracy_scope.end_zone();
-        }
-
-        if query.has_debug_group {
-            encoder_or_pass.pop_debug_group();
-        }
-
-        let send_result = self.active_frame.closed_query_sender.send(query);
-
-        // The only way we can fail sending the query is if the receiver has been dropped.
-        // Since it sits on `active_frame` as well, there's no way for this to happen!
-        debug_assert!(send_result.is_ok());
-
-        // Count queries even if we haven't processed this one, makes experiences more consistent
-        // if there's a lack of support for some queries.
-        self.num_open_queries.fetch_sub(1, Ordering::Release);
-    }
-
-    /// Puts query resolve commands in the encoder for all unresolved, pending queries of the active profiler frame.
-    ///
-    /// Note that you do *not* need to do this for every encoder, it is sufficient do do this once per frame as long
-    /// as you submit the corresponding command buffer after all others that may have opened queries in the same frame.
-    /// (It does not matter if the passed encoder itself has previously opened queries or not.)
-    /// If you were to make this part of a command buffer that is enqueued before any other that has
-    /// opened queries in the same profiling frame, no failure will occur but some timing results may be invalid.
-    ///
-    /// It is advised to call this only once at the end of a profiling frame, but it is safe to do so several times.
-    ///
-    ///
-    /// Implementation note:
-    /// This method could be made `&self`, taking the internal lock on the query pools.
-    /// However, the intended use is to call this once at the end of a frame, so we instead
-    /// encourage this explicit sync point and avoid the lock.
-    pub fn resolve_queries(&mut self, encoder: &mut wgpu::CommandEncoder) {
-        let query_pools = self.active_frame.query_pools.get_mut();
-
-        for query_pool in query_pools.used_pools.iter_mut() {
-            // We sync with the last update of num_used_query (which has Release semantics)
-            // mostly to be on the safe side - it happened inside a lock which gives it release semantics anyways
-            // but the concern is that if we don't acquire here, we may miss on other side prior effects of the query begin.
-            let num_used_queries = query_pool.num_used_queries.load(Ordering::Acquire);
-            let num_resolved_queries = query_pool.num_resolved_queries.load(Ordering::Acquire);
-
-            if num_resolved_queries == num_used_queries {
-                continue;
-            }
-
-            assert!(num_resolved_queries < num_used_queries);
-
-            encoder.resolve_query_set(
-                &query_pool.query_set,
-                num_resolved_queries..num_used_queries,
-                &query_pool.resolve_buffer,
-                (num_resolved_queries * QUERY_SIZE) as u64,
-            );
-            query_pool
-                .num_resolved_queries
-                .store(num_used_queries, Ordering::Release);
-
-            encoder.copy_buffer_to_buffer(
-                &query_pool.resolve_buffer,
-                0,
-                &query_pool.read_buffer,
-                0,
-                (num_used_queries * QUERY_SIZE) as u64,
-            );
-        }
-    }
-
-    /// Marks the end of a frame.
-    ///
-    /// Needs to be called **after** submitting any encoder used in the current profiler frame.
-    ///
-    /// Fails if there are still open queries or unresolved queries.
-    pub fn end_frame(&mut self) -> Result<(), EndFrameError> {
-        let num_open_queries = self.num_open_queries.load(Ordering::Acquire);
-        if num_open_queries != 0 {
-            return Err(EndFrameError::UnclosedQueries(num_open_queries));
-        }
-
-        let query_pools = self.active_frame.query_pools.get_mut();
-
-        let mut new_pending_frame = PendingFrame {
-            query_pools: std::mem::take(&mut query_pools.used_pools),
-            closed_query_by_parent_handle: HashMap::new(),
-            mapped_buffers: Arc::new(AtomicU32::new(0)),
-        };
-
-        for query in self.active_frame.closed_query_receiver.get_mut().try_iter() {
-            new_pending_frame
-                .closed_query_by_parent_handle
-                .entry(query.parent_handle)
-                .or_default()
-                .push(query);
-        }
-
-        // All loads of pool.num_used_queries are Relaxed since we assume,
-        // that we already acquired the state during `resolve_queries` and no further otherwise unobserved
-        // modifications happened since then.
-
-        let num_unresolved_queries = new_pending_frame
-            .query_pools
-            .iter()
-            .map(|pool| {
-                pool.num_used_queries.load(Ordering::Relaxed)
-                    - pool.num_resolved_queries.load(Ordering::Relaxed)
-            })
-            .sum();
-        if num_unresolved_queries != 0 {
-            return Err(EndFrameError::UnresolvedQueries(num_unresolved_queries));
-        }
-
-        // Next time we create a new query pool, we want it to be at least as big to hold all queries of this frame.
-        self.size_for_new_query_pools = self
-            .size_for_new_query_pools
-            .max(
-                new_pending_frame
-                    .query_pools
-                    .iter()
-                    .map(|pool| pool.num_used_queries.load(Ordering::Relaxed))
-                    .sum(),
-            )
-            .min(QUERY_SET_MAX_QUERIES);
-
-        // Make sure we don't overflow.
-        if self.pending_frames.len() == self.settings.max_num_pending_frames {
-            // Drop previous (!) frame.
-            // Dropping the oldest frame could get us into an endless cycle where we're never able to complete
-            // any pending frames as the ones closest to completion would be evicted.
-            if let Some(dropped_frame) = self.pending_frames.pop() {
-                // Drop queries first since they still have references to the query pools that we want to reuse.
-                drop(dropped_frame.closed_query_by_parent_handle);
-
-                // Mark the frame as dropped. We'll give back the query pools once the mapping is done.
-                // Any previously issued map_async call that haven't finished yet, will invoke their callback with mapping abort.
-                self.reset_and_cache_unused_query_pools(dropped_frame.query_pools);
-            }
-        }
-
-        // Map all buffers.
-        for pool in new_pending_frame.query_pools.iter_mut() {
-            let mapped_buffers = new_pending_frame.mapped_buffers.clone();
-            pool.read_buffer
-                .slice(0..(pool.num_used_queries.load(Ordering::Relaxed) * QUERY_SIZE) as u64)
-                .map_async(wgpu::MapMode::Read, move |mapping_result| {
-                    // Mapping should not fail unless it was cancelled due to the frame being dropped.
-                    match mapping_result {
-                        Err(_) => {
-                            // We only want to ignore the error iff the mapping has been aborted by us (due to a dropped frame, see above).
-                            // In any other case, we need should panic as this would imply something went seriously sideways.
-                            //
-                            // As of writing, this is not yet possible in wgpu, see https://github.com/gfx-rs/wgpu/pull/2939
-                        }
-                        Ok(()) => {
-                            mapped_buffers.fetch_add(1, std::sync::atomic::Ordering::Release);
-                        }
-                    }
-                });
-        }
-
-        // Enqueue
-        self.pending_frames.push(new_pending_frame);
-        assert!(self.pending_frames.len() <= self.settings.max_num_pending_frames);
-
-        Ok(())
-    }
-
-    /// Checks if all timer queries for the oldest pending finished frame are done and returns that snapshot if any.
-    ///
-    /// timestamp_period:
-    ///    The timestamp period of the device. Pass the result of [`wgpu::Queue::get_timestamp_period()`].
-    ///    Note that some implementations (Chrome as of writing) may converge to a timestamp period while the application is running,
-    ///    so caching this value is usually not recommended.
-    pub fn process_finished_frame(
-        &mut self,
-        timestamp_period: f32,
-    ) -> Option<Vec<GpuTimerQueryResult>> {
-        let frame = self.pending_frames.first_mut()?;
-
-        // We only process if all mappings succeed.
-        if frame
-            .mapped_buffers
-            .load(std::sync::atomic::Ordering::Acquire)
-            != frame.query_pools.len() as u32
-        {
-            return None;
-        }
-
-        let mut frame = self.pending_frames.remove(0);
-
-        let results = {
-            let timestamp_to_sec = timestamp_period as f64 / 1000.0 / 1000.0 / 1000.0;
-
-            Self::process_timings_recursive(
-                timestamp_to_sec,
-                &mut frame.closed_query_by_parent_handle,
-                ROOT_QUERY_HANDLE,
-            )
-        };
-
-        self.reset_and_cache_unused_query_pools(frame.query_pools);
-
-        Some(results)
-    }
-}
-
-// --------------------------------------------------------------------------------
-// Internals
-// --------------------------------------------------------------------------------
-
-const QUERY_SIZE: u32 = wgpu::QUERY_SIZE;
-const QUERY_SET_MAX_QUERIES: u32 = wgpu::QUERY_SET_MAX_QUERIES;
-
-/// Returns true if a timestamp should be written to the encoder or pass.
-fn timestamp_write_supported<Recorder: ProfilerCommandRecorder>(
-    encoder_or_pass: &mut Recorder,
-    features: wgpu::Features,
-) -> bool {
-    let required_feature = if encoder_or_pass.is_pass() {
-        wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES
-    } else {
-        wgpu::Features::TIMESTAMP_QUERY
-    };
-    features.contains(required_feature)
-}
-
-impl GpuProfiler {
-    fn next_scope_tree_handle(&self) -> GpuTimerQueryTreeHandle {
-        // Relaxed is fine, we just want a number that nobody uses this frame already.
-        let mut handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed);
-
-        // We don't ever expect to run out of handles during a single frame, but who knows how long the app runs.
-        while handle == ROOT_QUERY_HANDLE {
-            handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed);
-        }
-
-        handle
-    }
-
-    fn reset_and_cache_unused_query_pools(&mut self, mut discarded_pools: Vec<Arc<QueryPool>>) {
-        let capacity_threshold = self.size_for_new_query_pools / 2;
-        for pool in discarded_pools.drain(..) {
-            // If the pool is truly unused now, it's ref count should be 1!
-            // If we use it anywhere else we have an implementation bug.
-            let mut pool = Arc::into_inner(pool).expect("Pool still in use");
-            pool.reset();
-
-            // If a pool was less than half of the size of the max frame, then we don't keep it.
-            // This way we're going to need less pools in upcoming frames and thus have less overhead in the long run.
-            // If timer queries were disabled, we also don't keep any pools.
-            if self.settings.enable_timer_queries && pool.capacity >= capacity_threshold {
-                self.active_frame
-                    .query_pools
-                    .get_mut()
-                    .unused_pools
-                    .push(pool);
-            }
-        }
-    }
-
-    fn try_reserve_query_pair(pool: &Arc<QueryPool>) -> Option<ReservedTimerQueryPair> {
-        let mut num_used_queries = pool.num_used_queries.load(Ordering::Relaxed);
-
-        loop {
-            if pool.capacity < num_used_queries + 2 {
-                // This pool is out of capacity, we failed the operation.
-                return None;
-            }
-
-            match pool.num_used_queries.compare_exchange_weak(
-                num_used_queries,
-                num_used_queries + 2,
-                // Write to num_used_queries with release semantics to be on the safe side.
-                // (It doesn't look like there's other side effects that we need to publish.)
-                Ordering::Release,
-                // No barrier for the failure case.
-                // The only thing we have to acquire is the pool's capacity which is constant and
-                // was definitely acquired by the RWLock prior to this call.
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => {
-                    // We successfully acquired two queries!
-                    return Some(ReservedTimerQueryPair {
-                        pool: pool.clone(),
-                        start_query_idx: num_used_queries,
-                        usage_state: QueryPairUsageState::Reserved,
-                    });
-                }
-                Err(updated) => {
-                    // Someone else acquired queries in the meantime, try again.
-                    num_used_queries = updated;
-                }
-            }
-        }
-    }
-
-    // Reserves two query objects.
-    // Our query pools always have an even number of queries, so we know the next query is the next in the same pool.
-    fn reserve_query_pair(&self, device: &wgpu::Device) -> ReservedTimerQueryPair {
-        // First, try to allocate from current top pool.
-        // Requires taking a read lock on the current query pool.
-        {
-            let query_pools = self.active_frame.query_pools.read();
-            if let Some(pair) = query_pools
-                .used_pools
-                .last()
-                .and_then(Self::try_reserve_query_pair)
-            {
-                return pair;
-            }
-        }
-        // If this didn't work, we may need to add a new pool.
-        // Requires taking a write lock on the current query pool.
-        {
-            let mut query_pools = self.active_frame.query_pools.write();
-
-            // It could be that by now, another thread has already added a new pool!
-            // This is a bit unfortunate because it means we unnecessarily took a write lock, but it seems hard to get around this.
-            if let Some(pair) = query_pools
-                .used_pools
-                .last()
-                .and_then(Self::try_reserve_query_pair)
-            {
-                return pair;
-            }
-
-            // Now we know for certain that the last pool is exhausted, so add a new one!
-            let new_pool = if let Some(reused_pool) = query_pools.unused_pools.pop() {
-                // First check if there's an unused pool we can take.
-                Arc::new(reused_pool)
-            } else {
-                // If we can't, create a new pool that is as big as all previous pools combined.
-                Arc::new(QueryPool::new(
-                    query_pools
-                        .used_pools
-                        .iter()
-                        .map(|pool| pool.capacity)
-                        .sum::<u32>()
-                        .max(self.size_for_new_query_pools)
-                        .min(QUERY_SET_MAX_QUERIES),
-                    device,
-                ))
-            };
-
-            let pair = Self::try_reserve_query_pair(&new_pool)
-                .expect("Freshly reserved pool doesn't have enough capacity");
-            query_pools.used_pools.push(new_pool);
-
-            pair
-        }
-    }
-
-    #[track_caller]
-    #[must_use]
-    fn begin_query_internal<Recorder: ProfilerCommandRecorder>(
-        &self,
-        label: String,
-        encoder_or_pass: &mut Recorder,
-        device: &wgpu::Device,
-    ) -> GpuProfilerQuery {
-        // Give opening/closing queries acquire/release semantics:
-        // This way, we won't get any nasty surprises when observing zero open queries.
-        self.num_open_queries.fetch_add(1, Ordering::Acquire);
-
-        let query = if self.settings.enable_timer_queries
-            && timestamp_write_supported(encoder_or_pass, device.features())
-        {
-            Some(self.reserve_query_pair(device))
-        } else {
-            None
-        };
-
-        let _tracy_scope = if self.settings.enable_timer_queries {
-            #[cfg(feature = "tracy")]
-            {
-                let location = std::panic::Location::caller();
-                self.tracy_context.as_ref().and_then(|c| {
-                    c.span_alloc(&label, "", location.file(), location.line())
-                        .ok()
-                })
-            }
-            #[cfg(not(feature = "tracy"))]
-            Option::<()>::None
-        } else {
-            None
-        };
-
-        GpuProfilerQuery {
-            label,
-            pid: std::process::id(),
-            tid: std::thread::current().id(),
-            timer_query_pair: query,
-            handle: self.next_scope_tree_handle(),
-            parent_handle: ROOT_QUERY_HANDLE,
-            has_debug_group: false,
-            #[cfg(feature = "tracy")]
-            tracy_scope: _tracy_scope,
-        }
-    }
-
-    fn process_timings_recursive(
-        timestamp_to_sec: f64,
-        closed_scope_by_parent_handle: &mut HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,
-        parent_handle: GpuTimerQueryTreeHandle,
-    ) -> Vec<GpuTimerQueryResult> {
-        let Some(queries_with_same_parent) = closed_scope_by_parent_handle.remove(&parent_handle)
-        else {
-            return Vec::new();
-        };
-
-        queries_with_same_parent
-            .into_iter()
-            .filter_map(|mut scope| {
-                let Some(query) = scope.timer_query_pair.take() else {
-                    // Inactive queries don't have any results or nested queries with results.
-                    // Currently, we drop them from the results completely.
-                    // In the future we could still make them show up since they convey information like label & pid/tid.
-                    return None;
-                };
-
-                // Read timestamp from buffer.
-                // By design timestamps for start/end are consecutive.
-                let offset = (query.start_query_idx * QUERY_SIZE) as u64;
-                let buffer_slice = &query
-                    .pool
-                    .read_buffer
-                    .slice(offset..(offset + (QUERY_SIZE * 2) as u64))
-                    .get_mapped_range();
-                let start_raw =
-                    u64::from_le_bytes(buffer_slice[0..QUERY_SIZE as usize].try_into().unwrap());
-                let end_raw = u64::from_le_bytes(
-                    buffer_slice[QUERY_SIZE as usize..(QUERY_SIZE as usize) * 2]
-                        .try_into()
-                        .unwrap(),
-                );
-
-                #[cfg(feature = "tracy")]
-                if let Some(tracy_scope) = scope.tracy_scope.take() {
-                    tracy_scope.upload_timestamp(start_raw as i64, end_raw as i64);
-                }
-
-                let nested_queries = Self::process_timings_recursive(
-                    timestamp_to_sec,
-                    closed_scope_by_parent_handle,
-                    scope.handle,
-                );
-
-                Some(GpuTimerQueryResult {
-                    label: std::mem::take(&mut scope.label),
-                    time: (start_raw as f64 * timestamp_to_sec)
-                        ..(end_raw as f64 * timestamp_to_sec),
-                    nested_queries,
-                    pid: scope.pid,
-                    tid: scope.tid,
-                })
-            })
-            .collect::<Vec<_>>()
-    }
-}
-
-#[derive(PartialEq, Eq)]
-enum QueryPairUsageState {
-    /// Transitional state used upon creation.
-    Reserved,
-
-    /// Don't do manual timestamp writes, wgpu is expected to do them for us.
-    ReservedForPassTimestampWrites,
-
-    /// Start query has been used, end query is still available.
-    OnlyStartWritten,
-
-    /// Both start & end query have been used.
-    BothStartAndEndWritten,
-}
-
-struct ReservedTimerQueryPair {
-    /// QueryPool on which both start & end queries of the scope are done.
-    ///
-    /// By putting an arc here instead of an index into a vec, we don't need
-    /// need to take any locks upon closing a profiling scope.
-    pool: Arc<QueryPool>,
-
-    /// Query index at which the scope begins.
-    /// The query after this is reserved for the end of the scope.
-    start_query_idx: u32,
-
-    /// Current use of the query pair.
-    usage_state: QueryPairUsageState,
-}
-
-/// A pool of queries, consisting of a single queryset & buffer for query results.
-#[derive(Debug)]
-struct QueryPool {
-    query_set: wgpu::QuerySet,
-
-    resolve_buffer: wgpu::Buffer,
-    read_buffer: wgpu::Buffer,
-
-    capacity: u32,
-    num_used_queries: AtomicU32,
-    num_resolved_queries: AtomicU32,
-}
-
-impl QueryPool {
-    const MIN_CAPACITY: u32 = 32;
-
-    fn new(capacity: u32, device: &wgpu::Device) -> Self {
-        QueryPool {
-            query_set: device.create_query_set(&wgpu::QuerySetDescriptor {
-                label: Some("GpuProfiler - Query Set"),
-                ty: wgpu::QueryType::Timestamp,
-                count: capacity,
-            }),
-
-            resolve_buffer: device.create_buffer(&wgpu::BufferDescriptor {
-                label: Some("GpuProfiler - Query Resolve Buffer"),
-                size: (QUERY_SIZE * capacity) as u64,
-                usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
-                mapped_at_creation: false,
-            }),
-
-            read_buffer: device.create_buffer(&wgpu::BufferDescriptor {
-                label: Some("GpuProfiler - Query Read Buffer"),
-                size: (QUERY_SIZE * capacity) as u64,
-                usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
-                mapped_at_creation: false,
-            }),
-
-            capacity,
-            num_used_queries: AtomicU32::new(0),
-            num_resolved_queries: AtomicU32::new(0),
-        }
-    }
-
-    fn reset(&mut self) {
-        self.num_used_queries = AtomicU32::new(0);
-        self.num_resolved_queries = AtomicU32::new(0);
-        self.read_buffer.unmap();
-    }
-}
-
-#[derive(Default)]
-struct PendingFramePools {
-    /// List of all pools used in this frame.
-    /// The last pool is the one new profiling queries will try to make timer queries into.
-    used_pools: Vec<Arc<QueryPool>>,
-
-    /// List of unused pools recycled from previous frames.
-    unused_pools: Vec<QueryPool>,
-}
-
-/// Internal handle to building a tree of profiling queries.
-type GpuTimerQueryTreeHandle = u32;
-
-/// Handle for the root scope.
-const ROOT_QUERY_HANDLE: GpuTimerQueryTreeHandle = std::u32::MAX;
-
-struct ActiveFrame {
-    query_pools: RwLock<PendingFramePools>,
-
-    /// Closed queries get send to this channel.
-    ///
-    /// Note that channel is still overkill for what we want here:
-    /// We're in a multi producer situation, *but* the single consumer is known to be only
-    /// active in a mut context, i.e. while we're consuming we know that we're not producing.
-    /// We have to wrap it in a Mutex because the channel is not Sync, but we actually never lock it
-    /// since we only ever access it in a `mut` context.
-    closed_query_sender: std::sync::mpsc::Sender<GpuProfilerQuery>,
-    closed_query_receiver: Mutex<std::sync::mpsc::Receiver<GpuProfilerQuery>>,
-}
-
-struct PendingFrame {
-    query_pools: Vec<Arc<QueryPool>>,
-    closed_query_by_parent_handle: HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,
-
-    /// Keeps track of the number of buffers in the query pool that have been mapped successfully.
-    mapped_buffers: std::sync::Arc<std::sync::atomic::AtomicU32>,
-}
-
-pub trait ProfilerCommandRecorder {
-    /// Returns `true` if it's a pass or `false` if it's an encoder
-    fn is_pass(&self) -> bool;
-    fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32);
-    fn push_debug_group(&mut self, label: &str);
-    fn pop_debug_group(&mut self);
-}
-
-macro_rules! ImplProfilerCommandRecorder {
-    ($($name:ident $(< $lt:lifetime >)? : $pass:literal,)*) => {
-        $(
-            impl $(< $lt >)? ProfilerCommandRecorder for wgpu::$name $(< $lt >)? {
-                fn is_pass(&self) -> bool { $pass }
-
-                fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32) {
-                    self.write_timestamp(query_set, query_index)
-                }
-
-                fn push_debug_group(&mut self, label: &str) {
-                    self.push_debug_group(label)
-                }
-
-                fn pop_debug_group(&mut self) {
-                    self.pop_debug_group()
-                }
-            }
-        )*
-    };
-}
-
-ImplProfilerCommandRecorder!(CommandEncoder:false, RenderPass<'a>:true, ComputePass<'a>:true,);
diff --git a/src/profiler.rs b/src/profiler.rs
new file mode 100644
index 0000000..64b7b21
--- /dev/null
+++ b/src/profiler.rs
@@ -0,0 +1,879 @@
+use std::{
+    collections::HashMap,
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        Arc,
+    },
+};
+
+use parking_lot::{Mutex, RwLock};
+
+use crate::{
+    CreationError, EndFrameError, GpuProfilerQuery, GpuProfilerSettings, GpuTimerQueryResult,
+    ManualOwningScope, OwningScope, ProfilerCommandRecorder, Scope, SettingsError,
+};
+
+/// Profiler instance.
+///
+/// You can have an arbitrary number of independent profiler instances per application/adapter.
+/// Manages all the necessary [`wgpu::QuerySet`] and [`wgpu::Buffer`] behind the scenes.
+///
+/// Any query creation method may allocate a new [`wgpu::QuerySet`] and [`wgpu::Buffer`] internally if necessary.
+///
+/// After the first call that passes [`wgpu::Device`], the same device must be used with all subsequent
+/// calls to [`GpuProfiler`] and all passed references to wgpu objects must originate from that device.
+pub struct GpuProfiler {
+    unused_pools: Vec<QueryPool>,
+
+    active_frame: ActiveFrame,
+    pending_frames: Vec<PendingFrame>,
+
+    num_open_queries: AtomicU32,
+    next_query_handle: AtomicU32,
+
+    size_for_new_query_pools: u32,
+
+    settings: GpuProfilerSettings,
+
+    #[cfg(feature = "tracy")]
+    tracy_context: Option<tracy_client::GpuContext>,
+}
+
+// Public interface
+impl GpuProfiler {
+    /// Combination of all timer query features [`GpuProfiler`] can leverage.
+    pub const ALL_WGPU_TIMER_FEATURES: wgpu::Features =
+        wgpu::Features::TIMESTAMP_QUERY.union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES);
+
+    /// Combination of all timer query features [`GpuProfiler`] can leverage.
+    #[deprecated(since = "0.9.0", note = "Use ALL_WGPU_TIMER_FEATURES instead")]
+    pub const REQUIRED_WGPU_FEATURES: wgpu::Features = GpuProfiler::ALL_WGPU_TIMER_FEATURES;
+
+    /// Creates a new Profiler object.
+    ///
+    /// There is nothing preventing the use of several independent profiler objects.
+    pub fn new(settings: GpuProfilerSettings) -> Result<Self, CreationError> {
+        settings.validate()?;
+
+        let (closed_scope_sender, closed_scope_receiver) = std::sync::mpsc::channel();
+
+        Ok(GpuProfiler {
+            unused_pools: Vec::new(),
+
+            pending_frames: Vec::with_capacity(settings.max_num_pending_frames),
+            active_frame: ActiveFrame {
+                query_pools: RwLock::new(PendingFramePools::default()),
+                closed_query_sender: closed_scope_sender,
+                closed_query_receiver: Mutex::new(closed_scope_receiver),
+            },
+
+            num_open_queries: AtomicU32::new(0),
+            next_query_handle: AtomicU32::new(0),
+
+            size_for_new_query_pools: QueryPool::MIN_CAPACITY,
+
+            settings,
+
+            #[cfg(feature = "tracy")]
+            tracy_context: None,
+        })
+    }
+
+    /// Creates a new profiler and connects to a running Tracy client.
+    #[cfg(feature = "tracy")]
+    pub fn new_with_tracy_client(
+        settings: GpuProfilerSettings,
+        backend: wgpu::Backend,
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+    ) -> Result<Self, CreationError> {
+        let mut profiler = Self::new(settings)?;
+        profiler.tracy_context = Some(crate::tracy::create_tracy_gpu_client(
+            backend, device, queue,
+        )?);
+        Ok(profiler)
+    }
+
+    /// Changes the settings of an existing profiler.
+    ///
+    /// If timer scopes are disabled by setting [`GpuProfilerSettings::enable_timer_queries`] to false,
+    /// any timer queries that are in flight will still be processed,
+    /// but unused query sets and buffers will be deallocated during [`Self::process_finished_frame`].
+    /// Similarly, any opened debugging scope will still be closed if debug groups are disabled by setting
+    /// [`GpuProfilerSettings::enable_debug_groups`] to false.
+    pub fn change_settings(&mut self, settings: GpuProfilerSettings) -> Result<(), SettingsError> {
+        settings.validate()?;
+        if !settings.enable_timer_queries {
+            self.unused_pools.clear();
+        }
+        self.settings = settings;
+
+        Ok(())
+    }
+
+    /// Starts a new auto-closing profiler scope.
+    ///
+    /// To nest scopes inside this scope, call [`Scope::scope`] on the returned scope.
+    ///
+    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
+    /// not show up in the final results.
+    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
+    ///
+    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
+    ///
+    /// Scope is automatically closed on drop.
+    #[must_use]
+    #[track_caller]
+    #[inline]
+    pub fn scope<'a, Recorder: ProfilerCommandRecorder>(
+        &'a self,
+        label: impl Into<String>,
+        encoder_or_pass: &'a mut Recorder,
+        device: &wgpu::Device,
+    ) -> Scope<'a, Recorder> {
+        let scope = self.begin_query(label, encoder_or_pass, device);
+        Scope {
+            profiler: self,
+            recorder: encoder_or_pass,
+            scope: Some(scope),
+        }
+    }
+
+    /// Starts a new auto-closing profiler scope that takes ownership of the passed encoder or rendering/compute pass.
+    ///
+    /// To nest scopes inside this scope, call [`OwningScope::scope`] on the returned scope.
+    ///
+    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
+    /// not show up in the final results.
+    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
+    ///
+    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
+    ///
+    /// Scope is automatically closed on drop.
+    #[must_use]
+    #[track_caller]
+    #[inline]
+    pub fn owning_scope<'a, Recorder: ProfilerCommandRecorder>(
+        &'a self,
+        label: impl Into<String>,
+        mut encoder_or_pass: Recorder,
+        device: &wgpu::Device,
+    ) -> OwningScope<'a, Recorder> {
+        let scope = self.begin_query(label, &mut encoder_or_pass, device);
+        OwningScope {
+            profiler: self,
+            recorder: encoder_or_pass,
+            scope: Some(scope),
+        }
+    }
+
+    /// Starts a new **manually closed** profiler scope that takes ownership of the passed encoder or rendering/compute pass.
+    ///
+    /// Does NOT call [`GpuProfiler::end_query()`] on drop.
+    /// This construct is just for completeness in cases where working with scopes is preferred but one can't rely on the Drop call happening in the right place.
+    /// This is useful when the owned value needs to be recovered after the end of the scope.
+    /// In particular, to submit a [`wgpu::CommandEncoder`] to a queue, ownership of the encoder is necessary.
+    ///
+    /// To nest scopes inside this scope, call [`ManualOwningScope::scope`] on the returned scope.
+    ///
+    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
+    /// not show up in the final results.
+    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
+    ///
+    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
+    #[must_use]
+    #[track_caller]
+    #[inline]
+    pub fn manual_owning_scope<'a, Recorder: ProfilerCommandRecorder>(
+        &'a self,
+        label: impl Into<String>,
+        mut encoder_or_pass: Recorder,
+        device: &wgpu::Device,
+    ) -> ManualOwningScope<'a, Recorder> {
+        let scope = self.begin_query(label, &mut encoder_or_pass, device);
+        ManualOwningScope {
+            profiler: self,
+            recorder: encoder_or_pass,
+            scope: Some(scope),
+        }
+    }
+
+    /// Starts a new profiler query on the given encoder or rendering/compute pass (if enabled).
+    ///
+    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`] with the same encoder/pass,
+    /// even if timer queries are disabled.
+    /// To do this automatically, use [`GpuProfiler::scope`]/[`GpuProfiler::owning_scope`] instead.
+    ///
+    /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will
+    /// not show up in the final results.
+    /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
+    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no timer queries will be allocated.
+    ///
+    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
+    #[track_caller]
+    #[must_use]
+    pub fn begin_query<Recorder: ProfilerCommandRecorder>(
+        &self,
+        label: impl Into<String>,
+        encoder_or_pass: &mut Recorder,
+        device: &wgpu::Device,
+    ) -> GpuProfilerQuery {
+        let mut query = self.begin_query_internal(label.into(), encoder_or_pass, device);
+        if let Some(timer_query) = &mut query.timer_query_pair {
+            encoder_or_pass
+                .write_timestamp(&timer_query.pool.query_set, timer_query.start_query_idx);
+            timer_query.usage_state = QueryPairUsageState::OnlyStartWritten;
+        };
+
+        if self.settings.enable_debug_groups {
+            encoder_or_pass.push_debug_group(&query.label);
+            query.has_debug_group = true;
+        }
+        query
+    }
+
+    /// Starts a new profiler query to be used for render/compute pass timestamp writes.
+    ///
+    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`], even if timer queries are disabled.
+    /// To do this automatically, use [`Scope::scoped_render_pass`]/[`Scope::scoped_compute_pass`] instead.
+    ///
+    /// Call [`GpuProfilerQuery::render_pass_timestamp_writes`] or [`GpuProfilerQuery::compute_pass_timestamp_writes`]
+    /// to acquire the corresponding `wgpu::RenderPassTimestampWrites`/`wgpu::ComputePassTimestampWrites` object.
+    ///
+    /// If the [`wgpu::Device`] does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be reserved.
+    ///
+    /// Unlike [`GpuProfiler::begin_query`] this will not create a debug scope,
+    /// in order to not force passing of the same encoder/pass to [`GpuProfiler::end_query`].
+    /// (this is needed to relax resource tracking requirements a bit, making it easier to implement the automatic scopes)
+    pub fn begin_pass_query(
+        &self,
+        label: impl Into<String>,
+        encoder: &mut wgpu::CommandEncoder,
+        device: &wgpu::Device,
+    ) -> GpuProfilerQuery {
+        let mut query = self.begin_query_internal(label.into(), encoder, device);
+        if let Some(timer_query) = &mut query.timer_query_pair {
+            timer_query.usage_state = QueryPairUsageState::ReservedForPassTimestampWrites;
+        }
+        query
+    }
+
+    /// Ends passed query.
+    ///
+    /// If the passed query was opened with [`GpuProfiler::begin_query`], the passed encoder or pass must be the same
+    /// as when the query was opened.
+    pub fn end_query<Recorder: ProfilerCommandRecorder>(
+        &self,
+        encoder_or_pass: &mut Recorder,
+        mut query: GpuProfilerQuery,
+    ) {
+        if let Some(timer_query) = &mut query.timer_query_pair {
+            match timer_query.usage_state {
+                QueryPairUsageState::Reserved => {
+                    unreachable!("Query pair has been reserved but isn't used for anything!")
+                }
+                QueryPairUsageState::ReservedForPassTimestampWrites => {
+                    // No need to do a timestamp write, this is handled by wgpu.
+                }
+                QueryPairUsageState::OnlyStartWritten => {
+                    encoder_or_pass.write_timestamp(
+                        &timer_query.pool.query_set,
+                        timer_query.start_query_idx + 1,
+                    );
+                    timer_query.usage_state = QueryPairUsageState::BothStartAndEndWritten;
+                }
+                QueryPairUsageState::BothStartAndEndWritten => {
+                    unreachable!("Query pair has already been used!")
+                }
+            }
+        }
+
+        #[cfg(feature = "tracy")]
+        if let Some(ref mut tracy_scope) = query.tracy_scope {
+            tracy_scope.end_zone();
+        }
+
+        if query.has_debug_group {
+            encoder_or_pass.pop_debug_group();
+        }
+
+        let send_result = self.active_frame.closed_query_sender.send(query);
+
+        // The only way we can fail sending the query is if the receiver has been dropped.
+        // Since it sits on `active_frame` as well, there's no way for this to happen!
+        debug_assert!(send_result.is_ok());
+
+        // Count queries even if we haven't processed this one; this makes the experience more consistent
+        // if there's a lack of support for some queries.
+        self.num_open_queries.fetch_sub(1, Ordering::Release);
+    }
+
+    /// Puts query resolve commands in the encoder for all unresolved, pending queries of the active profiler frame.
+    ///
+    /// Note that you do *not* need to do this for every encoder, it is sufficient to do this once per frame as long
+    /// as you submit the corresponding command buffer after all others that may have opened queries in the same frame.
+    /// (It does not matter if the passed encoder itself has previously opened queries or not.)
+    /// If you were to make this part of a command buffer that is enqueued before any other that has
+    /// opened queries in the same profiling frame, no failure will occur but some timing results may be invalid.
+    ///
+    /// It is advised to call this only once at the end of a profiling frame, but it is safe to do so several times.
+    ///
+    ///
+    /// Implementation note:
+    /// This method could be made `&self`, taking the internal lock on the query pools.
+    /// However, the intended use is to call this once at the end of a frame, so we instead
+    /// encourage this explicit sync point and avoid the lock.
+    pub fn resolve_queries(&mut self, encoder: &mut wgpu::CommandEncoder) {
+        let query_pools = self.active_frame.query_pools.get_mut();
+
+        for query_pool in query_pools.used_pools.iter_mut() {
+            // We sync with the last update of `num_used_queries` (which has Release semantics)
+            // mostly to be on the safe side - it happened inside a lock, which gives it release semantics anyway,
+            // but the concern is that if we don't acquire here, we may miss other prior side effects of the query begin.
+            let num_used_queries = query_pool.num_used_queries.load(Ordering::Acquire);
+            let num_resolved_queries = query_pool.num_resolved_queries.load(Ordering::Acquire);
+
+            if num_resolved_queries == num_used_queries {
+                continue;
+            }
+
+            assert!(num_resolved_queries < num_used_queries);
+
+            encoder.resolve_query_set(
+                &query_pool.query_set,
+                num_resolved_queries..num_used_queries,
+                &query_pool.resolve_buffer,
+                (num_resolved_queries * wgpu::QUERY_SIZE) as u64,
+            );
+            query_pool
+                .num_resolved_queries
+                .store(num_used_queries, Ordering::Release);
+
+            encoder.copy_buffer_to_buffer(
+                &query_pool.resolve_buffer,
+                0,
+                &query_pool.read_buffer,
+                0,
+                (num_used_queries * wgpu::QUERY_SIZE) as u64,
+            );
+        }
+    }
+
+    /// Marks the end of a frame.
+    ///
+    /// Needs to be called **after** submitting any encoder used in the current profiler frame.
+    ///
+    /// Fails if there are still open queries or unresolved queries.
+    pub fn end_frame(&mut self) -> Result<(), EndFrameError> {
+        let num_open_queries = self.num_open_queries.load(Ordering::Acquire);
+        if num_open_queries != 0 {
+            return Err(EndFrameError::UnclosedQueries(num_open_queries));
+        }
+
+        let query_pools = self.active_frame.query_pools.get_mut();
+
+        let mut new_pending_frame = PendingFrame {
+            query_pools: std::mem::take(&mut query_pools.used_pools),
+            closed_query_by_parent_handle: HashMap::new(),
+            mapped_buffers: Arc::new(AtomicU32::new(0)),
+        };
+
+        for query in self.active_frame.closed_query_receiver.get_mut().try_iter() {
+            new_pending_frame
+                .closed_query_by_parent_handle
+                .entry(query.parent_handle)
+                .or_default()
+                .push(query);
+        }
+
+        // All loads of pool.num_used_queries are Relaxed since we assume
+        // that we already acquired the state during `resolve_queries` and that no further, otherwise unobserved,
+        // modifications happened since then.
+
+        let num_unresolved_queries = new_pending_frame
+            .query_pools
+            .iter()
+            .map(|pool| {
+                pool.num_used_queries.load(Ordering::Relaxed)
+                    - pool.num_resolved_queries.load(Ordering::Relaxed)
+            })
+            .sum();
+        if num_unresolved_queries != 0 {
+            return Err(EndFrameError::UnresolvedQueries(num_unresolved_queries));
+        }
+
+        // Next time we create a new query pool, we want it to be at least as big to hold all queries of this frame.
+        self.size_for_new_query_pools = self
+            .size_for_new_query_pools
+            .max(
+                new_pending_frame
+                    .query_pools
+                    .iter()
+                    .map(|pool| pool.num_used_queries.load(Ordering::Relaxed))
+                    .sum(),
+            )
+            .min(QUERY_SET_MAX_QUERIES);
+
+        // Make sure we don't overflow.
+        if self.pending_frames.len() == self.settings.max_num_pending_frames {
+            // Drop previous (!) frame.
+            // Dropping the oldest frame could get us into an endless cycle where we're never able to complete
+            // any pending frames as the ones closest to completion would be evicted.
+            if let Some(dropped_frame) = self.pending_frames.pop() {
+                // Drop queries first since they still have references to the query pools that we want to reuse.
+                drop(dropped_frame.closed_query_by_parent_handle);
+
+                // Mark the frame as dropped. We'll give back the query pools once the mapping is done.
+                // Any previously issued map_async call that haven't finished yet, will invoke their callback with mapping abort.
+                self.reset_and_cache_unused_query_pools(dropped_frame.query_pools);
+            }
+        }
+
+        // Map all buffers.
+        for pool in new_pending_frame.query_pools.iter_mut() {
+            let mapped_buffers = new_pending_frame.mapped_buffers.clone();
+            pool.read_buffer
+                .slice(0..(pool.num_used_queries.load(Ordering::Relaxed) * wgpu::QUERY_SIZE) as u64)
+                .map_async(wgpu::MapMode::Read, move |mapping_result| {
+                    // Mapping should not fail unless it was cancelled due to the frame being dropped.
+                    match mapping_result {
+                        Err(_) => {
+                            // We only want to ignore the error iff the mapping has been aborted by us (due to a dropped frame, see above).
+                            // In any other case, we need should panic as this would imply something went seriously sideways.
+                            //
+                            // As of writing, this is not yet possible in wgpu, see https://github.com/gfx-rs/wgpu/pull/2939
+                        }
+                        Ok(()) => {
+                            mapped_buffers.fetch_add(1, std::sync::atomic::Ordering::Release);
+                        }
+                    }
+                });
+        }
+
+        // Enqueue
+        self.pending_frames.push(new_pending_frame);
+        assert!(self.pending_frames.len() <= self.settings.max_num_pending_frames);
+
+        Ok(())
+    }
+
+    /// Checks if all timer queries for the oldest pending frame are done and returns that snapshot if any.
+    ///
+    /// timestamp_period:
+    ///    The timestamp period of the device. Pass the result of [`wgpu::Queue::get_timestamp_period()`].
+    ///    Note that some implementations (Chrome as of writing) may converge to a timestamp period while the application is running,
+    ///    so caching this value is usually not recommended.
+    pub fn process_finished_frame(
+        &mut self,
+        timestamp_period: f32,
+    ) -> Option<Vec<GpuTimerQueryResult>> {
+        let frame = self.pending_frames.first()?;
+
+        // We only process the frame once the read buffers of all its pools were mapped successfully.
+        if frame
+            .mapped_buffers
+            .load(std::sync::atomic::Ordering::Acquire)
+            != frame.query_pools.len() as u32
+        {
+            return None;
+        }
+
+        let mut frame = self.pending_frames.remove(0);
+
+        let timestamp_to_sec = timestamp_period as f64 / 1000.0 / 1000.0 / 1000.0;
+        let results = Self::process_timings_recursive(
+            timestamp_to_sec,
+            &mut frame.closed_query_by_parent_handle,
+            ROOT_QUERY_HANDLE,
+        );
+
+        // Queries that are not reachable from the root (e.g. children of an inactive query) are still in the map
+        // and hold references to the pools. Drop them first, otherwise recycling the pools below would panic.
+        drop(frame.closed_query_by_parent_handle);
+        self.reset_and_cache_unused_query_pools(frame.query_pools);
+
+        Some(results)
+    }
+}
+
+// --------------------------------------------------------------------------------
+// Internals
+// --------------------------------------------------------------------------------
+
+const QUERY_SET_MAX_QUERIES: u32 = wgpu::QUERY_SET_MAX_QUERIES; // Upper bound for the capacity of a single query pool.
+
+/// Checks whether the given device features allow writing a timestamp on the encoder or pass.
+fn timestamp_write_supported<Recorder: ProfilerCommandRecorder>(
+    encoder_or_pass: &mut Recorder,
+    features: wgpu::Features,
+) -> bool {
+    // Timestamp writes inside of passes are gated by a separate feature.
+    if encoder_or_pass.is_pass() {
+        features.contains(wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES)
+    } else {
+        features.contains(wgpu::Features::TIMESTAMP_QUERY)
+    }
+}
+
+impl GpuProfiler {
+    fn next_scope_tree_handle(&self) -> GpuTimerQueryTreeHandle {
+        loop {
+            // Relaxed is sufficient: All we need is a number that nobody else uses this frame already.
+            let candidate = self.next_query_handle.fetch_add(1, Ordering::Relaxed);
+            // Running out of handles within a single frame is not expected, but in a long running app
+            // the counter may arrive at the value reserved for the root. Skip it and draw again.
+            if candidate != ROOT_QUERY_HANDLE {
+                return candidate;
+            }
+        }
+    }
+
+    fn reset_and_cache_unused_query_pools(&mut self, mut discarded_pools: Vec<Arc<QueryPool>>) {
+        // Pools with less than half of the capacity we'd pick for a new pool are not kept:
+        // This way fewer pools are needed in upcoming frames,
+        // which means less overhead in the long run.
+        let min_capacity_to_keep = self.size_for_new_query_pools / 2;
+        // With timer queries disabled, no pools are kept at all.
+        let keep_pools = self.settings.enable_timer_queries;
+
+        for shared_pool in discarded_pools.drain(..) {
+            // A pool that is truly unused has a ref count of exactly 1.
+            // Any other reference to it at this point is an implementation bug.
+            let mut pool = Arc::into_inner(shared_pool).expect("Pool still in use");
+            pool.reset();
+
+            if keep_pools && pool.capacity >= min_capacity_to_keep {
+                let recycled_pools = &mut self.active_frame.query_pools.get_mut().unused_pools;
+                recycled_pools.push(pool);
+            }
+        }
+    }
+
+    /// Tries to reserve two consecutive queries on `pool`, returns `None` if fewer than two unused queries are left.
+    fn try_reserve_query_pair(pool: &Arc<QueryPool>) -> Option<ReservedTimerQueryPair> {
+        let mut num_used_queries = pool.num_used_queries.load(Ordering::Relaxed);
+        loop {
+            if pool.capacity < num_used_queries + 2 {
+                // This pool is out of capacity, we failed the operation.
+                return None;
+            }
+
+            match pool.num_used_queries.compare_exchange_weak(
+                num_used_queries,
+                num_used_queries + 2,
+                // Write to num_used_queries with release semantics to be on the safe side.
+                // (It doesn't look like there are other side effects that we need to publish.)
+                Ordering::Release,
+                // No barrier for the failure case.
+                // The only thing we have to acquire is the pool's capacity which is constant and
+                // was definitely acquired by the RWLock prior to this call.
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => {
+                    // We successfully acquired two queries, starting at the count we observed before the exchange.
+                    return Some(ReservedTimerQueryPair {
+                        pool: pool.clone(),
+                        start_query_idx: num_used_queries,
+                        usage_state: QueryPairUsageState::Reserved,
+                    });
+                }
+                Err(updated) => {
+                    // Someone else acquired queries in the meantime (or the weak exchange failed spuriously), try again.
+                    num_used_queries = updated;
+                }
+            }
+        }
+    }
+
+    /// Reserves two query objects, adding a new pool to the active frame if the current top pool is exhausted.
+    /// Our query pools always have an even number of queries, so we know the next query is the next in the same pool.
+    fn reserve_query_pair(&self, device: &wgpu::Device) -> ReservedTimerQueryPair {
+        // First, try to allocate from current top pool.
+        // Requires taking a read lock on the current query pool.
+        {
+            let query_pools = self.active_frame.query_pools.read();
+            if let Some(pair) = query_pools
+                .used_pools
+                .last()
+                .and_then(Self::try_reserve_query_pair)
+            {
+                return pair;
+            }
+        }
+        // If this didn't work, we may need to add a new pool.
+        // Requires taking a write lock on the current query pool.
+        {
+            let mut query_pools = self.active_frame.query_pools.write();
+
+            // It could be that by now, another thread has already added a new pool!
+            // This is a bit unfortunate because it means we unnecessarily took a write lock, but it seems hard to get around this.
+            if let Some(pair) = query_pools
+                .used_pools
+                .last()
+                .and_then(Self::try_reserve_query_pair)
+            {
+                return pair;
+            }
+
+            // Now we know for certain that the last pool is exhausted, so add a new one!
+            let new_pool = if let Some(reused_pool) = query_pools.unused_pools.pop() {
+                // First check if there's an unused pool we can take.
+                Arc::new(reused_pool)
+            } else {
+                // If we can't, create a new pool that is at least as big as all pools used so far in this frame combined (up to the query set limit).
+                Arc::new(QueryPool::new(
+                    query_pools
+                        .used_pools
+                        .iter()
+                        .map(|pool| pool.capacity)
+                        .sum::<u32>()
+                        .max(self.size_for_new_query_pools)
+                        .min(QUERY_SET_MAX_QUERIES),
+                    device,
+                ))
+            };
+
+            let pair = Self::try_reserve_query_pair(&new_pool)
+                .expect("Freshly reserved pool doesn't have enough capacity");
+            query_pools.used_pools.push(new_pool);
+
+            pair
+        }
+    }
+
+    /// Opens a new root-level query, reserving a timer query pair if timer queries are enabled and supported by `encoder_or_pass`.
+    #[track_caller]
+    #[must_use]
+    fn begin_query_internal<Recorder: ProfilerCommandRecorder>(
+        &self,
+        label: String,
+        encoder_or_pass: &mut Recorder,
+        device: &wgpu::Device,
+    ) -> GpuProfilerQuery {
+        // Give opening/closing queries acquire/release semantics:
+        // This way, we won't get any nasty surprises when observing zero open queries.
+        self.num_open_queries.fetch_add(1, Ordering::Acquire);
+        let query = if self.settings.enable_timer_queries
+            && timestamp_write_supported(encoder_or_pass, device.features())
+        {
+            Some(self.reserve_query_pair(device))
+        } else {
+            None
+        };
+        // Thanks to `#[track_caller]`, the tracy span is attributed to the code location that opened the query.
+        let _tracy_scope = if self.settings.enable_timer_queries {
+            #[cfg(feature = "tracy")]
+            {
+                let location = std::panic::Location::caller();
+                self.tracy_context.as_ref().and_then(|c| {
+                    c.span_alloc(&label, "", location.file(), location.line())
+                        .ok()
+                })
+            }
+            #[cfg(not(feature = "tracy"))]
+            Option::<()>::None
+        } else {
+            None
+        };
+
+        GpuProfilerQuery {
+            label,
+            pid: std::process::id(),
+            tid: std::thread::current().id(),
+            timer_query_pair: query,
+            handle: self.next_scope_tree_handle(),
+            parent_handle: ROOT_QUERY_HANDLE,
+            has_debug_group: false,
+            #[cfg(feature = "tracy")]
+            tracy_scope: _tracy_scope,
+        }
+    }
+
+    /// Builds the result tree for all closed queries below `parent_handle`.
+    ///
+    /// Reads the raw start/end timestamps of each query from the mapped read buffer of its pool
+    /// and converts them to seconds using `timestamp_to_sec`.
+    fn process_timings_recursive(
+        timestamp_to_sec: f64,
+        closed_scope_by_parent_handle: &mut HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,
+        parent_handle: GpuTimerQueryTreeHandle,
+    ) -> Vec<GpuTimerQueryResult> {
+        // Take ownership of all direct children of `parent_handle`, a missing entry means there are none.
+        let children = closed_scope_by_parent_handle
+            .remove(&parent_handle)
+            .unwrap_or_default();
+
+        let mut results = Vec::new();
+        for mut scope in children {
+            // Queries without a timer query pair were inactive: They have no results of their own and
+            // their nested queries are not visited either. For now they are left out of the results
+            // completely, even though they still convey information like label & pid/tid.
+            let Some(query) = scope.timer_query_pair.take() else {
+                continue;
+            };
+
+            // By design, the timestamps for start & end are stored back to back in the read buffer.
+            let timestamp_size = wgpu::QUERY_SIZE as usize;
+            let begin_offset = (query.start_query_idx * wgpu::QUERY_SIZE) as u64;
+            let end_offset = begin_offset + (wgpu::QUERY_SIZE * 2) as u64;
+            let mapped_range = query
+                .pool
+                .read_buffer
+                .slice(begin_offset..end_offset)
+                .get_mapped_range();
+            let (start_bytes, end_bytes) = mapped_range.split_at(timestamp_size);
+            let start_raw = u64::from_le_bytes(start_bytes.try_into().unwrap());
+            let end_raw = u64::from_le_bytes(end_bytes.try_into().unwrap());
+
+            #[cfg(feature = "tracy")]
+            if let Some(tracy_scope) = scope.tracy_scope.take() {
+                tracy_scope.upload_timestamp(start_raw as i64, end_raw as i64);
+            }
+
+            // Gather the results of everything that has this query as its parent.
+            let nested_queries = Self::process_timings_recursive(
+                timestamp_to_sec,
+                closed_scope_by_parent_handle,
+                scope.handle,
+            );
+
+            let start_sec = start_raw as f64 * timestamp_to_sec;
+            let end_sec = end_raw as f64 * timestamp_to_sec;
+            results.push(GpuTimerQueryResult {
+                label: std::mem::take(&mut scope.label),
+                time: start_sec..end_sec,
+                nested_queries,
+                pid: scope.pid,
+                tid: scope.tid,
+            });
+        }
+
+        results
+    }
+}
+
+/// Tracks how far the two queries of a [`ReservedTimerQueryPair`] have been used.
+#[derive(PartialEq, Eq)]
+pub enum QueryPairUsageState {
+    /// Transitional state used upon creation.
+    Reserved,
+    /// Don't do manual timestamp writes, wgpu is expected to do them for us.
+    ReservedForPassTimestampWrites,
+
+    /// Start query has been used, end query is still available.
+    OnlyStartWritten,
+
+    /// Both start & end query have been used.
+    BothStartAndEndWritten,
+}
+
+/// Two consecutive queries on a [`QueryPool`], reserved for the start & end timestamp of a single scope.
+pub struct ReservedTimerQueryPair {
+    /// QueryPool on which both start & end queries of the scope are done.
+    /// By putting an arc here instead of an index into a vec, we don't
+    /// need to take any locks upon closing a profiling scope.
+    pub pool: Arc<QueryPool>,
+
+    /// Query index at which the scope begins.
+    /// The query after this is reserved for the end of the scope.
+    pub start_query_idx: u32,
+
+    /// Current use of the query pair.
+    pub usage_state: QueryPairUsageState,
+}
+
+/// A pool of queries, consisting of a single query set plus the buffers used to resolve & read back its results.
+#[derive(Debug)]
+pub struct QueryPool {
+    pub query_set: wgpu::QuerySet,
+    /// Buffer the queries are resolved into, its content is copied to the mappable `read_buffer` afterwards.
+    resolve_buffer: wgpu::Buffer,
+    read_buffer: wgpu::Buffer,
+    /// Number of queries in `query_set`, i.e. the upper limit for `num_used_queries`.
+    capacity: u32,
+    num_used_queries: AtomicU32,
+    num_resolved_queries: AtomicU32,
+}
+
+impl QueryPool {
+    const MIN_CAPACITY: u32 = 32;
+
+    /// Creates a timestamp query set with `capacity` queries and matching resolve & read buffers.
+    fn new(capacity: u32, device: &wgpu::Device) -> Self {
+        QueryPool {
+            query_set: device.create_query_set(&wgpu::QuerySetDescriptor {
+                label: Some("GpuProfiler - Query Set"),
+                ty: wgpu::QueryType::Timestamp,
+                count: capacity,
+            }),
+            resolve_buffer: device.create_buffer(&wgpu::BufferDescriptor {
+                label: Some("GpuProfiler - Query Resolve Buffer"),
+                size: (wgpu::QUERY_SIZE * capacity) as u64,
+                usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
+                mapped_at_creation: false,
+            }),
+            read_buffer: device.create_buffer(&wgpu::BufferDescriptor {
+                label: Some("GpuProfiler - Query Read Buffer"),
+                size: (wgpu::QUERY_SIZE * capacity) as u64,
+                usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+                mapped_at_creation: false,
+            }),
+
+            capacity,
+            num_used_queries: AtomicU32::new(0),
+            num_resolved_queries: AtomicU32::new(0),
+        }
+    }
+
+    /// Prepares the pool for reuse: Zeroes both query counters and unmaps the read buffer.
+    fn reset(&mut self) {
+        self.num_used_queries = AtomicU32::new(0);
+        self.num_resolved_queries = AtomicU32::new(0);
+        self.read_buffer.unmap();
+    }
+}
+
+/// The query pools of the active frame plus the pools that are available for reuse, see [`ActiveFrame::query_pools`].
+#[derive(Default)]
+struct PendingFramePools {
+    /// List of all pools used in this frame.
+    /// The last pool is the one new profiling queries will try to make timer queries into.
+    used_pools: Vec<Arc<QueryPool>>,
+    /// List of unused pools recycled from previous frames.
+    unused_pools: Vec<QueryPool>,
+}
+
+/// Internal handle used for building a tree of profiling queries.
+pub type GpuTimerQueryTreeHandle = u32;
+
+/// Handle for the root scope. Used as parent handle of top-level queries and never handed out as a query's own handle.
+pub const ROOT_QUERY_HANDLE: GpuTimerQueryTreeHandle = GpuTimerQueryTreeHandle::MAX;
+
+/// State of the profiler frame that is currently open, i.e. has not been passed to `end_frame` yet.
+struct ActiveFrame {
+    query_pools: RwLock<PendingFramePools>,
+    /// Closed queries get sent to this channel.
+    ///
+    /// Note that channel is still overkill for what we want here:
+    /// We're in a multi producer situation, *but* the single consumer is known to be only
+    /// active in a mut context, i.e. while we're consuming we know that we're not producing.
+    /// We have to wrap it in a Mutex because the channel is not Sync, but we actually never lock it
+    /// since we only ever access it in a `mut` context.
+    closed_query_sender: std::sync::mpsc::Sender<GpuProfilerQuery>,
+    closed_query_receiver: Mutex<std::sync::mpsc::Receiver<GpuProfilerQuery>>,
+}
+
+/// A frame that was closed via `end_frame` and is waiting for the read buffers of its pools to be mapped.
+struct PendingFrame {
+    query_pools: Vec<Arc<QueryPool>>,
+    closed_query_by_parent_handle: HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,
+    /// Keeps track of the number of read buffers in `query_pools` that have been mapped successfully.
+    mapped_buffers: std::sync::Arc<std::sync::atomic::AtomicU32>,
+}
diff --git a/src/profiler_command_recorder.rs b/src/profiler_command_recorder.rs
new file mode 100644
index 0000000..d6a8dff
--- /dev/null
+++ b/src/profiler_command_recorder.rs
@@ -0,0 +1,32 @@
+/// Trait for exposing the methods of [`wgpu::CommandEncoder`], [`wgpu::RenderPass`] and [`wgpu::ComputePass`] that are used by the profiler.
+pub trait ProfilerCommandRecorder {
+    /// Returns `true` if it's a pass or `false` if it's an encoder.
+    fn is_pass(&self) -> bool;
+    fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32);
+    fn push_debug_group(&mut self, label: &str);
+    fn pop_debug_group(&mut self);
+}
+
+/// Implements [`ProfilerCommandRecorder`] for the listed `wgpu` types by forwarding to their methods of the same name.
+macro_rules! ImplProfilerCommandRecorder {
+    ($($recorder:ident $(< $lifetime:lifetime >)? : $is_pass:literal,)*) => {
+        $(
+            impl $(< $lifetime >)? ProfilerCommandRecorder for wgpu::$recorder $(< $lifetime >)? {
+                fn is_pass(&self) -> bool { $is_pass }
+                fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32) {
+                    self.write_timestamp(query_set, query_index)
+                }
+
+                fn push_debug_group(&mut self, label: &str) {
+                    self.push_debug_group(label)
+                }
+
+                fn pop_debug_group(&mut self) {
+                    self.pop_debug_group()
+                }
+            }
+        )*
+    };
+}
+
+ImplProfilerCommandRecorder!(CommandEncoder:false, RenderPass<'a>:true, ComputePass<'a>:true,);
diff --git a/src/profiler_query.rs b/src/profiler_query.rs
new file mode 100644
index 0000000..485a162
--- /dev/null
+++ b/src/profiler_query.rs
@@ -0,0 +1,103 @@
+use std::{ops::Range, thread::ThreadId};
+
+use crate::profiler::{
+    GpuTimerQueryTreeHandle, QueryPairUsageState, ReservedTimerQueryPair, ROOT_QUERY_HANDLE,
+};
+
+/// The result of a gpu timer scope, including the results of all scopes nested within it.
+#[derive(Debug, Clone)]
+pub struct GpuTimerQueryResult {
+    /// Label that was specified when opening the scope.
+    pub label: String,
+
+    /// The process id of the process that opened this scope.
+    pub pid: u32,
+
+    /// The thread id of the thread that opened this scope.
+    pub tid: ThreadId,
+
+    /// Time range of this scope in seconds.
+    ///
+    /// The meaning of the absolute values is not defined.
+    pub time: Range<f64>,
+
+    /// Results of the scopes that are children of this scope in the query tree.
+    pub nested_queries: Vec<GpuTimerQueryResult>,
+}
+
+/// An inflight query for the profiler.
+///
+/// If timer queries are enabled, this represents a reserved timer query pair on
+/// one of the profiler's query sets.
+/// *Must* be closed by calling [`GpuProfiler::end_query`].
+///
+/// Emitted by [`GpuProfiler::begin_query`]/[`GpuProfiler::begin_pass_query`] and consumed by [`GpuProfiler::end_query`].
+pub struct GpuProfilerQuery {
+    /// The label assigned to this query.
+    /// Will be moved into [`GpuTimerQueryResult::label`] once the query is fully processed.
+    pub label: String,
+
+    /// The process id of the process that opened this query.
+    pub pid: u32,
+
+    /// The thread id of the thread that opened this query.
+    pub tid: ThreadId,
+
+    /// The actual query on a query pool if any (`None` if timer queries are disabled or unsupported for this type of query).
+    pub(crate) timer_query_pair: Option<ReservedTimerQueryPair>,
+
+    /// Handle which identifies this query, used for building the tree of queries.
+    pub(crate) handle: GpuTimerQueryTreeHandle,
+
+    /// Which query this query is a child of, [`ROOT_QUERY_HANDLE`] if it has no parent.
+    pub(crate) parent_handle: GpuTimerQueryTreeHandle,
+
+    /// Whether a debug group was opened for this scope.
+    pub(crate) has_debug_group: bool,
+
+    #[cfg(feature = "tracy")]
+    pub(crate) tracy_scope: Option<tracy_client::GpuSpan>,
+}
+
+impl GpuProfilerQuery {
+    /// Timestamp writes for a render pass, backed by the query pair reserved for this query.
+    ///
+    /// Pass this to a single render/compute pass only, otherwise results will be overwritten.
+    /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`].
+    pub fn render_pass_timestamp_writes(&self) -> Option<wgpu::RenderPassTimestampWrites> {
+        let query = self.timer_query_pair.as_ref()?;
+        if query.usage_state != QueryPairUsageState::ReservedForPassTimestampWrites {
+            return None;
+        }
+        Some(wgpu::RenderPassTimestampWrites {
+            query_set: &query.pool.query_set,
+            beginning_of_pass_write_index: Some(query.start_query_idx),
+            end_of_pass_write_index: Some(query.start_query_idx + 1),
+        })
+    }
+
+    /// Timestamp writes for a compute pass, backed by the query pair reserved for this query.
+    ///
+    /// Pass this to a single render/compute pass only, otherwise results will be overwritten.
+    /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`].
+    pub fn compute_pass_timestamp_writes(&self) -> Option<wgpu::ComputePassTimestampWrites> {
+        let query = self.timer_query_pair.as_ref()?;
+        if query.usage_state != QueryPairUsageState::ReservedForPassTimestampWrites {
+            return None;
+        }
+        Some(wgpu::ComputePassTimestampWrites {
+            query_set: &query.pool.query_set,
+            beginning_of_pass_write_index: Some(query.start_query_idx),
+            end_of_pass_write_index: Some(query.start_query_idx + 1),
+        })
+    }
+
+    /// Makes this scope a child of the passed scope, or of the root if `parent` is `None`.
+    #[inline]
+    pub fn with_parent(self, parent: Option<&GpuProfilerQuery>) -> Self {
+        // Only the parent handle changes, everything else is carried over as is.
+        let parent_handle = parent.map(|p| p.handle).unwrap_or(ROOT_QUERY_HANDLE);
+
+        Self { parent_handle, ..self }
+    }
+}
diff --git a/src/profiler_settings.rs b/src/profiler_settings.rs
new file mode 100644
index 0000000..b7c2b14
--- /dev/null
+++ b/src/profiler_settings.rs
@@ -0,0 +1,57 @@
+use crate::SettingsError;
+
+/// Settings passed on initialization of [`GpuProfiler`].
+#[derive(Debug, Clone)]
+pub struct GpuProfilerSettings {
+    /// Enables/disables gpu timer queries.
+    ///
+    /// If false, the profiler will not emit any timer queries, making most operations on [`GpuProfiler`] no-ops.
+    ///
+    /// Since all resource creation is done lazily, this provides an effective way of disabling the profiler at runtime
+    /// without the need of special build configurations or code to handle enabled/disabled profiling.
+    pub enable_timer_queries: bool,
+
+    /// Enables/disables debug markers for all scopes on the respective encoder or pass.
+    ///
+    /// This is useful for debugging with tools like RenderDoc.
+    /// Debug markers will be emitted even if the device does not support timer queries or disables them via
+    /// [`GpuProfilerSettings::enable_timer_queries`].
+    pub enable_debug_groups: bool,
+
+    /// The profiler queues up to `max_num_pending_frames` "profiler-frames" at a time.
+    ///
+    /// A profiler-frame is regarded as in-flight until its queries have been successfully
+    /// resolved using [`GpuProfiler::process_finished_frame`].
+    /// How long this takes to happen depends on how fast buffer mappings return successfully,
+    /// which in turn primarily depends on how fast the device is able to finish work queued to the [`wgpu::Queue`].
+    ///
+    /// If this threshold is exceeded, [`GpuProfiler::end_frame`] will silently drop frames.
+    /// *Newer* frames will be dropped first in order to get results back eventually.
+    /// (If the profiler were to drop the oldest frame, one may end up in a situation where there is never
+    /// a frame that is fully processed and thus never any results to be retrieved).
+    ///
+    /// Good values for `max_num_pending_frames` are 2-4 but may depend on your application workload
+    /// and GPU-CPU syncing strategy.
+    /// Must be greater than 0.
+    pub max_num_pending_frames: usize,
+}
+
+impl Default for GpuProfilerSettings {
+    fn default() -> Self {
+        GpuProfilerSettings {
+            max_num_pending_frames: 3, // Within the recommended range of 2-4.
+            enable_debug_groups: true,
+            enable_timer_queries: true,
+        }
+    }
+}
+
+impl GpuProfilerSettings {
+    /// Checks that the settings are usable, i.e. that `max_num_pending_frames` is greater than zero.
+    pub fn validate(&self) -> Result<(), SettingsError> {
+        match self.max_num_pending_frames {
+            0 => Err(SettingsError::InvalidMaxNumPendingFrames),
+            _ => Ok(()),
+        }
+    }
+}
diff --git a/src/tracy.rs b/src/tracy.rs
index 19fa579..6764563 100644
--- a/src/tracy.rs
+++ b/src/tracy.rs
@@ -1,6 +1,6 @@
 use crate::CreationError;
 
-pub(crate) fn create_tracy_gpu_client(
+pub fn create_tracy_gpu_client(
     backend: wgpu::Backend,
     device: &wgpu::Device,
     queue: &wgpu::Queue,
@@ -13,14 +13,14 @@ pub(crate) fn create_tracy_gpu_client(
 
     let resolve_buffer = device.create_buffer(&wgpu::BufferDescriptor {
         label: Some("wgpu-profiler gpu -> cpu resolve buffer"),
-        size: crate::QUERY_SIZE as _,
+        size: wgpu::QUERY_SIZE as _,
         usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
         mapped_at_creation: false,
     });
 
     let map_buffer = device.create_buffer(&wgpu::BufferDescriptor {
         label: Some("wgpu-profiler gpu -> cpu map buffer"),
-        size: crate::QUERY_SIZE as _,
+        size: wgpu::QUERY_SIZE as _,
         usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
         mapped_at_creation: false,
     });
@@ -30,7 +30,7 @@ pub(crate) fn create_tracy_gpu_client(
     });
     encoder.write_timestamp(&query_set, 0);
     encoder.resolve_query_set(&query_set, 0..1, &resolve_buffer, 0);
-    encoder.copy_buffer_to_buffer(&resolve_buffer, 0, &map_buffer, 0, crate::QUERY_SIZE as _);
+    encoder.copy_buffer_to_buffer(&resolve_buffer, 0, &map_buffer, 0, wgpu::QUERY_SIZE as _);
     queue.submit(Some(encoder.finish()));
 
     map_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ());