diff --git a/src/lib.rs b/src/lib.rs index 0a0caa8..a93169c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -100,1070 +100,17 @@ On [`GpuProfiler::end_frame`], we memorize the total size of all `QueryPool`s in pub mod chrometrace; mod errors; +mod profiler; +mod profiler_command_recorder; +mod profiler_query; +mod profiler_settings; mod scope; #[cfg(feature = "tracy")] mod tracy; pub use errors::{CreationError, EndFrameError, SettingsError}; +pub use profiler::GpuProfiler; +pub use profiler_command_recorder::ProfilerCommandRecorder; +pub use profiler_query::{GpuProfilerQuery, GpuTimerQueryResult}; +pub use profiler_settings::GpuProfilerSettings; pub use scope::{ManualOwningScope, OwningScope, Scope}; - -// --------------- - -use std::{ - collections::HashMap, - ops::Range, - sync::{ - atomic::{AtomicU32, Ordering}, - Arc, - }, - thread::ThreadId, -}; - -use parking_lot::{Mutex, RwLock}; - -/// The result of a gpu timer scope. -#[derive(Debug, Clone)] -pub struct GpuTimerQueryResult { - /// Label that was specified when opening the scope. - pub label: String, - - /// The process id of the process that opened this scope. - pub pid: u32, - - /// The thread id of the thread that opened this scope. - pub tid: ThreadId, - - /// Time range of this scope in seconds. - /// - /// Meaning of absolute value is not defined. - pub time: Range, - - /// Scopes that were opened while this scope was open. - pub nested_queries: Vec, -} - -/// An inflight query for the profiler. -/// -/// If timer queries are enabled, this represents a reserved timer query pair on -/// one of the profiler's query sets. -/// *Must* be closed by calling [`GpuProfiler::end_query`]. -/// -/// Emitted by [`GpuProfiler::begin_query`]/[`GpuProfiler::begin_pass_query`] and consumed by [`GpuProfiler::end_query`]. -pub struct GpuProfilerQuery { - /// The label assigned to this query. - /// Will be moved into [`GpuProfilerQuery::label`] once the query is fully processed. - pub label: String, - - /// The process id of the process that opened this query. - pub pid: u32, - - /// The thread id of the thread that opened this query. - pub tid: ThreadId, - - /// The actual query on a query pool if any (none if disabled for this type of query). - timer_query_pair: Option, - - /// Handle which identifies this query, used for building the tree of queries. - handle: GpuTimerQueryTreeHandle, - - /// Which query this query is a child of. - parent_handle: GpuTimerQueryTreeHandle, - - /// Whether a debug group was opened for this scope. - has_debug_group: bool, - - #[cfg(feature = "tracy")] - tracy_scope: Option, -} - -impl GpuProfilerQuery { - /// Use the reserved query for render pass timestamp writes if any. - /// - /// Use this only for a single render/compute pass, otherwise results will be overwritten. - /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`]. - pub fn render_pass_timestamp_writes(&self) -> Option { - self.timer_query_pair.as_ref().and_then(|query| { - (query.usage_state == QueryPairUsageState::ReservedForPassTimestampWrites).then(|| { - wgpu::RenderPassTimestampWrites { - query_set: &query.pool.query_set, - beginning_of_pass_write_index: Some(query.start_query_idx), - end_of_pass_write_index: Some(query.start_query_idx + 1), - } - }) - }) - } - - /// Use the reserved query for compute pass timestamp writes if any. - /// - /// Use this only for a single render/compute pass, otherwise results will be overwritten. - /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`]. - pub fn compute_pass_timestamp_writes(&self) -> Option { - self.timer_query_pair.as_ref().and_then(|query| { - (query.usage_state == QueryPairUsageState::ReservedForPassTimestampWrites).then(|| { - wgpu::ComputePassTimestampWrites { - query_set: &query.pool.query_set, - beginning_of_pass_write_index: Some(query.start_query_idx), - end_of_pass_write_index: Some(query.start_query_idx + 1), - } - }) - }) - } - - /// Makes this scope a child of the passed scope. - #[inline] - pub fn with_parent(self, parent: Option<&GpuProfilerQuery>) -> Self { - Self { - parent_handle: parent.map_or(ROOT_QUERY_HANDLE, |p| p.handle), - ..self - } - } -} - -/// Settings passed on initialization of [`GpuProfiler`]. -#[derive(Debug, Clone)] -pub struct GpuProfilerSettings { - /// Enables/disables gpu timer queries. - /// - /// If false, the profiler will not emit any timer queries, making most operations on [`GpuProfiler`] no-ops. - /// - /// Since all resource creation is done lazily, this provides an effective way of disabling the profiler at runtime - /// without the need of special build configurations or code to handle enabled/disabled profiling. - pub enable_timer_queries: bool, - - /// Enables/disables debug markers for all scopes on the respective encoder or pass. - /// - /// This is useful for debugging with tools like RenderDoc. - /// Debug markers will be emitted even if the device does not support timer queries or disables them via - /// [`GpuProfilerSettings::enable_timer_queries`]. - pub enable_debug_groups: bool, - - /// The profiler queues up to `max_num_pending_frames` "profiler-frames" at a time. - /// - /// A profiler-frame is regarded as in-flight until its queries have been successfully - /// resolved using [`GpuProfiler::process_finished_frame`]. - /// How long this takes to happen, depends on how fast buffer mappings return successfully - /// which in turn primarily depends on how fast the device is able to finish work queued to the [`wgpu::Queue`]. - /// - /// If this threshold is exceeded, [`GpuProfiler::end_frame`] will silently drop frames. - /// *Newer* frames will be dropped first in order to get results back eventually. - /// (If the profiler were to drop the oldest frame, one may end up in a situation where there is never - /// frame that is fully processed and thus never any results to be retrieved). - /// - /// Good values for `max_num_pending_frames` are 2-4 but may depend on your application workload - /// and GPU-CPU syncing strategy. - /// Must be greater than 0. - pub max_num_pending_frames: usize, -} - -impl Default for GpuProfilerSettings { - fn default() -> Self { - Self { - enable_timer_queries: true, - enable_debug_groups: true, - max_num_pending_frames: 3, - } - } -} - -impl GpuProfilerSettings { - pub fn validate(&self) -> Result<(), SettingsError> { - if self.max_num_pending_frames == 0 { - Err(SettingsError::InvalidMaxNumPendingFrames) - } else { - Ok(()) - } - } -} - -/// Profiler instance. -/// -/// You can have an arbitrary number of independent profiler instances per application/adapter. -/// Manages all the necessary [`wgpu::QuerySet`] and [`wgpu::Buffer`] behind the scenes. -/// -/// Any query creation method may allocate a new [`wgpu::QuerySet`] and [`wgpu::Buffer`] internally if necessary. -/// -/// After the first call that passes [`wgpu::Device`], the same device must be used with all subsequent -/// calls to [`GpuProfiler`] and all passed references to wgpu objects must originate from that device. -pub struct GpuProfiler { - unused_pools: Vec, - - active_frame: ActiveFrame, - pending_frames: Vec, - - num_open_queries: AtomicU32, - next_query_handle: AtomicU32, - - size_for_new_query_pools: u32, - - settings: GpuProfilerSettings, - - #[cfg(feature = "tracy")] - tracy_context: Option, -} - -// Public interface -impl GpuProfiler { - /// Combination of all timer query features [`GpuProfiler`] can leverage. - pub const ALL_WGPU_TIMER_FEATURES: wgpu::Features = - wgpu::Features::TIMESTAMP_QUERY.union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES); - - /// Combination of all timer query features [`GpuProfiler`] can leverage. - #[deprecated(since = "0.9.0", note = "Use ALL_WGPU_TIMER_FEATURES instead")] - pub const REQUIRED_WGPU_FEATURES: wgpu::Features = GpuProfiler::ALL_WGPU_TIMER_FEATURES; - - /// Creates a new Profiler object. - /// - /// There is nothing preventing the use of several independent profiler objects. - pub fn new(settings: GpuProfilerSettings) -> Result { - settings.validate()?; - - let (closed_scope_sender, closed_scope_receiver) = std::sync::mpsc::channel(); - - Ok(GpuProfiler { - unused_pools: Vec::new(), - - pending_frames: Vec::with_capacity(settings.max_num_pending_frames), - active_frame: ActiveFrame { - query_pools: RwLock::new(PendingFramePools::default()), - closed_query_sender: closed_scope_sender, - closed_query_receiver: Mutex::new(closed_scope_receiver), - }, - - num_open_queries: AtomicU32::new(0), - next_query_handle: AtomicU32::new(0), - - size_for_new_query_pools: QueryPool::MIN_CAPACITY, - - settings, - - #[cfg(feature = "tracy")] - tracy_context: None, - }) - } - - /// Creates a new profiler and connects to a running Tracy client. - #[cfg(feature = "tracy")] - pub fn new_with_tracy_client( - settings: GpuProfilerSettings, - backend: wgpu::Backend, - device: &wgpu::Device, - queue: &wgpu::Queue, - ) -> Result { - let mut profiler = Self::new(settings)?; - profiler.tracy_context = Some(tracy::create_tracy_gpu_client(backend, device, queue)?); - Ok(profiler) - } - - /// Changes the settings of an existing profiler. - /// - /// If timer scopes are disabled by setting [GpuProfilerSettings::enable_timer_queries] to false, - /// any timer queries that are in flight will still be processed, - /// but unused query sets and buffers will be deallocated during [`Self::process_finished_frame`]. - /// Similarly, any opened debugging scope will still be closed if debug groups are disabled by setting - /// [GpuProfilerSettings::enable_debug_groups] to false. - pub fn change_settings(&mut self, settings: GpuProfilerSettings) -> Result<(), SettingsError> { - settings.validate()?; - if !settings.enable_timer_queries { - self.unused_pools.clear(); - } - self.settings = settings; - - Ok(()) - } - - /// Starts a new auto-closing profiler scope. - /// - /// To nest scopes inside this scope, call [`Scope::scope`] on the returned scope. - /// - /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will - /// not show up in the final results. - /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened. - /// - /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. - /// - /// Scope is automatically closed on drop. - #[must_use] - #[track_caller] - #[inline] - pub fn scope<'a, Recorder: ProfilerCommandRecorder>( - &'a self, - label: impl Into, - encoder_or_pass: &'a mut Recorder, - device: &wgpu::Device, - ) -> Scope<'a, Recorder> { - let scope = self.begin_query(label, encoder_or_pass, device); - Scope { - profiler: self, - recorder: encoder_or_pass, - scope: Some(scope), - } - } - - /// Starts a new auto-closing profiler scope that takes ownership of the passed encoder or rendering/compute pass. - /// - /// To nest scopes inside this scope, call [`OwningScope::scope`] on the returned scope. - /// - /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will - /// not show up in the final results. - /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened. - /// - /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. - /// - /// Scope is automatically closed on drop. - #[must_use] - #[track_caller] - #[inline] - pub fn owning_scope<'a, Recorder: ProfilerCommandRecorder>( - &'a self, - label: impl Into, - mut encoder_or_pass: Recorder, - device: &wgpu::Device, - ) -> OwningScope<'a, Recorder> { - let scope = self.begin_query(label, &mut encoder_or_pass, device); - OwningScope { - profiler: self, - recorder: encoder_or_pass, - scope: Some(scope), - } - } - - /// Starts a new **manually closed** profiler scope that takes ownership of the passed encoder or rendering/compute pass. - /// - /// Does NOT call [`GpuProfiler::end_query()`] on drop. - /// This construct is just for completeness in cases where working with scopes is preferred but one can't rely on the Drop call in the right place. - /// This is useful when the owned value needs to be recovered after the end of the scope. - /// In particular, to submit a [`wgpu::CommandEncoder`] to a queue, ownership of the encoder is necessary. - /// - /// To nest scopes inside this scope, call [`ManualOwningScope::scope`] on the returned scope. - /// - /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will - /// not show up in the final results. - /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened. - /// - /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. - #[must_use] - #[track_caller] - #[inline] - pub fn manual_owning_scope<'a, Recorder: ProfilerCommandRecorder>( - &'a self, - label: impl Into, - mut encoder_or_pass: Recorder, - device: &wgpu::Device, - ) -> ManualOwningScope<'a, Recorder> { - let scope = self.begin_query(label, &mut encoder_or_pass, device); - ManualOwningScope { - profiler: self, - recorder: encoder_or_pass, - scope: Some(scope), - } - } - - /// Starts a new profiler query on the given encoder or rendering/compute pass (if enabled). - /// - /// The returned query *must* be closed by calling [`GpuProfiler::end_query`] with the same encoder/pass, - /// even if timer queries are disabled. - /// To do this automatically, use [`GpuProfiler::scope`]/[`GpuProfiler::owning_scope`] instead. - /// - /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will - /// not show up in the final results. - /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] - /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no timer queries will be allocated. - /// - /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. - #[track_caller] - #[must_use] - pub fn begin_query( - &self, - label: impl Into, - encoder_or_pass: &mut Recorder, - device: &wgpu::Device, - ) -> GpuProfilerQuery { - let mut query = self.begin_query_internal(label.into(), encoder_or_pass, device); - if let Some(timer_query) = &mut query.timer_query_pair { - encoder_or_pass - .write_timestamp(&timer_query.pool.query_set, timer_query.start_query_idx); - timer_query.usage_state = QueryPairUsageState::OnlyStartWritten; - }; - - if self.settings.enable_debug_groups { - encoder_or_pass.push_debug_group(&query.label); - query.has_debug_group = true; - } - query - } - - /// Starts a new profiler query to be used for render/compute pass timestamp writes. - /// - /// The returned query *must* be closed by calling [`GpuProfiler::end_query`], even if timer queries are disabled. - /// To do this automatically, use [`Scope::scoped_render_pass`]/[`Scope::scoped_compute_pass`] instead. - /// - /// Call [`GpuProfilerQuery::render_pass_timestamp_writes`] or [`GpuProfilerQuery::compute_pass_timestamp_writes`] - /// to acquire the corresponding `wgpu::RenderPassTimestampWrites`/`wgpu::ComputePassTimestampWrites` object. - /// - /// If the [`wgpu::Device`] does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be reserved. - /// - /// Unlike [`GpuProfiler::begin_query`] this will not create a debug scope, - /// in order to not force passing of the same encoder/pass to [`GpuProfiler::end_query`]. - /// (this is needed to relax resource tracking requirements a bit, making it easier to implement the automatic scopes) - pub fn begin_pass_query( - &self, - label: impl Into, - encoder: &mut wgpu::CommandEncoder, - device: &wgpu::Device, - ) -> GpuProfilerQuery { - let mut query = self.begin_query_internal(label.into(), encoder, device); - if let Some(timer_query) = &mut query.timer_query_pair { - timer_query.usage_state = QueryPairUsageState::ReservedForPassTimestampWrites; - } - query - } - - /// Ends passed query. - /// - /// If the passed query was opened with [`GpuProfiler::begin_query`], the passed encoder or pass must be the same - /// as when the query was opened. - pub fn end_query( - &self, - encoder_or_pass: &mut Recorder, - mut query: GpuProfilerQuery, - ) { - if let Some(timer_query) = &mut query.timer_query_pair { - match timer_query.usage_state { - QueryPairUsageState::Reserved => { - unreachable!("Query pair has been reserved but isn't used for anything!") - } - QueryPairUsageState::ReservedForPassTimestampWrites => { - // No need to do a timestamp write, this is handled by wgpu. - } - QueryPairUsageState::OnlyStartWritten => { - encoder_or_pass.write_timestamp( - &timer_query.pool.query_set, - timer_query.start_query_idx + 1, - ); - timer_query.usage_state = QueryPairUsageState::BothStartAndEndWritten; - } - QueryPairUsageState::BothStartAndEndWritten => { - unreachable!("Query pair has already been used!") - } - } - } - - #[cfg(feature = "tracy")] - if let Some(ref mut tracy_scope) = query.tracy_scope { - tracy_scope.end_zone(); - } - - if query.has_debug_group { - encoder_or_pass.pop_debug_group(); - } - - let send_result = self.active_frame.closed_query_sender.send(query); - - // The only way we can fail sending the query is if the receiver has been dropped. - // Since it sits on `active_frame` as well, there's no way for this to happen! - debug_assert!(send_result.is_ok()); - - // Count queries even if we haven't processed this one, makes experiences more consistent - // if there's a lack of support for some queries. - self.num_open_queries.fetch_sub(1, Ordering::Release); - } - - /// Puts query resolve commands in the encoder for all unresolved, pending queries of the active profiler frame. - /// - /// Note that you do *not* need to do this for every encoder, it is sufficient do do this once per frame as long - /// as you submit the corresponding command buffer after all others that may have opened queries in the same frame. - /// (It does not matter if the passed encoder itself has previously opened queries or not.) - /// If you were to make this part of a command buffer that is enqueued before any other that has - /// opened queries in the same profiling frame, no failure will occur but some timing results may be invalid. - /// - /// It is advised to call this only once at the end of a profiling frame, but it is safe to do so several times. - /// - /// - /// Implementation note: - /// This method could be made `&self`, taking the internal lock on the query pools. - /// However, the intended use is to call this once at the end of a frame, so we instead - /// encourage this explicit sync point and avoid the lock. - pub fn resolve_queries(&mut self, encoder: &mut wgpu::CommandEncoder) { - let query_pools = self.active_frame.query_pools.get_mut(); - - for query_pool in query_pools.used_pools.iter_mut() { - // We sync with the last update of num_used_query (which has Release semantics) - // mostly to be on the safe side - it happened inside a lock which gives it release semantics anyways - // but the concern is that if we don't acquire here, we may miss on other side prior effects of the query begin. - let num_used_queries = query_pool.num_used_queries.load(Ordering::Acquire); - let num_resolved_queries = query_pool.num_resolved_queries.load(Ordering::Acquire); - - if num_resolved_queries == num_used_queries { - continue; - } - - assert!(num_resolved_queries < num_used_queries); - - encoder.resolve_query_set( - &query_pool.query_set, - num_resolved_queries..num_used_queries, - &query_pool.resolve_buffer, - (num_resolved_queries * QUERY_SIZE) as u64, - ); - query_pool - .num_resolved_queries - .store(num_used_queries, Ordering::Release); - - encoder.copy_buffer_to_buffer( - &query_pool.resolve_buffer, - 0, - &query_pool.read_buffer, - 0, - (num_used_queries * QUERY_SIZE) as u64, - ); - } - } - - /// Marks the end of a frame. - /// - /// Needs to be called **after** submitting any encoder used in the current profiler frame. - /// - /// Fails if there are still open queries or unresolved queries. - pub fn end_frame(&mut self) -> Result<(), EndFrameError> { - let num_open_queries = self.num_open_queries.load(Ordering::Acquire); - if num_open_queries != 0 { - return Err(EndFrameError::UnclosedQueries(num_open_queries)); - } - - let query_pools = self.active_frame.query_pools.get_mut(); - - let mut new_pending_frame = PendingFrame { - query_pools: std::mem::take(&mut query_pools.used_pools), - closed_query_by_parent_handle: HashMap::new(), - mapped_buffers: Arc::new(AtomicU32::new(0)), - }; - - for query in self.active_frame.closed_query_receiver.get_mut().try_iter() { - new_pending_frame - .closed_query_by_parent_handle - .entry(query.parent_handle) - .or_default() - .push(query); - } - - // All loads of pool.num_used_queries are Relaxed since we assume, - // that we already acquired the state during `resolve_queries` and no further otherwise unobserved - // modifications happened since then. - - let num_unresolved_queries = new_pending_frame - .query_pools - .iter() - .map(|pool| { - pool.num_used_queries.load(Ordering::Relaxed) - - pool.num_resolved_queries.load(Ordering::Relaxed) - }) - .sum(); - if num_unresolved_queries != 0 { - return Err(EndFrameError::UnresolvedQueries(num_unresolved_queries)); - } - - // Next time we create a new query pool, we want it to be at least as big to hold all queries of this frame. - self.size_for_new_query_pools = self - .size_for_new_query_pools - .max( - new_pending_frame - .query_pools - .iter() - .map(|pool| pool.num_used_queries.load(Ordering::Relaxed)) - .sum(), - ) - .min(QUERY_SET_MAX_QUERIES); - - // Make sure we don't overflow. - if self.pending_frames.len() == self.settings.max_num_pending_frames { - // Drop previous (!) frame. - // Dropping the oldest frame could get us into an endless cycle where we're never able to complete - // any pending frames as the ones closest to completion would be evicted. - if let Some(dropped_frame) = self.pending_frames.pop() { - // Drop queries first since they still have references to the query pools that we want to reuse. - drop(dropped_frame.closed_query_by_parent_handle); - - // Mark the frame as dropped. We'll give back the query pools once the mapping is done. - // Any previously issued map_async call that haven't finished yet, will invoke their callback with mapping abort. - self.reset_and_cache_unused_query_pools(dropped_frame.query_pools); - } - } - - // Map all buffers. - for pool in new_pending_frame.query_pools.iter_mut() { - let mapped_buffers = new_pending_frame.mapped_buffers.clone(); - pool.read_buffer - .slice(0..(pool.num_used_queries.load(Ordering::Relaxed) * QUERY_SIZE) as u64) - .map_async(wgpu::MapMode::Read, move |mapping_result| { - // Mapping should not fail unless it was cancelled due to the frame being dropped. - match mapping_result { - Err(_) => { - // We only want to ignore the error iff the mapping has been aborted by us (due to a dropped frame, see above). - // In any other case, we need should panic as this would imply something went seriously sideways. - // - // As of writing, this is not yet possible in wgpu, see https://github.com/gfx-rs/wgpu/pull/2939 - } - Ok(()) => { - mapped_buffers.fetch_add(1, std::sync::atomic::Ordering::Release); - } - } - }); - } - - // Enqueue - self.pending_frames.push(new_pending_frame); - assert!(self.pending_frames.len() <= self.settings.max_num_pending_frames); - - Ok(()) - } - - /// Checks if all timer queries for the oldest pending finished frame are done and returns that snapshot if any. - /// - /// timestamp_period: - /// The timestamp period of the device. Pass the result of [`wgpu::Queue::get_timestamp_period()`]. - /// Note that some implementations (Chrome as of writing) may converge to a timestamp period while the application is running, - /// so caching this value is usually not recommended. - pub fn process_finished_frame( - &mut self, - timestamp_period: f32, - ) -> Option> { - let frame = self.pending_frames.first_mut()?; - - // We only process if all mappings succeed. - if frame - .mapped_buffers - .load(std::sync::atomic::Ordering::Acquire) - != frame.query_pools.len() as u32 - { - return None; - } - - let mut frame = self.pending_frames.remove(0); - - let results = { - let timestamp_to_sec = timestamp_period as f64 / 1000.0 / 1000.0 / 1000.0; - - Self::process_timings_recursive( - timestamp_to_sec, - &mut frame.closed_query_by_parent_handle, - ROOT_QUERY_HANDLE, - ) - }; - - self.reset_and_cache_unused_query_pools(frame.query_pools); - - Some(results) - } -} - -// -------------------------------------------------------------------------------- -// Internals -// -------------------------------------------------------------------------------- - -const QUERY_SIZE: u32 = wgpu::QUERY_SIZE; -const QUERY_SET_MAX_QUERIES: u32 = wgpu::QUERY_SET_MAX_QUERIES; - -/// Returns true if a timestamp should be written to the encoder or pass. -fn timestamp_write_supported( - encoder_or_pass: &mut Recorder, - features: wgpu::Features, -) -> bool { - let required_feature = if encoder_or_pass.is_pass() { - wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES - } else { - wgpu::Features::TIMESTAMP_QUERY - }; - features.contains(required_feature) -} - -impl GpuProfiler { - fn next_scope_tree_handle(&self) -> GpuTimerQueryTreeHandle { - // Relaxed is fine, we just want a number that nobody uses this frame already. - let mut handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed); - - // We don't ever expect to run out of handles during a single frame, but who knows how long the app runs. - while handle == ROOT_QUERY_HANDLE { - handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed); - } - - handle - } - - fn reset_and_cache_unused_query_pools(&mut self, mut discarded_pools: Vec>) { - let capacity_threshold = self.size_for_new_query_pools / 2; - for pool in discarded_pools.drain(..) { - // If the pool is truly unused now, it's ref count should be 1! - // If we use it anywhere else we have an implementation bug. - let mut pool = Arc::into_inner(pool).expect("Pool still in use"); - pool.reset(); - - // If a pool was less than half of the size of the max frame, then we don't keep it. - // This way we're going to need less pools in upcoming frames and thus have less overhead in the long run. - // If timer queries were disabled, we also don't keep any pools. - if self.settings.enable_timer_queries && pool.capacity >= capacity_threshold { - self.active_frame - .query_pools - .get_mut() - .unused_pools - .push(pool); - } - } - } - - fn try_reserve_query_pair(pool: &Arc) -> Option { - let mut num_used_queries = pool.num_used_queries.load(Ordering::Relaxed); - - loop { - if pool.capacity < num_used_queries + 2 { - // This pool is out of capacity, we failed the operation. - return None; - } - - match pool.num_used_queries.compare_exchange_weak( - num_used_queries, - num_used_queries + 2, - // Write to num_used_queries with release semantics to be on the safe side. - // (It doesn't look like there's other side effects that we need to publish.) - Ordering::Release, - // No barrier for the failure case. - // The only thing we have to acquire is the pool's capacity which is constant and - // was definitely acquired by the RWLock prior to this call. - Ordering::Relaxed, - ) { - Ok(_) => { - // We successfully acquired two queries! - return Some(ReservedTimerQueryPair { - pool: pool.clone(), - start_query_idx: num_used_queries, - usage_state: QueryPairUsageState::Reserved, - }); - } - Err(updated) => { - // Someone else acquired queries in the meantime, try again. - num_used_queries = updated; - } - } - } - } - - // Reserves two query objects. - // Our query pools always have an even number of queries, so we know the next query is the next in the same pool. - fn reserve_query_pair(&self, device: &wgpu::Device) -> ReservedTimerQueryPair { - // First, try to allocate from current top pool. - // Requires taking a read lock on the current query pool. - { - let query_pools = self.active_frame.query_pools.read(); - if let Some(pair) = query_pools - .used_pools - .last() - .and_then(Self::try_reserve_query_pair) - { - return pair; - } - } - // If this didn't work, we may need to add a new pool. - // Requires taking a write lock on the current query pool. - { - let mut query_pools = self.active_frame.query_pools.write(); - - // It could be that by now, another thread has already added a new pool! - // This is a bit unfortunate because it means we unnecessarily took a write lock, but it seems hard to get around this. - if let Some(pair) = query_pools - .used_pools - .last() - .and_then(Self::try_reserve_query_pair) - { - return pair; - } - - // Now we know for certain that the last pool is exhausted, so add a new one! - let new_pool = if let Some(reused_pool) = query_pools.unused_pools.pop() { - // First check if there's an unused pool we can take. - Arc::new(reused_pool) - } else { - // If we can't, create a new pool that is as big as all previous pools combined. - Arc::new(QueryPool::new( - query_pools - .used_pools - .iter() - .map(|pool| pool.capacity) - .sum::() - .max(self.size_for_new_query_pools) - .min(QUERY_SET_MAX_QUERIES), - device, - )) - }; - - let pair = Self::try_reserve_query_pair(&new_pool) - .expect("Freshly reserved pool doesn't have enough capacity"); - query_pools.used_pools.push(new_pool); - - pair - } - } - - #[track_caller] - #[must_use] - fn begin_query_internal( - &self, - label: String, - encoder_or_pass: &mut Recorder, - device: &wgpu::Device, - ) -> GpuProfilerQuery { - // Give opening/closing queries acquire/release semantics: - // This way, we won't get any nasty surprises when observing zero open queries. - self.num_open_queries.fetch_add(1, Ordering::Acquire); - - let query = if self.settings.enable_timer_queries - && timestamp_write_supported(encoder_or_pass, device.features()) - { - Some(self.reserve_query_pair(device)) - } else { - None - }; - - let _tracy_scope = if self.settings.enable_timer_queries { - #[cfg(feature = "tracy")] - { - let location = std::panic::Location::caller(); - self.tracy_context.as_ref().and_then(|c| { - c.span_alloc(&label, "", location.file(), location.line()) - .ok() - }) - } - #[cfg(not(feature = "tracy"))] - Option::<()>::None - } else { - None - }; - - GpuProfilerQuery { - label, - pid: std::process::id(), - tid: std::thread::current().id(), - timer_query_pair: query, - handle: self.next_scope_tree_handle(), - parent_handle: ROOT_QUERY_HANDLE, - has_debug_group: false, - #[cfg(feature = "tracy")] - tracy_scope: _tracy_scope, - } - } - - fn process_timings_recursive( - timestamp_to_sec: f64, - closed_scope_by_parent_handle: &mut HashMap>, - parent_handle: GpuTimerQueryTreeHandle, - ) -> Vec { - let Some(queries_with_same_parent) = closed_scope_by_parent_handle.remove(&parent_handle) - else { - return Vec::new(); - }; - - queries_with_same_parent - .into_iter() - .filter_map(|mut scope| { - let Some(query) = scope.timer_query_pair.take() else { - // Inactive queries don't have any results or nested queries with results. - // Currently, we drop them from the results completely. - // In the future we could still make them show up since they convey information like label & pid/tid. - return None; - }; - - // Read timestamp from buffer. - // By design timestamps for start/end are consecutive. - let offset = (query.start_query_idx * QUERY_SIZE) as u64; - let buffer_slice = &query - .pool - .read_buffer - .slice(offset..(offset + (QUERY_SIZE * 2) as u64)) - .get_mapped_range(); - let start_raw = - u64::from_le_bytes(buffer_slice[0..QUERY_SIZE as usize].try_into().unwrap()); - let end_raw = u64::from_le_bytes( - buffer_slice[QUERY_SIZE as usize..(QUERY_SIZE as usize) * 2] - .try_into() - .unwrap(), - ); - - #[cfg(feature = "tracy")] - if let Some(tracy_scope) = scope.tracy_scope.take() { - tracy_scope.upload_timestamp(start_raw as i64, end_raw as i64); - } - - let nested_queries = Self::process_timings_recursive( - timestamp_to_sec, - closed_scope_by_parent_handle, - scope.handle, - ); - - Some(GpuTimerQueryResult { - label: std::mem::take(&mut scope.label), - time: (start_raw as f64 * timestamp_to_sec) - ..(end_raw as f64 * timestamp_to_sec), - nested_queries, - pid: scope.pid, - tid: scope.tid, - }) - }) - .collect::>() - } -} - -#[derive(PartialEq, Eq)] -enum QueryPairUsageState { - /// Transitional state used upon creation. - Reserved, - - /// Don't do manual timestamp writes, wgpu is expected to do them for us. - ReservedForPassTimestampWrites, - - /// Start query has been used, end query is still available. - OnlyStartWritten, - - /// Both start & end query have been used. - BothStartAndEndWritten, -} - -struct ReservedTimerQueryPair { - /// QueryPool on which both start & end queries of the scope are done. - /// - /// By putting an arc here instead of an index into a vec, we don't need - /// need to take any locks upon closing a profiling scope. - pool: Arc, - - /// Query index at which the scope begins. - /// The query after this is reserved for the end of the scope. - start_query_idx: u32, - - /// Current use of the query pair. - usage_state: QueryPairUsageState, -} - -/// A pool of queries, consisting of a single queryset & buffer for query results. -#[derive(Debug)] -struct QueryPool { - query_set: wgpu::QuerySet, - - resolve_buffer: wgpu::Buffer, - read_buffer: wgpu::Buffer, - - capacity: u32, - num_used_queries: AtomicU32, - num_resolved_queries: AtomicU32, -} - -impl QueryPool { - const MIN_CAPACITY: u32 = 32; - - fn new(capacity: u32, device: &wgpu::Device) -> Self { - QueryPool { - query_set: device.create_query_set(&wgpu::QuerySetDescriptor { - label: Some("GpuProfiler - Query Set"), - ty: wgpu::QueryType::Timestamp, - count: capacity, - }), - - resolve_buffer: device.create_buffer(&wgpu::BufferDescriptor { - label: Some("GpuProfiler - Query Resolve Buffer"), - size: (QUERY_SIZE * capacity) as u64, - usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC, - mapped_at_creation: false, - }), - - read_buffer: device.create_buffer(&wgpu::BufferDescriptor { - label: Some("GpuProfiler - Query Read Buffer"), - size: (QUERY_SIZE * capacity) as u64, - usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, - mapped_at_creation: false, - }), - - capacity, - num_used_queries: AtomicU32::new(0), - num_resolved_queries: AtomicU32::new(0), - } - } - - fn reset(&mut self) { - self.num_used_queries = AtomicU32::new(0); - self.num_resolved_queries = AtomicU32::new(0); - self.read_buffer.unmap(); - } -} - -#[derive(Default)] -struct PendingFramePools { - /// List of all pools used in this frame. - /// The last pool is the one new profiling queries will try to make timer queries into. - used_pools: Vec>, - - /// List of unused pools recycled from previous frames. - unused_pools: Vec, -} - -/// Internal handle to building a tree of profiling queries. -type GpuTimerQueryTreeHandle = u32; - -/// Handle for the root scope. -const ROOT_QUERY_HANDLE: GpuTimerQueryTreeHandle = std::u32::MAX; - -struct ActiveFrame { - query_pools: RwLock, - - /// Closed queries get send to this channel. - /// - /// Note that channel is still overkill for what we want here: - /// We're in a multi producer situation, *but* the single consumer is known to be only - /// active in a mut context, i.e. while we're consuming we know that we're not producing. - /// We have to wrap it in a Mutex because the channel is not Sync, but we actually never lock it - /// since we only ever access it in a `mut` context. - closed_query_sender: std::sync::mpsc::Sender, - closed_query_receiver: Mutex>, -} - -struct PendingFrame { - query_pools: Vec>, - closed_query_by_parent_handle: HashMap>, - - /// Keeps track of the number of buffers in the query pool that have been mapped successfully. - mapped_buffers: std::sync::Arc, -} - -pub trait ProfilerCommandRecorder { - /// Returns `true` if it's a pass or `false` if it's an encoder - fn is_pass(&self) -> bool; - fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32); - fn push_debug_group(&mut self, label: &str); - fn pop_debug_group(&mut self); -} - -macro_rules! ImplProfilerCommandRecorder { - ($($name:ident $(< $lt:lifetime >)? : $pass:literal,)*) => { - $( - impl $(< $lt >)? ProfilerCommandRecorder for wgpu::$name $(< $lt >)? { - fn is_pass(&self) -> bool { $pass } - - fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32) { - self.write_timestamp(query_set, query_index) - } - - fn push_debug_group(&mut self, label: &str) { - self.push_debug_group(label) - } - - fn pop_debug_group(&mut self) { - self.pop_debug_group() - } - } - )* - }; -} - -ImplProfilerCommandRecorder!(CommandEncoder:false, RenderPass<'a>:true, ComputePass<'a>:true,); diff --git a/src/profiler.rs b/src/profiler.rs new file mode 100644 index 0000000..64b7b21 --- /dev/null +++ b/src/profiler.rs @@ -0,0 +1,879 @@ +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicU32, Ordering}, + Arc, + }, +}; + +use parking_lot::{Mutex, RwLock}; + +use crate::{ + CreationError, EndFrameError, GpuProfilerQuery, GpuProfilerSettings, GpuTimerQueryResult, + ManualOwningScope, OwningScope, ProfilerCommandRecorder, Scope, SettingsError, +}; + +/// Profiler instance. +/// +/// You can have an arbitrary number of independent profiler instances per application/adapter. +/// Manages all the necessary [`wgpu::QuerySet`] and [`wgpu::Buffer`] behind the scenes. +/// +/// Any query creation method may allocate a new [`wgpu::QuerySet`] and [`wgpu::Buffer`] internally if necessary. +/// +/// After the first call that passes [`wgpu::Device`], the same device must be used with all subsequent +/// calls to [`GpuProfiler`] and all passed references to wgpu objects must originate from that device. +pub struct GpuProfiler { + unused_pools: Vec, + + active_frame: ActiveFrame, + pending_frames: Vec, + + num_open_queries: AtomicU32, + next_query_handle: AtomicU32, + + size_for_new_query_pools: u32, + + settings: GpuProfilerSettings, + + #[cfg(feature = "tracy")] + tracy_context: Option, +} + +// Public interface +impl GpuProfiler { + /// Combination of all timer query features [`GpuProfiler`] can leverage. + pub const ALL_WGPU_TIMER_FEATURES: wgpu::Features = + wgpu::Features::TIMESTAMP_QUERY.union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES); + + /// Combination of all timer query features [`GpuProfiler`] can leverage. + #[deprecated(since = "0.9.0", note = "Use ALL_WGPU_TIMER_FEATURES instead")] + pub const REQUIRED_WGPU_FEATURES: wgpu::Features = GpuProfiler::ALL_WGPU_TIMER_FEATURES; + + /// Creates a new Profiler object. + /// + /// There is nothing preventing the use of several independent profiler objects. + pub fn new(settings: GpuProfilerSettings) -> Result { + settings.validate()?; + + let (closed_scope_sender, closed_scope_receiver) = std::sync::mpsc::channel(); + + Ok(GpuProfiler { + unused_pools: Vec::new(), + + pending_frames: Vec::with_capacity(settings.max_num_pending_frames), + active_frame: ActiveFrame { + query_pools: RwLock::new(PendingFramePools::default()), + closed_query_sender: closed_scope_sender, + closed_query_receiver: Mutex::new(closed_scope_receiver), + }, + + num_open_queries: AtomicU32::new(0), + next_query_handle: AtomicU32::new(0), + + size_for_new_query_pools: QueryPool::MIN_CAPACITY, + + settings, + + #[cfg(feature = "tracy")] + tracy_context: None, + }) + } + + /// Creates a new profiler and connects to a running Tracy client. + #[cfg(feature = "tracy")] + pub fn new_with_tracy_client( + settings: GpuProfilerSettings, + backend: wgpu::Backend, + device: &wgpu::Device, + queue: &wgpu::Queue, + ) -> Result { + let mut profiler = Self::new(settings)?; + profiler.tracy_context = Some(crate::tracy::create_tracy_gpu_client( + backend, device, queue, + )?); + Ok(profiler) + } + + /// Changes the settings of an existing profiler. + /// + /// If timer scopes are disabled by setting [GpuProfilerSettings::enable_timer_queries] to false, + /// any timer queries that are in flight will still be processed, + /// but unused query sets and buffers will be deallocated during [`Self::process_finished_frame`]. + /// Similarly, any opened debugging scope will still be closed if debug groups are disabled by setting + /// [GpuProfilerSettings::enable_debug_groups] to false. + pub fn change_settings(&mut self, settings: GpuProfilerSettings) -> Result<(), SettingsError> { + settings.validate()?; + if !settings.enable_timer_queries { + self.unused_pools.clear(); + } + self.settings = settings; + + Ok(()) + } + + /// Starts a new auto-closing profiler scope. + /// + /// To nest scopes inside this scope, call [`Scope::scope`] on the returned scope. + /// + /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will + /// not show up in the final results. + /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened. + /// + /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. + /// + /// Scope is automatically closed on drop. + #[must_use] + #[track_caller] + #[inline] + pub fn scope<'a, Recorder: ProfilerCommandRecorder>( + &'a self, + label: impl Into, + encoder_or_pass: &'a mut Recorder, + device: &wgpu::Device, + ) -> Scope<'a, Recorder> { + let scope = self.begin_query(label, encoder_or_pass, device); + Scope { + profiler: self, + recorder: encoder_or_pass, + scope: Some(scope), + } + } + + /// Starts a new auto-closing profiler scope that takes ownership of the passed encoder or rendering/compute pass. + /// + /// To nest scopes inside this scope, call [`OwningScope::scope`] on the returned scope. + /// + /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will + /// not show up in the final results. + /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened. + /// + /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. + /// + /// Scope is automatically closed on drop. + #[must_use] + #[track_caller] + #[inline] + pub fn owning_scope<'a, Recorder: ProfilerCommandRecorder>( + &'a self, + label: impl Into, + mut encoder_or_pass: Recorder, + device: &wgpu::Device, + ) -> OwningScope<'a, Recorder> { + let scope = self.begin_query(label, &mut encoder_or_pass, device); + OwningScope { + profiler: self, + recorder: encoder_or_pass, + scope: Some(scope), + } + } + + /// Starts a new **manually closed** profiler scope that takes ownership of the passed encoder or rendering/compute pass. + /// + /// Does NOT call [`GpuProfiler::end_query()`] on drop. + /// This construct is just for completeness in cases where working with scopes is preferred but one can't rely on the Drop call in the right place. + /// This is useful when the owned value needs to be recovered after the end of the scope. + /// In particular, to submit a [`wgpu::CommandEncoder`] to a queue, ownership of the encoder is necessary. + /// + /// To nest scopes inside this scope, call [`ManualOwningScope::scope`] on the returned scope. + /// + /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will + /// not show up in the final results. + /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened. + /// + /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. + #[must_use] + #[track_caller] + #[inline] + pub fn manual_owning_scope<'a, Recorder: ProfilerCommandRecorder>( + &'a self, + label: impl Into, + mut encoder_or_pass: Recorder, + device: &wgpu::Device, + ) -> ManualOwningScope<'a, Recorder> { + let scope = self.begin_query(label, &mut encoder_or_pass, device); + ManualOwningScope { + profiler: self, + recorder: encoder_or_pass, + scope: Some(scope), + } + } + + /// Starts a new profiler query on the given encoder or rendering/compute pass (if enabled). + /// + /// The returned query *must* be closed by calling [`GpuProfiler::end_query`] with the same encoder/pass, + /// even if timer queries are disabled. + /// To do this automatically, use [`GpuProfiler::scope`]/[`GpuProfiler::owning_scope`] instead. + /// + /// If an [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be queried and the scope will + /// not show up in the final results. + /// If an [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`] + /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no timer queries will be allocated. + /// + /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass. + #[track_caller] + #[must_use] + pub fn begin_query( + &self, + label: impl Into, + encoder_or_pass: &mut Recorder, + device: &wgpu::Device, + ) -> GpuProfilerQuery { + let mut query = self.begin_query_internal(label.into(), encoder_or_pass, device); + if let Some(timer_query) = &mut query.timer_query_pair { + encoder_or_pass + .write_timestamp(&timer_query.pool.query_set, timer_query.start_query_idx); + timer_query.usage_state = QueryPairUsageState::OnlyStartWritten; + }; + + if self.settings.enable_debug_groups { + encoder_or_pass.push_debug_group(&query.label); + query.has_debug_group = true; + } + query + } + + /// Starts a new profiler query to be used for render/compute pass timestamp writes. + /// + /// The returned query *must* be closed by calling [`GpuProfiler::end_query`], even if timer queries are disabled. + /// To do this automatically, use [`Scope::scoped_render_pass`]/[`Scope::scoped_compute_pass`] instead. + /// + /// Call [`GpuProfilerQuery::render_pass_timestamp_writes`] or [`GpuProfilerQuery::compute_pass_timestamp_writes`] + /// to acquire the corresponding `wgpu::RenderPassTimestampWrites`/`wgpu::ComputePassTimestampWrites` object. + /// + /// If the [`wgpu::Device`] does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be reserved. + /// + /// Unlike [`GpuProfiler::begin_query`] this will not create a debug scope, + /// in order to not force passing of the same encoder/pass to [`GpuProfiler::end_query`]. + /// (this is needed to relax resource tracking requirements a bit, making it easier to implement the automatic scopes) + pub fn begin_pass_query( + &self, + label: impl Into, + encoder: &mut wgpu::CommandEncoder, + device: &wgpu::Device, + ) -> GpuProfilerQuery { + let mut query = self.begin_query_internal(label.into(), encoder, device); + if let Some(timer_query) = &mut query.timer_query_pair { + timer_query.usage_state = QueryPairUsageState::ReservedForPassTimestampWrites; + } + query + } + + /// Ends passed query. + /// + /// If the passed query was opened with [`GpuProfiler::begin_query`], the passed encoder or pass must be the same + /// as when the query was opened. + pub fn end_query( + &self, + encoder_or_pass: &mut Recorder, + mut query: GpuProfilerQuery, + ) { + if let Some(timer_query) = &mut query.timer_query_pair { + match timer_query.usage_state { + QueryPairUsageState::Reserved => { + unreachable!("Query pair has been reserved but isn't used for anything!") + } + QueryPairUsageState::ReservedForPassTimestampWrites => { + // No need to do a timestamp write, this is handled by wgpu. + } + QueryPairUsageState::OnlyStartWritten => { + encoder_or_pass.write_timestamp( + &timer_query.pool.query_set, + timer_query.start_query_idx + 1, + ); + timer_query.usage_state = QueryPairUsageState::BothStartAndEndWritten; + } + QueryPairUsageState::BothStartAndEndWritten => { + unreachable!("Query pair has already been used!") + } + } + } + + #[cfg(feature = "tracy")] + if let Some(ref mut tracy_scope) = query.tracy_scope { + tracy_scope.end_zone(); + } + + if query.has_debug_group { + encoder_or_pass.pop_debug_group(); + } + + let send_result = self.active_frame.closed_query_sender.send(query); + + // The only way we can fail sending the query is if the receiver has been dropped. + // Since it sits on `active_frame` as well, there's no way for this to happen! + debug_assert!(send_result.is_ok()); + + // Count queries even if we haven't processed this one, makes experiences more consistent + // if there's a lack of support for some queries. + self.num_open_queries.fetch_sub(1, Ordering::Release); + } + + /// Puts query resolve commands in the encoder for all unresolved, pending queries of the active profiler frame. + /// + /// Note that you do *not* need to do this for every encoder, it is sufficient do do this once per frame as long + /// as you submit the corresponding command buffer after all others that may have opened queries in the same frame. + /// (It does not matter if the passed encoder itself has previously opened queries or not.) + /// If you were to make this part of a command buffer that is enqueued before any other that has + /// opened queries in the same profiling frame, no failure will occur but some timing results may be invalid. + /// + /// It is advised to call this only once at the end of a profiling frame, but it is safe to do so several times. + /// + /// + /// Implementation note: + /// This method could be made `&self`, taking the internal lock on the query pools. + /// However, the intended use is to call this once at the end of a frame, so we instead + /// encourage this explicit sync point and avoid the lock. + pub fn resolve_queries(&mut self, encoder: &mut wgpu::CommandEncoder) { + let query_pools = self.active_frame.query_pools.get_mut(); + + for query_pool in query_pools.used_pools.iter_mut() { + // We sync with the last update of num_used_query (which has Release semantics) + // mostly to be on the safe side - it happened inside a lock which gives it release semantics anyways + // but the concern is that if we don't acquire here, we may miss on other side prior effects of the query begin. + let num_used_queries = query_pool.num_used_queries.load(Ordering::Acquire); + let num_resolved_queries = query_pool.num_resolved_queries.load(Ordering::Acquire); + + if num_resolved_queries == num_used_queries { + continue; + } + + assert!(num_resolved_queries < num_used_queries); + + encoder.resolve_query_set( + &query_pool.query_set, + num_resolved_queries..num_used_queries, + &query_pool.resolve_buffer, + (num_resolved_queries * wgpu::QUERY_SIZE) as u64, + ); + query_pool + .num_resolved_queries + .store(num_used_queries, Ordering::Release); + + encoder.copy_buffer_to_buffer( + &query_pool.resolve_buffer, + 0, + &query_pool.read_buffer, + 0, + (num_used_queries * wgpu::QUERY_SIZE) as u64, + ); + } + } + + /// Marks the end of a frame. + /// + /// Needs to be called **after** submitting any encoder used in the current profiler frame. + /// + /// Fails if there are still open queries or unresolved queries. + pub fn end_frame(&mut self) -> Result<(), EndFrameError> { + let num_open_queries = self.num_open_queries.load(Ordering::Acquire); + if num_open_queries != 0 { + return Err(EndFrameError::UnclosedQueries(num_open_queries)); + } + + let query_pools = self.active_frame.query_pools.get_mut(); + + let mut new_pending_frame = PendingFrame { + query_pools: std::mem::take(&mut query_pools.used_pools), + closed_query_by_parent_handle: HashMap::new(), + mapped_buffers: Arc::new(AtomicU32::new(0)), + }; + + for query in self.active_frame.closed_query_receiver.get_mut().try_iter() { + new_pending_frame + .closed_query_by_parent_handle + .entry(query.parent_handle) + .or_default() + .push(query); + } + + // All loads of pool.num_used_queries are Relaxed since we assume, + // that we already acquired the state during `resolve_queries` and no further otherwise unobserved + // modifications happened since then. + + let num_unresolved_queries = new_pending_frame + .query_pools + .iter() + .map(|pool| { + pool.num_used_queries.load(Ordering::Relaxed) + - pool.num_resolved_queries.load(Ordering::Relaxed) + }) + .sum(); + if num_unresolved_queries != 0 { + return Err(EndFrameError::UnresolvedQueries(num_unresolved_queries)); + } + + // Next time we create a new query pool, we want it to be at least as big to hold all queries of this frame. + self.size_for_new_query_pools = self + .size_for_new_query_pools + .max( + new_pending_frame + .query_pools + .iter() + .map(|pool| pool.num_used_queries.load(Ordering::Relaxed)) + .sum(), + ) + .min(QUERY_SET_MAX_QUERIES); + + // Make sure we don't overflow. + if self.pending_frames.len() == self.settings.max_num_pending_frames { + // Drop previous (!) frame. + // Dropping the oldest frame could get us into an endless cycle where we're never able to complete + // any pending frames as the ones closest to completion would be evicted. + if let Some(dropped_frame) = self.pending_frames.pop() { + // Drop queries first since they still have references to the query pools that we want to reuse. + drop(dropped_frame.closed_query_by_parent_handle); + + // Mark the frame as dropped. We'll give back the query pools once the mapping is done. + // Any previously issued map_async call that haven't finished yet, will invoke their callback with mapping abort. + self.reset_and_cache_unused_query_pools(dropped_frame.query_pools); + } + } + + // Map all buffers. + for pool in new_pending_frame.query_pools.iter_mut() { + let mapped_buffers = new_pending_frame.mapped_buffers.clone(); + pool.read_buffer + .slice(0..(pool.num_used_queries.load(Ordering::Relaxed) * wgpu::QUERY_SIZE) as u64) + .map_async(wgpu::MapMode::Read, move |mapping_result| { + // Mapping should not fail unless it was cancelled due to the frame being dropped. + match mapping_result { + Err(_) => { + // We only want to ignore the error iff the mapping has been aborted by us (due to a dropped frame, see above). + // In any other case, we need should panic as this would imply something went seriously sideways. + // + // As of writing, this is not yet possible in wgpu, see https://github.com/gfx-rs/wgpu/pull/2939 + } + Ok(()) => { + mapped_buffers.fetch_add(1, std::sync::atomic::Ordering::Release); + } + } + }); + } + + // Enqueue + self.pending_frames.push(new_pending_frame); + assert!(self.pending_frames.len() <= self.settings.max_num_pending_frames); + + Ok(()) + } + + /// Checks if all timer queries for the oldest pending finished frame are done and returns that snapshot if any. + /// + /// timestamp_period: + /// The timestamp period of the device. Pass the result of [`wgpu::Queue::get_timestamp_period()`]. + /// Note that some implementations (Chrome as of writing) may converge to a timestamp period while the application is running, + /// so caching this value is usually not recommended. + pub fn process_finished_frame( + &mut self, + timestamp_period: f32, + ) -> Option> { + let frame = self.pending_frames.first_mut()?; + + // We only process if all mappings succeed. + if frame + .mapped_buffers + .load(std::sync::atomic::Ordering::Acquire) + != frame.query_pools.len() as u32 + { + return None; + } + + let mut frame = self.pending_frames.remove(0); + + let results = { + let timestamp_to_sec = timestamp_period as f64 / 1000.0 / 1000.0 / 1000.0; + + Self::process_timings_recursive( + timestamp_to_sec, + &mut frame.closed_query_by_parent_handle, + ROOT_QUERY_HANDLE, + ) + }; + + self.reset_and_cache_unused_query_pools(frame.query_pools); + + Some(results) + } +} + +// -------------------------------------------------------------------------------- +// Internals +// -------------------------------------------------------------------------------- + +const QUERY_SET_MAX_QUERIES: u32 = wgpu::QUERY_SET_MAX_QUERIES; + +/// Returns true if a timestamp should be written to the encoder or pass. +fn timestamp_write_supported( + encoder_or_pass: &mut Recorder, + features: wgpu::Features, +) -> bool { + let required_feature = if encoder_or_pass.is_pass() { + wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES + } else { + wgpu::Features::TIMESTAMP_QUERY + }; + features.contains(required_feature) +} + +impl GpuProfiler { + fn next_scope_tree_handle(&self) -> GpuTimerQueryTreeHandle { + // Relaxed is fine, we just want a number that nobody uses this frame already. + let mut handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed); + + // We don't ever expect to run out of handles during a single frame, but who knows how long the app runs. + while handle == ROOT_QUERY_HANDLE { + handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed); + } + + handle + } + + fn reset_and_cache_unused_query_pools(&mut self, mut discarded_pools: Vec>) { + let capacity_threshold = self.size_for_new_query_pools / 2; + for pool in discarded_pools.drain(..) { + // If the pool is truly unused now, it's ref count should be 1! + // If we use it anywhere else we have an implementation bug. + let mut pool = Arc::into_inner(pool).expect("Pool still in use"); + pool.reset(); + + // If a pool was less than half of the size of the max frame, then we don't keep it. + // This way we're going to need less pools in upcoming frames and thus have less overhead in the long run. + // If timer queries were disabled, we also don't keep any pools. + if self.settings.enable_timer_queries && pool.capacity >= capacity_threshold { + self.active_frame + .query_pools + .get_mut() + .unused_pools + .push(pool); + } + } + } + + fn try_reserve_query_pair(pool: &Arc) -> Option { + let mut num_used_queries = pool.num_used_queries.load(Ordering::Relaxed); + + loop { + if pool.capacity < num_used_queries + 2 { + // This pool is out of capacity, we failed the operation. + return None; + } + + match pool.num_used_queries.compare_exchange_weak( + num_used_queries, + num_used_queries + 2, + // Write to num_used_queries with release semantics to be on the safe side. + // (It doesn't look like there's other side effects that we need to publish.) + Ordering::Release, + // No barrier for the failure case. + // The only thing we have to acquire is the pool's capacity which is constant and + // was definitely acquired by the RWLock prior to this call. + Ordering::Relaxed, + ) { + Ok(_) => { + // We successfully acquired two queries! + return Some(ReservedTimerQueryPair { + pool: pool.clone(), + start_query_idx: num_used_queries, + usage_state: QueryPairUsageState::Reserved, + }); + } + Err(updated) => { + // Someone else acquired queries in the meantime, try again. + num_used_queries = updated; + } + } + } + } + + // Reserves two query objects. + // Our query pools always have an even number of queries, so we know the next query is the next in the same pool. + fn reserve_query_pair(&self, device: &wgpu::Device) -> ReservedTimerQueryPair { + // First, try to allocate from current top pool. + // Requires taking a read lock on the current query pool. + { + let query_pools = self.active_frame.query_pools.read(); + if let Some(pair) = query_pools + .used_pools + .last() + .and_then(Self::try_reserve_query_pair) + { + return pair; + } + } + // If this didn't work, we may need to add a new pool. + // Requires taking a write lock on the current query pool. + { + let mut query_pools = self.active_frame.query_pools.write(); + + // It could be that by now, another thread has already added a new pool! + // This is a bit unfortunate because it means we unnecessarily took a write lock, but it seems hard to get around this. + if let Some(pair) = query_pools + .used_pools + .last() + .and_then(Self::try_reserve_query_pair) + { + return pair; + } + + // Now we know for certain that the last pool is exhausted, so add a new one! + let new_pool = if let Some(reused_pool) = query_pools.unused_pools.pop() { + // First check if there's an unused pool we can take. + Arc::new(reused_pool) + } else { + // If we can't, create a new pool that is as big as all previous pools combined. + Arc::new(QueryPool::new( + query_pools + .used_pools + .iter() + .map(|pool| pool.capacity) + .sum::() + .max(self.size_for_new_query_pools) + .min(QUERY_SET_MAX_QUERIES), + device, + )) + }; + + let pair = Self::try_reserve_query_pair(&new_pool) + .expect("Freshly reserved pool doesn't have enough capacity"); + query_pools.used_pools.push(new_pool); + + pair + } + } + + #[track_caller] + #[must_use] + fn begin_query_internal( + &self, + label: String, + encoder_or_pass: &mut Recorder, + device: &wgpu::Device, + ) -> GpuProfilerQuery { + // Give opening/closing queries acquire/release semantics: + // This way, we won't get any nasty surprises when observing zero open queries. + self.num_open_queries.fetch_add(1, Ordering::Acquire); + + let query = if self.settings.enable_timer_queries + && timestamp_write_supported(encoder_or_pass, device.features()) + { + Some(self.reserve_query_pair(device)) + } else { + None + }; + + let _tracy_scope = if self.settings.enable_timer_queries { + #[cfg(feature = "tracy")] + { + let location = std::panic::Location::caller(); + self.tracy_context.as_ref().and_then(|c| { + c.span_alloc(&label, "", location.file(), location.line()) + .ok() + }) + } + #[cfg(not(feature = "tracy"))] + Option::<()>::None + } else { + None + }; + + GpuProfilerQuery { + label, + pid: std::process::id(), + tid: std::thread::current().id(), + timer_query_pair: query, + handle: self.next_scope_tree_handle(), + parent_handle: ROOT_QUERY_HANDLE, + has_debug_group: false, + #[cfg(feature = "tracy")] + tracy_scope: _tracy_scope, + } + } + + fn process_timings_recursive( + timestamp_to_sec: f64, + closed_scope_by_parent_handle: &mut HashMap>, + parent_handle: GpuTimerQueryTreeHandle, + ) -> Vec { + let Some(queries_with_same_parent) = closed_scope_by_parent_handle.remove(&parent_handle) + else { + return Vec::new(); + }; + + queries_with_same_parent + .into_iter() + .filter_map(|mut scope| { + let Some(query) = scope.timer_query_pair.take() else { + // Inactive queries don't have any results or nested queries with results. + // Currently, we drop them from the results completely. + // In the future we could still make them show up since they convey information like label & pid/tid. + return None; + }; + + // Read timestamp from buffer. + // By design timestamps for start/end are consecutive. + let offset = (query.start_query_idx * wgpu::QUERY_SIZE) as u64; + let buffer_slice = &query + .pool + .read_buffer + .slice(offset..(offset + (wgpu::QUERY_SIZE * 2) as u64)) + .get_mapped_range(); + let start_raw = u64::from_le_bytes( + buffer_slice[0..wgpu::QUERY_SIZE as usize] + .try_into() + .unwrap(), + ); + let end_raw = u64::from_le_bytes( + buffer_slice[wgpu::QUERY_SIZE as usize..(wgpu::QUERY_SIZE as usize) * 2] + .try_into() + .unwrap(), + ); + + #[cfg(feature = "tracy")] + if let Some(tracy_scope) = scope.tracy_scope.take() { + tracy_scope.upload_timestamp(start_raw as i64, end_raw as i64); + } + + let nested_queries = Self::process_timings_recursive( + timestamp_to_sec, + closed_scope_by_parent_handle, + scope.handle, + ); + + Some(GpuTimerQueryResult { + label: std::mem::take(&mut scope.label), + time: (start_raw as f64 * timestamp_to_sec) + ..(end_raw as f64 * timestamp_to_sec), + nested_queries, + pid: scope.pid, + tid: scope.tid, + }) + }) + .collect::>() + } +} + +#[derive(PartialEq, Eq)] +pub enum QueryPairUsageState { + /// Transitional state used upon creation. + Reserved, + + /// Don't do manual timestamp writes, wgpu is expected to do them for us. + ReservedForPassTimestampWrites, + + /// Start query has been used, end query is still available. + OnlyStartWritten, + + /// Both start & end query have been used. + BothStartAndEndWritten, +} + +pub struct ReservedTimerQueryPair { + /// QueryPool on which both start & end queries of the scope are done. + /// + /// By putting an arc here instead of an index into a vec, we don't need + /// need to take any locks upon closing a profiling scope. + pub pool: Arc, + + /// Query index at which the scope begins. + /// The query after this is reserved for the end of the scope. + pub start_query_idx: u32, + + /// Current use of the query pair. + pub usage_state: QueryPairUsageState, +} + +/// A pool of queries, consisting of a single queryset & buffer for query results. +#[derive(Debug)] +pub struct QueryPool { + pub query_set: wgpu::QuerySet, + + resolve_buffer: wgpu::Buffer, + read_buffer: wgpu::Buffer, + + capacity: u32, + num_used_queries: AtomicU32, + num_resolved_queries: AtomicU32, +} + +impl QueryPool { + const MIN_CAPACITY: u32 = 32; + + fn new(capacity: u32, device: &wgpu::Device) -> Self { + QueryPool { + query_set: device.create_query_set(&wgpu::QuerySetDescriptor { + label: Some("GpuProfiler - Query Set"), + ty: wgpu::QueryType::Timestamp, + count: capacity, + }), + + resolve_buffer: device.create_buffer(&wgpu::BufferDescriptor { + label: Some("GpuProfiler - Query Resolve Buffer"), + size: (wgpu::QUERY_SIZE * capacity) as u64, + usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }), + + read_buffer: device.create_buffer(&wgpu::BufferDescriptor { + label: Some("GpuProfiler - Query Read Buffer"), + size: (wgpu::QUERY_SIZE * capacity) as u64, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }), + + capacity, + num_used_queries: AtomicU32::new(0), + num_resolved_queries: AtomicU32::new(0), + } + } + + fn reset(&mut self) { + self.num_used_queries = AtomicU32::new(0); + self.num_resolved_queries = AtomicU32::new(0); + self.read_buffer.unmap(); + } +} + +#[derive(Default)] +struct PendingFramePools { + /// List of all pools used in this frame. + /// The last pool is the one new profiling queries will try to make timer queries into. + used_pools: Vec>, + + /// List of unused pools recycled from previous frames. + unused_pools: Vec, +} + +/// Internal handle to building a tree of profiling queries. +pub type GpuTimerQueryTreeHandle = u32; + +/// Handle for the root scope. +pub const ROOT_QUERY_HANDLE: GpuTimerQueryTreeHandle = std::u32::MAX; + +struct ActiveFrame { + query_pools: RwLock, + + /// Closed queries get send to this channel. + /// + /// Note that channel is still overkill for what we want here: + /// We're in a multi producer situation, *but* the single consumer is known to be only + /// active in a mut context, i.e. while we're consuming we know that we're not producing. + /// We have to wrap it in a Mutex because the channel is not Sync, but we actually never lock it + /// since we only ever access it in a `mut` context. + closed_query_sender: std::sync::mpsc::Sender, + closed_query_receiver: Mutex>, +} + +struct PendingFrame { + query_pools: Vec>, + closed_query_by_parent_handle: HashMap>, + + /// Keeps track of the number of buffers in the query pool that have been mapped successfully. + mapped_buffers: std::sync::Arc, +} diff --git a/src/profiler_command_recorder.rs b/src/profiler_command_recorder.rs new file mode 100644 index 0000000..d6a8dff --- /dev/null +++ b/src/profiler_command_recorder.rs @@ -0,0 +1,32 @@ +/// Trait for exposing the methods of `wgpu::CommandEncoder`, `wgpu::RenderPass` and `wgpu::ComputePass` that are used by the profiler. +pub trait ProfilerCommandRecorder { + /// Returns `true` if it's a pass or `false` if it's an encoder + fn is_pass(&self) -> bool; + fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32); + fn push_debug_group(&mut self, label: &str); + fn pop_debug_group(&mut self); +} + +macro_rules! ImplProfilerCommandRecorder { + ($($name:ident $(< $lt:lifetime >)? : $pass:literal,)*) => { + $( + impl $(< $lt >)? ProfilerCommandRecorder for wgpu::$name $(< $lt >)? { + fn is_pass(&self) -> bool { $pass } + + fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, query_index: u32) { + self.write_timestamp(query_set, query_index) + } + + fn push_debug_group(&mut self, label: &str) { + self.push_debug_group(label) + } + + fn pop_debug_group(&mut self) { + self.pop_debug_group() + } + } + )* + }; +} + +ImplProfilerCommandRecorder!(CommandEncoder:false, RenderPass<'a>:true, ComputePass<'a>:true,); diff --git a/src/profiler_query.rs b/src/profiler_query.rs new file mode 100644 index 0000000..485a162 --- /dev/null +++ b/src/profiler_query.rs @@ -0,0 +1,103 @@ +use std::{ops::Range, thread::ThreadId}; + +use crate::profiler::{ + GpuTimerQueryTreeHandle, QueryPairUsageState, ReservedTimerQueryPair, ROOT_QUERY_HANDLE, +}; + +/// The result of a gpu timer scope. +#[derive(Debug, Clone)] +pub struct GpuTimerQueryResult { + /// Label that was specified when opening the scope. + pub label: String, + + /// The process id of the process that opened this scope. + pub pid: u32, + + /// The thread id of the thread that opened this scope. + pub tid: ThreadId, + + /// Time range of this scope in seconds. + /// + /// Meaning of absolute value is not defined. + pub time: Range, + + /// Scopes that were opened while this scope was open. + pub nested_queries: Vec, +} + +/// An inflight query for the profiler. +/// +/// If timer queries are enabled, this represents a reserved timer query pair on +/// one of the profiler's query sets. +/// *Must* be closed by calling [`GpuProfiler::end_query`]. +/// +/// Emitted by [`GpuProfiler::begin_query`]/[`GpuProfiler::begin_pass_query`] and consumed by [`GpuProfiler::end_query`]. +pub struct GpuProfilerQuery { + /// The label assigned to this query. + /// Will be moved into [`GpuProfilerQuery::label`] once the query is fully processed. + pub label: String, + + /// The process id of the process that opened this query. + pub pid: u32, + + /// The thread id of the thread that opened this query. + pub tid: ThreadId, + + /// The actual query on a query pool if any (none if disabled for this type of query). + pub(crate) timer_query_pair: Option, + + /// Handle which identifies this query, used for building the tree of queries. + pub(crate) handle: GpuTimerQueryTreeHandle, + + /// Which query this query is a child of. + pub(crate) parent_handle: GpuTimerQueryTreeHandle, + + /// Whether a debug group was opened for this scope. + pub(crate) has_debug_group: bool, + + #[cfg(feature = "tracy")] + pub(crate) tracy_scope: Option, +} + +impl GpuProfilerQuery { + /// Use the reserved query for render pass timestamp writes if any. + /// + /// Use this only for a single render/compute pass, otherwise results will be overwritten. + /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`]. + pub fn render_pass_timestamp_writes(&self) -> Option { + self.timer_query_pair.as_ref().and_then(|query| { + (query.usage_state == QueryPairUsageState::ReservedForPassTimestampWrites).then(|| { + wgpu::RenderPassTimestampWrites { + query_set: &query.pool.query_set, + beginning_of_pass_write_index: Some(query.start_query_idx), + end_of_pass_write_index: Some(query.start_query_idx + 1), + } + }) + }) + } + + /// Use the reserved query for compute pass timestamp writes if any. + /// + /// Use this only for a single render/compute pass, otherwise results will be overwritten. + /// Only ever returns `Some` for queries that were created using [`GpuProfiler::begin_pass_query`]. + pub fn compute_pass_timestamp_writes(&self) -> Option { + self.timer_query_pair.as_ref().and_then(|query| { + (query.usage_state == QueryPairUsageState::ReservedForPassTimestampWrites).then(|| { + wgpu::ComputePassTimestampWrites { + query_set: &query.pool.query_set, + beginning_of_pass_write_index: Some(query.start_query_idx), + end_of_pass_write_index: Some(query.start_query_idx + 1), + } + }) + }) + } + + /// Makes this scope a child of the passed scope. + #[inline] + pub fn with_parent(self, parent: Option<&GpuProfilerQuery>) -> Self { + Self { + parent_handle: parent.map_or(ROOT_QUERY_HANDLE, |p| p.handle), + ..self + } + } +} diff --git a/src/profiler_settings.rs b/src/profiler_settings.rs new file mode 100644 index 0000000..b7c2b14 --- /dev/null +++ b/src/profiler_settings.rs @@ -0,0 +1,57 @@ +use crate::SettingsError; + +/// Settings passed on initialization of [`GpuProfiler`]. +#[derive(Debug, Clone)] +pub struct GpuProfilerSettings { + /// Enables/disables gpu timer queries. + /// + /// If false, the profiler will not emit any timer queries, making most operations on [`GpuProfiler`] no-ops. + /// + /// Since all resource creation is done lazily, this provides an effective way of disabling the profiler at runtime + /// without the need of special build configurations or code to handle enabled/disabled profiling. + pub enable_timer_queries: bool, + + /// Enables/disables debug markers for all scopes on the respective encoder or pass. + /// + /// This is useful for debugging with tools like RenderDoc. + /// Debug markers will be emitted even if the device does not support timer queries or disables them via + /// [`GpuProfilerSettings::enable_timer_queries`]. + pub enable_debug_groups: bool, + + /// The profiler queues up to `max_num_pending_frames` "profiler-frames" at a time. + /// + /// A profiler-frame is regarded as in-flight until its queries have been successfully + /// resolved using [`GpuProfiler::process_finished_frame`]. + /// How long this takes to happen, depends on how fast buffer mappings return successfully + /// which in turn primarily depends on how fast the device is able to finish work queued to the [`wgpu::Queue`]. + /// + /// If this threshold is exceeded, [`GpuProfiler::end_frame`] will silently drop frames. + /// *Newer* frames will be dropped first in order to get results back eventually. + /// (If the profiler were to drop the oldest frame, one may end up in a situation where there is never + /// frame that is fully processed and thus never any results to be retrieved). + /// + /// Good values for `max_num_pending_frames` are 2-4 but may depend on your application workload + /// and GPU-CPU syncing strategy. + /// Must be greater than 0. + pub max_num_pending_frames: usize, +} + +impl Default for GpuProfilerSettings { + fn default() -> Self { + Self { + enable_timer_queries: true, + enable_debug_groups: true, + max_num_pending_frames: 3, + } + } +} + +impl GpuProfilerSettings { + pub fn validate(&self) -> Result<(), SettingsError> { + if self.max_num_pending_frames == 0 { + Err(SettingsError::InvalidMaxNumPendingFrames) + } else { + Ok(()) + } + } +} diff --git a/src/tracy.rs b/src/tracy.rs index 19fa579..6764563 100644 --- a/src/tracy.rs +++ b/src/tracy.rs @@ -1,6 +1,6 @@ use crate::CreationError; -pub(crate) fn create_tracy_gpu_client( +pub fn create_tracy_gpu_client( backend: wgpu::Backend, device: &wgpu::Device, queue: &wgpu::Queue, @@ -13,14 +13,14 @@ pub(crate) fn create_tracy_gpu_client( let resolve_buffer = device.create_buffer(&wgpu::BufferDescriptor { label: Some("wgpu-profiler gpu -> cpu resolve buffer"), - size: crate::QUERY_SIZE as _, + size: wgpu::QUERY_SIZE as _, usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC, mapped_at_creation: false, }); let map_buffer = device.create_buffer(&wgpu::BufferDescriptor { label: Some("wgpu-profiler gpu -> cpu map buffer"), - size: crate::QUERY_SIZE as _, + size: wgpu::QUERY_SIZE as _, usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); @@ -30,7 +30,7 @@ pub(crate) fn create_tracy_gpu_client( }); encoder.write_timestamp(&query_set, 0); encoder.resolve_query_set(&query_set, 0..1, &resolve_buffer, 0); - encoder.copy_buffer_to_buffer(&resolve_buffer, 0, &map_buffer, 0, crate::QUERY_SIZE as _); + encoder.copy_buffer_to_buffer(&resolve_buffer, 0, &map_buffer, 0, wgpu::QUERY_SIZE as _); queue.submit(Some(encoder.finish())); map_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ());