From bc9468908cb16a8239b6e46624c3952b7a264042 Mon Sep 17 00:00:00 2001
From: Manato HIRABAYASHI
Date: Fri, 10 May 2024 11:34:56 +0900
Subject: [PATCH] feat: introduce multiple CUDA streams to perform memcpy in
 parallel

Signed-off-by: Manato HIRABAYASHI
---
 include/mtr/cuda_helper.hpp | 38 +++++++++++++++++++++++++++++++++++++
 include/mtr/mtr.hpp         |  1 +
 src/mtr.cpp                 | 20 +++++++++++--------
 3 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/include/mtr/cuda_helper.hpp b/include/mtr/cuda_helper.hpp
index 0ecb2c5..6bb3248 100644
--- a/include/mtr/cuda_helper.hpp
+++ b/include/mtr/cuda_helper.hpp
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <vector>

 #define CHECK_CUDA_ERROR(e) (cuda::check_error(e, __FILE__, __LINE__))

@@ -132,6 +133,43 @@ class EventDebugger
   bool has_event_{false};
 };

+class StreamRingBuffer
+{
+ public:
+  StreamRingBuffer(const size_t buffer_length)
+  : buffer_length_(buffer_length),
+    current_index_(0)
+  {
+    for (size_t i = 0; i < buffer_length_; i++) {
+      cudaStream_t s;
+      CHECK_CUDA_ERROR(cudaStreamCreate(&s));
+      ring_buffer_.push_back(s);
+    }
+  }
+
+  cudaStream_t& operator()(void)
+  {
+    auto& res = ring_buffer_[current_index_];
+    current_index_++;
+    if (current_index_ >= buffer_length_) {
+      current_index_ = 0;
+    }
+    return res;
+  }
+
+  void SyncAllStreams(void)
+  {
+    for (const auto& s : ring_buffer_) {
+      CHECK_CUDA_ERROR(cudaStreamSynchronize(s));
+    }
+  }
+
+ protected:
+  size_t buffer_length_;
+  size_t current_index_;
+  std::vector<cudaStream_t> ring_buffer_;
+};
+
 } // namespace cuda

 #endif // MTR__CUDA_HELPER_HPP_
diff --git a/include/mtr/mtr.hpp b/include/mtr/mtr.hpp
index 02a632b..69220e1 100644
--- a/include/mtr/mtr.hpp
+++ b/include/mtr/mtr.hpp
@@ -151,6 +151,7 @@ class TrtMTR
   std::unique_ptr builder_;

   cudaStream_t stream_{nullptr};
+  cuda::StreamRingBuffer copy_streams_;

   IntentionPoint intention_point_;

diff --git a/src/mtr.cpp b/src/mtr.cpp
index 334b83b..7eba972 100644
--- a/src/mtr.cpp
+++ b/src/mtr.cpp
@@ -24,7 +24,8 @@ TrtMTR::TrtMTR(
   const std::string & model_path, const MTRConfig & config, const BuildConfig & build_config,
   const size_t max_workspace_size)
 : config_(config),
-  intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster)
+  intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster),
+  copy_streams_(7)  // 7 is the maximum number of consecutive memory copies in this class
 {
   builder_ = std::make_unique(model_path, build_config, max_workspace_size);
   builder_->setup();
@@ -143,30 +144,33 @@ bool TrtMTR::preProcess(const AgentData & agent_data, const PolylineData & polyl
 {
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_target_index_.get(), agent_data.target_indices().data(), sizeof(int) * num_target_,
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_label_index_.get(), agent_data.label_indices().data(), sizeof(int) * num_agent_,
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_timestamps_.get(), agent_data.timestamps().data(), sizeof(float) * num_timestamp_,
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_trajectory_.get(), agent_data.data_ptr(), sizeof(float) * agent_data.size(),
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_target_state_.get(), agent_data.target_data_ptr(),
-    sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, stream_));
+    sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_polyline_.get(), polyline_data.data_ptr(), sizeof(float) * polyline_data.size(),
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));

   const auto target_label_names = getLabelNames(agent_data.target_label_indices());
   const auto intention_points = intention_point_.get_points(target_label_names);
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_intention_points_.get(), intention_points.data(),
     sizeof(float) * num_target_ * config_.num_intention_point_cluster * 2, cudaMemcpyHostToDevice,
-    stream_));
+    copy_streams_()));
+
+  // Wait until all memory copies have been done
+  copy_streams_.SyncAllStreams();

   // DEBUG
   event_debugger_.createEvent(stream_);
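
The StreamRingBuffer added above hands out its streams round-robin through operator() and is drained with SyncAllStreams(); that is how preProcess() issues its seven host-to-device copies on separate streams before synchronizing, and constructing the ring with 7 streams in TrtMTR means each of those copies lands on a distinct stream. Below is a minimal, self-contained usage sketch (not part of the patch); the buffer names and sizes are illustrative. Note that cudaMemcpyAsync only overlaps across streams when the host side is pinned memory (cudaMallocHost); copies from pageable host memory are effectively serialized, so the gain from multiple copy streams depends on how the host-side buffers are allocated.

// Minimal usage sketch of cuda::StreamRingBuffer (not part of the patch).
// Host buffers are pinned with cudaMallocHost so the async copies can
// actually overlap; the names h_a/h_b/d_a/d_b and the sizes are illustrative.
#include <cuda_runtime_api.h>

#include "mtr/cuda_helper.hpp"

int main()
{
  constexpr size_t kNumElem = 1 << 20;
  constexpr size_t kBytes = kNumElem * sizeof(float);

  float *h_a = nullptr, *h_b = nullptr, *d_a = nullptr, *d_b = nullptr;
  CHECK_CUDA_ERROR(cudaMallocHost(reinterpret_cast<void **>(&h_a), kBytes));  // pinned host memory
  CHECK_CUDA_ERROR(cudaMallocHost(reinterpret_cast<void **>(&h_b), kBytes));
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_a), kBytes));
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_b), kBytes));

  // Two streams handed out round-robin by operator().
  cuda::StreamRingBuffer copy_streams(2);

  // Each call to copy_streams() returns the next stream in the ring,
  // so these two copies are enqueued on different streams.
  CHECK_CUDA_ERROR(cudaMemcpyAsync(d_a, h_a, kBytes, cudaMemcpyHostToDevice, copy_streams()));
  CHECK_CUDA_ERROR(cudaMemcpyAsync(d_b, h_b, kBytes, cudaMemcpyHostToDevice, copy_streams()));

  // Block until every stream in the ring has drained before using the data.
  copy_streams.SyncAllStreams();

  CHECK_CUDA_ERROR(cudaFree(d_a));
  CHECK_CUDA_ERROR(cudaFree(d_b));
  CHECK_CUDA_ERROR(cudaFreeHost(h_a));
  CHECK_CUDA_ERROR(cudaFreeHost(h_b));
  return 0;
}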