Skip to content

Commit

Permalink
feat: introduce multiple CUDA streams to perform memcpy in parallel
Browse files Browse the repository at this point in the history
Signed-off-by: Manato HIRABAYASHI <[email protected]>
  • Loading branch information
manato committed May 10, 2024
1 parent 75cd53b commit bc94689
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 8 deletions.
38 changes: 38 additions & 0 deletions include/mtr/cuda_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <sstream>
#include <stdexcept>
#include <type_traits>
#include <vector>

#define CHECK_CUDA_ERROR(e) (cuda::check_error(e, __FILE__, __LINE__))

Expand Down Expand Up @@ -132,6 +133,43 @@ class EventDebugger
bool has_event_{false};
};

class StreamRingBuffer
{
public:
StreamRingBuffer (const size_t buffer_length)
: buffer_length_(buffer_length),
current_index_(0)
{
for (size_t i = 0; i < buffer_length_; i++) {
cudaStream_t s;
CHECK_CUDA_ERROR(cudaStreamCreate(&s));
ring_buffer_.push_back(s);
}
}

cudaStream_t& operator()(void)
{
auto& res = ring_buffer_[current_index_];
current_index_++;
if (current_index_ >= buffer_length_) {
current_index_ = 0;
}
return res;
}

void SyncAllStreams(void)
{
for (const auto& s : ring_buffer_) {
CHECK_CUDA_ERROR(cudaStreamSynchronize(s));
}
}

protected:
size_t buffer_length_;
size_t current_index_;
std::vector<cudaStream_t> ring_buffer_;
};

} // namespace cuda

#endif // MTR__CUDA_HELPER_HPP_
1 change: 1 addition & 0 deletions include/mtr/mtr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ class TrtMTR

std::unique_ptr<MTRBuilder> builder_;
cudaStream_t stream_{nullptr};
cuda::StreamRingBuffer copy_streams_;

IntentionPoint intention_point_;

Expand Down
20 changes: 12 additions & 8 deletions src/mtr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ TrtMTR::TrtMTR(
const std::string & model_path, const MTRConfig & config, const BuildConfig & build_config,
const size_t max_workspace_size)
: config_(config),
intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster)
intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster),
copy_streams_(7) // 7 is the maximum number of consecutive memory copy in this class
{
builder_ = std::make_unique<MTRBuilder>(model_path, build_config, max_workspace_size);
builder_->setup();
Expand Down Expand Up @@ -143,30 +144,33 @@ bool TrtMTR::preProcess(const AgentData & agent_data, const PolylineData & polyl
{
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_target_index_.get(), agent_data.target_indices().data(), sizeof(int) * num_target_,
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_label_index_.get(), agent_data.label_indices().data(), sizeof(int) * num_agent_,
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_timestamps_.get(), agent_data.timestamps().data(), sizeof(float) * num_timestamp_,
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_trajectory_.get(), agent_data.data_ptr(), sizeof(float) * agent_data.size(),
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_target_state_.get(), agent_data.target_data_ptr(),
sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, stream_));
sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, copy_streams_()));

CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_polyline_.get(), polyline_data.data_ptr(), sizeof(float) * polyline_data.size(),
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));

const auto target_label_names = getLabelNames(agent_data.target_label_indices());
const auto intention_points = intention_point_.get_points(target_label_names);
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_intention_points_.get(), intention_points.data(),
sizeof(float) * num_target_ * config_.num_intention_point_cluster * 2, cudaMemcpyHostToDevice,
stream_));
copy_streams_()));

// Wait until all memory copy have been done
copy_streams_.SyncAllStreams();

// DEBUG
event_debugger_.createEvent(stream_);
Expand Down

0 comments on commit bc94689

Please sign in to comment.