From bc9468908cb16a8239b6e46624c3952b7a264042 Mon Sep 17 00:00:00 2001
From: Manato HIRABAYASHI
Date: Fri, 10 May 2024 11:34:56 +0900
Subject: [PATCH] feat: introduce multiple CUDA streams to perform memcpy in
 parallel

Signed-off-by: Manato HIRABAYASHI
---
 include/mtr/cuda_helper.hpp | 38 +++++++++++++++++++++++++++++++++++++
 include/mtr/mtr.hpp         |  1 +
 src/mtr.cpp                 | 20 +++++++++++--------
 3 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/include/mtr/cuda_helper.hpp b/include/mtr/cuda_helper.hpp
index 0ecb2c5..6bb3248 100644
--- a/include/mtr/cuda_helper.hpp
+++ b/include/mtr/cuda_helper.hpp
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <vector>

 #define CHECK_CUDA_ERROR(e) (cuda::check_error(e, __FILE__, __LINE__))

@@ -132,6 +133,43 @@ class EventDebugger
   bool has_event_{false};
 };

+class StreamRingBuffer
+{
+ public:
+  StreamRingBuffer(const size_t buffer_length)
+  : buffer_length_(buffer_length),
+    current_index_(0)
+  {
+    for (size_t i = 0; i < buffer_length_; i++) {
+      cudaStream_t s;
+      CHECK_CUDA_ERROR(cudaStreamCreate(&s));
+      ring_buffer_.push_back(s);
+    }
+  }
+
+  cudaStream_t& operator()(void)
+  {
+    auto& res = ring_buffer_[current_index_];
+    current_index_++;
+    if (current_index_ >= buffer_length_) {
+      current_index_ = 0;
+    }
+    return res;
+  }
+
+  void SyncAllStreams(void)
+  {
+    for (const auto& s : ring_buffer_) {
+      CHECK_CUDA_ERROR(cudaStreamSynchronize(s));
+    }
+  }
+
+ protected:
+  size_t buffer_length_;
+  size_t current_index_;
+  std::vector<cudaStream_t> ring_buffer_;
+};
+
 } // namespace cuda

 #endif // MTR__CUDA_HELPER_HPP_
diff --git a/include/mtr/mtr.hpp b/include/mtr/mtr.hpp
index 02a632b..69220e1 100644
--- a/include/mtr/mtr.hpp
+++ b/include/mtr/mtr.hpp
@@ -151,6 +151,7 @@ class TrtMTR
   std::unique_ptr builder_;

   cudaStream_t stream_{nullptr};
+  cuda::StreamRingBuffer copy_streams_;

   IntentionPoint intention_point_;

diff --git a/src/mtr.cpp b/src/mtr.cpp
index 334b83b..7eba972 100644
--- a/src/mtr.cpp
+++ b/src/mtr.cpp
@@ -24,7 +24,8 @@ TrtMTR::TrtMTR(
   const std::string & model_path, const MTRConfig & config, const BuildConfig & build_config,
   const size_t max_workspace_size)
 : config_(config),
-  intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster)
+  intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster),
+  copy_streams_(7)  // 7 is the maximum number of consecutive memory copies in this class
 {
   builder_ = std::make_unique(model_path, build_config, max_workspace_size);
   builder_->setup();
@@ -143,30 +144,33 @@ bool TrtMTR::preProcess(const AgentData & agent_data, const PolylineData & polyl
 {
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_target_index_.get(), agent_data.target_indices().data(), sizeof(int) * num_target_,
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_label_index_.get(), agent_data.label_indices().data(), sizeof(int) * num_agent_,
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_timestamps_.get(), agent_data.timestamps().data(), sizeof(float) * num_timestamp_,
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_trajectory_.get(), agent_data.data_ptr(), sizeof(float) * agent_data.size(),
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_target_state_.get(), agent_data.target_data_ptr(),
-    sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, stream_));
+    sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, copy_streams_()));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_polyline_.get(), polyline_data.data_ptr(), sizeof(float) * polyline_data.size(),
-    cudaMemcpyHostToDevice, stream_));
+    cudaMemcpyHostToDevice, copy_streams_()));

   const auto target_label_names = getLabelNames(agent_data.target_label_indices());
   const auto intention_points = intention_point_.get_points(target_label_names);
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_intention_points_.get(), intention_points.data(),
     sizeof(float) * num_target_ * config_.num_intention_point_cluster * 2, cudaMemcpyHostToDevice,
-    stream_));
+    copy_streams_()));
+
+  // Wait until all memory copies have been done
+  copy_streams_.SyncAllStreams();

   // DEBUG
   event_debugger_.createEvent(stream_);
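
The StreamRingBuffer added above hands out its streams round-robin through operator() and is drained with SyncAllStreams(); that is how preProcess() issues its seven host-to-device copies on separate streams before synchronizing, and constructing the ring with 7 streams in TrtMTR means each of those copies lands on a distinct stream. Below is a minimal, self-contained usage sketch (not part of the patch); the buffer names and sizes are illustrative. Note that cudaMemcpyAsync only overlaps across streams when the host side is pinned memory (cudaMallocHost); copies from pageable host memory are effectively serialized, so the gain from multiple copy streams depends on how the host-side buffers are allocated.

// Minimal usage sketch of cuda::StreamRingBuffer (not part of the patch).
// Host buffers are pinned with cudaMallocHost so the async copies can
// actually overlap; the names h_a/h_b/d_a/d_b and the sizes are illustrative.
#include <cuda_runtime_api.h>

#include "mtr/cuda_helper.hpp"

int main()
{
  constexpr size_t kNumElem = 1 << 20;
  constexpr size_t kBytes = kNumElem * sizeof(float);

  float *h_a = nullptr, *h_b = nullptr, *d_a = nullptr, *d_b = nullptr;
  CHECK_CUDA_ERROR(cudaMallocHost(reinterpret_cast<void **>(&h_a), kBytes));  // pinned host memory
  CHECK_CUDA_ERROR(cudaMallocHost(reinterpret_cast<void **>(&h_b), kBytes));
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_a), kBytes));
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_b), kBytes));

  // Two streams handed out round-robin by operator().
  cuda::StreamRingBuffer copy_streams(2);

  // Each call to copy_streams() returns the next stream in the ring,
  // so these two copies are enqueued on different streams.
  CHECK_CUDA_ERROR(cudaMemcpyAsync(d_a, h_a, kBytes, cudaMemcpyHostToDevice, copy_streams()));
  CHECK_CUDA_ERROR(cudaMemcpyAsync(d_b, h_b, kBytes, cudaMemcpyHostToDevice, copy_streams()));

  // Block until every stream in the ring has drained before using the data.
  copy_streams.SyncAllStreams();

  CHECK_CUDA_ERROR(cudaFree(d_a));
  CHECK_CUDA_ERROR(cudaFree(d_b));
  CHECK_CUDA_ERROR(cudaFreeHost(h_a));
  CHECK_CUDA_ERROR(cudaFreeHost(h_b));
  return 0;
}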