Skip to content

Commit

Permalink
feat: introduce multiple CUDA streams to perform memcpy in parallel
Browse files Browse the repository at this point in the history
Signed-off-by: Manato HIRABAYASHI <[email protected]>
  • Loading branch information
manato committed May 10, 2024
1 parent 75cd53b commit bc94689
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 8 deletions.
38 changes: 38 additions & 0 deletions include/mtr/cuda_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <sstream>
#include <stdexcept>
#include <type_traits>
#include <vector>

#define CHECK_CUDA_ERROR(e) (cuda::check_error(e, __FILE__, __LINE__))

Expand Down Expand Up @@ -132,6 +133,43 @@ class EventDebugger
bool has_event_{false};
};

class StreamRingBuffer
{
public:
StreamRingBuffer (const size_t buffer_length)
: buffer_length_(buffer_length),
current_index_(0)
{
for (size_t i = 0; i < buffer_length_; i++) {
cudaStream_t s;
CHECK_CUDA_ERROR(cudaStreamCreate(&s));
ring_buffer_.push_back(s);
}
}

cudaStream_t& operator()(void)
{
auto& res = ring_buffer_[current_index_];
current_index_++;
if (current_index_ >= buffer_length_) {
current_index_ = 0;
}
return res;
}

void SyncAllStreams(void)
{
for (const auto& s : ring_buffer_) {
CHECK_CUDA_ERROR(cudaStreamSynchronize(s));
}
}

protected:
size_t buffer_length_;
size_t current_index_;
std::vector<cudaStream_t> ring_buffer_;
};

} // namespace cuda

#endif // MTR__CUDA_HELPER_HPP_
1 change: 1 addition & 0 deletions include/mtr/mtr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ class TrtMTR

std::unique_ptr<MTRBuilder> builder_;
cudaStream_t stream_{nullptr};
cuda::StreamRingBuffer copy_streams_;

IntentionPoint intention_point_;

Expand Down
20 changes: 12 additions & 8 deletions src/mtr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ TrtMTR::TrtMTR(
const std::string & model_path, const MTRConfig & config, const BuildConfig & build_config,
const size_t max_workspace_size)
: config_(config),
intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster)
intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster),
copy_streams_(7) // 7 is the maximum number of consecutive memory copy in this class
{
builder_ = std::make_unique<MTRBuilder>(model_path, build_config, max_workspace_size);
builder_->setup();
Expand Down Expand Up @@ -143,30 +144,33 @@ bool TrtMTR::preProcess(const AgentData & agent_data, const PolylineData & polyl
{
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_target_index_.get(), agent_data.target_indices().data(), sizeof(int) * num_target_,
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_label_index_.get(), agent_data.label_indices().data(), sizeof(int) * num_agent_,
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_timestamps_.get(), agent_data.timestamps().data(), sizeof(float) * num_timestamp_,
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_trajectory_.get(), agent_data.data_ptr(), sizeof(float) * agent_data.size(),
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_target_state_.get(), agent_data.target_data_ptr(),
sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, stream_));
sizeof(float) * num_target_ * num_agent_dim_, cudaMemcpyHostToDevice, copy_streams_()));

CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_polyline_.get(), polyline_data.data_ptr(), sizeof(float) * polyline_data.size(),
cudaMemcpyHostToDevice, stream_));
cudaMemcpyHostToDevice, copy_streams_()));

const auto target_label_names = getLabelNames(agent_data.target_label_indices());
const auto intention_points = intention_point_.get_points(target_label_names);
CHECK_CUDA_ERROR(cudaMemcpyAsync(
d_intention_points_.get(), intention_points.data(),
sizeof(float) * num_target_ * config_.num_intention_point_cluster * 2, cudaMemcpyHostToDevice,
stream_));
copy_streams_()));

// Wait until all memory copy have been done
copy_streams_.SyncAllStreams();

// DEBUG
event_debugger_.createEvent(stream_);
Expand Down

0 comments on commit bc94689

Please sign in to comment.