Skip to content

Commit

Permalink
Fix bug where null responses were accidentally excluded from profile e…
Browse files Browse the repository at this point in the history
…xport (#379)

* Fix bug where null responses were accidentally excluded from profile export

* Fix bug

* Address feedback
  • Loading branch information
matthewkotila committed Aug 15, 2023
1 parent 54e3036 commit 2b87928
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 36 deletions.
10 changes: 5 additions & 5 deletions src/c++/perf_analyzer/infer_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ InferContext::SendRequest(
auto total = end_time_sync - start_time_sync;
thread_stat_->request_records_.emplace_back(RequestRecord(
start_time_sync, std::move(end_time_syncs),
infer_data_.options_->sequence_end_, delayed, sequence_id));
infer_data_.options_->sequence_end_, delayed, sequence_id, false));
thread_stat_->status_ =
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
if (!thread_stat_->status_.IsOk()) {
Expand Down Expand Up @@ -257,9 +257,9 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
if (thread_stat_->cb_status_.IsOk() == false) {
return;
}
if (is_null_response == false) {
it->second.response_times_.push_back(
std::chrono::system_clock::now());
it->second.response_times_.push_back(std::chrono::system_clock::now());
if (is_null_response == true) {
it->second.has_null_last_response_ = true;
}
thread_stat_->cb_status_ =
result_ptr->IsFinalResponse(&is_final_response);
Expand All @@ -270,7 +270,7 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
thread_stat_->request_records_.emplace_back(
it->second.start_time_, it->second.response_times_,
it->second.sequence_end_, it->second.delayed_,
it->second.sequence_id_);
it->second.sequence_id_, it->second.has_null_last_response_);
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
thread_stat_->cb_status_ = ValidateOutputs(result);
async_req_map_.erase(request_id);
Expand Down
17 changes: 15 additions & 2 deletions src/c++/perf_analyzer/inference_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1279,15 +1279,28 @@ InferenceProfiler::ValidLatencyMeasurement(
for (size_t i = 0; i < all_request_records_.size(); i++) {
const auto& request_record = all_request_records_[i];
uint64_t request_start_ns = CHRONO_TO_NANOS(request_record.start_time_);
uint64_t request_end_ns =
CHRONO_TO_NANOS(request_record.response_times_.back());
uint64_t request_end_ns;

if (request_record.has_null_last_response_ == false) {
request_end_ns = CHRONO_TO_NANOS(request_record.response_times_.back());
} else if (request_record.response_times_.size() > 1) {
size_t last_response_idx{request_record.response_times_.size() - 2};
request_end_ns =
CHRONO_TO_NANOS(request_record.response_times_[last_response_idx]);
} else {
erase_indices.push_back(i);
continue;
}

if (request_start_ns <= request_end_ns) {
// Only counting requests that end within the time interval
if ((request_end_ns >= valid_range.first) &&
(request_end_ns <= valid_range.second)) {
valid_latencies->push_back(request_end_ns - request_start_ns);
response_count += request_record.response_times_.size();
if (request_record.has_null_last_response_) {
response_count--;
}
erase_indices.push_back(i);
if (request_record.sequence_end_) {
valid_sequence_count++;
Expand Down
8 changes: 6 additions & 2 deletions src/c++/perf_analyzer/request_record.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ struct RequestRecord {
std::chrono::time_point<std::chrono::system_clock> start_time,
std::vector<std::chrono::time_point<std::chrono::system_clock>>
response_times,
bool sequence_end, bool delayed, uint64_t sequence_id)
bool sequence_end, bool delayed, uint64_t sequence_id,
bool has_null_last_response)
: start_time_(start_time), response_times_(response_times),
sequence_end_(sequence_end), delayed_(delayed),
sequence_id_(sequence_id)
sequence_id_(sequence_id),
has_null_last_response_(has_null_last_response)
{
}
// The timestamp of when the request was started.
Expand All @@ -56,6 +58,8 @@ struct RequestRecord {
bool delayed_;
// Sequence ID of the request
uint64_t sequence_id_;
// Whether the last response is null
bool has_null_last_response_;
};

}} // namespace triton::perfanalyzer
65 changes: 46 additions & 19 deletions src/c++/perf_analyzer/test_inference_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -179,32 +179,32 @@ TEST_CASE("testing the ValidLatencyMeasurement function")
// current window
RequestRecord(
time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
false, 0),
false, 0, false),

// request starts before window starts and ends inside window: included in
// current window
RequestRecord(
time_point(ns(3)), std::vector<time_point>{time_point(ns(5))}, 0,
false, 0),
false, 0, false),

// requests start and end inside window: included in current window
RequestRecord(
time_point(ns(6)), std::vector<time_point>{time_point(ns(9))}, 0,
false, 0),
false, 0, false),
RequestRecord(
time_point(ns(10)), std::vector<time_point>{time_point(ns(14))}, 0,
false, 0),
false, 0, false),

// request starts before window ends and ends after window ends: not
// included in current window
RequestRecord(
time_point(ns(15)), std::vector<time_point>{time_point(ns(20))}, 0,
false, 0),
false, 0, false),

// request starts after window ends: not included in current window
RequestRecord(
time_point(ns(21)), std::vector<time_point>{time_point(ns(27))}, 0,
false, 0)};
false, 0, false)};

TestInferenceProfiler::ValidLatencyMeasurement(
window, valid_sequence_count, delayed_request_count, &latencies,
Expand Down Expand Up @@ -882,17 +882,44 @@ TEST_CASE(
request1_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{
response1_timestamp, response2_timestamp},
0, false, 0)};
0, false, 0, false)};

auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
auto request_record2{RequestRecord(
request2_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{
response3_timestamp, response4_timestamp, response5_timestamp},
0, false, 0)};
RequestRecord request_record2{};
size_t expected_response_count{0};

SUBCASE("second request has three data responses")
{
auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
request_record2 = RequestRecord(
request2_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{
response3_timestamp, response4_timestamp, response5_timestamp},
0, false, 0, false);
expected_response_count = 5;
}
SUBCASE("second request has two data responses and one null response")
{
auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
request_record2 = RequestRecord(
request2_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{
response3_timestamp, response4_timestamp, response5_timestamp},
0, false, 0, true);
expected_response_count = 4;
}
SUBCASE("second request has one null response")
{
request_record2 = RequestRecord(
request2_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{}, 0,
false, 0, true);
expected_response_count = 2;
}

mock_inference_profiler.all_request_records_ = {
request_record1, request_record2};
Expand All @@ -909,7 +936,7 @@ TEST_CASE(
valid_range, valid_sequence_count, delayed_request_count,
&valid_latencies, response_count, valid_requests);

CHECK(response_count == 5);
CHECK(response_count == expected_response_count);
}
SUBCASE("testing logic relevant to valid request output")
{
Expand All @@ -921,23 +948,23 @@ TEST_CASE(
request1_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{
response1_timestamp},
0, false, 0)};
0, false, 0, false)};

auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(3)};
auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
auto request_record2{RequestRecord(
request2_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{
response2_timestamp},
0, false, 0)};
0, false, 0, false)};

auto request3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
auto request_record3{RequestRecord(
request3_timestamp,
std::vector<std::chrono::time_point<std::chrono::system_clock>>{
response3_timestamp},
0, false, 0)};
0, false, 0, false)};

mock_inference_profiler.all_request_records_ = {
request_record1, request_record2, request_record3};
Expand Down
16 changes: 8 additions & 8 deletions src/c++/perf_analyzer/test_load_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ operator==(const RequestRecord& lhs, const RequestRecord& rhs)
{
return std::tie(
lhs.start_time_, lhs.response_times_, lhs.sequence_end_,
lhs.delayed_, lhs.sequence_id_) ==
lhs.delayed_, lhs.sequence_id_, lhs.has_null_last_response_) ==
std::tie(
rhs.start_time_, rhs.response_times_, rhs.sequence_end_,
rhs.delayed_, rhs.sequence_id_);
rhs.delayed_, rhs.sequence_id_, rhs.has_null_last_response_);
}

} // namespace
Expand Down Expand Up @@ -134,13 +134,13 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
using ns = std::chrono::nanoseconds;
auto request_record1 = RequestRecord(
time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0, false,
0);
0, false);
auto request_record2 = RequestRecord(
time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0, false,
0);
0, false);
auto request_record3 = RequestRecord(
time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0, false,
0);
0, false);

std::vector<RequestRecord> source_request_records;

Expand Down Expand Up @@ -295,13 +295,13 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
using ns = std::chrono::nanoseconds;
auto request_record1 = RequestRecord(
time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0, false,
0);
0, false);
auto request_record2 = RequestRecord(
time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0, false,
0);
0, false);
auto request_record3 = RequestRecord(
time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0, false,
0);
0, false);

SUBCASE("No threads")
{
Expand Down

0 comments on commit 2b87928

Please sign in to comment.