From 0094d30dbe33099e9f70fc7fdd35b7927fcc04ce Mon Sep 17 00:00:00 2001
From: Matthew Kotila <matthew.r.kotila@gmail.com>
Date: Thu, 10 Aug 2023 00:53:00 +0000
Subject: [PATCH 1/3] Fix bug where null response were accidentally excluded
 from profile export

---
 src/c++/perf_analyzer/infer_context.cc           |  8 ++++----
 src/c++/perf_analyzer/inference_profiler.cc      | 14 ++++++++++++--
 src/c++/perf_analyzer/request_record.h           |  8 ++++++--
 src/c++/perf_analyzer/test_inference_profiler.cc | 16 ++++++++--------
 src/c++/perf_analyzer/test_load_manager.cc       | 16 ++++++++--------
 5 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
index f020cd8fa..f5381e89a 100644
--- a/src/c++/perf_analyzer/infer_context.cc
+++ b/src/c++/perf_analyzer/infer_context.cc
@@ -167,7 +167,7 @@ InferContext::SendRequest(
       auto total = end_time_sync - start_time_sync;
       thread_stat_->request_records_.emplace_back(RequestRecord(
           start_time_sync, std::move(end_time_syncs),
-          infer_data_.options_->sequence_end_, delayed, sequence_id));
+          infer_data_.options_->sequence_end_, delayed, sequence_id, false));
       thread_stat_->status_ =
           infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
       if (!thread_stat_->status_.IsOk()) {
@@ -257,9 +257,9 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
         if (thread_stat_->cb_status_.IsOk() == false) {
           return;
         }
+        it->second.response_times_.push_back(std::chrono::system_clock::now());
         if (is_null_response == false) {
-          it->second.response_times_.push_back(
-              std::chrono::system_clock::now());
+          it->second.has_null_last_response_ = true;
         }
         thread_stat_->cb_status_ =
             result_ptr->IsFinalResponse(&is_final_response);
@@ -270,7 +270,7 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
           thread_stat_->request_records_.emplace_back(
               it->second.start_time_, it->second.response_times_,
               it->second.sequence_end_, it->second.delayed_,
-              it->second.sequence_id_);
+              it->second.sequence_id_, it->second.has_null_last_response_);
           infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
           thread_stat_->cb_status_ = ValidateOutputs(result);
           async_req_map_.erase(request_id);
diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
index a0b0103d3..cb870f6f7 100644
--- a/src/c++/perf_analyzer/inference_profiler.cc
+++ b/src/c++/perf_analyzer/inference_profiler.cc
@@ -1279,8 +1279,18 @@ InferenceProfiler::ValidLatencyMeasurement(
   for (size_t i = 0; i < all_request_records_.size(); i++) {
     const auto& request_record = all_request_records_[i];
     uint64_t request_start_ns = CHRONO_TO_NANOS(request_record.start_time_);
-    uint64_t request_end_ns =
-        CHRONO_TO_NANOS(request_record.response_times_.back());
+    uint64_t request_end_ns;
+
+    if (request_record.has_null_last_response_ == false) {
+      request_end_ns = CHRONO_TO_NANOS(request_record.response_times_.back());
+    } else if (request_record.response_times_.size() > 1) {
+      size_t last_response_idx{request_record.response_times_.size() - 2};
+      request_end_ns =
+          CHRONO_TO_NANOS(request_record.response_times_[last_response_idx]);
+    } else {
+      erase_indices.push_back(i);
+      continue;
+    }
 
     if (request_start_ns <= request_end_ns) {
       // Only counting requests that end within the time interval
diff --git a/src/c++/perf_analyzer/request_record.h b/src/c++/perf_analyzer/request_record.h
index ba441549a..0c9ca9263 100644
--- a/src/c++/perf_analyzer/request_record.h
+++ b/src/c++/perf_analyzer/request_record.h
@@ -39,10 +39,12 @@ struct RequestRecord {
       std::chrono::time_point<std::chrono::system_clock> start_time,
       std::vector<std::chrono::time_point<std::chrono::system_clock>>
           response_times,
-      bool sequence_end, bool delayed, uint64_t sequence_id)
+      bool sequence_end, bool delayed, uint64_t sequence_id,
+      bool has_null_last_response)
       : start_time_(start_time), response_times_(response_times),
         sequence_end_(sequence_end), delayed_(delayed),
-        sequence_id_(sequence_id)
+        sequence_id_(sequence_id),
+        has_null_last_response_(has_null_last_response)
   {
   }
   // The timestamp of when the request was started.
@@ -56,6 +58,8 @@ struct RequestRecord {
   bool delayed_;
   // Sequence ID of the request
   uint64_t sequence_id_;
+  // Whether the last response is null
+  bool has_null_last_response_;
 };
 
 }}  // namespace triton::perfanalyzer
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
index c72445d72..2c4a47d20 100644
--- a/src/c++/perf_analyzer/test_inference_profiler.cc
+++ b/src/c++/perf_analyzer/test_inference_profiler.cc
@@ -179,32 +179,32 @@ TEST_CASE("testing the ValidLatencyMeasurement function")
       // current window
       RequestRecord(
           time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
-          false, 0),
+          false, 0, false),
 
       // request starts before window starts and ends inside window: included in
       // current window
       RequestRecord(
           time_point(ns(3)), std::vector<time_point>{time_point(ns(5))}, 0,
-          false, 0),
+          false, 0, false),
 
       // requests start and end inside window: included in current window
       RequestRecord(
           time_point(ns(6)), std::vector<time_point>{time_point(ns(9))}, 0,
-          false, 0),
+          false, 0, false),
       RequestRecord(
           time_point(ns(10)), std::vector<time_point>{time_point(ns(14))}, 0,
-          false, 0),
+          false, 0, false),
 
       // request starts before window ends and ends after window ends: not
       // included in current window
       RequestRecord(
           time_point(ns(15)), std::vector<time_point>{time_point(ns(20))}, 0,
-          false, 0),
+          false, 0, false),
 
       // request starts after window ends: not included in current window
       RequestRecord(
           time_point(ns(21)), std::vector<time_point>{time_point(ns(27))}, 0,
-          false, 0)};
+          false, 0, false)};
 
   TestInferenceProfiler::ValidLatencyMeasurement(
       window, valid_sequence_count, delayed_request_count, &latencies,
@@ -882,7 +882,7 @@ TEST_CASE(
         request1_timestamp,
         std::vector<std::chrono::time_point<std::chrono::system_clock>>{
             response1_timestamp, response2_timestamp},
-        0, false, 0)};
+        0, false, 0, false)};
 
     auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
     auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
@@ -892,7 +892,7 @@ TEST_CASE(
         request2_timestamp,
         std::vector<std::chrono::time_point<std::chrono::system_clock>>{
             response3_timestamp, response4_timestamp, response5_timestamp},
-        0, false, 0)};
+        0, false, 0, false)};
 
     mock_inference_profiler.all_request_records_ = {
         request_record1, request_record2};
diff --git a/src/c++/perf_analyzer/test_load_manager.cc b/src/c++/perf_analyzer/test_load_manager.cc
index ada49e25b..2a11e0749 100644
--- a/src/c++/perf_analyzer/test_load_manager.cc
+++ b/src/c++/perf_analyzer/test_load_manager.cc
@@ -40,10 +40,10 @@ operator==(const RequestRecord& lhs, const RequestRecord& rhs)
 {
   return std::tie(
              lhs.start_time_, lhs.response_times_, lhs.sequence_end_,
-             lhs.delayed_, lhs.sequence_id_) ==
+             lhs.delayed_, lhs.sequence_id_, lhs.has_null_last_response_) ==
          std::tie(
              rhs.start_time_, rhs.response_times_, rhs.sequence_end_,
-             rhs.delayed_, rhs.sequence_id_);
+             rhs.delayed_, rhs.sequence_id_, rhs.has_null_last_response_);
 }
 
 }  // namespace
@@ -134,13 +134,13 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
     using ns = std::chrono::nanoseconds;
     auto request_record1 = RequestRecord(
         time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0, false,
-        0);
+        0, false);
     auto request_record2 = RequestRecord(
         time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0, false,
-        0);
+        0, false);
     auto request_record3 = RequestRecord(
         time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0, false,
-        0);
+        0, false);
 
     std::vector<RequestRecord> source_request_records;
 
@@ -295,13 +295,13 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
     using ns = std::chrono::nanoseconds;
     auto request_record1 = RequestRecord(
         time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0, false,
-        0);
+        0, false);
     auto request_record2 = RequestRecord(
         time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0, false,
-        0);
+        0, false);
     auto request_record3 = RequestRecord(
         time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0, false,
-        0);
+        0, false);
 
     SUBCASE("No threads")
     {

From a704f59f9158c6203a1b2ae513f1c4e9e06c13df Mon Sep 17 00:00:00 2001
From: Matthew Kotila <matthew.r.kotila@gmail.com>
Date: Thu, 10 Aug 2023 06:58:13 +0000
Subject: [PATCH 2/3] Fix bug

---
 src/c++/perf_analyzer/infer_context.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
index f5381e89a..16855ca53 100644
--- a/src/c++/perf_analyzer/infer_context.cc
+++ b/src/c++/perf_analyzer/infer_context.cc
@@ -258,7 +258,7 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
           return;
         }
         it->second.response_times_.push_back(std::chrono::system_clock::now());
-        if (is_null_response == false) {
+        if (is_null_response == true) {
           it->second.has_null_last_response_ = true;
         }
         thread_stat_->cb_status_ =

From c1902bb558132cfa8cfdeea7265fde454ff55042 Mon Sep 17 00:00:00 2001
From: Matthew Kotila <matthew.r.kotila@gmail.com>
Date: Thu, 10 Aug 2023 22:31:22 +0000
Subject: [PATCH 3/3] Address feedback

---
 src/c++/perf_analyzer/inference_profiler.cc   |  3 ++
 .../perf_analyzer/test_inference_profiler.cc  | 51 ++++++++++++++-----
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
index cb870f6f7..da5acf255 100644
--- a/src/c++/perf_analyzer/inference_profiler.cc
+++ b/src/c++/perf_analyzer/inference_profiler.cc
@@ -1298,6 +1298,9 @@ InferenceProfiler::ValidLatencyMeasurement(
           (request_end_ns <= valid_range.second)) {
         valid_latencies->push_back(request_end_ns - request_start_ns);
         response_count += request_record.response_times_.size();
+        if (request_record.has_null_last_response_) {
+          response_count--;
+        }
         erase_indices.push_back(i);
         if (request_record.sequence_end_) {
           valid_sequence_count++;
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
index 2c4a47d20..5a41319bd 100644
--- a/src/c++/perf_analyzer/test_inference_profiler.cc
+++ b/src/c++/perf_analyzer/test_inference_profiler.cc
@@ -885,14 +885,41 @@ TEST_CASE(
         0, false, 0, false)};
 
     auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
-    auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
-    auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
-    auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
-    auto request_record2{RequestRecord(
-        request2_timestamp,
-        std::vector<std::chrono::time_point<std::chrono::system_clock>>{
-            response3_timestamp, response4_timestamp, response5_timestamp},
-        0, false, 0, false)};
+    RequestRecord request_record2{};
+    size_t expected_response_count{0};
+
+    SUBCASE("second request has three data responses")
+    {
+      auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
+      auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
+      auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
+      request_record2 = RequestRecord(
+          request2_timestamp,
+          std::vector<std::chrono::time_point<std::chrono::system_clock>>{
+              response3_timestamp, response4_timestamp, response5_timestamp},
+          0, false, 0, false);
+      expected_response_count = 5;
+    }
+    SUBCASE("second request has two data responses and one null response")
+    {
+      auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
+      auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
+      auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
+      request_record2 = RequestRecord(
+          request2_timestamp,
+          std::vector<std::chrono::time_point<std::chrono::system_clock>>{
+              response3_timestamp, response4_timestamp, response5_timestamp},
+          0, false, 0, true);
+      expected_response_count = 4;
+    }
+    SUBCASE("second request has one null response")
+    {
+      request_record2 = RequestRecord(
+          request2_timestamp,
+          std::vector<std::chrono::time_point<std::chrono::system_clock>>{}, 0,
+          false, 0, true);
+      expected_response_count = 2;
+    }
 
     mock_inference_profiler.all_request_records_ = {
         request_record1, request_record2};
@@ -909,7 +936,7 @@ TEST_CASE(
         valid_range, valid_sequence_count, delayed_request_count,
         &valid_latencies, response_count, valid_requests);
 
-    CHECK(response_count == 5);
+    CHECK(response_count == expected_response_count);
   }
   SUBCASE("testing logic relevant to valid request output")
   {
@@ -921,7 +948,7 @@ TEST_CASE(
         request1_timestamp,
         std::vector<std::chrono::time_point<std::chrono::system_clock>>{
             response1_timestamp},
-        0, false, 0)};
+        0, false, 0, false)};
 
     auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(3)};
     auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
@@ -929,7 +956,7 @@ TEST_CASE(
         request2_timestamp,
         std::vector<std::chrono::time_point<std::chrono::system_clock>>{
             response2_timestamp},
-        0, false, 0)};
+        0, false, 0, false)};
 
     auto request3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
     auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
@@ -937,7 +964,7 @@ TEST_CASE(
         request3_timestamp,
         std::vector<std::chrono::time_point<std::chrono::system_clock>>{
             response3_timestamp},
-        0, false, 0)};
+        0, false, 0, false)};
 
     mock_inference_profiler.all_request_records_ = {
         request_record1, request_record2, request_record3};