feat: Metrics Support in tritonfrontend #7703

Open · wants to merge 48 commits into base: main
Changes from 44 commits (48 commits total)
59d9aa8
conditional http/grpc endpoints
KrishnanPrash Sep 6, 2024
0b66a27
Adding tritonfrontend.whl to build.py instructions
KrishnanPrash Sep 6, 2024
173216e
Setting enable_tracing flag to 0
KrishnanPrash Sep 27, 2024
c9fa783
testing conditional appending of tracing lib
KrishnanPrash Sep 27, 2024
384da14
Merge remote-tracking branch 'origin/main' into kprashanth-conditiona…
KrishnanPrash Oct 2, 2024
40da2be
linker error
KrishnanPrash Oct 3, 2024
f08de59
Working conditional builds w Tracing=ON
KrishnanPrash Oct 4, 2024
f3fc425
Adding comments
KrishnanPrash Oct 4, 2024
a657cae
Making top-level imports more specific
KrishnanPrash Oct 4, 2024
be36c42
Fixing imports
KrishnanPrash Oct 4, 2024
80ee6e5
Catching specfic error
KrishnanPrash Oct 4, 2024
723df84
Merge remote-tracking branch 'origin/main' into kprashanth-conditiona…
KrishnanPrash Oct 9, 2024
2e2108c
Metrics Support
KrishnanPrash Oct 9, 2024
d7971b7
Working test_metrics_custom_port()
KrishnanPrash Oct 9, 2024
23b4beb
update docs with metrics support
KrishnanPrash Oct 9, 2024
6ebcdc9
Smoke tests for Metrics Bindings
KrishnanPrash Oct 9, 2024
1289f34
casting float to int for same type comparison
KrishnanPrash Oct 9, 2024
ac8e23d
removing comment
KrishnanPrash Oct 9, 2024
48acc3e
remove TODO comment
KrishnanPrash Oct 9, 2024
c6efa9e
Cleaning up build.py and CMake
KrishnanPrash Oct 9, 2024
2932600
Minimal working CMake configuration
KrishnanPrash Oct 9, 2024
73e1782
updating identity model to use CPU only
KrishnanPrash Oct 9, 2024
1f09417
removing debug statements
KrishnanPrash Oct 9, 2024
5e0df4e
Merge remote-tracking branch 'origin/kprashanth-conditional-endpoints…
KrishnanPrash Oct 9, 2024
43bd2ab
Updated documentation and removed TODO comments
KrishnanPrash Oct 9, 2024
88a710d
fixing spacing and removing unused imports
KrishnanPrash Oct 11, 2024
e0abc3b
removed todo comment
KrishnanPrash Oct 11, 2024
85d7676
moving to support library
KrishnanPrash Oct 15, 2024
8f0b4e1
making tracing lib links public
KrishnanPrash Oct 15, 2024
7607884
Making comments consistent
KrishnanPrash Oct 15, 2024
a88be2d
cleaning up includes
KrishnanPrash Oct 15, 2024
c7503b3
spacing
KrishnanPrash Oct 15, 2024
569c68d
fixing order
KrishnanPrash Oct 15, 2024
bd5c0b5
formatting
KrishnanPrash Oct 15, 2024
378fd2d
Merge branch 'kprashanth-conditional-endpoints' into kprashanth-trito…
KrishnanPrash Oct 15, 2024
82897e1
resolved merge conflicts
KrishnanPrash Oct 18, 2024
246e380
CMake changes
KrishnanPrash Oct 18, 2024
80e88b5
pre-commit changes
KrishnanPrash Oct 18, 2024
acc9e9e
removing redundant parameters
KrishnanPrash Oct 22, 2024
4402e3d
Spacing and comments
KrishnanPrash Oct 22, 2024
2712558
refactor: moving `tritonfrontend` to `@handle_triton_error` decorator…
KrishnanPrash Oct 22, 2024
8e7bba1
removed unused import
KrishnanPrash Oct 22, 2024
ba14f56
spacing
KrishnanPrash Oct 22, 2024
f17dc17
change default metrics thread count
KrishnanPrash Oct 23, 2024
717ee47
fixing type info
KrishnanPrash Oct 24, 2024
d53db9b
fixing default and lower bound on thread count
KrishnanPrash Oct 24, 2024
d2939cb
making error throwing consistent
KrishnanPrash Oct 24, 2024
46121ff
Adding guards around frontend-specific code
KrishnanPrash Oct 28, 2024
8 changes: 6 additions & 2 deletions docs/customization_guide/tritonfrontend.md
@@ -57,14 +57,18 @@ Note: `model_path` may need to be edited depending on your setup.

2. Now, to start up the respective services with `tritonfrontend`
```python
from tritonfrontend import KServeHttp, KServeGrpc
from tritonfrontend import KServeHttp, KServeGrpc, Metrics
Collaborator comment:

I don't love that the Metrics object is a web server, so it makes me wonder if we should rename these down the line, e.g., KServeHttpService, MetricsService, etc.

But I don't have a strong opinion on an alternative right now, so I think it's fine; just mentioning it for later. We will probably be restructuring some packaging and naming in the near to mid term.

http_options = KServeHttp.Options(thread_count=5)
http_service = KServeHttp(server, http_options)
http_service.start()

# Default options (if none provided)
grpc_service = KServeGrpc(server)
grpc_service.start()

# Can start metrics service as well
metrics_service = Metrics(server)
metrics_service.start()
```

3. Finally, with the services running, we can use `tritonclient` or simple `curl` commands to send requests and receive responses from the frontends.
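For example, a minimal sketch of checking the new metrics service from Python (this assumes the default metrics address of `localhost:8002` used elsewhere in this PR; adjust it if you passed custom `Metrics.Options`):

```python
import requests

# Query the Prometheus-format metrics exposed by the Metrics service.
response = requests.get("http://localhost:8002/metrics")
print(response.status_code)  # 200 while the service is running
print(response.text[:300])   # e.g. nv_inference_count{...} counters
```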
@@ -97,6 +101,7 @@ print("[INFERENCE RESULTS]")
print("Output data:", output_data)

# Stop respective services and server.
metrics_service.stop()
http_service.stop()
grpc_service.stop()
server.stop()
@@ -139,7 +144,6 @@ With this workflow, you can avoid having to stop each service after client requests
- The following features are not currently supported when launching the Triton frontend services through the python bindings:
- [Tracing](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/trace.md)
- [Shared Memory](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md)
- [Metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md)
- [Restricted Protocols](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#limit-endpoint-access-beta)
- VertexAI
- Sagemaker
75 changes: 74 additions & 1 deletion qa/L0_python_api/test_kserve.py
@@ -34,7 +34,7 @@
import tritonclient.http as httpclient
import tritonserver
from tritonclient.utils import InferenceServerException
from tritonfrontend import KServeGrpc, KServeHttp
from tritonfrontend import KServeGrpc, KServeHttp, Metrics


class TestHttpOptions:
@@ -78,8 +78,25 @@ def test_wrong_grpc_parameters(self):
KServeGrpc.Options(server_key=10)


class TestMetricsOptions:
def test_correct_http_parameters(self):
Metrics.Options(address="0.0.0.1", port=8080, thread_count=16)

def test_wrong_http_parameters(self):
# Out of range
with pytest.raises(Exception):
Metrics.Options(port=-15)
with pytest.raises(Exception):
Metrics.Options(thread_count=-5)

# Wrong data type
with pytest.raises(Exception):
Metrics.Options(thread_count="ten")


HTTP_ARGS = (KServeHttp, httpclient, "localhost:8000") # Default HTTP args
GRPC_ARGS = (KServeGrpc, grpcclient, "localhost:8001") # Default GRPC args
METRICS_ARGS = (Metrics, "localhost:8002") # Default Metrics args


class TestKServe:
@@ -271,6 +288,62 @@ def callback(user_data, result, error):
utils.teardown_client(grpc_client)
utils.teardown_server(server)

@pytest.mark.parametrize("frontend, url", [METRICS_ARGS])
def test_metrics_default_port(self, frontend, url):
server = utils.setup_server()
service = utils.setup_service(server, frontend)

metrics_url = f"http://{url}/metrics"
status_code, _ = utils.get_metrics(metrics_url)

assert status_code == 200

utils.teardown_service(service)
utils.teardown_server(server)

@pytest.mark.parametrize("frontend", [Metrics])
def test_metrics_custom_port(self, frontend, port=8005):
server = utils.setup_server()
service = utils.setup_service(server, frontend, Metrics.Options(port=port))

metrics_url = f"http://localhost:{port}/metrics"
status_code, _ = utils.get_metrics(metrics_url)

assert status_code == 200

utils.teardown_service(service)
utils.teardown_server(server)

@pytest.mark.parametrize("frontend, url", [METRICS_ARGS])
def test_metrics_update(self, frontend, url):
# For this test, set up the Server, a KServeGrpc frontend, and a Metrics frontend
server = utils.setup_server()
grpc_service = utils.setup_service(
server, KServeGrpc
) # Needed to send inference request
metrics_service = utils.setup_service(server, frontend)

# Get Metrics and verify inference count == 0 before inference
before_status_code, before_inference_count = utils.get_metrics(
f"http://{url}/metrics"
)
assert before_status_code == 200 and before_inference_count == 0

# Send 1 Inference Request with send_and_test_inference()
assert utils.send_and_test_inference_identity(GRPC_ARGS[1], GRPC_ARGS[2])

# Get Metrics and verify inference count == 1 after inference
after_status_code, after_inference_count = utils.get_metrics(
f"http://{url}/metrics"
)
assert after_status_code == 200 and after_inference_count == 1

# Teardown Metrics, GrpcService, Server
utils.teardown_service(grpc_service)
utils.teardown_service(metrics_service)
utils.teardown_server(server)

# KNOWN ISSUE: CAUSES SEGFAULT
# Created [DLIS-7231] to address at future date
# Once the server has been stopped, the underlying TRITONSERVER_Server instance
8 changes: 7 additions & 1 deletion qa/L0_python_api/test_model_repository/identity/config.pbtxt
@@ -41,4 +41,10 @@ output [
data_type: TYPE_STRING
dims: [ 1 ]
}
]
]
instance_group [
{
count: 1
Collaborator comment:

Did you ever investigate the GPU label issue?

Contributor Author reply:

Investigated a bit but did not find the root cause. I will create a ticket in my backlog, hopefully with a more consistent reproducer.

kind : KIND_CPU
}
]
88 changes: 71 additions & 17 deletions qa/L0_python_api/testing_utils.py
@@ -25,21 +25,20 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import queue
import re
from functools import partial
from typing import Union
from typing import Tuple, Union

import numpy as np
import requests
import tritonserver
from tritonclient.utils import InferenceServerException
from tritonfrontend import KServeGrpc, KServeHttp

# TODO: Re-Format documentation to fit:
# https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings
from tritonfrontend import KServeGrpc, KServeHttp, Metrics


def setup_server(model_repository="test_model_repository") -> tritonserver.Server:
"""
Using tritonserver, starts a server with the models: identity and delayed_identity
"""
module_directory = os.path.split(os.path.abspath(__file__))[0]
model_path = os.path.abspath(os.path.join(module_directory, model_repository))

@@ -61,9 +60,12 @@ def teardown_server(server: tritonserver.Server) -> None:

def setup_service(
server: tritonserver.Server,
frontend: Union[KServeHttp, KServeGrpc],
frontend: Union[KServeHttp, KServeGrpc, Metrics],
options=None,
) -> Union[KServeHttp, KServeGrpc]:
) -> Union[KServeHttp, KServeGrpc, Metrics]:
"""
Used to create and start any of the frontends supported by tritonfrontend.
"""
service = frontend(server=server, options=options)
service.start()
return service
@@ -73,16 +75,31 @@ def teardown_service(service: Union[KServeHttp, KServeGrpc]) -> None:
service.stop()


def setup_client(frontend_client, url: str):
def setup_client(
frontend_client: Union["tritonclient.http", "tritonclient.grpc"], url: str
):
"""
Sets up a client to communicate with the Server through the respective protocol.
"""
return frontend_client.InferenceServerClient(url=url)


def teardown_client(client) -> None:
def teardown_client(
client: Union[
"tritonclient.http.InferenceServerClient",
"tritonclient.grpc.InferenceServerClient",
Collaborator comment on lines +89 to +90:

I think this type hint is correct, and the other places where you use Union are missing InferenceServerClient. You could probably also use InferenceServerClientBase, though it'd be a bit less strict.

]
) -> None:
client.close()
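As a side note on the reviewer's suggestion above, here is a minimal sketch of a shared type alias built from the concrete client classes (whether `InferenceServerClientBase` exists and where it lives in `tritonclient` is not assumed here):

```python
from typing import Union

import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient

# One alias that the setup/teardown/inference helpers could all reuse.
TritonClient = Union[
    httpclient.InferenceServerClient,
    grpcclient.InferenceServerClient,
]


def teardown_client(client: TritonClient) -> None:
    # Both client flavors expose close().
    client.close()
```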


# Sends an inference to test_model_repository/identity model and verifies input == output.
def send_and_test_inference_identity(frontend_client, url: str) -> bool:
def send_and_test_inference_identity(
frontend_client: Union["tritonclient.http", "tritonclient.grpc"], url: str
Collaborator comment:

See the other comment on type hints; apply it throughout.

) -> bool:
"""
Sends an inference request to the model at test_model_repository/identity
and verifies input == output
"""
model_name = "identity"
client = setup_client(frontend_client, url)
input_data = np.array(["testing"], dtype=object)
@@ -102,9 +119,13 @@ def send_and_test_inference_identity(frontend_client, url: str) -> bool:
return input_data[0] == output_data[0].decode()


# Sends multiple streaming requests to "delayed_identity" model with negligible delays,
# and verifies the inputs matches outputs and the ordering is preserved.
def send_and_test_stream_inference(frontend_client, url: str) -> bool:
def send_and_test_stream_inference(
frontend_client: Union["tritonclient.http", "tritonclient.grpc"], url: str
) -> bool:
"""
Sends multiple streaming requests to "delayed_identity" model with negligible delays
and verifies the inputs match the outputs and the ordering is preserved.
"""
num_requests = 100
requests = []
for i in range(num_requests):
@@ -135,14 +156,18 @@ def callback(responses, result, error):


def send_and_test_generate_inference() -> bool:
"""
Sends an inference request to the identity model through the
HTTP generate endpoint and verifies input == output
"""
model_name = "identity"
url = f"http://localhost:8000/v2/models/{model_name}/generate"
input_text = "testing"
data = {
"INPUT0": input_text,
}

response = requests.post(url, json=data, stream=True)
response = requests.post(url, json=data)
if response.status_code == 200:
result = response.json()
output_text = result.get("OUTPUT0", "")
@@ -151,3 +176,32 @@
return True

return False


def get_metrics(metrics_url: str, model_name: str = "identity") -> Tuple[int, int]:
"""
Sends a request to the metrics endpoint and returns the following information:
1. Status Code = indicates whether the interaction with the metrics endpoint was successful
2. Inference Count = used to verify that the metrics data being returned is accurate
"""
response = requests.get(metrics_url)
inference_count = None

if response.status_code == 200:
inference_count = _extract_inference_count(response.text, model_name)
return response.status_code, inference_count


def _extract_inference_count(metrics_data: str, model_name: str):
"""
Helper function for get_metrics that parses metrics_data (Prometheus-friendly
format) with a regex to extract the inference count of model_name.
"""
pattern = (
rf'nv_inference_count\{{.*?model="{re.escape(model_name)}".*?\}}\s+([0-9.]+)'
)
match = re.search(pattern, metrics_data)
if match:
return int(float(match.group(1)))

return None
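As an illustration of what this helper parses, here is a hand-written sample line in the `nv_inference_count` format (the values below are made up for illustration only):

```python
# Run from qa/L0_python_api so that testing_utils is importable.
from testing_utils import _extract_inference_count

sample = 'nv_inference_count{model="identity",version="1"} 3'
print(_extract_inference_count(sample, "identity"))  # 3
print(_extract_inference_count(sample, "resnet"))    # None (model not in the data)
```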
16 changes: 16 additions & 0 deletions src/http_server.cc
@@ -364,6 +364,22 @@ HTTPMetricsServer::Create(
return nullptr;
}

TRITONSERVER_Error*
HTTPMetricsServer::Create(
std::shared_ptr<TRITONSERVER_Server>& server,
const UnorderedMapType& options, std::unique_ptr<HTTPServer>* service)
{
int port;
std::string address;
int thread_count;

RETURN_IF_ERR(GetValue(options, "port", &port));
RETURN_IF_ERR(GetValue(options, "address", &address));
RETURN_IF_ERR(GetValue(options, "thread_count", &thread_count));

return Create(server, port, address, thread_count, service);
}

#endif // TRITON_ENABLE_METRICS
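For reference, the option keys this new overload reads via `GetValue()` can be pictured from the Python side as a plain map (a sketch for illustration; the defaults shown and how `Metrics.Options` is marshalled into this map by the bindings are assumptions here):

```python
# Keys consumed by HTTPMetricsServer::Create(server, options, service) above.
metrics_options = {
    "port": 8002,          # int, metrics port (8002 is the usual default)
    "address": "0.0.0.0",  # str, bind address (assumed default)
    "thread_count": 1,     # int, worker threads (assumed default)
}
```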

namespace {
4 changes: 4 additions & 0 deletions src/http_server.h
@@ -134,6 +134,10 @@ class HTTPMetricsServer : public HTTPServer {
std::string address, int thread_cnt,
std::unique_ptr<HTTPServer>* metrics_server);

static TRITONSERVER_Error* Create(
std::shared_ptr<TRITONSERVER_Server>& server,
const UnorderedMapType& options, std::unique_ptr<HTTPServer>* service);

~HTTPMetricsServer() = default;

private:
10 changes: 8 additions & 2 deletions src/python/tritonfrontend/__init__.py
@@ -30,13 +30,19 @@
from importlib.metadata import PackageNotFoundError, version

try:
from tritonfrontend._api._kservehttp import KServeHttp
from tritonfrontend._api import KServeHttp
except ImportError:
# TRITON_ENABLE_HTTP=OFF
pass

try:
from tritonfrontend._api._kservegrpc import KServeGrpc
from tritonfrontend._api import KServeGrpc
except ImportError:
# TRITON_ENABLE_GRPC=OFF
pass

try:
from tritonfrontend._api import Metrics
except ImportError:
# TRITON_ENABLE_METRICS=OFF
pass
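Because every frontend import above is wrapped in try/except, a given build may expose only a subset of these names. A small sketch of how a caller could check what a particular `tritonfrontend` build provides:

```python
import tritonfrontend

# Names missing from the package correspond to endpoints compiled out,
# e.g. TRITON_ENABLE_METRICS=OFF removes Metrics.
available = [
    name
    for name in ("KServeHttp", "KServeGrpc", "Metrics")
    if hasattr(tritonfrontend, name)
]
print("Available frontends:", available)
```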
7 changes: 7 additions & 0 deletions src/python/tritonfrontend/_api/__init__.py
@@ -37,3 +37,10 @@
# TRITON_ENABLE_GRPC=OFF
# TritonFrontendGrpc Package was not present
pass

try:
from ._metrics import Metrics
except ImportError:
# TRITON_ENABLE_METRICS=OFF
Collaborator comment:

Make sure L0_build_variants passes

# TritonFrontendMetrics Package was not present
pass
14 changes: 14 additions & 0 deletions src/python/tritonfrontend/_api/_error_mapping.py
@@ -24,6 +24,8 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys

import tritonserver
from tritonfrontend._c.tritonfrontend_bindings import (
AlreadyExistsError,
@@ -47,3 +49,15 @@
AlreadyExistsError: tritonserver.AlreadyExistsError,
UnsupportedError: tritonserver.UnsupportedError,
}


def handle_triton_error(func):
def error_handling_wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except TritonError:
exc_type, exc_value, _ = sys.exc_info()
# `raise ... from None` keeps the original tritonfrontend error out of the traceback
raise ERROR_MAPPING[exc_type](exc_value) from None

return error_handling_wrapper
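A short usage sketch of the decorator (the class below is hypothetical; the real frontends wrap their binding calls the same way):

```python
class ExampleFrontend:
    @handle_triton_error
    def start(self):
        # Calls into the tritonfrontend._c bindings here. If a TritonError
        # subclass escapes, the decorator re-raises the mapped
        # tritonserver.* exception instead.
        ...
```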