From 25e244e035a7364a4022187bb7a131e8c4b41c6f Mon Sep 17 00:00:00 2001 From: Ulrik Falklof Date: Thu, 28 May 2020 14:17:35 +0200 Subject: [PATCH] Add AC hit rate metrics with prometheus labels This is a draft, not ready to be merged. Summary of what is added by this commit: - Prometheus counter for cache hit ratio of only AC requests. - Support for prometheus labels based on custom HTTP and gRPC headers. Cache hit ratio for CAS entries is easily misinterpreted. Example: A typical action cache hit often involves 3 or more HTTP requests: GET AC 200 GET CAS 200 (.o file) GET CAS 200 (.d file) ... But a cache miss for the same action is typically a single HTTP request: GET AC 404 The ratio between all HTTP GET 200 vs HTTP GET 404 above does not represent the cache hit ratio experienced by the user for actions. The ratio of only AC requests is easier to reason about, especially when AC requests checks existence of CAS dependencies. The number of AC hits and misses can be directly compared against numbers printed in the end of each build by bazel client. And against other prometheus counters produced by remote execution systems for executed actions. An understanding about the reason for cache misses is necessary to improve the cache hit ratio. It could be that the system has been configured in a way that prevent artifacts from being reused between different OS. Or that the cache is only populated by CI jobs on master, potentially resulting in cache misses for other users, etc. It becomes easier to notice such patterns, if cache hit ratio could be calculated for different categories of builds. Such categories can be set as custom headers via bazel flags --remote_header=branch=master and applied as prometheus labels. Mapping of headers to prometheus labels are controlled in bazel-remote's config file. The ratio between cache uploads and cache misses is also relevant, as an view about which categories are not uploading their result. The ratio of cache uploads can also indicate if much is uploaded but seldom requested. E.g. does it make sense to populate central caches from interactive builds or only from CI? Categories and custom headers, could also be set for an overview about: - Bazel versions using a cache instance? - How much separate organizations are using a cache instance? - From which network traffic originates? - Which products are built using the cache? - If the traffic comes via proxy adding its own headers? - Distinguish dummy requests for monitoring the cache, from real requests? - ... --- BUILD.bazel | 1 + README.md | 17 ++++ cache/cache.go | 7 +- config/config.go | 8 ++ main.go | 8 +- server/BUILD.bazel | 1 + server/grpc.go | 10 +- server/grpc_ac.go | 12 +++ server/grpc_test.go | 3 +- server/http.go | 74 +++++++++++---- server/http_test.go | 17 ++-- utils/BUILD.bazel | 3 + utils/metrics/BUILD.bazel | 13 +++ utils/metrics/metrics.go | 187 ++++++++++++++++++++++++++++++++++++++ utils/testutils.go | 12 +++ 15 files changed, 335 insertions(+), 38 deletions(-) create mode 100644 utils/metrics/BUILD.bazel create mode 100644 utils/metrics/metrics.go diff --git a/BUILD.bazel b/BUILD.bazel index 298e8b322..4a2da0b6f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -22,6 +22,7 @@ go_library( "//config:go_default_library", "//server:go_default_library", "//utils/idle:go_default_library", + "//utils/metrics:go_default_library", "//utils/rlimit:go_default_library", "@com_github_abbot_go_http_auth//:go_default_library", "@com_github_grpc_ecosystem_go_grpc_prometheus//:go_default_library", diff --git a/README.md b/README.md index 70cd71e52..b89855136 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,23 @@ host: localhost # If true, enable experimental remote asset API support: #experimental_remote_asset_api: true + +# Allows mapping HTTP and gRPC headers to prometheus +# labels. Headers can be set by bazel client as: +# --remote_header=os=ubuntu18-04. Not all counters are +# affected. +#metrics: +# categories: +# os: +# - rhel7 +# - rhel8 +# - ubuntu16-04 +# - ubuntu18-04 +# branch: +# - master +# user: +# - ci + ``` ## Docker diff --git a/cache/cache.go b/cache/cache.go index 49c7cc93c..da21930f4 100644 --- a/cache/cache.go +++ b/cache/cache.go @@ -21,6 +21,8 @@ const ( // used for HTTP when running with the --disable_http_ac_validation // commandline flag. RAW + + UNKNOWN ) func (e EntryKind) String() string { @@ -30,7 +32,10 @@ func (e EntryKind) String() string { if e == CAS { return "cas" } - return "raw" + if e == RAW { + return "raw" + } + return "unknown" } // Logger is designed to be satisfied by log.Logger. diff --git a/config/config.go b/config/config.go index c9efc2da3..7df906921 100644 --- a/config/config.go +++ b/config/config.go @@ -35,6 +35,11 @@ type HTTPBackendConfig struct { BaseURL string `yaml:"url"` } +// Metrics stores configuration for prometheus metrics. +type Metrics struct { + Categories map[string][]string `yaml:"categories"` +} + // Config holds the top-level configuration for bazel-remote. type Config struct { Host string `yaml:"host"` @@ -55,6 +60,7 @@ type Config struct { DisableGRPCACDepsCheck bool `yaml:"disable_grpc_ac_deps_check"` EnableACKeyInstanceMangling bool `yaml:"enable_ac_key_instance_mangling"` EnableEndpointMetrics bool `yaml:"enable_endpoint_metrics"` + Metrics *Metrics `yaml:"metrics"` ExperimentalRemoteAssetAPI bool `yaml:"experimental_remote_asset_api"` HTTPReadTimeout time.Duration `yaml:"http_read_timeout"` HTTPWriteTimeout time.Duration `yaml:"http_write_timeout"` @@ -73,6 +79,7 @@ func New(dir string, maxSize int, host string, port int, grpcPort int, disableGRPCACDepsCheck bool, enableACKeyInstanceMangling bool, enableEndpointMetrics bool, + metrics *Metrics, experimentalRemoteAssetAPI bool, httpReadTimeout time.Duration, httpWriteTimeout time.Duration) (*Config, error) { @@ -95,6 +102,7 @@ func New(dir string, maxSize int, host string, port int, grpcPort int, DisableGRPCACDepsCheck: disableGRPCACDepsCheck, EnableACKeyInstanceMangling: enableACKeyInstanceMangling, EnableEndpointMetrics: enableEndpointMetrics, + Metrics: metrics, ExperimentalRemoteAssetAPI: experimentalRemoteAssetAPI, HTTPReadTimeout: httpReadTimeout, HTTPWriteTimeout: httpWriteTimeout, diff --git a/main.go b/main.go index 98876863d..f945ebc10 100644 --- a/main.go +++ b/main.go @@ -22,6 +22,7 @@ import ( "github.com/buchgr/bazel-remote/config" "github.com/buchgr/bazel-remote/server" "github.com/buchgr/bazel-remote/utils/idle" + "github.com/buchgr/bazel-remote/utils/metrics" "github.com/buchgr/bazel-remote/utils/rlimit" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" @@ -283,6 +284,7 @@ func main() { ctx.Bool("disable_grpc_ac_deps_check"), ctx.Bool("enable_ac_key_instance_mangling"), ctx.Bool("enable_endpoint_metrics"), + nil, ctx.Bool("experimental_remote_asset_api"), ctx.Duration("http_read_timeout"), ctx.Duration("http_write_timeout"), @@ -311,6 +313,7 @@ func main() { accessLogger := log.New(os.Stdout, "", logFlags) errorLogger := log.New(os.Stderr, "", logFlags) + metrics := metrics.NewMetrics(c.Metrics) var proxyCache cache.Proxy if c.GoogleCloudStorage != nil { @@ -344,8 +347,7 @@ func main() { } validateAC := !c.DisableHTTPACValidation - h := server.NewHTTPCache(diskCache, accessLogger, errorLogger, validateAC, c.EnableACKeyInstanceMangling, gitCommit) - + h := server.NewHTTPCache(diskCache, accessLogger, errorLogger, metrics, validateAC, c.EnableACKeyInstanceMangling, gitCommit) var htpasswdSecrets auth.SecretProvider cacheHandler := h.CacheHandler if c.HtpasswdFile != "" { @@ -444,7 +446,7 @@ func main() { validateAC, c.EnableACKeyInstanceMangling, enableRemoteAssetAPI, - diskCache, accessLogger, errorLogger) + diskCache, accessLogger, errorLogger, metrics) if err3 != nil { log.Fatal(err3) } diff --git a/server/BUILD.bazel b/server/BUILD.bazel index 00db468d8..f67d07b59 100644 --- a/server/BUILD.bazel +++ b/server/BUILD.bazel @@ -18,6 +18,7 @@ go_library( "//cache:go_default_library", "//cache/disk:go_default_library", "//utils/idle:go_default_library", + "//utils/metrics:go_default_library", "@com_github_abbot_go_http_auth//:go_default_library", "@com_github_bazelbuild_remote_apis//build/bazel/remote/asset/v1:go_default_library", "@com_github_bazelbuild_remote_apis//build/bazel/remote/execution/v2:go_default_library", diff --git a/server/grpc.go b/server/grpc.go index a89ec78d6..e78a7b71e 100644 --- a/server/grpc.go +++ b/server/grpc.go @@ -18,7 +18,7 @@ import ( "github.com/buchgr/bazel-remote/cache" "github.com/buchgr/bazel-remote/cache/disk" - + "github.com/buchgr/bazel-remote/utils/metrics" _ "github.com/mostynb/go-grpc-compression/snappy" // Register snappy _ "github.com/mostynb/go-grpc-compression/zstd" // and zstd support. ) @@ -39,6 +39,7 @@ type grpcServer struct { errorLogger cache.Logger depsCheck bool mangleACKeys bool + metrics metrics.Metrics } // ListenAndServeGRPC creates a new gRPC server and listens on the given @@ -48,27 +49,28 @@ func ListenAndServeGRPC(addr string, opts []grpc.ServerOption, validateACDeps bool, mangleACKeys bool, enableRemoteAssetAPI bool, - c *disk.Cache, a cache.Logger, e cache.Logger) error { + c *disk.Cache, a cache.Logger, e cache.Logger, m metrics.Metrics) error { listener, err := net.Listen("tcp", addr) if err != nil { return err } - return serveGRPC(listener, opts, validateACDeps, mangleACKeys, enableRemoteAssetAPI, c, a, e) + return serveGRPC(listener, opts, validateACDeps, mangleACKeys, enableRemoteAssetAPI, c, a, e, m) } func serveGRPC(l net.Listener, opts []grpc.ServerOption, validateACDepsCheck bool, mangleACKeys bool, enableRemoteAssetAPI bool, - c *disk.Cache, a cache.Logger, e cache.Logger) error { + c *disk.Cache, a cache.Logger, e cache.Logger, m metrics.Metrics) error { srv := grpc.NewServer(opts...) s := &grpcServer{ cache: c, accessLogger: a, errorLogger: e, depsCheck: validateACDepsCheck, mangleACKeys: mangleACKeys, + metrics: m, } pb.RegisterActionCacheServer(srv, s) pb.RegisterCapabilitiesServer(srv, s) diff --git a/server/grpc_ac.go b/server/grpc_ac.go index a92d91bb0..d330aa616 100644 --- a/server/grpc_ac.go +++ b/server/grpc_ac.go @@ -13,10 +13,12 @@ import ( pb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2" "github.com/golang/protobuf/proto" "google.golang.org/grpc/codes" + "google.golang.org/grpc/metadata" "google.golang.org/grpc/peer" "google.golang.org/grpc/status" "github.com/buchgr/bazel-remote/cache" + "github.com/buchgr/bazel-remote/utils/metrics" ) var ( @@ -63,6 +65,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, } if rdr == nil || sizeBytes <= 0 { s.accessLogger.Printf("%s %s %s", logPrefix, req.ActionDigest.Hash, "NOT FOUND") + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.NOT_FOUND, ctx) return nil, status.Error(codes.NotFound, fmt.Sprintf("%s not found in AC", req.ActionDigest.Hash)) } @@ -82,6 +85,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, } s.accessLogger.Printf("%s %s OK", logPrefix, req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.OK, ctx) return result, nil } @@ -93,6 +97,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, if result == nil { s.accessLogger.Printf("%s %s NOT FOUND", logPrefix, req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.NOT_FOUND, ctx) return nil, status.Error(codes.NotFound, fmt.Sprintf("%s not found in AC", req.ActionDigest.Hash)) } @@ -129,6 +134,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, } s.accessLogger.Printf("GRPC AC GET %s OK", req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.OK, ctx) return result, nil } @@ -290,6 +296,7 @@ func (s *grpcServer) UpdateActionResult(ctx context.Context, } s.accessLogger.Printf("GRPC AC PUT %s OK", req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_PUT, metrics.OK, ctx) // Trivia: the RE API wants us to return the ActionResult from the // request, in order to follow this standard method style guide: @@ -331,3 +338,8 @@ func addWorkerMetadataGRPC(ctx context.Context, ar *pb.ActionResult) { ar.ExecutionMetadata.Worker = worker } + +func (s *grpcServer) incAcRequestMetrics(method metrics.Method, status metrics.Status, ctx context.Context) { + headers, _ := metadata.FromIncomingContext(ctx) + s.metrics.IncomingRequestCompleted(metrics.AC, method, status, headers, metrics.GRPC) +} diff --git a/server/grpc_test.go b/server/grpc_test.go index 89eff7b9c..1654e1e3b 100644 --- a/server/grpc_test.go +++ b/server/grpc_test.go @@ -73,6 +73,7 @@ func TestMain(m *testing.M) { accessLogger := testutils.NewSilentLogger() errorLogger := testutils.NewSilentLogger() + metrics := testutils.NewMetricsStub() listener = bufconn.Listen(bufSize) @@ -87,7 +88,7 @@ func TestMain(m *testing.M) { validateAC, mangleACKeys, enableRemoteAssetAPI, - diskCache, accessLogger, errorLogger) + diskCache, accessLogger, errorLogger, metrics) if err2 != nil { fmt.Println(err2) os.Exit(1) diff --git a/server/http.go b/server/http.go index 8e336a742..08a2eb4d0 100644 --- a/server/http.go +++ b/server/http.go @@ -17,6 +17,7 @@ import ( pb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2" "github.com/buchgr/bazel-remote/cache" "github.com/buchgr/bazel-remote/cache/disk" + "github.com/buchgr/bazel-remote/utils/metrics" "github.com/golang/protobuf/jsonpb" "github.com/golang/protobuf/proto" ) @@ -36,6 +37,7 @@ type httpCache struct { validateAC bool mangleACKeys bool gitCommit string + metrics metrics.Metrics } type statusPageData struct { @@ -51,8 +53,7 @@ type statusPageData struct { // accessLogger will print one line for each HTTP request to stdout. // errorLogger will print unexpected server errors. Inexistent files and malformed URLs will not // be reported. -func NewHTTPCache(cache *disk.Cache, accessLogger cache.Logger, errorLogger cache.Logger, validateAC bool, mangleACKeys bool, commit string) HTTPCache { - +func NewHTTPCache(cache *disk.Cache, accessLogger cache.Logger, errorLogger cache.Logger, metrics metrics.Metrics, validateAC bool, mangleACKeys bool, commit string) HTTPCache { _, _, numItems := cache.Stats() errorLogger.Printf("Loaded %d existing disk cache items.", numItems) @@ -63,6 +64,7 @@ func NewHTTPCache(cache *disk.Cache, accessLogger cache.Logger, errorLogger cach errorLogger: errorLogger, validateAC: validateAC, mangleACKeys: mangleACKeys, + metrics: metrics, } if commit != "{STABLE_GIT_COMMIT}" { @@ -106,32 +108,32 @@ func (h *httpCache) handleContainsValidAC(w http.ResponseWriter, r *http.Request _, data, err := h.cache.GetValidatedActionResult(hash) if err != nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } if data == nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } w.Header().Set("Content-Length", strconv.FormatInt(int64(len(data)), 10)) w.WriteHeader(http.StatusOK) - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, cache.AC) } func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, hash string) { _, data, err := h.cache.GetValidatedActionResult(hash) if err != nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } if data == nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } @@ -139,7 +141,7 @@ func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, has ar := &pb.ActionResult{} err = proto.Unmarshal(data, ar) if err != nil { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } @@ -147,10 +149,11 @@ func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, has marshaler := jsonpb.Marshaler{} err = marshaler.Marshal(w, ar) if err != nil { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } + h.logResponse(http.StatusOK, r, cache.AC) return } @@ -159,18 +162,20 @@ func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, has bytesWritten, err := w.Write(data) if err != nil { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } if bytesWritten != len(data) { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } + + h.logResponse(http.StatusOK, r, cache.AC) } // Helper function for logging responses -func (h *httpCache) logResponse(code int, r *http.Request) { +func (h *httpCache) logResponse(code int, r *http.Request, kind cache.EntryKind) { // Parse the client ip:port var clientAddress string var err error @@ -179,6 +184,36 @@ func (h *httpCache) logResponse(code int, r *http.Request) { clientAddress = r.RemoteAddr } h.accessLogger.Printf("%4s %d %15s %s", r.Method, code, clientAddress, r.URL.Path) + h.increaseMetrics(code, r, kind) +} + +func (h *httpCache) increaseMetrics(code int, r *http.Request, kind cache.EntryKind) { + if kind == cache.AC { + // See comments in metrics.go about why only counting AC requests, + // and why mapping several statuses as "other". + + var status metrics.Status + var method metrics.Method + switch r.Method { + case http.MethodGet: + method = metrics.METHOD_GET + case http.MethodPut: + method = metrics.METHOD_PUT + case http.MethodHead: + method = metrics.METHOD_HEAD + default: + method = metrics.METHOD_OTHER + } + switch code { + case http.StatusOK: + status = metrics.OK + case http.StatusNotFound: + status = metrics.NOT_FOUND + default: + status = metrics.OTHER_STATUS + } + h.metrics.IncomingRequestCompleted(metrics.AC, method, status, r.Header, metrics.HTTP) + } } func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { @@ -187,7 +222,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { kind, hash, instance, err := parseRequestURL(r.URL.Path, h.validateAC) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) - h.logResponse(http.StatusBadRequest, r) + h.logResponse(http.StatusBadRequest, r, cache.UNKNOWN) return } @@ -216,7 +251,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { if rdr == nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, kind) return } defer rdr.Close() @@ -225,7 +260,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Length", strconv.FormatInt(sizeBytes, 10)) io.Copy(w, rdr) - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, kind) case http.MethodPut: contentLength := r.ContentLength @@ -289,7 +324,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { } h.errorLogger.Printf("PUT %s: %s", path(kind, hash), err) } else { - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, kind) } case http.MethodHead: @@ -304,18 +339,17 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { ok, size := h.cache.Contains(kind, hash, -1) if !ok { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, kind) return } - w.Header().Set("Content-Length", strconv.FormatInt(size, 10)) w.WriteHeader(http.StatusOK) - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, kind) default: msg := fmt.Sprintf("Method '%s' not supported.", html.EscapeString(m)) http.Error(w, msg, http.StatusMethodNotAllowed) - h.logResponse(http.StatusMethodNotAllowed, r) + h.logResponse(http.StatusMethodNotAllowed, r, kind) } } diff --git a/server/http_test.go b/server/http_test.go index 104b79093..b33af3943 100644 --- a/server/http_test.go +++ b/server/http_test.go @@ -36,8 +36,7 @@ func TestDownloadFile(t *testing.T) { } c := disk.New(cacheDir, blobSize, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") - + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") req, err := http.NewRequest("GET", "/cas/"+hash, bytes.NewReader([]byte{})) if err != nil { t.Fatal(err) @@ -99,7 +98,7 @@ func TestUploadFilesConcurrently(t *testing.T) { } c := disk.New(cacheDir, 1000*1024, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") handler := http.HandlerFunc(h.CacheHandler) var wg sync.WaitGroup @@ -157,7 +156,7 @@ func TestUploadSameFileConcurrently(t *testing.T) { numWorkers := 100 c := disk.New(cacheDir, int64(len(data)*numWorkers), nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") handler := http.HandlerFunc(h.CacheHandler) var wg sync.WaitGroup @@ -203,7 +202,7 @@ func TestUploadCorruptedFile(t *testing.T) { } c := disk.New(cacheDir, 2048, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.CacheHandler) handler.ServeHTTP(rr, r) @@ -245,7 +244,7 @@ func TestUploadEmptyActionResult(t *testing.T) { c := disk.New(cacheDir, 2048, nil) validate := true mangle := false - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), validate, mangle, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), validate, mangle, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.CacheHandler) handler.ServeHTTP(rr, r) @@ -302,7 +301,7 @@ func testEmptyBlobAvailable(t *testing.T, method string) { c := disk.New(cacheDir, 2048, nil) validate := true mangle := false - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), validate, mangle, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), validate, mangle, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.CacheHandler) handler.ServeHTTP(rr, r) @@ -325,7 +324,7 @@ func TestStatusPage(t *testing.T) { } c := disk.New(cacheDir, 2048, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.StatusPageHandler) handler.ServeHTTP(rr, r) @@ -466,7 +465,7 @@ func TestRemoteReturnsNotFound(t *testing.T) { defer os.RemoveAll(cacheDir) emptyCache := disk.New(cacheDir, 1024, nil) - h := NewHTTPCache(emptyCache, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(emptyCache, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") // create a fake http.Request _, hash := testutils.RandomDataAndHash(1024) url, _ := url.Parse(fmt.Sprintf("http://localhost:8080/ac/%s", hash)) diff --git a/utils/BUILD.bazel b/utils/BUILD.bazel index cdeaa623e..a6632cb43 100644 --- a/utils/BUILD.bazel +++ b/utils/BUILD.bazel @@ -5,4 +5,7 @@ go_library( srcs = ["testutils.go"], importpath = "github.com/buchgr/bazel-remote/utils", visibility = ["//visibility:public"], + deps = [ + "//utils/metrics:go_default_library", + ] ) diff --git a/utils/metrics/BUILD.bazel b/utils/metrics/BUILD.bazel new file mode 100644 index 000000000..1901f9091 --- /dev/null +++ b/utils/metrics/BUILD.bazel @@ -0,0 +1,13 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "go_default_library", + srcs = ["metrics.go"], + importpath = "github.com/buchgr/bazel-remote/utils/metrics", + visibility = ["//visibility:public"], + deps = [ + "//config:go_default_library", + "@com_github_prometheus_client_golang//prometheus:go_default_library", + "@com_github_prometheus_client_golang//prometheus/promauto:go_default_library", + ], +) diff --git a/utils/metrics/metrics.go b/utils/metrics/metrics.go new file mode 100644 index 000000000..d912c7438 --- /dev/null +++ b/utils/metrics/metrics.go @@ -0,0 +1,187 @@ +package metrics + +import ( + "github.com/buchgr/bazel-remote/config" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "strings" +) + +// TODO Add test cases for this file + +type Method int +type Status int +type Kind int +type Protocol int + +const ( + METHOD_GET Method = iota + METHOD_HEAD + METHOD_PUT + METHOD_OTHER +) + +const ( + OK Status = iota + NOT_FOUND + OTHER_STATUS +) + +const ( + AC Kind = iota + CAS +) + +const ( + HTTP Protocol = iota + GRPC +) + +func (e Method) String() string { + // Actually HTTP names, but can be conceptually mapped also to GRPC protocol. + if e == METHOD_GET { + return "GET" + } + if e == METHOD_HEAD { + return "HEAD" + } + if e == METHOD_PUT { + return "PUT" + } + return "other" +} + +func (e Status) String() string { + // Names that works for both HTTP and GRPC, instead of HTTP or GRPC specific codes. + if e == OK { + return "OK" + } + if e == NOT_FOUND { + return "NotFound" + } + return "other" +} + +func (e Kind) String() string { + if e == AC { + return "AC" + } + if e == CAS { + return "CAS" + } + return "other" +} + +type Metrics interface { + // TODO Document interface + IncomingRequestCompleted(kind Kind, method Method, status Status, headers map[string][]string, protocol Protocol) +} + +type metrics struct { + categoryValues map[string]map[string]struct{} + counterIncomingCompletedReqs *prometheus.CounterVec +} + +func NewMetrics(config *config.Metrics) Metrics { + + labels := []string{"method", "status", "kind"} + categoryValues := make(map[string]map[string]struct{}) + + if config != nil && config.Categories != nil { + for categoryName, whiteListedValues := range config.Categories { + // Normalize to lower case since canonical for gRPC headers + // and convention for prometheus. + categoryName := strings.ToLower(categoryName) + + // Store white listed category values as set for efficient access + whiteListedSet := make(map[string]struct{}) + for _, categoryValue := range whiteListedValues { + whiteListedSet[categoryValue] = struct{}{} + } + categoryValues[categoryName] = whiteListedSet + + // Construct a prometheus label for each category. + // Prometheus does not allow changing set of + // labels until next time bazel-remote is + // restarted. + labels = append(labels, categoryName) + } + } + + // For now we only count AC requests, and only the most common status codes, + // becuse: + // + // - No identified use case for others. + // - Limit number of prometheus time series (if many configured categories). + // - Reduce performance overhead of counters (if many configured categories). + // - Would otherwise require injecting invocations in more places. + // + // But the naming, and the labels, of the counter, are generic to allow + // counting additional requests types or status codes in the future. Without + // having to rename the counter and get issues with non continous history of + // metrics. + + counterIncomingCompletedReqs := promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "bazel_remote_incoming_requests_completed_total", + Help: "The number of incoming HTTP and gRPC request. Currently only AC requests", + }, + labels) + + m := &metrics{ + categoryValues: categoryValues, + counterIncomingCompletedReqs: counterIncomingCompletedReqs, + } + return m +} + +func getLabelValueFromHeaderValues(headerValues []string, whiteListedValues map[string]struct{}) string { + for _, headerValue := range headerValues { + // Prometheus only allows one value per label. + // Pick the first white listed header value we find. + if _, ok := whiteListedValues[headerValue]; ok { + return headerValue + } + } + + // The values found in the header has not been white listed in + // the configuration file. Represent them as "other". + // + // The white listening is an attempt to avoid polluting + // prometheus with too many different time series. + // + // https://prometheus.io/docs/practices/naming/ warns about: + // + // "CAUTION: Remember that every unique combination of key-value + // label pairs represents a new time series, which can dramatically + // increase the amount of data stored. Do not use labels to store + // dimensions with high cardinality (many different label values), + // such as user IDs, email addresses, or other unbounded sets of + // values." + // + // It would have been nice if bazel-remote could reload the set + // of white listed values from updated configuration file, by + // SIGHUP signal instead of having to restart bazel-remote. + return "other" +} + +func (m *metrics) IncomingRequestCompleted(kind Kind, method Method, status Status, headers map[string][]string, protocol Protocol) { + labels := make(prometheus.Labels) + labels["method"] = method.String() + labels["status"] = status.String() + labels["kind"] = kind.String() + for labelName := range m.categoryValues { + // The canonical form of gRPC and HTTP/2 headers is lowercase "category" + headerName := labelName + if protocol == HTTP { + // but the golang http library is normalizing HTTP/1.1 headers as "Category". + headerName = strings.Title(headerName) + } + if headerValues, ok := headers[headerName]; ok { + labels[labelName] = getLabelValueFromHeaderValues(headerValues, m.categoryValues[labelName]) + } else { + labels[labelName] = "" // no header for this label + } + } + m.counterIncomingCompletedReqs.With(labels).Inc() +} diff --git a/utils/testutils.go b/utils/testutils.go index f0ee56037..15322eb7d 100644 --- a/utils/testutils.go +++ b/utils/testutils.go @@ -8,6 +8,8 @@ import ( "log" "os" "testing" + + "github.com/buchgr/bazel-remote/utils/metrics" ) // TempDir creates a temporary directory and returns its name. If an error @@ -47,3 +49,13 @@ func RandomDataAndHash(size int64) ([]byte, string) { func NewSilentLogger() *log.Logger { return log.New(ioutil.Discard, "", 0) } + +type metricsStub struct{} + +func NewMetricsStub() *metricsStub { + return new(metricsStub) +} + +func (m metricsStub) IncomingRequestCompleted(kind metrics.Kind, method metrics.Method, status metrics.Status, headers map[string][]string, protocol metrics.Protocol) { + // Do nothing +}