From fc8e15ad762f2e1ab3fb96b376f3a131b1e87653 Mon Sep 17 00:00:00 2001 From: Zach Hirschtritt Date: Thu, 14 Aug 2025 08:44:37 -0400 Subject: [PATCH] fix: drop default otel scope info from metrics (#10306) # Which Problems Are Solved Currently, the Prometheus endpoint metrics contain OTel-specific labels that increase the overall metric size to the point that the exemplar implementation in the underlying Prometheus exporter library throws an error; see https://github.com/zitadel/zitadel/issues/10047. The MaxRuneSize for metric refs in exemplars is 128, and many of the metrics exceed this because of `otel_scope_name`. # How the Problems Are Solved This change drops those OTel-specific labels, `otel_scope_name` and `otel_scope_version`, on the Prometheus exporter. Current metrics example: ``` http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="0"} 0 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="5"} 100 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="10"} 100 ... 
grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListIAMMemberRoles",otel_scope_name="",otel_scope_version="",return_code="200"} 3 grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListIAMMembers",otel_scope_name="",otel_scope_version="",return_code="200"} 3 grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListMilestones",otel_scope_name="",otel_scope_version="",return_code="200"} 1 ``` New example: ``` http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="10"} 8 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="25"} 8 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="50"} 9 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="75"} 9 ... grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/GetSupportedLanguages",return_code="200"} 1 grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListMilestones",return_code="200"} 1 grpc_server_grpc_status_code_total{grpc_method="/zitadel.auth.v1.AuthService/GetMyLabelPolicy",return_code="200"} 3 ``` # Additional Changes None # Additional Context From my understanding, this change is fully spec-compliant with Prometheus and OTel: * https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#instrumentation-scope However, these tags were originally added as optional labels to disambiguate metrics, and I'm not sure we need to care about that right now. My gut feeling is that exemplar support (the ability to link metrics to traces) is a preferable tradeoff to keeping this label standard. 
Co-authored-by: Silvan <27845747+adlerhurst@users.noreply.github.com> --- internal/telemetry/metrics/otel/open_telemetry.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/telemetry/metrics/otel/open_telemetry.go b/internal/telemetry/metrics/otel/open_telemetry.go index c4509ed5db3..ca1b3f89aea 100644 --- a/internal/telemetry/metrics/otel/open_telemetry.go +++ b/internal/telemetry/metrics/otel/open_telemetry.go @@ -32,7 +32,7 @@ func NewMetrics(meterName string) (metrics.Metrics, error) { if err != nil { return nil, err } - exporter, err := prometheus.New() + exporter, err := prometheus.New(prometheus.WithoutScopeInfo()) if err != nil { return &Metrics{}, err }