From 532932ef9485dd17d9be948b5a072a6788a3129f Mon Sep 17 00:00:00 2001 From: Zach Hirschtritt Date: Thu, 14 Aug 2025 08:44:37 -0400 Subject: [PATCH] fix: drop default otel scope info from metrics (#10306) # Which Problems Are Solved Currently, the Prometheus endpoint metrics contain OTel-specific labels that increase the overall metric size to the point that the exemplar implementation in the underlying Prometheus exporter library throws an error; see https://github.com/zitadel/zitadel/issues/10047. The MaxRuneSize for metric refs in exemplars is 128, and many metrics exceed this because of `otel_scope_name`. # How the Problems Are Solved This change drops those OTel-specific labels from the Prometheus exporter: `otel_scope_name` and `otel_scope_version`. Current metrics example: ``` http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="0"} 0 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="5"} 100 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="10"} 100 ...
grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListIAMMemberRoles",otel_scope_name="",otel_scope_version="",return_code="200"} 3 grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListIAMMembers",otel_scope_name="",otel_scope_version="",return_code="200"} 3 grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListMilestones",otel_scope_name="",otel_scope_version="",return_code="200"} 1 ``` New example: ``` http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="10"} 8 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="25"} 8 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="50"} 9 http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="75"} 9 ... grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/GetSupportedLanguages",return_code="200"} 1 grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListMilestones",return_code="200"} 1 grpc_server_grpc_status_code_total{grpc_method="/zitadel.auth.v1.AuthService/GetMyLabelPolicy",return_code="200"} 3 ``` # Additional Changes None # Additional Context From my understanding, this change is fully spec-compliant with Prometheus and OpenTelemetry: * https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#instrumentation-scope However, these labels were originally added as optional labels to disambiguate metrics, and I'm not sure we need to care about that right now. My gut feeling is that exemplar support (the ability to link metrics to traces) is a preferable tradeoff to following this label standard.
Co-authored-by: Silvan <27845747+adlerhurst@users.noreply.github.com> --- internal/telemetry/metrics/otel/open_telemetry.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/telemetry/metrics/otel/open_telemetry.go b/internal/telemetry/metrics/otel/open_telemetry.go index c4509ed5db..ca1b3f89ae 100644 --- a/internal/telemetry/metrics/otel/open_telemetry.go +++ b/internal/telemetry/metrics/otel/open_telemetry.go @@ -32,7 +32,7 @@ func NewMetrics(meterName string) (metrics.Metrics, error) { if err != nil { return nil, err } - exporter, err := prometheus.New() + exporter, err := prometheus.New(prometheus.WithoutScopeInfo()) if err != nil { return &Metrics{}, err }