Files
zitadel/internal/telemetry/metrics/otel/open_telemetry.go
Zach Hirschtritt 99c96e4f70 fix: drop default otel scope info from metrics (#10306)
# Which Problems Are Solved

Currently, the prometheus endpoint metrics contain otel specific labels
that increase the overall metric size to the point that the exemplar
implementation in the underlying prom exporter library throws an error,
see https://github.com/zitadel/zitadel/issues/10047. The MaxRuneSize for
metric refs in exemplars is 128 and many of metrics cross this because
of `otel_scope_name`.

# How the Problems Are Solved

This change drops those otel specific labels on the prometheus exporter:
`otel_scope_name` and `otel_scope_version`

Current metrics example:
```
http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="0"} 0
http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="5"} 100
http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_version="0.53.0",le="10"} 100
...
grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListIAMMemberRoles",otel_scope_name="",otel_scope_version="",return_code="200"} 3
grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListIAMMembers",otel_scope_name="",otel_scope_version="",return_code="200"} 3
grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListMilestones",otel_scope_name="",otel_scope_version="",return_code="200"} 1
```

New example:
```
http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="10"} 8
http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="25"} 8
http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="50"} 9
http_server_duration_milliseconds_bucket{http_method="GET",http_status_code="200",le="75"} 9
...
grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/GetSupportedLanguages",return_code="200"} 1
grpc_server_grpc_status_code_total{grpc_method="/zitadel.admin.v1.AdminService/ListMilestones",return_code="200"} 1
grpc_server_grpc_status_code_total{grpc_method="/zitadel.auth.v1.AuthService/GetMyLabelPolicy",return_code="200"} 3
```

# Additional Changes

None

# Additional Context

From my understanding, this change is fully spec compliant with
Prometheus and Otel:
*
https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#instrumentation-scope

However, these tags were originally added as optional labels to
disambiguate metrics. But I'm not sure we need to care about that right
now? My gut feeling is that exemplar support (the ability for traces to
reference metrics) would be a preferable tradeoff to this label
standard.

Co-authored-by: Silvan <27845747+adlerhurst@users.noreply.github.com>
(cherry picked from commit 532932ef94)
2025-08-15 14:46:31 +02:00

164 lines
4.6 KiB
Go

package otel
import (
"context"
"net/http"
"sync"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/sdk/instrumentation"
sdk_metric "go.opentelemetry.io/otel/sdk/metric"
"github.com/zitadel/zitadel/internal/telemetry/metrics"
otel_resource "github.com/zitadel/zitadel/internal/telemetry/otel"
"github.com/zitadel/zitadel/internal/zerrors"
)
type Metrics struct {
Provider metric.MeterProvider
Meter metric.Meter
Counters sync.Map
UpDownSumObserver sync.Map
ValueObservers sync.Map
Histograms sync.Map
}
func NewMetrics(meterName string) (metrics.Metrics, error) {
resource, err := otel_resource.ResourceWithService("ZITADEL")
if err != nil {
return nil, err
}
exporter, err := prometheus.New(prometheus.WithoutScopeInfo())
if err != nil {
return &Metrics{}, err
}
// create a view to filter out unwanted attributes
view := sdk_metric.NewView(
sdk_metric.Instrument{
Scope: instrumentation.Scope{Name: otelhttp.ScopeName},
},
sdk_metric.Stream{
AttributeFilter: attribute.NewAllowKeysFilter("http.method", "http.status_code", "http.target"),
},
)
meterProvider := sdk_metric.NewMeterProvider(
sdk_metric.WithReader(exporter),
sdk_metric.WithResource(resource),
sdk_metric.WithView(view),
)
return &Metrics{
Provider: meterProvider,
Meter: meterProvider.Meter(meterName),
}, nil
}
func (m *Metrics) GetExporter() http.Handler {
return promhttp.Handler()
}
func (m *Metrics) GetMetricsProvider() metric.MeterProvider {
return m.Provider
}
func (m *Metrics) RegisterCounter(name, description string) error {
if _, exists := m.Counters.Load(name); exists {
return nil
}
counter, err := m.Meter.Int64Counter(name, metric.WithDescription(description))
if err != nil {
return err
}
m.Counters.Store(name, counter)
return nil
}
func (m *Metrics) AddCount(ctx context.Context, name string, value int64, labels map[string]attribute.Value) error {
counter, exists := m.Counters.Load(name)
if !exists {
return zerrors.ThrowNotFound(nil, "METER-4u8fs", "Errors.Metrics.Counter.NotFound")
}
counter.(metric.Int64Counter).Add(ctx, value, MapToAddOption(labels)...)
return nil
}
func (m *Metrics) AddHistogramMeasurement(ctx context.Context, name string, value float64, labels map[string]attribute.Value) error {
histogram, exists := m.Histograms.Load(name)
if !exists {
return zerrors.ThrowNotFound(nil, "METER-5wwb1", "Errors.Metrics.Histogram.NotFound")
}
histogram.(metric.Float64Histogram).Record(ctx, value, MapToRecordOption(labels)...)
return nil
}
func (m *Metrics) RegisterHistogram(name, description, unit string, buckets []float64) error {
if _, exists := m.Histograms.Load(name); exists {
return nil
}
histogram, err := m.Meter.Float64Histogram(name,
metric.WithDescription(description),
metric.WithUnit(unit),
metric.WithExplicitBucketBoundaries(buckets...),
)
if err != nil {
return err
}
m.Histograms.Store(name, histogram)
return nil
}
func (m *Metrics) RegisterUpDownSumObserver(name, description string, callbackFunc metric.Int64Callback) error {
if _, exists := m.UpDownSumObserver.Load(name); exists {
return nil
}
counter, err := m.Meter.Int64ObservableUpDownCounter(name, metric.WithInt64Callback(callbackFunc), metric.WithDescription(description))
if err != nil {
return err
}
m.UpDownSumObserver.Store(name, counter)
return nil
}
func (m *Metrics) RegisterValueObserver(name, description string, callbackFunc metric.Int64Callback) error {
if _, exists := m.UpDownSumObserver.Load(name); exists {
return nil
}
gauge, err := m.Meter.Int64ObservableGauge(name, metric.WithInt64Callback(callbackFunc), metric.WithDescription(description))
if err != nil {
return err
}
m.UpDownSumObserver.Store(name, gauge)
return nil
}
func MapToAddOption(labels map[string]attribute.Value) []metric.AddOption {
return []metric.AddOption{metric.WithAttributes(labelsToAttributes(labels)...)}
}
func MapToRecordOption(labels map[string]attribute.Value) []metric.RecordOption {
return []metric.RecordOption{metric.WithAttributes(labelsToAttributes(labels)...)}
}
func labelsToAttributes(labels map[string]attribute.Value) []attribute.KeyValue {
if labels == nil {
return nil
}
attributes := make([]attribute.KeyValue, 0, len(labels))
for key, value := range labels {
attributes = append(attributes, attribute.KeyValue{
Key: attribute.Key(key),
Value: value,
})
}
return attributes
}