# Which Problems Are Solved
There were multiple issues in the OpenTelemetry (OTEL) implementation
and usage for tracing and metrics, which led to high cardinality and
potential memory leaks:
- wrongly initialized tracing interceptors
- high cardinality in traces:
  - HTTP/1.1 endpoints containing host names
  - HTTP/1.1 endpoints containing object IDs like userID (e.g.
`/management/v1/users/2352839823/`)
- high amount of traces from internal processes (spooler)
- high cardinality in the metrics endpoint:
  - GRPC entries containing host names
  - notification metrics containing instanceIDs and error messages
# How the Problems Are Solved
- Properly initialize the interceptors once and update them to use the
grpc stats handler (unary interceptors were deprecated); a sketch of
the stats-handler wiring follows this list.
- Remove host names from HTTP/1.1 span names and use the path as the
default (see the span-naming sketch below).
- Set / overwrite the URI for spans on the grpc-gateway with the URI
pattern (`/management/v1/users/{user_id}`). This is used for spans in
traces and metric entries (see the gateway-pattern sketch below).
- Created a new sampler (see the sampler sketch below) which will only
sample spans in the following cases:
  - the remote parent was already sampled
  - the remote parent was not sampled, the root span is of kind
`Server`, and the fraction set in the runtime configuration permits it
- This prevents a large number of spans from the spooler background
jobs if they were not started by a client call querying an object (e.g.
UserByID).
- Filter out host names and the like from the OTEL-generated metrics
(using a `view`, as shown in the attached file below).
- Removed instanceIDs and error messages from notification metrics.
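
As a minimal sketch of the stats-handler wiring (the server options
here are illustrative, not ZITADEL's actual setup):

```go
package main

import (
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/grpc"
)

// newGRPCServer registers the OTEL stats handler once at server
// construction. The stats handler traces unary and streaming RPCs
// alike and replaces the deprecated otelgrpc unary/stream interceptors.
func newGRPCServer() *grpc.Server {
	return grpc.NewServer(
		grpc.StatsHandler(otelgrpc.NewServerHandler()),
	)
}
```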
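For the HTTP/1.1 span names, a sketch of naming server spans by request
path via `otelhttp` (assuming handlers are wrapped with `otelhttp`; the
operation name "zitadel" is a placeholder):

```go
package main

import (
	"net/http"

	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)

// traceHTTP names the server span after the request path, so span
// names no longer include the host name.
func traceHTTP(next http.Handler) http.Handler {
	return otelhttp.NewHandler(next, "zitadel",
		otelhttp.WithSpanNameFormatter(func(_ string, r *http.Request) string {
			return r.URL.Path
		}),
	)
}
```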
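One possible way to overwrite the span name with the grpc-gateway URI
pattern: `runtime.HTTPPathPattern` returns the registered pattern (e.g.
`/management/v1/users/{user_id}`) for the current request; hooking this
into a forward-response option is an assumption for illustration, not
necessarily where the PR applies it:

```go
package main

import (
	"context"
	"net/http"

	"github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
	"go.opentelemetry.io/otel/trace"
	"google.golang.org/protobuf/proto"
)

// newGatewayMux renames the active span from the concrete path
// (which contains object IDs) to the URI pattern before the
// response is forwarded.
func newGatewayMux() *runtime.ServeMux {
	return runtime.NewServeMux(
		runtime.WithForwardResponseOption(func(ctx context.Context, _ http.ResponseWriter, _ proto.Message) error {
			if pattern, ok := runtime.HTTPPathPattern(ctx); ok {
				trace.SpanFromContext(ctx).SetName(pattern)
			}
			return nil
		}),
	)
}
```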
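And a sketch of the sampler logic described above, assuming the
configured fraction is carried by a `TraceIDRatioBased` sampler (the
exact decision details in the PR may differ):

```go
package main

import (
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/trace"
)

// serverRootSampler samples spans whose parent was already sampled
// and, for root spans of kind Server, defers to the configured
// fraction. Everything else (e.g. spooler background jobs without a
// sampled parent) is dropped.
type serverRootSampler struct {
	fraction sdktrace.Sampler // e.g. sdktrace.TraceIDRatioBased(0.1)
}

func (s serverRootSampler) ShouldSample(p sdktrace.SamplingParameters) sdktrace.SamplingResult {
	psc := trace.SpanContextFromContext(p.ParentContext)
	// case 1: the (remote) parent was already sampled -> keep sampling
	if psc.IsSampled() {
		return sdktrace.SamplingResult{Decision: sdktrace.RecordAndSample, Tracestate: psc.TraceState()}
	}
	// case 2: no sampled parent -> only root spans of kind Server are
	// considered, based on the configured fraction
	if !psc.IsValid() && p.Kind == trace.SpanKindServer {
		return s.fraction.ShouldSample(p)
	}
	return sdktrace.SamplingResult{Decision: sdktrace.Drop, Tracestate: psc.TraceState()}
}

func (s serverRootSampler) Description() string {
	return "serverRootSampler"
}
```

Such a sampler would then be passed via `sdktrace.WithSampler` when
constructing the tracer provider.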
# Additional Changes
Fixed the middleware handling for serving Console. Telemetry and
instance selection are only applied to the environment.json, but not to
statically served files (see the sketch below).
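
A minimal sketch of that routing decision (the path and middleware
names are placeholders):

```go
package main

import "net/http"

// consoleHandler applies the telemetry and instance-selection
// middleware only to environment.json; all other static assets are
// served directly, bypassing both middlewares.
func consoleHandler(static http.Handler, telemetryMW, instanceMW func(http.Handler) http.Handler) http.Handler {
	environment := telemetryMW(instanceMW(static))
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/assets/environment.json" {
			environment.ServeHTTP(w, r)
			return
		}
		static.ServeHTTP(w, r)
	})
}
```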
# Additional Context
- closes #8096
- relates to #9074
- backports to at least 2.66.x, 2.67.x and 2.68.x
(cherry picked from commit 990e1982c7)
package otel

import (
	"context"
	"net/http"
	"sync"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/exporters/prometheus"
	"go.opentelemetry.io/otel/metric"
	"go.opentelemetry.io/otel/sdk/instrumentation"
	sdk_metric "go.opentelemetry.io/otel/sdk/metric"

	"github.com/zitadel/zitadel/internal/telemetry/metrics"
	otel_resource "github.com/zitadel/zitadel/internal/telemetry/otel"
	"github.com/zitadel/zitadel/internal/zerrors"
)

// Metrics implements the metrics.Metrics interface on top of the
// OpenTelemetry SDK, exposed through a Prometheus exporter.
type Metrics struct {
	Provider          metric.MeterProvider
	Meter             metric.Meter
	Counters          sync.Map
	UpDownSumObserver sync.Map
	ValueObservers    sync.Map
}

func NewMetrics(meterName string) (metrics.Metrics, error) {
	resource, err := otel_resource.ResourceWithService()
	if err != nil {
		return nil, err
	}
	exporter, err := prometheus.New()
	if err != nil {
		return nil, err
	}
	// create a view to filter out unwanted, high-cardinality attributes
	// (e.g. host names) from the metrics generated by otelhttp
	view := sdk_metric.NewView(
		sdk_metric.Instrument{
			Scope: instrumentation.Scope{Name: otelhttp.ScopeName},
		},
		sdk_metric.Stream{
			AttributeFilter: attribute.NewAllowKeysFilter("http.method", "http.status_code", "http.target"),
		},
	)
	meterProvider := sdk_metric.NewMeterProvider(
		sdk_metric.WithReader(exporter),
		sdk_metric.WithResource(resource),
		sdk_metric.WithView(view),
	)
	return &Metrics{
		Provider: meterProvider,
		Meter:    meterProvider.Meter(meterName),
	}, nil
}

func (m *Metrics) GetExporter() http.Handler {
	return promhttp.Handler()
}

func (m *Metrics) GetMetricsProvider() metric.MeterProvider {
	return m.Provider
}

// RegisterCounter creates and stores an Int64Counter; registering the
// same name twice is a no-op.
func (m *Metrics) RegisterCounter(name, description string) error {
	if _, exists := m.Counters.Load(name); exists {
		return nil
	}
	counter, err := m.Meter.Int64Counter(name, metric.WithDescription(description))
	if err != nil {
		return err
	}
	m.Counters.Store(name, counter)
	return nil
}

func (m *Metrics) AddCount(ctx context.Context, name string, value int64, labels map[string]attribute.Value) error {
	counter, exists := m.Counters.Load(name)
	if !exists {
		return zerrors.ThrowNotFound(nil, "METER-4u8fs", "Errors.Metrics.Counter.NotFound")
	}
	counter.(metric.Int64Counter).Add(ctx, value, MapToAddOption(labels)...)
	return nil
}

func (m *Metrics) RegisterUpDownSumObserver(name, description string, callbackFunc metric.Int64Callback) error {
	if _, exists := m.UpDownSumObserver.Load(name); exists {
		return nil
	}

	counter, err := m.Meter.Int64ObservableUpDownCounter(name, metric.WithInt64Callback(callbackFunc), metric.WithDescription(description))
	if err != nil {
		return err
	}

	m.UpDownSumObserver.Store(name, counter)
	return nil
}

func (m *Metrics) RegisterValueObserver(name, description string, callbackFunc metric.Int64Callback) error {
	// note: gauges share the UpDownSumObserver map; the ValueObservers
	// map is declared but currently unused
	if _, exists := m.UpDownSumObserver.Load(name); exists {
		return nil
	}

	gauge, err := m.Meter.Int64ObservableGauge(name, metric.WithInt64Callback(callbackFunc), metric.WithDescription(description))
	if err != nil {
		return err
	}

	m.UpDownSumObserver.Store(name, gauge)
	return nil
}

// MapToAddOption converts a label map into the variadic AddOptions
// expected by the counter's Add method.
func MapToAddOption(labels map[string]attribute.Value) []metric.AddOption {
	if labels == nil {
		return nil
	}
	keyValues := make([]attribute.KeyValue, 0, len(labels))
	for key, value := range labels {
		keyValues = append(keyValues, attribute.KeyValue{
			Key:   attribute.Key(key),
			Value: value,
		})
	}
	return []metric.AddOption{metric.WithAttributes(keyValues...)}
}