fix(OTEL): reduce high cardinality in traces and metrics (#9286)

# Which Problems Are Solved

There were multiple issues in the OpenTelemetry (OTEL) implementation
and usage for tracing and metrics, which lead to high cardinality and
potential memory leaks:
- wrongly initiated tracing interceptors
- high cardinality in traces:
  - HTTP/1.1 endpoints containing host names
- HTTP/1.1 endpoints containing object IDs like userID (e.g.
`/management/v1/users/2352839823/`)
- high amount of traces from internal processes (spooler)
- high cardinality in metrics endpoint:
  - GRPC entries containing host names
  - notification metrics containing instanceIDs and error messages

# How the Problems Are Solved

- Properly initialize the interceptors once and update them to use the
grpc stats handler (unary interceptors were deprecated).
- Remove host names from HTTP/1.1 span names and use path as default.
- Set / overwrite the uri for spans on the grpc-gateway with the uri
pattern (`/management/v1/users/{user_id}`). This is used for spans in
traces and metric entries.
- Created a new sampler which will only sample spans in the following
cases:
  - remote was already sampled
- remote was not sampled, root span is of kind `Server` and based on
fraction set in the runtime configuration
- This will prevent having a lot of spans from the spooler back ground
jobs if they were not started by a client call querying an object (e.g.
UserByID).
- Filter out host names and alike from OTEL generated metrics (using a
`view`).
- Removed instance and error messages from notification metrics.

# Additional Changes

Fixed the middleware handling for serving Console. Telemetry and
instance selection are only used for the environment.json, but not on
statically served files.

# Additional Context

- closes #8096
- relates to #9074
- back ports to at least 2.66.x, 2.67.x and 2.68.x

(cherry picked from commit 990e1982c7)
This commit is contained in:
Livio Spring
2025-02-04 09:55:26 +01:00
parent 92e2ba0ea8
commit 01bbcc1a48
14 changed files with 237 additions and 60 deletions

View File

@@ -1,6 +1,7 @@
package metrics
import (
"context"
"net/http"
"strings"
@@ -35,7 +36,8 @@ const (
type StatusRecorder struct {
http.ResponseWriter
Status int
RequestURI *string
Status int
}
func (r *StatusRecorder) WriteHeader(status int) {
@@ -56,6 +58,18 @@ func NewMetricsHandler(handler http.Handler, metricMethods []MetricType, ignored
return &h
}
type key int
const requestURI key = iota
func SetRequestURIPattern(ctx context.Context, pattern string) {
uri, ok := ctx.Value(requestURI).(*string)
if !ok {
return
}
*uri = pattern
}
// ServeHTTP serves HTTP requests (http.Handler)
func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
if len(h.methods) == 0 {
@@ -69,13 +83,16 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}
}
uri := strings.Split(r.RequestURI, "?")[0]
recorder := &StatusRecorder{
ResponseWriter: w,
RequestURI: &uri,
Status: 200,
}
r = r.WithContext(context.WithValue(r.Context(), requestURI, &uri))
h.handler.ServeHTTP(recorder, r)
if h.containsMetricsMethod(MetricTypeRequestCount) {
RegisterRequestCounter(r)
RegisterRequestCounter(recorder, r)
}
if h.containsMetricsMethod(MetricTypeTotalCount) {
RegisterTotalRequestCounter(r)
@@ -94,9 +111,9 @@ func (h *Handler) containsMetricsMethod(method MetricType) bool {
return false
}
func RegisterRequestCounter(r *http.Request) {
func RegisterRequestCounter(recorder *StatusRecorder, r *http.Request) {
var labels = map[string]attribute.Value{
URI: attribute.StringValue(strings.Split(r.RequestURI, "?")[0]),
URI: attribute.StringValue(*recorder.RequestURI),
Method: attribute.StringValue(r.Method),
}
RegisterCounter(RequestCounter, RequestCountDescription)
@@ -110,7 +127,7 @@ func RegisterTotalRequestCounter(r *http.Request) {
func RegisterRequestCodeCounter(recorder *StatusRecorder, r *http.Request) {
var labels = map[string]attribute.Value{
URI: attribute.StringValue(strings.Split(r.RequestURI, "?")[0]),
URI: attribute.StringValue(*recorder.RequestURI),
Method: attribute.StringValue(r.Method),
ReturnCode: attribute.IntValue(recorder.Status),
}

View File

@@ -6,9 +6,11 @@ import (
"sync"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/sdk/instrumentation"
sdk_metric "go.opentelemetry.io/otel/sdk/metric"
"github.com/zitadel/zitadel/internal/telemetry/metrics"
@@ -33,9 +35,19 @@ func NewMetrics(meterName string) (metrics.Metrics, error) {
if err != nil {
return &Metrics{}, err
}
// create a view to filter out unwanted attributes
view := sdk_metric.NewView(
sdk_metric.Instrument{
Scope: instrumentation.Scope{Name: otelhttp.ScopeName},
},
sdk_metric.Stream{
AttributeFilter: attribute.NewAllowKeysFilter("http.method", "http.status_code", "http.target"),
},
)
meterProvider := sdk_metric.NewMeterProvider(
sdk_metric.WithReader(exporter),
sdk_metric.WithResource(resource),
sdk_metric.WithView(view),
)
return &Metrics{
Provider: meterProvider,