fix(OTEL): reduce high cardinality in traces and metrics (#9286)

# Which Problems Are Solved

There were multiple issues in the OpenTelemetry (OTEL) implementation
and usage for tracing and metrics, which led to high cardinality and
potential memory leaks:
- incorrectly initialized tracing interceptors
- high cardinality in traces:
  - HTTP/1.1 endpoints containing host names
  - HTTP/1.1 endpoints containing object IDs like userID (e.g.
    `/management/v1/users/2352839823/`)
- high amount of traces from internal processes (spooler)
- high cardinality in metrics endpoint:
  - GRPC entries containing host names
  - notification metrics containing instanceIDs and error messages

# How the Problems Are Solved

- Properly initialize the interceptors once and update them to use the
grpc stats handler (unary interceptors were deprecated).
- Remove host names from HTTP/1.1 span names and use path as default.
- Set / overwrite the uri for spans on the grpc-gateway with the uri
pattern (`/management/v1/users/{user_id}`). This is used for spans in
traces and metric entries.
- Created a new sampler which will only sample spans in the following
cases:
  - remote was already sampled
  - remote was not sampled, root span is of kind `Server`, and sampling is
    based on the fraction set in the runtime configuration
- This will prevent having a lot of spans from the spooler background
jobs if they were not started by a client call querying an object (e.g.
UserByID).
- Filter out host names and the like from OTEL generated metrics (using a
`view`).
- Removed instance and error messages from notification metrics.

# Additional Changes

Fixed the middleware handling for serving Console. Telemetry and
instance selection are only used for the environment.json, but not on
statically served files.

# Additional Context

- closes #8096 
- relates to #9074
- back ports to at least 2.66.x, 2.67.x and 2.68.x
This commit is contained in:
Livio Spring
2025-02-04 09:55:26 +01:00
committed by GitHub
parent 04b9e9b144
commit 990e1982c7
14 changed files with 237 additions and 60 deletions

View File

@@ -28,7 +28,7 @@ type Tracer struct {
}
func (c *Config) NewTracer() error {
sampler := sdk_trace.ParentBased(sdk_trace.TraceIDRatioBased(c.Fraction))
sampler := otel.NewSampler(sdk_trace.TraceIDRatioBased(c.Fraction))
exporter, err := texporter.New(texporter.WithProjectID(c.ProjectID))
if err != nil {
return err

View File

@@ -26,7 +26,7 @@ type Tracer struct {
}
func (c *Config) NewTracer() error {
sampler := sdk_trace.ParentBased(sdk_trace.TraceIDRatioBased(c.Fraction))
sampler := otel.NewSampler(sdk_trace.TraceIDRatioBased(c.Fraction))
exporter, err := stdout.New(stdout.WithPrettyPrint())
if err != nil {
return err

View File

@@ -6,6 +6,7 @@ import (
otlpgrpc "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
sdk_trace "go.opentelemetry.io/otel/sdk/trace"
api_trace "go.opentelemetry.io/otel/trace"
"github.com/zitadel/zitadel/internal/telemetry/tracing"
"github.com/zitadel/zitadel/internal/zerrors"
@@ -47,7 +48,7 @@ func FractionFromConfig(i interface{}) (float64, error) {
}
func (c *Config) NewTracer() error {
sampler := sdk_trace.ParentBased(sdk_trace.TraceIDRatioBased(c.Fraction))
sampler := NewSampler(sdk_trace.TraceIDRatioBased(c.Fraction))
exporter, err := otlpgrpc.New(context.Background(), otlpgrpc.WithEndpoint(c.Endpoint), otlpgrpc.WithInsecure())
if err != nil {
return err
@@ -56,3 +57,19 @@ func (c *Config) NewTracer() error {
tracing.T, err = NewTracer(sampler, exporter)
return err
}
// NewSampler wraps the given sampler in a parent based sampler, so that the
// sampling decision depends on the parent of the span:
//   - no parent (root span): the wrapped sampler decides, but only for spans of
//     kind server; root spans of any other kind are not sampled
//   - remote parent sampled -> always sample
//   - remote parent not sampled -> the wrapped sampler decides (fraction based)
//   - local parent sampled -> always sample
//   - local parent not sampled -> never sample
func NewSampler(sampler sdk_trace.Sampler) sdk_trace.Sampler {
	// Root spans are only eligible for sampling when they are server spans.
	root := tracing.SpanKindBased(sampler, api_trace.SpanKindServer)
	// An unsampled remote parent does not rule out sampling; the wrapped sampler decides.
	remoteNotSampled := sdk_trace.WithRemoteParentNotSampled(sampler)
	return sdk_trace.ParentBased(root, remoteNotSampled)
}

View File

@@ -0,0 +1,46 @@
package tracing
import (
"fmt"
"slices"
sdk_trace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
)
// spanKindSampler is a sampler decorator which restricts sampling to a configured set of span kinds.
type spanKindSampler struct {
// sampler is the decorated sampler, consulted for spans whose kind is configured.
sampler sdk_trace.Sampler
// kinds lists the span kinds which are eligible for sampling.
kinds []trace.SpanKind
}
// ShouldSample implements the [sdk_trace.Sampler] interface.
// Spans of a configured kind are handed to the decorated sampler and its
// result is returned unchanged.
// Spans of any other kind are dropped; the trace state of the parent span
// context is passed on in that case.
func (sk spanKindSampler) ShouldSample(p sdk_trace.SamplingParameters) sdk_trace.SamplingResult {
	if slices.Contains(sk.kinds, p.Kind) {
		return sk.sampler.ShouldSample(p)
	}
	parent := trace.SpanContextFromContext(p.ParentContext)
	return sdk_trace.SamplingResult{
		Decision:   sdk_trace.Drop,
		Tracestate: parent.TraceState(),
	}
}
// Description implements the [sdk_trace.Sampler] interface.
// It reports the description of the decorated sampler along with the configured span kinds.
func (sk spanKindSampler) Description() string {
	inner := sk.sampler.Description()
	return fmt.Sprintf("SpanKindBased{sampler:%s,kinds:%v}", inner, sk.kinds)
}
// SpanKindBased decorates the given sampler with a span kind filter.
// Spans whose kind is not among the passed kinds are never sampled.
// For spans of a matching kind, the decision is left to the decorated sampler.
func SpanKindBased(sampler sdk_trace.Sampler, kinds ...trace.SpanKind) sdk_trace.Sampler {
	var sk spanKindSampler
	sk.sampler = sampler
	sk.kinds = kinds
	return sk
}

View File

@@ -0,0 +1,80 @@
package tracing
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
sdk_trace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
)
// TestSpanKindBased checks the description of the span kind based sampler and
// counts how many spans are sampled when one root span of every span kind is started.
func TestSpanKindBased(t *testing.T) {
	type args struct {
		sampler sdk_trace.Sampler
		kinds   []trace.SpanKind
	}
	type want struct {
		description string
		sampled     int
	}
	tests := []struct {
		name string
		args args
		want want
	}{
		{
			name: "never sample, no sample",
			args: args{
				sampler: sdk_trace.NeverSample(),
				kinds:   []trace.SpanKind{trace.SpanKindServer},
			},
			want: want{
				description: "SpanKindBased{sampler:AlwaysOffSampler,kinds:[server]}",
				sampled:     0,
			},
		},
		{
			name: "always sample, no kind, no sample",
			args: args{
				sampler: sdk_trace.AlwaysSample(),
				kinds:   nil,
			},
			want: want{
				description: "SpanKindBased{sampler:AlwaysOnSampler,kinds:[]}",
				sampled:     0,
			},
		},
		{
			name: "always sample, 2 kinds, 2 samples",
			args: args{
				sampler: sdk_trace.AlwaysSample(),
				kinds:   []trace.SpanKind{trace.SpanKindServer, trace.SpanKindClient},
			},
			want: want{
				description: "SpanKindBased{sampler:AlwaysOnSampler,kinds:[server client]}",
				sampled:     2,
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			sampler := SpanKindBased(tt.args.sampler, tt.args.kinds...)
			assert.Equal(t, tt.want.description, sampler.Description())

			provider := sdk_trace.NewTracerProvider(sdk_trace.WithSampler(sampler))
			tracer := provider.Tracer("test")

			// Start one root span per span kind and count the sampled ones.
			sampled := 0
			for kind := trace.SpanKindUnspecified; kind <= trace.SpanKindConsumer; kind++ {
				_, span := tracer.Start(context.Background(), "test", trace.WithSpanKind(kind))
				if span.SpanContext().IsSampled() {
					sampled++
				}
			}
			assert.Equal(t, tt.want.sampled, sampled)
		})
	}
}