Livio Spring 82cd1cee08
fix(service ping): correct endpoint, validate and randomize default interval (#10166)
# Which Problems Are Solved

The production endpoint of the service ping was wrong.
Additionally, we discussed in the sprint review that we could randomize
the default interval to prevent all systems from reporting data at the very
same time, and also require a minimum interval.

# How the Problems Are Solved

- fixed the endpoint
- If the interval is set to @daily (default), we generate a random time
(minute, hour) in cron format.
- Check that the interval is at least 30 minutes and return an error if not.
- Fixed yaml indent on `ResourceCount`

# Additional Changes

None

# Additional Context

as discussed internally
2025-07-04 13:45:15 +00:00

294 lines
8.2 KiB
Go

package serviceping
import (
"context"
"errors"
"fmt"
"math/rand"
"net/http"
"time"
"github.com/muhlemmer/gu"
"github.com/riverqueue/river"
"github.com/robfig/cron/v3"
"github.com/zitadel/logging"
"github.com/zitadel/zitadel/cmd/build"
"github.com/zitadel/zitadel/internal/eventstore"
"github.com/zitadel/zitadel/internal/query"
"github.com/zitadel/zitadel/internal/queue"
"github.com/zitadel/zitadel/internal/v2/system"
"github.com/zitadel/zitadel/internal/zerrors"
analytics "github.com/zitadel/zitadel/pkg/grpc/analytics/v2beta"
)
const (
	// QueueName is the name of the queue the service ping worker is registered on
	// and that report jobs are inserted into.
	QueueName = "service_ping_report"
	// minInterval is the smallest interval between two runs that
	// parseAndValidateSchedule accepts.
	minInterval = 30 * time.Minute
)
var (
	// ErrInvalidReportType is the error a job is cancelled with by Work
	// when its report type is not known.
	ErrInvalidReportType = errors.New("invalid report type")
	// Compile-time check that *Worker satisfies river.Worker for ServicePingReport jobs.
	_ river.Worker[*ServicePingReport] = (*Worker)(nil)
)
// Worker handles ServicePingReport jobs. Depending on the report type of a job,
// it reports base information or resource counts through the telemetry client.
type Worker struct {
	river.WorkerDefaults[*ServicePingReport]
	// reportClient is the telemetry client the reports are sent with.
	reportClient analytics.TelemetryServiceClient
	// db provides the instances and resource counts that get reported.
	db Queries
	// queue is used to insert the follow-up report jobs.
	queue Queue
	// config holds the telemetry settings (e.g. resource count bulk size) and the max attempts for jobs.
	config *Config
	// systemID is sent as SystemId with every report.
	systemID string
	// version is sent as Version with the base information report.
	version string
}
// Queries is the subset of database queries the Worker needs to build its reports.
type Queries interface {
	// SearchInstances returns the instances matching the given queries.
	SearchInstances(ctx context.Context, queries *query.InstanceSearchQueries) (*query.Instances, error)
	// ListResourceCounts returns a batch of at most size resource counts.
	// The worker passes the ID of the last count of the previous batch as lastID
	// (0 for the first batch) — presumably only counts after that ID are returned; confirm against the implementation.
	ListResourceCounts(ctx context.Context, lastID int, size int) ([]query.ResourceCount, error)
}
// Queue is the subset of the job queue the Worker needs to enqueue follow-up report jobs.
type Queue interface {
	// Insert adds a job with the given args to the queue, configured by the passed options.
	Insert(ctx context.Context, args river.JobArgs, opts ...queue.InsertOpt) error
}
// Register implements the [queue.Worker] interface.
// It adds the worker to the passed river workers and sets up the
// configuration of the service ping queue.
func (w *Worker) Register(workers *river.Workers, queues map[string]river.QueueConfig) {
	river.AddWorker[*ServicePingReport](workers, w)

	// for now, only a single worker is used to prevent too many side effects on other queues
	var queueConfig river.QueueConfig
	queueConfig.MaxWorkers = 1
	queues[QueueName] = queueConfig
}
// Work implements the [river.Worker] interface.
//
// Jobs of type ReportTypeBaseInformation report the base information and
// afterwards enqueue the follow-up report jobs with the obtained report ID.
// Jobs of type ReportTypeResourceCounts report the resource counts.
// Any other report type is logged and the job is cancelled with ErrInvalidReportType.
func (w *Worker) Work(ctx context.Context, job *river.Job[*ServicePingReport]) (err error) {
	// Whatever gets returned below is passed through handleClientError,
	// which decides whether the job is cancelled or left for a retry.
	defer func() { err = w.handleClientError(err) }()

	reportType, reportID := job.Args.ReportType, job.Args.ReportID

	if reportType == ReportTypeBaseInformation {
		newReportID, reportErr := w.reportBaseInformation(ctx)
		if reportErr != nil {
			return reportErr
		}
		return w.createReportJobs(ctx, newReportID)
	}
	if reportType == ReportTypeResourceCounts {
		return w.reportResourceCounts(ctx, reportID)
	}

	logging.WithFields("reportType", reportType, "reportID", reportID).
		Error("unknown job type")
	return river.JobCancel(ErrInvalidReportType)
}
// reportBaseInformation loads all instances, sends them together with the
// system ID and version to the telemetry service and returns the report ID
// of the response.
func (w *Worker) reportBaseInformation(ctx context.Context) (string, error) {
	instances, err := w.db.SearchInstances(ctx, new(query.InstanceSearchQueries))
	if err != nil {
		return "", err
	}

	request := &analytics.ReportBaseInformationRequest{
		SystemId:  w.systemID,
		Version:   w.version,
		Instances: instanceInformationToPb(instances),
	}
	response, err := w.reportClient.ReportBaseInformation(ctx, request)
	if err != nil {
		return "", err
	}
	return response.GetReportId(), nil
}
// reportResourceCounts sends the resource counts to the telemetry service in
// batches of the configured bulk size.
//
// It stops without an error when the context is done, when the database
// returns no more counts, or after a batch smaller than the bulk size was sent.
// Errors of the database or the telemetry client are returned as they are.
func (w *Worker) reportResourceCounts(ctx context.Context, reportID string) error {
	// ID of the last resource count that was already reported
	cursor := 0
	for {
		// a cancelled context ends the reporting silently
		select {
		case <-ctx.Done():
			return nil
		default:
		}

		bulkSize := w.config.Telemetry.ResourceCount.BulkSize
		counts, err := w.db.ListResourceCounts(ctx, cursor, bulkSize)
		if err != nil {
			return err
		}
		// nothing left to report
		if len(counts) == 0 {
			return nil
		}

		request := &analytics.ReportResourceCountsRequest{
			SystemId:       w.systemID,
			ResourceCounts: resourceCountsToPb(counts),
		}
		if reportID != "" {
			request.ReportId = gu.Ptr(reportID)
		}
		response, err := w.reportClient.ReportResourceCounts(ctx, request)
		if err != nil {
			return err
		}

		// a batch smaller than the bulk size means the end of the
		// resource counts was reached
		if len(counts) < bulkSize {
			return nil
		}

		// continue after the last reported count
		cursor = counts[len(counts)-1].ID
		// The server may hand out a report ID (e.g. on the first call of a report);
		// if so, it is used for the following batches.
		if returnedID := response.GetReportId(); returnedID != "" {
			reportID = returnedID
		}
	}
}
// handleClientError decides whether a failed job is retried or cancelled.
//
// Errors that are not a TelemetryError are returned unchanged, as are
// TelemetryErrors with a status code not listed below; the queue treats those
// as a failed attempt and may retry. For status codes where a retry does not
// make sense, the error is wrapped with river.JobCancel.
func (w *Worker) handleClientError(err error) error {
	var telemetryErr *TelemetryError
	if !errors.As(err, &telemetryErr) {
		// not a TelemetryError: assumed to be transient, the queue can retry
		return err
	}

	switch telemetryErr.StatusCode {
	case http.StatusBadRequest,
		http.StatusNotFound,
		http.StatusNotImplemented,
		http.StatusConflict,
		http.StatusPreconditionFailed:
		// retrying cannot fix these, so the job is cancelled
		return river.JobCancel(err)
	}

	// as of now, every other status code is assumed to be transient
	return err
}
// createReportJobs enqueues the follow-up report jobs for the given report ID.
// Currently this is only the resource count report, if it is enabled in the config.
// All errors that occurred while enqueuing are joined into the returned error.
func (w *Worker) createReportJobs(ctx context.Context, reportID string) error {
	var insertErrs []error

	if w.config.Telemetry.ResourceCount.Enabled {
		if err := w.addReportJob(ctx, reportID, ReportTypeResourceCounts); err != nil {
			insertErrs = append(insertErrs, err)
		}
	}

	return errors.Join(insertErrs...)
}
// addReportJob inserts a job of the given report type for the given report ID
// into the service ping queue, limited to the configured max attempts.
func (w *Worker) addReportJob(ctx context.Context, reportID string, reportType ReportType) error {
	insertOpts := []queue.InsertOpt{
		queue.WithQueueName(QueueName),
		queue.WithMaxAttempts(w.config.MaxAttempts),
	}
	report := &ServicePingReport{ReportType: reportType, ReportID: reportID}
	return w.queue.Insert(ctx, report, insertOpts...)
}
// systemIDReducer is a query reducer that extracts the system ID
// from the system's ID generated events.
type systemIDReducer struct {
	// id is the ID of the last IDGeneratedEvent that was appended.
	id string
}
// Reduce is a no-op, because the ID is already taken over in AppendEvents.
// It always returns nil.
func (*systemIDReducer) Reduce() error { return nil }
// AppendEvents stores the ID of every passed [system.IDGeneratedEvent],
// so that the ID of the last such event wins. Events of other types are ignored.
func (s *systemIDReducer) AppendEvents(events ...eventstore.Event) {
	for i := range events {
		switch e := events[i].(type) {
		case *system.IDGeneratedEvent:
			s.id = e.ID
		}
	}
}
// Query builds the eventstore search query for the reducer: events of the
// system aggregate type with the ID generated event type.
func (s *systemIDReducer) Query() *eventstore.SearchQueryBuilder {
	builder := eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent)
	idGenerated := builder.AddQuery().
		AggregateTypes(system.AggregateType).
		EventTypes(system.IDGeneratedType)
	return idGenerated.Builder()
}
// Register adds the service ping Worker to the queue.
//
// If the service ping is disabled in the config, nothing is registered.
// Otherwise the system ID is read from the eventstore first; an error of
// that lookup is returned and no worker is added.
func Register(
	ctx context.Context,
	q *queue.Queue,
	queries *query.Queries,
	eventstoreClient *eventstore.Eventstore,
	config *Config,
) error {
	if !config.Enabled {
		return nil
	}

	var reducer systemIDReducer
	if err := eventstoreClient.FilterToQueryReducer(ctx, &reducer); err != nil {
		return err
	}

	worker := &Worker{
		reportClient: NewClient(config),
		db:           queries,
		queue:        q,
		config:       config,
		systemID:     reducer.id,
		version:      build.Version(),
	}
	q.AddWorkers(worker)
	return nil
}
// Start schedules the periodic service ping job on the queue.
//
// If the service ping is disabled in the config, nothing is scheduled.
// An invalid interval results in the error of parseAndValidateSchedule.
func Start(config *Config, q *queue.Queue) error {
	if !config.Enabled {
		return nil
	}

	schedule, err := parseAndValidateSchedule(config.Interval)
	if err != nil {
		return err
	}

	onServicePingQueue := queue.WithQueueName(QueueName)
	withMaxAttempts := queue.WithMaxAttempts(config.MaxAttempts)
	q.AddPeriodicJob(schedule, new(ServicePingReport), onServicePingQueue, withMaxAttempts)
	return nil
}
// parseAndValidateSchedule parses interval with [cron.ParseStandard] and
// verifies that consecutive runs of the resulting schedule are at least
// minInterval apart.
//
// The "@daily" interval is replaced by a randomized time of day before parsing,
// to prevent all systems from reporting at the same time.
//
// It returns an invalid argument error if the interval cannot be parsed,
// or if two consecutive runs are less than minInterval apart.
func parseAndValidateSchedule(interval string) (cron.Schedule, error) {
	if interval == "@daily" {
		interval = randomizeDaily()
	}
	schedule, err := cron.ParseStandard(interval)
	if err != nil {
		return nil, zerrors.ThrowInvalidArgument(err, "SERV-NJqiof", "invalid interval")
	}
	var intervalDuration time.Duration
	switch s := schedule.(type) {
	case *cron.SpecSchedule:
		// The time until the first run says nothing about the interval: a schedule
		// may fire in less than 30 minutes and still run only once a day.
		// Therefore the gaps between consecutive runs are measured.
		//
		// A single gap is not enough either, because which gap is seen first depends
		// on the current time: "0,10 12 * * *" would be accepted whenever the process
		// starts between 12:00 and 12:10. So the smallest gap over a bounded number of
		// consecutive runs is used. As long as all gaps are valid, a day has at most
		// 24h/minInterval runs, so the bound below spans at least two full days of runs.
		const maxGaps = 2*int(24*time.Hour/minInterval) + 1
		prev := s.Next(time.Now())
		for i := 0; i < maxGaps && !prev.IsZero(); i++ {
			next := s.Next(prev)
			// a zero time means the schedule has no further run
			if next.IsZero() {
				break
			}
			if gap := next.Sub(prev); i == 0 || gap < intervalDuration {
				intervalDuration = gap
			}
			// the first gap that is too short already decides the outcome
			if intervalDuration < minInterval {
				break
			}
			prev = next
		}
	case cron.ConstantDelaySchedule:
		intervalDuration = s.Delay
	}
	// Schedules of any other type, or without two consecutive runs,
	// keep a zero duration and are rejected as well.
	if intervalDuration < minInterval {
		return nil, zerrors.ThrowInvalidArgumentf(nil, "SERV-FJ12", "interval must be at least %s", minInterval)
	}
	logging.WithFields("interval", interval).Info("scheduling service ping")
	return schedule, nil
}
// randomizeDaily returns a cron expression that runs once a day at a random
// minute and hour, so that not all systems send their report at the same time.
func randomizeDaily() string {
	// cron fields: minute hour day-of-month month day-of-week
	const dailyAt = "%d %d * * *"

	minute, hour := rand.Intn(60), rand.Intn(24)
	return fmt.Sprintf(dailyAt, minute, hour)
}