zitadel/internal/serviceping/worker.go

package serviceping

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net/http"
	"time"

	"github.com/muhlemmer/gu"
	"github.com/riverqueue/river"
	"github.com/robfig/cron/v3"
	"github.com/zitadel/logging"

	"github.com/zitadel/zitadel/cmd/build"
	"github.com/zitadel/zitadel/internal/eventstore"
	"github.com/zitadel/zitadel/internal/query"
	"github.com/zitadel/zitadel/internal/queue"
	"github.com/zitadel/zitadel/internal/v2/system"
	"github.com/zitadel/zitadel/internal/zerrors"
	analytics "github.com/zitadel/zitadel/pkg/grpc/analytics/v2beta"
)

const (
	QueueName   = "service_ping_report"
	minInterval = 30 * time.Minute
)

var (
	ErrInvalidReportType = errors.New("invalid report type")

	_ river.Worker[*ServicePingReport] = (*Worker)(nil)
)

type Worker struct {
	river.WorkerDefaults[*ServicePingReport]

	reportClient analytics.TelemetryServiceClient
	db           Queries
	queue        Queue

	config   *Config
	systemID string
	version  string
}

type Queries interface {
	SearchInstances(ctx context.Context, queries *query.InstanceSearchQueries) (*query.Instances, error)
	ListResourceCounts(ctx context.Context, lastID int, size int) ([]query.ResourceCount, error)
}

type Queue interface {
	Insert(ctx context.Context, args river.JobArgs, opts ...queue.InsertOpt) error
}

// Register implements the [queue.Worker] interface.
func (w *Worker) Register(workers *river.Workers, queues map[string]river.QueueConfig) {
	river.AddWorker[*ServicePingReport](workers, w)
	queues[QueueName] = river.QueueConfig{
		MaxWorkers: 1, // for now, we only use a single worker to prevent too much side effects on other queues
	}
}

// Work implements the [river.Worker] interface.
func (w *Worker) Work(ctx context.Context, job *river.Job[*ServicePingReport]) (err error) {
	defer func() {
		err = w.handleClientError(err)
	}()
	switch job.Args.ReportType {
	case ReportTypeBaseInformation:
		reportID, err := w.reportBaseInformation(ctx)
		if err != nil {
			return err
		}
		return w.createReportJobs(ctx, reportID)
	case ReportTypeResourceCounts:
		return w.reportResourceCounts(ctx, job.Args.ReportID)
	default:
		logging.WithFields("reportType", job.Args.ReportType, "reportID", job.Args.ReportID).
			Error("unknown job type")
		return river.JobCancel(ErrInvalidReportType)
	}
}

func (w *Worker) reportBaseInformation(ctx context.Context) (string, error) {
	instances, err := w.db.SearchInstances(ctx, &query.InstanceSearchQueries{})
	if err != nil {
		return "", err
	}
	instanceInformation := instanceInformationToPb(instances)
	resp, err := w.reportClient.ReportBaseInformation(ctx, &analytics.ReportBaseInformationRequest{
		SystemId:  w.systemID,
		Version:   w.version,
		Instances: instanceInformation,
	})
	if err != nil {
		return "", err
	}
	return resp.GetReportId(), nil
}

func (w *Worker) reportResourceCounts(ctx context.Context, reportID string) error {
	lastID := 0
	// iterate over the resource counts until there are no more counts to report
	// or the context gets cancelled
	for {
		select {
		case <-ctx.Done():
			return nil
		default:
			counts, err := w.db.ListResourceCounts(ctx, lastID, w.config.Telemetry.ResourceCount.BulkSize)
			if err != nil {
				return err
			}
			// if there are no counts, we can stop the loop
			if len(counts) == 0 {
				return nil
			}
			request := &analytics.ReportResourceCountsRequest{
				SystemId:       w.systemID,
				ResourceCounts: resourceCountsToPb(counts),
			}
			if reportID != "" {
				request.ReportId = gu.Ptr(reportID)
			}
			resp, err := w.reportClient.ReportResourceCounts(ctx, request)
			if err != nil {
				return err
			}
			// in case the resource counts returned by the database are less than the bulk size,
			// we can assume that we have reached the end of the resource counts and can stop the loop
			if len(counts) < w.config.Telemetry.ResourceCount.BulkSize {
				return nil
			}
			// update the lastID for the next iteration
			lastID = counts[len(counts)-1].ID
			// In case we get a report ID back from the server (it could be the first call of the report),
			// we update it to use it for the next batch.
			if resp.GetReportId() != "" && resp.GetReportId() != reportID {
				reportID = resp.GetReportId()
			}
		}
	}
}

func (w *Worker) handleClientError(err error) error {
	telemetryError := new(TelemetryError)
	if !errors.As(err, &telemetryError) {
		// If the error is not a TelemetryError, we can assume that it is a transient error
		// and can be retried by the queue.
		return err
	}
	switch telemetryError.StatusCode {
	case http.StatusBadRequest,
		http.StatusNotFound,
		http.StatusNotImplemented,
		http.StatusConflict,
		http.StatusPreconditionFailed:
		// In case of these errors, we can assume that a retry does not make sense,
		// so we can cancel the job.
		return river.JobCancel(err)
	default:
		// As of now we assume that all other errors are transient and can be retried.
		// So we just return the error, which will be handled by the queue as a failed attempt.
		return err
	}
}

func (w *Worker) createReportJobs(ctx context.Context, reportID string) error {
	errs := make([]error, 0)
	if w.config.Telemetry.ResourceCount.Enabled {
		err := w.addReportJob(ctx, reportID, ReportTypeResourceCounts)
		if err != nil {
			errs = append(errs, err)
		}
	}
	return errors.Join(errs...)
}

func (w *Worker) addReportJob(ctx context.Context, reportID string, reportType ReportType) error {
	job := &ServicePingReport{
		ReportID:   reportID,
		ReportType: reportType,
	}
	return w.queue.Insert(ctx, job,
		queue.WithQueueName(QueueName),
		queue.WithMaxAttempts(w.config.MaxAttempts),
	)
}

type systemIDReducer struct {
	id string
}

func (s *systemIDReducer) Reduce() error {
	return nil
}

func (s *systemIDReducer) AppendEvents(events ...eventstore.Event) {
	for _, event := range events {
		if idEvent, ok := event.(*system.IDGeneratedEvent); ok {
			s.id = idEvent.ID
		}
	}
}

func (s *systemIDReducer) Query() *eventstore.SearchQueryBuilder {
	return eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
		AddQuery().
		AggregateTypes(system.AggregateType).
		EventTypes(system.IDGeneratedType).
		Builder()
}

func Register(
	ctx context.Context,
	q *queue.Queue,
	queries *query.Queries,
	eventstoreClient *eventstore.Eventstore,
	config *Config,
) error {
	if !config.Enabled {
		return nil
	}
	systemID := new(systemIDReducer)
	err := eventstoreClient.FilterToQueryReducer(ctx, systemID)
	if err != nil {
		return err
	}
	q.AddWorkers(&Worker{
		reportClient: NewClient(config),
		db:           queries,
		queue:        q,
		config:       config,
		systemID:     systemID.id,
		version:      build.Version(),
	})
	return nil
}

func Start(config *Config, q *queue.Queue) error {
	if !config.Enabled {
		return nil
	}
	schedule, err := parseAndValidateSchedule(config.Interval)
	if err != nil {
		return err
	}
	q.AddPeriodicJob(
		schedule,
		&ServicePingReport{},
		queue.WithQueueName(QueueName),
		queue.WithMaxAttempts(config.MaxAttempts),
	)
	return nil
}

func parseAndValidateSchedule(interval string) (cron.Schedule, error) {
	if interval == "@daily" {
		interval = randomizeDaily()
	}
	schedule, err := cron.ParseStandard(interval)
	if err != nil {
		return nil, zerrors.ThrowInvalidArgument(err, "SERV-NJqiof", "invalid interval")
	}
	var intervalDuration time.Duration
	switch s := schedule.(type) {
	case *cron.SpecSchedule:
		// For cron.SpecSchedule, we need to calculate the interval duration
		// by getting the next time and subtracting it from the time after that.
		// This is because the schedule could be a specific time, that is less than 30 minutes away,
		// but still run only once a day and therefore is valid.
		next := s.Next(time.Now())
		nextAfter := s.Next(next)
		intervalDuration = nextAfter.Sub(next)
	case cron.ConstantDelaySchedule:
		intervalDuration = s.Delay
	}
	if intervalDuration < minInterval {
		return nil, zerrors.ThrowInvalidArgumentf(nil, "SERV-FJ12", "interval must be at least %s", minInterval)
	}
	logging.WithFields("interval", interval).Info("scheduling service ping")
	return schedule, nil
}

// randomizeDaily generates a random time for the daily cron job
// to prevent all systems from sending the report at the same time.
func randomizeDaily() string {
	minute := rand.Intn(60)
	hour := rand.Intn(24)
	return fmt.Sprintf("%d %d * * *", minute, hour)
}