|
|
|
package serviceping
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"errors"
|
2025-07-04 09:45:15 -04:00
|
|
|
"fmt"
|
|
|
|
"math/rand"
|
2025-07-02 07:57:41 -04:00
|
|
|
"net/http"
|
2025-07-04 09:45:15 -04:00
|
|
|
"time"
|
2025-07-02 07:57:41 -04:00
|
|
|
|
|
|
|
"github.com/muhlemmer/gu"
|
|
|
|
"github.com/riverqueue/river"
|
|
|
|
"github.com/robfig/cron/v3"
|
|
|
|
"github.com/zitadel/logging"
|
|
|
|
|
|
|
|
"github.com/zitadel/zitadel/cmd/build"
|
|
|
|
"github.com/zitadel/zitadel/internal/eventstore"
|
|
|
|
"github.com/zitadel/zitadel/internal/query"
|
|
|
|
"github.com/zitadel/zitadel/internal/queue"
|
|
|
|
"github.com/zitadel/zitadel/internal/v2/system"
|
2025-07-04 09:45:15 -04:00
|
|
|
"github.com/zitadel/zitadel/internal/zerrors"
|
2025-07-02 07:57:41 -04:00
|
|
|
analytics "github.com/zitadel/zitadel/pkg/grpc/analytics/v2beta"
|
|
|
|
)
|
|
|
|
|
|
|
|
const (
	// QueueName is the name of the dedicated queue for service ping report jobs.
	QueueName = "service_ping_report"

	// minInterval is the minimum allowed duration between two service ping
	// reports; shorter schedules are rejected by parseAndValidateSchedule.
	minInterval = 30 * time.Minute
)
|
|
|
|
|
|
|
|
var (
	// ErrInvalidReportType is returned (wrapped in a job cancellation) when a
	// job with an unknown report type is processed.
	ErrInvalidReportType = errors.New("invalid report type")

	// compile-time check that *Worker satisfies the river.Worker interface
	_ river.Worker[*ServicePingReport] = (*Worker)(nil)
)
|
|
|
|
|
|
|
|
// Worker processes service ping report jobs: it gathers information from the
// database and reports it to the telemetry service.
type Worker struct {
	river.WorkerDefaults[*ServicePingReport]

	// reportClient sends the reports to the telemetry service.
	reportClient analytics.TelemetryServiceClient
	// db provides the instance and resource count queries.
	db Queries
	// queue is used to insert follow-up report jobs.
	queue Queue

	// config holds the service ping configuration (telemetry options, max attempts, ...).
	config *Config
	// systemID identifies this system in all reports.
	systemID string
	// version is the build version reported alongside the data.
	version string
}
|
|
|
|
|
|
|
|
// Queries abstracts the database queries needed by the [Worker].
type Queries interface {
	// SearchInstances returns the instances matching the given search queries.
	SearchInstances(ctx context.Context, queries *query.InstanceSearchQueries) (*query.Instances, error)
	// ListResourceCounts returns up to size resource counts, starting after
	// lastID (used for pagination; presumably the last ID of the previous page).
	ListResourceCounts(ctx context.Context, lastID int, size int) ([]query.ResourceCount, error)
}
|
|
|
|
|
|
|
|
// Queue abstracts the job queue used by the [Worker] to insert follow-up
// report jobs.
type Queue interface {
	Insert(ctx context.Context, args river.JobArgs, opts ...queue.InsertOpt) error
}
|
|
|
|
|
|
|
|
// Register implements the [queue.Worker] interface.
|
|
|
|
func (w *Worker) Register(workers *river.Workers, queues map[string]river.QueueConfig) {
|
|
|
|
river.AddWorker[*ServicePingReport](workers, w)
|
|
|
|
queues[QueueName] = river.QueueConfig{
|
|
|
|
MaxWorkers: 1, // for now, we only use a single worker to prevent too much side effects on other queues
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Work implements the [river.Worker] interface.
|
|
|
|
func (w *Worker) Work(ctx context.Context, job *river.Job[*ServicePingReport]) (err error) {
|
|
|
|
defer func() {
|
|
|
|
err = w.handleClientError(err)
|
|
|
|
}()
|
|
|
|
switch job.Args.ReportType {
|
|
|
|
case ReportTypeBaseInformation:
|
|
|
|
reportID, err := w.reportBaseInformation(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return w.createReportJobs(ctx, reportID)
|
|
|
|
case ReportTypeResourceCounts:
|
|
|
|
return w.reportResourceCounts(ctx, job.Args.ReportID)
|
|
|
|
default:
|
|
|
|
logging.WithFields("reportType", job.Args.ReportType, "reportID", job.Args.ReportID).
|
|
|
|
Error("unknown job type")
|
|
|
|
return river.JobCancel(ErrInvalidReportType)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (w *Worker) reportBaseInformation(ctx context.Context) (string, error) {
|
|
|
|
instances, err := w.db.SearchInstances(ctx, &query.InstanceSearchQueries{})
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
instanceInformation := instanceInformationToPb(instances)
|
|
|
|
resp, err := w.reportClient.ReportBaseInformation(ctx, &analytics.ReportBaseInformationRequest{
|
|
|
|
SystemId: w.systemID,
|
|
|
|
Version: w.version,
|
|
|
|
Instances: instanceInformation,
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
return resp.GetReportId(), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (w *Worker) reportResourceCounts(ctx context.Context, reportID string) error {
|
|
|
|
lastID := 0
|
|
|
|
// iterate over the resource counts until there are no more counts to report
|
|
|
|
// or the context gets cancelled
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return nil
|
|
|
|
default:
|
|
|
|
counts, err := w.db.ListResourceCounts(ctx, lastID, w.config.Telemetry.ResourceCount.BulkSize)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// if there are no counts, we can stop the loop
|
|
|
|
if len(counts) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
request := &analytics.ReportResourceCountsRequest{
|
|
|
|
SystemId: w.systemID,
|
|
|
|
ResourceCounts: resourceCountsToPb(counts),
|
|
|
|
}
|
|
|
|
if reportID != "" {
|
|
|
|
request.ReportId = gu.Ptr(reportID)
|
|
|
|
}
|
|
|
|
resp, err := w.reportClient.ReportResourceCounts(ctx, request)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// in case the resource counts returned by the database are less than the bulk size,
|
|
|
|
// we can assume that we have reached the end of the resource counts and can stop the loop
|
|
|
|
if len(counts) < w.config.Telemetry.ResourceCount.BulkSize {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
// update the lastID for the next iteration
|
|
|
|
lastID = counts[len(counts)-1].ID
|
|
|
|
// In case we get a report ID back from the server (it could be the first call of the report),
|
|
|
|
// we update it to use it for the next batch.
|
|
|
|
if resp.GetReportId() != "" && resp.GetReportId() != reportID {
|
|
|
|
reportID = resp.GetReportId()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (w *Worker) handleClientError(err error) error {
|
|
|
|
telemetryError := new(TelemetryError)
|
|
|
|
if !errors.As(err, &telemetryError) {
|
|
|
|
// If the error is not a TelemetryError, we can assume that it is a transient error
|
|
|
|
// and can be retried by the queue.
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
switch telemetryError.StatusCode {
|
|
|
|
case http.StatusBadRequest,
|
|
|
|
http.StatusNotFound,
|
|
|
|
http.StatusNotImplemented,
|
|
|
|
http.StatusConflict,
|
|
|
|
http.StatusPreconditionFailed:
|
|
|
|
// In case of these errors, we can assume that a retry does not make sense,
|
|
|
|
// so we can cancel the job.
|
|
|
|
return river.JobCancel(err)
|
|
|
|
default:
|
|
|
|
// As of now we assume that all other errors are transient and can be retried.
|
|
|
|
// So we just return the error, which will be handled by the queue as a failed attempt.
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (w *Worker) createReportJobs(ctx context.Context, reportID string) error {
|
|
|
|
errs := make([]error, 0)
|
|
|
|
if w.config.Telemetry.ResourceCount.Enabled {
|
|
|
|
err := w.addReportJob(ctx, reportID, ReportTypeResourceCounts)
|
|
|
|
if err != nil {
|
|
|
|
errs = append(errs, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return errors.Join(errs...)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (w *Worker) addReportJob(ctx context.Context, reportID string, reportType ReportType) error {
|
|
|
|
job := &ServicePingReport{
|
|
|
|
ReportID: reportID,
|
|
|
|
ReportType: reportType,
|
|
|
|
}
|
|
|
|
return w.queue.Insert(ctx, job,
|
|
|
|
queue.WithQueueName(QueueName),
|
|
|
|
queue.WithMaxAttempts(w.config.MaxAttempts),
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
// systemIDReducer extracts the system ID from the eventstore by filtering
// for the system ID generated event.
type systemIDReducer struct {
	// id holds the system ID taken from the last IDGeneratedEvent seen.
	id string
}
|
|
|
|
|
|
|
|
// Reduce is a no-op; all state changes happen directly in AppendEvents.
func (s *systemIDReducer) Reduce() error {
	return nil
}
|
|
|
|
|
|
|
|
func (s *systemIDReducer) AppendEvents(events ...eventstore.Event) {
|
|
|
|
for _, event := range events {
|
|
|
|
if idEvent, ok := event.(*system.IDGeneratedEvent); ok {
|
|
|
|
s.id = idEvent.ID
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Query returns the search query that filters for the system ID generated
// event on the system aggregate.
func (s *systemIDReducer) Query() *eventstore.SearchQueryBuilder {
	return eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
		AddQuery().
		AggregateTypes(system.AggregateType).
		EventTypes(system.IDGeneratedType).
		Builder()
}
|
|
|
|
|
|
|
|
func Register(
|
|
|
|
ctx context.Context,
|
|
|
|
q *queue.Queue,
|
|
|
|
queries *query.Queries,
|
|
|
|
eventstoreClient *eventstore.Eventstore,
|
|
|
|
config *Config,
|
|
|
|
) error {
|
|
|
|
if !config.Enabled {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
systemID := new(systemIDReducer)
|
|
|
|
err := eventstoreClient.FilterToQueryReducer(ctx, systemID)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
q.AddWorkers(&Worker{
|
|
|
|
reportClient: NewClient(config),
|
|
|
|
db: queries,
|
|
|
|
queue: q,
|
|
|
|
config: config,
|
|
|
|
systemID: systemID.id,
|
|
|
|
version: build.Version(),
|
|
|
|
})
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func Start(config *Config, q *queue.Queue) error {
|
|
|
|
if !config.Enabled {
|
|
|
|
return nil
|
|
|
|
}
|
2025-07-04 09:45:15 -04:00
|
|
|
schedule, err := parseAndValidateSchedule(config.Interval)
|
2025-07-02 07:57:41 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
q.AddPeriodicJob(
|
|
|
|
schedule,
|
|
|
|
&ServicePingReport{},
|
|
|
|
queue.WithQueueName(QueueName),
|
|
|
|
queue.WithMaxAttempts(config.MaxAttempts),
|
|
|
|
)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func parseAndValidateSchedule(interval string) (cron.Schedule, error) {
|
|
|
|
if interval == "@daily" {
|
|
|
|
interval = randomizeDaily()
|
|
|
|
}
|
|
|
|
schedule, err := cron.ParseStandard(interval)
|
|
|
|
if err != nil {
|
|
|
|
return nil, zerrors.ThrowInvalidArgument(err, "SERV-NJqiof", "invalid interval")
|
|
|
|
}
|
|
|
|
var intervalDuration time.Duration
|
|
|
|
switch s := schedule.(type) {
|
|
|
|
case *cron.SpecSchedule:
|
|
|
|
// For cron.SpecSchedule, we need to calculate the interval duration
|
|
|
|
// by getting the next time and subtracting it from the time after that.
|
|
|
|
// This is because the schedule could be a specific time, that is less than 30 minutes away,
|
|
|
|
// but still run only once a day and therefore is valid.
|
|
|
|
next := s.Next(time.Now())
|
|
|
|
nextAfter := s.Next(next)
|
|
|
|
intervalDuration = nextAfter.Sub(next)
|
|
|
|
case cron.ConstantDelaySchedule:
|
|
|
|
intervalDuration = s.Delay
|
|
|
|
}
|
|
|
|
if intervalDuration < minInterval {
|
|
|
|
return nil, zerrors.ThrowInvalidArgumentf(nil, "SERV-FJ12", "interval must be at least %s", minInterval)
|
|
|
|
}
|
|
|
|
logging.WithFields("interval", interval).Info("scheduling service ping")
|
|
|
|
return schedule, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// randomizeDaily generates a random time for the daily cron job
|
|
|
|
// to prevent all systems from sending the report at the same time.
|
|
|
|
func randomizeDaily() string {
|
|
|
|
minute := rand.Intn(60)
|
|
|
|
hour := rand.Intn(24)
|
|
|
|
return fmt.Sprintf("%d %d * * *", minute, hour)
|
|
|
|
}
|