refactor(handler): cache active instances (#9008)

# Which Problems Are Solved

Scheduled handlers use `eventstore.InstanceIDs` to get all active
instances within a given timeframe. This function scans through all
events written within that timeframe, which can cause heavy load on the
database.

# How the Problems Are Solved

A new query cache `activeInstances` is introduced, which caches the IDs
of all instances queried by ID or host within the configured timeframe.
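
For context, this is the shape of that cache: a size-bounded, expiring LRU from `github.com/hashicorp/golang-lru/v2/expirable` (the library used in this change). A minimal sketch; the capacity and TTL below are illustrative, not the shipped defaults:

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/golang-lru/v2/expirable"
)

func main() {
	// Keyed by instance ID; the value carries no information,
	// so a bool placeholder is enough.
	active := expirable.NewLRU[string, bool](1000, nil, 15*time.Minute)

	// Every successful lookup by ID or host marks the instance active.
	active.Add("instance-1", true)
	active.Add("instance-2", true)

	// Scheduled handlers read the current key set instead of
	// scanning the eventstore for recently written events.
	fmt.Println(active.Keys())
}
```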

# Additional Changes

- Changed `default.yaml`
  - Removed `HandleActiveInstances` from custom handler configs
  - Added `MaxActiveInstances` to define the maximum number of cached
    instance IDs
- Fixed start-from-init and start-from-setup so that auth and admin
projections are no longer started twice (see the sketch after this list)
- Fixed org cache invalidation to use the correct index
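
The double-start fix relies on resetting the package-level projections slice before registering; a minimal sketch of the failure mode it prevents (generic names, not the actual handler types):

```go
var projections []string

func register(names ...string) {
	// register runs once for start-from-init and again for
	// start-from-setup; without this reset the second call would
	// append a duplicate of every projection, starting each twice.
	projections = nil
	projections = append(projections, names...)
}
```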

# Additional Context

- part of #8999
Authored by Silvan on 2024-12-06 12:32:53 +01:00, committed by GitHub.
parent a81d42a61a, commit 77cd430b3a
25 changed files with 181 additions and 188 deletions

View File

@@ -18,9 +18,11 @@ type Config struct {
BulkLimit uint64
FailureCountUntilSkip uint64
HandleActiveInstances time.Duration
TransactionDuration time.Duration
Handlers map[string]*ConfigOverwrites
ActiveInstancer interface {
ActiveInstances() []string
}
}
type ConfigOverwrites struct {
@@ -34,6 +36,9 @@ func Register(ctx context.Context, config Config, view *view.View, static static
return
}
// make sure the slice does not contain old values
projections = nil
projections = append(projections, newStyling(ctx,
config.overwrite("Styling"),
static,
@@ -63,13 +68,13 @@ func ProjectInstance(ctx context.Context) error {
func (config Config) overwrite(viewModel string) handler2.Config {
c := handler2.Config{
Client: config.Client,
Eventstore: config.Eventstore,
BulkLimit: uint16(config.BulkLimit),
RequeueEvery: 3 * time.Minute,
HandleActiveInstances: config.HandleActiveInstances,
MaxFailureCount: uint8(config.FailureCountUntilSkip),
TransactionDuration: config.TransactionDuration,
Client: config.Client,
Eventstore: config.Eventstore,
BulkLimit: uint16(config.BulkLimit),
RequeueEvery: 3 * time.Minute,
MaxFailureCount: uint8(config.FailureCountUntilSkip),
TransactionDuration: config.TransactionDuration,
ActiveInstancer: config.ActiveInstancer,
}
overwrite, ok := config.Handlers[viewModel]
if !ok {
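
The `ActiveInstancer` field added above is a small consumer-side interface, so the handler packages can ask for active instances without importing the query package. A hedged sketch of the pattern (simplified from the diff):

```go
// Satisfied by *query.Queries, which reports the keys of the
// activeInstances cache.
type ActiveInstancer interface {
	ActiveInstances() []string
}

type Config struct {
	ActiveInstancer ActiveInstancer
}

// A nil instancer yields no instances, which the handlers treat as
// "nothing to trigger on this tick".
func activeIDs(c Config) []string {
	if c.ActiveInstancer == nil {
		return nil
	}
	return c.ActiveInstancer.ActiveInstances()
}
```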

View File

@@ -6,6 +6,7 @@ import (
admin_handler "github.com/zitadel/zitadel/internal/admin/repository/eventsourcing/handler"
admin_view "github.com/zitadel/zitadel/internal/admin/repository/eventsourcing/view"
"github.com/zitadel/zitadel/internal/database"
"github.com/zitadel/zitadel/internal/query"
"github.com/zitadel/zitadel/internal/static"
)
@@ -13,7 +14,7 @@ type Config struct {
Spooler admin_handler.Config
}
func Start(ctx context.Context, conf Config, static static.Storage, dbClient *database.DB) error {
func Start(ctx context.Context, conf Config, static static.Storage, dbClient *database.DB, queries *query.Queries) error {
view, err := admin_view.StartView(dbClient)
if err != nil {
return err

View File

@@ -19,9 +19,12 @@ type Config struct {
BulkLimit uint64
FailureCountUntilSkip uint64
HandleActiveInstances time.Duration
TransactionDuration time.Duration
Handlers map[string]*ConfigOverwrites
ActiveInstancer interface {
ActiveInstances() []string
}
}
type ConfigOverwrites struct {
@@ -31,6 +34,9 @@ type ConfigOverwrites struct {
var projections []*handler.Handler
func Register(ctx context.Context, configs Config, view *view.View, queries *query2.Queries) {
// make sure the slice does not contain old values
projections = nil
projections = append(projections, newUser(ctx,
configs.overwrite("User"),
view,
@@ -77,13 +83,13 @@ func ProjectInstance(ctx context.Context) error {
func (config Config) overwrite(viewModel string) handler2.Config {
c := handler2.Config{
Client: config.Client,
Eventstore: config.Eventstore,
BulkLimit: uint16(config.BulkLimit),
RequeueEvery: 3 * time.Minute,
HandleActiveInstances: config.HandleActiveInstances,
MaxFailureCount: uint8(config.FailureCountUntilSkip),
TransactionDuration: config.TransactionDuration,
Client: config.Client,
Eventstore: config.Eventstore,
BulkLimit: uint16(config.BulkLimit),
RequeueEvery: 3 * time.Minute,
MaxFailureCount: uint8(config.FailureCountUntilSkip),
TransactionDuration: config.TransactionDuration,
ActiveInstancer: config.ActiveInstancer,
}
overwrite, ok := config.Handlers[viewModel]
if !ok {

View File

@@ -4,7 +4,6 @@ import (
"context"
"errors"
"sort"
"sync"
"time"
"github.com/jackc/pgx/v5/pgconn"
@@ -24,10 +23,6 @@ type Eventstore struct {
pusher Pusher
querier Querier
searcher Searcher
instances []string
lastInstanceQuery time.Time
instancesMu sync.Mutex
}
var (
@@ -68,8 +63,6 @@ func NewEventstore(config *Config) *Eventstore {
pusher: config.Pusher,
querier: config.Querier,
searcher: config.Searcher,
instancesMu: sync.Mutex{},
}
}
@@ -243,27 +236,10 @@ func (es *Eventstore) LatestSequence(ctx context.Context, queryFactory *SearchQu
return es.querier.LatestSequence(ctx, queryFactory)
}
// InstanceIDs returns the instance ids found by the search query
// forceDBCall forces to query the database, the instance ids are not cached
func (es *Eventstore) InstanceIDs(ctx context.Context, maxAge time.Duration, forceDBCall bool, queryFactory *SearchQueryBuilder) ([]string, error) {
es.instancesMu.Lock()
defer es.instancesMu.Unlock()
if !forceDBCall && time.Since(es.lastInstanceQuery) <= maxAge {
return es.instances, nil
}
instances, err := es.querier.InstanceIDs(ctx, queryFactory)
if err != nil {
return nil, err
}
if !forceDBCall {
es.instances = instances
es.lastInstanceQuery = time.Now()
}
return instances, nil
// InstanceIDs returns the distinct instance ids found by the search query
// Warning: this function can have high impact on performance, only use this function during setup
func (es *Eventstore) InstanceIDs(ctx context.Context, queryFactory *SearchQueryBuilder) ([]string, error) {
return es.querier.InstanceIDs(ctx, queryFactory)
}
func (es *Eventstore) Client() *database.DB {

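With the cache removed from the eventstore, `InstanceIDs` always hits the database; per the new doc comment it should only be used during setup. A sketch of such a call, with the query shape borrowed from the removed handler code (treat the exact builder chain as an assumption):

```go
// One-off discovery of all instance IDs, e.g. while running setup
// or migrations. This scans events and can be expensive.
query := eventstore.NewSearchQueryBuilder(eventstore.ColumnsInstanceIDs).
	AwaitOpenTransactions().
	AllowTimeTravel()

ids, err := es.InstanceIDs(ctx, query)
if err != nil {
	return err
}
for _, id := range ids {
	setupInstance(ctx, id) // hypothetical per-instance setup step
}
```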
View File

@@ -41,7 +41,6 @@ func NewFieldHandler(config *Config, name string, eventTypes map[eventstore.Aggr
bulkLimit: config.BulkLimit,
eventTypes: eventTypes,
requeueEvery: config.RequeueEvery,
handleActiveInstances: config.HandleActiveInstances,
now: time.Now,
maxFailureCount: config.MaxFailureCount,
retryFailedAfter: config.RetryFailedAfter,

View File

@@ -23,7 +23,7 @@ import (
)
type EventStore interface {
InstanceIDs(ctx context.Context, maxAge time.Duration, forceLoad bool, query *eventstore.SearchQueryBuilder) ([]string, error)
InstanceIDs(ctx context.Context, query *eventstore.SearchQueryBuilder) ([]string, error)
FilterToQueryReducer(ctx context.Context, reducer eventstore.QueryReducer) error
Filter(ctx context.Context, queryFactory *eventstore.SearchQueryBuilder) ([]eventstore.Event, error)
Push(ctx context.Context, cmds ...eventstore.Command) ([]eventstore.Event, error)
@@ -34,14 +34,17 @@ type Config struct {
Client *database.DB
Eventstore EventStore
BulkLimit uint16
RequeueEvery time.Duration
RetryFailedAfter time.Duration
HandleActiveInstances time.Duration
TransactionDuration time.Duration
MaxFailureCount uint8
BulkLimit uint16
RequeueEvery time.Duration
RetryFailedAfter time.Duration
TransactionDuration time.Duration
MaxFailureCount uint8
TriggerWithoutEvents Reduce
ActiveInstancer interface {
ActiveInstances() []string
}
}
type Handler struct {
@@ -52,17 +55,18 @@ type Handler struct {
bulkLimit uint16
eventTypes map[eventstore.AggregateType][]eventstore.EventType
maxFailureCount uint8
retryFailedAfter time.Duration
requeueEvery time.Duration
handleActiveInstances time.Duration
txDuration time.Duration
now nowFunc
maxFailureCount uint8
retryFailedAfter time.Duration
requeueEvery time.Duration
txDuration time.Duration
now nowFunc
triggeredInstancesSync sync.Map
triggerWithoutEvents Reduce
cacheInvalidations []func(ctx context.Context, aggregates []*eventstore.Aggregate)
queryInstances func() ([]string, error)
}
var _ migration.Migration = (*Handler)(nil)
@@ -162,13 +166,18 @@ func NewHandler(
bulkLimit: config.BulkLimit,
eventTypes: aggregates,
requeueEvery: config.RequeueEvery,
handleActiveInstances: config.HandleActiveInstances,
now: time.Now,
maxFailureCount: config.MaxFailureCount,
retryFailedAfter: config.RetryFailedAfter,
triggeredInstancesSync: sync.Map{},
triggerWithoutEvents: config.TriggerWithoutEvents,
txDuration: config.TransactionDuration,
queryInstances: func() ([]string, error) {
if config.ActiveInstancer != nil {
return config.ActiveInstancer.ActiveInstances(), nil
}
return nil, nil
},
}
return handler
@@ -239,7 +248,7 @@ func (h *Handler) schedule(ctx context.Context) {
t.Stop()
return
case <-t.C:
instances, err := h.queryInstances(ctx)
instances, err := h.queryInstances()
h.log().OnError(err).Debug("unable to query instances")
h.triggerInstances(call.WithTimestamp(ctx), instances)
@@ -356,19 +365,6 @@ func (*existingInstances) Reduce() error {
var _ eventstore.QueryReducer = (*existingInstances)(nil)
func (h *Handler) queryInstances(ctx context.Context) ([]string, error) {
if h.handleActiveInstances == 0 {
return h.existingInstances(ctx)
}
query := eventstore.NewSearchQueryBuilder(eventstore.ColumnsInstanceIDs).
AwaitOpenTransactions().
AllowTimeTravel().
CreationDateAfter(h.now().Add(-1 * h.handleActiveInstances))
return h.es.InstanceIDs(ctx, h.requeueEvery, false, query)
}
func (h *Handler) existingInstances(ctx context.Context) ([]string, error) {
ai := existingInstances{}
if err := h.es.FilterToQueryReducer(ctx, &ai); err != nil {

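Taken together, the handler change replaces a per-tick eventstore query with a closure over the configured instancer. A condensed sketch of the new control flow (field and method names from the diff, loop simplified):

```go
h.queryInstances = func() ([]string, error) {
	if config.ActiveInstancer != nil {
		// Served from the activeInstances cache; no event scan.
		return config.ActiveInstancer.ActiveInstances(), nil
	}
	return nil, nil
}

// In the schedule loop, on every RequeueEvery tick:
instances, err := h.queryInstances()
h.log().OnError(err).Debug("unable to query instances")
h.triggerInstances(call.WithTimestamp(ctx), instances)
```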
View File

@@ -7,12 +7,17 @@ type projection struct {
reducers []AggregateReducer
}
// Name implements Projection
// ActiveInstances implements [Projection]
func (p *projection) ActiveInstances() []string {
return nil
}
// Name implements [Projection]
func (p *projection) Name() string {
return p.name
}
// Reducers implements Projection
// Reducers implements [Projection]
func (p *projection) Reducers() []AggregateReducer {
return p.reducers
}

View File

@@ -46,6 +46,20 @@ func (m *MockQueries) EXPECT() *MockQueriesMockRecorder {
return m.recorder
}
// ActiveInstances mocks base method.
func (m *MockQueries) ActiveInstances() []string {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "ActiveInstances")
ret0, _ := ret[0].([]string)
return ret0
}
// ActiveInstances indicates an expected call of ActiveInstances.
func (mr *MockQueriesMockRecorder) ActiveInstances() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ActiveInstances", reflect.TypeOf((*MockQueries)(nil).ActiveInstances))
}
// ActiveLabelPolicyByOrg mocks base method.
func (m *MockQueries) ActiveLabelPolicyByOrg(ctx context.Context, orgID string, withOwnerRemoved bool) (*query.LabelPolicy, error) {
m.ctrl.T.Helper()

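A hedged example of how the generated expectation could be used in a worker test (controller setup and mock package name assumed):

```go
ctrl := gomock.NewController(t)
queries := mock.NewMockQueries(ctrl)

// The worker is expected to ask the query layer for active
// instances instead of scanning the eventstore.
queries.EXPECT().ActiveInstances().Return([]string{"230690539048009730"})
```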
View File

@@ -43,19 +43,18 @@ type NotificationWorker struct {
}
type WorkerConfig struct {
LegacyEnabled bool
Workers uint8
BulkLimit uint16
RequeueEvery time.Duration
RetryWorkers uint8
RetryRequeueEvery time.Duration
HandleActiveInstances time.Duration
TransactionDuration time.Duration
MaxAttempts uint8
MaxTtl time.Duration
MinRetryDelay time.Duration
MaxRetryDelay time.Duration
RetryDelayFactor float32
LegacyEnabled bool
Workers uint8
BulkLimit uint16
RequeueEvery time.Duration
RetryWorkers uint8
RetryRequeueEvery time.Duration
TransactionDuration time.Duration
MaxAttempts uint8
MaxTtl time.Duration
MinRetryDelay time.Duration
MaxRetryDelay time.Duration
RetryDelayFactor float32
}
// nowFunc makes [time.Now] mockable
@@ -312,29 +311,7 @@ func (w *NotificationWorker) log(workerID int, retry bool) *logging.Entry {
}
func (w *NotificationWorker) queryInstances(ctx context.Context, retry bool) ([]string, error) {
if w.config.HandleActiveInstances == 0 {
return w.existingInstances(ctx)
}
query := eventstore.NewSearchQueryBuilder(eventstore.ColumnsInstanceIDs).
AwaitOpenTransactions().
AllowTimeTravel().
CreationDateAfter(w.now().Add(-1 * w.config.HandleActiveInstances))
maxAge := w.config.RequeueEvery
if retry {
maxAge = w.config.RetryRequeueEvery
}
return w.es.InstanceIDs(ctx, maxAge, false, query)
}
func (w *NotificationWorker) existingInstances(ctx context.Context) ([]string, error) {
ai := existingInstances{}
if err := w.es.FilterToQueryReducer(ctx, &ai); err != nil {
return nil, err
}
return ai, nil
return w.queries.ActiveInstances(), nil
}
func (w *NotificationWorker) triggerInstances(ctx context.Context, instances []string, workerID int, retry bool) {

View File

@@ -877,16 +877,15 @@ func newNotificationWorker(t *testing.T, ctrl *gomock.Controller, queries *mock.
},
},
config: WorkerConfig{
Workers: 1,
BulkLimit: 10,
RequeueEvery: 2 * time.Second,
HandleActiveInstances: 0,
TransactionDuration: 5 * time.Second,
MaxAttempts: f.maxAttempts,
MaxTtl: 5 * time.Minute,
MinRetryDelay: 1 * time.Second,
MaxRetryDelay: 10 * time.Second,
RetryDelayFactor: 2,
Workers: 1,
BulkLimit: 10,
RequeueEvery: 2 * time.Second,
TransactionDuration: 5 * time.Second,
MaxAttempts: f.maxAttempts,
MaxTtl: 5 * time.Minute,
MinRetryDelay: 1 * time.Second,
MaxRetryDelay: 10 * time.Second,
RetryDelayFactor: 2,
},
now: f.now,
backOff: f.backOff,

View File

@@ -31,6 +31,8 @@ type Queries interface {
InstanceByID(ctx context.Context, id string) (instance authz.Instance, err error)
GetActiveSigningWebKey(ctx context.Context) (*jose.JSONWebKey, error)
ActivePrivateSigningKey(ctx context.Context, t time.Time) (keys *query.PrivateKeys, err error)
ActiveInstances() []string
}
type NotificationQueries struct {

View File

@@ -2,7 +2,9 @@ package query
import (
"context"
"time"
"github.com/hashicorp/golang-lru/v2/expirable"
"github.com/zitadel/logging"
"github.com/zitadel/zitadel/internal/cache"
@@ -13,9 +15,16 @@ import (
type Caches struct {
instance cache.Cache[instanceIndex, string, *authzInstance]
org cache.Cache[orgIndex, string, *Org]
activeInstances *expirable.LRU[string, bool]
}
func startCaches(background context.Context, connectors connector.Connectors) (_ *Caches, err error) {
type ActiveInstanceConfig struct {
MaxEntries int
TTL time.Duration
}
func startCaches(background context.Context, connectors connector.Connectors, instanceConfig ActiveInstanceConfig) (_ *Caches, err error) {
caches := new(Caches)
caches.instance, err = connector.StartCache[instanceIndex, string, *authzInstance](background, instanceIndexValues(), cache.PurposeAuthzInstance, connectors.Config.Instance, connectors)
if err != nil {
@@ -26,6 +35,8 @@ func startCaches(background context.Context, connectors connector.Connectors) (_
return nil, err
}
caches.activeInstances = expirable.NewLRU[string, bool](instanceConfig.MaxEntries, nil, instanceConfig.TTL)
caches.registerInstanceInvalidation()
caches.registerOrgInvalidation()
return caches, nil
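
Worth noting the TTL semantics this wiring relies on: `expirable.LRU` drops entries once they are older than the configured TTL, so `Keys()` naturally ages out instances that stop receiving requests. A small illustration with shortened durations:

```go
active := expirable.NewLRU[string, bool](10, nil, 50*time.Millisecond)
active.Add("idle-instance", true)

time.Sleep(100 * time.Millisecond)

// The entry has expired, so scheduled handlers will skip it.
fmt.Println(active.Keys()) // []
```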

View File

@@ -143,6 +143,10 @@ func (q *InstanceSearchQueries) toQuery(query sq.SelectBuilder) sq.SelectBuilder
return query
}
func (q *Queries) ActiveInstances() []string {
return q.caches.activeInstances.Keys()
}
func (q *Queries) SearchInstances(ctx context.Context, queries *InstanceSearchQueries) (instances *Instances, err error) {
ctx, span := tracing.NewSpan(ctx)
defer func() { span.EndWithError(err) }()
@@ -198,10 +202,13 @@ var (
)
func (q *Queries) InstanceByHost(ctx context.Context, instanceHost, publicHost string) (_ authz.Instance, err error) {
var instance *authzInstance
ctx, span := tracing.NewSpan(ctx)
defer func() {
if err != nil {
err = fmt.Errorf("unable to get instance by host: instanceHost %s, publicHost %s: %w", instanceHost, publicHost, err)
} else {
q.caches.activeInstances.Add(instance.ID, true)
}
span.EndWithError(err)
}()
@@ -225,6 +232,12 @@ func (q *Queries) InstanceByHost(ctx context.Context, instanceHost, publicHost s
func (q *Queries) InstanceByID(ctx context.Context, id string) (_ authz.Instance, err error) {
ctx, span := tracing.NewSpan(ctx)
defer func() { span.EndWithError(err) }()
defer func() {
if err != nil {
return
}
q.caches.activeInstances.Add(id, true)
}()
instance, ok := q.caches.instance.Get(ctx, instanceIndexByID, id)
if ok {

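Both lookup paths record the instance as active only when the lookup succeeds, by deferring a closure over the named `err` result. The idiom in isolation (helper names hypothetical):

```go
func (q *Queries) lookupInstance(id string) (instance string, err error) {
	defer func() {
		if err == nil {
			// Only instances that actually resolved count as active.
			q.caches.activeInstances.Add(id, true)
		}
	}()
	return fetch(id) // hypothetical backing lookup
}
```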
View File

@@ -517,6 +517,6 @@ func (o *Org) Keys(index orgIndex) []string {
}
func (c *Caches) registerOrgInvalidation() {
invalidate := cacheInvalidationFunc(c.instance, instanceIndexByID, getAggregateID)
invalidate := cacheInvalidationFunc(c.org, orgIndexByID, getAggregateID)
projection.OrgProjection.RegisterCacheInvalidation(invalidate)
}

View File

@@ -12,15 +12,18 @@ type Config struct {
BulkLimit uint64
Customizations map[string]CustomConfig
HandleActiveInstances time.Duration
MaxActiveInstances uint32
TransactionDuration time.Duration
ActiveInstancer interface {
ActiveInstances() []string
}
}
type CustomConfig struct {
RequeueEvery *time.Duration
RetryFailedAfter *time.Duration
MaxFailureCount *uint8
ConcurrentInstances *uint
BulkLimit *uint16
HandleActiveInstances *time.Duration
TransactionDuration *time.Duration
RequeueEvery *time.Duration
RetryFailedAfter *time.Duration
MaxFailureCount *uint8
ConcurrentInstances *uint
BulkLimit *uint16
TransactionDuration *time.Duration
}

View File

@@ -2,7 +2,6 @@ package projection
import (
"context"
"time"
"github.com/zitadel/zitadel/internal/eventstore"
"github.com/zitadel/zitadel/internal/eventstore/handler/v2"
@@ -28,7 +27,7 @@ func (m *mockEventStore) appendFilterResponse(events []eventstore.Event) *mockEv
return m
}
func (m *mockEventStore) InstanceIDs(ctx context.Context, _ time.Duration, _ bool, query *eventstore.SearchQueryBuilder) ([]string, error) {
func (m *mockEventStore) InstanceIDs(ctx context.Context, query *eventstore.SearchQueryBuilder) ([]string, error) {
m.instanceIDCounter++
return m.instanceIDsResponse[m.instanceIDCounter-1], nil
}

View File

@@ -99,14 +99,14 @@ var (
func Create(ctx context.Context, sqlClient *database.DB, es handler.EventStore, config Config, keyEncryptionAlgorithm crypto.EncryptionAlgorithm, certEncryptionAlgorithm crypto.EncryptionAlgorithm, systemUsers map[string]*internal_authz.SystemAPIUser) error {
projectionConfig = handler.Config{
Client: sqlClient,
Eventstore: es,
BulkLimit: uint16(config.BulkLimit),
RequeueEvery: config.RequeueEvery,
HandleActiveInstances: config.HandleActiveInstances,
MaxFailureCount: config.MaxFailureCount,
RetryFailedAfter: config.RetryFailedAfter,
TransactionDuration: config.TransactionDuration,
Client: sqlClient,
Eventstore: es,
BulkLimit: uint16(config.BulkLimit),
RequeueEvery: config.RequeueEvery,
MaxFailureCount: config.MaxFailureCount,
RetryFailedAfter: config.RetryFailedAfter,
TransactionDuration: config.TransactionDuration,
ActiveInstancer: config.ActiveInstancer,
}
OrgProjection = newOrgProjection(ctx, applyCustomConfig(projectionConfig, config.Customizations["orgs"]))
@@ -223,9 +223,6 @@ func applyCustomConfig(config handler.Config, customConfig CustomConfig) handler
if customConfig.RetryFailedAfter != nil {
config.RetryFailedAfter = *customConfig.RetryFailedAfter
}
if customConfig.HandleActiveInstances != nil {
config.HandleActiveInstances = *customConfig.HandleActiveInstances
}
if customConfig.TransactionDuration != nil {
config.TransactionDuration = *customConfig.TransactionDuration
}

View File

@@ -84,6 +84,7 @@ func StartQueries(
repo.checkPermission = permissionCheck(repo)
projections.ActiveInstancer = repo
err = projection.Create(ctx, projectionSqlClient, es, projections, keyEncryptionAlgorithm, certEncryptionAlgorithm, systemAPIUsers)
if err != nil {
return nil, err
@@ -91,7 +92,15 @@ func StartQueries(
if startProjections {
projection.Start(ctx)
}
repo.caches, err = startCaches(ctx, cacheConnectors)
repo.caches, err = startCaches(
ctx,
cacheConnectors,
ActiveInstanceConfig{
MaxEntries: int(projections.MaxActiveInstances),
TTL: projections.HandleActiveInstances,
},
)
if err != nil {
return nil, err
}