mirror of
https://github.com/zitadel/zitadel.git
synced 2025-12-06 16:02:15 +00:00
fix(projections): overhaul the event projection system (#10560)
This PR overhauls our event projection system to make it more robust and
prevent skipped events under high load. The core change replaces our
custom, transaction-based locking with standard PostgreSQL advisory
locks. We also introduce a worker pool to manage concurrency and prevent
database connection exhaustion.
### Key Changes
* **Advisory Locks for Projections:** Replaces exclusive row locks and
inspection of `pg_stat_activity` with PostgreSQL advisory locks for
managing projection state. This is a more reliable and standard approach
to distributed locking.
* **Simplified Await Logic:** Removes the complex logic for awaiting
open transactions, simplifying it to a more straightforward time-based
filtering of events.
* **Projection Worker Pool:** Implements a worker pool to limit
concurrent projection triggers, preventing connection exhaustion and
improving stability under load. A new `MaxParallelTriggers`
configuration option is introduced.
### Problem Solved
Under high throughput, a race condition could cause projections to miss
events from the eventstore. This led to inconsistent data in projection
tables (e.g., a user grant might be missing). This PR fixes the
underlying locking and concurrency issues to ensure all events are
processed reliably.
### How it Works
1. **Event Writing:** When writing events, a *shared* advisory lock is
taken. This signals that a write is in progress.
2. **Event Handling (Projections):**
* A projection worker attempts to acquire an *exclusive* advisory lock
for that specific projection. If the lock is already held, it means
another worker is on the job, so the current one backs off.
* Once the lock is acquired, the worker briefly acquires and releases
the same *shared* lock used by event writers. This acts as a barrier,
ensuring it waits for any in-flight writes to complete.
* Finally, it processes all events that occurred before its transaction
began.
### Additional Information
* ZITADEL no longer modifies the `application_name` PostgreSQL variable
during event writes.
* The lock on the `current_states` table is now `FOR NO KEY UPDATE`.
* Fixes https://github.com/zitadel/zitadel/issues/8509
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Tim Möhlmann <tim+github@zitadel.com>
(cherry picked from commit 0575f67e94)
This commit is contained in:
@@ -12,6 +12,7 @@ import (
|
||||
"github.com/shopspring/decimal"
|
||||
"github.com/zitadel/logging"
|
||||
|
||||
"github.com/zitadel/zitadel/internal/api/authz"
|
||||
"github.com/zitadel/zitadel/internal/database"
|
||||
"github.com/zitadel/zitadel/internal/database/dialect"
|
||||
"github.com/zitadel/zitadel/internal/eventstore"
|
||||
@@ -65,6 +66,32 @@ func query(ctx context.Context, criteria querier, searchQuery *eventstore.Search
|
||||
if where == "" || query == "" {
|
||||
return zerrors.ThrowInvalidArgument(nil, "SQL-rWeBw", "invalid query factory")
|
||||
}
|
||||
|
||||
var contextQuerier interface {
|
||||
QueryContext(context.Context, func(rows *sql.Rows) error, string, ...interface{}) error
|
||||
ExecContext(context.Context, string, ...any) (sql.Result, error)
|
||||
}
|
||||
contextQuerier = criteria.Client()
|
||||
if q.Tx != nil {
|
||||
contextQuerier = &tx{Tx: q.Tx}
|
||||
}
|
||||
|
||||
if q.AwaitOpenTransactions && q.Columns == eventstore.ColumnsEvent {
|
||||
instanceID := authz.GetInstance(ctx).InstanceID()
|
||||
if q.InstanceID != nil {
|
||||
instanceID = q.InstanceID.Value.(string)
|
||||
}
|
||||
|
||||
_, err = contextQuerier.ExecContext(ctx,
|
||||
"select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))",
|
||||
instanceID,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
where += awaitOpenTransactions(useV1)
|
||||
}
|
||||
query += where
|
||||
|
||||
// instead of using the max function of the database (which doesn't work for postgres)
|
||||
@@ -100,28 +127,8 @@ func query(ctx context.Context, criteria querier, searchQuery *eventstore.Search
|
||||
query += " OFFSET ?"
|
||||
}
|
||||
|
||||
if q.LockRows {
|
||||
query += " FOR UPDATE"
|
||||
switch q.LockOption {
|
||||
case eventstore.LockOptionWait: // default behavior
|
||||
case eventstore.LockOptionNoWait:
|
||||
query += " NOWAIT"
|
||||
case eventstore.LockOptionSkipLocked:
|
||||
query += " SKIP LOCKED"
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
query = criteria.placeholder(query)
|
||||
|
||||
var contextQuerier interface {
|
||||
QueryContext(context.Context, func(rows *sql.Rows) error, string, ...interface{}) error
|
||||
}
|
||||
contextQuerier = criteria.Client()
|
||||
if q.Tx != nil {
|
||||
contextQuerier = &tx{Tx: q.Tx}
|
||||
}
|
||||
|
||||
err = contextQuerier.QueryContext(ctx,
|
||||
func(rows *sql.Rows) error {
|
||||
for rows.Next() {
|
||||
@@ -289,22 +296,6 @@ func prepareConditions(criteria querier, query *repository.SearchQuery, useV1 bo
|
||||
args = append(args, excludeAggregateIDsArgs...)
|
||||
}
|
||||
|
||||
if query.AwaitOpenTransactions {
|
||||
instanceIDs := make(database.TextArray[string], 0, 3)
|
||||
if query.InstanceID != nil {
|
||||
instanceIDs = append(instanceIDs, query.InstanceID.Value.(string))
|
||||
} else if query.InstanceIDs != nil {
|
||||
instanceIDs = append(instanceIDs, query.InstanceIDs.Value.(database.TextArray[string])...)
|
||||
}
|
||||
|
||||
for i := range instanceIDs {
|
||||
instanceIDs[i] = "zitadel_es_pusher_" + instanceIDs[i]
|
||||
}
|
||||
|
||||
clauses += awaitOpenTransactions(useV1)
|
||||
args = append(args, instanceIDs)
|
||||
}
|
||||
|
||||
if clauses == "" {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user