fix(projections): overhaul the event projection system (#10560)

This PR overhauls our event projection system to make it more robust and
prevent skipped events under high load. The core change replaces our
custom, transaction-based locking with standard PostgreSQL advisory
locks. We also introduce a worker pool to manage concurrency and prevent
database connection exhaustion.

### Key Changes

* **Advisory Locks for Projections:** Replaces exclusive row locks and
inspection of `pg_stat_activity` with PostgreSQL advisory locks for
managing projection state. This is a more reliable and standard approach
to distributed locking.
* **Simplified Await Logic:** Removes the complex logic for awaiting
open transactions, simplifying it to a more straightforward time-based
filtering of events.
* **Projection Worker Pool:** Implements a worker pool to limit
concurrent projection triggers, preventing connection exhaustion and
improving stability under load. A new `MaxParallelTriggers`
configuration option is introduced.

### Problem Solved

Under high throughput, a race condition could cause projections to miss
events from the eventstore. This led to inconsistent data in projection
tables (e.g., a user grant might be missing). This PR fixes the
underlying locking and concurrency issues to ensure all events are
processed reliably.

### How it Works

1. **Event Writing:** When writing events, a *shared* advisory lock is
taken. This signals that a write is in progress.
2.  **Event Handling (Projections):**
* A projection worker attempts to acquire an *exclusive* advisory lock
for that specific projection. If the lock is already held, it means
another worker is on the job, so the current one backs off.
* Once the lock is acquired, the worker briefly acquires and releases
the same *shared* lock used by event writers. This acts as a barrier,
ensuring it waits for any in-flight writes to complete.
* Finally, it processes all events that occurred before its transaction
began.

### Additional Information

* ZITADEL no longer modifies the `application_name` PostgreSQL variable
during event writes.
*   The lock on the `current_states` table is now `FOR NO KEY UPDATE`.
*   Fixes https://github.com/zitadel/zitadel/issues/8509

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Tim Möhlmann <tim+github@zitadel.com>
(cherry picked from commit 0575f67e94)
This commit is contained in:
Silvan
2025-09-03 17:29:00 +02:00
committed by Livio Spring
parent c3fdb991d8
commit 19d1ab9c94
25 changed files with 286 additions and 410 deletions

View File

@@ -17,8 +17,8 @@ import (
// awaitOpenTransactions ensures event ordering, so we don't events younger that open transactions
var (
awaitOpenTransactionsV1 = ` AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(?) AND state <> 'idle')`
awaitOpenTransactionsV2 = ` AND "position" < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(?) AND state <> 'idle')`
awaitOpenTransactionsV1 = ` AND created_at <= now()`
awaitOpenTransactionsV2 = ` AND "position" <= EXTRACT(EPOCH FROM now())`
)
func awaitOpenTransactions(useV1 bool) string {

View File

@@ -12,6 +12,7 @@ import (
"github.com/shopspring/decimal"
"github.com/zitadel/logging"
"github.com/zitadel/zitadel/internal/api/authz"
"github.com/zitadel/zitadel/internal/database"
"github.com/zitadel/zitadel/internal/database/dialect"
"github.com/zitadel/zitadel/internal/eventstore"
@@ -65,6 +66,32 @@ func query(ctx context.Context, criteria querier, searchQuery *eventstore.Search
if where == "" || query == "" {
return zerrors.ThrowInvalidArgument(nil, "SQL-rWeBw", "invalid query factory")
}
var contextQuerier interface {
QueryContext(context.Context, func(rows *sql.Rows) error, string, ...interface{}) error
ExecContext(context.Context, string, ...any) (sql.Result, error)
}
contextQuerier = criteria.Client()
if q.Tx != nil {
contextQuerier = &tx{Tx: q.Tx}
}
if q.AwaitOpenTransactions && q.Columns == eventstore.ColumnsEvent {
instanceID := authz.GetInstance(ctx).InstanceID()
if q.InstanceID != nil {
instanceID = q.InstanceID.Value.(string)
}
_, err = contextQuerier.ExecContext(ctx,
"select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))",
instanceID,
)
if err != nil {
return err
}
where += awaitOpenTransactions(useV1)
}
query += where
// instead of using the max function of the database (which doesn't work for postgres)
@@ -100,28 +127,8 @@ func query(ctx context.Context, criteria querier, searchQuery *eventstore.Search
query += " OFFSET ?"
}
if q.LockRows {
query += " FOR UPDATE"
switch q.LockOption {
case eventstore.LockOptionWait: // default behavior
case eventstore.LockOptionNoWait:
query += " NOWAIT"
case eventstore.LockOptionSkipLocked:
query += " SKIP LOCKED"
}
}
query = criteria.placeholder(query)
var contextQuerier interface {
QueryContext(context.Context, func(rows *sql.Rows) error, string, ...interface{}) error
}
contextQuerier = criteria.Client()
if q.Tx != nil {
contextQuerier = &tx{Tx: q.Tx}
}
err = contextQuerier.QueryContext(ctx,
func(rows *sql.Rows) error {
for rows.Next() {
@@ -289,22 +296,6 @@ func prepareConditions(criteria querier, query *repository.SearchQuery, useV1 bo
args = append(args, excludeAggregateIDsArgs...)
}
if query.AwaitOpenTransactions {
instanceIDs := make(database.TextArray[string], 0, 3)
if query.InstanceID != nil {
instanceIDs = append(instanceIDs, query.InstanceID.Value.(string))
} else if query.InstanceIDs != nil {
instanceIDs = append(instanceIDs, query.InstanceIDs.Value.(database.TextArray[string])...)
}
for i := range instanceIDs {
instanceIDs[i] = "zitadel_es_pusher_" + instanceIDs[i]
}
clauses += awaitOpenTransactions(useV1)
args = append(args, instanceIDs)
}
if clauses == "" {
return "", nil
}

View File

@@ -405,8 +405,8 @@ func Test_prepareCondition(t *testing.T) {
useV1: true,
},
res: res{
clause: " WHERE aggregate_type = ANY(?) AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(?) AND state <> 'idle')",
values: []interface{}{[]eventstore.AggregateType{"user", "org"}, database.TextArray[string]{}},
clause: " WHERE aggregate_type = ANY(?)",
values: []interface{}{[]eventstore.AggregateType{"user", "org"}},
},
},
{
@@ -422,8 +422,8 @@ func Test_prepareCondition(t *testing.T) {
},
},
res: res{
clause: ` WHERE aggregate_type = ANY(?) AND "position" < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(?) AND state <> 'idle')`,
values: []interface{}{[]eventstore.AggregateType{"user", "org"}, database.TextArray[string]{}},
clause: ` WHERE aggregate_type = ANY(?)`,
values: []interface{}{[]eventstore.AggregateType{"user", "org"}},
},
},
{
@@ -442,8 +442,8 @@ func Test_prepareCondition(t *testing.T) {
useV1: true,
},
res: res{
clause: " WHERE aggregate_type = ANY(?) AND aggregate_id = ? AND event_type = ANY(?) AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(?) AND state <> 'idle')",
values: []interface{}{[]eventstore.AggregateType{"user", "org"}, "1234", []eventstore.EventType{"user.created", "org.created"}, database.TextArray[string]{}},
clause: " WHERE aggregate_type = ANY(?) AND aggregate_id = ? AND event_type = ANY(?)",
values: []interface{}{[]eventstore.AggregateType{"user", "org"}, "1234", []eventstore.EventType{"user.created", "org.created"}},
},
},
{
@@ -461,8 +461,8 @@ func Test_prepareCondition(t *testing.T) {
},
},
res: res{
clause: ` WHERE aggregate_type = ANY(?) AND aggregate_id = ? AND event_type = ANY(?) AND "position" < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(?) AND state <> 'idle')`,
values: []interface{}{[]eventstore.AggregateType{"user", "org"}, "1234", []eventstore.EventType{"user.created", "org.created"}, database.TextArray[string]{}},
clause: ` WHERE aggregate_type = ANY(?) AND aggregate_id = ? AND event_type = ANY(?)`,
values: []interface{}{[]eventstore.AggregateType{"user", "org"}, "1234", []eventstore.EventType{"user.created", "org.created"}},
},
},
}
@@ -693,10 +693,14 @@ func Test_query_events_mocked(t *testing.T) {
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY($2) AND state <> 'idle') ORDER BY event_sequence DESC`),
[]driver.Value{eventstore.AggregateType("user"), database.TextArray[string]{}},
),
mock: newMockClient(t).
expectExec(regexp.QuoteMeta(
`select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))`),
[]driver.Value{""}).
expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND created_at <= now() ORDER BY event_sequence DESC`),
[]driver.Value{eventstore.AggregateType("user")},
),
},
res: res{
wantErr: false,
@@ -716,10 +720,14 @@ func Test_query_events_mocked(t *testing.T) {
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY($2) AND state <> 'idle') ORDER BY event_sequence LIMIT $3`),
[]driver.Value{eventstore.AggregateType("user"), database.TextArray[string]{}, uint64(5)},
),
mock: newMockClient(t).
expectExec(regexp.QuoteMeta(
`select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))`),
[]driver.Value{""}).
expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND created_at <= now() ORDER BY event_sequence LIMIT $2`),
[]driver.Value{eventstore.AggregateType("user"), uint64(5)},
),
},
res: res{
wantErr: false,
@@ -739,76 +747,14 @@ func Test_query_events_mocked(t *testing.T) {
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY($2) AND state <> 'idle') ORDER BY event_sequence DESC LIMIT $3`),
[]driver.Value{eventstore.AggregateType("user"), database.TextArray[string]{}, uint64(5)},
),
},
res: res{
wantErr: false,
},
},
{
name: "lock, wait",
args: args{
dest: &[]*repository.Event{},
query: eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
OrderDesc().
Limit(5).
AddQuery().
AggregateTypes("user").
Builder().LockRowsDuringTx(nil, eventstore.LockOptionWait),
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 ORDER BY event_sequence DESC LIMIT $2 FOR UPDATE`),
[]driver.Value{eventstore.AggregateType("user"), uint64(5)},
),
},
res: res{
wantErr: false,
},
},
{
name: "lock, no wait",
args: args{
dest: &[]*repository.Event{},
query: eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
OrderDesc().
Limit(5).
AddQuery().
AggregateTypes("user").
Builder().LockRowsDuringTx(nil, eventstore.LockOptionNoWait),
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 ORDER BY event_sequence DESC LIMIT $2 FOR UPDATE NOWAIT`),
[]driver.Value{eventstore.AggregateType("user"), uint64(5)},
),
},
res: res{
wantErr: false,
},
},
{
name: "lock, skip locked",
args: args{
dest: &[]*repository.Event{},
query: eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
OrderDesc().
Limit(5).
AddQuery().
AggregateTypes("user").
Builder().LockRowsDuringTx(nil, eventstore.LockOptionSkipLocked),
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 ORDER BY event_sequence DESC LIMIT $2 FOR UPDATE SKIP LOCKED`),
[]driver.Value{eventstore.AggregateType("user"), uint64(5)},
),
mock: newMockClient(t).
expectExec(regexp.QuoteMeta(
`select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))`),
[]driver.Value{""}).
expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND created_at <= now() ORDER BY event_sequence DESC LIMIT $2`),
[]driver.Value{eventstore.AggregateType("user"), uint64(5)},
),
},
res: res{
wantErr: false,
@@ -828,10 +774,14 @@ func Test_query_events_mocked(t *testing.T) {
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQueryErr(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY($2) AND state <> 'idle') ORDER BY event_sequence DESC`),
[]driver.Value{eventstore.AggregateType("user"), database.TextArray[string]{}},
sql.ErrConnDone),
mock: newMockClient(t).
expectExec(regexp.QuoteMeta(
`select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))`),
[]driver.Value{""}).
expectQueryErr(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND created_at <= now() ORDER BY event_sequence DESC`),
[]driver.Value{eventstore.AggregateType("user")},
sql.ErrConnDone),
},
res: res{
wantErr: true,
@@ -851,10 +801,14 @@ func Test_query_events_mocked(t *testing.T) {
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQueryScanErr(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY($2) AND state <> 'idle') ORDER BY event_sequence DESC`),
[]driver.Value{eventstore.AggregateType("user"), database.TextArray[string]{}},
&repository.Event{Seq: 100}),
mock: newMockClient(t).
expectExec(regexp.QuoteMeta(
`select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))`),
[]driver.Value{""}).
expectQueryScanErr(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE aggregate_type = $1 AND created_at <= now() ORDER BY event_sequence DESC`),
[]driver.Value{eventstore.AggregateType("user")},
&repository.Event{Seq: 100}),
},
res: res{
wantErr: true,
@@ -886,10 +840,14 @@ func Test_query_events_mocked(t *testing.T) {
useV1: true,
},
fields: fields{
mock: newMockClient(t).expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE (aggregate_type = $1 OR (aggregate_type = $2 AND aggregate_id = $3)) AND EXTRACT(EPOCH FROM created_at) < (SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY($4) AND state <> 'idle') ORDER BY event_sequence DESC LIMIT $5`),
[]driver.Value{eventstore.AggregateType("user"), eventstore.AggregateType("org"), "asdf42", database.TextArray[string]{}, uint64(5)},
),
mock: newMockClient(t).
expectExec(regexp.QuoteMeta(
`select pg_advisory_lock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1)), pg_advisory_unlock('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))`),
[]driver.Value{""}).
expectQuery(
regexp.QuoteMeta(`SELECT creation_date, event_type, event_sequence, event_data, editor_user, resource_owner, instance_id, aggregate_type, aggregate_id, aggregate_version FROM eventstore.events WHERE (aggregate_type = $1 OR (aggregate_type = $2 AND aggregate_id = $3)) AND created_at <= now() ORDER BY event_sequence DESC LIMIT $4`),
[]driver.Value{eventstore.AggregateType("user"), eventstore.AggregateType("org"), "asdf42", uint64(5)},
),
},
res: res{
wantErr: false,
@@ -1040,6 +998,11 @@ func (m *dbMock) expectQueryErr(expectedQuery string, args []driver.Value, err e
return m
}
func (m *dbMock) expectExec(expectedQuery string, args []driver.Value) *dbMock {
m.mock.ExpectExec(expectedQuery).WithArgs(args...).WillReturnResult(sqlmock.NewResult(1, 1))
return m
}
func newMockClient(t *testing.T) *dbMock {
t.Helper()
db, mock, err := sqlmock.New(sqlmock.ValueConverterOption(new(db_mock.TypeConverter)))