mirror of
https://github.com/zitadel/zitadel.git
synced 2025-12-06 14:52:35 +00:00
fix(projections): overhaul the event projection system (#10560)
This PR overhauls our event projection system to make it more robust and
prevent skipped events under high load. The core change replaces our
custom, transaction-based locking with standard PostgreSQL advisory
locks. We also introduce a worker pool to manage concurrency and prevent
database connection exhaustion.
### Key Changes
* **Advisory Locks for Projections:** Replaces exclusive row locks and
inspection of `pg_stat_activity` with PostgreSQL advisory locks for
managing projection state. This is a more reliable and standard approach
to distributed locking.
* **Simplified Await Logic:** Removes the complex logic for awaiting
open transactions, simplifying it to a more straightforward time-based
filtering of events.
* **Projection Worker Pool:** Implements a worker pool to limit
concurrent projection triggers, preventing connection exhaustion and
improving stability under load. A new `MaxParallelTriggers`
configuration option is introduced.
### Problem Solved
Under high throughput, a race condition could cause projections to miss
events from the eventstore. This led to inconsistent data in projection
tables (e.g., a user grant might be missing). This PR fixes the
underlying locking and concurrency issues to ensure all events are
processed reliably.
### How it Works
1. **Event Writing:** When writing events, a *shared* advisory lock is
taken. This signals that a write is in progress.
2. **Event Handling (Projections):**
* A projection worker attempts to acquire an *exclusive* advisory lock
for that specific projection. If the lock is already held, it means
another worker is on the job, so the current one backs off.
* Once the lock is acquired, the worker briefly acquires and releases
the same *shared* lock used by event writers. This acts as a barrier,
ensuring it waits for any in-flight writes to complete.
* Finally, it processes all events that occurred before its transaction
began.
### Additional Information
* ZITADEL no longer modifies the `application_name` PostgreSQL variable
during event writes.
* The lock on the `current_states` table is now `FOR NO KEY UPDATE`.
* Fixes https://github.com/zitadel/zitadel/issues/8509
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Tim Möhlmann <tim+github@zitadel.com>
(cherry picked from commit 0575f67e94)
This commit is contained in:
@@ -19,123 +19,6 @@ import (
|
||||
"github.com/zitadel/zitadel/internal/zerrors"
|
||||
)
|
||||
|
||||
func TestHandler_lockState(t *testing.T) {
|
||||
type fields struct {
|
||||
projection Projection
|
||||
mock *mock.SQLMock
|
||||
}
|
||||
type args struct {
|
||||
instanceID string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
fields fields
|
||||
args args
|
||||
isErr func(t *testing.T, err error)
|
||||
}{
|
||||
{
|
||||
name: "tx closed",
|
||||
fields: fields{
|
||||
projection: &projection{
|
||||
name: "projection",
|
||||
},
|
||||
mock: mock.NewSQLMock(t,
|
||||
mock.ExpectBegin(nil),
|
||||
mock.ExcpectExec(
|
||||
lockStateStmt,
|
||||
mock.WithExecArgs(
|
||||
"projection",
|
||||
"instance",
|
||||
),
|
||||
mock.WithExecErr(sql.ErrTxDone),
|
||||
),
|
||||
),
|
||||
},
|
||||
args: args{
|
||||
instanceID: "instance",
|
||||
},
|
||||
isErr: func(t *testing.T, err error) {
|
||||
if !errors.Is(err, sql.ErrTxDone) {
|
||||
t.Errorf("unexpected error, want: %v got: %v", sql.ErrTxDone, err)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "no rows affeced",
|
||||
fields: fields{
|
||||
projection: &projection{
|
||||
name: "projection",
|
||||
},
|
||||
mock: mock.NewSQLMock(t,
|
||||
mock.ExpectBegin(nil),
|
||||
mock.ExcpectExec(
|
||||
lockStateStmt,
|
||||
mock.WithExecArgs(
|
||||
"projection",
|
||||
"instance",
|
||||
),
|
||||
mock.WithExecNoRowsAffected(),
|
||||
),
|
||||
),
|
||||
},
|
||||
args: args{
|
||||
instanceID: "instance",
|
||||
},
|
||||
isErr: func(t *testing.T, err error) {
|
||||
if !errors.Is(err, zerrors.ThrowInternal(nil, "V2-lpiK0", "")) {
|
||||
t.Errorf("unexpected error: want internal (V2lpiK0), got: %v", err)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "rows affected",
|
||||
fields: fields{
|
||||
projection: &projection{
|
||||
name: "projection",
|
||||
},
|
||||
mock: mock.NewSQLMock(t,
|
||||
mock.ExpectBegin(nil),
|
||||
mock.ExcpectExec(
|
||||
lockStateStmt,
|
||||
mock.WithExecArgs(
|
||||
"projection",
|
||||
"instance",
|
||||
),
|
||||
mock.WithExecRowsAffected(1),
|
||||
),
|
||||
),
|
||||
},
|
||||
args: args{
|
||||
instanceID: "instance",
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if tt.isErr == nil {
|
||||
tt.isErr = func(t *testing.T, err error) {
|
||||
if err != nil {
|
||||
t.Error("expected no error got:", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
h := &Handler{
|
||||
projection: tt.fields.projection,
|
||||
}
|
||||
|
||||
tx, err := tt.fields.mock.DB.BeginTx(context.Background(), nil)
|
||||
if err != nil {
|
||||
t.Fatalf("unable to begin transaction: %v", err)
|
||||
}
|
||||
|
||||
err = h.lockState(tx, tt.args.instanceID)
|
||||
tt.isErr(t, err)
|
||||
|
||||
tt.fields.mock.Assert(t)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandler_updateLastUpdated(t *testing.T) {
|
||||
type fields struct {
|
||||
projection Projection
|
||||
@@ -309,41 +192,6 @@ func TestHandler_currentState(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "no row but lock err",
|
||||
fields: fields{
|
||||
projection: &projection{
|
||||
name: "projection",
|
||||
},
|
||||
mock: mock.NewSQLMock(t,
|
||||
mock.ExpectBegin(nil),
|
||||
mock.ExpectQuery(currentStateStmt,
|
||||
mock.WithQueryArgs(
|
||||
"instance",
|
||||
"projection",
|
||||
),
|
||||
mock.WithQueryErr(sql.ErrNoRows),
|
||||
),
|
||||
mock.ExcpectExec(lockStateStmt,
|
||||
mock.WithExecArgs(
|
||||
"projection",
|
||||
"instance",
|
||||
),
|
||||
mock.WithExecErr(sql.ErrTxDone),
|
||||
),
|
||||
),
|
||||
},
|
||||
args: args{
|
||||
ctx: authz.WithInstanceID(context.Background(), "instance"),
|
||||
},
|
||||
want: want{
|
||||
isErr: func(t *testing.T, err error) {
|
||||
if !errors.Is(err, sql.ErrTxDone) {
|
||||
t.Errorf("unexpected error, want: %v, got: %v", sql.ErrTxDone, err)
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "state locked",
|
||||
fields: fields{
|
||||
@@ -440,7 +288,7 @@ func TestHandler_currentState(t *testing.T) {
|
||||
t.Fatalf("unable to begin transaction: %v", err)
|
||||
}
|
||||
|
||||
gotCurrentState, err := h.currentState(tt.args.ctx, tx, new(triggerConfig))
|
||||
gotCurrentState, err := h.currentState(tt.args.ctx, tx)
|
||||
|
||||
tt.want.isErr(t, err)
|
||||
if !reflect.DeepEqual(gotCurrentState, tt.want.currentState) {
|
||||
|
||||
Reference in New Issue
Block a user