2022-02-09 15:01:19 +01:00
package setup
import (
2022-03-23 09:02:39 +01:00
"context"
2023-03-01 01:11:23 +01:00
"embed"
2022-02-09 15:01:19 +01:00
_ "embed"
2024-01-25 17:28:20 +01:00
"net/http"
perf(oidc): nest position clause for session terminated query (#8738)
# Which Problems Are Solved
Optimize the query that checks for terminated sessions in the access
token verifier. The verifier is used in auth middleware, userinfo and
introspection.
# How the Problems Are Solved
The previous implementation built a query for certain events and then
appended a single `PositionAfter` clause. This caused the PostgreSQL
planner to use indexes only for the instance ID, aggregate IDs,
aggregate types and event types, followed by an expensive sequential
scan for the position. This resulted in internal over-fetching of rows
before the final filter was applied.
![Screenshot_20241007_105803](https://github.com/user-attachments/assets/f2d91976-be87-428b-b604-a211399b821c)
Furthermore, the query was searching for events which are not always
applicable. For example, there was always a session ID search and if
there was a user ID, we would also search for a browser fingerprint in
the event payload (expensive), even if those argument strings were empty.
This PR changes:
1. Nest the position query, so that a full `instance_id, aggregate_id,
aggregate_type, event_type, "position"` index can be matched.
2. Redefine the `es_wm` index to include the `position` column.
3. Only search for events for the IDs that actually have a value. Do not
search (noop) if none of session ID, user ID or fingerprint ID are set.
New query plan:
![Screenshot_20241007_110648](https://github.com/user-attachments/assets/c3234c33-1b76-4b33-a4a9-796f69f3d775)
# Additional Changes
- cleanup how we load multi-statement migrations and make that a bit
more reusable.
# Additional Context
- Related to https://github.com/zitadel/zitadel/issues/7639
2024-10-07 15:49:55 +03:00
"path/filepath"
2022-02-09 15:01:19 +01:00
"github.com/spf13/cobra"
2022-03-23 09:02:39 +01:00
"github.com/spf13/viper"
2022-04-27 01:01:45 +02:00
"github.com/zitadel/logging"
2022-03-23 09:02:39 +01:00
2022-11-04 10:21:58 +01:00
"github.com/zitadel/zitadel/cmd/build"
2024-01-25 17:28:20 +01:00
"github.com/zitadel/zitadel/cmd/encryption"
2022-06-27 12:32:34 +02:00
"github.com/zitadel/zitadel/cmd/key"
"github.com/zitadel/zitadel/cmd/tls"
2024-01-25 17:28:20 +01:00
admin_handler "github.com/zitadel/zitadel/internal/admin/repository/eventsourcing/handler"
admin_view "github.com/zitadel/zitadel/internal/admin/repository/eventsourcing/view"
internal_authz "github.com/zitadel/zitadel/internal/api/authz"
auth_handler "github.com/zitadel/zitadel/internal/auth/repository/eventsourcing/handler"
auth_view "github.com/zitadel/zitadel/internal/auth/repository/eventsourcing/view"
"github.com/zitadel/zitadel/internal/authz"
authz_es "github.com/zitadel/zitadel/internal/authz/repository/eventsourcing/eventstore"
2024-11-04 11:44:51 +01:00
"github.com/zitadel/zitadel/internal/cache/connector"
2024-01-25 17:28:20 +01:00
"github.com/zitadel/zitadel/internal/command"
cryptoDB "github.com/zitadel/zitadel/internal/crypto/database"
2022-04-27 01:01:45 +02:00
"github.com/zitadel/zitadel/internal/database"
2023-12-20 18:13:04 +02:00
"github.com/zitadel/zitadel/internal/database/dialect"
2024-01-25 17:28:20 +01:00
"github.com/zitadel/zitadel/internal/domain"
2022-04-27 01:01:45 +02:00
"github.com/zitadel/zitadel/internal/eventstore"
2023-10-19 12:19:10 +02:00
old_es "github.com/zitadel/zitadel/internal/eventstore/repository/sql"
new_es "github.com/zitadel/zitadel/internal/eventstore/v3"
2023-12-05 12:12:01 +01:00
"github.com/zitadel/zitadel/internal/i18n"
2022-04-27 01:01:45 +02:00
"github.com/zitadel/zitadel/internal/migration"
2024-01-25 17:28:20 +01:00
notify_handler "github.com/zitadel/zitadel/internal/notification"
"github.com/zitadel/zitadel/internal/query"
2022-11-04 10:21:58 +01:00
"github.com/zitadel/zitadel/internal/query/projection"
2024-05-30 11:35:30 +02:00
es_v4 "github.com/zitadel/zitadel/internal/v2/eventstore"
es_v4_pg "github.com/zitadel/zitadel/internal/v2/eventstore/postgres"
2024-01-25 17:28:20 +01:00
"github.com/zitadel/zitadel/internal/webauthn"
2022-03-23 09:02:39 +01:00
)
var (
//go:embed steps.yaml
defaultSteps [ ] byte
2022-04-25 17:05:20 +02:00
stepFiles [ ] string
2022-02-09 15:01:19 +01:00
)
func New ( ) * cobra . Command {
2022-04-25 17:05:20 +02:00
cmd := & cobra . Command {
2022-02-09 15:01:19 +01:00
Use : "setup" ,
Short : "setup ZITADEL instance" ,
Long : ` sets up data to start ZITADEL .
Requirements :
- cockroachdb ` ,
2022-03-23 09:02:39 +01:00
Run : func ( cmd * cobra . Command , args [ ] string ) {
2022-06-24 14:38:22 +02:00
err := tls . ModeFromFlag ( cmd )
logging . OnError ( err ) . Fatal ( "invalid tlsMode" )
2024-01-25 17:28:20 +01:00
err = BindInitProjections ( cmd )
logging . OnError ( err ) . Fatal ( "unable to bind \"init-projections\" flag" )
2024-05-30 11:35:30 +02:00
err = bindForMirror ( cmd )
logging . OnError ( err ) . Fatal ( "unable to bind \"for-mirror\" flag" )
2022-03-28 10:05:09 +02:00
config := MustNewConfig ( viper . GetViper ( ) )
steps := MustNewSteps ( viper . New ( ) )
2022-03-23 09:02:39 +01:00
2022-04-12 16:20:17 +02:00
masterKey , err := key . MasterKey ( cmd )
logging . OnError ( err ) . Panic ( "No master key provided" )
2024-05-30 11:35:30 +02:00
Setup ( cmd . Context ( ) , config , steps , masterKey )
2022-02-09 15:01:19 +01:00
} ,
}
2022-04-25 17:05:20 +02:00
2023-04-28 13:55:35 +02:00
cmd . AddCommand ( NewCleanup ( ) )
2022-04-25 17:05:20 +02:00
Flags ( cmd )
return cmd
}
func Flags ( cmd * cobra . Command ) {
cmd . PersistentFlags ( ) . StringArrayVar ( & stepFiles , "steps" , nil , "paths to step files to overwrite default steps" )
2024-01-25 17:28:20 +01:00
cmd . Flags ( ) . Bool ( "init-projections" , viper . GetBool ( "InitProjections" ) , "beta feature: initializes projections after they are created, allows smooth start as projections are up to date" )
2024-05-30 11:35:30 +02:00
cmd . Flags ( ) . Bool ( "for-mirror" , viper . GetBool ( "ForMirror" ) , "use this flag if you want to mirror your existing data" )
2022-04-25 17:05:20 +02:00
key . AddMasterKeyFlag ( cmd )
2022-06-24 14:38:22 +02:00
tls . AddTLSModeFlag ( cmd )
2022-02-09 15:01:19 +01:00
}
2022-03-23 09:02:39 +01:00
2024-01-25 17:28:20 +01:00
// BindInitProjections binds the "init-projections" flag of cmd to the
// "InitProjections.Enabled" viper key and returns viper's binding error.
func BindInitProjections(cmd *cobra.Command) error {
	initFlag := cmd.Flags().Lookup("init-projections")
	return viper.BindPFlag("InitProjections.Enabled", initFlag)
}
2024-05-30 11:35:30 +02:00
// bindForMirror binds the "for-mirror" flag of cmd to the "ForMirror"
// viper key and returns viper's binding error.
func bindForMirror(cmd *cobra.Command) error {
	mirrorFlag := cmd.Flags().Lookup("for-mirror")
	return viper.BindPFlag("ForMirror", mirrorFlag)
}
func Setup ( ctx context . Context , config * Config , steps * Steps , masterKey string ) {
2022-07-20 11:20:49 +02:00
logging . Info ( "setup started" )
2023-12-05 12:12:01 +01:00
i18n . MustLoadSupportedLanguagesFromDir ( )
2023-12-20 18:13:04 +02:00
queryDBClient , err := database . Connect ( config . Database , false , dialect . DBPurposeQuery )
2023-10-19 12:19:10 +02:00
logging . OnError ( err ) . Fatal ( "unable to connect to database" )
2023-12-20 18:13:04 +02:00
esPusherDBClient , err := database . Connect ( config . Database , false , dialect . DBPurposeEventPusher )
logging . OnError ( err ) . Fatal ( "unable to connect to database" )
projectionDBClient , err := database . Connect ( config . Database , false , dialect . DBPurposeProjectionSpooler )
2022-03-23 09:02:39 +01:00
logging . OnError ( err ) . Fatal ( "unable to connect to database" )
2023-12-20 18:13:04 +02:00
config . Eventstore . Querier = old_es . NewCRDB ( queryDBClient )
2024-07-03 17:00:56 +02:00
esV3 := new_es . NewEventstore ( esPusherDBClient )
config . Eventstore . Pusher = esV3
config . Eventstore . Searcher = esV3
2023-10-19 12:19:10 +02:00
eventstoreClient := eventstore . NewEventstore ( config . Eventstore )
2024-07-03 17:00:56 +02:00
2022-03-23 09:02:39 +01:00
logging . OnError ( err ) . Fatal ( "unable to start eventstore" )
2024-05-30 11:35:30 +02:00
eventstoreV4 := es_v4 . NewEventstoreFromOne ( es_v4_pg . New ( queryDBClient , & es_v4_pg . Config {
MaxRetries : config . Eventstore . MaxRetries ,
} ) )
2022-03-28 10:05:09 +02:00
2023-12-20 18:13:04 +02:00
steps . s1ProjectionTable = & ProjectionTable { dbClient : queryDBClient . DB }
steps . s2AssetsTable = & AssetTable { dbClient : queryDBClient . DB }
2022-04-13 07:42:48 +02:00
2024-05-30 11:35:30 +02:00
steps . FirstInstance . Skip = config . ForMirror || steps . FirstInstance . Skip
2022-07-27 10:22:20 +02:00
steps . FirstInstance . instanceSetup = config . DefaultInstance
steps . FirstInstance . userEncryptionKey = config . EncryptionKeys . User
steps . FirstInstance . smtpEncryptionKey = config . EncryptionKeys . SMTP
2023-06-15 08:16:39 +02:00
steps . FirstInstance . oidcEncryptionKey = config . EncryptionKeys . OIDC
2022-07-27 10:22:20 +02:00
steps . FirstInstance . masterKey = masterKey
2023-12-20 18:13:04 +02:00
steps . FirstInstance . db = queryDBClient
2022-07-27 10:22:20 +02:00
steps . FirstInstance . es = eventstoreClient
steps . FirstInstance . defaults = config . SystemDefaults
steps . FirstInstance . zitadelRoles = config . InternalAuthZ . RolePermissionMappings
steps . FirstInstance . externalDomain = config . ExternalDomain
steps . FirstInstance . externalSecure = config . ExternalSecure
steps . FirstInstance . externalPort = config . ExternalPort
2022-03-23 09:02:39 +01:00
2023-12-20 18:13:04 +02:00
steps . s5LastFailed = & LastFailed { dbClient : queryDBClient . DB }
steps . s6OwnerRemoveColumns = & OwnerRemoveColumns { dbClient : queryDBClient . DB }
steps . s7LogstoreTables = & LogstoreTables { dbClient : queryDBClient . DB , username : config . Database . Username ( ) , dbType : config . Database . Type ( ) }
steps . s8AuthTokens = & AuthTokenIndexes { dbClient : queryDBClient }
2023-10-19 12:19:10 +02:00
steps . CorrectCreationDate . dbClient = esPusherDBClient
2023-12-20 18:13:04 +02:00
steps . s12AddOTPColumns = & AddOTPColumns { dbClient : queryDBClient }
steps . s13FixQuotaProjection = & FixQuotaConstraints { dbClient : queryDBClient }
2023-10-19 12:19:10 +02:00
steps . s14NewEventsTable = & NewEventsTable { dbClient : esPusherDBClient }
2023-12-20 18:13:04 +02:00
steps . s15CurrentStates = & CurrentProjectionState { dbClient : queryDBClient }
steps . s16UniqueConstraintsLower = & UniqueConstraintToLower { dbClient : queryDBClient }
steps . s17AddOffsetToUniqueConstraints = & AddOffsetToCurrentStates { dbClient : queryDBClient }
steps . s18AddLowerFieldsToLoginNames = & AddLowerFieldsToLoginNames { dbClient : queryDBClient }
2023-12-31 15:30:25 +01:00
steps . s19AddCurrentStatesIndex = & AddCurrentSequencesIndex { dbClient : queryDBClient }
2024-01-09 19:36:46 +01:00
steps . s20AddByUserSessionIndex = & AddByUserIndexToSession { dbClient : queryDBClient }
2024-01-17 11:16:48 +01:00
steps . s21AddBlockFieldToLimits = & AddBlockFieldToLimits { dbClient : queryDBClient }
2024-01-25 17:28:20 +01:00
steps . s22ActiveInstancesIndex = & ActiveInstanceEvents { dbClient : queryDBClient }
2024-03-08 14:33:53 +01:00
steps . s23CorrectGlobalUniqueConstraints = & CorrectGlobalUniqueConstraints { dbClient : esPusherDBClient }
2024-03-20 12:18:46 +02:00
steps . s24AddActorToAuthTokens = & AddActorToAuthTokens { dbClient : queryDBClient }
2024-03-28 07:21:21 +01:00
steps . s25User11AddLowerFieldsToVerifiedEmail = & User11AddLowerFieldsToVerifiedEmail { dbClient : esPusherDBClient }
2024-05-22 17:26:02 +02:00
steps . s26AuthUsers3 = & AuthUsers3 { dbClient : esPusherDBClient }
2024-05-23 07:04:07 +02:00
steps . s27IDPTemplate6SAMLNameIDFormat = & IDPTemplate6SAMLNameIDFormat { dbClient : esPusherDBClient }
2024-07-03 17:00:56 +02:00
steps . s28AddFieldTable = & AddFieldTable { dbClient : esPusherDBClient }
steps . s29FillFieldsForProjectGrant = & FillFieldsForProjectGrant { eventstore : eventstoreClient }
2024-07-05 10:36:00 +03:00
steps . s30FillFieldsForOrgDomainVerified = & FillFieldsForOrgDomainVerified { eventstore : eventstoreClient }
2024-07-08 17:54:19 +02:00
steps . s31AddAggregateIndexToFields = & AddAggregateIndexToFields { dbClient : esPusherDBClient }
2024-09-03 15:19:00 +02:00
steps . s32AddAuthSessionID = & AddAuthSessionID { dbClient : esPusherDBClient }
2024-09-26 09:14:33 +02:00
steps . s33SMSConfigs3TwilioAddVerifyServiceSid = & SMSConfigs3TwilioAddVerifyServiceSid { dbClient : esPusherDBClient }
2024-10-04 16:15:41 +03:00
steps . s34AddCacheSchema = & AddCacheSchema { dbClient : queryDBClient }
perf(oidc): nest position clause for session terminated query (#8738)
# Which Problems Are Solved
Optimize the query that checks for terminated sessions in the access
token verifier. The verifier is used in auth middleware, userinfo and
introspection.
# How the Problems Are Solved
The previous implementation built a query for certain events and then
appended a single `PositionAfter` clause. This caused the postgreSQL
planner to use indexes only for the instance ID, aggregate IDs,
aggregate types and event types. Followed by an expensive sequential
scan for the position. This resulting in internal over-fetching of rows
before the final filter was applied.
![Screenshot_20241007_105803](https://github.com/user-attachments/assets/f2d91976-be87-428b-b604-a211399b821c)
Furthermore, the query was searching for events which are not always
applicable. For example, there was always a session ID search and if
there was a user ID, we would also search for a browser fingerprint in
event payload (expensive). Even if those argument string would be empty.
This PR changes:
1. Nest the position query, so that a full `instance_id, aggregate_id,
aggregate_type, event_type, "position"` index can be matched.
2. Redefine the `es_wm` index to include the `position` column.
3. Only search for events for the IDs that actually have a value. Do not
search (noop) if none of session ID, user ID or fingerpint ID are set.
New query plan:
![Screenshot_20241007_110648](https://github.com/user-attachments/assets/c3234c33-1b76-4b33-a4a9-796f69f3d775)
# Additional Changes
- cleanup how we load multi-statement migrations and make that a bit
more reusable.
# Additional Context
- Related to https://github.com/zitadel/zitadel/issues/7639
2024-10-07 15:49:55 +03:00
steps . s35AddPositionToIndexEsWm = & AddPositionToIndexEsWm { dbClient : esPusherDBClient }
2024-11-11 12:28:27 +01:00
steps . s36FillV2Milestones = & FillV3Milestones { dbClient : queryDBClient , eventstore : eventstoreClient }
2024-10-31 15:57:17 +01:00
steps . s37Apps7OIDConfigsBackChannelLogoutURI = & Apps7OIDConfigsBackChannelLogoutURI { dbClient : esPusherDBClient }
steps . s38BackChannelLogoutNotificationStart = & BackChannelLogoutNotificationStart { dbClient : esPusherDBClient , esClient : eventstoreClient }
2024-11-26 17:26:41 +02:00
steps . s39DeleteStaleOrgFields = & DeleteStaleOrgFields { dbClient : esPusherDBClient }
2024-12-04 14:51:40 +01:00
steps . s40InitPushFunc = & InitPushFunc { dbClient : esPusherDBClient }
2024-12-04 19:10:10 +01:00
steps . s41FillFieldsForInstanceDomains = & FillFieldsForInstanceDomains { eventstore : eventstoreClient }
2023-10-19 12:19:10 +02:00
2023-12-20 18:13:04 +02:00
err = projection . Create ( ctx , projectionDBClient , eventstoreClient , config . Projections , nil , nil , nil )
2022-11-04 10:21:58 +01:00
logging . OnError ( err ) . Fatal ( "unable to start projections" )
2022-07-20 11:20:49 +02:00
repeatableSteps := [ ] migration . RepeatableMigration {
& externalConfigChange {
es : eventstoreClient ,
ExternalDomain : config . ExternalDomain ,
ExternalPort : config . ExternalPort ,
ExternalSecure : config . ExternalSecure ,
2023-07-14 09:49:57 +03:00
defaults : config . SystemDefaults ,
2022-07-20 11:20:49 +02:00
} ,
2022-11-04 10:21:58 +01:00
& projectionTables {
es : eventstoreClient ,
Version : build . Version ( ) ,
} ,
2022-07-20 11:20:49 +02:00
}
2024-03-08 14:33:53 +01:00
for _ , step := range [ ] migration . Migration {
steps . s14NewEventsTable ,
2024-12-04 14:51:40 +01:00
steps . s40InitPushFunc ,
2024-03-08 14:33:53 +01:00
steps . s1ProjectionTable ,
steps . s2AssetsTable ,
2024-07-03 17:00:56 +02:00
steps . s28AddFieldTable ,
2024-07-08 17:54:19 +02:00
steps . s31AddAggregateIndexToFields ,
2024-03-08 14:33:53 +01:00
steps . FirstInstance ,
steps . s5LastFailed ,
steps . s6OwnerRemoveColumns ,
steps . s7LogstoreTables ,
steps . s8AuthTokens ,
steps . s12AddOTPColumns ,
steps . s13FixQuotaProjection ,
steps . s15CurrentStates ,
steps . s16UniqueConstraintsLower ,
steps . s17AddOffsetToUniqueConstraints ,
steps . s19AddCurrentStatesIndex ,
steps . s20AddByUserSessionIndex ,
steps . s22ActiveInstancesIndex ,
steps . s23CorrectGlobalUniqueConstraints ,
2024-03-20 12:18:46 +02:00
steps . s24AddActorToAuthTokens ,
2024-05-22 17:26:02 +02:00
steps . s26AuthUsers3 ,
2024-07-03 17:00:56 +02:00
steps . s29FillFieldsForProjectGrant ,
2024-07-05 10:36:00 +03:00
steps . s30FillFieldsForOrgDomainVerified ,
2024-10-04 16:15:41 +03:00
steps . s34AddCacheSchema ,
perf(oidc): nest position clause for session terminated query (#8738)
# Which Problems Are Solved
Optimize the query that checks for terminated sessions in the access
token verifier. The verifier is used in auth middleware, userinfo and
introspection.
# How the Problems Are Solved
The previous implementation built a query for certain events and then
appended a single `PositionAfter` clause. This caused the postgreSQL
planner to use indexes only for the instance ID, aggregate IDs,
aggregate types and event types. Followed by an expensive sequential
scan for the position. This resulting in internal over-fetching of rows
before the final filter was applied.
![Screenshot_20241007_105803](https://github.com/user-attachments/assets/f2d91976-be87-428b-b604-a211399b821c)
Furthermore, the query was searching for events which are not always
applicable. For example, there was always a session ID search and if
there was a user ID, we would also search for a browser fingerprint in
event payload (expensive). Even if those argument string would be empty.
This PR changes:
1. Nest the position query, so that a full `instance_id, aggregate_id,
aggregate_type, event_type, "position"` index can be matched.
2. Redefine the `es_wm` index to include the `position` column.
3. Only search for events for the IDs that actually have a value. Do not
search (noop) if none of session ID, user ID or fingerpint ID are set.
New query plan:
![Screenshot_20241007_110648](https://github.com/user-attachments/assets/c3234c33-1b76-4b33-a4a9-796f69f3d775)
# Additional Changes
- cleanup how we load multi-statement migrations and make that a bit
more reusable.
# Additional Context
- Related to https://github.com/zitadel/zitadel/issues/7639
2024-10-07 15:49:55 +03:00
steps . s35AddPositionToIndexEsWm ,
2024-10-28 09:29:34 +01:00
steps . s36FillV2Milestones ,
2024-10-31 15:57:17 +01:00
steps . s38BackChannelLogoutNotificationStart ,
2024-12-04 19:10:10 +01:00
steps . s41FillFieldsForInstanceDomains ,
2024-03-08 14:33:53 +01:00
} {
mustExecuteMigration ( ctx , eventstoreClient , step , "migration failed" )
}
2022-04-25 17:05:20 +02:00
2022-07-20 11:20:49 +02:00
for _ , repeatableStep := range repeatableSteps {
2024-03-08 14:33:53 +01:00
mustExecuteMigration ( ctx , eventstoreClient , repeatableStep , "unable to migrate repeatable step" )
2022-04-25 17:05:20 +02:00
}
2023-12-08 13:14:22 +01:00
2024-01-17 11:16:48 +01:00
// These steps are executed after the repeatable steps because they add fields projections
2024-03-08 14:33:53 +01:00
for _ , step := range [ ] migration . Migration {
steps . s18AddLowerFieldsToLoginNames ,
steps . s21AddBlockFieldToLimits ,
2024-03-28 07:21:21 +01:00
steps . s25User11AddLowerFieldsToVerifiedEmail ,
2024-05-23 07:04:07 +02:00
steps . s27IDPTemplate6SAMLNameIDFormat ,
2024-09-03 15:19:00 +02:00
steps . s32AddAuthSessionID ,
2024-09-26 09:14:33 +02:00
steps . s33SMSConfigs3TwilioAddVerifyServiceSid ,
2024-10-31 15:57:17 +01:00
steps . s37Apps7OIDConfigsBackChannelLogoutURI ,
2024-11-26 17:26:41 +02:00
steps . s39DeleteStaleOrgFields ,
2024-03-08 14:33:53 +01:00
} {
mustExecuteMigration ( ctx , eventstoreClient , step , "migration failed" )
}
2024-01-25 17:28:20 +01:00
// projection initialization must be done last, since the steps above might add required columns to the projections
2024-05-30 11:35:30 +02:00
if ! config . ForMirror && config . InitProjections . Enabled {
2024-01-25 17:28:20 +01:00
initProjections (
ctx ,
eventstoreClient ,
2024-05-30 11:35:30 +02:00
eventstoreV4 ,
2024-01-25 17:28:20 +01:00
queryDBClient ,
projectionDBClient ,
masterKey ,
config ,
)
}
2022-04-25 17:05:20 +02:00
}
2023-03-01 01:11:23 +01:00
2024-03-08 14:33:53 +01:00
// mustExecuteMigration runs step through migration.Migrate and reports a
// failure via logging's Fatal with errorMsg, tagged with the step's name.
func mustExecuteMigration(ctx context.Context, eventstoreClient *eventstore.Eventstore, step migration.Migration, errorMsg string) {
	migrateErr := migration.Migrate(ctx, eventstoreClient, step)
	entry := logging.WithFields("name", step.String())
	entry.OnError(migrateErr).Fatal(errorMsg)
}
perf(oidc): nest position clause for session terminated query (#8738)
# Which Problems Are Solved
Optimize the query that checks for terminated sessions in the access
token verifier. The verifier is used in auth middleware, userinfo and
introspection.
# How the Problems Are Solved
The previous implementation built a query for certain events and then
appended a single `PositionAfter` clause. This caused the PostgreSQL
planner to use indexes only for the instance ID, aggregate IDs,
aggregate types and event types, followed by an expensive sequential
scan for the position. This resulted in internal over-fetching of rows
before the final filter was applied.
![Screenshot_20241007_105803](https://github.com/user-attachments/assets/f2d91976-be87-428b-b604-a211399b821c)
Furthermore, the query was searching for events which are not always
applicable. For example, there was always a session ID search and if
there was a user ID, we would also search for a browser fingerprint in
the event payload (expensive), even if those argument strings were empty.
This PR changes:
1. Nest the position query, so that a full `instance_id, aggregate_id,
aggregate_type, event_type, "position"` index can be matched.
2. Redefine the `es_wm` index to include the `position` column.
3. Only search for events for the IDs that actually have a value. Do not
search (noop) if none of session ID, user ID or fingerprint ID are set.
New query plan:
![Screenshot_20241007_110648](https://github.com/user-attachments/assets/c3234c33-1b76-4b33-a4a9-796f69f3d775)
# Additional Changes
- cleanup how we load multi-statement migrations and make that a bit
more reusable.
# Additional Context
- Related to https://github.com/zitadel/zitadel/issues/7639
2024-10-07 15:49:55 +03:00
// readStmt reads a single file from the embedded FS,
// under the folder/typ/filename path.
// Typ describes the database dialect and may be omitted if no
// dialect specific migration is specified.
//
// It returns the file contents as a string, or an empty string together
// with the error reported by the embedded FS.
func readStmt(fs embed.FS, folder, typ, filename string) (string, error) {
	// embed.FS only understands slash-separated paths, while filepath.Join
	// uses the OS separator. Normalise so lookups also work on Windows.
	name := filepath.ToSlash(filepath.Join(folder, typ, filename))
	stmt, err := fs.ReadFile(name)
	return string(stmt), err
}
2024-01-25 17:28:20 +01:00
perf(oidc): nest position clause for session terminated query (#8738)
# Which Problems Are Solved
Optimize the query that checks for terminated sessions in the access
token verifier. The verifier is used in auth middleware, userinfo and
introspection.
# How the Problems Are Solved
The previous implementation built a query for certain events and then
appended a single `PositionAfter` clause. This caused the PostgreSQL
planner to use indexes only for the instance ID, aggregate IDs,
aggregate types and event types, followed by an expensive sequential
scan for the position. This resulted in internal over-fetching of rows
before the final filter was applied.
![Screenshot_20241007_105803](https://github.com/user-attachments/assets/f2d91976-be87-428b-b604-a211399b821c)
Furthermore, the query was searching for events which are not always
applicable. For example, there was always a session ID search and if
there was a user ID, we would also search for a browser fingerprint in
the event payload (expensive), even if those argument strings were empty.
This PR changes:
1. Nest the position query, so that a full `instance_id, aggregate_id,
aggregate_type, event_type, "position"` index can be matched.
2. Redefine the `es_wm` index to include the `position` column.
3. Only search for events for the IDs that actually have a value. Do not
search (noop) if none of session ID, user ID or fingerprint ID are set.
New query plan:
![Screenshot_20241007_110648](https://github.com/user-attachments/assets/c3234c33-1b76-4b33-a4a9-796f69f3d775)
# Additional Changes
- cleanup how we load multi-statement migrations and make that a bit
more reusable.
# Additional Context
- Related to https://github.com/zitadel/zitadel/issues/7639
2024-10-07 15:49:55 +03:00
// statement pairs the name of an embedded file with its contents.
type statement struct {
	file  string
	query string
}

// readStatements reads all files from the embedded FS,
// under the folder/typ path.
// Typ describes the database dialect and may be omitted if no
// dialect specific migration is specified.
//
// The result holds one statement per directory entry, in the order the
// entries are returned by ReadDir. The first directory or file read
// error aborts the read and is returned with a nil slice.
func readStatements(fs embed.FS, folder, typ string) ([]statement, error) {
	// embed.FS only understands slash-separated paths, while filepath.Join
	// uses the OS separator. Normalise so lookups also work on Windows.
	basePath := filepath.ToSlash(filepath.Join(folder, typ))
	dir, err := fs.ReadDir(basePath)
	if err != nil {
		return nil, err
	}

	statements := make([]statement, len(dir))
	for i, file := range dir {
		// The file is read here, with a normalised path, so that this
		// function does not depend on how other helpers build paths.
		name := filepath.ToSlash(filepath.Join(folder, typ, file.Name()))
		query, err := fs.ReadFile(name)
		if err != nil {
			return nil, err
		}
		statements[i] = statement{file: file.Name(), query: string(query)}
	}
	return statements, nil
}
2024-01-25 17:28:20 +01:00
func initProjections (
ctx context . Context ,
eventstoreClient * eventstore . Eventstore ,
2024-05-30 11:35:30 +02:00
eventstoreV4 * es_v4 . EventStore ,
2024-01-25 17:28:20 +01:00
queryDBClient ,
projectionDBClient * database . DB ,
masterKey string ,
config * Config ,
) {
logging . Info ( "init-projections is currently in beta" )
keyStorage , err := cryptoDB . NewKeyStorage ( queryDBClient , masterKey )
logging . OnError ( err ) . Fatal ( "unable to start key storage" )
keys , err := encryption . EnsureEncryptionKeys ( ctx , config . EncryptionKeys , keyStorage )
logging . OnError ( err ) . Fatal ( "unable to ensure encryption keys" )
err = projection . Create (
ctx ,
queryDBClient ,
eventstoreClient ,
projection . Config {
RetryFailedAfter : config . InitProjections . RetryFailedAfter ,
MaxFailureCount : config . InitProjections . MaxFailureCount ,
BulkLimit : config . InitProjections . BulkLimit ,
} ,
keys . OIDC ,
keys . SAML ,
config . SystemAPIUsers ,
)
logging . OnError ( err ) . Fatal ( "unable to start projections" )
for _ , p := range projection . Projections ( ) {
err := migration . Migrate ( ctx , eventstoreClient , p )
logging . WithFields ( "name" , p . String ( ) ) . OnError ( err ) . Fatal ( "migration failed" )
}
staticStorage , err := config . AssetStorage . NewStorage ( queryDBClient . DB )
logging . OnError ( err ) . Fatal ( "unable to start asset storage" )
adminView , err := admin_view . StartView ( queryDBClient )
logging . OnError ( err ) . Fatal ( "unable to start admin view" )
admin_handler . Register ( ctx ,
admin_handler . Config {
Client : queryDBClient ,
Eventstore : eventstoreClient ,
BulkLimit : config . InitProjections . BulkLimit ,
FailureCountUntilSkip : uint64 ( config . InitProjections . MaxFailureCount ) ,
} ,
adminView ,
staticStorage ,
)
for _ , p := range admin_handler . Projections ( ) {
err := migration . Migrate ( ctx , eventstoreClient , p )
logging . WithFields ( "name" , p . String ( ) ) . OnError ( err ) . Fatal ( "migration failed" )
}
sessionTokenVerifier := internal_authz . SessionTokenVerifier ( keys . OIDC )
2024-11-04 11:44:51 +01:00
cacheConnectors , err := connector . StartConnectors ( config . Caches , queryDBClient )
logging . OnError ( err ) . Fatal ( "unable to start caches" )
2024-01-25 17:28:20 +01:00
queries , err := query . StartQueries (
ctx ,
eventstoreClient ,
2024-05-30 11:35:30 +02:00
eventstoreV4 . Querier ,
2024-01-25 17:28:20 +01:00
queryDBClient ,
projectionDBClient ,
2024-11-04 11:44:51 +01:00
cacheConnectors ,
2024-01-25 17:28:20 +01:00
config . Projections ,
config . SystemDefaults ,
keys . IDPConfig ,
keys . OTP ,
keys . OIDC ,
keys . SAML ,
2024-11-28 11:06:52 +01:00
keys . Target ,
2024-01-25 17:28:20 +01:00
config . InternalAuthZ . RolePermissionMappings ,
sessionTokenVerifier ,
func ( q * query . Queries ) domain . PermissionCheck {
return func ( ctx context . Context , permission , orgID , resourceID string ) ( err error ) {
return internal_authz . CheckPermission ( ctx , & authz_es . UserMembershipRepo { Queries : q } , config . InternalAuthZ . RolePermissionMappings , permission , orgID , resourceID )
}
} ,
0 , // not needed for projections
nil , // not needed for projections
false ,
)
logging . OnError ( err ) . Fatal ( "unable to start queries" )
authView , err := auth_view . StartView ( queryDBClient , keys . OIDC , queries , eventstoreClient )
logging . OnError ( err ) . Fatal ( "unable to start admin view" )
auth_handler . Register ( ctx ,
auth_handler . Config {
Client : queryDBClient ,
Eventstore : eventstoreClient ,
BulkLimit : config . InitProjections . BulkLimit ,
FailureCountUntilSkip : uint64 ( config . InitProjections . MaxFailureCount ) ,
} ,
authView ,
queries ,
)
for _ , p := range auth_handler . Projections ( ) {
err := migration . Migrate ( ctx , eventstoreClient , p )
logging . WithFields ( "name" , p . String ( ) ) . OnError ( err ) . Fatal ( "migration failed" )
}
authZRepo , err := authz . Start ( queries , eventstoreClient , queryDBClient , keys . OIDC , config . ExternalSecure )
logging . OnError ( err ) . Fatal ( "unable to start authz repo" )
permissionCheck := func ( ctx context . Context , permission , orgID , resourceID string ) ( err error ) {
return internal_authz . CheckPermission ( ctx , authZRepo , config . InternalAuthZ . RolePermissionMappings , permission , orgID , resourceID )
}
2024-11-04 11:44:51 +01:00
commands , err := command . StartCommands ( ctx ,
2024-01-25 17:28:20 +01:00
eventstoreClient ,
2024-11-04 11:44:51 +01:00
cacheConnectors ,
2024-01-25 17:28:20 +01:00
config . SystemDefaults ,
config . InternalAuthZ . RolePermissionMappings ,
staticStorage ,
& webauthn . Config {
DisplayName : config . WebAuthNName ,
ExternalSecure : config . ExternalSecure ,
} ,
config . ExternalDomain ,
config . ExternalSecure ,
config . ExternalPort ,
keys . IDPConfig ,
keys . OTP ,
keys . SMTP ,
keys . SMS ,
keys . User ,
keys . DomainVerification ,
keys . OIDC ,
keys . SAML ,
2024-11-28 11:06:52 +01:00
keys . Target ,
2024-01-25 17:28:20 +01:00
& http . Client { } ,
permissionCheck ,
sessionTokenVerifier ,
config . OIDC . DefaultAccessTokenLifetime ,
config . OIDC . DefaultRefreshTokenExpiration ,
config . OIDC . DefaultRefreshTokenIdleExpiration ,
config . DefaultInstance . SecretGenerators ,
)
logging . OnError ( err ) . Fatal ( "unable to start commands" )
notify_handler . Register (
ctx ,
config . Projections . Customizations [ "notifications" ] ,
config . Projections . Customizations [ "notificationsquotas" ] ,
2024-10-31 15:57:17 +01:00
config . Projections . Customizations [ "backchannel" ] ,
2024-01-25 17:28:20 +01:00
config . Projections . Customizations [ "telemetry" ] ,
feat(notification): use event worker pool (#8962)
# Which Problems Are Solved
The current handling of notification follows the same pattern as all
other projections:
Created events are handled sequentially (based on "position") by a
handler. During the process, a lot of information is aggregated (user,
texts, templates, ...).
This leads to back pressure on the projection since the handling of
events might take longer than the time before a new event (to be
handled) is created.
# How the Problems Are Solved
- The current user notification handler creates separate notification
events based on the user / session events.
- These events contain all the present and required information
including the userID.
- These notification events get processed by notification workers, which
gather the necessary information (recipient address, texts, templates)
to send out these notifications.
- If a notification fails, a retry event is created based on the current
notification request including the current state of the user (this
prevents race conditions, where a user is changed in the meantime and
the notification already gets the new state).
- The retry event will be handled after a backoff delay. This delay
increases with every attempt.
- If the configured amount of attempts is reached or the message expired
(based on config), a cancel event is created, letting the workers know,
the notification must no longer be handled.
- In case of a successful send, a sent event is created for the
notification aggregate and the existing "sent" events for the user /
session object are stored.
- The following is added to the defaults.yaml to allow configuration of
the notification workers:
```yaml
Notifications:
# The amount of workers processing the notification request events.
# If set to 0, no notification request events will be handled. This can be useful when running in
# multi binary / pod setup and allowing only certain executables to process the events.
Workers: 1 # ZITADEL_NOTIFICATIONS_WORKERS
# The amount of events a single worker will process in a run.
BulkLimit: 10 # ZITADEL_NOTIFICATIONS_BULKLIMIT
# Time interval between scheduled notifications for request events
RequeueEvery: 2s # ZITADEL_NOTIFICATIONS_REQUEUEEVERY
# The amount of workers processing the notification retry events.
# If set to 0, no notification retry events will be handled. This can be useful when running in
# multi binary / pod setup and allowing only certain executables to process the events.
RetryWorkers: 1 # ZITADEL_NOTIFICATIONS_RETRYWORKERS
# Time interval between scheduled notifications for retry events
RetryRequeueEvery: 2s # ZITADEL_NOTIFICATIONS_RETRYREQUEUEEVERY
# Only instances are projected, for which at least a projection-relevant event exists within the timeframe
# from HandleActiveInstances duration in the past until the projection's current time
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s # ZITADEL_NOTIFICATIONS_HANDLEACTIVEINSTANCES
# The maximum duration a transaction remains open
# before it stops folding additional events
# and updates the table.
TransactionDuration: 1m # ZITADEL_NOTIFICATIONS_TRANSACTIONDURATION
# Automatically cancel the notification after the amount of failed attempts
MaxAttempts: 3 # ZITADEL_NOTIFICATIONS_MAXATTEMPTS
# Automatically cancel the notification if it cannot be handled within a specific time
MaxTtl: 5m # ZITADEL_NOTIFICATIONS_MAXTTL
# Failed attempts are retried after a configured delay (with exponential backoff).
# Set a minimum and maximum delay and a factor for the backoff
MinRetryDelay: 1s # ZITADEL_NOTIFICATIONS_MINRETRYDELAY
MaxRetryDelay: 20s # ZITADEL_NOTIFICATIONS_MAXRETRYDELAY
# Any factor below 1 will be set to 1
RetryDelayFactor: 1.5 # ZITADEL_NOTIFICATIONS_RETRYDELAYFACTOR
```
# Additional Changes
None
# Additional Context
- closes #8931
2024-11-27 16:01:17 +01:00
config . Notifications ,
2024-01-25 17:28:20 +01:00
* config . Telemetry ,
config . ExternalDomain ,
config . ExternalPort ,
config . ExternalSecure ,
commands ,
queries ,
eventstoreClient ,
config . Login . DefaultOTPEmailURLV2 ,
config . SystemDefaults . Notifications . FileSystemPath ,
keys . User ,
keys . SMTP ,
keys . SMS ,
2024-10-31 15:57:17 +01:00
keys . OIDC ,
config . OIDC . DefaultBackChannelLogoutLifetime ,
feat(notification): use event worker pool (#8962)
# Which Problems Are Solved
The current handling of notification follows the same pattern as all
other projections:
Created events are handled sequentially (based on "position") by a
handler. During the process, a lot of information is aggregated (user,
texts, templates, ...).
This leads to back pressure on the projection since the handling of
events might take longer than the time before a new event (to be
handled) is created.
# How the Problems Are Solved
- The current user notification handler creates separate notification
events based on the user / session events.
- These events contain all the present and required information
including the userID.
- These notification events get processed by notification workers, which
gather the necessary information (recipient address, texts, templates)
to send out these notifications.
- If a notification fails, a retry event is created based on the current
notification request including the current state of the user (this
prevents race conditions, where a user is changed in the meantime and
the notification already gets the new state).
- The retry event will be handled after a backoff delay. This delay
increases with every attempt.
- If the configured amount of attempts is reached or the message expired
(based on config), a cancel event is created, letting the workers know,
the notification must no longer be handled.
- In case of a successful send, a sent event is created for the
notification aggregate and the existing "sent" events for the user /
session object are stored.
- The following is added to the defaults.yaml to allow configuration of
the notification workers:
```yaml
Notifications:
# The amount of workers processing the notification request events.
# If set to 0, no notification request events will be handled. This can be useful when running in
# multi binary / pod setup and allowing only certain executables to process the events.
Workers: 1 # ZITADEL_NOTIFICATIONS_WORKERS
# The amount of events a single worker will process in a run.
BulkLimit: 10 # ZITADEL_NOTIFICATIONS_BULKLIMIT
# Time interval between scheduled notifications for request events
RequeueEvery: 2s # ZITADEL_NOTIFICATIONS_REQUEUEEVERY
# The amount of workers processing the notification retry events.
# If set to 0, no notification retry events will be handled. This can be useful when running in
# multi binary / pod setup and allowing only certain executables to process the events.
RetryWorkers: 1 # ZITADEL_NOTIFICATIONS_RETRYWORKERS
# Time interval between scheduled notifications for retry events
RetryRequeueEvery: 2s # ZITADEL_NOTIFICATIONS_RETRYREQUEUEEVERY
# Only instances are projected, for which at least a projection-relevant event exists within the timeframe
# from HandleActiveInstances duration in the past until the projection's current time
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s # ZITADEL_NOTIFICATIONS_HANDLEACTIVEINSTANCES
# The maximum duration a transaction remains open
# before it stops folding additional events
# and updates the table.
TransactionDuration: 1m # ZITADEL_NOTIFICATIONS_TRANSACTIONDURATION
# Automatically cancel the notification after the amount of failed attempts
MaxAttempts: 3 # ZITADEL_NOTIFICATIONS_MAXATTEMPTS
# Automatically cancel the notification if it cannot be handled within a specific time
MaxTtl: 5m # ZITADEL_NOTIFICATIONS_MAXTTL
# Failed attempts are retried after a configured delay (with exponential backoff).
# Set a minimum and maximum delay and a factor for the backoff
MinRetryDelay: 1s # ZITADEL_NOTIFICATIONS_MINRETRYDELAY
MaxRetryDelay: 20s # ZITADEL_NOTIFICATIONS_MAXRETRYDELAY
# Any factor below 1 will be set to 1
RetryDelayFactor: 1.5 # ZITADEL_NOTIFICATIONS_RETRYDELAYFACTOR
```
# Additional Changes
None
# Additional Context
- closes #8931
2024-11-27 16:01:17 +01:00
queryDBClient ,
2024-01-25 17:28:20 +01:00
)
for _ , p := range notify_handler . Projections ( ) {
err := migration . Migrate ( ctx , eventstoreClient , p )
logging . WithFields ( "name" , p . String ( ) ) . OnError ( err ) . Fatal ( "migration failed" )
}
}