zitadel/internal/api/saml/certificate.go

212 lines
5.6 KiB
Go
Raw Normal View History

package saml
import (
"context"
"fmt"
"time"
"github.com/go-jose/go-jose/v4"
"github.com/zitadel/logging"
"github.com/zitadel/saml/pkg/provider/key"
"github.com/zitadel/zitadel/internal/api/authz"
"github.com/zitadel/zitadel/internal/crypto"
"github.com/zitadel/zitadel/internal/eventstore"
"github.com/zitadel/zitadel/internal/query"
"github.com/zitadel/zitadel/internal/repository/instance"
"github.com/zitadel/zitadel/internal/repository/keypair"
"github.com/zitadel/zitadel/internal/zerrors"
)
const (
locksTable = "projections.locks"
signingKey = "signing_key"
samlUser = "SAML"
retryBackoff = 500 * time.Millisecond
retryCount = 3
lockDuration = retryCount * retryBackoff * 5
gracefulPeriod = 10 * time.Minute
)
type CertificateAndKey struct {
algorithm jose.SignatureAlgorithm
id string
key interface{}
certificate interface{}
}
func (c *CertificateAndKey) SignatureAlgorithm() jose.SignatureAlgorithm {
return c.algorithm
}
func (c *CertificateAndKey) Key() interface{} {
return c.key
}
func (c *CertificateAndKey) Certificate() interface{} {
return c.certificate
}
func (c *CertificateAndKey) ID() string {
return c.id
}
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
func (p *Storage) GetCertificateAndKey(ctx context.Context, usage crypto.KeyUsage) (certAndKey *key.CertificateAndKey, err error) {
err = retry(func() error {
certAndKey, err = p.getCertificateAndKey(ctx, usage)
if err != nil {
return err
}
if certAndKey == nil {
return zerrors.ThrowInternal(err, "SAML-8u01nks", "no certificate found")
}
return nil
})
return certAndKey, err
}
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
func (p *Storage) getCertificateAndKey(ctx context.Context, usage crypto.KeyUsage) (*key.CertificateAndKey, error) {
certs, err := p.query.ActiveCertificates(ctx, time.Now().Add(gracefulPeriod), usage)
if err != nil {
return nil, err
}
if len(certs.Certificates) > 0 {
return p.certificateToCertificateAndKey(selectCertificate(certs.Certificates))
}
var position float64
if certs.State != nil {
position = certs.State.Position
}
return nil, p.refreshCertificate(ctx, usage, position)
}
func (p *Storage) refreshCertificate(
ctx context.Context,
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
usage crypto.KeyUsage,
position float64,
) error {
ok, err := p.ensureIsLatestCertificate(ctx, position)
if err != nil {
logging.WithError(err).Error("could not ensure latest key")
return err
}
if !ok {
logging.Warn("view not up to date, retrying later")
return err
}
err = p.lockAndGenerateCertificateAndKey(ctx, usage)
logging.OnError(err).Warn("could not create signing key")
return nil
}
func (p *Storage) ensureIsLatestCertificate(ctx context.Context, position float64) (bool, error) {
maxSequence, err := p.getMaxKeySequence(ctx)
if err != nil {
return false, fmt.Errorf("error retrieving new events: %w", err)
}
return position >= maxSequence, nil
}
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
func (p *Storage) lockAndGenerateCertificateAndKey(ctx context.Context, usage crypto.KeyUsage) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
ctx = setSAMLCtx(ctx)
errs := p.locker.Lock(ctx, lockDuration, authz.GetInstance(ctx).InstanceID())
err, ok := <-errs
if err != nil || !ok {
if zerrors.IsErrorAlreadyExists(err) {
return nil
}
logging.OnError(err).Debug("initial lock failed")
return err
}
switch usage {
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
case crypto.KeyUsageSAMLMetadataSigning, crypto.KeyUsageSAMLResponseSinging:
certAndKey, err := p.GetCertificateAndKey(ctx, crypto.KeyUsageSAMLCA)
if err != nil {
return fmt.Errorf("error while reading ca certificate: %w", err)
}
if certAndKey.Key == nil || certAndKey.Certificate == nil {
return fmt.Errorf("has no ca certificate")
}
switch usage {
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
case crypto.KeyUsageSAMLMetadataSigning:
return p.command.GenerateSAMLMetadataCertificate(setSAMLCtx(ctx), p.certificateAlgorithm, certAndKey.Key, certAndKey.Certificate)
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
case crypto.KeyUsageSAMLResponseSinging:
return p.command.GenerateSAMLResponseCertificate(setSAMLCtx(ctx), p.certificateAlgorithm, certAndKey.Key, certAndKey.Certificate)
default:
return fmt.Errorf("unknown usage")
}
feat(v3alpha): web key resource (#8262) # Which Problems Are Solved Implement a new API service that allows management of OIDC signing web keys. This allows users to manage rotation of the instance level keys. which are currently managed based on expiry. The API accepts the generation of the following key types and parameters: - RSA keys with 2048, 3072 or 4096 bit in size and: - Signing with SHA-256 (RS256) - Signing with SHA-384 (RS384) - Signing with SHA-512 (RS512) - ECDSA keys with - P256 curve - P384 curve - P512 curve - ED25519 keys # How the Problems Are Solved Keys are serialized for storage using the JSON web key format from the `jose` library. This is the format that will be used by OIDC for signing, verification and publication. Each instance can have a number of key pairs. All existing public keys are meant to be used for token verification and publication the keys endpoint. Keys can be activated and the active private key is meant to sign new tokens. There is always exactly 1 active signing key: 1. When the first key for an instance is generated, it is automatically activated. 2. Activation of the next key automatically deactivates the previously active key. 3. Keys cannot be manually deactivated from the API 4. Active keys cannot be deleted # Additional Changes - Query methods that later will be used by the OIDC package are already implemented. Preparation for #8031 - Fix indentation in french translation for instance event - Move user_schema translations to consistent positions in all translation files # Additional Context - Closes #8030 - Part of #7809 --------- Co-authored-by: Elio Bischof <elio@zitadel.com>
2024-08-14 17:18:14 +03:00
case crypto.KeyUsageSAMLCA:
return p.command.GenerateSAMLCACertificate(setSAMLCtx(ctx), p.certificateAlgorithm)
default:
return fmt.Errorf("unknown certificate usage")
}
}
func (p *Storage) getMaxKeySequence(ctx context.Context) (float64, error) {
return p.eventstore.LatestSequence(ctx,
eventstore.NewSearchQueryBuilder(eventstore.ColumnsMaxSequence).
ResourceOwner(authz.GetInstance(ctx).InstanceID()).
AwaitOpenTransactions().
AddQuery().
fix(perf): simplify eventstore queries by removing or in projection handlers (#9530) # Which Problems Are Solved [A recent performance enhancement]((https://github.com/zitadel/zitadel/pull/9497)) aimed at optimizing event store queries, specifically those involving multiple aggregate type filters, has successfully improved index utilization. While the query planner now correctly selects relevant indexes, it employs [bitmap index scans](https://www.postgresql.org/docs/current/indexes-bitmap-scans.html) to retrieve data. This approach, while beneficial in many scenarios, introduces a potential I/O bottleneck. The bitmap index scan first identifies the required database blocks and then utilizes a bitmap to access the corresponding rows from the table's heap. This subsequent "bitmap heap scan" can result in significant I/O overhead, particularly when queries return a substantial number of rows across numerous data pages. ## Impact: Under heavy load or with queries filtering for a wide range of events across multiple aggregate types, this increased I/O activity may lead to: - Increased query latency. - Elevated disk utilization. - Potential performance degradation of the event store and dependent systems. # How the Problems Are Solved To address this I/O bottleneck and further optimize query performance, the projection handler has been modified. Instead of employing multiple OR clauses for each aggregate type, the aggregate and event type filters are now combined using IN ARRAY filters. Technical Details: This change allows the PostgreSQL query planner to leverage [index-only scans](https://www.postgresql.org/docs/current/indexes-index-only-scans.html). By utilizing IN ARRAY filters, the database can efficiently retrieve the necessary data directly from the index, eliminating the need to access the table's heap. This results in: * Reduced I/O: Index-only scans significantly minimize disk I/O operations, as the database avoids reading data pages from the main table. * Improved Query Performance: By reducing I/O, query execution times are substantially improved, leading to lower latency. # Additional Changes - rollback of https://github.com/zitadel/zitadel/pull/9497 # Additional Information ## Query Plan of previous query ```sql SELECT created_at, event_type, "sequence", "position", payload, creator, "owner", instance_id, aggregate_type, aggregate_id, revision FROM eventstore.events2 WHERE instance_id = '<INSTANCE_ID>' AND ( ( instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = 'project' AND event_type = ANY(ARRAY[ 'project.application.added' ,'project.application.changed' ,'project.application.deactivated' ,'project.application.reactivated' ,'project.application.removed' ,'project.removed' ,'project.application.config.api.added' ,'project.application.config.api.changed' ,'project.application.config.api.secret.changed' ,'project.application.config.api.secret.updated' ,'project.application.config.oidc.added' ,'project.application.config.oidc.changed' ,'project.application.config.oidc.secret.changed' ,'project.application.config.oidc.secret.updated' ,'project.application.config.saml.added' ,'project.application.config.saml.changed' ]) ) OR ( instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = 'org' AND event_type = 'org.removed' ) OR ( instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = 'instance' AND event_type = 'instance.removed' ) ) AND "position" > 1741600905.3495 AND "position" < ( SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(ARRAY['zitadel_es_pusher_', 'zitadel_es_pusher', 'zitadel_es_pusher_<INSTANCE_ID>']) AND state <> 'idle' ) ORDER BY "position", in_tx_order LIMIT 200 OFFSET 1; ``` ``` Limit (cost=120.08..120.09 rows=7 width=361) (actual time=2.167..2.172 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order InitPlan 1 -> Aggregate (cost=2.74..2.76 rows=1 width=32) (actual time=1.813..1.815 rows=1 loops=1) Output: COALESCE(EXTRACT(epoch FROM min(s.xact_start)), EXTRACT(epoch FROM now())) -> Nested Loop (cost=0.00..2.74 rows=1 width=8) (actual time=1.803..1.805 rows=0 loops=1) Output: s.xact_start Join Filter: (d.oid = s.datid) -> Seq Scan on pg_catalog.pg_database d (cost=0.00..1.07 rows=1 width=4) (actual time=0.016..0.021 rows=1 loops=1) Output: d.oid, d.datname, d.datdba, d.encoding, d.datlocprovider, d.datistemplate, d.datallowconn, d.dathasloginevt, d.datconnlimit, d.datfrozenxid, d.datminmxid, d.dattablespace, d.datcollate, d.datctype, d.datlocale, d.daticurules, d.datcollversion, d.datacl Filter: (d.datname = current_database()) Rows Removed by Filter: 4 -> Function Scan on pg_catalog.pg_stat_get_activity s (cost=0.00..1.63 rows=3 width=16) (actual time=1.781..1.781 rows=0 loops=1) Output: s.datid, s.pid, s.usesysid, s.application_name, s.state, s.query, s.wait_event_type, s.wait_event, s.xact_start, s.query_start, s.backend_start, s.state_change, s.client_addr, s.client_hostname, s.client_port, s.backend_xid, s.backend_xmin, s.backend_type, s.ssl, s.sslversion, s.sslcipher, s.sslbits, s.ssl_client_dn, s.ssl_client_serial, s.ssl_issuer_dn, s.gss_auth, s.gss_princ, s.gss_enc, s.gss_delegation, s.leader_pid, s.query_id Function Call: pg_stat_get_activity(NULL::integer) Filter: ((s.state <> 'idle'::text) AND (s.application_name = ANY ('{zitadel_es_pusher_,zitadel_es_pusher,zitadel_es_pusher_<INSTANCE_ID>}'::text[]))) Rows Removed by Filter: 49 -> Sort (cost=117.31..117.33 rows=8 width=361) (actual time=2.167..2.168 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Sort Key: events2."position", events2.in_tx_order Sort Method: quicksort Memory: 25kB -> Bitmap Heap Scan on eventstore.events2 (cost=84.92..117.19 rows=8 width=361) (actual time=2.088..2.089 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Recheck Cond: (((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'project'::text) AND (events2.event_type = ANY ('{project.application.added,project.application.changed,project.application.deactivated,project.application.reactivated,project.application.removed,project.removed,project.application.config.api.added,project.application.config.api.changed,project.application.config.api.secret.changed,project.application.config.api.secret.updated,project.application.config.oidc.added,project.application.config.oidc.changed,project.application.config.oidc.secret.changed,project.application.config.oidc.secret.updated,project.application.config.saml.added,project.application.config.saml.changed}'::text[])) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) OR ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'org'::text) AND (events2.event_type = 'org.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) OR ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'instance'::text) AND (events2.event_type = 'instance.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1))) -> BitmapOr (cost=84.88..84.88 rows=8 width=0) (actual time=2.080..2.081 rows=0 loops=1) -> Bitmap Index Scan on es_projection (cost=0.00..75.44 rows=8 width=0) (actual time=2.016..2.017 rows=0 loops=1) Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'project'::text) AND (events2.event_type = ANY ('{project.application.added,project.application.changed,project.application.deactivated,project.application.reactivated,project.application.removed,project.removed,project.application.config.api.added,project.application.config.api.changed,project.application.config.api.secret.changed,project.application.config.api.secret.updated,project.application.config.oidc.added,project.application.config.oidc.changed,project.application.config.oidc.secret.changed,project.application.config.oidc.secret.updated,project.application.config.saml.added,project.application.config.saml.changed}'::text[])) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) -> Bitmap Index Scan on es_projection (cost=0.00..4.71 rows=1 width=0) (actual time=0.016..0.016 rows=0 loops=1) Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'org'::text) AND (events2.event_type = 'org.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) -> Bitmap Index Scan on es_projection (cost=0.00..4.71 rows=1 width=0) (actual time=0.045..0.045 rows=0 loops=1) Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'instance'::text) AND (events2.event_type = 'instance.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) Query Identifier: 3194938266011254479 Planning Time: 1.295 ms Execution Time: 2.832 ms ``` ## Query Plan of new query ```sql SELECT created_at, event_type, "sequence", "position", payload, creator, "owner", instance_id, aggregate_type, aggregate_id, revision FROM eventstore.events2 WHERE instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = ANY(ARRAY['project', 'instance', 'org']) AND event_type = ANY(ARRAY[ 'project.application.added' ,'project.application.changed' ,'project.application.deactivated' ,'project.application.reactivated' ,'project.application.removed' ,'project.removed' ,'project.application.config.api.added' ,'project.application.config.api.changed' ,'project.application.config.api.secret.changed' ,'project.application.config.api.secret.updated' ,'project.application.config.oidc.added' ,'project.application.config.oidc.changed' ,'project.application.config.oidc.secret.changed' ,'project.application.config.oidc.secret.updated' ,'project.application.config.saml.added' ,'project.application.config.saml.changed' ,'org.removed' ,'instance.removed' ]) AND "position" < ( SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(ARRAY['zitadel_es_pusher_', 'zitadel_es_pusher', 'zitadel_es_pusher_<INSTANCE_ID>']) AND state <> 'idle' ) ORDER BY "position", in_tx_order LIMIT 200 OFFSET 1; ``` ``` Limit (cost=293.34..293.36 rows=8 width=361) (actual time=4.686..4.689 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order InitPlan 1 -> Aggregate (cost=2.74..2.76 rows=1 width=32) (actual time=1.717..1.719 rows=1 loops=1) Output: COALESCE(EXTRACT(epoch FROM min(s.xact_start)), EXTRACT(epoch FROM now())) -> Nested Loop (cost=0.00..2.74 rows=1 width=8) (actual time=1.658..1.659 rows=0 loops=1) Output: s.xact_start Join Filter: (d.oid = s.datid) -> Seq Scan on pg_catalog.pg_database d (cost=0.00..1.07 rows=1 width=4) (actual time=0.026..0.028 rows=1 loops=1) Output: d.oid, d.datname, d.datdba, d.encoding, d.datlocprovider, d.datistemplate, d.datallowconn, d.dathasloginevt, d.datconnlimit, d.datfrozenxid, d.datminmxid, d.dattablespace, d.datcollate, d.datctype, d.datlocale, d.daticurules, d.datcollversion, d.datacl Filter: (d.datname = current_database()) Rows Removed by Filter: 4 -> Function Scan on pg_catalog.pg_stat_get_activity s (cost=0.00..1.63 rows=3 width=16) (actual time=1.628..1.628 rows=0 loops=1) Output: s.datid, s.pid, s.usesysid, s.application_name, s.state, s.query, s.wait_event_type, s.wait_event, s.xact_start, s.query_start, s.backend_start, s.state_change, s.client_addr, s.client_hostname, s.client_port, s.backend_xid, s.backend_xmin, s.backend_type, s.ssl, s.sslversion, s.sslcipher, s.sslbits, s.ssl_client_dn, s.ssl_client_serial, s.ssl_issuer_dn, s.gss_auth, s.gss_princ, s.gss_enc, s.gss_delegation, s.leader_pid, s.query_id Function Call: pg_stat_get_activity(NULL::integer) Filter: ((s.state <> 'idle'::text) AND (s.application_name = ANY ('{zitadel_es_pusher_,zitadel_es_pusher,zitadel_es_pusher_<INSTANCE_ID>}'::text[]))) Rows Removed by Filter: 42 -> Sort (cost=290.58..290.60 rows=9 width=361) (actual time=4.685..4.685 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Sort Key: events2."position", events2.in_tx_order Sort Method: quicksort Memory: 25kB -> Index Scan using es_projection on eventstore.events2 (cost=0.70..290.43 rows=9 width=361) (actual time=4.616..4.617 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = ANY ('{project,instance,org}'::text[])) AND (events2.event_type = ANY ('{project.application.added,project.application.changed,project.application.deactivated,project.application.reactivated,project.application.removed,project.removed,project.application.config.api.added,project.application.config.api.changed,project.application.config.api.secret.changed,project.application.config.api.secret.updated,project.application.config.oidc.added,project.application.config.oidc.changed,project.application.config.oidc.secret.changed,project.application.config.oidc.secret.updated,project.application.config.saml.added,project.application.config.saml.changed,org.removed,instance.removed}'::text[])) AND (events2."position" > <POSITION>) AND (events2."position" < (InitPlan 1).col1)) Query Identifier: -8254550537132386499 Planning Time: 2.864 ms Execution Time: 5.414 ms ```
2025-03-13 16:50:23 +01:00
AggregateTypes(
keypair.AggregateType,
instance.AggregateType,
).
EventTypes(
keypair.AddedEventType,
keypair.AddedCertificateEventType,
fix(perf): simplify eventstore queries by removing or in projection handlers (#9530) # Which Problems Are Solved [A recent performance enhancement]((https://github.com/zitadel/zitadel/pull/9497)) aimed at optimizing event store queries, specifically those involving multiple aggregate type filters, has successfully improved index utilization. While the query planner now correctly selects relevant indexes, it employs [bitmap index scans](https://www.postgresql.org/docs/current/indexes-bitmap-scans.html) to retrieve data. This approach, while beneficial in many scenarios, introduces a potential I/O bottleneck. The bitmap index scan first identifies the required database blocks and then utilizes a bitmap to access the corresponding rows from the table's heap. This subsequent "bitmap heap scan" can result in significant I/O overhead, particularly when queries return a substantial number of rows across numerous data pages. ## Impact: Under heavy load or with queries filtering for a wide range of events across multiple aggregate types, this increased I/O activity may lead to: - Increased query latency. - Elevated disk utilization. - Potential performance degradation of the event store and dependent systems. # How the Problems Are Solved To address this I/O bottleneck and further optimize query performance, the projection handler has been modified. Instead of employing multiple OR clauses for each aggregate type, the aggregate and event type filters are now combined using IN ARRAY filters. Technical Details: This change allows the PostgreSQL query planner to leverage [index-only scans](https://www.postgresql.org/docs/current/indexes-index-only-scans.html). By utilizing IN ARRAY filters, the database can efficiently retrieve the necessary data directly from the index, eliminating the need to access the table's heap. This results in: * Reduced I/O: Index-only scans significantly minimize disk I/O operations, as the database avoids reading data pages from the main table. * Improved Query Performance: By reducing I/O, query execution times are substantially improved, leading to lower latency. # Additional Changes - rollback of https://github.com/zitadel/zitadel/pull/9497 # Additional Information ## Query Plan of previous query ```sql SELECT created_at, event_type, "sequence", "position", payload, creator, "owner", instance_id, aggregate_type, aggregate_id, revision FROM eventstore.events2 WHERE instance_id = '<INSTANCE_ID>' AND ( ( instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = 'project' AND event_type = ANY(ARRAY[ 'project.application.added' ,'project.application.changed' ,'project.application.deactivated' ,'project.application.reactivated' ,'project.application.removed' ,'project.removed' ,'project.application.config.api.added' ,'project.application.config.api.changed' ,'project.application.config.api.secret.changed' ,'project.application.config.api.secret.updated' ,'project.application.config.oidc.added' ,'project.application.config.oidc.changed' ,'project.application.config.oidc.secret.changed' ,'project.application.config.oidc.secret.updated' ,'project.application.config.saml.added' ,'project.application.config.saml.changed' ]) ) OR ( instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = 'org' AND event_type = 'org.removed' ) OR ( instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = 'instance' AND event_type = 'instance.removed' ) ) AND "position" > 1741600905.3495 AND "position" < ( SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(ARRAY['zitadel_es_pusher_', 'zitadel_es_pusher', 'zitadel_es_pusher_<INSTANCE_ID>']) AND state <> 'idle' ) ORDER BY "position", in_tx_order LIMIT 200 OFFSET 1; ``` ``` Limit (cost=120.08..120.09 rows=7 width=361) (actual time=2.167..2.172 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order InitPlan 1 -> Aggregate (cost=2.74..2.76 rows=1 width=32) (actual time=1.813..1.815 rows=1 loops=1) Output: COALESCE(EXTRACT(epoch FROM min(s.xact_start)), EXTRACT(epoch FROM now())) -> Nested Loop (cost=0.00..2.74 rows=1 width=8) (actual time=1.803..1.805 rows=0 loops=1) Output: s.xact_start Join Filter: (d.oid = s.datid) -> Seq Scan on pg_catalog.pg_database d (cost=0.00..1.07 rows=1 width=4) (actual time=0.016..0.021 rows=1 loops=1) Output: d.oid, d.datname, d.datdba, d.encoding, d.datlocprovider, d.datistemplate, d.datallowconn, d.dathasloginevt, d.datconnlimit, d.datfrozenxid, d.datminmxid, d.dattablespace, d.datcollate, d.datctype, d.datlocale, d.daticurules, d.datcollversion, d.datacl Filter: (d.datname = current_database()) Rows Removed by Filter: 4 -> Function Scan on pg_catalog.pg_stat_get_activity s (cost=0.00..1.63 rows=3 width=16) (actual time=1.781..1.781 rows=0 loops=1) Output: s.datid, s.pid, s.usesysid, s.application_name, s.state, s.query, s.wait_event_type, s.wait_event, s.xact_start, s.query_start, s.backend_start, s.state_change, s.client_addr, s.client_hostname, s.client_port, s.backend_xid, s.backend_xmin, s.backend_type, s.ssl, s.sslversion, s.sslcipher, s.sslbits, s.ssl_client_dn, s.ssl_client_serial, s.ssl_issuer_dn, s.gss_auth, s.gss_princ, s.gss_enc, s.gss_delegation, s.leader_pid, s.query_id Function Call: pg_stat_get_activity(NULL::integer) Filter: ((s.state <> 'idle'::text) AND (s.application_name = ANY ('{zitadel_es_pusher_,zitadel_es_pusher,zitadel_es_pusher_<INSTANCE_ID>}'::text[]))) Rows Removed by Filter: 49 -> Sort (cost=117.31..117.33 rows=8 width=361) (actual time=2.167..2.168 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Sort Key: events2."position", events2.in_tx_order Sort Method: quicksort Memory: 25kB -> Bitmap Heap Scan on eventstore.events2 (cost=84.92..117.19 rows=8 width=361) (actual time=2.088..2.089 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Recheck Cond: (((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'project'::text) AND (events2.event_type = ANY ('{project.application.added,project.application.changed,project.application.deactivated,project.application.reactivated,project.application.removed,project.removed,project.application.config.api.added,project.application.config.api.changed,project.application.config.api.secret.changed,project.application.config.api.secret.updated,project.application.config.oidc.added,project.application.config.oidc.changed,project.application.config.oidc.secret.changed,project.application.config.oidc.secret.updated,project.application.config.saml.added,project.application.config.saml.changed}'::text[])) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) OR ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'org'::text) AND (events2.event_type = 'org.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) OR ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'instance'::text) AND (events2.event_type = 'instance.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1))) -> BitmapOr (cost=84.88..84.88 rows=8 width=0) (actual time=2.080..2.081 rows=0 loops=1) -> Bitmap Index Scan on es_projection (cost=0.00..75.44 rows=8 width=0) (actual time=2.016..2.017 rows=0 loops=1) Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'project'::text) AND (events2.event_type = ANY ('{project.application.added,project.application.changed,project.application.deactivated,project.application.reactivated,project.application.removed,project.removed,project.application.config.api.added,project.application.config.api.changed,project.application.config.api.secret.changed,project.application.config.api.secret.updated,project.application.config.oidc.added,project.application.config.oidc.changed,project.application.config.oidc.secret.changed,project.application.config.oidc.secret.updated,project.application.config.saml.added,project.application.config.saml.changed}'::text[])) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) -> Bitmap Index Scan on es_projection (cost=0.00..4.71 rows=1 width=0) (actual time=0.016..0.016 rows=0 loops=1) Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'org'::text) AND (events2.event_type = 'org.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) -> Bitmap Index Scan on es_projection (cost=0.00..4.71 rows=1 width=0) (actual time=0.045..0.045 rows=0 loops=1) Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = 'instance'::text) AND (events2.event_type = 'instance.removed'::text) AND (events2."position" > <POSITION>) AND (events2."position" > 1741600905.3495) AND (events2."position" < (InitPlan 1).col1)) Query Identifier: 3194938266011254479 Planning Time: 1.295 ms Execution Time: 2.832 ms ``` ## Query Plan of new query ```sql SELECT created_at, event_type, "sequence", "position", payload, creator, "owner", instance_id, aggregate_type, aggregate_id, revision FROM eventstore.events2 WHERE instance_id = '<INSTANCE_ID>' AND "position" > <POSITION> AND aggregate_type = ANY(ARRAY['project', 'instance', 'org']) AND event_type = ANY(ARRAY[ 'project.application.added' ,'project.application.changed' ,'project.application.deactivated' ,'project.application.reactivated' ,'project.application.removed' ,'project.removed' ,'project.application.config.api.added' ,'project.application.config.api.changed' ,'project.application.config.api.secret.changed' ,'project.application.config.api.secret.updated' ,'project.application.config.oidc.added' ,'project.application.config.oidc.changed' ,'project.application.config.oidc.secret.changed' ,'project.application.config.oidc.secret.updated' ,'project.application.config.saml.added' ,'project.application.config.saml.changed' ,'org.removed' ,'instance.removed' ]) AND "position" < ( SELECT COALESCE(EXTRACT(EPOCH FROM min(xact_start)), EXTRACT(EPOCH FROM now())) FROM pg_stat_activity WHERE datname = current_database() AND application_name = ANY(ARRAY['zitadel_es_pusher_', 'zitadel_es_pusher', 'zitadel_es_pusher_<INSTANCE_ID>']) AND state <> 'idle' ) ORDER BY "position", in_tx_order LIMIT 200 OFFSET 1; ``` ``` Limit (cost=293.34..293.36 rows=8 width=361) (actual time=4.686..4.689 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order InitPlan 1 -> Aggregate (cost=2.74..2.76 rows=1 width=32) (actual time=1.717..1.719 rows=1 loops=1) Output: COALESCE(EXTRACT(epoch FROM min(s.xact_start)), EXTRACT(epoch FROM now())) -> Nested Loop (cost=0.00..2.74 rows=1 width=8) (actual time=1.658..1.659 rows=0 loops=1) Output: s.xact_start Join Filter: (d.oid = s.datid) -> Seq Scan on pg_catalog.pg_database d (cost=0.00..1.07 rows=1 width=4) (actual time=0.026..0.028 rows=1 loops=1) Output: d.oid, d.datname, d.datdba, d.encoding, d.datlocprovider, d.datistemplate, d.datallowconn, d.dathasloginevt, d.datconnlimit, d.datfrozenxid, d.datminmxid, d.dattablespace, d.datcollate, d.datctype, d.datlocale, d.daticurules, d.datcollversion, d.datacl Filter: (d.datname = current_database()) Rows Removed by Filter: 4 -> Function Scan on pg_catalog.pg_stat_get_activity s (cost=0.00..1.63 rows=3 width=16) (actual time=1.628..1.628 rows=0 loops=1) Output: s.datid, s.pid, s.usesysid, s.application_name, s.state, s.query, s.wait_event_type, s.wait_event, s.xact_start, s.query_start, s.backend_start, s.state_change, s.client_addr, s.client_hostname, s.client_port, s.backend_xid, s.backend_xmin, s.backend_type, s.ssl, s.sslversion, s.sslcipher, s.sslbits, s.ssl_client_dn, s.ssl_client_serial, s.ssl_issuer_dn, s.gss_auth, s.gss_princ, s.gss_enc, s.gss_delegation, s.leader_pid, s.query_id Function Call: pg_stat_get_activity(NULL::integer) Filter: ((s.state <> 'idle'::text) AND (s.application_name = ANY ('{zitadel_es_pusher_,zitadel_es_pusher,zitadel_es_pusher_<INSTANCE_ID>}'::text[]))) Rows Removed by Filter: 42 -> Sort (cost=290.58..290.60 rows=9 width=361) (actual time=4.685..4.685 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Sort Key: events2."position", events2.in_tx_order Sort Method: quicksort Memory: 25kB -> Index Scan using es_projection on eventstore.events2 (cost=0.70..290.43 rows=9 width=361) (actual time=4.616..4.617 rows=0 loops=1) Output: events2.created_at, events2.event_type, events2.sequence, events2."position", events2.payload, events2.creator, events2.owner, events2.instance_id, events2.aggregate_type, events2.aggregate_id, events2.revision, events2.in_tx_order Index Cond: ((events2.instance_id = '<INSTANCE_ID>'::text) AND (events2.aggregate_type = ANY ('{project,instance,org}'::text[])) AND (events2.event_type = ANY ('{project.application.added,project.application.changed,project.application.deactivated,project.application.reactivated,project.application.removed,project.removed,project.application.config.api.added,project.application.config.api.changed,project.application.config.api.secret.changed,project.application.config.api.secret.updated,project.application.config.oidc.added,project.application.config.oidc.changed,project.application.config.oidc.secret.changed,project.application.config.oidc.secret.updated,project.application.config.saml.added,project.application.config.saml.changed,org.removed,instance.removed}'::text[])) AND (events2."position" > <POSITION>) AND (events2."position" < (InitPlan 1).col1)) Query Identifier: -8254550537132386499 Planning Time: 2.864 ms Execution Time: 5.414 ms ```
2025-03-13 16:50:23 +01:00
instance.InstanceRemovedEventType,
).
Builder(),
)
}
func (p *Storage) certificateToCertificateAndKey(certificate query.Certificate) (_ *key.CertificateAndKey, err error) {
keyData, err := crypto.Decrypt(certificate.Key(), p.encAlg)
if err != nil {
return nil, err
}
privateKey, err := crypto.BytesToPrivateKey(keyData)
if err != nil {
return nil, err
}
cert, err := crypto.BytesToCertificate(certificate.Certificate())
if err != nil {
return nil, err
}
return &key.CertificateAndKey{
Key: privateKey,
Certificate: cert,
}, nil
}
func selectCertificate(certs []query.Certificate) query.Certificate {
return certs[len(certs)-1]
}
func setSAMLCtx(ctx context.Context) context.Context {
return authz.SetCtxData(ctx, authz.CtxData{UserID: samlUser, OrgID: authz.GetInstance(ctx).InstanceID()})
}
func retry(retryable func() error) (err error) {
for i := 0; i < retryCount; i++ {
err = retryable()
if err == nil {
return nil
}
time.Sleep(retryBackoff)
}
return err
}