zitadel/cmd/setup/cleanup.go

52 lines
1.5 KiB
Go
Raw Normal View History

package setup
import (
"context"
"github.com/spf13/cobra"
"github.com/spf13/viper"
"github.com/zitadel/logging"
"github.com/zitadel/zitadel/internal/database"
"github.com/zitadel/zitadel/internal/eventstore"
old_es "github.com/zitadel/zitadel/internal/eventstore/repository/sql"
new_es "github.com/zitadel/zitadel/internal/eventstore/v3"
"github.com/zitadel/zitadel/internal/migration"
)
func NewCleanup() *cobra.Command {
return &cobra.Command{
Use: "cleanup",
Short: "cleans up migration if they got stuck",
Long: `cleans up migration if they got stuck`,
Run: func(cmd *cobra.Command, args []string) {
config := MustNewConfig(viper.GetViper())
fix: Auto cleanup failed Setup steps if process is killed (#9736) # Which Problems Are Solved When running a long-running Zitadel Setup, Kubernetes might decide to move a pod to a new node automatically. Currently, this puts any migrations into a broken state that an operator needs to manually run the "cleanup" command on - assuming they catch the error. The only super long running commands are typically projection pre-fill operations, which depending on the size of the event table for that projection, can take many hours - plenty of time for Kubernetes to make unexpected decisions, especially in a busy cluster. # How the Problems Are Solved This change listens on `os.Interrupt` and `syscall.SIGTERM`, cancels the current Setup context, and runs the `Cleanup` command. The logs then look something like this: ```shell ... INFO[0000] verify migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:43" name=repeatable_delete_stale_org_fields INFO[0000] starting migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:66" name=repeatable_delete_stale_org_fields INFO[0000] execute delete query caller="/Users/zach/src/zitadel/cmd/setup/39.go:37" instance_id=281297936179003398 migration=repeatable_delete_stale_org_fields progress=1/1 INFO[0000] verify migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:43" name=repeatable_fill_fields_for_instance_domains INFO[0000] starting migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:66" name=repeatable_fill_fields_for_instance_domains ----- SIGTERM signal issued ----- INFO[0000] received interrupt signal, shutting down: interrupt caller="/Users/zach/src/zitadel/cmd/setup/setup.go:121" INFO[0000] query failed caller="/Users/zach/src/zitadel/internal/eventstore/repository/sql/query.go:135" error="timeout: context already done: context canceled" DEBU[0000] filter eventstore failed caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:155" error="ID=SQL-KyeAx Message=unable to filter events Parent=(timeout: context already done: context canceled)" projection=instance_domain_fields DEBU[0000] unable to rollback tx caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:110" error="sql: transaction has already been committed or rolled back" projection=instance_domain_fields INFO[0000] process events failed caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:72" error="ID=SQL-KyeAx Message=unable to filter events Parent=(timeout: context already done: context canceled)" projection=instance_domain_fields DEBU[0000] trigger iteration caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:73" iteration=0 projection=instance_domain_fields ERRO[0000] migration failed caller="/Users/zach/src/zitadel/internal/migration/migration.go:68" error="ID=SQL-KyeAx Message=unable to filter events Parent=(timeout: context already done: context canceled)" name=repeatable_fill_fields_for_instance_domains ERRO[0000] migration finish failed caller="/Users/zach/src/zitadel/internal/migration/migration.go:71" error="context canceled" name=repeatable_fill_fields_for_instance_domains ----- Cleanup before exiting ----- INFO[0000] cleanup started caller="/Users/zach/src/zitadel/cmd/setup/cleanup.go:30" INFO[0000] cleanup migration caller="/Users/zach/src/zitadel/cmd/setup/cleanup.go:47" name=repeatable_fill_fields_for_instance_domains ``` # Additional Changes * `mustExecuteMigration` -> `executeMigration`: **must**Execute logged a Fatal error previously which calls os.Exit so no cleanup was possible. Instead, this PR returns an error and assigns it to a shared error in the Setup closure that defer can check. * `initProjections` now returns an error instead of exiting # Additional Context This behavior might be unwelcome or at least unexpected in some cases. Putting it behind a feature flag or config setting is likely a good followup. --------- Co-authored-by: Silvan <27845747+adlerhurst@users.noreply.github.com>
2025-04-22 05:34:02 -04:00
Cleanup(cmd.Context(), config)
},
}
}
fix: Auto cleanup failed Setup steps if process is killed (#9736) # Which Problems Are Solved When running a long-running Zitadel Setup, Kubernetes might decide to move a pod to a new node automatically. Currently, this puts any migrations into a broken state that an operator needs to manually run the "cleanup" command on - assuming they catch the error. The only super long running commands are typically projection pre-fill operations, which depending on the size of the event table for that projection, can take many hours - plenty of time for Kubernetes to make unexpected decisions, especially in a busy cluster. # How the Problems Are Solved This change listens on `os.Interrupt` and `syscall.SIGTERM`, cancels the current Setup context, and runs the `Cleanup` command. The logs then look something like this: ```shell ... INFO[0000] verify migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:43" name=repeatable_delete_stale_org_fields INFO[0000] starting migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:66" name=repeatable_delete_stale_org_fields INFO[0000] execute delete query caller="/Users/zach/src/zitadel/cmd/setup/39.go:37" instance_id=281297936179003398 migration=repeatable_delete_stale_org_fields progress=1/1 INFO[0000] verify migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:43" name=repeatable_fill_fields_for_instance_domains INFO[0000] starting migration caller="/Users/zach/src/zitadel/internal/migration/migration.go:66" name=repeatable_fill_fields_for_instance_domains ----- SIGTERM signal issued ----- INFO[0000] received interrupt signal, shutting down: interrupt caller="/Users/zach/src/zitadel/cmd/setup/setup.go:121" INFO[0000] query failed caller="/Users/zach/src/zitadel/internal/eventstore/repository/sql/query.go:135" error="timeout: context already done: context canceled" DEBU[0000] filter eventstore failed caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:155" error="ID=SQL-KyeAx Message=unable to filter events Parent=(timeout: context already done: context canceled)" projection=instance_domain_fields DEBU[0000] unable to rollback tx caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:110" error="sql: transaction has already been committed or rolled back" projection=instance_domain_fields INFO[0000] process events failed caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:72" error="ID=SQL-KyeAx Message=unable to filter events Parent=(timeout: context already done: context canceled)" projection=instance_domain_fields DEBU[0000] trigger iteration caller="/Users/zach/src/zitadel/internal/eventstore/handler/v2/field_handler.go:73" iteration=0 projection=instance_domain_fields ERRO[0000] migration failed caller="/Users/zach/src/zitadel/internal/migration/migration.go:68" error="ID=SQL-KyeAx Message=unable to filter events Parent=(timeout: context already done: context canceled)" name=repeatable_fill_fields_for_instance_domains ERRO[0000] migration finish failed caller="/Users/zach/src/zitadel/internal/migration/migration.go:71" error="context canceled" name=repeatable_fill_fields_for_instance_domains ----- Cleanup before exiting ----- INFO[0000] cleanup started caller="/Users/zach/src/zitadel/cmd/setup/cleanup.go:30" INFO[0000] cleanup migration caller="/Users/zach/src/zitadel/cmd/setup/cleanup.go:47" name=repeatable_fill_fields_for_instance_domains ``` # Additional Changes * `mustExecuteMigration` -> `executeMigration`: **must**Execute logged a Fatal error previously which calls os.Exit so no cleanup was possible. Instead, this PR returns an error and assigns it to a shared error in the Setup closure that defer can check. * `initProjections` now returns an error instead of exiting # Additional Context This behavior might be unwelcome or at least unexpected in some cases. Putting it behind a feature flag or config setting is likely a good followup. --------- Co-authored-by: Silvan <27845747+adlerhurst@users.noreply.github.com>
2025-04-22 05:34:02 -04:00
func Cleanup(ctx context.Context, config *Config) {
logging.Info("cleanup started")
dbClient, err := database.Connect(config.Database, false)
logging.OnError(err).Fatal("unable to connect to database")
config.Eventstore.Pusher = new_es.NewEventstore(dbClient)
chore!: Introduce ZITADEL v3 (#9645) This PR summarizes multiple changes specifically only available with ZITADEL v3: - feat: Web Keys management (https://github.com/zitadel/zitadel/pull/9526) - fix(cmd): ensure proper working of mirror (https://github.com/zitadel/zitadel/pull/9509) - feat(Authz): system user support for permission check v2 (https://github.com/zitadel/zitadel/pull/9640) - chore(license): change from Apache to AGPL (https://github.com/zitadel/zitadel/pull/9597) - feat(console): list v2 sessions (https://github.com/zitadel/zitadel/pull/9539) - fix(console): add loginV2 feature flag (https://github.com/zitadel/zitadel/pull/9682) - fix(feature flags): allow reading "own" flags (https://github.com/zitadel/zitadel/pull/9649) - feat(console): add Actions V2 UI (https://github.com/zitadel/zitadel/pull/9591) BREAKING CHANGE - feat(webkey): migrate to v2beta API (https://github.com/zitadel/zitadel/pull/9445) - chore!: remove CockroachDB Support (https://github.com/zitadel/zitadel/pull/9444) - feat(actions): migrate to v2beta API (https://github.com/zitadel/zitadel/pull/9489) --------- Co-authored-by: Livio Spring <livio.a@gmail.com> Co-authored-by: Stefan Benz <46600784+stebenz@users.noreply.github.com> Co-authored-by: Silvan <27845747+adlerhurst@users.noreply.github.com> Co-authored-by: Ramon <mail@conblem.me> Co-authored-by: Elio Bischof <elio@zitadel.com> Co-authored-by: Kenta Yamaguchi <56732734+KEY60228@users.noreply.github.com> Co-authored-by: Harsha Reddy <harsha.reddy@klaviyo.com> Co-authored-by: Livio Spring <livio@zitadel.com> Co-authored-by: Max Peintner <max@caos.ch> Co-authored-by: Iraq <66622793+kkrime@users.noreply.github.com> Co-authored-by: Florian Forster <florian@zitadel.com> Co-authored-by: Tim Möhlmann <tim+github@zitadel.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Max Peintner <peintnerm@gmail.com>
2025-04-02 16:53:06 +02:00
config.Eventstore.Querier = old_es.NewPostgres(dbClient)
es := eventstore.NewEventstore(config.Eventstore)
step, err := migration.LastStuckStep(ctx, es)
logging.OnError(err).Fatal("unable to query latest migration")
if step == nil {
logging.Info("there is no stuck migration please run `zitadel setup`")
return
}
logging.WithFields("name", step.Name).Info("cleanup migration")
err = migration.CancelStep(ctx, es, step)
logging.OnError(err).Fatal("cleanup migration failed please retry")
}