From 40bfac268df6f83f7889130dd4b87e7747fdfa5a Mon Sep 17 00:00:00 2001 From: Mathias Beaulieu-Duncan Date: Mon, 25 May 2026 20:04:30 -0400 Subject: [PATCH] patches: add 0007 acquire.go wait for STATE on slow-init disks (CM5 eMMC) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the upgrade kexec into v1.13.2, the CM5 eMMC takes ~2m13s from the SDHCI controller registering until mmc0 actually becomes usable. The Talos config acquire state machine (`acquire.go::stateDisk`) checks STATE in the first seconds of boot, sees `VolumePhaseMissing`, and transitions one-way to `stateEmbedded` -> `stateMaintenanceEnter`. When STATE later becomes ready, the state machine doesn't re-enter `stateDisk`, so the node stays in maintenance forever despite the on-disk config.yaml being intact. This patch makes stateDisk tolerate transient phase=missing for up to 5 minutes (stateMissingDiskTimeout) before falling through to embedded. A 5-second ticker on the outer Run loop ensures the timeout can fire even when no further volume-status events arrive (e.g. truly missing STATE on a fresh install). Validated 2026-05-25 via a canonical 3-CP rolling upgrade on a freshly flashed v1.12.4 home-test cluster: all 3 blades upgraded sequentially to v1.13.2-7 (this patch), and each came back with stage=running, its config loaded automatically, and k8s Ready within ~5 min, with no manual remediation. See doc-compute-blade-kubernetes/talos-upgrade-validation/session-2026-05-25/E2E-VALIDATED.md. Fast-init hardware sees no change — STATE reaches ready within seconds and the existing path runs. 
--- ...olume-on-slow-init-disks-in-config-a.patch | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 patches/siderolabs/talos/0007-Wait-for-STATE-volume-on-slow-init-disks-in-config-a.patch diff --git a/patches/siderolabs/talos/0007-Wait-for-STATE-volume-on-slow-init-disks-in-config-a.patch b/patches/siderolabs/talos/0007-Wait-for-STATE-volume-on-slow-init-disks-in-config-a.patch new file mode 100644 index 0000000..550684b --- /dev/null +++ b/patches/siderolabs/talos/0007-Wait-for-STATE-volume-on-slow-init-disks-in-config-a.patch @@ -0,0 +1,114 @@ +From 27501076fcb1e183e68313ef0e14092d7f403063 Mon Sep 17 00:00:00 2001 +From: Mathias Beaulieu-Duncan +Date: Mon, 25 May 2026 17:10:13 -0400 +Subject: [PATCH] Wait for STATE volume on slow-init disks in config acquire +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +On slow-init storage (notably RPi CM5 eMMC, which takes ~2 minutes 13 +seconds from kernel boot to mmc0/CQE-enabled), the config acquire state +machine's stateDisk step sees STATE in phase=missing during the early +boot window and immediately transitions to stateEmbedded. With no +embedded config either, the node falls through to maintenance mode and +stays there even after the STATE volume later reaches phase=ready — +the state machine is one-way and never re-enters stateDisk. + +Result: in-place upgrades to v1.13.2 on CM5 hardware leave the node in +maintenance with its on-disk config.yaml intact but unread, until a +human runs `talosctl apply-config --insecure` to re-feed the same file +that's already on the STATE filesystem. + +This patch makes stateDisk tolerate transient phase=missing for up to +5 minutes (stateMissingDiskTimeout) before falling through. The outer +Run loop gets a 5-second ticker so the timeout can fire even when no +further volume-status events arrive (e.g. truly missing STATE on a +fresh install). 
Fast-init hardware sees no change — STATE reaches ready +within seconds and the existing path runs. + +Tested on RPi CM5 (eMMC, 6.12.47 RPi-downstream kernel) — boot path +that previously dropped to maintenance now waits ~2m15s, sees STATE +reach ready, and continues to stateDone with the persisted config. +--- + .../pkg/controllers/config/acquire.go | 37 ++++++++++++++++++- + 1 file changed, 36 insertions(+), 1 deletion(-) + +diff --git a/internal/app/machined/pkg/controllers/config/acquire.go b/internal/app/machined/pkg/controllers/config/acquire.go +index a70fddb..7a3758f 100644 +--- a/internal/app/machined/pkg/controllers/config/acquire.go ++++ b/internal/app/machined/pkg/controllers/config/acquire.go +@@ -18,6 +18,7 @@ import ( + "path/filepath" + "slices" + "strings" ++ "time" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/safe" +@@ -86,8 +87,16 @@ type AcquireController struct { + diskConfig config.Provider + storedEmbeddedConfig []byte + skipMaskingEmbeddedConfig bool ++ firstSeenStateMissing time.Time + } + ++// stateMissingDiskTimeout is how long stateDisk will wait for the STATE volume ++// to leave the "missing" phase before falling through to embedded/maintenance. ++// On slow-init storage (e.g. RPi CM5 eMMC) the volume can take ~2-3 minutes to ++// appear after kernel boot; transitioning to embedded too eagerly leaves the ++// node permanently in maintenance even though the on-disk config is intact. ++const stateMissingDiskTimeout = 5 * time.Minute ++ + // Name implements controller.Controller interface. + func (ctrl *AcquireController) Name() string { + return "config.AcquireController" +@@ -177,11 +186,17 @@ func (ctrl *AcquireController) Run(ctx context.Context, r controller.Runtime, lo + // initialize with empty sources + ctrl.configSourcesUsed = []string{} + ++ // periodic wake-up so stateDisk can re-check slow-init volumes and trip ++ // its timeout even when no resource events fire. 
++ tick := time.NewTicker(5 * time.Second) ++ defer tick.Stop() ++ + for { + select { + case <-ctx.Done(): + return nil + case <-r.EventCh(): ++ case <-tick.C: + } + + // check the spec first +@@ -262,7 +277,27 @@ func (ctrl *AcquireController) stateDisk(ctx context.Context, r controller.Runti + // wait for the status to be available + return nil, nil, nil + case stateVolumeStatus.TypedSpec().Phase == block.VolumePhaseMissing: +- // STATE is missing, proceed to stateEmbedded ++ // STATE is reported missing. On slow-init storage (e.g. RPi CM5 eMMC) ++ // the volume can take a couple of minutes to appear after kernel boot, ++ // so wait up to stateMissingDiskTimeout for it to reach ready before ++ // falling through to embedded/maintenance. ++ if ctrl.firstSeenStateMissing.IsZero() { ++ ctrl.firstSeenStateMissing = time.Now() ++ } ++ ++ if time.Since(ctrl.firstSeenStateMissing) < stateMissingDiskTimeout { ++ logger.Info("STATE volume not yet available, waiting", ++ zap.Duration("elapsed", time.Since(ctrl.firstSeenStateMissing)), ++ zap.Duration("timeout", stateMissingDiskTimeout), ++ ) ++ ++ return nil, nil, nil ++ } ++ ++ logger.Warn("STATE volume still missing after timeout, proceeding to embedded", ++ zap.Duration("waited", time.Since(ctrl.firstSeenStateMissing)), ++ ) ++ + return ctrl.stateEmbedded, nil, nil + case stateVolumeStatus.TypedSpec().Phase == block.VolumePhaseReady: + // STATE is ready, proceed to to the action +-- +2.50.1 (Apple Git-155) +