From 27501076fcb1e183e68313ef0e14092d7f403063 Mon Sep 17 00:00:00 2001
From: Mathias Beaulieu-Duncan
Date: Mon, 25 May 2026 17:10:13 -0400
Subject: [PATCH] Wait for STATE volume on slow-init disks in config acquire
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On slow-init storage (notably RPi CM5 eMMC, which takes ~2 minutes 13
seconds from kernel boot to mmc0/CQE-enabled), the config acquire state
machine's stateDisk step sees STATE in phase=missing during the early
boot window and immediately transitions to stateEmbedded. With no
embedded config either, the node falls through to maintenance mode and
stays there even after the STATE volume later reaches phase=ready — the
state machine is one-way and never re-enters stateDisk.

Result: in-place upgrades to v1.13.2 on CM5 hardware leave the node in
maintenance with its on-disk config.yaml intact but unread, until a
human runs `talosctl apply-config --insecure` to re-feed the same file
that's already on the STATE filesystem.

This patch makes stateDisk tolerate transient phase=missing for up to
5 minutes (stateMissingDiskTimeout) before falling through. The outer
Run loop gets a 5-second ticker so the timeout can fire even when no
further volume-status events arrive (e.g. truly missing STATE on a
fresh install). Fast-init hardware sees no change — STATE reaches ready
within seconds and the existing path runs.

Tested on RPi CM5 (eMMC, 6.12.47 RPi-downstream kernel) — boot path
that previously dropped to maintenance now waits ~2m15s, sees STATE
reach ready, and continues to stateDone with the persisted config.
--- .../pkg/controllers/config/acquire.go | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/internal/app/machined/pkg/controllers/config/acquire.go b/internal/app/machined/pkg/controllers/config/acquire.go index a70fddb..7a3758f 100644 --- a/internal/app/machined/pkg/controllers/config/acquire.go +++ b/internal/app/machined/pkg/controllers/config/acquire.go @@ -18,6 +18,7 @@ import ( "path/filepath" "slices" "strings" + "time" "github.com/cosi-project/runtime/pkg/controller" "github.com/cosi-project/runtime/pkg/safe" @@ -86,8 +87,16 @@ type AcquireController struct { diskConfig config.Provider storedEmbeddedConfig []byte skipMaskingEmbeddedConfig bool + firstSeenStateMissing time.Time } +// stateMissingDiskTimeout is how long stateDisk will wait for the STATE volume +// to leave the "missing" phase before falling through to embedded/maintenance. +// On slow-init storage (e.g. RPi CM5 eMMC) the volume can take ~2-3 minutes to +// appear after kernel boot; transitioning to embedded too eagerly leaves the +// node permanently in maintenance even though the on-disk config is intact. +const stateMissingDiskTimeout = 5 * time.Minute + // Name implements controller.Controller interface. func (ctrl *AcquireController) Name() string { return "config.AcquireController" @@ -177,11 +186,17 @@ func (ctrl *AcquireController) Run(ctx context.Context, r controller.Runtime, lo // initialize with empty sources ctrl.configSourcesUsed = []string{} + // periodic wake-up so stateDisk can re-check slow-init volumes and trip + // its timeout even when no resource events fire. 
+ tick := time.NewTicker(5 * time.Second) + defer tick.Stop() + for { select { case <-ctx.Done(): return nil case <-r.EventCh(): + case <-tick.C: } // check the spec first @@ -262,7 +277,27 @@ func (ctrl *AcquireController) stateDisk(ctx context.Context, r controller.Runti // wait for the status to be available return nil, nil, nil case stateVolumeStatus.TypedSpec().Phase == block.VolumePhaseMissing: - // STATE is missing, proceed to stateEmbedded + // STATE is reported missing. On slow-init storage (e.g. RPi CM5 eMMC) + // the volume can take a couple of minutes to appear after kernel boot, + // so wait up to stateMissingDiskTimeout for it to reach ready before + // falling through to embedded/maintenance. + if ctrl.firstSeenStateMissing.IsZero() { + ctrl.firstSeenStateMissing = time.Now() + } + + if time.Since(ctrl.firstSeenStateMissing) < stateMissingDiskTimeout { + logger.Info("STATE volume not yet available, waiting", + zap.Duration("elapsed", time.Since(ctrl.firstSeenStateMissing)), + zap.Duration("timeout", stateMissingDiskTimeout), + ) + + return nil, nil, nil + } + + logger.Warn("STATE volume still missing after timeout, proceeding to embedded", + zap.Duration("waited", time.Since(ctrl.firstSeenStateMissing)), + ) + return ctrl.stateEmbedded, nil, nil case stateVolumeStatus.TypedSpec().Phase == block.VolumePhaseReady: // STATE is ready, proceed to to the action -- 2.50.1 (Apple Git-155)