From 8246411b7b4f78ea87824feac7fd195322c469b2 Mon Sep 17 00:00:00 2001 From: Svrnty Date: Mon, 25 May 2026 13:27:29 -0400 Subject: [PATCH] Harden CTO sandcastle provider gate --- evals/reports/2026-05-25-live-drift.yaml | 8 ++-- .../2026-05-25-live-promotion-readiness.yaml | 4 +- ...5-25-local-regression-execution-slice.yaml | 44 +++++++++---------- lib/cto-worker.sh | 12 +++++ 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/evals/reports/2026-05-25-live-drift.yaml b/evals/reports/2026-05-25-live-drift.yaml index 6ac2906..cf3032e 100644 --- a/evals/reports/2026-05-25-live-drift.yaml +++ b/evals/reports/2026-05-25-live-drift.yaml @@ -6,7 +6,7 @@ eval_id: live-profile-drift profile: cto-planb status: pass score: 100 -checked_at: '2026-05-25T17:21:42Z' +checked_at: '2026-05-25T17:27:03Z' checks: correctness: pass verification: pass @@ -76,7 +76,7 @@ commands: - command: hermes -p cto-planb skills list cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 214 + duration_ms: 203 stdout: " Installed Skills \n\u250F\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ @@ -113,7 +113,7 @@ commands: - command: hermes -p cto-planb mcp list cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 430 + duration_ms: 401 stdout: "\n MCP Servers:\n\n Name Transport Tools\ \ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\ @@ -126,7 +126,7 @@ commands: - command: ./install.sh --dry-run cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 3 + duration_ms: 2 stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\ \ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\ == DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\ diff --git a/evals/reports/2026-05-25-live-promotion-readiness.yaml b/evals/reports/2026-05-25-live-promotion-readiness.yaml index 913d419..ee5a978 100644 --- a/evals/reports/2026-05-25-live-promotion-readiness.yaml +++ b/evals/reports/2026-05-25-live-promotion-readiness.yaml @@ -59,7 +59,7 @@ eval_results: command: command: hermes -p cto-planb skills list returncode: 0 - duration_ms: 215 + duration_ms: 229 stdout: " Installed Skills \n\u250F\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ @@ -100,7 +100,7 @@ eval_results: command: command: hermes -p cto-planb mcp list returncode: 0 - duration_ms: 435 + duration_ms: 450 stdout: "\n MCP Servers:\n\n Name Transport \ \ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\ diff --git a/evals/reports/2026-05-25-local-regression-execution-slice.yaml b/evals/reports/2026-05-25-local-regression-execution-slice.yaml index 084119c..df32a08 100644 --- a/evals/reports/2026-05-25-local-regression-execution-slice.yaml +++ b/evals/reports/2026-05-25-local-regression-execution-slice.yaml @@ -31,26 +31,26 @@ eval_results: evidence: - cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml - duration_ms: 35 + duration_ms: 39 - eval_id: promotion-fixture-execution status: pass evidence: - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json - duration_ms: 741 + duration_ms: 780 - eval_id: live-promotion-readiness status: pass evidence: - cto/evals/reports/2026-05-25-live-promotion-readiness.yaml command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml - duration_ms: 687 + duration_ms: 717 - eval_id: static-prd-contract status: pass evidence: - tests/e2e/test_j_cto_webui_prd.py command: pytest -q tests/e2e/test_j_cto_webui_prd.py - duration_ms: 1180 + duration_ms: 1227 - eval_id: webui-cto-event-browser status: pass evidence: @@ -59,37 +59,37 @@ eval_results: command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py tests/test_approval_queue.py - duration_ms: 3186 + duration_ms: 3273 - eval_id: webui-cto-live-streaming status: pass evidence: - hermes-webui/tests/test_cto_live_streaming_e2e.py command: pytest -q tests/test_cto_live_streaming_e2e.py - duration_ms: 2097 + duration_ms: 1831 - eval_id: live-profile-drift status: pass evidence: - cto/evals/reports/2026-05-25-live-drift.yaml command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml - duration_ms: 690 + duration_ms: 649 - eval_id: eval-report-scoring status: pass evidence: - cto/evals/reports/*.yaml command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py "$r"; done - duration_ms: 291 + duration_ms: 294 - eval_id: diff-whitespace-check status: pass evidence: - git diff --check command: git diff --check - duration_ms: 7 + duration_ms: 6 commands: - command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 35 + duration_ms: 39 stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml ' @@ -98,7 +98,7 @@ commands: --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 741 + duration_ms: 780 stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json @@ -108,7 +108,7 @@ commands: - command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 687 + duration_ms: 717 stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml ' @@ -116,10 +116,10 @@ commands: - command: pytest -q tests/e2e/test_j_cto_webui_prd.py cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 1180 + duration_ms: 1227 stdout: '.......... [100%] - 10 passed in 1.00s + 10 passed in 1.05s ' stderr: '' @@ -128,27 +128,27 @@ commands: tests/test_approval_queue.py cwd: /home/svrnty/workspaces/hermes/hermes-webui returncode: 0 - duration_ms: 3186 + duration_ms: 3273 stdout: '...................................... [100%] - 38 passed in 2.72s + 38 passed in 2.78s ' stderr: '' - command: pytest -q tests/test_cto_live_streaming_e2e.py cwd: /home/svrnty/workspaces/hermes/hermes-webui returncode: 0 - duration_ms: 2097 - stdout: '. [100%] + duration_ms: 1831 + stdout: '.. [100%] - 1 passed in 1.77s + 2 passed in 1.49s ' stderr: '' - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 690 + duration_ms: 649 stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml ' @@ -157,7 +157,7 @@ commands: "$r"; done cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 291 + duration_ms: 294 stdout: 'ok ok @@ -183,7 +183,7 @@ commands: - command: git diff --check cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 7 + duration_ms: 6 stdout: '' stderr: '' notes: diff --git a/lib/cto-worker.sh b/lib/cto-worker.sh index e6dc7aa..f178934 100755 --- a/lib/cto-worker.sh +++ b/lib/cto-worker.sh @@ -36,6 +36,18 @@ cmd_sandcastle() { [ -d "$target" ] || { echo "ERROR: target repo $target not found" >&2; return 1; } [ -f "$prompt_file" ] || { echo "ERROR: prompt file $prompt_file not found" >&2; return 1; } + case "$provider" in + docker|podman) ;; + noSandbox|nosandbox|head) + echo "BLOCK: unsafe sandcastle provider/strategy requires JP approval: $provider" >&2 + return 1 + ;; + *) + echo "BLOCK: unsupported sandcastle provider: $provider" >&2 + return 1 + ;; + esac + # Hard rule: never run against read-only workspace siblings. case "$(basename "$target")" in hermes-agent|hermes-webui|marketingskills|sandcastle)