From 0ebd2f69ea5f6884201e75cc6475faf5196d9991 Mon Sep 17 00:00:00 2001 From: Svrnty Date: Mon, 25 May 2026 13:41:12 -0400 Subject: [PATCH] Tighten CTO live promotion opt-in audit --- evals/reports/2026-05-25-live-drift.yaml | 8 ++-- .../2026-05-25-live-promotion-readiness.yaml | 4 +- ...5-25-local-regression-execution-slice.yaml | 48 +++++++++---------- evals/runners/run-live-promotion-readiness.py | 20 ++++++-- 4 files changed, 47 insertions(+), 33 deletions(-) diff --git a/evals/reports/2026-05-25-live-drift.yaml b/evals/reports/2026-05-25-live-drift.yaml index 8ee6695..9ba1b38 100644 --- a/evals/reports/2026-05-25-live-drift.yaml +++ b/evals/reports/2026-05-25-live-drift.yaml @@ -6,7 +6,7 @@ eval_id: live-profile-drift profile: cto-planb status: pass score: 100 -checked_at: '2026-05-25T17:37:05Z' +checked_at: '2026-05-25T17:40:32Z' checks: correctness: pass verification: pass @@ -76,7 +76,7 @@ commands: - command: hermes -p cto-planb skills list cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 221 + duration_ms: 251 stdout: " Installed Skills \n\u250F\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ @@ -113,7 +113,7 @@ commands: - command: hermes -p cto-planb mcp list cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 465 + duration_ms: 497 stdout: "\n MCP Servers:\n\n Name Transport Tools\ \ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\ @@ -126,7 +126,7 @@ commands: - command: ./install.sh --dry-run cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 4 + duration_ms: 3 stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\ \ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\ == DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\ diff --git a/evals/reports/2026-05-25-live-promotion-readiness.yaml b/evals/reports/2026-05-25-live-promotion-readiness.yaml index 620a99b..b0b587a 100644 --- a/evals/reports/2026-05-25-live-promotion-readiness.yaml +++ b/evals/reports/2026-05-25-live-promotion-readiness.yaml @@ -100,7 +100,7 @@ eval_results: command: command: hermes -p cto-planb mcp list returncode: 0 - duration_ms: 462 + duration_ms: 458 stdout: "\n MCP Servers:\n\n Name Transport \ \ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\ @@ -116,7 +116,9 @@ eval_results: - Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1 - HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string live_requested: false + live_acknowledged: false live_execution_allowed: false + opt_in_state_valid: true live_execution: requested: false allowed: false diff --git a/evals/reports/2026-05-25-local-regression-execution-slice.yaml b/evals/reports/2026-05-25-local-regression-execution-slice.yaml index a83b6ba..77d58a2 100644 --- a/evals/reports/2026-05-25-local-regression-execution-slice.yaml +++ b/evals/reports/2026-05-25-local-regression-execution-slice.yaml @@ -31,26 +31,26 @@ eval_results: evidence: - cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml - duration_ms: 34 + duration_ms: 37 - eval_id: promotion-fixture-execution status: pass evidence: - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json - duration_ms: 755 + duration_ms: 799 - eval_id: live-promotion-readiness status: pass evidence: - cto/evals/reports/2026-05-25-live-promotion-readiness.yaml command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml - duration_ms: 726 + duration_ms: 720 - eval_id: static-prd-contract status: pass evidence: - tests/e2e/test_j_cto_webui_prd.py command: pytest -q tests/e2e/test_j_cto_webui_prd.py - duration_ms: 1282 + duration_ms: 2151 - eval_id: webui-cto-event-browser status: pass evidence: @@ -59,43 +59,43 @@ eval_results: command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py tests/test_approval_queue.py - duration_ms: 3152 + duration_ms: 3692 - eval_id: webui-cto-live-streaming status: pass evidence: - hermes-webui/tests/test_cto_live_streaming_e2e.py command: pytest -q tests/test_cto_live_streaming_e2e.py - duration_ms: 1852 + duration_ms: 1921 - eval_id: live-profile-drift status: pass evidence: - cto/evals/reports/2026-05-25-live-drift.yaml command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml - duration_ms: 731 + duration_ms: 792 - eval_id: acceptance-audit status: pass evidence: - cto/evals/reports/2026-05-25-acceptance-audit.yaml command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml - duration_ms: 44 + duration_ms: 49 - eval_id: eval-report-scoring status: pass evidence: - cto/evals/reports/*.yaml command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py "$r"; done - duration_ms: 339 + duration_ms: 341 - eval_id: diff-whitespace-check status: pass evidence: - git diff --check command: git diff --check - duration_ms: 5 + duration_ms: 7 commands: - command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 34 + duration_ms: 37 stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml ' @@ -104,7 +104,7 @@ commands: --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 755 + duration_ms: 799 stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json @@ -114,7 +114,7 @@ commands: - command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 726 + duration_ms: 720 stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml ' @@ -122,7 +122,7 @@ commands: - command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 44 + duration_ms: 49 stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml ' @@ -130,10 +130,10 @@ commands: - command: pytest -q tests/e2e/test_j_cto_webui_prd.py cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 1282 - stdout: '........... [100%] + duration_ms: 2151 + stdout: '............ [100%] - 11 passed in 1.11s + 12 passed in 1.92s ' stderr: '' @@ -142,27 +142,27 @@ commands: tests/test_approval_queue.py cwd: /home/svrnty/workspaces/hermes/hermes-webui returncode: 0 - duration_ms: 3152 + duration_ms: 3692 stdout: '...................................... [100%] - 38 passed in 2.74s + 38 passed in 3.11s ' stderr: '' - command: pytest -q tests/test_cto_live_streaming_e2e.py cwd: /home/svrnty/workspaces/hermes/hermes-webui returncode: 0 - duration_ms: 1852 + duration_ms: 1921 stdout: '.. [100%] - 2 passed in 1.49s + 2 passed in 1.48s ' stderr: '' - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 731 + duration_ms: 792 stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml ' @@ -171,7 +171,7 @@ commands: "$r"; done cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 339 + duration_ms: 341 stdout: 'ok ok @@ -199,7 +199,7 @@ commands: - command: git diff --check cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 5 + duration_ms: 7 stdout: '' stderr: '' notes: diff --git a/evals/runners/run-live-promotion-readiness.py b/evals/runners/run-live-promotion-readiness.py index deb082d..7b91c2b 100755 --- a/evals/runners/run-live-promotion-readiness.py +++ b/evals/runners/run-live-promotion-readiness.py @@ -25,6 +25,13 @@ FIXTURES = CTO_ROOT / "evals" / "fixtures" / "manifest.yaml" REQUIRED_LIVE_ACK = "i-understand-this-may-spend-tokens-and-edit-temp-workspaces" +def _artifact_path(path: Path) -> str: + try: + return str(path.relative_to(REPO_ROOT)) + except ValueError: + return str(path) + + def _run(cmd: list[str], *, cwd: Path, timeout: int = 60) -> dict[str, Any]: started = time.time() try: @@ -79,9 +86,12 @@ def build_report(output: Path) -> dict[str, Any]: skills = _run(["hermes", "-p", "cto-planb", "skills", "list"], cwd=REPO_ROOT) if hermes_available else None mcp = _run(["hermes", "-p", "cto-planb", "mcp", "list"], cwd=REPO_ROOT) if hermes_available else None - live_requested = os.environ.get("HERMES_CTO_LIVE_PROMOTION") == "1" - live_ack = os.environ.get("HERMES_CTO_LIVE_PROMOTION_ACK") == REQUIRED_LIVE_ACK + live_requested_raw = os.environ.get("HERMES_CTO_LIVE_PROMOTION", "") + live_ack_raw = os.environ.get("HERMES_CTO_LIVE_PROMOTION_ACK", "") + live_requested = live_requested_raw == "1" + live_ack = live_ack_raw == REQUIRED_LIVE_ACK live_execution_allowed = live_requested and live_ack + opt_in_state_valid = (not live_requested_raw and not live_ack_raw) or live_execution_allowed eval_results = [ _result( @@ -110,13 +120,15 @@ def build_report(output: Path) -> dict[str, Any]: ), _result( "live-execution-opt-in-policy", - True, + opt_in_state_valid, [ "Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1", "HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string", ], live_requested=live_requested, + live_acknowledged=live_ack, live_execution_allowed=live_execution_allowed, + opt_in_state_valid=opt_in_state_valid, ), ] all_passed = all(item["status"] == "pass" for item in eval_results) @@ -149,7 +161,7 @@ def build_report(output: Path) -> dict[str, Any]: "artifacts": { "transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md", "diff": "local-worktree", - "logs": str(output.relative_to(REPO_ROOT)), + "logs": _artifact_path(output), "screenshots": [], }, "eval_results": eval_results,