Tighten CTO live promotion opt-in audit

This commit is contained in:
Svrnty 2026-05-25 13:41:12 -04:00
parent 2beb72064b
commit 0ebd2f69ea
4 changed files with 47 additions and 33 deletions

View File

@ -6,7 +6,7 @@ eval_id: live-profile-drift
profile: cto-planb profile: cto-planb
status: pass status: pass
score: 100 score: 100
checked_at: '2026-05-25T17:37:05Z' checked_at: '2026-05-25T17:40:32Z'
checks: checks:
correctness: pass correctness: pass
verification: pass verification: pass
@ -76,7 +76,7 @@ commands:
- command: hermes -p cto-planb skills list - command: hermes -p cto-planb skills list
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 221 duration_ms: 251
stdout: " Installed Skills \n\u250F\ stdout: " Installed Skills \n\u250F\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -113,7 +113,7 @@ commands:
- command: hermes -p cto-planb mcp list - command: hermes -p cto-planb mcp list
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 465 duration_ms: 497
stdout: "\n MCP Servers:\n\n Name Transport Tools\ stdout: "\n MCP Servers:\n\n Name Transport Tools\
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
@ -126,7 +126,7 @@ commands:
- command: ./install.sh --dry-run - command: ./install.sh --dry-run
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 4 duration_ms: 3
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\ stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\ \ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\ == DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\

View File

@ -100,7 +100,7 @@ eval_results:
command: command:
command: hermes -p cto-planb mcp list command: hermes -p cto-planb mcp list
returncode: 0 returncode: 0
duration_ms: 462 duration_ms: 458
stdout: "\n MCP Servers:\n\n Name Transport \ stdout: "\n MCP Servers:\n\n Name Transport \
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
@ -116,7 +116,9 @@ eval_results:
- Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1 - Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1
- HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string - HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string
live_requested: false live_requested: false
live_acknowledged: false
live_execution_allowed: false live_execution_allowed: false
opt_in_state_valid: true
live_execution: live_execution:
requested: false requested: false
allowed: false allowed: false

View File

@ -31,26 +31,26 @@ eval_results:
evidence: evidence:
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml - cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
duration_ms: 34 duration_ms: 37
- eval_id: promotion-fixture-execution - eval_id: promotion-fixture-execution
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
duration_ms: 755 duration_ms: 799
- eval_id: live-promotion-readiness - eval_id: live-promotion-readiness
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml - cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
duration_ms: 726 duration_ms: 720
- eval_id: static-prd-contract - eval_id: static-prd-contract
status: pass status: pass
evidence: evidence:
- tests/e2e/test_j_cto_webui_prd.py - tests/e2e/test_j_cto_webui_prd.py
command: pytest -q tests/e2e/test_j_cto_webui_prd.py command: pytest -q tests/e2e/test_j_cto_webui_prd.py
duration_ms: 1282 duration_ms: 2151
- eval_id: webui-cto-event-browser - eval_id: webui-cto-event-browser
status: pass status: pass
evidence: evidence:
@ -59,43 +59,43 @@ eval_results:
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
tests/test_approval_queue.py tests/test_approval_queue.py
duration_ms: 3152 duration_ms: 3692
- eval_id: webui-cto-live-streaming - eval_id: webui-cto-live-streaming
status: pass status: pass
evidence: evidence:
- hermes-webui/tests/test_cto_live_streaming_e2e.py - hermes-webui/tests/test_cto_live_streaming_e2e.py
command: pytest -q tests/test_cto_live_streaming_e2e.py command: pytest -q tests/test_cto_live_streaming_e2e.py
duration_ms: 1852 duration_ms: 1921
- eval_id: live-profile-drift - eval_id: live-profile-drift
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-live-drift.yaml - cto/evals/reports/2026-05-25-live-drift.yaml
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
duration_ms: 731 duration_ms: 792
- eval_id: acceptance-audit - eval_id: acceptance-audit
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-acceptance-audit.yaml - cto/evals/reports/2026-05-25-acceptance-audit.yaml
command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
duration_ms: 44 duration_ms: 49
- eval_id: eval-report-scoring - eval_id: eval-report-scoring
status: pass status: pass
evidence: evidence:
- cto/evals/reports/*.yaml - cto/evals/reports/*.yaml
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
"$r"; done "$r"; done
duration_ms: 339 duration_ms: 341
- eval_id: diff-whitespace-check - eval_id: diff-whitespace-check
status: pass status: pass
evidence: evidence:
- git diff --check - git diff --check
command: git diff --check command: git diff --check
duration_ms: 5 duration_ms: 7
commands: commands:
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml - command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 34 duration_ms: 37
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
' '
@ -104,7 +104,7 @@ commands:
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 755 duration_ms: 799
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
@ -114,7 +114,7 @@ commands:
- command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml - command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 726 duration_ms: 720
stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
' '
@ -122,7 +122,7 @@ commands:
- command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml - command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 44 duration_ms: 49
stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml
' '
@ -130,10 +130,10 @@ commands:
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py - command: pytest -q tests/e2e/test_j_cto_webui_prd.py
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 1282 duration_ms: 2151
stdout: '........... [100%] stdout: '............ [100%]
11 passed in 1.11s 12 passed in 1.92s
' '
stderr: '' stderr: ''
@ -142,27 +142,27 @@ commands:
tests/test_approval_queue.py tests/test_approval_queue.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0 returncode: 0
duration_ms: 3152 duration_ms: 3692
stdout: '...................................... [100%] stdout: '...................................... [100%]
38 passed in 2.74s 38 passed in 3.11s
' '
stderr: '' stderr: ''
- command: pytest -q tests/test_cto_live_streaming_e2e.py - command: pytest -q tests/test_cto_live_streaming_e2e.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0 returncode: 0
duration_ms: 1852 duration_ms: 1921
stdout: '.. [100%] stdout: '.. [100%]
2 passed in 1.49s 2 passed in 1.48s
' '
stderr: '' stderr: ''
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 731 duration_ms: 792
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
' '
@ -171,7 +171,7 @@ commands:
"$r"; done "$r"; done
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 339 duration_ms: 341
stdout: 'ok stdout: 'ok
ok ok
@ -199,7 +199,7 @@ commands:
- command: git diff --check - command: git diff --check
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 5 duration_ms: 7
stdout: '' stdout: ''
stderr: '' stderr: ''
notes: notes:

View File

@ -25,6 +25,13 @@ FIXTURES = CTO_ROOT / "evals" / "fixtures" / "manifest.yaml"
REQUIRED_LIVE_ACK = "i-understand-this-may-spend-tokens-and-edit-temp-workspaces" REQUIRED_LIVE_ACK = "i-understand-this-may-spend-tokens-and-edit-temp-workspaces"
def _artifact_path(path: Path) -> str:
try:
return str(path.relative_to(REPO_ROOT))
except ValueError:
return str(path)
def _run(cmd: list[str], *, cwd: Path, timeout: int = 60) -> dict[str, Any]: def _run(cmd: list[str], *, cwd: Path, timeout: int = 60) -> dict[str, Any]:
started = time.time() started = time.time()
try: try:
@ -79,9 +86,12 @@ def build_report(output: Path) -> dict[str, Any]:
skills = _run(["hermes", "-p", "cto-planb", "skills", "list"], cwd=REPO_ROOT) if hermes_available else None skills = _run(["hermes", "-p", "cto-planb", "skills", "list"], cwd=REPO_ROOT) if hermes_available else None
mcp = _run(["hermes", "-p", "cto-planb", "mcp", "list"], cwd=REPO_ROOT) if hermes_available else None mcp = _run(["hermes", "-p", "cto-planb", "mcp", "list"], cwd=REPO_ROOT) if hermes_available else None
live_requested = os.environ.get("HERMES_CTO_LIVE_PROMOTION") == "1" live_requested_raw = os.environ.get("HERMES_CTO_LIVE_PROMOTION", "")
live_ack = os.environ.get("HERMES_CTO_LIVE_PROMOTION_ACK") == REQUIRED_LIVE_ACK live_ack_raw = os.environ.get("HERMES_CTO_LIVE_PROMOTION_ACK", "")
live_requested = live_requested_raw == "1"
live_ack = live_ack_raw == REQUIRED_LIVE_ACK
live_execution_allowed = live_requested and live_ack live_execution_allowed = live_requested and live_ack
opt_in_state_valid = (not live_requested_raw and not live_ack_raw) or live_execution_allowed
eval_results = [ eval_results = [
_result( _result(
@ -110,13 +120,15 @@ def build_report(output: Path) -> dict[str, Any]:
), ),
_result( _result(
"live-execution-opt-in-policy", "live-execution-opt-in-policy",
True, opt_in_state_valid,
[ [
"Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1", "Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1",
"HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string", "HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string",
], ],
live_requested=live_requested, live_requested=live_requested,
live_acknowledged=live_ack,
live_execution_allowed=live_execution_allowed, live_execution_allowed=live_execution_allowed,
opt_in_state_valid=opt_in_state_valid,
), ),
] ]
all_passed = all(item["status"] == "pass" for item in eval_results) all_passed = all(item["status"] == "pass" for item in eval_results)
@ -149,7 +161,7 @@ def build_report(output: Path) -> dict[str, Any]:
"artifacts": { "artifacts": {
"transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md", "transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md",
"diff": "local-worktree", "diff": "local-worktree",
"logs": str(output.relative_to(REPO_ROOT)), "logs": _artifact_path(output),
"screenshots": [], "screenshots": [],
}, },
"eval_results": eval_results, "eval_results": eval_results,