Tighten CTO live promotion opt-in audit
This commit is contained in:
parent
2beb72064b
commit
0ebd2f69ea
@ -6,7 +6,7 @@ eval_id: live-profile-drift
|
|||||||
profile: cto-planb
|
profile: cto-planb
|
||||||
status: pass
|
status: pass
|
||||||
score: 100
|
score: 100
|
||||||
checked_at: '2026-05-25T17:37:05Z'
|
checked_at: '2026-05-25T17:40:32Z'
|
||||||
checks:
|
checks:
|
||||||
correctness: pass
|
correctness: pass
|
||||||
verification: pass
|
verification: pass
|
||||||
@ -76,7 +76,7 @@ commands:
|
|||||||
- command: hermes -p cto-planb skills list
|
- command: hermes -p cto-planb skills list
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 221
|
duration_ms: 251
|
||||||
stdout: " Installed Skills \n\u250F\
|
stdout: " Installed Skills \n\u250F\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
||||||
@ -113,7 +113,7 @@ commands:
|
|||||||
- command: hermes -p cto-planb mcp list
|
- command: hermes -p cto-planb mcp list
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 465
|
duration_ms: 497
|
||||||
stdout: "\n MCP Servers:\n\n Name Transport Tools\
|
stdout: "\n MCP Servers:\n\n Name Transport Tools\
|
||||||
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
@ -126,7 +126,7 @@ commands:
|
|||||||
- command: ./install.sh --dry-run
|
- command: ./install.sh --dry-run
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 4
|
duration_ms: 3
|
||||||
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
|
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
|
||||||
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
|
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
|
||||||
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\
|
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\
|
||||||
|
|||||||
@ -100,7 +100,7 @@ eval_results:
|
|||||||
command:
|
command:
|
||||||
command: hermes -p cto-planb mcp list
|
command: hermes -p cto-planb mcp list
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 462
|
duration_ms: 458
|
||||||
stdout: "\n MCP Servers:\n\n Name Transport \
|
stdout: "\n MCP Servers:\n\n Name Transport \
|
||||||
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
|
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
|
||||||
@ -116,7 +116,9 @@ eval_results:
|
|||||||
- Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1
|
- Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1
|
||||||
- HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string
|
- HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string
|
||||||
live_requested: false
|
live_requested: false
|
||||||
|
live_acknowledged: false
|
||||||
live_execution_allowed: false
|
live_execution_allowed: false
|
||||||
|
opt_in_state_valid: true
|
||||||
live_execution:
|
live_execution:
|
||||||
requested: false
|
requested: false
|
||||||
allowed: false
|
allowed: false
|
||||||
|
|||||||
@ -31,26 +31,26 @@ eval_results:
|
|||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
duration_ms: 34
|
duration_ms: 37
|
||||||
- eval_id: promotion-fixture-execution
|
- eval_id: promotion-fixture-execution
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
duration_ms: 755
|
duration_ms: 799
|
||||||
- eval_id: live-promotion-readiness
|
- eval_id: live-promotion-readiness
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
|
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
duration_ms: 726
|
duration_ms: 720
|
||||||
- eval_id: static-prd-contract
|
- eval_id: static-prd-contract
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- tests/e2e/test_j_cto_webui_prd.py
|
- tests/e2e/test_j_cto_webui_prd.py
|
||||||
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||||
duration_ms: 1282
|
duration_ms: 2151
|
||||||
- eval_id: webui-cto-event-browser
|
- eval_id: webui-cto-event-browser
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
@ -59,43 +59,43 @@ eval_results:
|
|||||||
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
||||||
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
|
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
|
||||||
tests/test_approval_queue.py
|
tests/test_approval_queue.py
|
||||||
duration_ms: 3152
|
duration_ms: 3692
|
||||||
- eval_id: webui-cto-live-streaming
|
- eval_id: webui-cto-live-streaming
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- hermes-webui/tests/test_cto_live_streaming_e2e.py
|
- hermes-webui/tests/test_cto_live_streaming_e2e.py
|
||||||
command: pytest -q tests/test_cto_live_streaming_e2e.py
|
command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||||
duration_ms: 1852
|
duration_ms: 1921
|
||||||
- eval_id: live-profile-drift
|
- eval_id: live-profile-drift
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-live-drift.yaml
|
- cto/evals/reports/2026-05-25-live-drift.yaml
|
||||||
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||||
duration_ms: 731
|
duration_ms: 792
|
||||||
- eval_id: acceptance-audit
|
- eval_id: acceptance-audit
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-acceptance-audit.yaml
|
- cto/evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
|
command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
duration_ms: 44
|
duration_ms: 49
|
||||||
- eval_id: eval-report-scoring
|
- eval_id: eval-report-scoring
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/*.yaml
|
- cto/evals/reports/*.yaml
|
||||||
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
|
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
|
||||||
"$r"; done
|
"$r"; done
|
||||||
duration_ms: 339
|
duration_ms: 341
|
||||||
- eval_id: diff-whitespace-check
|
- eval_id: diff-whitespace-check
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- git diff --check
|
- git diff --check
|
||||||
command: git diff --check
|
command: git diff --check
|
||||||
duration_ms: 5
|
duration_ms: 7
|
||||||
commands:
|
commands:
|
||||||
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 34
|
duration_ms: 37
|
||||||
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
@ -104,7 +104,7 @@ commands:
|
|||||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 755
|
duration_ms: 799
|
||||||
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
|
||||||
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
@ -114,7 +114,7 @@ commands:
|
|||||||
- command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
- command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 726
|
duration_ms: 720
|
||||||
stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
|
stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
@ -122,7 +122,7 @@ commands:
|
|||||||
- command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
|
- command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 44
|
duration_ms: 49
|
||||||
stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml
|
stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
@ -130,10 +130,10 @@ commands:
|
|||||||
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 1282
|
duration_ms: 2151
|
||||||
stdout: '........... [100%]
|
stdout: '............ [100%]
|
||||||
|
|
||||||
11 passed in 1.11s
|
12 passed in 1.92s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
@ -142,27 +142,27 @@ commands:
|
|||||||
tests/test_approval_queue.py
|
tests/test_approval_queue.py
|
||||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 3152
|
duration_ms: 3692
|
||||||
stdout: '...................................... [100%]
|
stdout: '...................................... [100%]
|
||||||
|
|
||||||
38 passed in 2.74s
|
38 passed in 3.11s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: pytest -q tests/test_cto_live_streaming_e2e.py
|
- command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 1852
|
duration_ms: 1921
|
||||||
stdout: '.. [100%]
|
stdout: '.. [100%]
|
||||||
|
|
||||||
2 passed in 1.49s
|
2 passed in 1.48s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 731
|
duration_ms: 792
|
||||||
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
|
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
@ -171,7 +171,7 @@ commands:
|
|||||||
"$r"; done
|
"$r"; done
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 339
|
duration_ms: 341
|
||||||
stdout: 'ok
|
stdout: 'ok
|
||||||
|
|
||||||
ok
|
ok
|
||||||
@ -199,7 +199,7 @@ commands:
|
|||||||
- command: git diff --check
|
- command: git diff --check
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 5
|
duration_ms: 7
|
||||||
stdout: ''
|
stdout: ''
|
||||||
stderr: ''
|
stderr: ''
|
||||||
notes:
|
notes:
|
||||||
|
|||||||
@ -25,6 +25,13 @@ FIXTURES = CTO_ROOT / "evals" / "fixtures" / "manifest.yaml"
|
|||||||
REQUIRED_LIVE_ACK = "i-understand-this-may-spend-tokens-and-edit-temp-workspaces"
|
REQUIRED_LIVE_ACK = "i-understand-this-may-spend-tokens-and-edit-temp-workspaces"
|
||||||
|
|
||||||
|
|
||||||
|
def _artifact_path(path: Path) -> str:
|
||||||
|
try:
|
||||||
|
return str(path.relative_to(REPO_ROOT))
|
||||||
|
except ValueError:
|
||||||
|
return str(path)
|
||||||
|
|
||||||
|
|
||||||
def _run(cmd: list[str], *, cwd: Path, timeout: int = 60) -> dict[str, Any]:
|
def _run(cmd: list[str], *, cwd: Path, timeout: int = 60) -> dict[str, Any]:
|
||||||
started = time.time()
|
started = time.time()
|
||||||
try:
|
try:
|
||||||
@ -79,9 +86,12 @@ def build_report(output: Path) -> dict[str, Any]:
|
|||||||
skills = _run(["hermes", "-p", "cto-planb", "skills", "list"], cwd=REPO_ROOT) if hermes_available else None
|
skills = _run(["hermes", "-p", "cto-planb", "skills", "list"], cwd=REPO_ROOT) if hermes_available else None
|
||||||
mcp = _run(["hermes", "-p", "cto-planb", "mcp", "list"], cwd=REPO_ROOT) if hermes_available else None
|
mcp = _run(["hermes", "-p", "cto-planb", "mcp", "list"], cwd=REPO_ROOT) if hermes_available else None
|
||||||
|
|
||||||
live_requested = os.environ.get("HERMES_CTO_LIVE_PROMOTION") == "1"
|
live_requested_raw = os.environ.get("HERMES_CTO_LIVE_PROMOTION", "")
|
||||||
live_ack = os.environ.get("HERMES_CTO_LIVE_PROMOTION_ACK") == REQUIRED_LIVE_ACK
|
live_ack_raw = os.environ.get("HERMES_CTO_LIVE_PROMOTION_ACK", "")
|
||||||
|
live_requested = live_requested_raw == "1"
|
||||||
|
live_ack = live_ack_raw == REQUIRED_LIVE_ACK
|
||||||
live_execution_allowed = live_requested and live_ack
|
live_execution_allowed = live_requested and live_ack
|
||||||
|
opt_in_state_valid = (not live_requested_raw and not live_ack_raw) or live_execution_allowed
|
||||||
|
|
||||||
eval_results = [
|
eval_results = [
|
||||||
_result(
|
_result(
|
||||||
@ -110,13 +120,15 @@ def build_report(output: Path) -> dict[str, Any]:
|
|||||||
),
|
),
|
||||||
_result(
|
_result(
|
||||||
"live-execution-opt-in-policy",
|
"live-execution-opt-in-policy",
|
||||||
True,
|
opt_in_state_valid,
|
||||||
[
|
[
|
||||||
"Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1",
|
"Live paid/mutating promotion execution is disabled unless HERMES_CTO_LIVE_PROMOTION=1",
|
||||||
"HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string",
|
"HERMES_CTO_LIVE_PROMOTION_ACK must match the required acknowledgement string",
|
||||||
],
|
],
|
||||||
live_requested=live_requested,
|
live_requested=live_requested,
|
||||||
|
live_acknowledged=live_ack,
|
||||||
live_execution_allowed=live_execution_allowed,
|
live_execution_allowed=live_execution_allowed,
|
||||||
|
opt_in_state_valid=opt_in_state_valid,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
all_passed = all(item["status"] == "pass" for item in eval_results)
|
all_passed = all(item["status"] == "pass" for item in eval_results)
|
||||||
@ -149,7 +161,7 @@ def build_report(output: Path) -> dict[str, Any]:
|
|||||||
"artifacts": {
|
"artifacts": {
|
||||||
"transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md",
|
"transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md",
|
||||||
"diff": "local-worktree",
|
"diff": "local-worktree",
|
||||||
"logs": str(output.relative_to(REPO_ROOT)),
|
"logs": _artifact_path(output),
|
||||||
"screenshots": [],
|
"screenshots": [],
|
||||||
},
|
},
|
||||||
"eval_results": eval_results,
|
"eval_results": eval_results,
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user