Include CTO cancel coverage in evals

This commit is contained in:
Svrnty 2026-05-25 13:15:28 -04:00
parent a576288d49
commit cf3d10f8b9
4 changed files with 27 additions and 25 deletions

View File

@ -6,7 +6,7 @@ eval_id: live-profile-drift
profile: cto-planb profile: cto-planb
status: pass status: pass
score: 100 score: 100
checked_at: '2026-05-25T17:10:50Z' checked_at: '2026-05-25T17:14:09Z'
checks: checks:
correctness: pass correctness: pass
verification: pass verification: pass
@ -76,7 +76,7 @@ commands:
- command: hermes -p cto-planb skills list - command: hermes -p cto-planb skills list
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 210 duration_ms: 211
stdout: " Installed Skills \n\u250F\ stdout: " Installed Skills \n\u250F\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -113,7 +113,7 @@ commands:
- command: hermes -p cto-planb mcp list - command: hermes -p cto-planb mcp list
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 464 duration_ms: 401
stdout: "\n MCP Servers:\n\n Name Transport Tools\ stdout: "\n MCP Servers:\n\n Name Transport Tools\
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
@ -126,7 +126,7 @@ commands:
- command: ./install.sh --dry-run - command: ./install.sh --dry-run
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 2 duration_ms: 4
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\ stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\ \ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\ == DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\

View File

@ -59,7 +59,7 @@ eval_results:
command: command:
command: hermes -p cto-planb skills list command: hermes -p cto-planb skills list
returncode: 0 returncode: 0
duration_ms: 240 duration_ms: 232
stdout: " Installed Skills \n\u250F\ stdout: " Installed Skills \n\u250F\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -100,7 +100,7 @@ eval_results:
command: command:
command: hermes -p cto-planb mcp list command: hermes -p cto-planb mcp list
returncode: 0 returncode: 0
duration_ms: 431 duration_ms: 397
stdout: "\n MCP Servers:\n\n Name Transport \ stdout: "\n MCP Servers:\n\n Name Transport \
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\

View File

@ -38,7 +38,7 @@ eval_results:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
duration_ms: 743 duration_ms: 739
- eval_id: live-promotion-readiness - eval_id: live-promotion-readiness
status: pass status: pass
evidence: evidence:
@ -50,33 +50,34 @@ eval_results:
evidence: evidence:
- tests/e2e/test_j_cto_webui_prd.py - tests/e2e/test_j_cto_webui_prd.py
command: pytest -q tests/e2e/test_j_cto_webui_prd.py command: pytest -q tests/e2e/test_j_cto_webui_prd.py
duration_ms: 1212 duration_ms: 1198
- eval_id: webui-cto-event-browser - eval_id: webui-cto-event-browser
status: pass status: pass
evidence: evidence:
- hermes-webui/tests/test_cto_browser_e2e.py - hermes-webui/tests/test_cto_browser_e2e.py
- hermes-webui/tests/test_cancel_interrupt.py
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
duration_ms: 2689 duration_ms: 3090
- eval_id: webui-cto-live-streaming - eval_id: webui-cto-live-streaming
status: pass status: pass
evidence: evidence:
- hermes-webui/tests/test_cto_live_streaming_e2e.py - hermes-webui/tests/test_cto_live_streaming_e2e.py
command: pytest -q tests/test_cto_live_streaming_e2e.py command: pytest -q tests/test_cto_live_streaming_e2e.py
duration_ms: 1785 duration_ms: 1906
- eval_id: live-profile-drift - eval_id: live-profile-drift
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-live-drift.yaml - cto/evals/reports/2026-05-25-live-drift.yaml
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
duration_ms: 718 duration_ms: 661
- eval_id: eval-report-scoring - eval_id: eval-report-scoring
status: pass status: pass
evidence: evidence:
- cto/evals/reports/*.yaml - cto/evals/reports/*.yaml
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
"$r"; done "$r"; done
duration_ms: 297 duration_ms: 289
- eval_id: diff-whitespace-check - eval_id: diff-whitespace-check
status: pass status: pass
evidence: evidence:
@ -96,7 +97,7 @@ commands:
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 743 duration_ms: 739
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
@ -114,38 +115,38 @@ commands:
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py - command: pytest -q tests/e2e/test_j_cto_webui_prd.py
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 1212 duration_ms: 1198
stdout: '.......... [100%] stdout: '.......... [100%]
10 passed in 1.04s 10 passed in 1.02s
' '
stderr: '' stderr: ''
- command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py - command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0 returncode: 0
duration_ms: 2689 duration_ms: 3090
stdout: '............... [100%] stdout: '...................... [100%]
15 passed in 2.38s 22 passed in 2.63s
' '
stderr: '' stderr: ''
- command: pytest -q tests/test_cto_live_streaming_e2e.py - command: pytest -q tests/test_cto_live_streaming_e2e.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0 returncode: 0
duration_ms: 1785 duration_ms: 1906
stdout: '. [100%] stdout: '. [100%]
1 passed in 1.47s 1 passed in 1.49s
' '
stderr: '' stderr: ''
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 718 duration_ms: 661
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
' '
@ -154,7 +155,7 @@ commands:
"$r"; done "$r"; done
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 297 duration_ms: 289
stdout: 'ok stdout: 'ok
ok ok

View File

@ -162,6 +162,7 @@ def build_report(output: Path) -> dict[str, Any]:
"tests/test_live_tool_callback_events.py", "tests/test_live_tool_callback_events.py",
"tests/test_cto_webui_journal_e2e.py", "tests/test_cto_webui_journal_e2e.py",
"tests/test_cto_browser_e2e.py", "tests/test_cto_browser_e2e.py",
"tests/test_cancel_interrupt.py",
], ],
cwd=WEBUI_ROOT, cwd=WEBUI_ROOT,
timeout=180, timeout=180,
@ -197,7 +198,7 @@ def build_report(output: Path) -> dict[str, Any]:
_eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]), _eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]),
_eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]), _eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]),
_eval_result("static-prd-contract", prd, ["tests/e2e/test_j_cto_webui_prd.py"]), _eval_result("static-prd-contract", prd, ["tests/e2e/test_j_cto_webui_prd.py"]),
_eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py"]), _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]),
_eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]), _eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
_eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]), _eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
_eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]), _eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),