Include CTO cancel coverage in evals
This commit is contained in:
parent
a576288d49
commit
cf3d10f8b9
@ -6,7 +6,7 @@ eval_id: live-profile-drift
|
|||||||
profile: cto-planb
|
profile: cto-planb
|
||||||
status: pass
|
status: pass
|
||||||
score: 100
|
score: 100
|
||||||
checked_at: '2026-05-25T17:10:50Z'
|
checked_at: '2026-05-25T17:14:09Z'
|
||||||
checks:
|
checks:
|
||||||
correctness: pass
|
correctness: pass
|
||||||
verification: pass
|
verification: pass
|
||||||
@ -76,7 +76,7 @@ commands:
|
|||||||
- command: hermes -p cto-planb skills list
|
- command: hermes -p cto-planb skills list
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 210
|
duration_ms: 211
|
||||||
stdout: " Installed Skills \n\u250F\
|
stdout: " Installed Skills \n\u250F\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
||||||
@ -113,7 +113,7 @@ commands:
|
|||||||
- command: hermes -p cto-planb mcp list
|
- command: hermes -p cto-planb mcp list
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 464
|
duration_ms: 401
|
||||||
stdout: "\n MCP Servers:\n\n Name Transport Tools\
|
stdout: "\n MCP Servers:\n\n Name Transport Tools\
|
||||||
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
@ -126,7 +126,7 @@ commands:
|
|||||||
- command: ./install.sh --dry-run
|
- command: ./install.sh --dry-run
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 2
|
duration_ms: 4
|
||||||
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
|
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
|
||||||
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
|
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
|
||||||
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\
|
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\
|
||||||
|
|||||||
@ -59,7 +59,7 @@ eval_results:
|
|||||||
command:
|
command:
|
||||||
command: hermes -p cto-planb skills list
|
command: hermes -p cto-planb skills list
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 240
|
duration_ms: 232
|
||||||
stdout: " Installed Skills \n\u250F\
|
stdout: " Installed Skills \n\u250F\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
||||||
@ -100,7 +100,7 @@ eval_results:
|
|||||||
command:
|
command:
|
||||||
command: hermes -p cto-planb mcp list
|
command: hermes -p cto-planb mcp list
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 431
|
duration_ms: 397
|
||||||
stdout: "\n MCP Servers:\n\n Name Transport \
|
stdout: "\n MCP Servers:\n\n Name Transport \
|
||||||
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
|
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
|
||||||
|
|||||||
@ -38,7 +38,7 @@ eval_results:
|
|||||||
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
duration_ms: 743
|
duration_ms: 739
|
||||||
- eval_id: live-promotion-readiness
|
- eval_id: live-promotion-readiness
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
@ -50,33 +50,34 @@ eval_results:
|
|||||||
evidence:
|
evidence:
|
||||||
- tests/e2e/test_j_cto_webui_prd.py
|
- tests/e2e/test_j_cto_webui_prd.py
|
||||||
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||||
duration_ms: 1212
|
duration_ms: 1198
|
||||||
- eval_id: webui-cto-event-browser
|
- eval_id: webui-cto-event-browser
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- hermes-webui/tests/test_cto_browser_e2e.py
|
- hermes-webui/tests/test_cto_browser_e2e.py
|
||||||
|
- hermes-webui/tests/test_cancel_interrupt.py
|
||||||
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
||||||
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py
|
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
|
||||||
duration_ms: 2689
|
duration_ms: 3090
|
||||||
- eval_id: webui-cto-live-streaming
|
- eval_id: webui-cto-live-streaming
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- hermes-webui/tests/test_cto_live_streaming_e2e.py
|
- hermes-webui/tests/test_cto_live_streaming_e2e.py
|
||||||
command: pytest -q tests/test_cto_live_streaming_e2e.py
|
command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||||
duration_ms: 1785
|
duration_ms: 1906
|
||||||
- eval_id: live-profile-drift
|
- eval_id: live-profile-drift
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-live-drift.yaml
|
- cto/evals/reports/2026-05-25-live-drift.yaml
|
||||||
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||||
duration_ms: 718
|
duration_ms: 661
|
||||||
- eval_id: eval-report-scoring
|
- eval_id: eval-report-scoring
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/*.yaml
|
- cto/evals/reports/*.yaml
|
||||||
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
|
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
|
||||||
"$r"; done
|
"$r"; done
|
||||||
duration_ms: 297
|
duration_ms: 289
|
||||||
- eval_id: diff-whitespace-check
|
- eval_id: diff-whitespace-check
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
@ -96,7 +97,7 @@ commands:
|
|||||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 743
|
duration_ms: 739
|
||||||
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
|
||||||
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
@ -114,38 +115,38 @@ commands:
|
|||||||
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 1212
|
duration_ms: 1198
|
||||||
stdout: '.......... [100%]
|
stdout: '.......... [100%]
|
||||||
|
|
||||||
10 passed in 1.04s
|
10 passed in 1.02s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
- command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
||||||
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py
|
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
|
||||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 2689
|
duration_ms: 3090
|
||||||
stdout: '............... [100%]
|
stdout: '...................... [100%]
|
||||||
|
|
||||||
15 passed in 2.38s
|
22 passed in 2.63s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: pytest -q tests/test_cto_live_streaming_e2e.py
|
- command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 1785
|
duration_ms: 1906
|
||||||
stdout: '. [100%]
|
stdout: '. [100%]
|
||||||
|
|
||||||
1 passed in 1.47s
|
1 passed in 1.49s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 718
|
duration_ms: 661
|
||||||
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
|
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
@ -154,7 +155,7 @@ commands:
|
|||||||
"$r"; done
|
"$r"; done
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 297
|
duration_ms: 289
|
||||||
stdout: 'ok
|
stdout: 'ok
|
||||||
|
|
||||||
ok
|
ok
|
||||||
|
|||||||
@ -162,6 +162,7 @@ def build_report(output: Path) -> dict[str, Any]:
|
|||||||
"tests/test_live_tool_callback_events.py",
|
"tests/test_live_tool_callback_events.py",
|
||||||
"tests/test_cto_webui_journal_e2e.py",
|
"tests/test_cto_webui_journal_e2e.py",
|
||||||
"tests/test_cto_browser_e2e.py",
|
"tests/test_cto_browser_e2e.py",
|
||||||
|
"tests/test_cancel_interrupt.py",
|
||||||
],
|
],
|
||||||
cwd=WEBUI_ROOT,
|
cwd=WEBUI_ROOT,
|
||||||
timeout=180,
|
timeout=180,
|
||||||
@ -197,7 +198,7 @@ def build_report(output: Path) -> dict[str, Any]:
|
|||||||
_eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]),
|
_eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]),
|
||||||
_eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]),
|
_eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]),
|
||||||
_eval_result("static-prd-contract", prd, ["tests/e2e/test_j_cto_webui_prd.py"]),
|
_eval_result("static-prd-contract", prd, ["tests/e2e/test_j_cto_webui_prd.py"]),
|
||||||
_eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py"]),
|
_eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]),
|
||||||
_eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
|
_eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
|
||||||
_eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
|
_eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
|
||||||
_eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),
|
_eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user