diff --git a/evals/reports/2026-05-25-live-drift.yaml b/evals/reports/2026-05-25-live-drift.yaml index 5a2118a..0f515b0 100644 --- a/evals/reports/2026-05-25-live-drift.yaml +++ b/evals/reports/2026-05-25-live-drift.yaml @@ -6,7 +6,7 @@ eval_id: live-profile-drift profile: cto-planb status: pass score: 100 -checked_at: '2026-05-25T17:10:50Z' +checked_at: '2026-05-25T17:14:09Z' checks: correctness: pass verification: pass @@ -76,7 +76,7 @@ commands: - command: hermes -p cto-planb skills list cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 210 + duration_ms: 211 stdout: " Installed Skills \n\u250F\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ @@ -113,7 +113,7 @@ commands: - command: hermes -p cto-planb mcp list cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 464 + duration_ms: 401 stdout: "\n MCP Servers:\n\n Name Transport Tools\ \ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\ @@ -126,7 +126,7 @@ commands: - command: ./install.sh --dry-run cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 2 + duration_ms: 4 stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\ \ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\ == DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\ diff --git a/evals/reports/2026-05-25-live-promotion-readiness.yaml b/evals/reports/2026-05-25-live-promotion-readiness.yaml index 045dc05..bf8ed3f 100644 --- a/evals/reports/2026-05-25-live-promotion-readiness.yaml +++ b/evals/reports/2026-05-25-live-promotion-readiness.yaml @@ -59,7 +59,7 @@ eval_results: command: command: hermes -p cto-planb skills list returncode: 0 - duration_ms: 240 + duration_ms: 232 stdout: " Installed Skills \n\u250F\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ @@ -100,7 +100,7 @@ eval_results: command: command: hermes -p cto-planb mcp list returncode: 0 - duration_ms: 431 + duration_ms: 397 stdout: "\n MCP Servers:\n\n Name Transport \ \ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\ diff --git a/evals/reports/2026-05-25-local-regression-execution-slice.yaml b/evals/reports/2026-05-25-local-regression-execution-slice.yaml index 8fdbfdb..c81c30f 100644 --- a/evals/reports/2026-05-25-local-regression-execution-slice.yaml +++ b/evals/reports/2026-05-25-local-regression-execution-slice.yaml @@ -38,7 +38,7 @@ eval_results: - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json - duration_ms: 743 + duration_ms: 739 - eval_id: live-promotion-readiness status: pass evidence: @@ -50,33 +50,34 @@ eval_results: evidence: - tests/e2e/test_j_cto_webui_prd.py command: pytest -q tests/e2e/test_j_cto_webui_prd.py - duration_ms: 1212 + duration_ms: 1198 - eval_id: webui-cto-event-browser status: pass evidence: - hermes-webui/tests/test_cto_browser_e2e.py + - hermes-webui/tests/test_cancel_interrupt.py command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py - tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py - duration_ms: 2689 + tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py + duration_ms: 3090 - eval_id: webui-cto-live-streaming status: pass evidence: - hermes-webui/tests/test_cto_live_streaming_e2e.py command: pytest -q tests/test_cto_live_streaming_e2e.py - duration_ms: 1785 + duration_ms: 1906 - eval_id: live-profile-drift status: pass evidence: - cto/evals/reports/2026-05-25-live-drift.yaml command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml - duration_ms: 718 + duration_ms: 661 - eval_id: eval-report-scoring status: pass evidence: - cto/evals/reports/*.yaml command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py "$r"; done - duration_ms: 297 + duration_ms: 289 - eval_id: diff-whitespace-check status: pass evidence: @@ -96,7 +97,7 @@ commands: --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 743 + duration_ms: 739 stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json @@ -114,38 +115,38 @@ commands: - command: pytest -q tests/e2e/test_j_cto_webui_prd.py cwd: /home/svrnty/workspaces/hermes returncode: 0 - duration_ms: 1212 + duration_ms: 1198 stdout: '.......... [100%] - 10 passed in 1.04s + 10 passed in 1.02s ' stderr: '' - command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py - tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py + tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py cwd: /home/svrnty/workspaces/hermes/hermes-webui returncode: 0 - duration_ms: 2689 - stdout: '............... [100%] + duration_ms: 3090 + stdout: '...................... [100%] - 15 passed in 2.38s + 22 passed in 2.63s ' stderr: '' - command: pytest -q tests/test_cto_live_streaming_e2e.py cwd: /home/svrnty/workspaces/hermes/hermes-webui returncode: 0 - duration_ms: 1785 + duration_ms: 1906 stdout: '. [100%] - 1 passed in 1.47s + 1 passed in 1.49s ' stderr: '' - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 718 + duration_ms: 661 stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml ' @@ -154,7 +155,7 @@ commands: "$r"; done cwd: /home/svrnty/workspaces/hermes/cto returncode: 0 - duration_ms: 297 + duration_ms: 289 stdout: 'ok ok diff --git a/evals/runners/run-local-regression.py b/evals/runners/run-local-regression.py index a299599..f3e8efe 100755 --- a/evals/runners/run-local-regression.py +++ b/evals/runners/run-local-regression.py @@ -162,6 +162,7 @@ def build_report(output: Path) -> dict[str, Any]: "tests/test_live_tool_callback_events.py", "tests/test_cto_webui_journal_e2e.py", "tests/test_cto_browser_e2e.py", + "tests/test_cancel_interrupt.py", ], cwd=WEBUI_ROOT, timeout=180, @@ -197,7 +198,7 @@ def build_report(output: Path) -> dict[str, Any]: _eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]), _eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]), _eval_result("static-prd-contract", prd, ["tests/e2e/test_j_cto_webui_prd.py"]), - _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py"]), + _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]), _eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]), _eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]), _eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),