Include CTO cancel coverage in evals

2026-05-25 13:15:28 -04:00 · 2026-05-25 13:15:28 -04:00 · cf3d10f8b9
commit cf3d10f8b9
parent a576288d49
4 changed files with 27 additions and 25 deletions
--- a/evals/reports/2026-05-25-live-drift.yaml
+++ b/evals/reports/2026-05-25-live-drift.yaml
@ -6,7 +6,7 @@ eval_id: live-profile-drift
 profile: cto-planb
 status: pass
 score: 100
-checked_at: '2026-05-25T17:10:50Z'
+checked_at: '2026-05-25T17:14:09Z'
 checks:
  correctness: pass
  verification: pass
@ -76,7 +76,7 @@ commands:
 - command: hermes -p cto-planb skills list
  cwd: /home/svrnty/workspaces/hermes
  returncode: 0
-  duration_ms: 210
+  duration_ms: 211
  stdout: "                        Installed Skills                        \n\u250F\
    \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
    \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -113,7 +113,7 @@ commands:
 - command: hermes -p cto-planb mcp list
  cwd: /home/svrnty/workspaces/hermes
  returncode: 0
-  duration_ms: 464
+  duration_ms: 401
  stdout: "\n  MCP Servers:\n\n  Name             Transport                      Tools\
    \        Status    \n  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
    \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
@ -126,7 +126,7 @@ commands:
 - command: ./install.sh --dry-run
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 2
+  duration_ms: 4
  stdout: "== preflight ==\n  hermes \u2713  python3 \u2713  sqlite3 \u2713  HERMES_HOME\
    \ \u2713\n  sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
    == DRY RUN \u2014 no mutations ==\n  would: ln -sfn /home/svrnty/workspaces/hermes/cto\
--- a/evals/reports/2026-05-25-live-promotion-readiness.yaml
+++ b/evals/reports/2026-05-25-live-promotion-readiness.yaml
@ -59,7 +59,7 @@ eval_results:
  command:
    command: hermes -p cto-planb skills list
    returncode: 0
-    duration_ms: 240
+    duration_ms: 232
    stdout: "                        Installed Skills                        \n\u250F\
      \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
      \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -100,7 +100,7 @@ eval_results:
  command:
    command: hermes -p cto-planb mcp list
    returncode: 0
-    duration_ms: 431
+    duration_ms: 397
    stdout: "\n  MCP Servers:\n\n  Name             Transport                    \
      \  Tools        Status    \n  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
      \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
--- a/evals/reports/2026-05-25-local-regression-execution-slice.yaml
+++ b/evals/reports/2026-05-25-local-regression-execution-slice.yaml
@ -38,7 +38,7 @@ eval_results:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
    --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
-  duration_ms: 743
+  duration_ms: 739
 - eval_id: live-promotion-readiness
  status: pass
  evidence:
@ -50,33 +50,34 @@ eval_results:
  evidence:
  - tests/e2e/test_j_cto_webui_prd.py
  command: pytest -q tests/e2e/test_j_cto_webui_prd.py
-  duration_ms: 1212
+  duration_ms: 1198
 - eval_id: webui-cto-event-browser
  status: pass
  evidence:
  - hermes-webui/tests/test_cto_browser_e2e.py
  - hermes-webui/tests/test_cancel_interrupt.py
  command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
-    tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py
+    tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
-  duration_ms: 2689
+  duration_ms: 3090
 - eval_id: webui-cto-live-streaming
  status: pass
  evidence:
  - hermes-webui/tests/test_cto_live_streaming_e2e.py
  command: pytest -q tests/test_cto_live_streaming_e2e.py
-  duration_ms: 1785
+  duration_ms: 1906
 - eval_id: live-profile-drift
  status: pass
  evidence:
  - cto/evals/reports/2026-05-25-live-drift.yaml
  command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
-  duration_ms: 718
+  duration_ms: 661
 - eval_id: eval-report-scoring
  status: pass
  evidence:
  - cto/evals/reports/*.yaml
  command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
    "$r"; done
-  duration_ms: 297
+  duration_ms: 289
 - eval_id: diff-whitespace-check
  status: pass
  evidence:
@ -96,7 +97,7 @@ commands:
    --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 743
+  duration_ms: 739
  stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
    wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
@ -114,38 +115,38 @@ commands:
 - command: pytest -q tests/e2e/test_j_cto_webui_prd.py
  cwd: /home/svrnty/workspaces/hermes
  returncode: 0
-  duration_ms: 1212
+  duration_ms: 1198
  stdout: '..........                                                               [100%]
-    10 passed in 1.04s
+    10 passed in 1.02s
    '
  stderr: ''
 - command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
-    tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py
+    tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
  cwd: /home/svrnty/workspaces/hermes/hermes-webui
  returncode: 0
-  duration_ms: 2689
+  duration_ms: 3090
-  stdout: '...............                                                          [100%]
+  stdout: '......................                                                   [100%]
-    15 passed in 2.38s
+    22 passed in 2.63s
    '
  stderr: ''
 - command: pytest -q tests/test_cto_live_streaming_e2e.py
  cwd: /home/svrnty/workspaces/hermes/hermes-webui
  returncode: 0
-  duration_ms: 1785
+  duration_ms: 1906
  stdout: '.                                                                        [100%]
-    1 passed in 1.47s
+    1 passed in 1.49s
    '
  stderr: ''
 - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 718
+  duration_ms: 661
  stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
    '
@ -154,7 +155,7 @@ commands:
    "$r"; done
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 297
+  duration_ms: 289
  stdout: 'ok
    ok
--- a/evals/runners/run-local-regression.py
+++ b/evals/runners/run-local-regression.py
@ -162,6 +162,7 @@ def build_report(output: Path) -> dict[str, Any]:
            "tests/test_live_tool_callback_events.py",
            "tests/test_cto_webui_journal_e2e.py",
            "tests/test_cto_browser_e2e.py",
            "tests/test_cancel_interrupt.py",
        ],
        cwd=WEBUI_ROOT,
        timeout=180,
@ -197,7 +198,7 @@ def build_report(output: Path) -> dict[str, Any]:
        _eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]),
        _eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]),
        _eval_result("static-prd-contract", prd, ["tests/e2e/test_j_cto_webui_prd.py"]),
-        _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py"]),
+        _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]),
        _eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
        _eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
        _eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),