Refresh CTO WebUI audit eval proof

This commit is contained in:
Svrnty 2026-05-25 13:21:01 -04:00
parent cf3d10f8b9
commit e5040db9bc
4 changed files with 28 additions and 25 deletions

View File

@ -6,7 +6,7 @@ eval_id: live-profile-drift
profile: cto-planb
status: pass
score: 100
checked_at: '2026-05-25T17:14:09Z'
checked_at: '2026-05-25T17:20:57Z'
checks:
correctness: pass
verification: pass
@ -76,7 +76,7 @@ commands:
- command: hermes -p cto-planb skills list
cwd: /home/svrnty/workspaces/hermes
returncode: 0
duration_ms: 211
duration_ms: 235
stdout: " Installed Skills \n\u250F\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -113,7 +113,7 @@ commands:
- command: hermes -p cto-planb mcp list
cwd: /home/svrnty/workspaces/hermes
returncode: 0
duration_ms: 401
duration_ms: 470
stdout: "\n MCP Servers:\n\n Name Transport Tools\
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\

View File

@ -59,7 +59,7 @@ eval_results:
command:
command: hermes -p cto-planb skills list
returncode: 0
duration_ms: 232
duration_ms: 210
stdout: " Installed Skills \n\u250F\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -100,7 +100,7 @@ eval_results:
command:
command: hermes -p cto-planb mcp list
returncode: 0
duration_ms: 397
duration_ms: 443
stdout: "\n MCP Servers:\n\n Name Transport \
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\

View File

@ -38,19 +38,19 @@ eval_results:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
duration_ms: 739
duration_ms: 744
- eval_id: live-promotion-readiness
status: pass
evidence:
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
duration_ms: 668
duration_ms: 693
- eval_id: static-prd-contract
status: pass
evidence:
- tests/e2e/test_j_cto_webui_prd.py
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
duration_ms: 1198
duration_ms: 1216
- eval_id: webui-cto-event-browser
status: pass
evidence:
@ -58,32 +58,33 @@ eval_results:
- hermes-webui/tests/test_cancel_interrupt.py
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
duration_ms: 3090
tests/test_approval_queue.py
duration_ms: 2364
- eval_id: webui-cto-live-streaming
status: pass
evidence:
- hermes-webui/tests/test_cto_live_streaming_e2e.py
command: pytest -q tests/test_cto_live_streaming_e2e.py
duration_ms: 1906
duration_ms: 1220
- eval_id: live-profile-drift
status: pass
evidence:
- cto/evals/reports/2026-05-25-live-drift.yaml
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
duration_ms: 661
duration_ms: 752
- eval_id: eval-report-scoring
status: pass
evidence:
- cto/evals/reports/*.yaml
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
"$r"; done
duration_ms: 289
duration_ms: 319
- eval_id: diff-whitespace-check
status: pass
evidence:
- git diff --check
command: git diff --check
duration_ms: 6
duration_ms: 4
commands:
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
cwd: /home/svrnty/workspaces/hermes/cto
@ -97,7 +98,7 @@ commands:
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0
duration_ms: 739
duration_ms: 744
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
@ -107,7 +108,7 @@ commands:
- command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0
duration_ms: 668
duration_ms: 693
stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
'
@ -115,38 +116,39 @@ commands:
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
cwd: /home/svrnty/workspaces/hermes
returncode: 0
duration_ms: 1198
duration_ms: 1216
stdout: '.......... [100%]
10 passed in 1.02s
10 passed in 1.04s
'
stderr: ''
- command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
tests/test_approval_queue.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0
duration_ms: 3090
stdout: '...................... [100%]
duration_ms: 2364
stdout: '...................................... [100%]
22 passed in 2.63s
38 passed in 1.89s
'
stderr: ''
- command: pytest -q tests/test_cto_live_streaming_e2e.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0
duration_ms: 1906
duration_ms: 1220
stdout: '. [100%]
1 passed in 1.49s
1 passed in 0.88s
'
stderr: ''
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0
duration_ms: 661
duration_ms: 752
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
'
@ -155,7 +157,7 @@ commands:
"$r"; done
cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0
duration_ms: 289
duration_ms: 319
stdout: 'ok
ok
@ -181,7 +183,7 @@ commands:
- command: git diff --check
cwd: /home/svrnty/workspaces/hermes
returncode: 0
duration_ms: 6
duration_ms: 4
stdout: ''
stderr: ''
notes:

View File

@ -163,6 +163,7 @@ def build_report(output: Path) -> dict[str, Any]:
"tests/test_cto_webui_journal_e2e.py",
"tests/test_cto_browser_e2e.py",
"tests/test_cancel_interrupt.py",
"tests/test_approval_queue.py",
],
cwd=WEBUI_ROOT,
timeout=180,