Refresh CTO eval proof reports
This commit is contained in:
parent
4ed306928a
commit
d4dfff5584
@ -6,7 +6,7 @@ eval_id: live-profile-drift
|
||||
profile: cto-planb
|
||||
status: pass
|
||||
score: 100
|
||||
checked_at: '2026-05-25T16:56:06Z'
|
||||
checked_at: '2026-05-25T17:07:15Z'
|
||||
checks:
|
||||
correctness: pass
|
||||
verification: pass
|
||||
@ -113,7 +113,7 @@ commands:
|
||||
- command: hermes -p cto-planb mcp list
|
||||
cwd: /home/svrnty/workspaces/hermes
|
||||
returncode: 0
|
||||
duration_ms: 401
|
||||
duration_ms: 440
|
||||
stdout: "\n MCP Servers:\n\n Name Transport Tools\
|
||||
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||
@ -126,7 +126,7 @@ commands:
|
||||
- command: ./install.sh --dry-run
|
||||
cwd: /home/svrnty/workspaces/hermes/cto
|
||||
returncode: 0
|
||||
duration_ms: 2
|
||||
duration_ms: 3
|
||||
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
|
||||
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
|
||||
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\
|
||||
|
||||
@ -38,45 +38,45 @@ eval_results:
|
||||
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||
duration_ms: 710
|
||||
duration_ms: 750
|
||||
- eval_id: static-prd-contract
|
||||
status: pass
|
||||
evidence:
|
||||
- tests/e2e/test_j_cto_webui_prd.py
|
||||
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||
duration_ms: 1143
|
||||
duration_ms: 1223
|
||||
- eval_id: webui-cto-event-browser
|
||||
status: pass
|
||||
evidence:
|
||||
- hermes-webui/tests/test_cto_browser_e2e.py
|
||||
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
||||
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py
|
||||
duration_ms: 2592
|
||||
duration_ms: 3006
|
||||
- eval_id: webui-cto-live-streaming
|
||||
status: pass
|
||||
evidence:
|
||||
- hermes-webui/tests/test_cto_live_streaming_e2e.py
|
||||
command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||
duration_ms: 1786
|
||||
duration_ms: 2195
|
||||
- eval_id: live-profile-drift
|
||||
status: pass
|
||||
evidence:
|
||||
- cto/evals/reports/2026-05-25-live-drift.yaml
|
||||
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||
duration_ms: 658
|
||||
duration_ms: 706
|
||||
- eval_id: eval-report-scoring
|
||||
status: pass
|
||||
evidence:
|
||||
- cto/evals/reports/*.yaml
|
||||
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
|
||||
"$r"; done
|
||||
duration_ms: 260
|
||||
duration_ms: 275
|
||||
- eval_id: diff-whitespace-check
|
||||
status: pass
|
||||
evidence:
|
||||
- git diff --check
|
||||
command: git diff --check
|
||||
duration_ms: 5
|
||||
duration_ms: 7
|
||||
commands:
|
||||
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||
cwd: /home/svrnty/workspaces/hermes/cto
|
||||
@ -90,7 +90,7 @@ commands:
|
||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||
cwd: /home/svrnty/workspaces/hermes/cto
|
||||
returncode: 0
|
||||
duration_ms: 710
|
||||
duration_ms: 750
|
||||
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||
|
||||
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||
@ -100,10 +100,10 @@ commands:
|
||||
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||
cwd: /home/svrnty/workspaces/hermes
|
||||
returncode: 0
|
||||
duration_ms: 1143
|
||||
duration_ms: 1223
|
||||
stdout: '.......... [100%]
|
||||
|
||||
10 passed in 0.95s
|
||||
10 passed in 1.05s
|
||||
|
||||
'
|
||||
stderr: ''
|
||||
@ -111,27 +111,27 @@ commands:
|
||||
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py
|
||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||
returncode: 0
|
||||
duration_ms: 2592
|
||||
stdout: '.............. [100%]
|
||||
duration_ms: 3006
|
||||
stdout: '............... [100%]
|
||||
|
||||
14 passed in 2.32s
|
||||
15 passed in 2.71s
|
||||
|
||||
'
|
||||
stderr: ''
|
||||
- command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||
returncode: 0
|
||||
duration_ms: 1786
|
||||
duration_ms: 2195
|
||||
stdout: '. [100%]
|
||||
|
||||
1 passed in 1.46s
|
||||
1 passed in 1.79s
|
||||
|
||||
'
|
||||
stderr: ''
|
||||
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||
cwd: /home/svrnty/workspaces/hermes/cto
|
||||
returncode: 0
|
||||
duration_ms: 658
|
||||
duration_ms: 706
|
||||
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
|
||||
|
||||
'
|
||||
@ -140,7 +140,7 @@ commands:
|
||||
"$r"; done
|
||||
cwd: /home/svrnty/workspaces/hermes/cto
|
||||
returncode: 0
|
||||
duration_ms: 260
|
||||
duration_ms: 275
|
||||
stdout: 'ok
|
||||
|
||||
ok
|
||||
@ -164,7 +164,7 @@ commands:
|
||||
- command: git diff --check
|
||||
cwd: /home/svrnty/workspaces/hermes
|
||||
returncode: 0
|
||||
duration_ms: 5
|
||||
duration_ms: 7
|
||||
stdout: ''
|
||||
stderr: ''
|
||||
notes:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user