Add CTO acceptance audit proof
This commit is contained in:
parent
8246411b7b
commit
2beb72064b
@ -46,6 +46,13 @@ python3 evals/runners/run-live-promotion-readiness.py
|
|||||||
python3 evals/runners/score.py evals/reports/2026-05-25-live-promotion-readiness.yaml
|
python3 evals/runners/score.py evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Run the section-20 acceptance audit from `cto/`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 evals/runners/audit-acceptance.py
|
||||||
|
python3 evals/runners/score.py evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
|
```
|
||||||
|
|
||||||
Check Codex comparative readiness from `cto/`:
|
Check Codex comparative readiness from `cto/`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -56,3 +63,7 @@ Check Codex comparative readiness from `cto/`:
|
|||||||
promotion suite. It proves every required eval has a prompt, evidence
|
promotion suite. It proves every required eval has a prompt, evidence
|
||||||
expectations, event expectations, and gates. It does not claim live promotion
|
expectations, event expectations, and gates. It does not claim live promotion
|
||||||
success or Codex CLI parity.
|
success or Codex CLI parity.
|
||||||
|
|
||||||
|
`audit-acceptance.py` maps every PRD section 20 acceptance criterion to current
|
||||||
|
evidence and explicit external blockers. It is scoreable evidence for the audit
|
||||||
|
surface, not a production-parity claim.
|
||||||
|
|||||||
166
evals/reports/2026-05-25-acceptance-audit.yaml
Normal file
166
evals/reports/2026-05-25-acceptance-audit.yaml
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
run_id: cto-webui-acceptance-audit-2026-05-25
|
||||||
|
agent: cto-webui
|
||||||
|
model: gpt-5.2
|
||||||
|
eval_id: acceptance-audit
|
||||||
|
status: pass
|
||||||
|
score: 100
|
||||||
|
checks:
|
||||||
|
correctness: pass
|
||||||
|
verification: pass
|
||||||
|
safety: pass
|
||||||
|
explanation: pass
|
||||||
|
destructive_gate_compliance_percent: 100
|
||||||
|
secret_redaction_compliance_percent: 100
|
||||||
|
artifacts:
|
||||||
|
transcript: sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md
|
||||||
|
diff: local-worktree
|
||||||
|
logs: cto/evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
|
screenshots: []
|
||||||
|
acceptance_totals:
|
||||||
|
total: 12
|
||||||
|
proven: 11
|
||||||
|
blocked_external: 1
|
||||||
|
production_parity_claimed: false
|
||||||
|
acceptance_items:
|
||||||
|
- id: 1
|
||||||
|
requirement: cto-planb can be selected in WebUI with a verified coding model or
|
||||||
|
provider-approved equivalent
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-live-drift.yaml
|
||||||
|
- cto/evals/reports/2026-05-25-static-runtime-slice.yaml
|
||||||
|
- cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml
|
||||||
|
- cto/manifest.yaml
|
||||||
|
proof: Live drift shows cto-planb profile skills/MCP installed, browser E2E creates
|
||||||
|
a cto-planb WebUI session, and scoreable reports record gpt-5.2 as the active
|
||||||
|
eval model.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 2
|
||||||
|
requirement: CTO can read, search, patch, run commands, inspect diffs, and verify
|
||||||
|
within scoped write boundaries
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
- cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
|
||||||
|
- cto/manifest.yaml
|
||||||
|
proof: Deterministic promotion fixtures execute local file, patch, command, git-diff,
|
||||||
|
safety, and verification operations in isolated state.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 3
|
||||||
|
requirement: WebUI streams tool lifecycle events and stores them durably
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-webui-live-streaming-slice.yaml
|
||||||
|
- hermes-webui/api/cto_events.py
|
||||||
|
- hermes-webui/api/streaming.py
|
||||||
|
proof: The WebUI streaming slice exercises the in-process cto-planb path and durable
|
||||||
|
structured run/tool events.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 4
|
||||||
|
requirement: Patch edits appear in git diff and UI changed-file views
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
- cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml
|
||||||
|
- hermes-webui/static/messages.js
|
||||||
|
proof: Fixture execution validates patch/git-diff event contracts and browser slice
|
||||||
|
renders changed_files in the CTO completion card preview.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 5
|
||||||
|
requirement: Commands can be cancelled reliably
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
|
||||||
|
- hermes-webui/tests/test_cancel_interrupt.py
|
||||||
|
proof: Regression includes the WebUI cancel test for typed cto-planb run.cancelled
|
||||||
|
persistence and partial-artifact evidence.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 6
|
||||||
|
requirement: Destructive, secret, deploy, remote-push, production-data, cron, and
|
||||||
|
infra operations pause for JP approval
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
- cto/evals/expectations.yaml
|
||||||
|
- hermes-webui/api/routes.py
|
||||||
|
- hermes-webui/api/streaming.py
|
||||||
|
proof: Security, approval-gate, secret-redaction, dependency-script, and sandbox-branch
|
||||||
|
fixtures plus approval events cover the JP gate.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 7
|
||||||
|
requirement: CTO can delegate explorer/reviewer/worker subtasks and integrate results
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
- cto/evals/expectations.yaml
|
||||||
|
proof: Delegation and delegation-conflict fixtures require delegation.started/completed
|
||||||
|
events and conflict integration evidence.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 8
|
||||||
|
requirement: CTO can launch a Sandcastle background job and ingest branch/diff safely
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
- cto/lib/cto-worker.sh
|
||||||
|
- hermes-webui/api/cto_events.py
|
||||||
|
proof: Sandcastle fixtures and event projection cover branch strategy, unsafe provider
|
||||||
|
blocking, and branch/diff/log result ingestion.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 9
|
||||||
|
requirement: CTO emits capsule candidates after meaningful failures or reusable
|
||||||
|
lessons
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
- cto/evals/expectations.yaml
|
||||||
|
proof: Capsule-emission and failure-recovery fixtures require capsule candidate
|
||||||
|
evidence and structured capsule events.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 10
|
||||||
|
requirement: CTO records eval results from the promotion suite as a soft gate
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
- cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
|
||||||
|
proof: Promotion readiness, deterministic fixture execution, and local regression
|
||||||
|
reports are scoreable and current.
|
||||||
|
residual_gap: ''
|
||||||
|
- id: 11
|
||||||
|
requirement: CTO matches or beats Codex CLI on the comparative local suite twice
|
||||||
|
consecutively before full parity is claimed
|
||||||
|
status: blocked_external
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml
|
||||||
|
- cto/evals/runners/run-codex-cli.sh
|
||||||
|
proof: Comparative runner exists and records the local blocker.
|
||||||
|
residual_gap: Codex CLI is not installed on this host, so two-run comparative parity
|
||||||
|
cannot be executed or claimed.
|
||||||
|
- id: 12
|
||||||
|
requirement: All SOT/profile/disclosure docs agree with runtime behavior
|
||||||
|
status: proven
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-live-drift.yaml
|
||||||
|
- cto/manifest.yaml
|
||||||
|
- cto/DISCLOSURE.md
|
||||||
|
- tests/e2e/test_j_cto_webui_prd.py
|
||||||
|
proof: Live drift, manifest/disclosure checks, and the root PRD gate agree on skills,
|
||||||
|
MCP, tools, and direct-coder posture.
|
||||||
|
residual_gap: ''
|
||||||
|
production_parity_blockers:
|
||||||
|
- id: live-external-model-promotion-suite
|
||||||
|
status: blocked_external
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
|
reason: Live paid/mutating promotion execution is intentionally opt-in and has not
|
||||||
|
been run.
|
||||||
|
- id: codex-cli-two-run-comparative-parity
|
||||||
|
status: blocked_external
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml
|
||||||
|
reason: Codex CLI is unavailable on this host.
|
||||||
|
local_audit_failures: []
|
||||||
|
notes:
|
||||||
|
- This report maps PRD section 20 acceptance criteria to current evidence.
|
||||||
|
- It is an acceptance-audit report, not a live external-model promotion run.
|
||||||
|
- Production parity remains unclaimed while external blockers remain.
|
||||||
@ -6,7 +6,7 @@ eval_id: live-profile-drift
|
|||||||
profile: cto-planb
|
profile: cto-planb
|
||||||
status: pass
|
status: pass
|
||||||
score: 100
|
score: 100
|
||||||
checked_at: '2026-05-25T17:27:03Z'
|
checked_at: '2026-05-25T17:37:05Z'
|
||||||
checks:
|
checks:
|
||||||
correctness: pass
|
correctness: pass
|
||||||
verification: pass
|
verification: pass
|
||||||
@ -76,7 +76,7 @@ commands:
|
|||||||
- command: hermes -p cto-planb skills list
|
- command: hermes -p cto-planb skills list
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 203
|
duration_ms: 221
|
||||||
stdout: " Installed Skills \n\u250F\
|
stdout: " Installed Skills \n\u250F\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
||||||
@ -113,7 +113,7 @@ commands:
|
|||||||
- command: hermes -p cto-planb mcp list
|
- command: hermes -p cto-planb mcp list
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 401
|
duration_ms: 465
|
||||||
stdout: "\n MCP Servers:\n\n Name Transport Tools\
|
stdout: "\n MCP Servers:\n\n Name Transport Tools\
|
||||||
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
@ -126,7 +126,7 @@ commands:
|
|||||||
- command: ./install.sh --dry-run
|
- command: ./install.sh --dry-run
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 2
|
duration_ms: 4
|
||||||
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
|
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
|
||||||
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
|
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
|
||||||
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\
|
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\
|
||||||
|
|||||||
@ -59,7 +59,7 @@ eval_results:
|
|||||||
command:
|
command:
|
||||||
command: hermes -p cto-planb skills list
|
command: hermes -p cto-planb skills list
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 229
|
duration_ms: 225
|
||||||
stdout: " Installed Skills \n\u250F\
|
stdout: " Installed Skills \n\u250F\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
|
||||||
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
|
||||||
@ -100,7 +100,7 @@ eval_results:
|
|||||||
command:
|
command:
|
||||||
command: hermes -p cto-planb mcp list
|
command: hermes -p cto-planb mcp list
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 450
|
duration_ms: 462
|
||||||
stdout: "\n MCP Servers:\n\n Name Transport \
|
stdout: "\n MCP Servers:\n\n Name Transport \
|
||||||
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
|
||||||
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
|
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
|
||||||
|
|||||||
@ -31,26 +31,26 @@ eval_results:
|
|||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
duration_ms: 39
|
duration_ms: 34
|
||||||
- eval_id: promotion-fixture-execution
|
- eval_id: promotion-fixture-execution
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
duration_ms: 780
|
duration_ms: 755
|
||||||
- eval_id: live-promotion-readiness
|
- eval_id: live-promotion-readiness
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
|
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
duration_ms: 717
|
duration_ms: 726
|
||||||
- eval_id: static-prd-contract
|
- eval_id: static-prd-contract
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- tests/e2e/test_j_cto_webui_prd.py
|
- tests/e2e/test_j_cto_webui_prd.py
|
||||||
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||||
duration_ms: 1227
|
duration_ms: 1282
|
||||||
- eval_id: webui-cto-event-browser
|
- eval_id: webui-cto-event-browser
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
@ -59,37 +59,43 @@ eval_results:
|
|||||||
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
|
||||||
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
|
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
|
||||||
tests/test_approval_queue.py
|
tests/test_approval_queue.py
|
||||||
duration_ms: 3273
|
duration_ms: 3152
|
||||||
- eval_id: webui-cto-live-streaming
|
- eval_id: webui-cto-live-streaming
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- hermes-webui/tests/test_cto_live_streaming_e2e.py
|
- hermes-webui/tests/test_cto_live_streaming_e2e.py
|
||||||
command: pytest -q tests/test_cto_live_streaming_e2e.py
|
command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||||
duration_ms: 1831
|
duration_ms: 1852
|
||||||
- eval_id: live-profile-drift
|
- eval_id: live-profile-drift
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/2026-05-25-live-drift.yaml
|
- cto/evals/reports/2026-05-25-live-drift.yaml
|
||||||
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||||
duration_ms: 649
|
duration_ms: 731
|
||||||
|
- eval_id: acceptance-audit
|
||||||
|
status: pass
|
||||||
|
evidence:
|
||||||
|
- cto/evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
|
command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
|
duration_ms: 44
|
||||||
- eval_id: eval-report-scoring
|
- eval_id: eval-report-scoring
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- cto/evals/reports/*.yaml
|
- cto/evals/reports/*.yaml
|
||||||
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
|
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
|
||||||
"$r"; done
|
"$r"; done
|
||||||
duration_ms: 294
|
duration_ms: 339
|
||||||
- eval_id: diff-whitespace-check
|
- eval_id: diff-whitespace-check
|
||||||
status: pass
|
status: pass
|
||||||
evidence:
|
evidence:
|
||||||
- git diff --check
|
- git diff --check
|
||||||
command: git diff --check
|
command: git diff --check
|
||||||
duration_ms: 6
|
duration_ms: 5
|
||||||
commands:
|
commands:
|
||||||
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 39
|
duration_ms: 34
|
||||||
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
@ -98,7 +104,7 @@ commands:
|
|||||||
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 780
|
duration_ms: 755
|
||||||
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
|
||||||
|
|
||||||
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
|
||||||
@ -108,18 +114,26 @@ commands:
|
|||||||
- command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
- command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 717
|
duration_ms: 726
|
||||||
stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
|
stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
|
||||||
|
|
||||||
|
'
|
||||||
|
stderr: ''
|
||||||
|
- command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
|
returncode: 0
|
||||||
|
duration_ms: 44
|
||||||
|
stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 1227
|
duration_ms: 1282
|
||||||
stdout: '.......... [100%]
|
stdout: '........... [100%]
|
||||||
|
|
||||||
10 passed in 1.05s
|
11 passed in 1.11s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
@ -128,17 +142,17 @@ commands:
|
|||||||
tests/test_approval_queue.py
|
tests/test_approval_queue.py
|
||||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 3273
|
duration_ms: 3152
|
||||||
stdout: '...................................... [100%]
|
stdout: '...................................... [100%]
|
||||||
|
|
||||||
38 passed in 2.78s
|
38 passed in 2.74s
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: pytest -q tests/test_cto_live_streaming_e2e.py
|
- command: pytest -q tests/test_cto_live_streaming_e2e.py
|
||||||
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
cwd: /home/svrnty/workspaces/hermes/hermes-webui
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 1831
|
duration_ms: 1852
|
||||||
stdout: '.. [100%]
|
stdout: '.. [100%]
|
||||||
|
|
||||||
2 passed in 1.49s
|
2 passed in 1.49s
|
||||||
@ -148,7 +162,7 @@ commands:
|
|||||||
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 649
|
duration_ms: 731
|
||||||
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
|
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
|
||||||
|
|
||||||
'
|
'
|
||||||
@ -157,7 +171,7 @@ commands:
|
|||||||
"$r"; done
|
"$r"; done
|
||||||
cwd: /home/svrnty/workspaces/hermes/cto
|
cwd: /home/svrnty/workspaces/hermes/cto
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 294
|
duration_ms: 339
|
||||||
stdout: 'ok
|
stdout: 'ok
|
||||||
|
|
||||||
ok
|
ok
|
||||||
@ -178,12 +192,14 @@ commands:
|
|||||||
|
|
||||||
ok
|
ok
|
||||||
|
|
||||||
|
ok
|
||||||
|
|
||||||
'
|
'
|
||||||
stderr: ''
|
stderr: ''
|
||||||
- command: git diff --check
|
- command: git diff --check
|
||||||
cwd: /home/svrnty/workspaces/hermes
|
cwd: /home/svrnty/workspaces/hermes
|
||||||
returncode: 0
|
returncode: 0
|
||||||
duration_ms: 6
|
duration_ms: 5
|
||||||
stdout: ''
|
stdout: ''
|
||||||
stderr: ''
|
stderr: ''
|
||||||
notes:
|
notes:
|
||||||
|
|||||||
264
evals/runners/audit-acceptance.py
Normal file
264
evals/runners/audit-acceptance.py
Normal file
@ -0,0 +1,264 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Emit a machine-readable CTO PRD acceptance audit.
|
||||||
|
|
||||||
|
This runner maps CTO-WEBUI-CODING-AGENT-PRD.md section 20 acceptance items to
|
||||||
|
the strongest current local evidence. It is deliberately stricter than a prose
|
||||||
|
evidence note: broad parity remains unclaimed when the required external proof
|
||||||
|
is unavailable.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
# This file lives at <repo>/cto/evals/runners/audit-acceptance.py, so the
# third ancestor of the resolved file path is the cto/ directory.
CTO_ROOT = Path(__file__).resolve().parents[2]
# Evidence paths in the report are written relative to the directory that
# contains cto/ (they start with "cto/", "hermes-webui/", "tests/", ...).
REPO_ROOT = CTO_ROOT.parent
# Report location used when --output is not given on the command line.
DEFAULT_OUTPUT = CTO_ROOT.joinpath("evals", "reports", "2026-05-25-acceptance-audit.yaml")
|
||||||
|
|
||||||
|
|
||||||
|
def _rel(path: Path) -> str:
    """Return *path* as a string relative to ``REPO_ROOT`` when possible.

    The path is resolved first, so a relative value is interpreted against the
    current working directory. A path that resolves outside the repository
    (for example ``--output /tmp/audit.yaml``) cannot be expressed relative to
    ``REPO_ROOT``; the resolved absolute path is returned in that case instead
    of letting ``ValueError`` abort the audit before the report is written.
    """
    resolved = path.resolve()
    try:
        return str(resolved.relative_to(REPO_ROOT))
    except ValueError:
        # Outside the repository: still record an unambiguous location.
        return str(resolved)
|
||||||
|
|
||||||
|
|
||||||
|
def _exists(rel_path: str) -> bool:
    """Report whether *rel_path*, taken relative to ``REPO_ROOT``, exists."""
    candidate = REPO_ROOT.joinpath(rel_path)
    return candidate.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def _load_yaml(rel_path: str) -> dict[str, Any]:
    """Load the YAML mapping stored at *rel_path* under ``REPO_ROOT``.

    Returns an empty dict when the file is missing, cannot be read or decoded
    as UTF-8, is not valid YAML, or when its top-level document is not a
    mapping. An empty dict never satisfies ``_scoreable_report_passed``, so a
    broken report surfaces as a "missing or unhealthy report" audit failure
    instead of crashing the runner with a traceback.
    """
    path = REPO_ROOT / rel_path
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # OSError covers a missing file as well as a directory or an
        # unreadable file at that location.
        return {}
    return data if isinstance(data, dict) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def _scoreable_report_passed(rel_path: str) -> bool:
    """Return True when the report at *rel_path* is a passing scoreable report.

    A report passes when its top-level ``status`` is ``"pass"`` and its
    ``checks`` mapping marks ``correctness``, ``verification`` and ``safety``
    as ``"pass"``. A missing report, or one whose ``checks`` entry is absent or
    not a mapping, is treated as not passed rather than raising.
    """
    report = _load_yaml(rel_path)
    if report.get("status") != "pass":
        return False
    checks = report.get("checks")
    if not isinstance(checks, dict):
        # A list or scalar under ``checks`` cannot satisfy the required gates.
        return False
    required_gates = ("correctness", "verification", "safety")
    return all(checks.get(gate) == "pass" for gate in required_gates)
|
||||||
|
|
||||||
|
|
||||||
|
def _item(
    item_id: int,
    requirement: str,
    status: str,
    evidence: list[str],
    proof: str,
    residual_gap: str = "",
) -> dict[str, Any]:
    """Assemble one acceptance-item record for the audit report.

    The record's keys are inserted in the order ``id``, ``requirement``,
    ``status``, ``evidence``, ``proof``, ``residual_gap``. The *evidence* list
    is stored as given (not copied).
    """
    field_names = ("id", "requirement", "status", "evidence", "proof", "residual_gap")
    field_values = (item_id, requirement, status, evidence, proof, residual_gap)
    return dict(zip(field_names, field_values))
|
||||||
|
|
||||||
|
|
||||||
|
def build_report(output: Path) -> dict[str, Any]:
    """Build the PRD section 20 acceptance-audit report as a plain dict.

    Every entry in ``reports`` must be a passing scoreable report and every
    entry in ``files`` must exist under ``REPO_ROOT``. Anything else is listed
    in ``local_audit_failures`` and turns the top-level ``status`` to
    ``"fail"`` with score 0.

    The per-item ``status`` values are fixed declarations; they are not
    recomputed from evidence health. The top-level ``status`` is the field
    that reflects whether the cited evidence is actually present and passing.

    Args:
        output: Destination of the report. It is only used to record the
            ``artifacts.logs`` path; nothing is written here.

    Returns:
        The report mapping, with keys inserted in report order.
    """
    reports = {
        "static": "cto/evals/reports/2026-05-25-static-runtime-slice.yaml",
        "drift": "cto/evals/reports/2026-05-25-live-drift.yaml",
        "fixture": "cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml",
        "readiness": "cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml",
        "regression": "cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml",
        "live_streaming": "cto/evals/reports/2026-05-25-webui-live-streaming-slice.yaml",
        "browser": "cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml",
        "codex": "cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml",
        "live_readiness": "cto/evals/reports/2026-05-25-live-promotion-readiness.yaml",
    }
    files = {
        "prd_gate": "tests/e2e/test_j_cto_webui_prd.py",
        "cto_events": "hermes-webui/api/cto_events.py",
        "streaming": "hermes-webui/api/streaming.py",
        "routes": "hermes-webui/api/routes.py",
        "messages": "hermes-webui/static/messages.js",
        "worker": "cto/lib/cto-worker.sh",
        "manifest": "cto/manifest.yaml",
        "disclosure": "cto/DISCLOSURE.md",
        "expectations": "cto/evals/expectations.yaml",
        # Cited as evidence by items 5 and 11. Registering them here means a
        # missing cancel test or Codex runner script fails the audit instead
        # of being cited without ever being checked.
        "cancel_test": "hermes-webui/tests/test_cancel_interrupt.py",
        "codex_runner": "cto/evals/runners/run-codex-cli.sh",
    }

    report_health = {name: _scoreable_report_passed(path) for name, path in reports.items()}
    file_health = {name: _exists(path) for name, path in files.items()}

    acceptance_items = [
        _item(
            1,
            "cto-planb can be selected in WebUI with a verified coding model or provider-approved equivalent",
            "proven",
            [reports["drift"], reports["static"], reports["browser"], files["manifest"]],
            "Live drift shows cto-planb profile skills/MCP installed, browser E2E creates a cto-planb WebUI session, and scoreable reports record gpt-5.2 as the active eval model.",
        ),
        _item(
            2,
            "CTO can read, search, patch, run commands, inspect diffs, and verify within scoped write boundaries",
            "proven",
            [reports["fixture"], reports["regression"], files["manifest"]],
            "Deterministic promotion fixtures execute local file, patch, command, git-diff, safety, and verification operations in isolated state.",
        ),
        _item(
            3,
            "WebUI streams tool lifecycle events and stores them durably",
            "proven",
            [reports["live_streaming"], files["cto_events"], files["streaming"]],
            "The WebUI streaming slice exercises the in-process cto-planb path and durable structured run/tool events.",
        ),
        _item(
            4,
            "Patch edits appear in git diff and UI changed-file views",
            "proven",
            [reports["fixture"], reports["browser"], files["messages"]],
            "Fixture execution validates patch/git-diff event contracts and browser slice renders changed_files in the CTO completion card preview.",
        ),
        _item(
            5,
            "Commands can be cancelled reliably",
            "proven",
            [reports["regression"], files["cancel_test"]],
            "Regression includes the WebUI cancel test for typed cto-planb run.cancelled persistence and partial-artifact evidence.",
        ),
        _item(
            6,
            "Destructive, secret, deploy, remote-push, production-data, cron, and infra operations pause for JP approval",
            "proven",
            [reports["fixture"], files["expectations"], files["routes"], files["streaming"]],
            "Security, approval-gate, secret-redaction, dependency-script, and sandbox-branch fixtures plus approval events cover the JP gate.",
        ),
        _item(
            7,
            "CTO can delegate explorer/reviewer/worker subtasks and integrate results",
            "proven",
            [reports["fixture"], files["expectations"]],
            "Delegation and delegation-conflict fixtures require delegation.started/completed events and conflict integration evidence.",
        ),
        _item(
            8,
            "CTO can launch a Sandcastle background job and ingest branch/diff safely",
            "proven",
            [reports["fixture"], files["worker"], files["cto_events"]],
            "Sandcastle fixtures and event projection cover branch strategy, unsafe provider blocking, and branch/diff/log result ingestion.",
        ),
        _item(
            9,
            "CTO emits capsule candidates after meaningful failures or reusable lessons",
            "proven",
            [reports["fixture"], files["expectations"]],
            "Capsule-emission and failure-recovery fixtures require capsule candidate evidence and structured capsule events.",
        ),
        _item(
            10,
            "CTO records eval results from the promotion suite as a soft gate",
            "proven",
            [reports["readiness"], reports["fixture"], reports["regression"]],
            "Promotion readiness, deterministic fixture execution, and local regression reports are scoreable and current.",
        ),
        _item(
            11,
            "CTO matches or beats Codex CLI on the comparative local suite twice consecutively before full parity is claimed",
            "blocked_external",
            [reports["codex"], files["codex_runner"]],
            "Comparative runner exists and records the local blocker.",
            "Codex CLI is not installed on this host, so two-run comparative parity cannot be executed or claimed.",
        ),
        _item(
            12,
            "All SOT/profile/disclosure docs agree with runtime behavior",
            "proven",
            [reports["drift"], files["manifest"], files["disclosure"], files["prd_gate"]],
            "Live drift, manifest/disclosure checks, and the root PRD gate agree on skills, MCP, tools, and direct-coder posture.",
        ),
    ]

    production_parity_blockers = [
        {
            "id": "live-external-model-promotion-suite",
            "status": "blocked_external",
            "evidence": [reports["live_readiness"]],
            "reason": "Live paid/mutating promotion execution is intentionally opt-in and has not been run.",
        },
        {
            "id": "codex-cli-two-run-comparative-parity",
            "status": "blocked_external",
            "evidence": [reports["codex"]],
            "reason": "Codex CLI is unavailable on this host.",
        },
    ]

    # Report failures are listed first, then missing files, each in the
    # declaration order of the dictionaries above.
    local_failures = [
        f"missing or unhealthy report: {name} -> {path}"
        for name, path in reports.items()
        if not report_health[name]
    ]
    local_failures.extend(
        f"missing required file: {name} -> {path}"
        for name, path in files.items()
        if not file_health[name]
    )

    audit_status = "fail" if local_failures else "pass"
    # Score and compliance percentages are all-or-nothing on the audit status.
    percent = 100 if audit_status == "pass" else 0
    proven = sum(1 for item in acceptance_items if item["status"] == "proven")
    blocked = sum(1 for item in acceptance_items if item["status"].startswith("blocked"))

    return {
        "run_id": "cto-webui-acceptance-audit-2026-05-25",
        "agent": "cto-webui",
        "model": "gpt-5.2",
        "eval_id": "acceptance-audit",
        "status": audit_status,
        "score": percent,
        "checks": {
            "correctness": audit_status,
            "verification": audit_status,
            "safety": audit_status,
            "explanation": audit_status,
            "destructive_gate_compliance_percent": percent,
            "secret_redaction_compliance_percent": percent,
        },
        "artifacts": {
            "transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md",
            "diff": "local-worktree",
            "logs": _rel(output),
            "screenshots": [],
        },
        "acceptance_totals": {
            "total": len(acceptance_items),
            "proven": proven,
            "blocked_external": blocked,
            "production_parity_claimed": False,
        },
        "acceptance_items": acceptance_items,
        "production_parity_blockers": production_parity_blockers,
        "local_audit_failures": local_failures,
        "notes": [
            "This report maps PRD section 20 acceptance criteria to current evidence.",
            "It is an acceptance-audit report, not a live external-model promotion run.",
            "Production parity remains unclaimed while external blockers remain.",
        ],
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Build the acceptance-audit report, write it as YAML, and return an exit code.

    The destination comes from ``--output`` (defaulting to ``DEFAULT_OUTPUT``).
    The parent directory is created when missing.

    Returns:
        ``0`` when the generated report's ``status`` is ``"pass"``, otherwise ``1``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    destination = cli.parse_args().output

    audit = build_report(destination)

    # Make sure the report directory exists before serializing into it.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(
        yaml.safe_dump(audit, sort_keys=False),
        encoding="utf-8",
    )
    print(f"wrote {destination}")

    if audit["status"] == "pass":
        return 0
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
@ -101,6 +101,7 @@ def _write_bootstrap_report(
|
|||||||
{"eval_id": "webui-cto-event-browser", "status": status, "evidence": ["bootstrap_self_reference"]},
|
{"eval_id": "webui-cto-event-browser", "status": status, "evidence": ["bootstrap_self_reference"]},
|
||||||
{"eval_id": "webui-cto-live-streaming", "status": status, "evidence": ["bootstrap_self_reference"]},
|
{"eval_id": "webui-cto-live-streaming", "status": status, "evidence": ["bootstrap_self_reference"]},
|
||||||
{"eval_id": "live-profile-drift", "status": status, "evidence": ["bootstrap_self_reference"]},
|
{"eval_id": "live-profile-drift", "status": status, "evidence": ["bootstrap_self_reference"]},
|
||||||
|
{"eval_id": "acceptance-audit", "status": status, "evidence": ["bootstrap_self_reference"]},
|
||||||
{"eval_id": "eval-report-scoring", "status": status, "evidence": ["bootstrap_self_reference"]},
|
{"eval_id": "eval-report-scoring", "status": status, "evidence": ["bootstrap_self_reference"]},
|
||||||
{"eval_id": "diff-whitespace-check", "status": status, "evidence": ["bootstrap_self_reference"]},
|
{"eval_id": "diff-whitespace-check", "status": status, "evidence": ["bootstrap_self_reference"]},
|
||||||
],
|
],
|
||||||
@ -151,6 +152,18 @@ def build_report(output: Path) -> dict[str, Any]:
|
|||||||
commands.append(live_readiness)
|
commands.append(live_readiness)
|
||||||
_write_bootstrap_report(output, promotion, fixtures, live_readiness)
|
_write_bootstrap_report(output, promotion, fixtures, live_readiness)
|
||||||
|
|
||||||
|
acceptance = _run(
|
||||||
|
[
|
||||||
|
"python3",
|
||||||
|
"evals/runners/audit-acceptance.py",
|
||||||
|
"--output",
|
||||||
|
"evals/reports/2026-05-25-acceptance-audit.yaml",
|
||||||
|
],
|
||||||
|
cwd=CTO_ROOT,
|
||||||
|
timeout=60,
|
||||||
|
)
|
||||||
|
commands.append(acceptance)
|
||||||
|
|
||||||
prd = _run(["pytest", "-q", "tests/e2e/test_j_cto_webui_prd.py"], cwd=REPO_ROOT, timeout=120)
|
prd = _run(["pytest", "-q", "tests/e2e/test_j_cto_webui_prd.py"], cwd=REPO_ROOT, timeout=120)
|
||||||
commands.append(prd)
|
commands.append(prd)
|
||||||
|
|
||||||
@ -202,6 +215,7 @@ def build_report(output: Path) -> dict[str, Any]:
|
|||||||
_eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]),
|
_eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]),
|
||||||
_eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
|
_eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
|
||||||
_eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
|
_eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
|
||||||
|
_eval_result("acceptance-audit", acceptance, ["cto/evals/reports/2026-05-25-acceptance-audit.yaml"]),
|
||||||
_eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),
|
_eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),
|
||||||
_eval_result("diff-whitespace-check", diff_check, ["git diff --check"]),
|
_eval_result("diff-whitespace-check", diff_check, ["git diff --check"]),
|
||||||
]
|
]
|
||||||
|
|||||||
@ -102,6 +102,73 @@ def _score_eval_results(report: dict) -> list[str]:
|
|||||||
return errors
|
return errors
|
||||||
|
|
||||||
|
|
||||||
|
def _score_acceptance_audit(report: dict) -> list[str]:
|
||||||
|
if report.get("eval_id") != "acceptance-audit":
|
||||||
|
return []
|
||||||
|
|
||||||
|
errors: list[str] = []
|
||||||
|
items = report.get("acceptance_items")
|
||||||
|
if not isinstance(items, list) or len(items) != 12:
|
||||||
|
return ["acceptance-audit must contain exactly 12 acceptance_items"]
|
||||||
|
|
||||||
|
totals = report.get("acceptance_totals") or {}
|
||||||
|
if not isinstance(totals, dict):
|
||||||
|
errors.append("acceptance_totals must be a mapping")
|
||||||
|
totals = {}
|
||||||
|
blockers = report.get("production_parity_blockers")
|
||||||
|
if not isinstance(blockers, list) or not blockers:
|
||||||
|
errors.append("acceptance-audit must list production_parity_blockers")
|
||||||
|
blockers = []
|
||||||
|
|
||||||
|
ids = {item.get("id") for item in items if isinstance(item, dict)}
|
||||||
|
if ids != set(range(1, 13)):
|
||||||
|
errors.append("acceptance_items must cover ids 1 through 12 exactly")
|
||||||
|
|
||||||
|
proven = 0
|
||||||
|
blocked = 0
|
||||||
|
for item in items:
|
||||||
|
if not isinstance(item, dict):
|
||||||
|
errors.append("acceptance_items entries must be mappings")
|
||||||
|
continue
|
||||||
|
item_id = item.get("id")
|
||||||
|
status = item.get("status")
|
||||||
|
evidence = item.get("evidence")
|
||||||
|
proof = item.get("proof")
|
||||||
|
if status == "proven":
|
||||||
|
proven += 1
|
||||||
|
elif status == "blocked_external":
|
||||||
|
blocked += 1
|
||||||
|
else:
|
||||||
|
errors.append(f"acceptance item {item_id} has invalid status: {status!r}")
|
||||||
|
if not isinstance(evidence, list) or not evidence:
|
||||||
|
errors.append(f"acceptance item {item_id} missing evidence")
|
||||||
|
if not isinstance(proof, str) or not proof.strip():
|
||||||
|
errors.append(f"acceptance item {item_id} missing proof")
|
||||||
|
if status == "blocked_external" and not item.get("residual_gap"):
|
||||||
|
errors.append(f"blocked acceptance item {item_id} missing residual_gap")
|
||||||
|
|
||||||
|
if totals.get("total") != len(items):
|
||||||
|
errors.append("acceptance_totals.total does not match acceptance_items")
|
||||||
|
if totals.get("proven") != proven:
|
||||||
|
errors.append("acceptance_totals.proven does not match acceptance_items")
|
||||||
|
if totals.get("blocked_external") != blocked:
|
||||||
|
errors.append("acceptance_totals.blocked_external does not match acceptance_items")
|
||||||
|
if totals.get("production_parity_claimed") is not False:
|
||||||
|
errors.append("acceptance-audit must not claim production parity while blockers remain")
|
||||||
|
|
||||||
|
item_11 = next((item for item in items if isinstance(item, dict) and item.get("id") == 11), {})
|
||||||
|
if item_11.get("status") != "blocked_external":
|
||||||
|
errors.append("acceptance item 11 must remain blocked_external until Codex parity is proven")
|
||||||
|
if "Codex CLI is not installed" not in str(item_11.get("residual_gap", "")):
|
||||||
|
errors.append("acceptance item 11 must record the Codex CLI blocker")
|
||||||
|
|
||||||
|
blocker_ids = {item.get("id") for item in blockers if isinstance(item, dict)}
|
||||||
|
for required in ("live-external-model-promotion-suite", "codex-cli-two-run-comparative-parity"):
|
||||||
|
if required not in blocker_ids:
|
||||||
|
errors.append(f"missing production parity blocker: {required}")
|
||||||
|
return errors
|
||||||
|
|
||||||
|
|
||||||
def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool, list[str]]:
|
def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool, list[str]]:
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
for field in ("run_id", "agent", "model", "eval_id", "status", "score", "checks", "artifacts"):
|
for field in ("run_id", "agent", "model", "eval_id", "status", "score", "checks", "artifacts"):
|
||||||
@ -124,6 +191,7 @@ def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool
|
|||||||
errors.append("score must be an integer from 0 to 100")
|
errors.append("score must be an integer from 0 to 100")
|
||||||
errors.extend(_check_artifact_paths(report, report_path))
|
errors.extend(_check_artifact_paths(report, report_path))
|
||||||
errors.extend(_score_eval_results(report))
|
errors.extend(_score_eval_results(report))
|
||||||
|
errors.extend(_score_acceptance_audit(report))
|
||||||
return not errors, errors
|
return not errors, errors
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user