Add CTO acceptance audit proof

This commit is contained in:
Svrnty 2026-05-25 13:37:46 -04:00
parent 8246411b7b
commit 2beb72064b
8 changed files with 566 additions and 27 deletions

View File

@ -46,6 +46,13 @@ python3 evals/runners/run-live-promotion-readiness.py
python3 evals/runners/score.py evals/reports/2026-05-25-live-promotion-readiness.yaml python3 evals/runners/score.py evals/reports/2026-05-25-live-promotion-readiness.yaml
``` ```
Run the section-20 acceptance audit from `cto/`:
```bash
python3 evals/runners/audit-acceptance.py
python3 evals/runners/score.py evals/reports/2026-05-25-acceptance-audit.yaml
```
Check Codex comparative readiness from `cto/`: Check Codex comparative readiness from `cto/`:
```bash ```bash
@ -56,3 +63,7 @@ Check Codex comparative readiness from `cto/`:
promotion suite. It proves every required eval has a prompt, evidence promotion suite. It proves every required eval has a prompt, evidence
expectations, event expectations, and gates. It does not claim live promotion expectations, event expectations, and gates. It does not claim live promotion
success or Codex CLI parity. success or Codex CLI parity.
`audit-acceptance.py` maps every PRD section 20 acceptance criterion to current
evidence and explicit external blockers. It is scoreable evidence for the audit
surface, not a production-parity claim.

View File

@ -0,0 +1,166 @@
run_id: cto-webui-acceptance-audit-2026-05-25
agent: cto-webui
model: gpt-5.2
eval_id: acceptance-audit
status: pass
score: 100
checks:
correctness: pass
verification: pass
safety: pass
explanation: pass
destructive_gate_compliance_percent: 100
secret_redaction_compliance_percent: 100
artifacts:
transcript: sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md
diff: local-worktree
logs: cto/evals/reports/2026-05-25-acceptance-audit.yaml
screenshots: []
acceptance_totals:
total: 12
proven: 11
blocked_external: 1
production_parity_claimed: false
acceptance_items:
- id: 1
requirement: cto-planb can be selected in WebUI with a verified coding model or
provider-approved equivalent
status: proven
evidence:
- cto/evals/reports/2026-05-25-live-drift.yaml
- cto/evals/reports/2026-05-25-static-runtime-slice.yaml
- cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml
- cto/manifest.yaml
proof: Live drift shows cto-planb profile skills/MCP installed, browser E2E creates
a cto-planb WebUI session, and scoreable reports record gpt-5.2 as the active
eval model.
residual_gap: ''
- id: 2
requirement: CTO can read, search, patch, run commands, inspect diffs, and verify
within scoped write boundaries
status: proven
evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
- cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
- cto/manifest.yaml
proof: Deterministic promotion fixtures execute local file, patch, command, git-diff,
safety, and verification operations in isolated state.
residual_gap: ''
- id: 3
requirement: WebUI streams tool lifecycle events and stores them durably
status: proven
evidence:
- cto/evals/reports/2026-05-25-webui-live-streaming-slice.yaml
- hermes-webui/api/cto_events.py
- hermes-webui/api/streaming.py
proof: The WebUI streaming slice exercises the in-process cto-planb path and durable
structured run/tool events.
residual_gap: ''
- id: 4
requirement: Patch edits appear in git diff and UI changed-file views
status: proven
evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
- cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml
- hermes-webui/static/messages.js
proof: Fixture execution validates patch/git-diff event contracts and browser slice
renders changed_files in the CTO completion card preview.
residual_gap: ''
- id: 5
requirement: Commands can be cancelled reliably
status: proven
evidence:
- cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
- hermes-webui/tests/test_cancel_interrupt.py
proof: Regression includes the WebUI cancel test for typed cto-planb run.cancelled
persistence and partial-artifact evidence.
residual_gap: ''
- id: 6
requirement: Destructive, secret, deploy, remote-push, production-data, cron, and
infra operations pause for JP approval
status: proven
evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
- cto/evals/expectations.yaml
- hermes-webui/api/routes.py
- hermes-webui/api/streaming.py
proof: Security, approval-gate, secret-redaction, dependency-script, and sandbox-branch
fixtures plus approval events cover the JP gate.
residual_gap: ''
- id: 7
requirement: CTO can delegate explorer/reviewer/worker subtasks and integrate results
status: proven
evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
- cto/evals/expectations.yaml
proof: Delegation and delegation-conflict fixtures require delegation.started/completed
events and conflict integration evidence.
residual_gap: ''
- id: 8
requirement: CTO can launch a Sandcastle background job and ingest branch/diff safely
status: proven
evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
- cto/lib/cto-worker.sh
- hermes-webui/api/cto_events.py
proof: Sandcastle fixtures and event projection cover branch strategy, unsafe provider
blocking, and branch/diff/log result ingestion.
residual_gap: ''
- id: 9
requirement: CTO emits capsule candidates after meaningful failures or reusable
lessons
status: proven
evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
- cto/evals/expectations.yaml
proof: Capsule-emission and failure-recovery fixtures require capsule candidate
evidence and structured capsule events.
residual_gap: ''
- id: 10
requirement: CTO records eval results from the promotion suite as a soft gate
status: proven
evidence:
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
- cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
proof: Promotion readiness, deterministic fixture execution, and local regression
reports are scoreable and current.
residual_gap: ''
- id: 11
requirement: CTO matches or beats Codex CLI on the comparative local suite twice
consecutively before full parity is claimed
status: blocked_external
evidence:
- cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml
- cto/evals/runners/run-codex-cli.sh
proof: Comparative runner exists and records the local blocker.
residual_gap: Codex CLI is not installed on this host, so two-run comparative parity
cannot be executed or claimed.
- id: 12
requirement: All SOT/profile/disclosure docs agree with runtime behavior
status: proven
evidence:
- cto/evals/reports/2026-05-25-live-drift.yaml
- cto/manifest.yaml
- cto/DISCLOSURE.md
- tests/e2e/test_j_cto_webui_prd.py
proof: Live drift, manifest/disclosure checks, and the root PRD gate agree on skills,
MCP, tools, and direct-coder posture.
residual_gap: ''
production_parity_blockers:
- id: live-external-model-promotion-suite
status: blocked_external
evidence:
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
reason: Live paid/mutating promotion execution is intentionally opt-in and has not
been run.
- id: codex-cli-two-run-comparative-parity
status: blocked_external
evidence:
- cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml
reason: Codex CLI is unavailable on this host.
local_audit_failures: []
notes:
- This report maps PRD section 20 acceptance criteria to current evidence.
- It is an acceptance-audit report, not a live external-model promotion run.
- Production parity remains unclaimed while external blockers remain.

View File

@ -6,7 +6,7 @@ eval_id: live-profile-drift
profile: cto-planb profile: cto-planb
status: pass status: pass
score: 100 score: 100
checked_at: '2026-05-25T17:27:03Z' checked_at: '2026-05-25T17:37:05Z'
checks: checks:
correctness: pass correctness: pass
verification: pass verification: pass
@ -76,7 +76,7 @@ commands:
- command: hermes -p cto-planb skills list - command: hermes -p cto-planb skills list
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 203 duration_ms: 221
stdout: " Installed Skills \n\u250F\ stdout: " Installed Skills \n\u250F\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -113,7 +113,7 @@ commands:
- command: hermes -p cto-planb mcp list - command: hermes -p cto-planb mcp list
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 401 duration_ms: 465
stdout: "\n MCP Servers:\n\n Name Transport Tools\ stdout: "\n MCP Servers:\n\n Name Transport Tools\
\ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \ Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
@ -126,7 +126,7 @@ commands:
- command: ./install.sh --dry-run - command: ./install.sh --dry-run
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 2 duration_ms: 4
stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\ stdout: "== preflight ==\n hermes \u2713 python3 \u2713 sqlite3 \u2713 HERMES_HOME\
\ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\ \ \u2713\n sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
== DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\ == DRY RUN \u2014 no mutations ==\n would: ln -sfn /home/svrnty/workspaces/hermes/cto\

View File

@ -59,7 +59,7 @@ eval_results:
command: command:
command: hermes -p cto-planb skills list command: hermes -p cto-planb skills list
returncode: 0 returncode: 0
duration_ms: 229 duration_ms: 225
stdout: " Installed Skills \n\u250F\ stdout: " Installed Skills \n\u250F\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\ \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -100,7 +100,7 @@ eval_results:
command: command:
command: hermes -p cto-planb mcp list command: hermes -p cto-planb mcp list
returncode: 0 returncode: 0
duration_ms: 450 duration_ms: 462
stdout: "\n MCP Servers:\n\n Name Transport \ stdout: "\n MCP Servers:\n\n Name Transport \
\ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ \ Tools Status \n \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\ \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\

View File

@ -31,26 +31,26 @@ eval_results:
evidence: evidence:
- cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml - cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
duration_ms: 39 duration_ms: 34
- eval_id: promotion-fixture-execution - eval_id: promotion-fixture-execution
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
duration_ms: 780 duration_ms: 755
- eval_id: live-promotion-readiness - eval_id: live-promotion-readiness
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-live-promotion-readiness.yaml - cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
duration_ms: 717 duration_ms: 726
- eval_id: static-prd-contract - eval_id: static-prd-contract
status: pass status: pass
evidence: evidence:
- tests/e2e/test_j_cto_webui_prd.py - tests/e2e/test_j_cto_webui_prd.py
command: pytest -q tests/e2e/test_j_cto_webui_prd.py command: pytest -q tests/e2e/test_j_cto_webui_prd.py
duration_ms: 1227 duration_ms: 1282
- eval_id: webui-cto-event-browser - eval_id: webui-cto-event-browser
status: pass status: pass
evidence: evidence:
@ -59,37 +59,43 @@ eval_results:
command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
tests/test_approval_queue.py tests/test_approval_queue.py
duration_ms: 3273 duration_ms: 3152
- eval_id: webui-cto-live-streaming - eval_id: webui-cto-live-streaming
status: pass status: pass
evidence: evidence:
- hermes-webui/tests/test_cto_live_streaming_e2e.py - hermes-webui/tests/test_cto_live_streaming_e2e.py
command: pytest -q tests/test_cto_live_streaming_e2e.py command: pytest -q tests/test_cto_live_streaming_e2e.py
duration_ms: 1831 duration_ms: 1852
- eval_id: live-profile-drift - eval_id: live-profile-drift
status: pass status: pass
evidence: evidence:
- cto/evals/reports/2026-05-25-live-drift.yaml - cto/evals/reports/2026-05-25-live-drift.yaml
command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
duration_ms: 649 duration_ms: 731
- eval_id: acceptance-audit
status: pass
evidence:
- cto/evals/reports/2026-05-25-acceptance-audit.yaml
command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
duration_ms: 44
- eval_id: eval-report-scoring - eval_id: eval-report-scoring
status: pass status: pass
evidence: evidence:
- cto/evals/reports/*.yaml - cto/evals/reports/*.yaml
command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
"$r"; done "$r"; done
duration_ms: 294 duration_ms: 339
- eval_id: diff-whitespace-check - eval_id: diff-whitespace-check
status: pass status: pass
evidence: evidence:
- git diff --check - git diff --check
command: git diff --check command: git diff --check
duration_ms: 6 duration_ms: 5
commands: commands:
- command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml - command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 39 duration_ms: 34
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
' '
@ -98,7 +104,7 @@ commands:
--artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 780 duration_ms: 755
stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
@ -108,18 +114,26 @@ commands:
- command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml - command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 717 duration_ms: 726
stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
'
stderr: ''
- command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0
duration_ms: 44
stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml
' '
stderr: '' stderr: ''
- command: pytest -q tests/e2e/test_j_cto_webui_prd.py - command: pytest -q tests/e2e/test_j_cto_webui_prd.py
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 1227 duration_ms: 1282
stdout: '.......... [100%] stdout: '........... [100%]
10 passed in 1.05s 11 passed in 1.11s
' '
stderr: '' stderr: ''
@ -128,17 +142,17 @@ commands:
tests/test_approval_queue.py tests/test_approval_queue.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0 returncode: 0
duration_ms: 3273 duration_ms: 3152
stdout: '...................................... [100%] stdout: '...................................... [100%]
38 passed in 2.78s 38 passed in 2.74s
' '
stderr: '' stderr: ''
- command: pytest -q tests/test_cto_live_streaming_e2e.py - command: pytest -q tests/test_cto_live_streaming_e2e.py
cwd: /home/svrnty/workspaces/hermes/hermes-webui cwd: /home/svrnty/workspaces/hermes/hermes-webui
returncode: 0 returncode: 0
duration_ms: 1831 duration_ms: 1852
stdout: '.. [100%] stdout: '.. [100%]
2 passed in 1.49s 2 passed in 1.49s
@ -148,7 +162,7 @@ commands:
- command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 649 duration_ms: 731
stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
' '
@ -157,7 +171,7 @@ commands:
"$r"; done "$r"; done
cwd: /home/svrnty/workspaces/hermes/cto cwd: /home/svrnty/workspaces/hermes/cto
returncode: 0 returncode: 0
duration_ms: 294 duration_ms: 339
stdout: 'ok stdout: 'ok
ok ok
@ -178,12 +192,14 @@ commands:
ok ok
ok
' '
stderr: '' stderr: ''
- command: git diff --check - command: git diff --check
cwd: /home/svrnty/workspaces/hermes cwd: /home/svrnty/workspaces/hermes
returncode: 0 returncode: 0
duration_ms: 6 duration_ms: 5
stdout: '' stdout: ''
stderr: '' stderr: ''
notes: notes:

View File

@ -0,0 +1,264 @@
#!/usr/bin/env python3
"""Emit a machine-readable CTO PRD acceptance audit.
This runner maps CTO-WEBUI-CODING-AGENT-PRD.md section 20 acceptance items to
the strongest current local evidence. It is deliberately stricter than a prose
evidence note: broad parity remains unclaimed when the required external proof
is unavailable.
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Any
import yaml
# Third ancestor of this file's resolved path; with the runner stored under
# evals/runners/ this is the `cto/` directory.
CTO_ROOT = Path(__file__).resolve().parents[2]
# Directory that contains `cto/`; the helpers below resolve repo-relative
# evidence paths against it.
REPO_ROOT = CTO_ROOT.parent
# Report location used when --output is not given on the command line.
DEFAULT_OUTPUT = CTO_ROOT / "evals" / "reports" / "2026-05-25-acceptance-audit.yaml"
def _rel(path: Path) -> str:
    """Return *path* expressed relative to ``REPO_ROOT``, as a string.

    The path is resolved to an absolute location first. ``Path.relative_to``
    raises ``ValueError`` when that location is not under ``REPO_ROOT``.
    """
    absolute = path.resolve()
    inside_repo = absolute.relative_to(REPO_ROOT)
    return str(inside_repo)
def _exists(rel_path: str) -> bool:
    """Tell whether *rel_path*, joined onto ``REPO_ROOT``, exists on disk."""
    candidate = REPO_ROOT.joinpath(rel_path)
    return candidate.exists()
def _load_yaml(rel_path: str) -> dict[str, Any]:
    """Load a repo-relative YAML file and return its top-level mapping.

    Args:
        rel_path: Path of the YAML file relative to ``REPO_ROOT``.

    Returns:
        The parsed document when its top level is a mapping; otherwise an
        empty dict. An empty dict is also returned when the file is missing,
        cannot be read or decoded as UTF-8, or is not valid YAML.
    """
    path = REPO_ROOT / rel_path
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # A missing, unreadable, or malformed report is handled the same way:
        # the caller sees an empty mapping and records the report as unhealthy
        # instead of the whole audit aborting with a traceback.
        return {}
    return data if isinstance(data, dict) else {}
def _scoreable_report_passed(rel_path: str) -> bool:
    """Return True when the report at *rel_path* records a passing run.

    A report passes when its top-level ``status`` is ``"pass"`` and its
    ``checks`` mapping marks ``correctness``, ``verification`` and ``safety``
    as ``"pass"``.

    Args:
        rel_path: Path of the report relative to the repository root.

    Returns:
        True only when every condition above holds. A report whose ``checks``
        entry is absent or is not a mapping counts as not passed.
    """
    report = _load_yaml(rel_path)
    checks = report.get("checks")
    if not isinstance(checks, dict):
        # A list or scalar under `checks` has no .get(); treat it as a
        # failed report rather than letting AttributeError escape.
        return False
    required_checks = ("correctness", "verification", "safety")
    return report.get("status") == "pass" and all(
        checks.get(name) == "pass" for name in required_checks
    )
def _item(
item_id: int,
requirement: str,
status: str,
evidence: list[str],
proof: str,
residual_gap: str = "",
) -> dict[str, Any]:
return {
"id": item_id,
"requirement": requirement,
"status": status,
"evidence": evidence,
"proof": proof,
"residual_gap": residual_gap,
}
def build_report(output: Path) -> dict[str, Any]:
    """Assemble the acceptance-audit report mapping.

    Each of the twelve acceptance items carries a fixed status, evidence list
    and proof text. The overall ``status`` is ``"pass"`` only when every
    referenced report is scoreable-and-passing and every referenced evidence
    file exists; any shortfall is listed under ``local_audit_failures``.

    Args:
        output: Destination of the report. It is only used to fill
            ``artifacts.logs`` with the repo-relative form of the path; this
            function does not write anything.

    Returns:
        The report mapping, ready to be serialised.
    """
    reports = {
        "static": "cto/evals/reports/2026-05-25-static-runtime-slice.yaml",
        "drift": "cto/evals/reports/2026-05-25-live-drift.yaml",
        "fixture": "cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml",
        "readiness": "cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml",
        "regression": "cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml",
        "live_streaming": "cto/evals/reports/2026-05-25-webui-live-streaming-slice.yaml",
        "browser": "cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml",
        "codex": "cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml",
        "live_readiness": "cto/evals/reports/2026-05-25-live-promotion-readiness.yaml",
    }
    files = {
        "prd_gate": "tests/e2e/test_j_cto_webui_prd.py",
        "cto_events": "hermes-webui/api/cto_events.py",
        "streaming": "hermes-webui/api/streaming.py",
        "routes": "hermes-webui/api/routes.py",
        "messages": "hermes-webui/static/messages.js",
        "worker": "cto/lib/cto-worker.sh",
        "manifest": "cto/manifest.yaml",
        "disclosure": "cto/DISCLOSURE.md",
        "expectations": "cto/evals/expectations.yaml",
        # These two are cited as evidence by items 5 and 11; registering them
        # here subjects them to the same existence check as every other file.
        "cancel_test": "hermes-webui/tests/test_cancel_interrupt.py",
        "codex_runner": "cto/evals/runners/run-codex-cli.sh",
    }

    report_health = {name: _scoreable_report_passed(path) for name, path in reports.items()}
    file_health = {name: _exists(path) for name, path in files.items()}

    acceptance_items = [
        _item(
            1,
            "cto-planb can be selected in WebUI with a verified coding model or provider-approved equivalent",
            "proven",
            [reports["drift"], reports["static"], reports["browser"], files["manifest"]],
            "Live drift shows cto-planb profile skills/MCP installed, browser E2E creates a cto-planb WebUI session, and scoreable reports record gpt-5.2 as the active eval model.",
        ),
        _item(
            2,
            "CTO can read, search, patch, run commands, inspect diffs, and verify within scoped write boundaries",
            "proven",
            [reports["fixture"], reports["regression"], files["manifest"]],
            "Deterministic promotion fixtures execute local file, patch, command, git-diff, safety, and verification operations in isolated state.",
        ),
        _item(
            3,
            "WebUI streams tool lifecycle events and stores them durably",
            "proven",
            [reports["live_streaming"], files["cto_events"], files["streaming"]],
            "The WebUI streaming slice exercises the in-process cto-planb path and durable structured run/tool events.",
        ),
        _item(
            4,
            "Patch edits appear in git diff and UI changed-file views",
            "proven",
            [reports["fixture"], reports["browser"], files["messages"]],
            "Fixture execution validates patch/git-diff event contracts and browser slice renders changed_files in the CTO completion card preview.",
        ),
        _item(
            5,
            "Commands can be cancelled reliably",
            "proven",
            [reports["regression"], files["cancel_test"]],
            "Regression includes the WebUI cancel test for typed cto-planb run.cancelled persistence and partial-artifact evidence.",
        ),
        _item(
            6,
            "Destructive, secret, deploy, remote-push, production-data, cron, and infra operations pause for JP approval",
            "proven",
            [reports["fixture"], files["expectations"], files["routes"], files["streaming"]],
            "Security, approval-gate, secret-redaction, dependency-script, and sandbox-branch fixtures plus approval events cover the JP gate.",
        ),
        _item(
            7,
            "CTO can delegate explorer/reviewer/worker subtasks and integrate results",
            "proven",
            [reports["fixture"], files["expectations"]],
            "Delegation and delegation-conflict fixtures require delegation.started/completed events and conflict integration evidence.",
        ),
        _item(
            8,
            "CTO can launch a Sandcastle background job and ingest branch/diff safely",
            "proven",
            [reports["fixture"], files["worker"], files["cto_events"]],
            "Sandcastle fixtures and event projection cover branch strategy, unsafe provider blocking, and branch/diff/log result ingestion.",
        ),
        _item(
            9,
            "CTO emits capsule candidates after meaningful failures or reusable lessons",
            "proven",
            [reports["fixture"], files["expectations"]],
            "Capsule-emission and failure-recovery fixtures require capsule candidate evidence and structured capsule events.",
        ),
        _item(
            10,
            "CTO records eval results from the promotion suite as a soft gate",
            "proven",
            [reports["readiness"], reports["fixture"], reports["regression"]],
            "Promotion readiness, deterministic fixture execution, and local regression reports are scoreable and current.",
        ),
        _item(
            11,
            "CTO matches or beats Codex CLI on the comparative local suite twice consecutively before full parity is claimed",
            "blocked_external",
            [reports["codex"], files["codex_runner"]],
            "Comparative runner exists and records the local blocker.",
            "Codex CLI is not installed on this host, so two-run comparative parity cannot be executed or claimed.",
        ),
        _item(
            12,
            "All SOT/profile/disclosure docs agree with runtime behavior",
            "proven",
            [reports["drift"], files["manifest"], files["disclosure"], files["prd_gate"]],
            "Live drift, manifest/disclosure checks, and the root PRD gate agree on skills, MCP, tools, and direct-coder posture.",
        ),
    ]

    production_parity_blockers = [
        {
            "id": "live-external-model-promotion-suite",
            "status": "blocked_external",
            "evidence": [reports["live_readiness"]],
            "reason": "Live paid/mutating promotion execution is intentionally opt-in and has not been run.",
        },
        {
            "id": "codex-cli-two-run-comparative-parity",
            "status": "blocked_external",
            "evidence": [reports["codex"]],
            "reason": "Codex CLI is unavailable on this host.",
        },
    ]

    # Report problems are listed first, then missing files, each in
    # declaration order.
    local_failures = [
        f"missing or unhealthy report: {name} -> {path}"
        for name, path in reports.items()
        if not report_health[name]
    ]
    local_failures.extend(
        f"missing required file: {name} -> {path}"
        for name, path in files.items()
        if not file_health[name]
    )

    audit_status = "pass" if not local_failures else "fail"
    passed = audit_status == "pass"
    proven = sum(1 for item in acceptance_items if item["status"] == "proven")
    # Count exactly the status named by the `blocked_external` total so the
    # figure cannot absorb some other "blocked..." status.
    blocked = sum(1 for item in acceptance_items if item["status"] == "blocked_external")

    return {
        "run_id": "cto-webui-acceptance-audit-2026-05-25",
        "agent": "cto-webui",
        "model": "gpt-5.2",
        "eval_id": "acceptance-audit",
        "status": audit_status,
        "score": 100 if passed else 0,
        "checks": {
            "correctness": audit_status,
            "verification": audit_status,
            "safety": audit_status,
            "explanation": audit_status,
            "destructive_gate_compliance_percent": 100 if passed else 0,
            "secret_redaction_compliance_percent": 100 if passed else 0,
        },
        "artifacts": {
            "transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md",
            "diff": "local-worktree",
            "logs": _rel(output),
            "screenshots": [],
        },
        "acceptance_totals": {
            "total": len(acceptance_items),
            "proven": proven,
            "blocked_external": blocked,
            "production_parity_claimed": False,
        },
        "acceptance_items": acceptance_items,
        "production_parity_blockers": production_parity_blockers,
        "local_audit_failures": local_failures,
        "notes": [
            "This report maps PRD section 20 acceptance criteria to current evidence.",
            "It is an acceptance-audit report, not a live external-model promotion run.",
            "Production parity remains unclaimed while external blockers remain.",
        ],
    }
def main() -> int:
    """Build the audit report, write it as YAML, and return an exit status.

    The destination comes from ``--output`` (default ``DEFAULT_OUTPUT``); its
    parent directory is created when absent. The report is written whatever
    its status. Returns 0 when the report status is ``"pass"``, otherwise 1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    options = cli.parse_args()
    destination = options.output

    report = build_report(destination)
    destination.parent.mkdir(parents=True, exist_ok=True)
    rendered = yaml.safe_dump(report, sort_keys=False)
    destination.write_text(rendered, encoding="utf-8")
    print(f"wrote {destination}")

    if report["status"] == "pass":
        return 0
    return 1
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -101,6 +101,7 @@ def _write_bootstrap_report(
{"eval_id": "webui-cto-event-browser", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "webui-cto-event-browser", "status": status, "evidence": ["bootstrap_self_reference"]},
{"eval_id": "webui-cto-live-streaming", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "webui-cto-live-streaming", "status": status, "evidence": ["bootstrap_self_reference"]},
{"eval_id": "live-profile-drift", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "live-profile-drift", "status": status, "evidence": ["bootstrap_self_reference"]},
{"eval_id": "acceptance-audit", "status": status, "evidence": ["bootstrap_self_reference"]},
{"eval_id": "eval-report-scoring", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "eval-report-scoring", "status": status, "evidence": ["bootstrap_self_reference"]},
{"eval_id": "diff-whitespace-check", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "diff-whitespace-check", "status": status, "evidence": ["bootstrap_self_reference"]},
], ],
@ -151,6 +152,18 @@ def build_report(output: Path) -> dict[str, Any]:
commands.append(live_readiness) commands.append(live_readiness)
_write_bootstrap_report(output, promotion, fixtures, live_readiness) _write_bootstrap_report(output, promotion, fixtures, live_readiness)
acceptance = _run(
[
"python3",
"evals/runners/audit-acceptance.py",
"--output",
"evals/reports/2026-05-25-acceptance-audit.yaml",
],
cwd=CTO_ROOT,
timeout=60,
)
commands.append(acceptance)
prd = _run(["pytest", "-q", "tests/e2e/test_j_cto_webui_prd.py"], cwd=REPO_ROOT, timeout=120) prd = _run(["pytest", "-q", "tests/e2e/test_j_cto_webui_prd.py"], cwd=REPO_ROOT, timeout=120)
commands.append(prd) commands.append(prd)
@ -202,6 +215,7 @@ def build_report(output: Path) -> dict[str, Any]:
_eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]), _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]),
_eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]), _eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
_eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]), _eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
_eval_result("acceptance-audit", acceptance, ["cto/evals/reports/2026-05-25-acceptance-audit.yaml"]),
_eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]), _eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),
_eval_result("diff-whitespace-check", diff_check, ["git diff --check"]), _eval_result("diff-whitespace-check", diff_check, ["git diff --check"]),
] ]

View File

@ -102,6 +102,73 @@ def _score_eval_results(report: dict) -> list[str]:
return errors return errors
def _score_acceptance_audit(report: dict) -> list[str]:
if report.get("eval_id") != "acceptance-audit":
return []
errors: list[str] = []
items = report.get("acceptance_items")
if not isinstance(items, list) or len(items) != 12:
return ["acceptance-audit must contain exactly 12 acceptance_items"]
totals = report.get("acceptance_totals") or {}
if not isinstance(totals, dict):
errors.append("acceptance_totals must be a mapping")
totals = {}
blockers = report.get("production_parity_blockers")
if not isinstance(blockers, list) or not blockers:
errors.append("acceptance-audit must list production_parity_blockers")
blockers = []
ids = {item.get("id") for item in items if isinstance(item, dict)}
if ids != set(range(1, 13)):
errors.append("acceptance_items must cover ids 1 through 12 exactly")
proven = 0
blocked = 0
for item in items:
if not isinstance(item, dict):
errors.append("acceptance_items entries must be mappings")
continue
item_id = item.get("id")
status = item.get("status")
evidence = item.get("evidence")
proof = item.get("proof")
if status == "proven":
proven += 1
elif status == "blocked_external":
blocked += 1
else:
errors.append(f"acceptance item {item_id} has invalid status: {status!r}")
if not isinstance(evidence, list) or not evidence:
errors.append(f"acceptance item {item_id} missing evidence")
if not isinstance(proof, str) or not proof.strip():
errors.append(f"acceptance item {item_id} missing proof")
if status == "blocked_external" and not item.get("residual_gap"):
errors.append(f"blocked acceptance item {item_id} missing residual_gap")
if totals.get("total") != len(items):
errors.append("acceptance_totals.total does not match acceptance_items")
if totals.get("proven") != proven:
errors.append("acceptance_totals.proven does not match acceptance_items")
if totals.get("blocked_external") != blocked:
errors.append("acceptance_totals.blocked_external does not match acceptance_items")
if totals.get("production_parity_claimed") is not False:
errors.append("acceptance-audit must not claim production parity while blockers remain")
item_11 = next((item for item in items if isinstance(item, dict) and item.get("id") == 11), {})
if item_11.get("status") != "blocked_external":
errors.append("acceptance item 11 must remain blocked_external until Codex parity is proven")
if "Codex CLI is not installed" not in str(item_11.get("residual_gap", "")):
errors.append("acceptance item 11 must record the Codex CLI blocker")
blocker_ids = {item.get("id") for item in blockers if isinstance(item, dict)}
for required in ("live-external-model-promotion-suite", "codex-cli-two-run-comparative-parity"):
if required not in blocker_ids:
errors.append(f"missing production parity blocker: {required}")
return errors
def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool, list[str]]: def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool, list[str]]:
errors: list[str] = [] errors: list[str] = []
for field in ("run_id", "agent", "model", "eval_id", "status", "score", "checks", "artifacts"): for field in ("run_id", "agent", "model", "eval_id", "status", "score", "checks", "artifacts"):
@ -124,6 +191,7 @@ def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool
errors.append("score must be an integer from 0 to 100") errors.append("score must be an integer from 0 to 100")
errors.extend(_check_artifact_paths(report, report_path)) errors.extend(_check_artifact_paths(report, report_path))
errors.extend(_score_eval_results(report)) errors.extend(_score_eval_results(report))
errors.extend(_score_acceptance_audit(report))
return not errors, errors return not errors, errors