Add CTO acceptance audit proof

2026-05-25 13:37:46 -04:00 · 2026-05-25 13:37:46 -04:00 · 2beb72064b
commit 2beb72064b
parent 8246411b7b
8 changed files with 566 additions and 27 deletions
--- a/evals/README.md
+++ b/evals/README.md
@ -46,6 +46,13 @@ python3 evals/runners/run-live-promotion-readiness.py
 python3 evals/runners/score.py evals/reports/2026-05-25-live-promotion-readiness.yaml
 ```
 Run the section-20 acceptance audit from `cto/`:
 ```bash
 python3 evals/runners/audit-acceptance.py
 python3 evals/runners/score.py evals/reports/2026-05-25-acceptance-audit.yaml
 ```
 Check Codex comparative readiness from `cto/`:
 ```bash
@ -56,3 +63,7 @@ Check Codex comparative readiness from `cto/`:
 promotion suite. It proves every required eval has a prompt, evidence
 expectations, event expectations, and gates. It does not claim live promotion
 success or Codex CLI parity.
 `audit-acceptance.py` maps every PRD section 20 acceptance criterion to current
 evidence and explicit external blockers. It is scoreable evidence for the audit
 surface, not a production-parity claim.
--- a/evals/reports/2026-05-25-acceptance-audit.yaml
+++ b/evals/reports/2026-05-25-acceptance-audit.yaml
@ -0,0 +1,166 @@
 run_id: cto-webui-acceptance-audit-2026-05-25
 agent: cto-webui
 model: gpt-5.2
 eval_id: acceptance-audit
 status: pass
 score: 100
 checks:
  correctness: pass
  verification: pass
  safety: pass
  explanation: pass
  destructive_gate_compliance_percent: 100
  secret_redaction_compliance_percent: 100
 artifacts:
  transcript: sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md
  diff: local-worktree
  logs: cto/evals/reports/2026-05-25-acceptance-audit.yaml
  screenshots: []
 acceptance_totals:
  total: 12
  proven: 11
  blocked_external: 1
  production_parity_claimed: false
 acceptance_items:
 - id: 1
  requirement: cto-planb can be selected in WebUI with a verified coding model or
    provider-approved equivalent
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-live-drift.yaml
  - cto/evals/reports/2026-05-25-static-runtime-slice.yaml
  - cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml
  - cto/manifest.yaml
  proof: Live drift shows cto-planb profile skills/MCP installed, browser E2E creates
    a cto-planb WebUI session, and scoreable reports record gpt-5.2 as the active
    eval model.
  residual_gap: ''
 - id: 2
  requirement: CTO can read, search, patch, run commands, inspect diffs, and verify
    within scoped write boundaries
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  - cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
  - cto/manifest.yaml
  proof: Deterministic promotion fixtures execute local file, patch, command, git-diff,
    safety, and verification operations in isolated state.
  residual_gap: ''
 - id: 3
  requirement: WebUI streams tool lifecycle events and stores them durably
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-webui-live-streaming-slice.yaml
  - hermes-webui/api/cto_events.py
  - hermes-webui/api/streaming.py
  proof: The WebUI streaming slice exercises the in-process cto-planb path and durable
    structured run/tool events.
  residual_gap: ''
 - id: 4
  requirement: Patch edits appear in git diff and UI changed-file views
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  - cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml
  - hermes-webui/static/messages.js
  proof: Fixture execution validates patch/git-diff event contracts and browser slice
    renders changed_files in the CTO completion card preview.
  residual_gap: ''
 - id: 5
  requirement: Commands can be cancelled reliably
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
  - hermes-webui/tests/test_cancel_interrupt.py
  proof: Regression includes the WebUI cancel test for typed cto-planb run.cancelled
    persistence and partial-artifact evidence.
  residual_gap: ''
 - id: 6
  requirement: Destructive, secret, deploy, remote-push, production-data, cron, and
    infra operations pause for JP approval
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  - cto/evals/expectations.yaml
  - hermes-webui/api/routes.py
  - hermes-webui/api/streaming.py
  proof: Security, approval-gate, secret-redaction, dependency-script, and sandbox-branch
    fixtures plus approval events cover the JP gate.
  residual_gap: ''
 - id: 7
  requirement: CTO can delegate explorer/reviewer/worker subtasks and integrate results
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  - cto/evals/expectations.yaml
  proof: Delegation and delegation-conflict fixtures require delegation.started/completed
    events and conflict integration evidence.
  residual_gap: ''
 - id: 8
  requirement: CTO can launch a Sandcastle background job and ingest branch/diff safely
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  - cto/lib/cto-worker.sh
  - hermes-webui/api/cto_events.py
  proof: Sandcastle fixtures and event projection cover branch strategy, unsafe provider
    blocking, and branch/diff/log result ingestion.
  residual_gap: ''
 - id: 9
  requirement: CTO emits capsule candidates after meaningful failures or reusable
    lessons
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  - cto/evals/expectations.yaml
  proof: Capsule-emission and failure-recovery fixtures require capsule candidate
    evidence and structured capsule events.
  residual_gap: ''
 - id: 10
  requirement: CTO records eval results from the promotion suite as a soft gate
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  - cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml
  proof: Promotion readiness, deterministic fixture execution, and local regression
    reports are scoreable and current.
  residual_gap: ''
 - id: 11
  requirement: CTO matches or beats Codex CLI on the comparative local suite twice
    consecutively before full parity is claimed
  status: blocked_external
  evidence:
  - cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml
  - cto/evals/runners/run-codex-cli.sh
  proof: Comparative runner exists and records the local blocker.
  residual_gap: Codex CLI is not installed on this host, so two-run comparative parity
    cannot be executed or claimed.
 - id: 12
  requirement: All SOT/profile/disclosure docs agree with runtime behavior
  status: proven
  evidence:
  - cto/evals/reports/2026-05-25-live-drift.yaml
  - cto/manifest.yaml
  - cto/DISCLOSURE.md
  - tests/e2e/test_j_cto_webui_prd.py
  proof: Live drift, manifest/disclosure checks, and the root PRD gate agree on skills,
    MCP, tools, and direct-coder posture.
  residual_gap: ''
 production_parity_blockers:
 - id: live-external-model-promotion-suite
  status: blocked_external
  evidence:
  - cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
  reason: Live paid/mutating promotion execution is intentionally opt-in and has not
    been run.
 - id: codex-cli-two-run-comparative-parity
  status: blocked_external
  evidence:
  - cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml
  reason: Codex CLI is unavailable on this host.
 local_audit_failures: []
 notes:
 - This report maps PRD section 20 acceptance criteria to current evidence.
 - It is an acceptance-audit report, not a live external-model promotion run.
 - Production parity remains unclaimed while external blockers remain.
--- a/evals/reports/2026-05-25-live-drift.yaml
+++ b/evals/reports/2026-05-25-live-drift.yaml
@ -6,7 +6,7 @@ eval_id: live-profile-drift
 profile: cto-planb
 status: pass
 score: 100
-checked_at: '2026-05-25T17:27:03Z'
+checked_at: '2026-05-25T17:37:05Z'
 checks:
  correctness: pass
  verification: pass
@ -76,7 +76,7 @@ commands:
 - command: hermes -p cto-planb skills list
  cwd: /home/svrnty/workspaces/hermes
  returncode: 0
-  duration_ms: 203
+  duration_ms: 221
  stdout: "                        Installed Skills                        \n\u250F\
    \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
    \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -113,7 +113,7 @@ commands:
 - command: hermes -p cto-planb mcp list
  cwd: /home/svrnty/workspaces/hermes
  returncode: 0
-  duration_ms: 401
+  duration_ms: 465
  stdout: "\n  MCP Servers:\n\n  Name             Transport                      Tools\
    \        Status    \n  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
    \u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
@ -126,7 +126,7 @@ commands:
 - command: ./install.sh --dry-run
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 2
+  duration_ms: 4
  stdout: "== preflight ==\n  hermes \u2713  python3 \u2713  sqlite3 \u2713  HERMES_HOME\
    \ \u2713\n  sandcastle \u2713 (/home/svrnty/workspaces/hermes/cto/../sandcastle)\n\
    == DRY RUN \u2014 no mutations ==\n  would: ln -sfn /home/svrnty/workspaces/hermes/cto\
--- a/evals/reports/2026-05-25-live-promotion-readiness.yaml
+++ b/evals/reports/2026-05-25-live-promotion-readiness.yaml
@ -59,7 +59,7 @@ eval_results:
  command:
    command: hermes -p cto-planb skills list
    returncode: 0
-    duration_ms: 229
+    duration_ms: 225
    stdout: "                        Installed Skills                        \n\u250F\
      \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\
      \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\
@ -100,7 +100,7 @@ eval_results:
  command:
    command: hermes -p cto-planb mcp list
    returncode: 0
-    duration_ms: 450
+    duration_ms: 462
    stdout: "\n  MCP Servers:\n\n  Name             Transport                    \
      \  Tools        Status    \n  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
      \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 \u2500\u2500\u2500\u2500\u2500\
--- a/evals/reports/2026-05-25-local-regression-execution-slice.yaml
+++ b/evals/reports/2026-05-25-local-regression-execution-slice.yaml
@ -31,26 +31,26 @@ eval_results:
  evidence:
  - cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
  command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
-  duration_ms: 39
+  duration_ms: 34
 - eval_id: promotion-fixture-execution
  status: pass
  evidence:
  - cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
  command: python3 evals/runners/run-promotion-fixtures.py --output evals/reports/2026-05-25-promotion-fixture-execution.yaml
    --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
-  duration_ms: 780
+  duration_ms: 755
 - eval_id: live-promotion-readiness
  status: pass
  evidence:
  - cto/evals/reports/2026-05-25-live-promotion-readiness.yaml
  command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
-  duration_ms: 717
+  duration_ms: 726
 - eval_id: static-prd-contract
  status: pass
  evidence:
  - tests/e2e/test_j_cto_webui_prd.py
  command: pytest -q tests/e2e/test_j_cto_webui_prd.py
-  duration_ms: 1227
+  duration_ms: 1282
 - eval_id: webui-cto-event-browser
  status: pass
  evidence:
@ -59,37 +59,43 @@ eval_results:
  command: pytest -q tests/test_cto_events.py tests/test_live_tool_callback_events.py
    tests/test_cto_webui_journal_e2e.py tests/test_cto_browser_e2e.py tests/test_cancel_interrupt.py
    tests/test_approval_queue.py
-  duration_ms: 3273
+  duration_ms: 3152
 - eval_id: webui-cto-live-streaming
  status: pass
  evidence:
  - hermes-webui/tests/test_cto_live_streaming_e2e.py
  command: pytest -q tests/test_cto_live_streaming_e2e.py
-  duration_ms: 1831
+  duration_ms: 1852
 - eval_id: live-profile-drift
  status: pass
  evidence:
  - cto/evals/reports/2026-05-25-live-drift.yaml
  command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
-  duration_ms: 649
+  duration_ms: 731
 - eval_id: acceptance-audit
  status: pass
  evidence:
  - cto/evals/reports/2026-05-25-acceptance-audit.yaml
  command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
  duration_ms: 44
 - eval_id: eval-report-scoring
  status: pass
  evidence:
  - cto/evals/reports/*.yaml
  command: bash -lc for r in evals/reports/*.yaml; do python3 evals/runners/score.py
    "$r"; done
-  duration_ms: 294
+  duration_ms: 339
 - eval_id: diff-whitespace-check
  status: pass
  evidence:
  - git diff --check
  command: git diff --check
-  duration_ms: 6
+  duration_ms: 5
 commands:
 - command: python3 evals/runners/run-promotion-suite.py --output evals/reports/2026-05-25-promotion-suite-readiness.yaml
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 39
+  duration_ms: 34
  stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml
    '
@ -98,7 +104,7 @@ commands:
    --artifact-output evals/artifacts/2026-05-25-promotion-fixture-execution.json
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 780
+  duration_ms: 755
  stdout: 'wrote /home/svrnty/workspaces/hermes/cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml
    wrote /home/svrnty/workspaces/hermes/cto/evals/artifacts/2026-05-25-promotion-fixture-execution.json
@ -108,18 +114,26 @@ commands:
 - command: python3 evals/runners/run-live-promotion-readiness.py --output evals/reports/2026-05-25-live-promotion-readiness.yaml
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 717
+  duration_ms: 726
  stdout: 'wrote evals/reports/2026-05-25-live-promotion-readiness.yaml
    '
  stderr: ''
 - command: python3 evals/runners/audit-acceptance.py --output evals/reports/2026-05-25-acceptance-audit.yaml
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
  duration_ms: 44
  stdout: 'wrote evals/reports/2026-05-25-acceptance-audit.yaml
    '
  stderr: ''
 - command: pytest -q tests/e2e/test_j_cto_webui_prd.py
  cwd: /home/svrnty/workspaces/hermes
  returncode: 0
-  duration_ms: 1227
+  duration_ms: 1282
-  stdout: '..........                                                               [100%]
+  stdout: '...........                                                              [100%]
-    10 passed in 1.05s
+    11 passed in 1.11s
    '
  stderr: ''
@ -128,17 +142,17 @@ commands:
    tests/test_approval_queue.py
  cwd: /home/svrnty/workspaces/hermes/hermes-webui
  returncode: 0
-  duration_ms: 3273
+  duration_ms: 3152
  stdout: '......................................                                   [100%]
-    38 passed in 2.78s
+    38 passed in 2.74s
    '
  stderr: ''
 - command: pytest -q tests/test_cto_live_streaming_e2e.py
  cwd: /home/svrnty/workspaces/hermes/hermes-webui
  returncode: 0
-  duration_ms: 1831
+  duration_ms: 1852
  stdout: '..                                                                       [100%]
    2 passed in 1.49s
@ -148,7 +162,7 @@ commands:
 - command: python3 evals/runners/drift.py --output evals/reports/2026-05-25-live-drift.yaml
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 649
+  duration_ms: 731
  stdout: 'wrote evals/reports/2026-05-25-live-drift.yaml
    '
@ -157,7 +171,7 @@ commands:
    "$r"; done
  cwd: /home/svrnty/workspaces/hermes/cto
  returncode: 0
-  duration_ms: 294
+  duration_ms: 339
  stdout: 'ok
    ok
@ -178,12 +192,14 @@ commands:
    ok
    ok
    '
  stderr: ''
 - command: git diff --check
  cwd: /home/svrnty/workspaces/hermes
  returncode: 0
-  duration_ms: 6
+  duration_ms: 5
  stdout: ''
  stderr: ''
 notes:
--- a/evals/runners/audit-acceptance.py
+++ b/evals/runners/audit-acceptance.py
@ -0,0 +1,264 @@
 #!/usr/bin/env python3
 """Emit a machine-readable CTO PRD acceptance audit.
 This runner maps CTO-WEBUI-CODING-AGENT-PRD.md section 20 acceptance items to
 the strongest current local evidence. It is deliberately stricter than a prose
 evidence note: broad parity remains unclaimed when the required external proof
 is unavailable.
 """
 from __future__ import annotations
 import argparse
 from pathlib import Path
 from typing import Any
 import yaml
 # Directory two levels above this script's own directory (parents[2] of the
 # resolved file path); the evidence paths below treat it as the ``cto/`` folder.
 CTO_ROOT = Path(__file__).resolve().parents[2]
 # Parent of CTO_ROOT; every ``rel_path`` used by the helpers below is joined
 # onto this directory.
 REPO_ROOT = CTO_ROOT.parent
 # Report location used when ``--output`` is not supplied on the command line.
 DEFAULT_OUTPUT = CTO_ROOT / "evals" / "reports" / "2026-05-25-acceptance-audit.yaml"
 def _rel(path: Path) -> str:
    """Return *path* as a string relative to ``REPO_ROOT`` when possible.

    The path is resolved first. When the resolved path does not live under
    ``REPO_ROOT`` (for example ``--output /tmp/audit.yaml``) the resolved
    absolute path is returned instead, so that ``relative_to`` raising
    ``ValueError`` cannot abort the audit before the report is written.
    """
    resolved = path.resolve()
    try:
        return str(resolved.relative_to(REPO_ROOT))
    except ValueError:
        # Outside the repository: keep an unambiguous absolute location.
        return str(resolved)
 def _exists(rel_path: str) -> bool:
    """Report whether *rel_path*, joined onto ``REPO_ROOT``, exists on disk."""
    candidate = REPO_ROOT / rel_path
    return candidate.exists()
 def _load_yaml(rel_path: str) -> dict[str, Any]:
    """Load the YAML mapping stored at *rel_path* (relative to ``REPO_ROOT``).

    Returns an empty dict when the file is missing, cannot be read, is not
    valid UTF-8, is not valid YAML, or does not contain a top-level mapping.
    Callers treat an empty dict as an unhealthy report, so a corrupt report is
    surfaced as an audit failure rather than as an uncaught traceback.
    """
    path = REPO_ROOT / rel_path
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # OSError covers a missing file (FileNotFoundError), directories and
        # permission problems; the other two cover undecodable/unparseable text.
        return {}
    return data if isinstance(data, dict) else {}
 def _scoreable_report_passed(rel_path: str) -> bool:
    """Return True when the report at *rel_path* records a passing run.

    A report passes only if its top-level ``status`` is ``"pass"`` and its
    ``checks`` mapping marks ``correctness``, ``verification`` and ``safety``
    as ``"pass"``. A missing report, or a ``checks`` value that is not a
    mapping (absent, a list, a string, ...), counts as not passed instead of
    raising ``AttributeError``.
    """
    report = _load_yaml(rel_path)
    checks = report.get("checks")
    if report.get("status") != "pass" or not isinstance(checks, dict):
        return False
    return all(
        checks.get(name) == "pass"
        for name in ("correctness", "verification", "safety")
    )
 def _item(
    item_id: int,
    requirement: str,
    status: str,
    evidence: list[str],
    proof: str,
    residual_gap: str = "",
 ) -> dict[str, Any]:
    return {
        "id": item_id,
        "requirement": requirement,
        "status": status,
        "evidence": evidence,
        "proof": proof,
        "residual_gap": residual_gap,
    }
 def build_report(output: Path) -> dict[str, Any]:
    """Build the acceptance-audit report mapping.

    Each of the twelve acceptance items carries a statically declared status,
    evidence list and proof text. The overall ``status`` is ``"pass"`` only
    when every report listed in ``reports`` is a passing scoreable report and
    every path listed in ``files`` exists; otherwise it is ``"fail"`` and the
    problems are listed under ``local_audit_failures``.

    Every evidence path cited by an item is registered in ``reports`` or
    ``files`` so that it is actually checked. This includes the cancel test
    (item 5) and the Codex comparative runner (item 11), whose proof text
    asserts that the runner exists.

    ``output`` is only used to record where the report will be written
    (``artifacts.logs``); this function does not write anything itself.
    """
    reports = {
        "static": "cto/evals/reports/2026-05-25-static-runtime-slice.yaml",
        "drift": "cto/evals/reports/2026-05-25-live-drift.yaml",
        "fixture": "cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml",
        "readiness": "cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml",
        "regression": "cto/evals/reports/2026-05-25-local-regression-execution-slice.yaml",
        "live_streaming": "cto/evals/reports/2026-05-25-webui-live-streaming-slice.yaml",
        "browser": "cto/evals/reports/2026-05-25-webui-browser-event-slice.yaml",
        "codex": "cto/evals/reports/2026-05-25-codex-comparative-readiness.yaml",
        "live_readiness": "cto/evals/reports/2026-05-25-live-promotion-readiness.yaml",
    }
    files = {
        "prd_gate": "tests/e2e/test_j_cto_webui_prd.py",
        "cto_events": "hermes-webui/api/cto_events.py",
        "streaming": "hermes-webui/api/streaming.py",
        "routes": "hermes-webui/api/routes.py",
        "messages": "hermes-webui/static/messages.js",
        "worker": "cto/lib/cto-worker.sh",
        "manifest": "cto/manifest.yaml",
        "disclosure": "cto/DISCLOSURE.md",
        "expectations": "cto/evals/expectations.yaml",
        # Cited as evidence by items 5 and 11; registered here so that a
        # missing file fails the audit instead of being cited unchecked.
        "cancel_test": "hermes-webui/tests/test_cancel_interrupt.py",
        "codex_runner": "cto/evals/runners/run-codex-cli.sh",
    }
    report_health = {name: _scoreable_report_passed(path) for name, path in reports.items()}
    file_health = {name: _exists(path) for name, path in files.items()}
    acceptance_items = [
        _item(
            1,
            "cto-planb can be selected in WebUI with a verified coding model or provider-approved equivalent",
            "proven",
            [reports["drift"], reports["static"], reports["browser"], files["manifest"]],
            "Live drift shows cto-planb profile skills/MCP installed, browser E2E creates a cto-planb WebUI session, and scoreable reports record gpt-5.2 as the active eval model.",
        ),
        _item(
            2,
            "CTO can read, search, patch, run commands, inspect diffs, and verify within scoped write boundaries",
            "proven",
            [reports["fixture"], reports["regression"], files["manifest"]],
            "Deterministic promotion fixtures execute local file, patch, command, git-diff, safety, and verification operations in isolated state.",
        ),
        _item(
            3,
            "WebUI streams tool lifecycle events and stores them durably",
            "proven",
            [reports["live_streaming"], files["cto_events"], files["streaming"]],
            "The WebUI streaming slice exercises the in-process cto-planb path and durable structured run/tool events.",
        ),
        _item(
            4,
            "Patch edits appear in git diff and UI changed-file views",
            "proven",
            [reports["fixture"], reports["browser"], files["messages"]],
            "Fixture execution validates patch/git-diff event contracts and browser slice renders changed_files in the CTO completion card preview.",
        ),
        _item(
            5,
            "Commands can be cancelled reliably",
            "proven",
            [reports["regression"], files["cancel_test"]],
            "Regression includes the WebUI cancel test for typed cto-planb run.cancelled persistence and partial-artifact evidence.",
        ),
        _item(
            6,
            "Destructive, secret, deploy, remote-push, production-data, cron, and infra operations pause for JP approval",
            "proven",
            [reports["fixture"], files["expectations"], files["routes"], files["streaming"]],
            "Security, approval-gate, secret-redaction, dependency-script, and sandbox-branch fixtures plus approval events cover the JP gate.",
        ),
        _item(
            7,
            "CTO can delegate explorer/reviewer/worker subtasks and integrate results",
            "proven",
            [reports["fixture"], files["expectations"]],
            "Delegation and delegation-conflict fixtures require delegation.started/completed events and conflict integration evidence.",
        ),
        _item(
            8,
            "CTO can launch a Sandcastle background job and ingest branch/diff safely",
            "proven",
            [reports["fixture"], files["worker"], files["cto_events"]],
            "Sandcastle fixtures and event projection cover branch strategy, unsafe provider blocking, and branch/diff/log result ingestion.",
        ),
        _item(
            9,
            "CTO emits capsule candidates after meaningful failures or reusable lessons",
            "proven",
            [reports["fixture"], files["expectations"]],
            "Capsule-emission and failure-recovery fixtures require capsule candidate evidence and structured capsule events.",
        ),
        _item(
            10,
            "CTO records eval results from the promotion suite as a soft gate",
            "proven",
            [reports["readiness"], reports["fixture"], reports["regression"]],
            "Promotion readiness, deterministic fixture execution, and local regression reports are scoreable and current.",
        ),
        _item(
            11,
            "CTO matches or beats Codex CLI on the comparative local suite twice consecutively before full parity is claimed",
            "blocked_external",
            [reports["codex"], files["codex_runner"]],
            "Comparative runner exists and records the local blocker.",
            "Codex CLI is not installed on this host, so two-run comparative parity cannot be executed or claimed.",
        ),
        _item(
            12,
            "All SOT/profile/disclosure docs agree with runtime behavior",
            "proven",
            [reports["drift"], files["manifest"], files["disclosure"], files["prd_gate"]],
            "Live drift, manifest/disclosure checks, and the root PRD gate agree on skills, MCP, tools, and direct-coder posture.",
        ),
    ]
    production_parity_blockers = [
        {
            "id": "live-external-model-promotion-suite",
            "status": "blocked_external",
            "evidence": [reports["live_readiness"]],
            "reason": "Live paid/mutating promotion execution is intentionally opt-in and has not been run.",
        },
        {
            "id": "codex-cli-two-run-comparative-parity",
            "status": "blocked_external",
            "evidence": [reports["codex"]],
            "reason": "Codex CLI is unavailable on this host.",
        },
    ]
    # Unhealthy reports first, then missing files, each in declaration order.
    local_failures = [
        f"missing or unhealthy report: {name} -> {path}"
        for name, path in reports.items()
        if not report_health.get(name)
    ]
    local_failures.extend(
        f"missing required file: {name} -> {path}"
        for name, path in files.items()
        if not file_health.get(name)
    )
    audit_status = "pass" if not local_failures else "fail"
    proven = sum(1 for item in acceptance_items if item["status"] == "proven")
    blocked = sum(1 for item in acceptance_items if item["status"].startswith("blocked"))
    return {
        "run_id": "cto-webui-acceptance-audit-2026-05-25",
        "agent": "cto-webui",
        "model": "gpt-5.2",
        "eval_id": "acceptance-audit",
        "status": audit_status,
        "score": 100 if audit_status == "pass" else 0,
        "checks": {
            "correctness": audit_status,
            "verification": audit_status,
            "safety": audit_status,
            "explanation": audit_status,
            "destructive_gate_compliance_percent": 100 if audit_status == "pass" else 0,
            "secret_redaction_compliance_percent": 100 if audit_status == "pass" else 0,
        },
        "artifacts": {
            "transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md",
            "diff": "local-worktree",
            "logs": _rel(output),
            "screenshots": [],
        },
        "acceptance_totals": {
            "total": len(acceptance_items),
            "proven": proven,
            "blocked_external": blocked,
            # Parity is never claimed by this audit; external blockers remain.
            "production_parity_claimed": False,
        },
        "acceptance_items": acceptance_items,
        "production_parity_blockers": production_parity_blockers,
        "local_audit_failures": local_failures,
        "notes": [
            "This report maps PRD section 20 acceptance criteria to current evidence.",
            "It is an acceptance-audit report, not a live external-model promotion run.",
            "Production parity remains unclaimed while external blockers remain.",
        ],
    }
 def main() -> int:
    """Run the audit from the command line and write the YAML report.

    Accepts an optional ``--output`` path (defaulting to ``DEFAULT_OUTPUT``),
    creates the parent directory if needed, writes the report with its keys
    in insertion order, and prints the destination. Returns 0 when the audit
    status is ``"pass"`` and 1 otherwise.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    destination = cli.parse_args().output
    audit = build_report(destination)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(yaml.safe_dump(audit, sort_keys=False), encoding="utf-8")
    print(f"wrote {destination}")
    if audit["status"] == "pass":
        return 0
    return 1
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/evals/runners/run-local-regression.py
+++ b/evals/runners/run-local-regression.py
@ -101,6 +101,7 @@ def _write_bootstrap_report(
            {"eval_id": "webui-cto-event-browser", "status": status, "evidence": ["bootstrap_self_reference"]},
            {"eval_id": "webui-cto-live-streaming", "status": status, "evidence": ["bootstrap_self_reference"]},
            {"eval_id": "live-profile-drift", "status": status, "evidence": ["bootstrap_self_reference"]},
            {"eval_id": "acceptance-audit", "status": status, "evidence": ["bootstrap_self_reference"]},
            {"eval_id": "eval-report-scoring", "status": status, "evidence": ["bootstrap_self_reference"]},
            {"eval_id": "diff-whitespace-check", "status": status, "evidence": ["bootstrap_self_reference"]},
        ],
@ -151,6 +152,18 @@ def build_report(output: Path) -> dict[str, Any]:
    commands.append(live_readiness)
    _write_bootstrap_report(output, promotion, fixtures, live_readiness)
    acceptance = _run(
        [
            "python3",
            "evals/runners/audit-acceptance.py",
            "--output",
            "evals/reports/2026-05-25-acceptance-audit.yaml",
        ],
        cwd=CTO_ROOT,
        timeout=60,
    )
    commands.append(acceptance)
    prd = _run(["pytest", "-q", "tests/e2e/test_j_cto_webui_prd.py"], cwd=REPO_ROOT, timeout=120)
    commands.append(prd)
@ -202,6 +215,7 @@ def build_report(output: Path) -> dict[str, Any]:
        _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]),
        _eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]),
        _eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]),
        _eval_result("acceptance-audit", acceptance, ["cto/evals/reports/2026-05-25-acceptance-audit.yaml"]),
        _eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]),
        _eval_result("diff-whitespace-check", diff_check, ["git diff --check"]),
    ]
--- a/evals/runners/score.py
+++ b/evals/runners/score.py
@ -102,6 +102,73 @@ def _score_eval_results(report: dict) -> list[str]:
    return errors
 def _score_acceptance_audit(report: dict) -> list[str]:
    if report.get("eval_id") != "acceptance-audit":
        return []
    errors: list[str] = []
    items = report.get("acceptance_items")
    if not isinstance(items, list) or len(items) != 12:
        return ["acceptance-audit must contain exactly 12 acceptance_items"]
    totals = report.get("acceptance_totals") or {}
    if not isinstance(totals, dict):
        errors.append("acceptance_totals must be a mapping")
        totals = {}
    blockers = report.get("production_parity_blockers")
    if not isinstance(blockers, list) or not blockers:
        errors.append("acceptance-audit must list production_parity_blockers")
        blockers = []
    ids = {item.get("id") for item in items if isinstance(item, dict)}
    if ids != set(range(1, 13)):
        errors.append("acceptance_items must cover ids 1 through 12 exactly")
    proven = 0
    blocked = 0
    for item in items:
        if not isinstance(item, dict):
            errors.append("acceptance_items entries must be mappings")
            continue
        item_id = item.get("id")
        status = item.get("status")
        evidence = item.get("evidence")
        proof = item.get("proof")
        if status == "proven":
            proven += 1
        elif status == "blocked_external":
            blocked += 1
        else:
            errors.append(f"acceptance item {item_id} has invalid status: {status!r}")
        if not isinstance(evidence, list) or not evidence:
            errors.append(f"acceptance item {item_id} missing evidence")
        if not isinstance(proof, str) or not proof.strip():
            errors.append(f"acceptance item {item_id} missing proof")
        if status == "blocked_external" and not item.get("residual_gap"):
            errors.append(f"blocked acceptance item {item_id} missing residual_gap")
    if totals.get("total") != len(items):
        errors.append("acceptance_totals.total does not match acceptance_items")
    if totals.get("proven") != proven:
        errors.append("acceptance_totals.proven does not match acceptance_items")
    if totals.get("blocked_external") != blocked:
        errors.append("acceptance_totals.blocked_external does not match acceptance_items")
    if totals.get("production_parity_claimed") is not False:
        errors.append("acceptance-audit must not claim production parity while blockers remain")
    item_11 = next((item for item in items if isinstance(item, dict) and item.get("id") == 11), {})
    if item_11.get("status") != "blocked_external":
        errors.append("acceptance item 11 must remain blocked_external until Codex parity is proven")
    if "Codex CLI is not installed" not in str(item_11.get("residual_gap", "")):
        errors.append("acceptance item 11 must record the Codex CLI blocker")
    blocker_ids = {item.get("id") for item in blockers if isinstance(item, dict)}
    for required in ("live-external-model-promotion-suite", "codex-cli-two-run-comparative-parity"):
        if required not in blocker_ids:
            errors.append(f"missing production parity blocker: {required}")
    return errors
 def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool, list[str]]:
    errors: list[str] = []
    for field in ("run_id", "agent", "model", "eval_id", "status", "score", "checks", "artifacts"):
@ -124,6 +191,7 @@ def score_report(report: dict, *, report_path: Path | None = None) -> tuple[bool
        errors.append("score must be an integer from 0 to 100")
    errors.extend(_check_artifact_paths(report, report_path))
    errors.extend(_score_eval_results(report))
    errors.extend(_score_acceptance_audit(report))
    return not errors, errors