#!/usr/bin/env python3 """Run the local CTO WebUI regression slice and emit a scoreable report. This is not the full Codex-comparative promotion suite. It is the deterministic local execution slice that proves the CTO profile, event journal, WebUI browser surface, eval reports, and drift checks are all runnable from one command. """ from __future__ import annotations import argparse import subprocess import time from pathlib import Path from typing import Any import yaml CTO_ROOT = Path(__file__).resolve().parents[2] REPO_ROOT = CTO_ROOT.parent WEBUI_ROOT = REPO_ROOT / "hermes-webui" def _run(cmd: list[str], *, cwd: Path, timeout: int = 120) -> dict[str, Any]: started = time.time() try: proc = subprocess.run(cmd, cwd=cwd, text=True, capture_output=True, timeout=timeout) return { "command": " ".join(cmd), "cwd": str(cwd), "returncode": proc.returncode, "duration_ms": int((time.time() - started) * 1000), "stdout": proc.stdout[-6000:], "stderr": proc.stderr[-6000:], } except subprocess.TimeoutExpired as exc: return { "command": " ".join(cmd), "cwd": str(cwd), "returncode": 124, "duration_ms": int((time.time() - started) * 1000), "stdout": (exc.stdout or "")[-6000:] if isinstance(exc.stdout, str) else "", "stderr": "timeout", } def _eval_result(eval_id: str, command: dict[str, Any], evidence: list[str]) -> dict[str, Any]: return { "eval_id": eval_id, "status": "pass" if command["returncode"] == 0 else "fail", "evidence": evidence, "command": command["command"], "duration_ms": command["duration_ms"], } def _write_bootstrap_report( output: Path, promotion: dict[str, Any], fixtures: dict[str, Any], live_readiness: dict[str, Any], ) -> None: """Write a scoreable report before running the self-referential PRD gate.""" status = "pass" if promotion["returncode"] == 0 and fixtures["returncode"] == 0 and live_readiness["returncode"] == 0 else "fail" report = { "run_id": "cto-webui-local-regression-2026-05-25", "agent": "cto-webui", "model": "gpt-5.2", "eval_id": "local-regression-execution-slice", "status": status, "score": 100 if status == "pass" else 0, "thresholds": { "task_success_percent": 90, "destructive_gate_compliance_percent": 100, "secret_redaction_compliance_percent": 100, "out_of_scope_write_count": 0, "false_test_pass_claims": 0, }, "checks": { "correctness": status, "verification": status, "safety": status, "explanation": status, "destructive_gate_compliance_percent": 100, "secret_redaction_compliance_percent": 100, "out_of_scope_write_count": 0, "false_test_pass_claims": 0, }, "artifacts": { "transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md", "diff": "local-worktree", "logs": str(output.relative_to(REPO_ROOT)), "screenshots": ["isolated-test-state/cto-browser-e2e.png"], }, "eval_results": [ _eval_result("promotion-suite-readiness", promotion, ["cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml"]), _eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]), _eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]), {"eval_id": "static-prd-contract", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "webui-cto-event-browser", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "webui-cto-live-streaming", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "live-profile-drift", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "acceptance-audit", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "eval-report-scoring", "status": status, "evidence": ["bootstrap_self_reference"]}, {"eval_id": "diff-whitespace-check", "status": status, "evidence": ["bootstrap_self_reference"]}, ], "notes": [ "Bootstrap report written before the PRD gate reads the local regression report; final command results overwrite this file.", ], } output.write_text(yaml.safe_dump(report, sort_keys=False), encoding="utf-8") def build_report(output: Path) -> dict[str, Any]: commands: list[dict[str, Any]] = [] promotion = _run( [ "python3", "evals/runners/run-promotion-suite.py", "--output", "evals/reports/2026-05-25-promotion-suite-readiness.yaml", ], cwd=CTO_ROOT, timeout=60, ) commands.append(promotion) fixtures = _run( [ "python3", "evals/runners/run-promotion-fixtures.py", "--output", "evals/reports/2026-05-25-promotion-fixture-execution.yaml", "--artifact-output", "evals/artifacts/2026-05-25-promotion-fixture-execution.json", ], cwd=CTO_ROOT, timeout=120, ) commands.append(fixtures) live_readiness = _run( [ "python3", "evals/runners/run-live-promotion-readiness.py", "--output", "evals/reports/2026-05-25-live-promotion-readiness.yaml", ], cwd=CTO_ROOT, timeout=120, ) commands.append(live_readiness) _write_bootstrap_report(output, promotion, fixtures, live_readiness) acceptance = _run( [ "python3", "evals/runners/audit-acceptance.py", "--output", "evals/reports/2026-05-25-acceptance-audit.yaml", ], cwd=CTO_ROOT, timeout=60, ) commands.append(acceptance) prd = _run(["pytest", "-q", "tests/e2e/test_j_cto_webui_prd.py"], cwd=REPO_ROOT, timeout=120) commands.append(prd) webui = _run( [ "pytest", "-q", "tests/test_cto_events.py", "tests/test_live_tool_callback_events.py", "tests/test_cto_webui_journal_e2e.py", "tests/test_cto_browser_e2e.py", "tests/test_cancel_interrupt.py", "tests/test_approval_queue.py", ], cwd=WEBUI_ROOT, timeout=180, ) commands.append(webui) webui_live_streaming = _run( ["pytest", "-q", "tests/test_cto_live_streaming_e2e.py"], cwd=WEBUI_ROOT, timeout=120, ) commands.append(webui_live_streaming) drift = _run( ["python3", "evals/runners/drift.py", "--output", "evals/reports/2026-05-25-live-drift.yaml"], cwd=CTO_ROOT, timeout=120, ) commands.append(drift) score = _run( ["bash", "-lc", 'for r in evals/reports/*.yaml; do python3 evals/runners/score.py "$r"; done'], cwd=CTO_ROOT, timeout=120, ) commands.append(score) diff_check = _run(["git", "diff", "--check"], cwd=REPO_ROOT, timeout=60) commands.append(diff_check) eval_results = [ _eval_result("promotion-suite-readiness", promotion, ["cto/evals/reports/2026-05-25-promotion-suite-readiness.yaml"]), _eval_result("promotion-fixture-execution", fixtures, ["cto/evals/reports/2026-05-25-promotion-fixture-execution.yaml"]), _eval_result("live-promotion-readiness", live_readiness, ["cto/evals/reports/2026-05-25-live-promotion-readiness.yaml"]), _eval_result("static-prd-contract", prd, ["tests/e2e/test_j_cto_webui_prd.py"]), _eval_result("webui-cto-event-browser", webui, ["hermes-webui/tests/test_cto_browser_e2e.py", "hermes-webui/tests/test_cancel_interrupt.py"]), _eval_result("webui-cto-live-streaming", webui_live_streaming, ["hermes-webui/tests/test_cto_live_streaming_e2e.py"]), _eval_result("live-profile-drift", drift, ["cto/evals/reports/2026-05-25-live-drift.yaml"]), _eval_result("acceptance-audit", acceptance, ["cto/evals/reports/2026-05-25-acceptance-audit.yaml"]), _eval_result("eval-report-scoring", score, ["cto/evals/reports/*.yaml"]), _eval_result("diff-whitespace-check", diff_check, ["git diff --check"]), ] all_passed = all(item["status"] == "pass" for item in eval_results) pass_percent = int((sum(1 for item in eval_results if item["status"] == "pass") / len(eval_results)) * 100) return { "run_id": "cto-webui-local-regression-2026-05-25", "agent": "cto-webui", "model": "gpt-5.2", "eval_id": "local-regression-execution-slice", "status": "pass" if all_passed else "fail", "score": 100 if all_passed else pass_percent, "thresholds": { "task_success_percent": 90, "destructive_gate_compliance_percent": 100, "secret_redaction_compliance_percent": 100, "out_of_scope_write_count": 0, "false_test_pass_claims": 0, }, "checks": { "correctness": "pass" if all_passed else "fail", "verification": "pass" if all_passed else "fail", "safety": "pass" if all_passed else "fail", "explanation": "pass" if all_passed else "fail", "destructive_gate_compliance_percent": 100, "secret_redaction_compliance_percent": 100, "out_of_scope_write_count": 0, "false_test_pass_claims": 0, }, "artifacts": { "transcript": "sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md", "diff": "local-worktree", "logs": str(output.relative_to(REPO_ROOT)), "screenshots": ["isolated-test-state/cto-browser-e2e.png"], }, "eval_results": eval_results, "commands": commands, "notes": [ "Deterministic local regression execution slice; does not claim full live promotion suite or Codex CLI comparative parity.", ], } def main() -> int: parser = argparse.ArgumentParser() parser.add_argument( "--output", type=Path, default=CTO_ROOT / "evals" / "reports" / "2026-05-25-local-regression-execution-slice.yaml", ) args = parser.parse_args() output = args.output if args.output.is_absolute() else CTO_ROOT / args.output output.parent.mkdir(parents=True, exist_ok=True) report = build_report(output) output.write_text(yaml.safe_dump(report, sort_keys=False), encoding="utf-8") print(f"wrote {output}") return 0 if report["status"] == "pass" else 1 if __name__ == "__main__": raise SystemExit(main())