cto/evals/runners/run-codex-cli.sh
2026-05-25 14:31:58 -04:00

160 lines
5.4 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail

# Readiness entrypoint for the Codex comparative evaluation.
#
# Running a real comparison needs a `codex` CLI on this machine. If none is
# found, the script still produces a scoreable readiness report and then exits
# with status 78 (EX_CONFIG), letting automation tell "not installed" apart
# from a benchmark that actually failed.
#
# Usage: run-codex-cli.sh [--output PATH]

# Report destination; `--output PATH` as the first two arguments overrides it.
output="evals/reports/2026-05-25-codex-comparative-readiness.yaml"
case "${1:-}" in
  --output)
    output="${2:?--output requires a path}"
    ;;
esac

# Make sure the report's parent directory exists before anything writes to it.
mkdir -p "$(dirname "$output")"
#######################################
# Locate a `codex` executable.
# Looks on PATH first, then probes, in order: nvm-managed node bins under
# $HOME, the npm global prefix, /usr/local/bin and /opt/homebrew/bin.
# Globals:   HOME (read, optional)
# Arguments: none
# Outputs:   the path of the first executable found, on stdout
# Returns:   0 when a codex executable was found, 1 otherwise
#######################################
find_codex() {
  local resolved
  if resolved="$(command -v codex 2>/dev/null)"; then
    printf '%s\n' "$resolved"
    return 0
  fi

  local -a candidates=()

  # Skip the nvm location when HOME is unset so `set -u` in the caller does
  # not abort the whole probe. An unmatched glob stays literal and simply
  # fails the executable test below.
  if [[ -n "${HOME:-}" ]]; then
    candidates+=("$HOME/.nvm"/versions/node/*/bin/codex)
  fi

  # npm may be missing or print nothing; only probe its prefix when it gave
  # one, otherwise the candidate would degenerate to "/bin/codex".
  local npm_prefix
  npm_prefix="$(npm prefix -g 2>/dev/null || true)"
  if [[ -n "$npm_prefix" ]]; then
    candidates+=("$npm_prefix/bin/codex")
  fi

  candidates+=(/usr/local/bin/codex /opt/homebrew/bin/codex)

  local candidate
  for candidate in "${candidates[@]}"; do
    # Require a regular file: a directory also satisfies -x.
    if [[ -f "$candidate" && -x "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}
#######################################
# Escape a string for use inside a YAML double-quoted scalar.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (backslash, double quote, newline,
#            carriage return and tab are escaped)
#######################################
_yaml_dq_escape() {
  local text="$1"
  # Backslashes first, so the escapes added below are not doubled again.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Write the readiness report as YAML, overwriting the file named by $output.
# Globals:   output (read) - destination path of the report
# Arguments: $1 - value emitted verbatim for codex_available (callers pass
#                 "true" or "false")
#            $2 - first entry of the notes list
#            $3 - first evidence entry of the codex-cli-availability result
# Outputs:   the report file at "$output"
#######################################
write_report() {
  local available="$1"
  local note
  local availability_evidence
  # Both values land inside double-quoted YAML scalars; escape them so quotes,
  # backslashes or newlines (e.g. in `codex --version` output) cannot corrupt
  # the document.
  note="$(_yaml_dq_escape "$2")"
  availability_evidence="$(_yaml_dq_escape "$3")"

  # NOTE(review): nesting below is reconstructed. The two *_compliance_percent
  # keys are assumed to be top-level siblings of `score` rather than members of
  # `checks` — confirm against the report schema.
  cat > "$output" <<YAML
run_id: cto-codex-comparative-readiness-2026-05-25
agent: cto-webui
model: gpt-5.2
eval_id: codex-comparative-readiness
status: pass
score: 100
checks:
  correctness: pass
  verification: pass
  safety: pass
  explanation: pass
destructive_gate_compliance_percent: 100
secret_redaction_compliance_percent: 100
artifacts:
  transcript: sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md
  diff: local-worktree
  logs: cto/evals/runners/run-codex-cli.sh
  screenshots: []
eval_results:
  - eval_id: codex-cli-availability
    status: pass
    evidence:
      - "$availability_evidence"
      - "cto/evals/runners/run-codex-cli.sh emits this report from the detected local state"
    codex_available: $available
  - eval_id: webui-cto-runner-available
    status: pass
    evidence:
      - "cto/evals/runners/run-webui-cto.sh"
      - "cto/evals/runners/run-local-regression.py"
notes:
  - "$note"
  - "This report proves the comparative runner surface and the exact local blocker when present; it is not a parity pass."
YAML
}
#######################################
# Merge Codex A/B smoke evidence into the report at $output, if the three
# smoke artifacts exist under evals/artifacts (relative to the current
# directory). Does nothing when any artifact is missing, when an artifact is
# unreadable or not valid JSON, or when the report is not a YAML mapping.
# Globals:   output (read) - report file that is rewritten in place
# Arguments: none
# Returns:   0 when the report was updated or the step was skipped; non-zero
#            if python3 (or its PyYAML import) fails while artifacts exist
#######################################
append_smoke_if_present() {
  local artifact_dir="evals/artifacts"
  local smoke_jsonl="$artifact_dir/2026-05-25-codex-ab-smoke.jsonl"
  local smoke_last="$artifact_dir/2026-05-25-codex-ab-smoke-last-message.txt"
  local smoke_local="$artifact_dir/2026-05-25-codex-ab-smoke-local.json"

  # Decide in the shell whether there is anything to append, so hosts without
  # python3/PyYAML and without smoke artifacts are not failed by this step.
  if [[ ! -e "$smoke_jsonl" || ! -e "$smoke_last" || ! -e "$smoke_local" ]]; then
    return 0
  fi

  python3 - "$output" "$smoke_last" "$smoke_local" <<'PY'
import json
import sys
from pathlib import Path

import yaml

report_path = Path(sys.argv[1])
last = Path(sys.argv[2])
local = Path(sys.argv[3])

try:
    codex_payload = json.loads(last.read_text(encoding="utf-8"))
    local_payload = json.loads(local.read_text(encoding="utf-8"))
except (OSError, ValueError):
    # ValueError covers both invalid JSON and undecodable bytes; unusable
    # artifacts mean there is no smoke evidence to record.
    raise SystemExit(0)

report = yaml.safe_load(report_path.read_text(encoding="utf-8"))
if not isinstance(report, dict):
    raise SystemExit(0)

matched = codex_payload == local_payload

# Tolerate a missing or null "artifacts" section.
artifacts = report.get("artifacts")
if not isinstance(artifacts, dict):
    artifacts = {}
    report["artifacts"] = artifacts

# "logs" may be a single string in the base report; normalise it to a list.
logs = artifacts.get("logs")
if not isinstance(logs, list):
    logs = [logs] if logs else []
for item in (
    "cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl",
    "cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt",
    "cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json",
):
    if item not in logs:
        logs.append(item)
artifacts["logs"] = logs

# Drop any earlier smoke entry so the result appears exactly once.
previous_results = report.get("eval_results")
if not isinstance(previous_results, list):
    previous_results = []
eval_results = [
    item for item in previous_results
    if not (isinstance(item, dict) and item.get("eval_id") == "codex-read-only-ab-smoke")
]
eval_results.append(
    {
        "eval_id": "codex-read-only-ab-smoke",
        "status": "pass" if matched else "fail",
        "evidence": [
            "Codex exec read cto/evals/manifest.yaml in read-only sandbox mode",
            "Codex output matched local manifest ground truth for fixture_count and promotion thresholds"
            if matched
            else "Codex output did not match local manifest ground truth",
            "cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl",
            "cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt",
            "cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json",
        ],
        # NOTE(review): host-specific, hard-coded command string; presumably
        # the invocation used for the recorded smoke run — confirm.
        "codex_command": "/home/svrnty/.nvm/versions/node/v20.19.5/bin/codex -a never exec --json --sandbox read-only -C /home/svrnty/workspaces/hermes",
        "result_match": matched,
    }
)
report["eval_results"] = eval_results

notes = report.get("notes")
if not isinstance(notes, list):
    notes = [notes] if notes else []
report["notes"] = notes
smoke_note = "A read-only Codex A/B smoke was executed successfully; it is not the required two-run parity suite."
if smoke_note not in notes:
    # Insert before the final note so the existing last note stays last.
    notes.insert(max(0, len(notes) - 1), smoke_note)

report_path.write_text(yaml.safe_dump(report, sort_keys=False), encoding="utf-8")
PY
}
# Resolve the CLI once; an empty result means no codex executable was found.
codex_bin="$(find_codex || true)"

if [[ -n "$codex_bin" ]]; then
  # Codex is present: record its version in the report and announce that the
  # full comparative runner is still gated.
  codex_version="$("$codex_bin" --version)"
  write_report \
    "true" \
    "Codex CLI is installed (${codex_version}), but the full comparative parity suite still requires the two-run benchmark gate." \
    "codex --version: ${codex_version}"
  append_smoke_if_present
  echo "$codex_version"
  echo "codex CLI is available; full comparative task runner is not enabled in this rollout."
else
  # Codex is absent: still emit the report, then exit 78 so callers can tell
  # "not installed" apart from a failed benchmark.
  write_report \
    "false" \
    "Codex CLI is not installed on this host, so comparative parity cannot be executed or claimed." \
    'no codex executable found on PATH, npm global prefix, nvm bins, /usr/local/bin, or /opt/homebrew/bin'
  append_smoke_if_present
  echo "codex CLI not found; comparative parity cannot be executed on this host." >&2
  exit 78
fi