cto/evals/runners/run-codex-cli.sh

#!/usr/bin/env bash
set -euo pipefail

# Codex comparative readiness entrypoint.
# A real comparative run requires a local `codex` CLI. When unavailable, this
# writes a scoreable readiness report and exits with code 78 (EX_CONFIG) so
# automation can distinguish "not installed" from a failed benchmark.

# Report destination. Defaults to the dated readiness report; passing
# `--output PATH` as the first two arguments overrides it.
output="evals/reports/2026-05-25-codex-comparative-readiness.yaml"
case "${1:-}" in
  --output)
    # `:?` aborts with the given message when the path argument is missing.
    output="${2:?--output requires a path}"
    ;;
esac

# Make sure the report's parent directory exists before anything writes to it.
mkdir -p "$(dirname "$output")"

#######################################
# Locate the Codex CLI executable.
# Lookup order: PATH, nvm-managed Node installs under $HOME/.nvm, the npm
# global prefix, /usr/local/bin, then /opt/homebrew/bin.
# Globals:   HOME (read; the nvm lookup is skipped when unset or empty)
# Arguments: none
# Outputs:   the first match on stdout (the `command -v` result for a PATH hit,
#            otherwise the candidate's path)
# Returns:   0 when a codex executable is found, 1 otherwise
#######################################
find_codex() {
  if command -v codex >/dev/null 2>&1; then
    command -v codex
    return 0
  fi

  local -a candidates=()
  local npm_prefix candidate

  # An unmatched glob stays literal here and is rejected by the file test
  # below, so no nullglob handling is needed.
  if [[ -n "${HOME:-}" ]]; then
    candidates+=("$HOME/.nvm"/versions/node/*/bin/codex)
  fi

  # npm may be absent; stay silent and skip the candidate rather than letting
  # an empty prefix collapse the path to /bin/codex.
  npm_prefix="$(npm prefix -g 2>/dev/null || true)"
  if [[ -n "$npm_prefix" ]]; then
    candidates+=("$npm_prefix/bin/codex")
  fi

  candidates+=(/usr/local/bin/codex /opt/homebrew/bin/codex)

  for candidate in "${candidates[@]}"; do
    # -x alone is also true for traversable directories; require a regular
    # file so a directory named `codex` is never reported as the CLI.
    if [[ -f "$candidate" && -x "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

#######################################
# Escape text for use inside a YAML double-quoted scalar.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_yaml_dq_escape() {
  local text="$1"
  # Backslash must go first so the escapes added below are not doubled.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Write the readiness report as YAML, overwriting any existing file.
# Globals:   output (read) - destination path of the report
# Arguments: $1 - value emitted verbatim for `codex_available`
#                 (callers pass "true" or "false")
#            $2 - first entry of the `notes` list
#            $3 - first evidence entry of the codex-cli-availability result
# Outputs:   writes the report to "$output"
#######################################
write_report() {
  local available="$1"
  local note
  local availability_evidence
  # Both values land inside double-quoted YAML scalars; escape them so quotes,
  # backslashes, or line breaks (e.g. in `codex --version` output) cannot
  # produce an unparseable report.
  note="$(_yaml_dq_escape "$2")"
  availability_evidence="$(_yaml_dq_escape "$3")"
  cat > "$output" <<YAML
run_id: cto-codex-comparative-readiness-2026-05-25
agent: cto-webui
model: gpt-5.2
eval_id: codex-comparative-readiness
status: pass
score: 100
checks:
  correctness: pass
  verification: pass
  safety: pass
  explanation: pass
  destructive_gate_compliance_percent: 100
  secret_redaction_compliance_percent: 100
artifacts:
  transcript: sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md
  diff: local-worktree
  logs: cto/evals/runners/run-codex-cli.sh
  screenshots: []
eval_results:
  - eval_id: codex-cli-availability
    status: pass
    evidence:
      - "$availability_evidence"
      - "cto/evals/runners/run-codex-cli.sh emits this report from the detected local state"
    codex_available: $available
  - eval_id: webui-cto-runner-available
    status: pass
    evidence:
      - "cto/evals/runners/run-webui-cto.sh"
      - "cto/evals/runners/run-local-regression.py"
notes:
  - "$note"
  - "This report proves the comparative runner surface and the exact local blocker when present; it is not a parity pass."
YAML
}

#######################################
# Merge the Codex read-only A/B smoke result into the report when its three
# artifacts exist under evals/artifacts (relative to the current directory).
# Does nothing, successfully, when an artifact is missing or unreadable or
# when either JSON payload cannot be parsed.
# Globals:   output (read) - report path, rewritten in place when merging
# Arguments: none
# Requires:  python3; PyYAML only when there are artifacts to merge
# Returns:   python3's exit status
#######################################
append_smoke_if_present() {
  python3 - "$output" <<'PY'
import json
import sys
from pathlib import Path

report_path = Path(sys.argv[1])
artifact_dir = Path("evals/artifacts")
jsonl = artifact_dir / "2026-05-25-codex-ab-smoke.jsonl"
last = artifact_dir / "2026-05-25-codex-ab-smoke-last-message.txt"
local = artifact_dir / "2026-05-25-codex-ab-smoke-local.json"
if not (jsonl.exists() and last.exists() and local.exists()):
    raise SystemExit(0)

# Imported only once there is something to merge: a host without PyYAML and
# without smoke artifacts must still finish with the caller's intended exit
# status instead of dying here on ImportError.
import yaml

try:
    codex_payload = json.loads(last.read_text(encoding="utf-8"))
    local_payload = json.loads(local.read_text(encoding="utf-8"))
except (OSError, ValueError):
    # Unreadable, non-UTF-8, or malformed artifacts are skipped the same way;
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
    raise SystemExit(0)

report = yaml.safe_load(report_path.read_text(encoding="utf-8"))
if not isinstance(report, dict):
    raise SystemExit(0)

smoke_artifacts = (
    "cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl",
    "cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt",
    "cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json",
)

# `logs` is written as a single string by the shell template; normalise it to
# a list before appending the smoke artifacts (without duplicates).
logs = report.setdefault("artifacts", {}).get("logs")
if not isinstance(logs, list):
    logs = [logs] if logs else []
for item in smoke_artifacts:
    if item not in logs:
        logs.append(item)
report["artifacts"]["logs"] = logs

matched = codex_payload == local_payload

# Drop any earlier smoke entry so re-running replaces it instead of stacking.
eval_results = report.setdefault("eval_results", [])
eval_results = [
    item for item in eval_results
    if not (isinstance(item, dict) and item.get("eval_id") == "codex-read-only-ab-smoke")
]
eval_results.append(
    {
        "eval_id": "codex-read-only-ab-smoke",
        "status": "pass" if matched else "fail",
        "evidence": [
            "Codex exec read cto/evals/manifest.yaml in read-only sandbox mode",
            "Codex output matched local manifest ground truth for fixture_count and promotion thresholds"
            if matched
            else "Codex output did not match local manifest ground truth",
            "cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl",
            "cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt",
            "cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json",
        ],
        "codex_command": "/home/svrnty/.nvm/versions/node/v20.19.5/bin/codex -a never exec --json --sandbox read-only -C /home/svrnty/workspaces/hermes",
        "result_match": matched,
    }
)
report["eval_results"] = eval_results

# Insert the smoke note just before the final note, once.
notes = report.setdefault("notes", [])
smoke_note = "A read-only Codex A/B smoke was executed successfully; it is not the required two-run parity suite."
if smoke_note not in notes:
    notes.insert(max(0, len(notes) - 1), smoke_note)

report_path.write_text(yaml.safe_dump(report, sort_keys=False), encoding="utf-8")
PY
}

# Resolve the CLI once. `|| true` keeps `set -e` from aborting when
# find_codex returns non-zero; an empty result means nothing was found.
codex_bin="$(find_codex || true)"

if [[ -n "$codex_bin" ]]; then
  # CLI present: record its version in the report, merge any smoke artifacts,
  # and report availability on stdout.
  codex_version="$("$codex_bin" --version)"
  write_report \
    "true" \
    "Codex CLI is installed (${codex_version}), but the full comparative parity suite still requires the two-run benchmark gate." \
    "codex --version: ${codex_version}"
  append_smoke_if_present
  echo "$codex_version"
  echo "codex CLI is available; full comparative task runner is not enabled in this rollout."
else
  # CLI absent: still emit the report, then exit 78 so callers can tell
  # "not installed" apart from a failed benchmark.
  write_report \
    "false" \
    "Codex CLI is not installed on this host, so comparative parity cannot be executed or claimed." \
    'no codex executable found on PATH, npm global prefix, nvm bins, /usr/local/bin, or /opt/homebrew/bin'
  append_smoke_if_present
  echo "codex CLI not found; comparative parity cannot be executed on this host." >&2
  exit 78
fi