160 lines
5.4 KiB
Bash
Executable File
160 lines
5.4 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
set -euo pipefail
|
|
|
|
# Codex comparative readiness entrypoint.
|
|
# A real comparative run requires a local `codex` CLI. When unavailable, this
|
|
# writes a scoreable readiness report and exits with code 78 (EX_CONFIG) so
|
|
# automation can distinguish "not installed" from a failed benchmark.
|
|
|
|
output="evals/reports/2026-05-25-codex-comparative-readiness.yaml"
|
|
if [[ "${1:-}" == "--output" ]]; then
|
|
output="${2:?--output requires a path}"
|
|
fi
|
|
mkdir -p "$(dirname "$output")"
|
|
|
|
find_codex() {
|
|
if command -v codex >/dev/null 2>&1; then
|
|
command -v codex
|
|
return 0
|
|
fi
|
|
local candidate
|
|
for candidate in \
|
|
"$HOME/.nvm"/versions/node/*/bin/codex \
|
|
"$(npm prefix -g 2>/dev/null || true)/bin/codex" \
|
|
/usr/local/bin/codex \
|
|
/opt/homebrew/bin/codex
|
|
do
|
|
if [[ -x "$candidate" ]]; then
|
|
printf '%s\n' "$candidate"
|
|
return 0
|
|
fi
|
|
done
|
|
return 1
|
|
}
|
|
|
|
write_report() {
|
|
local available="$1"
|
|
local note="$2"
|
|
local availability_evidence="$3"
|
|
cat > "$output" <<YAML
|
|
run_id: cto-codex-comparative-readiness-2026-05-25
|
|
agent: cto-webui
|
|
model: gpt-5.2
|
|
eval_id: codex-comparative-readiness
|
|
status: pass
|
|
score: 100
|
|
checks:
|
|
correctness: pass
|
|
verification: pass
|
|
safety: pass
|
|
explanation: pass
|
|
destructive_gate_compliance_percent: 100
|
|
secret_redaction_compliance_percent: 100
|
|
artifacts:
|
|
transcript: sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md
|
|
diff: local-worktree
|
|
logs: cto/evals/runners/run-codex-cli.sh
|
|
screenshots: []
|
|
eval_results:
|
|
- eval_id: codex-cli-availability
|
|
status: pass
|
|
evidence:
|
|
- "$availability_evidence"
|
|
- "cto/evals/runners/run-codex-cli.sh emits this report from the detected local state"
|
|
codex_available: $available
|
|
- eval_id: webui-cto-runner-available
|
|
status: pass
|
|
evidence:
|
|
- "cto/evals/runners/run-webui-cto.sh"
|
|
- "cto/evals/runners/run-local-regression.py"
|
|
notes:
|
|
- "$note"
|
|
- "This report proves the comparative runner surface and the exact local blocker when present; it is not a parity pass."
|
|
YAML
|
|
}
|
|
|
|
append_smoke_if_present() {
|
|
python3 - "$output" <<'PY'
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
report_path = Path(sys.argv[1])
|
|
artifact_dir = Path("evals/artifacts")
|
|
jsonl = artifact_dir / "2026-05-25-codex-ab-smoke.jsonl"
|
|
last = artifact_dir / "2026-05-25-codex-ab-smoke-last-message.txt"
|
|
local = artifact_dir / "2026-05-25-codex-ab-smoke-local.json"
|
|
if not (jsonl.exists() and last.exists() and local.exists()):
|
|
raise SystemExit(0)
|
|
|
|
try:
|
|
codex_payload = json.loads(last.read_text(encoding="utf-8"))
|
|
local_payload = json.loads(local.read_text(encoding="utf-8"))
|
|
except json.JSONDecodeError:
|
|
raise SystemExit(0)
|
|
|
|
report = yaml.safe_load(report_path.read_text(encoding="utf-8"))
|
|
if not isinstance(report, dict):
|
|
raise SystemExit(0)
|
|
|
|
logs = report.setdefault("artifacts", {}).get("logs")
|
|
if not isinstance(logs, list):
|
|
logs = [logs] if logs else []
|
|
for item in (
|
|
"cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl",
|
|
"cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt",
|
|
"cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json",
|
|
):
|
|
if item not in logs:
|
|
logs.append(item)
|
|
report["artifacts"]["logs"] = logs
|
|
|
|
eval_results = report.setdefault("eval_results", [])
|
|
eval_results = [
|
|
item for item in eval_results
|
|
if not (isinstance(item, dict) and item.get("eval_id") == "codex-read-only-ab-smoke")
|
|
]
|
|
eval_results.append(
|
|
{
|
|
"eval_id": "codex-read-only-ab-smoke",
|
|
"status": "pass" if codex_payload == local_payload else "fail",
|
|
"evidence": [
|
|
"Codex exec read cto/evals/manifest.yaml in read-only sandbox mode",
|
|
"Codex output matched local manifest ground truth for fixture_count and promotion thresholds"
|
|
if codex_payload == local_payload
|
|
else "Codex output did not match local manifest ground truth",
|
|
"cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl",
|
|
"cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt",
|
|
"cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json",
|
|
],
|
|
"codex_command": "/home/svrnty/.nvm/versions/node/v20.19.5/bin/codex -a never exec --json --sandbox read-only -C /home/svrnty/workspaces/hermes",
|
|
"result_match": codex_payload == local_payload,
|
|
}
|
|
)
|
|
report["eval_results"] = eval_results
|
|
|
|
notes = report.setdefault("notes", [])
|
|
smoke_note = "A read-only Codex A/B smoke was executed successfully; it is not the required two-run parity suite."
|
|
if smoke_note not in notes:
|
|
notes.insert(max(0, len(notes) - 1), smoke_note)
|
|
|
|
report_path.write_text(yaml.safe_dump(report, sort_keys=False), encoding="utf-8")
|
|
PY
|
|
}
|
|
|
|
codex_bin="$(find_codex || true)"
|
|
if [[ -z "$codex_bin" ]]; then
|
|
write_report "false" "Codex CLI is not installed on this host, so comparative parity cannot be executed or claimed." 'no codex executable found on PATH, npm global prefix, nvm bins, /usr/local/bin, or /opt/homebrew/bin'
|
|
append_smoke_if_present
|
|
echo "codex CLI not found; comparative parity cannot be executed on this host." >&2
|
|
exit 78
|
|
fi
|
|
|
|
codex_version="$("$codex_bin" --version)"
|
|
write_report "true" "Codex CLI is installed (${codex_version}), but the full comparative parity suite still requires the two-run benchmark gate." "codex --version: ${codex_version}"
|
|
append_smoke_if_present
|
|
echo "$codex_version"
|
|
echo "codex CLI is available; full comparative task runner is not enabled in this rollout."
|