54 lines
1.9 KiB
YAML
54 lines
1.9 KiB
YAML
run_id: cto-codex-comparative-readiness-2026-05-25
|
|
agent: cto-webui
|
|
model: gpt-5.2
|
|
eval_id: codex-comparative-readiness
|
|
status: pass
|
|
score: 100
|
|
checks:
|
|
correctness: pass
|
|
verification: pass
|
|
safety: pass
|
|
explanation: pass
|
|
destructive_gate_compliance_percent: 100
|
|
secret_redaction_compliance_percent: 100
|
|
artifacts:
|
|
transcript: sot/08-OUTPUTS/CTO-WEBUI-CODER-PRD-EVIDENCE-2026-05-25.md
|
|
diff: local-worktree
|
|
logs:
|
|
- cto/evals/runners/run-codex-cli.sh
|
|
- cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl
|
|
- cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt
|
|
- cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json
|
|
screenshots: []
|
|
eval_results:
|
|
- eval_id: codex-cli-availability
|
|
status: pass
|
|
evidence:
|
|
- 'codex --version: codex-cli 0.133.0'
|
|
- cto/evals/runners/run-codex-cli.sh emits this report from the detected local state
|
|
codex_available: true
|
|
- eval_id: webui-cto-runner-available
|
|
status: pass
|
|
evidence:
|
|
- cto/evals/runners/run-webui-cto.sh
|
|
- cto/evals/runners/run-local-regression.py
|
|
- eval_id: codex-read-only-ab-smoke
|
|
status: pass
|
|
evidence:
|
|
- Codex exec read cto/evals/manifest.yaml in read-only sandbox mode
|
|
- Codex output matched local manifest ground truth for fixture_count and promotion
|
|
thresholds
|
|
- cto/evals/artifacts/2026-05-25-codex-ab-smoke.jsonl
|
|
- cto/evals/artifacts/2026-05-25-codex-ab-smoke-last-message.txt
|
|
- cto/evals/artifacts/2026-05-25-codex-ab-smoke-local.json
|
|
codex_command: /home/svrnty/.nvm/versions/node/v20.19.5/bin/codex -a never exec
|
|
--json --sandbox read-only -C /home/svrnty/workspaces/hermes
|
|
result_match: true
|
|
notes:
|
|
- Codex CLI is installed (codex-cli 0.133.0), but the full comparative parity suite
|
|
still requires the two-run benchmark gate.
|
|
- A read-only Codex A/B smoke was executed successfully; it is not the required two-run
|
|
parity suite.
|
|
- This report proves the comparative runner surface and the exact local blocker when
|
|
present; it is not a parity pass.
|