diff --git a/WORKBOARD.yaml b/WORKBOARD.yaml index 6e2e4d2..a34caaa 100644 --- a/WORKBOARD.yaml +++ b/WORKBOARD.yaml @@ -31,6 +31,6 @@ items: owner: "" - id: SVRNTY-VISION-WORK-009 title: BTE-WORK-038 ComfyUI GX10 Restore Proof - status: blocked-on-gx10-cuda-initialization + status: blocked-on-gx10-gpu-visibility source: docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.md owner: jp diff --git a/docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json b/docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json new file mode 100644 index 0000000..c87099d --- /dev/null +++ b/docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json @@ -0,0 +1,99 @@ +{ + "schema": "svrnty-vision.bte-work-038-gx10-cuda-restore-proof.v1", + "timestamp": "2026-06-20T16:29:03Z", + "work_item_id": "SVRNTY-VISION-WORK-009", + "route": "svrnty-vision", + "status": "blocked_on_gx10_gpu_visibility_no_comfyui_start_no_bte_retry", + "approval_ref": "APPROVED: BTE-WORK-038 GX10 CUDA/ComfyUI restore and single third retry", + "authority_effect": "gx10_cuda_user_space_restore_attempted", + "product_ready_claim": false, + "provider_gateway_health": { + "route_identity": "svrnty-vision", + "url_ref": "http://localhost:8092/healthz", + "http_status": 200, + "body_status": "ok", + "version": "0.1.0" + }, + "provider_gateway_health_from_bte_runtime_namespace": { + "route_identity": "svrnty-vision", + "url_ref": "http://172.20.0.1:8092/healthz", + "http_status": 200, + "body_status": "ok", + "version": "0.1.0" + }, + "local_gx10_cuda_before_restore": { + "route_identity": "local-gx10-cuda", + "host": "gx10-f38f", + "ssh_target": "svrnty@100.90.100.10", + "nvidia_smi_result": "no_devices_found", + "torch_cuda_available": false, + "torch_cuda_device_count": 0, + 
"nvidia_device_nodes_present": true, + "nvidia_kernel_modules_loaded": true, + "nvidia_persistenced_active": true, + "nvidia_persistenced_compact_failure": "device 000f:01:00.0 registered then failed to open", + "sudo_noninteractive_available": false, + "failure_class": "gx10_gpu_registered_but_not_openable" + }, + "local_gx10_cuda_restore_attempt": { + "route_identity": "local-gx10-cuda", + "method": "nvidia-modprobe -u -c=0", + "attempted": true, + "nvidia_modprobe_setuid_root": true, + "nvidia_smi_after_restore": "no_devices_found", + "torch_cuda_available_after_restore": false, + "torch_cuda_device_count_after_restore": 0, + "failure_class": "user_space_modprobe_did_not_restore_cuda", + "raw_log_stored_in_proof": false + }, + "local_comfyui_gx10_health_after_cuda_attempt": { + "route_identity": "local-comfyui-gx10", + "host": "gx10-f38f", + "url_ref": "http://100.90.100.10:8188", + "system_stats_url_ref": "http://100.90.100.10:8188/system_stats", + "http_status": 0, + "reachability_method": "curl_get_system_stats_after_cuda_restore_attempt", + "failure_class": "connection_refused", + "comfyui_start_attempted_this_slice": false, + "reason": "CUDA health gate failed before ComfyUI start.", + "required_for_bte_third_retry": 200 + }, + "bte_retry_gate": { + "bte_third_retry_attempted": false, + "reason": "local_gx10_cuda_restore_attempt did not make CUDA visible and local_comfyui_gx10_health_after_cuda_attempt.http_status was not 200", + "prior_retry_asset_ref": "5c1eedc5-e281-4c8c-82d3-bc4d764d2111", + "prior_retry_saga_ref": "ee27c2b3-a415-47a8-8d75-7bd834f6b99e", + "approval_expired_after": "first_failed_or_blocked_cuda_comfyui_bte_live_effect" + }, + "source_refs": { + "prior_provider_restore_proof": "docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.json", + "prior_provider_restore_commit": "svrnty-vision@61a4befffbde58cf88660d9c410ad3473ddbea01", + "bte_runtime_proof": 
"../bte/docs/goal-runs/bte-sovereign-creative-supercomputer-product-ready/proof/bte-work-038-runtime-rest-provider-proof.json", + "svrnty_vision_agent_contract": "AGENTS.md", + "network_skill_reference": "/home/svrnty/.codex/skills/network/SKILL.md" + }, + "tool_effects": { + "mutated_svrnty_vision_files": true, + "network": true, + "provider_gateway_health_checked": true, + "nvidia_modprobe_attempted": true, + "remote_comfyui_start_attempted": false, + "remote_comfyui_left_running": false, + "bte_rest_call": false, + "provider_call": false, + "local_generation_call": false, + "mcp_registration": false, + "profile_exposure_change": false, + "core_mutation": false, + "bte_repo_mutation_from_this_packet": false, + "archive_delete_execution": false, + "raw_payload_storage": false, + "product_ready_claim": false, + "release_claim": false, + "production_readiness_claim": false + }, + "validation": { + "validator": "tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py", + "route_validator": "tools/validate_svrnty_vision_child.py" + } +} diff --git a/docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.md b/docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.md new file mode 100644 index 0000000..801b3c8 --- /dev/null +++ b/docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.md @@ -0,0 +1,43 @@ +--- +type: provider-route-proof +id: SVRNTY-VISION-WORK-009 +status: blocked-on-gx10-gpu-visibility +machine_contract: svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json +--- + +# Svrnty Vision BTE-WORK-038 GX10 CUDA Restore Proof + +This packet records the provider-route side of the approved `BTE-WORK-038` +GX10 CUDA/ComfyUI restore retry. It is compact proof metadata only. + +## Result + +- `svrnty-vision` health was HTTP 200 from the host path. 
+- `svrnty-vision` health was HTTP 200 from the BTE runtime namespace path.
+- `gx10-f38f` had NVIDIA device nodes and kernel modules loaded.
+- `nvidia-persistenced` was active but had registered the GPU and failed to
+  open it.
+- `nvidia-smi` reported no devices before and after `nvidia-modprobe -u -c=0`.
+- Torch CUDA remained unavailable with zero CUDA devices.
+- ComfyUI was not started because the CUDA health gate failed first.
+- The BTE third retry was not attempted.
+
+## Blocker
+
+The blocker is GX10 GPU visibility to the NVIDIA userspace stack. The compact
+error class is `user_space_modprobe_did_not_restore_cuda`.
+
+## Payload Posture
+
+No secrets were read.
+No raw prompt was stored.
+No raw provider payload was stored.
+No generated binary was stored in proof.
+No BTE REST creative call was made.
+No provider generation call was made.
+No Product Ready claim was made.
+
+## Validation
+
+- `python3 tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py`
+- `python3 tools/validate_svrnty_vision_child.py`
diff --git a/tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py b/tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py
new file mode 100755
index 0000000..11f5f87
--- /dev/null
+++ b/tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""Validate the Svrnty Vision BTE-WORK-038 GX10 CUDA restore proof.
+
+Loads the proof JSON and its Markdown report, checks each recorded field against
+the expected blocked outcome, prints a JSON result, and exits 0 only when no
+check failed.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+ROOT = Path(__file__).resolve().parents[1]
+PROOF = (
+    ROOT
+    / "docs"
+    / "goal-runs"
+    / "bte-work-038-comfyui-gx10-restore-proof"
+    / "svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json"
+)
+REPORT = PROOF.with_suffix(".md")
+APPROVAL_REF = "APPROVED: BTE-WORK-038 GX10 CUDA/ComfyUI restore and single third retry"
+
+
+def require(condition: bool, errors: list[str], code: str) -> None:
+    """Append ``code`` to ``errors`` when ``condition`` is false."""
+    if not condition:
+        errors.append(code)
+
+
+def is_int(value: Any, expected: int) -> bool:
+    """Return True only when ``value`` is a real int equal to ``expected``.
+
+    ``bool`` subclasses ``int`` and ``False == 0``, so a plain ``==`` would let a
+    JSON ``false`` pass as HTTP status 0 or as a zero device count.
+    """
+    return type(value) is int and value == expected
+
+
+def section(proof: dict[str, Any], key: str, errors: list[str]) -> dict[str, Any]:
+    """Return the object stored under ``key``, or ``{}`` when it is unusable.
+
+    A missing key returns ``{}`` silently so the per-field checks report it. A
+    value that is not an object records ``<key>:not_object`` rather than letting
+    the later ``.get`` calls raise ``AttributeError``.
+    """
+    if key not in proof:
+        return {}
+    value = proof[key]
+    if not isinstance(value, dict):
+        errors.append(f"{key}:not_object")
+        return {}
+    return value
+
+
+def load_text(path: Path, errors: list[str]) -> str:
+    """Return the UTF-8 text of ``path``, or an empty string when it cannot be used.
+
+    A missing file is not recorded here; callers report absence themselves.
+    """
+    if not path.exists():
+        return ""
+    try:
+        return path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        errors.append(f"{path.name}:unreadable:{type(exc).__name__}")
+        return ""
+
+
+def load_json(path: Path, errors: list[str]) -> dict[str, Any]:
+    """Load ``path`` as a JSON object, recording an error code on any failure.
+
+    Returns ``{}`` when the file is missing, unreadable, not valid JSON, or its
+    top-level value is not an object.
+    """
+    if not path.exists():
+        errors.append(f"missing:{path.relative_to(ROOT)}")
+        return {}
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        errors.append(f"{path.name}:invalid_json:{exc.lineno}:{exc.colno}")
+        return {}
+    except (OSError, UnicodeDecodeError) as exc:
+        # Report an unreadable file like any other failed check, not a traceback.
+        errors.append(f"{path.name}:unreadable:{type(exc).__name__}")
+        return {}
+    if not isinstance(payload, dict):
+        errors.append(f"{path.name}:not_object")
+        return {}
+    return payload
+
+
+def main() -> int:
+    """Run every proof and report check; return 0 when all pass, else 1."""
+    errors: list[str] = []
+    proof = load_json(PROOF, errors)
+    report = load_text(REPORT, errors)
+
+    require(REPORT.exists(), errors, "report:missing")
+    require(proof.get("schema") == "svrnty-vision.bte-work-038-gx10-cuda-restore-proof.v1", errors, "schema")
+    require(proof.get("work_item_id") == "SVRNTY-VISION-WORK-009", errors, "work_item_id")
+    require(proof.get("route") == "svrnty-vision", errors, "route")
+    require(proof.get("status") == "blocked_on_gx10_gpu_visibility_no_comfyui_start_no_bte_retry", errors, "status")
+    require(proof.get("approval_ref") == APPROVAL_REF, errors, "approval_ref")
+    require(proof.get("product_ready_claim") is False, errors, "product_ready_claim")
+
+    gateway = section(proof, "provider_gateway_health", errors)
+    require(is_int(gateway.get("http_status"), 200), errors, "gateway:http_status")
+    require(gateway.get("body_status") == "ok", errors, "gateway:body_status")
+    bte_namespace = section(proof, "provider_gateway_health_from_bte_runtime_namespace", errors)
+    require(is_int(bte_namespace.get("http_status"), 200), errors, "bte_namespace:http_status")
+    require(bte_namespace.get("body_status") == "ok", errors, "bte_namespace:body_status")
+
+    before = section(proof, "local_gx10_cuda_before_restore", errors)
+    require(before.get("nvidia_smi_result") == "no_devices_found", errors, "before:nvidia_smi")
+    require(before.get("torch_cuda_available") is False, errors, "before:torch_available")
+    require(is_int(before.get("torch_cuda_device_count"), 0), errors, "before:torch_count")
+    require(before.get("nvidia_device_nodes_present") is True, errors, "before:device_nodes")
+    require(before.get("nvidia_kernel_modules_loaded") is True, errors, "before:kernel_modules")
+    require(before.get("sudo_noninteractive_available") is False, errors, "before:sudo")
+    require(before.get("failure_class") == "gx10_gpu_registered_but_not_openable", errors, "before:failure_class")
+
+    restore = section(proof, "local_gx10_cuda_restore_attempt", errors)
+    require(restore.get("method") == "nvidia-modprobe -u -c=0", errors, "restore:method")
+    require(restore.get("attempted") is True, errors, "restore:attempted")
+    require(restore.get("nvidia_modprobe_setuid_root") is True, errors, "restore:setuid")
+    require(restore.get("nvidia_smi_after_restore") == "no_devices_found", errors, "restore:nvidia_smi")
+    require(restore.get("torch_cuda_available_after_restore") is False, errors, "restore:torch_available")
+    require(is_int(restore.get("torch_cuda_device_count_after_restore"), 0), errors, "restore:torch_count")
+    require(restore.get("failure_class") == "user_space_modprobe_did_not_restore_cuda", errors, "restore:failure_class")
+    require(restore.get("raw_log_stored_in_proof") is False, errors, "restore:raw_log")
+
+    comfy = section(proof, "local_comfyui_gx10_health_after_cuda_attempt", errors)
+    require(is_int(comfy.get("http_status"), 0), errors, "comfy:http_status")
+    require(comfy.get("failure_class") == "connection_refused", errors, "comfy:failure_class")
+    require(comfy.get("comfyui_start_attempted_this_slice") is False, errors, "comfy:not_started")
+    require(is_int(comfy.get("required_for_bte_third_retry"), 200), errors, "comfy:required")
+
+    retry = section(proof, "bte_retry_gate", errors)
+    require(retry.get("bte_third_retry_attempted") is False, errors, "retry:attempted")
+    require(retry.get("prior_retry_asset_ref") == "5c1eedc5-e281-4c8c-82d3-bc4d764d2111", errors, "retry:asset")
+    require(retry.get("prior_retry_saga_ref") == "ee27c2b3-a415-47a8-8d75-7bd834f6b99e", errors, "retry:saga")
+    require(retry.get("approval_expired_after") == "first_failed_or_blocked_cuda_comfyui_bte_live_effect", errors, "retry:expiry")
+
+    effects = section(proof, "tool_effects", errors)
+    require(effects.get("nvidia_modprobe_attempted") is True, errors, "effects:nvidia_modprobe")
+    require(effects.get("remote_comfyui_start_attempted") is False, errors, "effects:comfy_start")
+    for key in (
+        "bte_rest_call",
+        "provider_call",
+        "local_generation_call",
+        "mcp_registration",
+        "profile_exposure_change",
+        "core_mutation",
+        "archive_delete_execution",
+        "raw_payload_storage",
+        "product_ready_claim",
+        "release_claim",
+        "production_readiness_claim",
+    ):
+        require(effects.get(key) is False, errors, f"effects:{key}")
+
+    for snippet in (
+        "GX10 CUDA Restore Proof",
+        "nvidia-smi` reported no devices",
+        "Torch CUDA remained unavailable",
+        "ComfyUI was not started",
+        "BTE third retry was not attempted",
+        "No raw provider payload was stored.",
+        "No Product Ready claim was made.",
+    ):
+        require(snippet in report, errors, f"report:missing:{snippet}")
+
+    result = {
+        "ok": not errors,
+        "validator": "svrnty-vision-bte-work-038-gx10-cuda-restore-v1",
+        "checked": [str(PROOF.relative_to(ROOT)), str(REPORT.relative_to(ROOT))],
+        "errors": errors,
+        "warnings": [],
+    }
+    print(json.dumps(result, indent=2, sort_keys=True))
+    return 0 if result["ok"] else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/validate_svrnty_vision_child.py b/tools/validate_svrnty_vision_child.py
index 6ae0dfe..de0e66d 100755
--- a/tools/validate_svrnty_vision_child.py
+++ b/tools/validate_svrnty_vision_child.py
@@ -46,6 +46,10 @@ OPTIONAL_PROOF_VALIDATORS = [
         "docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.json",
         "tools/validate_svrnty_vision_bte_work_038_comfyui_gx10_restore.py",
     ),
+    (
+        "docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json",
+        "tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py",
+    ),
 ]