Record BTE GX10 CUDA restore blocker

This commit is contained in:
Svrnty
2026-06-20 12:31:15 -04:00
parent 61a4befffb
commit 715ca77bf5
5 changed files with 282 additions and 1 deletions
+1 -1
View File
@@ -31,6 +31,6 @@ items:
owner: ""
- id: SVRNTY-VISION-WORK-009
title: BTE-WORK-038 ComfyUI GX10 Restore Proof
status: blocked-on-gx10-cuda-initialization
status: blocked-on-gx10-gpu-visibility
source: docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.md
owner: jp
@@ -0,0 +1,99 @@
{
"schema": "svrnty-vision.bte-work-038-gx10-cuda-restore-proof.v1",
"timestamp": "2026-06-20T16:29:03Z",
"work_item_id": "SVRNTY-VISION-WORK-009",
"route": "svrnty-vision",
"status": "blocked_on_gx10_gpu_visibility_no_comfyui_start_no_bte_retry",
"approval_ref": "APPROVED: BTE-WORK-038 GX10 CUDA/ComfyUI restore and single third retry",
"authority_effect": "gx10_cuda_user_space_restore_attempted",
"product_ready_claim": false,
"provider_gateway_health": {
"route_identity": "svrnty-vision",
"url_ref": "http://localhost:8092/healthz",
"http_status": 200,
"body_status": "ok",
"version": "0.1.0"
},
"provider_gateway_health_from_bte_runtime_namespace": {
"route_identity": "svrnty-vision",
"url_ref": "http://172.20.0.1:8092/healthz",
"http_status": 200,
"body_status": "ok",
"version": "0.1.0"
},
"local_gx10_cuda_before_restore": {
"route_identity": "local-gx10-cuda",
"host": "gx10-f38f",
"ssh_target": "svrnty@100.90.100.10",
"nvidia_smi_result": "no_devices_found",
"torch_cuda_available": false,
"torch_cuda_device_count": 0,
"nvidia_device_nodes_present": true,
"nvidia_kernel_modules_loaded": true,
"nvidia_persistenced_active": true,
"nvidia_persistenced_compact_failure": "device 000f:01:00.0 registered then failed to open",
"sudo_noninteractive_available": false,
"failure_class": "gx10_gpu_registered_but_not_openable"
},
"local_gx10_cuda_restore_attempt": {
"route_identity": "local-gx10-cuda",
"method": "nvidia-modprobe -u -c=0",
"attempted": true,
"nvidia_modprobe_setuid_root": true,
"nvidia_smi_after_restore": "no_devices_found",
"torch_cuda_available_after_restore": false,
"torch_cuda_device_count_after_restore": 0,
"failure_class": "user_space_modprobe_did_not_restore_cuda",
"raw_log_stored_in_proof": false
},
"local_comfyui_gx10_health_after_cuda_attempt": {
"route_identity": "local-comfyui-gx10",
"host": "gx10-f38f",
"url_ref": "http://100.90.100.10:8188",
"system_stats_url_ref": "http://100.90.100.10:8188/system_stats",
"http_status": 0,
"reachability_method": "curl_get_system_stats_after_cuda_restore_attempt",
"failure_class": "connection_refused",
"comfyui_start_attempted_this_slice": false,
"reason": "CUDA health gate failed before ComfyUI start.",
"required_for_bte_third_retry": 200
},
"bte_retry_gate": {
"bte_third_retry_attempted": false,
"reason": "local_gx10_cuda_restore_attempt did not make CUDA visible and local_comfyui_gx10_health_after_cuda_attempt.http_status was not 200",
"prior_retry_asset_ref": "5c1eedc5-e281-4c8c-82d3-bc4d764d2111",
"prior_retry_saga_ref": "ee27c2b3-a415-47a8-8d75-7bd834f6b99e",
"approval_expired_after": "first_failed_or_blocked_cuda_comfyui_bte_live_effect"
},
"source_refs": {
"prior_provider_restore_proof": "docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.json",
"prior_provider_restore_commit": "svrnty-vision@61a4befffbde58cf88660d9c410ad3473ddbea01",
"bte_runtime_proof": "../bte/docs/goal-runs/bte-sovereign-creative-supercomputer-product-ready/proof/bte-work-038-runtime-rest-provider-proof.json",
"svrnty_vision_agent_contract": "AGENTS.md",
"network_skill_reference": "/home/svrnty/.codex/skills/network/SKILL.md"
},
"tool_effects": {
"mutated_svrnty_vision_files": true,
"network": true,
"provider_gateway_health_checked": true,
"nvidia_modprobe_attempted": true,
"remote_comfyui_start_attempted": false,
"remote_comfyui_left_running": false,
"bte_rest_call": false,
"provider_call": false,
"local_generation_call": false,
"mcp_registration": false,
"profile_exposure_change": false,
"core_mutation": false,
"bte_repo_mutation_from_this_packet": false,
"archive_delete_execution": false,
"raw_payload_storage": false,
"product_ready_claim": false,
"release_claim": false,
"production_readiness_claim": false
},
"validation": {
"validator": "tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py",
"route_validator": "tools/validate_svrnty_vision_child.py"
}
}
@@ -0,0 +1,43 @@
---
type: provider-route-proof
id: SVRNTY-VISION-WORK-009
status: blocked-on-gx10-gpu-visibility
machine_contract: svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json
---
# Svrnty Vision BTE-WORK-038 GX10 CUDA Restore Proof
This packet records the provider-route side of the approved `BTE-WORK-038`
GX10 CUDA/ComfyUI restore retry. It is compact proof metadata only.
## Result
- `svrnty-vision` health was HTTP 200 from the host path.
- `svrnty-vision` health was HTTP 200 from the BTE runtime namespace path.
- `gx10-f38f` had NVIDIA device nodes and kernel modules loaded.
- `nvidia-persistenced` was active, but it registered the GPU and then failed
  to open it.
- `nvidia-smi` reported no devices before and after `nvidia-modprobe -u -c=0`.
- Torch CUDA remained unavailable with zero CUDA devices.
- ComfyUI was not started because the CUDA health gate failed first.
- The BTE third retry was not attempted.
## Blocker
The blocker is GX10 GPU visibility to the NVIDIA userspace stack. The compact
error class is `user_space_modprobe_did_not_restore_cuda`.
## Payload Posture
No secrets were read.
No raw prompt was stored.
No raw provider payload was stored.
No generated binary was stored in proof.
No BTE REST creative call was made.
No provider generation call was made.
No Product Ready claim was made.
## Validation
- `python3 tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py`
- `python3 tools/validate_svrnty_vision_child.py`
@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""Validate the Svrnty Vision BTE-WORK-038 GX10 CUDA restore proof."""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
# Two levels up from this file: the parent of the directory holding this script.
ROOT = Path(__file__).resolve().parents[1]
# Machine-readable proof packet that this validator checks.
PROOF = ROOT.joinpath(
    "docs",
    "goal-runs",
    "bte-work-038-comfyui-gx10-restore-proof",
    "svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json",
)
# Markdown report stored beside the proof: same path with a ``.md`` suffix.
REPORT = PROOF.with_suffix(".md")
# Exact string the proof's ``approval_ref`` field is compared against.
APPROVAL_REF = "APPROVED: BTE-WORK-038 GX10 CUDA/ComfyUI restore and single third retry"
def require(condition: bool, errors: list[str], code: str) -> None:
    """Record *code* in *errors* when *condition* does not hold.

    Args:
        condition: Result of a single validation check.
        errors: Accumulator of failure codes; mutated in place.
        code: Identifier appended to *errors* when the check fails.
    """
    if condition:
        return
    errors.append(code)
def load_json(path: Path, errors: list[str]) -> dict[str, Any]:
    """Load *path* as a JSON object, recording a failure code on any problem.

    Args:
        path: File to read; decoded as UTF-8.
        errors: Accumulator of failure codes; mutated in place.

    Returns:
        The parsed object, or an empty dict when the file is missing,
        unreadable, not valid JSON, or its top-level value is not an object.
        In each failure case exactly one code is appended to *errors*.
    """
    if not path.exists():
        errors.append(f"missing:{path.relative_to(ROOT)}")
        return {}
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # An unreadable or non-UTF-8 file is a validation failure, not a
        # reason to abort the validator with a traceback.
        errors.append(f"{path.name}:unreadable:{type(exc).__name__}")
        return {}
    try:
        payload = json.loads(text)
    except json.JSONDecodeError as exc:
        errors.append(f"{path.name}:invalid_json:{exc.lineno}:{exc.colno}")
        return {}
    if not isinstance(payload, dict):
        errors.append(f"{path.name}:not_object")
        return {}
    return payload
def _section(proof: dict[str, Any], key: str, errors: list[str]) -> dict[str, Any]:
    """Return the nested object stored under *key*, or ``{}`` if unusable.

    A missing key yields ``{}`` without recording anything, so the per-field
    checks report the failures. A value that is present but is not an object
    records ``<key>:not_object`` instead of letting a later ``.get`` call
    raise ``AttributeError``.
    """
    value = proof.get(key, {})
    if isinstance(value, dict):
        return value
    errors.append(f"{key}:not_object")
    return {}


def main() -> int:
    """Validate the proof packet and its report, then print a JSON result.

    Loads ``PROOF``, reads ``REPORT`` when it exists, compares each expected
    field against a fixed value, and checks that the report contains a set
    of required snippets. The result object (``ok``, ``validator``,
    ``checked``, ``errors``, ``warnings``) is printed to stdout as JSON.

    Returns:
        0 when no failure code was recorded, otherwise 1.
    """
    errors: list[str] = []
    proof = load_json(PROOF, errors)
    report = REPORT.read_text(encoding="utf-8") if REPORT.exists() else ""
    require(REPORT.exists(), errors, "report:missing")

    # Top-level identity fields.
    require(proof.get("schema") == "svrnty-vision.bte-work-038-gx10-cuda-restore-proof.v1", errors, "schema")
    require(proof.get("work_item_id") == "SVRNTY-VISION-WORK-009", errors, "work_item_id")
    require(proof.get("route") == "svrnty-vision", errors, "route")
    require(proof.get("status") == "blocked_on_gx10_gpu_visibility_no_comfyui_start_no_bte_retry", errors, "status")
    require(proof.get("approval_ref") == APPROVAL_REF, errors, "approval_ref")
    require(proof.get("product_ready_claim") is False, errors, "product_ready_claim")

    gateway = _section(proof, "provider_gateway_health", errors)
    require(gateway.get("http_status") == 200, errors, "gateway:http_status")
    require(gateway.get("body_status") == "ok", errors, "gateway:body_status")

    bte_namespace = _section(proof, "provider_gateway_health_from_bte_runtime_namespace", errors)
    require(bte_namespace.get("http_status") == 200, errors, "bte_namespace:http_status")
    require(bte_namespace.get("body_status") == "ok", errors, "bte_namespace:body_status")

    before = _section(proof, "local_gx10_cuda_before_restore", errors)
    require(before.get("nvidia_smi_result") == "no_devices_found", errors, "before:nvidia_smi")
    require(before.get("torch_cuda_available") is False, errors, "before:torch_available")
    require(before.get("torch_cuda_device_count") == 0, errors, "before:torch_count")
    require(before.get("nvidia_device_nodes_present") is True, errors, "before:device_nodes")
    require(before.get("nvidia_kernel_modules_loaded") is True, errors, "before:kernel_modules")
    require(before.get("sudo_noninteractive_available") is False, errors, "before:sudo")
    require(before.get("failure_class") == "gx10_gpu_registered_but_not_openable", errors, "before:failure_class")

    restore = _section(proof, "local_gx10_cuda_restore_attempt", errors)
    require(restore.get("method") == "nvidia-modprobe -u -c=0", errors, "restore:method")
    require(restore.get("attempted") is True, errors, "restore:attempted")
    require(restore.get("nvidia_modprobe_setuid_root") is True, errors, "restore:setuid")
    require(restore.get("nvidia_smi_after_restore") == "no_devices_found", errors, "restore:nvidia_smi")
    require(restore.get("torch_cuda_available_after_restore") is False, errors, "restore:torch_available")
    require(restore.get("torch_cuda_device_count_after_restore") == 0, errors, "restore:torch_count")
    require(restore.get("failure_class") == "user_space_modprobe_did_not_restore_cuda", errors, "restore:failure_class")
    require(restore.get("raw_log_stored_in_proof") is False, errors, "restore:raw_log")

    comfy = _section(proof, "local_comfyui_gx10_health_after_cuda_attempt", errors)
    require(comfy.get("http_status") == 0, errors, "comfy:http_status")
    require(comfy.get("failure_class") == "connection_refused", errors, "comfy:failure_class")
    require(comfy.get("comfyui_start_attempted_this_slice") is False, errors, "comfy:not_started")
    require(comfy.get("required_for_bte_third_retry") == 200, errors, "comfy:required")

    retry = _section(proof, "bte_retry_gate", errors)
    require(retry.get("bte_third_retry_attempted") is False, errors, "retry:attempted")
    require(retry.get("prior_retry_asset_ref") == "5c1eedc5-e281-4c8c-82d3-bc4d764d2111", errors, "retry:asset")
    require(retry.get("prior_retry_saga_ref") == "ee27c2b3-a415-47a8-8d75-7bd834f6b99e", errors, "retry:saga")
    require(retry.get("approval_expired_after") == "first_failed_or_blocked_cuda_comfyui_bte_live_effect", errors, "retry:expiry")

    effects = _section(proof, "tool_effects", errors)
    require(effects.get("nvidia_modprobe_attempted") is True, errors, "effects:nvidia_modprobe")
    require(effects.get("remote_comfyui_start_attempted") is False, errors, "effects:comfy_start")
    # Every one of these effect flags must be explicitly false in the proof.
    for key in (
        "bte_rest_call",
        "provider_call",
        "local_generation_call",
        "mcp_registration",
        "profile_exposure_change",
        "core_mutation",
        "archive_delete_execution",
        "raw_payload_storage",
        "product_ready_claim",
        "release_claim",
        "production_readiness_claim",
    ):
        require(effects.get(key) is False, errors, f"effects:{key}")

    # The Markdown report must contain each of these exact substrings.
    for snippet in (
        "GX10 CUDA Restore Proof",
        "nvidia-smi` reported no devices",
        "Torch CUDA remained unavailable",
        "ComfyUI was not started",
        "BTE third retry was not attempted",
        "No raw provider payload was stored.",
        "No Product Ready claim was made.",
    ):
        require(snippet in report, errors, f"report:missing:{snippet}")

    result = {
        "ok": not errors,
        "validator": "svrnty-vision-bte-work-038-gx10-cuda-restore-v1",
        "checked": [str(PROOF.relative_to(ROOT)), str(REPORT.relative_to(ROOT))],
        "errors": errors,
        "warnings": [],
    }
    print(json.dumps(result, indent=2, sort_keys=True))
    return 0 if result["ok"] else 1
# Script entry point: run the validator and exit with its return code.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)
+4
View File
@@ -46,6 +46,10 @@ OPTIONAL_PROOF_VALIDATORS = [
"docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.json",
"tools/validate_svrnty_vision_bte_work_038_comfyui_gx10_restore.py",
),
(
"docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json",
"tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py",
),
]