Record BTE GX10 CUDA restore blocker
This commit is contained in:
+1
-1
@@ -31,6 +31,6 @@ items:
|
||||
owner: ""
|
||||
- id: SVRNTY-VISION-WORK-009
|
||||
title: BTE-WORK-038 ComfyUI GX10 Restore Proof
|
||||
status: blocked-on-gx10-cuda-initialization
|
||||
status: blocked-on-gx10-gpu-visibility
|
||||
source: docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.md
|
||||
owner: jp
|
||||
|
||||
+99
@@ -0,0 +1,99 @@
|
||||
{
|
||||
"schema": "svrnty-vision.bte-work-038-gx10-cuda-restore-proof.v1",
|
||||
"timestamp": "2026-06-20T16:29:03Z",
|
||||
"work_item_id": "SVRNTY-VISION-WORK-009",
|
||||
"route": "svrnty-vision",
|
||||
"status": "blocked_on_gx10_gpu_visibility_no_comfyui_start_no_bte_retry",
|
||||
"approval_ref": "APPROVED: BTE-WORK-038 GX10 CUDA/ComfyUI restore and single third retry",
|
||||
"authority_effect": "gx10_cuda_user_space_restore_attempted",
|
||||
"product_ready_claim": false,
|
||||
"provider_gateway_health": {
|
||||
"route_identity": "svrnty-vision",
|
||||
"url_ref": "http://localhost:8092/healthz",
|
||||
"http_status": 200,
|
||||
"body_status": "ok",
|
||||
"version": "0.1.0"
|
||||
},
|
||||
"provider_gateway_health_from_bte_runtime_namespace": {
|
||||
"route_identity": "svrnty-vision",
|
||||
"url_ref": "http://172.20.0.1:8092/healthz",
|
||||
"http_status": 200,
|
||||
"body_status": "ok",
|
||||
"version": "0.1.0"
|
||||
},
|
||||
"local_gx10_cuda_before_restore": {
|
||||
"route_identity": "local-gx10-cuda",
|
||||
"host": "gx10-f38f",
|
||||
"ssh_target": "svrnty@100.90.100.10",
|
||||
"nvidia_smi_result": "no_devices_found",
|
||||
"torch_cuda_available": false,
|
||||
"torch_cuda_device_count": 0,
|
||||
"nvidia_device_nodes_present": true,
|
||||
"nvidia_kernel_modules_loaded": true,
|
||||
"nvidia_persistenced_active": true,
|
||||
"nvidia_persistenced_compact_failure": "device 000f:01:00.0 registered then failed to open",
|
||||
"sudo_noninteractive_available": false,
|
||||
"failure_class": "gx10_gpu_registered_but_not_openable"
|
||||
},
|
||||
"local_gx10_cuda_restore_attempt": {
|
||||
"route_identity": "local-gx10-cuda",
|
||||
"method": "nvidia-modprobe -u -c=0",
|
||||
"attempted": true,
|
||||
"nvidia_modprobe_setuid_root": true,
|
||||
"nvidia_smi_after_restore": "no_devices_found",
|
||||
"torch_cuda_available_after_restore": false,
|
||||
"torch_cuda_device_count_after_restore": 0,
|
||||
"failure_class": "user_space_modprobe_did_not_restore_cuda",
|
||||
"raw_log_stored_in_proof": false
|
||||
},
|
||||
"local_comfyui_gx10_health_after_cuda_attempt": {
|
||||
"route_identity": "local-comfyui-gx10",
|
||||
"host": "gx10-f38f",
|
||||
"url_ref": "http://100.90.100.10:8188",
|
||||
"system_stats_url_ref": "http://100.90.100.10:8188/system_stats",
|
||||
"http_status": 0,
|
||||
"reachability_method": "curl_get_system_stats_after_cuda_restore_attempt",
|
||||
"failure_class": "connection_refused",
|
||||
"comfyui_start_attempted_this_slice": false,
|
||||
"reason": "CUDA health gate failed before ComfyUI start.",
|
||||
"required_for_bte_third_retry": 200
|
||||
},
|
||||
"bte_retry_gate": {
|
||||
"bte_third_retry_attempted": false,
|
||||
"reason": "local_gx10_cuda_restore_attempt did not make CUDA visible and local_comfyui_gx10_health_after_cuda_attempt.http_status was not 200",
|
||||
"prior_retry_asset_ref": "5c1eedc5-e281-4c8c-82d3-bc4d764d2111",
|
||||
"prior_retry_saga_ref": "ee27c2b3-a415-47a8-8d75-7bd834f6b99e",
|
||||
"approval_expired_after": "first_failed_or_blocked_cuda_comfyui_bte_live_effect"
|
||||
},
|
||||
"source_refs": {
|
||||
"prior_provider_restore_proof": "docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.json",
|
||||
"prior_provider_restore_commit": "svrnty-vision@61a4befffbde58cf88660d9c410ad3473ddbea01",
|
||||
"bte_runtime_proof": "../bte/docs/goal-runs/bte-sovereign-creative-supercomputer-product-ready/proof/bte-work-038-runtime-rest-provider-proof.json",
|
||||
"svrnty_vision_agent_contract": "AGENTS.md",
|
||||
"network_skill_reference": "/home/svrnty/.codex/skills/network/SKILL.md"
|
||||
},
|
||||
"tool_effects": {
|
||||
"mutated_svrnty_vision_files": true,
|
||||
"network": true,
|
||||
"provider_gateway_health_checked": true,
|
||||
"nvidia_modprobe_attempted": true,
|
||||
"remote_comfyui_start_attempted": false,
|
||||
"remote_comfyui_left_running": false,
|
||||
"bte_rest_call": false,
|
||||
"provider_call": false,
|
||||
"local_generation_call": false,
|
||||
"mcp_registration": false,
|
||||
"profile_exposure_change": false,
|
||||
"core_mutation": false,
|
||||
"bte_repo_mutation_from_this_packet": false,
|
||||
"archive_delete_execution": false,
|
||||
"raw_payload_storage": false,
|
||||
"product_ready_claim": false,
|
||||
"release_claim": false,
|
||||
"production_readiness_claim": false
|
||||
},
|
||||
"validation": {
|
||||
"validator": "tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py",
|
||||
"route_validator": "tools/validate_svrnty_vision_child.py"
|
||||
}
|
||||
}
|
||||
+43
@@ -0,0 +1,43 @@
|
||||
---
|
||||
type: provider-route-proof
|
||||
id: SVRNTY-VISION-WORK-009
|
||||
status: blocked-on-gx10-gpu-visibility
|
||||
machine_contract: svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json
|
||||
---
|
||||
|
||||
# Svrnty Vision BTE-WORK-038 GX10 CUDA Restore Proof
|
||||
|
||||
This packet records the provider-route side of the approved `BTE-WORK-038`
|
||||
GX10 CUDA/ComfyUI restore retry. It is compact proof metadata only.
|
||||
|
||||
## Result
|
||||
|
||||
- `svrnty-vision` health was HTTP 200 from the host path.
|
||||
- `svrnty-vision` health was HTTP 200 from the BTE runtime namespace path.
|
||||
- `gx10-f38f` had NVIDIA device nodes and kernel modules loaded.
|
||||
- `nvidia-persistenced` was active, but it registered the GPU and then failed to
|
||||
open it.
|
||||
- `nvidia-smi` reported no devices before and after `nvidia-modprobe -u -c=0`.
|
||||
- Torch CUDA remained unavailable with zero CUDA devices.
|
||||
- ComfyUI was not started because the CUDA health gate failed first.
|
||||
- The BTE third retry was not attempted.
|
||||
|
||||
## Blocker
|
||||
|
||||
The blocker is GX10 GPU visibility to the NVIDIA userspace stack. The compact
|
||||
error class is `user_space_modprobe_did_not_restore_cuda`.
|
||||
|
||||
## Payload Posture
|
||||
|
||||
No secrets were read.
|
||||
No raw prompt was stored.
|
||||
No raw provider payload was stored.
|
||||
No generated binary was stored in proof.
|
||||
No BTE REST creative call was made.
|
||||
No provider generation call was made.
|
||||
No Product Ready claim was made.
|
||||
|
||||
## Validation
|
||||
|
||||
- `python3 tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py`
|
||||
- `python3 tools/validate_svrnty_vision_child.py`
|
||||
@@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate the Svrnty Vision BTE-WORK-038 GX10 CUDA restore proof."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Repository root: this script lives one directory below it.
ROOT = Path(__file__).resolve().parents[1]

# Machine-readable proof checked by this validator.
PROOF = (
    ROOT
    / "docs"
    / "goal-runs"
    / "bte-work-038-comfyui-gx10-restore-proof"
    / "svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json"
)

# Human-readable report: same directory and stem as PROOF, ".md" suffix.
REPORT = PROOF.with_suffix(".md")

# Exact approval string the proof's "approval_ref" field must carry.
APPROVAL_REF = "APPROVED: BTE-WORK-038 GX10 CUDA/ComfyUI restore and single third retry"
|
||||
|
||||
|
||||
def require(condition: bool, errors: list[str], code: str) -> None:
    """Record *code* in *errors* when *condition* does not hold.

    Args:
        condition: Result of a single validation check.
        errors: Accumulator of error codes; mutated in place.
        code: Error code appended when the check fails.
    """
    if not condition:
        errors.append(code)
|
||||
|
||||
|
||||
def load_json(path: Path, errors: list[str]) -> dict[str, Any]:
    """Load *path* as a JSON object, recording any problem in *errors*.

    Args:
        path: File to read; decoded as UTF-8.
        errors: Accumulator of error codes; mutated in place.

    Returns:
        The decoded top-level JSON object. An empty dict is returned, after
        appending exactly one error code, when the file is missing, cannot be
        read or decoded, is not valid JSON, or is not an object at top level.
    """
    if not path.exists():
        errors.append(f"missing:{path.relative_to(ROOT)}")
        return {}
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # Report an unreadable file as a validation error rather than letting
        # the validator die with a traceback.
        errors.append(f"{path.name}:unreadable:{type(exc).__name__}")
        return {}
    try:
        payload = json.loads(text)
    except json.JSONDecodeError as exc:
        errors.append(f"{path.name}:invalid_json:{exc.lineno}:{exc.colno}")
        return {}
    if not isinstance(payload, dict):
        errors.append(f"{path.name}:not_object")
        return {}
    return payload
|
||||
|
||||
|
||||
def _section(proof: dict[str, Any], key: str, errors: list[str]) -> dict[str, Any]:
    """Return ``proof[key]`` when it is a JSON object, else an empty dict.

    A missing key yields an empty dict without an extra error, so the field
    checks that follow report what is absent. A present value of any other
    type appends ``"<key>:not_object"`` before the empty dict is returned.
    """
    value = proof.get(key, {})
    if isinstance(value, dict):
        return value
    errors.append(f"{key}:not_object")
    return {}


def main() -> int:
    """Validate the proof JSON and its Markdown report.

    Every failed check appends an error code; nothing stops at the first
    failure. A JSON result object is printed to stdout.

    Returns:
        0 when no errors were recorded, 1 otherwise.
    """
    errors: list[str] = []
    proof = load_json(PROOF, errors)
    report = REPORT.read_text(encoding="utf-8") if REPORT.exists() else ""

    require(REPORT.exists(), errors, "report:missing")
    require(proof.get("schema") == "svrnty-vision.bte-work-038-gx10-cuda-restore-proof.v1", errors, "schema")
    require(proof.get("work_item_id") == "SVRNTY-VISION-WORK-009", errors, "work_item_id")
    require(proof.get("route") == "svrnty-vision", errors, "route")
    require(proof.get("status") == "blocked_on_gx10_gpu_visibility_no_comfyui_start_no_bte_retry", errors, "status")
    require(proof.get("approval_ref") == APPROVAL_REF, errors, "approval_ref")
    require(proof.get("product_ready_claim") is False, errors, "product_ready_claim")

    # Sections are fetched through _section so that a value of the wrong JSON
    # type is reported as an error instead of raising AttributeError on .get().
    gateway = _section(proof, "provider_gateway_health", errors)
    require(gateway.get("http_status") == 200, errors, "gateway:http_status")
    require(gateway.get("body_status") == "ok", errors, "gateway:body_status")
    bte_namespace = _section(proof, "provider_gateway_health_from_bte_runtime_namespace", errors)
    require(bte_namespace.get("http_status") == 200, errors, "bte_namespace:http_status")
    require(bte_namespace.get("body_status") == "ok", errors, "bte_namespace:body_status")

    before = _section(proof, "local_gx10_cuda_before_restore", errors)
    require(before.get("nvidia_smi_result") == "no_devices_found", errors, "before:nvidia_smi")
    require(before.get("torch_cuda_available") is False, errors, "before:torch_available")
    require(before.get("torch_cuda_device_count") == 0, errors, "before:torch_count")
    require(before.get("nvidia_device_nodes_present") is True, errors, "before:device_nodes")
    require(before.get("nvidia_kernel_modules_loaded") is True, errors, "before:kernel_modules")
    require(before.get("sudo_noninteractive_available") is False, errors, "before:sudo")
    require(before.get("failure_class") == "gx10_gpu_registered_but_not_openable", errors, "before:failure_class")

    restore = _section(proof, "local_gx10_cuda_restore_attempt", errors)
    require(restore.get("method") == "nvidia-modprobe -u -c=0", errors, "restore:method")
    require(restore.get("attempted") is True, errors, "restore:attempted")
    require(restore.get("nvidia_modprobe_setuid_root") is True, errors, "restore:setuid")
    require(restore.get("nvidia_smi_after_restore") == "no_devices_found", errors, "restore:nvidia_smi")
    require(restore.get("torch_cuda_available_after_restore") is False, errors, "restore:torch_available")
    require(restore.get("torch_cuda_device_count_after_restore") == 0, errors, "restore:torch_count")
    require(restore.get("failure_class") == "user_space_modprobe_did_not_restore_cuda", errors, "restore:failure_class")
    require(restore.get("raw_log_stored_in_proof") is False, errors, "restore:raw_log")

    comfy = _section(proof, "local_comfyui_gx10_health_after_cuda_attempt", errors)
    require(comfy.get("http_status") == 0, errors, "comfy:http_status")
    require(comfy.get("failure_class") == "connection_refused", errors, "comfy:failure_class")
    require(comfy.get("comfyui_start_attempted_this_slice") is False, errors, "comfy:not_started")
    require(comfy.get("required_for_bte_third_retry") == 200, errors, "comfy:required")

    retry = _section(proof, "bte_retry_gate", errors)
    require(retry.get("bte_third_retry_attempted") is False, errors, "retry:attempted")
    require(retry.get("prior_retry_asset_ref") == "5c1eedc5-e281-4c8c-82d3-bc4d764d2111", errors, "retry:asset")
    require(retry.get("prior_retry_saga_ref") == "ee27c2b3-a415-47a8-8d75-7bd834f6b99e", errors, "retry:saga")
    require(retry.get("approval_expired_after") == "first_failed_or_blocked_cuda_comfyui_bte_live_effect", errors, "retry:expiry")

    effects = _section(proof, "tool_effects", errors)
    require(effects.get("nvidia_modprobe_attempted") is True, errors, "effects:nvidia_modprobe")
    require(effects.get("remote_comfyui_start_attempted") is False, errors, "effects:comfy_start")
    # Each of these effect flags must be recorded explicitly as false.
    for key in (
        "bte_rest_call",
        "provider_call",
        "local_generation_call",
        "mcp_registration",
        "profile_exposure_change",
        "core_mutation",
        "archive_delete_execution",
        "raw_payload_storage",
        "product_ready_claim",
        "release_claim",
        "production_readiness_claim",
    ):
        require(effects.get(key) is False, errors, f"effects:{key}")

    # The Markdown report must contain each of these phrases verbatim.
    for snippet in (
        "GX10 CUDA Restore Proof",
        "nvidia-smi` reported no devices",
        "Torch CUDA remained unavailable",
        "ComfyUI was not started",
        "BTE third retry was not attempted",
        "No raw provider payload was stored.",
        "No Product Ready claim was made.",
    ):
        require(snippet in report, errors, f"report:missing:{snippet}")

    result = {
        "ok": not errors,
        "validator": "svrnty-vision-bte-work-038-gx10-cuda-restore-v1",
        "checked": [str(PROOF.relative_to(ROOT)), str(REPORT.relative_to(ROOT))],
        "errors": errors,
        "warnings": [],
    }
    print(json.dumps(result, indent=2, sort_keys=True))
    return 0 if result["ok"] else 1
|
||||
|
||||
|
||||
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -46,6 +46,10 @@ OPTIONAL_PROOF_VALIDATORS = [
|
||||
"docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-comfyui-gx10-restore-proof.json",
|
||||
"tools/validate_svrnty_vision_bte_work_038_comfyui_gx10_restore.py",
|
||||
),
|
||||
(
|
||||
"docs/goal-runs/bte-work-038-comfyui-gx10-restore-proof/svrnty-vision-work-009-gx10-cuda-restore-retry-proof.json",
|
||||
"tools/validate_svrnty_vision_bte_work_038_gx10_cuda_restore.py",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user