feat: port VLM + FLUX from BTE (Phase 4b)
VLM router (POST /vlm/analyze):
- Proxies to Spark 2 (Qwen3-VL via vLLM, OpenAI-compatible /v1/chat/completions)
- Port of BTE Svrnty.Bte.Domain/Features/AssetContext/OpenAiVlmClient.cs
+ VlmRubric.cs (rubric prompt builder + score parser)
- Anthropic dialect intentionally dropped — sovereign-only
- New rubric_mode="raw" passes brand_context through verbatim so BTE
ExtractBrandSaga / ImageSetSourceReader (extraction-style prompts that
expect their own JSON schema) get unwrapped JSON back without losing
the score-axis path
FLUX router (POST /flux/render):
- Proxies to Spark 1 (FLUX.2-dev on ComfyUI; /prompt + /history poll + /view)
- Port of SparkBComfyClient.cs + LocalFluxImageProvider.cs + StopgapFluxWorkflow.cs
- Accepts a pre-assembled workflow_json (BTE IRecipeAssembler emits one)
or builds the stopgap FLUX.2 graph from prompt + dims
Tests (pytest):
- test_vlm_parse.py — rubric prompt + score parse, 502 on Spark-down, mocked round-trip
- test_flux_workflow.py — stopgap graph shape, seed variance/determinism, 502 on Spark-down
- test_healthz.py updated (palette/rembg still 4a stubs)
16 pytest tests green.
Smoke (no Spark reachable):
- GET /healthz → 200 {"status":"ok"}
- POST /vlm/analyze → 502 "Spark 2 unreachable" (clear error)
- POST /flux/render → 502 "Spark 1 unreachable" (clear error)
Per BTE refactor audit §3 V — vision capabilities extracted from BTE to the
sovereign vision gateway. Phase 4c (delete-from-BTE) + Phase 4d (HTTP adapter)
follow in BTE.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,17 +1,247 @@
|
||||
"""FLUX image generation — stub until Phase 4b wires the ComfyUI HTTP client."""
|
||||
"""FLUX image generation — proxies to Spark 1 (ComfyUI HTTP API).
|
||||
|
||||
Ported from BTE's SparkBComfyClient.cs + LocalFluxImageProvider.cs +
|
||||
StopgapFluxWorkflow.cs (Phase 4b). Workflow is either supplied by the caller
|
||||
(BTE's IRecipeAssembler emits a full graph) or assembled inline from the
|
||||
stopgap FLUX.2 template.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
import urllib.parse
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from svrnty_vision.settings import settings
|
||||
|
||||
router = APIRouter(prefix="/flux", tags=["flux"])
|
||||
|
||||
|
||||
@router.post("/render")
|
||||
async def render() -> None:
|
||||
"""Render an image via Spark 1 (FLUX on ComfyUI).
|
||||
# Default values match BTE's StopgapFluxWorkflow.cs.
|
||||
DEFAULT_GUIDANCE = 2.5
|
||||
DEFAULT_STEPS = 20
|
||||
POLL_INTERVAL_SECONDS = 2.0
|
||||
|
||||
Phase 4a: stub. Phase 4b: proxies to Spark 1 ComfyUI workflow.
|
||||
|
||||
class RenderRequest(BaseModel):
|
||||
"""Either `workflow_json` (a pre-assembled ComfyUI graph) or `prompt` + dims
|
||||
(we build the stopgap FLUX.2-dev graph). `workflow_json` wins when both set.
|
||||
"""
|
||||
|
||||
prompt: str | None = None
|
||||
width: int = 1024
|
||||
height: int = 1024
|
||||
workflow_json: str | None = None
|
||||
guidance: float = DEFAULT_GUIDANCE
|
||||
steps: int = DEFAULT_STEPS
|
||||
seed: int | None = None # null → random per call (ComfyUI dedup avoidance)
|
||||
|
||||
|
||||
class RenderResponse(BaseModel):
|
||||
prompt_id: str
|
||||
image_base64: str
|
||||
content_type: str = "image/png"
|
||||
duration_ms: int
|
||||
provider: str = "local"
|
||||
model: str = "flux2"
|
||||
|
||||
|
||||
def build_stopgap_workflow(
|
||||
prompt: str,
|
||||
width: int,
|
||||
height: int,
|
||||
guidance: float = DEFAULT_GUIDANCE,
|
||||
steps: int = DEFAULT_STEPS,
|
||||
seed: int | None = None,
|
||||
) -> str:
|
||||
"""Ported from StopgapFluxWorkflow.Build (Svrnty.Bte.Shared.Recipes).
|
||||
|
||||
FLUX.2 stack on Spark 1: flux2_dev_fp8mixed UNET, Mistral-3 CLIP (type "flux2"),
|
||||
flux2-vae. KSampler cfg=1.0; distilled guidance via FluxGuidance node.
|
||||
Random seed per call: ComfyUI dedupes identical workflows.
|
||||
"""
|
||||
if seed is None:
|
||||
seed = random.randint(1, 2**31 - 1)
|
||||
|
||||
graph: dict[str, Any] = {
|
||||
"5": {
|
||||
"class_type": "EmptySD3LatentImage",
|
||||
"inputs": {"width": width, "height": height, "batch_size": 1},
|
||||
},
|
||||
"6": {
|
||||
"class_type": "CLIPTextEncode",
|
||||
"inputs": {"clip": ["11", 0], "text": prompt},
|
||||
},
|
||||
"7": {
|
||||
"class_type": "CLIPTextEncode",
|
||||
"inputs": {"clip": ["11", 0], "text": ""},
|
||||
},
|
||||
"8": {
|
||||
"class_type": "VAEDecode",
|
||||
"inputs": {"samples": ["13", 0], "vae": ["10", 0]},
|
||||
},
|
||||
"9": {
|
||||
"class_type": "SaveImage",
|
||||
"inputs": {"filename_prefix": "bte", "images": ["8", 0]},
|
||||
},
|
||||
"10": {
|
||||
"class_type": "VAELoader",
|
||||
"inputs": {"vae_name": "flux2-vae.safetensors"},
|
||||
},
|
||||
"11": {
|
||||
"class_type": "CLIPLoader",
|
||||
"inputs": {
|
||||
"clip_name": "mistral_3_small_flux2_fp8.safetensors",
|
||||
"type": "flux2",
|
||||
},
|
||||
},
|
||||
"12": {
|
||||
"class_type": "UNETLoader",
|
||||
"inputs": {
|
||||
"unet_name": "flux2_dev_fp8mixed.safetensors",
|
||||
"weight_dtype": "default",
|
||||
},
|
||||
},
|
||||
"13": {
|
||||
"class_type": "KSampler",
|
||||
"inputs": {
|
||||
"model": ["12", 0],
|
||||
"positive": ["26", 0],
|
||||
"negative": ["7", 0],
|
||||
"latent_image": ["5", 0],
|
||||
"seed": seed,
|
||||
"steps": steps,
|
||||
"cfg": 1.0,
|
||||
"sampler_name": "euler",
|
||||
"scheduler": "simple",
|
||||
"denoise": 1.0,
|
||||
},
|
||||
},
|
||||
"26": {
|
||||
"class_type": "FluxGuidance",
|
||||
"inputs": {"conditioning": ["6", 0], "guidance": guidance},
|
||||
},
|
||||
}
|
||||
return json.dumps(graph)
|
||||
|
||||
|
||||
async def _queue_prompt(
|
||||
client: httpx.AsyncClient, endpoint: str, workflow_json: str
|
||||
) -> str:
|
||||
workflow = json.loads(workflow_json)
|
||||
resp = await client.post(f"{endpoint}/prompt", json={"prompt": workflow})
|
||||
resp.raise_for_status()
|
||||
body = resp.json()
|
||||
if "prompt_id" not in body:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"ComfyUI /prompt returned no prompt_id: {body}",
|
||||
)
|
||||
return body["prompt_id"]
|
||||
|
||||
|
||||
async def _poll_history(
|
||||
client: httpx.AsyncClient, endpoint: str, prompt_id: str
|
||||
) -> list[dict[str, str]]:
|
||||
deadline = time.monotonic() + settings.vision_request_timeout_seconds
|
||||
while time.monotonic() < deadline:
|
||||
resp = await client.get(f"{endpoint}/history/{prompt_id}")
|
||||
resp.raise_for_status()
|
||||
doc = resp.json()
|
||||
entry = doc.get(prompt_id)
|
||||
if entry and entry.get("status", {}).get("completed"):
|
||||
return _extract_images(entry)
|
||||
await asyncio.sleep(POLL_INTERVAL_SECONDS)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_501_NOT_IMPLEMENTED,
|
||||
detail="flux.render not implemented in Phase 4a — see BTE-REFACTOR-EXECUTION-PLAN Phase 4b",
|
||||
status_code=status.HTTP_504_GATEWAY_TIMEOUT,
|
||||
detail=(
|
||||
f"ComfyUI prompt {prompt_id} did not complete within "
|
||||
f"{settings.vision_request_timeout_seconds}s."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _extract_images(entry: dict[str, Any]) -> list[dict[str, str]]:
|
||||
refs: list[dict[str, str]] = []
|
||||
outputs = entry.get("outputs", {})
|
||||
for node in outputs.values():
|
||||
for img in node.get("images", []) or []:
|
||||
refs.append(
|
||||
{
|
||||
"filename": img.get("filename", ""),
|
||||
"subfolder": img.get("subfolder", ""),
|
||||
"type": img.get("type", "output"),
|
||||
}
|
||||
)
|
||||
return refs
|
||||
|
||||
|
||||
async def _download_image(
|
||||
client: httpx.AsyncClient, endpoint: str, ref: dict[str, str]
|
||||
) -> bytes:
|
||||
qs = urllib.parse.urlencode(
|
||||
{
|
||||
"filename": ref["filename"],
|
||||
"subfolder": ref["subfolder"],
|
||||
"type": ref["type"],
|
||||
}
|
||||
)
|
||||
resp = await client.get(f"{endpoint}/view?{qs}")
|
||||
resp.raise_for_status()
|
||||
return resp.content
|
||||
|
||||
|
||||
@router.post("/render", response_model=RenderResponse)
|
||||
async def render(req: RenderRequest) -> RenderResponse:
|
||||
"""Render an image via Spark 1 (FLUX.2-dev on ComfyUI).
|
||||
|
||||
If `workflow_json` is supplied (e.g. from BTE's IRecipeAssembler), it is
|
||||
POSTed verbatim. Otherwise a stopgap FLUX.2 graph is built from `prompt`.
|
||||
"""
|
||||
workflow_json = req.workflow_json
|
||||
if not workflow_json:
|
||||
if not req.prompt:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="Provide workflow_json or prompt.",
|
||||
)
|
||||
workflow_json = build_stopgap_workflow(
|
||||
req.prompt, req.width, req.height, req.guidance, req.steps, req.seed
|
||||
)
|
||||
|
||||
endpoint = settings.spark1_flux_url.rstrip("/")
|
||||
started = time.monotonic()
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=settings.vision_request_timeout_seconds
|
||||
) as client:
|
||||
prompt_id = await _queue_prompt(client, endpoint, workflow_json)
|
||||
images = await _poll_history(client, endpoint, prompt_id)
|
||||
if not images:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"ComfyUI returned no output images for prompt {prompt_id}.",
|
||||
)
|
||||
png_bytes = await _download_image(client, endpoint, images[0])
|
||||
except httpx.HTTPError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"Spark 1 (ComfyUI) at {endpoint} unreachable: {type(e).__name__}: {e}",
|
||||
) from e
|
||||
|
||||
duration_ms = int((time.monotonic() - started) * 1000)
|
||||
return RenderResponse(
|
||||
prompt_id=prompt_id,
|
||||
image_base64=base64.b64encode(png_bytes).decode("ascii"),
|
||||
content_type="image/png",
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
|
||||
@@ -1,17 +1,193 @@
|
||||
"""VLM (vision-language model) analysis — stub until Phase 4b moves Qwen3-VL code."""
|
||||
"""VLM (vision-language model) analysis — proxies to Spark 2 (Qwen3-VL via vLLM).
|
||||
|
||||
Ported from BTE's OpenAiVlmClient.cs + VlmRubric.cs (Phase 4b). Cloud Anthropic
|
||||
dialect intentionally dropped — svrnty-vision is sovereign-only.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from svrnty_vision.settings import settings
|
||||
|
||||
router = APIRouter(prefix="/vlm", tags=["vlm"])
|
||||
|
||||
|
||||
@router.post("/analyze")
|
||||
async def analyze() -> None:
|
||||
"""Analyze an image with a vision-language model.
|
||||
class AnalyzeRequest(BaseModel):
|
||||
"""At least one of `image_base64` or `image_url` must be supplied.
|
||||
|
||||
Phase 4a: stub. Phase 4b: proxies to Spark 2 (Qwen3-VL via vLLM).
|
||||
`rubric_mode` is `polished` (premium aesthetic) or `ugc` (organic).
|
||||
`brand_context` is the prompt anchor (brand description / extraction directives).
|
||||
Set `rubric_mode = "raw"` to bypass the brand-scoring rubric and pass
|
||||
`brand_context` through verbatim (used by extraction prompts, e.g. screenshot DNA).
|
||||
"""
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_501_NOT_IMPLEMENTED,
|
||||
detail="vlm.analyze not implemented in Phase 4a — see BTE-REFACTOR-EXECUTION-PLAN Phase 4b",
|
||||
|
||||
image_base64: str | None = None
|
||||
image_url: str | None = None
|
||||
content_type: str = "image/png"
|
||||
brand_context: str = ""
|
||||
rubric_mode: str = "polished"
|
||||
model: str | None = None # override settings.spark2_vlm_model
|
||||
max_tokens: int = 1024
|
||||
|
||||
|
||||
class AnalyzeResponse(BaseModel):
|
||||
brand_fit_score: Decimal | None = Field(
|
||||
default=None,
|
||||
description="0.00–5.00, or null when rubric_mode='raw' (no scores parsed)",
|
||||
)
|
||||
visual_polish_score: Decimal | None = None
|
||||
rubric_mode: str
|
||||
justification: str = ""
|
||||
model_id: str
|
||||
raw_scores_json: str = Field(
|
||||
description="The JSON object the VLM returned (or its full text if rubric_mode='raw')."
|
||||
)
|
||||
|
||||
|
||||
def build_rubric_prompt(brand_context: str, rubric_mode: str) -> str:
|
||||
"""Ported from VlmRubric.BuildRubricPrompt.
|
||||
|
||||
`rubric_mode='raw'` bypasses the brand-scoring rubric — caller's brand_context
|
||||
becomes the full prompt (used for extraction-style calls).
|
||||
"""
|
||||
if rubric_mode == "raw":
|
||||
return brand_context
|
||||
|
||||
return (
|
||||
"You are a brand-asset reviewer. Brand context:\n"
|
||||
"---\n"
|
||||
f"{brand_context}\n"
|
||||
"---\n"
|
||||
f"Rubric mode: {rubric_mode} (polished = premium production aesthetic; "
|
||||
"ugc = organic, casual, hand-shot feel).\n\n"
|
||||
"Score the attached image on TWO axes, each 0.00 – 5.00:\n"
|
||||
"- brand_fit: how well it embodies the brand voice/tokens above.\n"
|
||||
"- visual_polish: technical execution under the chosen rubric mode.\n\n"
|
||||
"Respond with ONE compact JSON object only — no prose, no fences:\n"
|
||||
'{"brand_fit": <num>, "visual_polish": <num>, "justification": "<≤200 chars>"}'
|
||||
)
|
||||
|
||||
|
||||
def parse_scores(
|
||||
response_text: str, rubric_mode: str, model_id: str
|
||||
) -> AnalyzeResponse:
|
||||
"""Ported from VlmRubric.ParseScores.
|
||||
|
||||
For `rubric_mode='raw'` we return the response text untouched; scores stay None.
|
||||
Callers (e.g. BTE ImageSetSourceReader / ExtractBrandSaga) parse the JSON
|
||||
themselves.
|
||||
"""
|
||||
if rubric_mode == "raw":
|
||||
return AnalyzeResponse(
|
||||
brand_fit_score=None,
|
||||
visual_polish_score=None,
|
||||
rubric_mode=rubric_mode,
|
||||
justification="",
|
||||
model_id=model_id,
|
||||
raw_scores_json=response_text,
|
||||
)
|
||||
|
||||
trimmed = response_text.strip()
|
||||
start = trimmed.find("{")
|
||||
end = trimmed.rfind("}")
|
||||
if start < 0 or end <= start:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"VLM response did not contain a JSON object: {response_text}",
|
||||
)
|
||||
|
||||
raw_json = trimmed[start : end + 1]
|
||||
try:
|
||||
obj = json.loads(raw_json)
|
||||
except json.JSONDecodeError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"VLM JSON parse failed: {e}",
|
||||
) from e
|
||||
|
||||
return AnalyzeResponse(
|
||||
brand_fit_score=Decimal(str(obj["brand_fit"])),
|
||||
visual_polish_score=Decimal(str(obj["visual_polish"])),
|
||||
rubric_mode=rubric_mode,
|
||||
justification=str(obj.get("justification", "")),
|
||||
model_id=model_id,
|
||||
raw_scores_json=raw_json,
|
||||
)
|
||||
|
||||
|
||||
async def _resolve_data_uri(req: AnalyzeRequest) -> str:
|
||||
if req.image_base64:
|
||||
return f"data:{req.content_type};base64,{req.image_base64}"
|
||||
if req.image_url:
|
||||
# vLLM Qwen3-VL accepts http(s) URLs directly; pass through.
|
||||
if req.image_url.startswith(("http://", "https://", "data:")):
|
||||
return req.image_url
|
||||
# Otherwise treat as local path → read + b64.
|
||||
try:
|
||||
with open(req.image_url, "rb") as f:
|
||||
raw = f.read()
|
||||
except OSError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"image_url unreadable as file: {e}",
|
||||
) from e
|
||||
return f"data:{req.content_type};base64,{base64.b64encode(raw).decode('ascii')}"
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="Provide image_base64 or image_url.",
|
||||
)
|
||||
|
||||
|
||||
@router.post("/analyze", response_model=AnalyzeResponse)
|
||||
async def analyze(req: AnalyzeRequest) -> AnalyzeResponse:
|
||||
"""Analyze an image with Qwen3-VL on Spark 2 (vLLM, OpenAI-compatible)."""
|
||||
data_uri = await _resolve_data_uri(req)
|
||||
rubric = build_rubric_prompt(req.brand_context, req.rubric_mode)
|
||||
model = req.model or settings.spark2_vlm_model
|
||||
|
||||
body: dict[str, Any] = {
|
||||
"model": model,
|
||||
"max_tokens": req.max_tokens,
|
||||
"temperature": 0,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": rubric},
|
||||
{"type": "image_url", "image_url": {"url": data_uri}},
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
url = settings.spark2_vlm_url.rstrip("/") + "/v1/chat/completions"
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=settings.vision_request_timeout_seconds
|
||||
) as client:
|
||||
resp = await client.post(url, json=body)
|
||||
resp.raise_for_status()
|
||||
payload = resp.json()
|
||||
except httpx.HTTPError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"Spark 2 (vLLM) at {url} unreachable: {type(e).__name__}: {e}",
|
||||
) from e
|
||||
|
||||
try:
|
||||
text = payload["choices"][0]["message"]["content"] or ""
|
||||
except (KeyError, IndexError, TypeError) as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"Spark 2 response shape unexpected: {e}",
|
||||
) from e
|
||||
|
||||
return parse_scores(text, req.rubric_mode, model)
|
||||
|
||||
Reference in New Issue
Block a user