Vision-module-auto/claude_vision_auto/vision_analyzer.py
"""
Vision analysis using MiniCPM-V via Ollama
"""
import base64
from typing import Optional

import requests

from . import config
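
# Expected attributes on the sibling config module (inferred from usage in this
# file; the example values are assumptions, not taken from the source):
#   OLLAMA_URL      e.g. "http://localhost:11434/api/generate"
#   VISION_MODEL    e.g. "minicpm-v"
#   VISION_TIMEOUT  request timeout in seconds
#   DEBUG           True to print [DEBUG] diagnostics
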
VISION_PROMPT = """You are analyzing a terminal screenshot showing a Claude Code approval prompt.
Look for:
- Menu options like "1. Yes", "2. Yes, allow all", "3. No"
- Questions asking for approval (create/edit/delete files)
- Yes/No questions
If you see an approval prompt with numbered options:
- Respond ONLY with the number to select "Yes" (usually "1")
- Output format: Just the number, nothing else
If you see a yes/no question:
- Respond with: y
If you don't see any prompt requiring input:
- Respond with: WAIT
Your response (one word only):"""
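

# Illustrative sketch (an assumption, not part of the original module): one way
# a caller could validate the model's one-word reply against the protocol laid
# out in VISION_PROMPT above. The helper name is hypothetical.
def _looks_like_valid_reply(answer: str) -> bool:
    """Return True if `answer` matches the expected one-word protocol."""
    answer = answer.strip()
    return answer in ("y", "WAIT") or answer.isdigit()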


class VisionAnalyzer:
    """Analyzes screenshots using a vision model."""

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize the vision analyzer.

        Args:
            ollama_url: Ollama API URL (default from config)
            model: Vision model name (default from config)
        """
        self.ollama_url = ollama_url or config.OLLAMA_URL
        self.model = model or config.VISION_MODEL

    def analyze_screenshot(self, image_path: str) -> Optional[str]:
        """
        Analyze screenshot and determine what response to give

        Args:
            image_path: Path to screenshot image

        Returns:
            Response to send ("1", "y", "WAIT", etc.) or None on error
        """
        try:
            # Read and encode image
            with open(image_path, 'rb') as f:
                image_data = base64.b64encode(f.read()).decode('utf-8')

            # Send to Ollama
            payload = {
                "model": self.model,
                "prompt": VISION_PROMPT,
                "images": [image_data],
                "stream": False
            }

            if config.DEBUG:
                print(f"[DEBUG] Sending to Ollama: {self.ollama_url}")
                print(f"[DEBUG] Model: {self.model}")

            response = requests.post(
                self.ollama_url,
                json=payload,
                timeout=config.VISION_TIMEOUT
            )
            response.raise_for_status()

            result = response.json()
            answer = result.get('response', '').strip()

            if config.DEBUG:
                print(f"[DEBUG] Vision model response: {answer}")

            return answer

        except requests.Timeout:
            if config.DEBUG:
                print("[DEBUG] Vision analysis timeout")
            return None
        except requests.RequestException as e:
            if config.DEBUG:
                print(f"[DEBUG] Vision API error: {e}")
            return None
        except Exception as e:
            if config.DEBUG:
                print(f"[DEBUG] Unexpected error in vision analysis: {e}")
            return None

    def test_connection(self) -> bool:
        """
        Test if Ollama is accessible and model is available

        Returns:
            True if connection successful, False otherwise
        """
        try:
            # Try to list tags
            tags_url = self.ollama_url.replace('/api/generate', '/api/tags')
            response = requests.get(tags_url, timeout=5)
            response.raise_for_status()

            data = response.json()
            models = [m['name'] for m in data.get('models', [])]

            if config.DEBUG:
                print(f"[DEBUG] Available models: {models}")

            # Check if our model is available
            model_available = any(self.model in m for m in models)
            if not model_available:
                print(f"Warning: Model '{self.model}' not found in Ollama")
                print(f"Available models: {', '.join(models)}")

            return model_available

        except Exception as e:
            if config.DEBUG:
                print(f"[DEBUG] Connection test failed: {e}")
            return False
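

# Illustrative usage sketch (an assumption, not shipped with the original file):
# exercising the class directly against a local Ollama instance. The screenshot
# path below is a placeholder.
if __name__ == "__main__":
    analyzer = VisionAnalyzer()
    if analyzer.test_connection():
        # Hypothetical screenshot path, for demonstration only
        reply = analyzer.analyze_screenshot("/tmp/claude_prompt.png")
        print(f"Vision model reply: {reply!r}")
    else:
        print("Ollama is not reachable or the configured model is missing")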