Vision-module-auto/claude_vision_auto/vision_analyzer.py
Svrnty 52b0813b64 feat: Add YAML-based configuration system
- Add default_config.yaml with customizable settings
- Model selection (minicpm-v, llama3.2-vision, llava)
- Customizable vision prompt for better responses
- Timing parameters (idle threshold, response delay)
- Approval keywords configuration
- User config at ~/.config/claude-vision-auto/config.yaml
- New command: claude-vision-config to generate user config
- Environment variables still override config files
- Added PyYAML dependency

Configuration priority (sketched below):
1. Environment variables (highest)
2. User config (~/.config/claude-vision-auto/config.yaml)
3. Default config (package default)

Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Jean-Philippe Brule <jp@svrnty.io>
2025-10-29 10:19:35 -04:00
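The commit message above describes a three-level override chain. As a minimal sketch only, here is one way such layering can be implemented; the key names, environment-variable names, and the load_setting helper are all illustrative assumptions, and the package's actual config.py may structure this differently:

# Hypothetical sketch of the three-level override described in the commit message.
# Key names, env-var names, and load_setting are illustrative, not the package's API.
import os
from pathlib import Path

import yaml  # PyYAML, added as a dependency in this commit

DEFAULT_CONFIG = Path(__file__).parent / "default_config.yaml"
USER_CONFIG = Path.home() / ".config" / "claude-vision-auto" / "config.yaml"


def load_setting(key: str, env_var: str, fallback=None):
    """Resolve one setting: environment, then user config, then default config."""
    # 1. Environment variables win outright
    if env_var in os.environ:
        return os.environ[env_var]
    # 2. User config, if one has been generated (e.g. via claude-vision-config)
    if USER_CONFIG.exists():
        user_cfg = yaml.safe_load(USER_CONFIG.read_text()) or {}
        if key in user_cfg:
            return user_cfg[key]
    # 3. Package default config
    default_cfg = yaml.safe_load(DEFAULT_CONFIG.read_text()) or {}
    return default_cfg.get(key, fallback)


VISION_MODEL = load_setting("vision_model", "CLAUDE_VISION_MODEL", "minicpm-v")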


"""
Vision analysis using MiniCPM-V via Ollama
"""
import base64
import requests
from pathlib import Path
from typing import Optional
from . import config
class VisionAnalyzer:
"""Analyzes screenshots using vision model"""
def __init__(self, ollama_url: str = None, model: str = None):
"""
Initialize vision analyzer
Args:
ollama_url: Ollama API URL (default from config)
model: Vision model name (default from config)
"""
self.ollama_url = ollama_url or config.OLLAMA_URL
self.model = model or config.VISION_MODEL
def analyze_screenshot(self, image_path: str) -> Optional[str]:
"""
Analyze screenshot and determine what response to give
Args:
image_path: Path to screenshot image
Returns:
Response to send ("1", "y", "WAIT", etc.) or None on error
"""
try:
# Read and encode image
with open(image_path, 'rb') as f:
image_data = base64.b64encode(f.read()).decode('utf-8')
# Send to Ollama
payload = {
"model": self.model,
"prompt": config.VISION_PROMPT,
"images": [image_data],
"stream": False
}
if config.DEBUG:
print(f"[DEBUG] Sending to Ollama: {self.ollama_url}")
print(f"[DEBUG] Model: {self.model}")
response = requests.post(
self.ollama_url,
json=payload,
timeout=config.VISION_TIMEOUT
)
response.raise_for_status()
result = response.json()
answer = result.get('response', '').strip()
if config.DEBUG:
print(f"[DEBUG] Vision model response: {answer}")
return answer
except requests.Timeout:
if config.DEBUG:
print("[DEBUG] Vision analysis timeout")
return None
except requests.RequestException as e:
if config.DEBUG:
print(f"[DEBUG] Vision API error: {e}")
return None
except Exception as e:
if config.DEBUG:
print(f"[DEBUG] Unexpected error in vision analysis: {e}")
return None
def test_connection(self) -> bool:
"""
Test if Ollama is accessible and model is available
Returns:
True if connection successful, False otherwise
"""
try:
# Try to list tags
tags_url = self.ollama_url.replace('/api/generate', '/api/tags')
response = requests.get(tags_url, timeout=5)
response.raise_for_status()
data = response.json()
models = [m['name'] for m in data.get('models', [])]
if config.DEBUG:
print(f"[DEBUG] Available models: {models}")
# Check if our model is available
model_available = any(self.model in m for m in models)
if not model_available:
print(f"Warning: Model '{self.model}' not found in Ollama")
print(f"Available models: {', '.join(models)}")
return model_available
except Exception as e:
if config.DEBUG:
print(f"[DEBUG] Connection test failed: {e}")
return False
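
For context, a minimal usage sketch of this class; the screenshot path is illustrative, and the defaults come from config as shown in the constructor above:

# Illustrative usage of VisionAnalyzer; the screenshot path is hypothetical.
from claude_vision_auto.vision_analyzer import VisionAnalyzer

analyzer = VisionAnalyzer()  # falls back to config.OLLAMA_URL and config.VISION_MODEL
if analyzer.test_connection():
    answer = analyzer.analyze_screenshot("/tmp/claude_prompt.png")
    if answer is not None:
        print(f"Responding with: {answer}")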