Add default_config.yaml with customizable settings

- Model selection (minicpm-v, llama3.2-vision, llava)
- Customizable vision prompt for better responses
- Timing parameters (idle threshold, response delay)
- Approval keywords configuration
- User config at ~/.config/claude-vision-auto/config.yaml
- New command: claude-vision-config to generate user config
- Environment variables still override config files
- Added PyYAML dependency

Configuration priority:
1. Environment variables (highest)
2. User config (~/.config/claude-vision-auto/config.yaml)
3. Default config (package default)

Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Jean-Philippe Brule <jp@svrnty.io>
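To illustrate the three-level priority described in the commit message, here is a minimal sketch of how the merge could work with PyYAML. The environment-variable names (CLAUDE_VISION_*) and the config keys are illustrative assumptions, not the package's actual identifiers.

import os
from pathlib import Path

import yaml  # PyYAML, added as a dependency in this commit


def load_config() -> dict:
    """Merge default config, user config, and environment overrides."""
    # 3. Package defaults (lowest priority)
    settings = {
        "model": "minicpm-v",
        "ollama_url": "http://localhost:11434/api/generate",
    }

    # 2. User config overrides the packaged defaults
    user_path = Path.home() / ".config" / "claude-vision-auto" / "config.yaml"
    if user_path.exists():
        with open(user_path) as f:
            settings.update(yaml.safe_load(f) or {})

    # 1. Environment variables override everything (hypothetical names)
    for key, env_var in [("model", "CLAUDE_VISION_MODEL"),
                         ("ollama_url", "CLAUDE_VISION_OLLAMA_URL")]:
        if env_var in os.environ:
            settings[key] = os.environ[env_var]

    return settings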
"""
|
|
Vision analysis using MiniCPM-V via Ollama
|
|
"""
|
|
|
|
import base64
|
|
import requests
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from . import config
|
|
|
|
|
|
class VisionAnalyzer:
    """Analyzes screenshots using a vision model."""

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize the vision analyzer.

        Args:
            ollama_url: Ollama API URL (defaults to config.OLLAMA_URL)
            model: Vision model name (defaults to config.VISION_MODEL)
        """
        self.ollama_url = ollama_url or config.OLLAMA_URL
        self.model = model or config.VISION_MODEL
    def analyze_screenshot(self, image_path: str) -> Optional[str]:
        """
        Analyze a screenshot and determine what response to give.

        Args:
            image_path: Path to the screenshot image

        Returns:
            The response to send ("1", "y", "WAIT", etc.), or None on error
        """
        try:
            # Read and base64-encode the image for the Ollama API
            with open(image_path, 'rb') as f:
                image_data = base64.b64encode(f.read()).decode('utf-8')

            # Build the non-streaming generation request
            payload = {
                "model": self.model,
                "prompt": config.VISION_PROMPT,
                "images": [image_data],
                "stream": False,
            }

            if config.DEBUG:
                print(f"[DEBUG] Sending to Ollama: {self.ollama_url}")
                print(f"[DEBUG] Model: {self.model}")

            response = requests.post(
                self.ollama_url,
                json=payload,
                timeout=config.VISION_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json()
            answer = result.get('response', '').strip()

            if config.DEBUG:
                print(f"[DEBUG] Vision model response: {answer}")

            return answer

        except requests.Timeout:
            if config.DEBUG:
                print("[DEBUG] Vision analysis timeout")
            return None
        except requests.RequestException as e:
            if config.DEBUG:
                print(f"[DEBUG] Vision API error: {e}")
            return None
        except Exception as e:
            if config.DEBUG:
                print(f"[DEBUG] Unexpected error in vision analysis: {e}")
            return None
    def test_connection(self) -> bool:
        """
        Test whether Ollama is accessible and the configured model is available.

        Returns:
            True if the connection succeeds and the model is present, False otherwise
        """
        try:
            # List installed models via the tags endpoint
            tags_url = self.ollama_url.replace('/api/generate', '/api/tags')
            response = requests.get(tags_url, timeout=5)
            response.raise_for_status()

            data = response.json()
            models = [m['name'] for m in data.get('models', [])]

            if config.DEBUG:
                print(f"[DEBUG] Available models: {models}")

            # Check whether the configured model is among them
            model_available = any(self.model in m for m in models)

            if not model_available:
                print(f"Warning: Model '{self.model}' not found in Ollama")
                print(f"Available models: {', '.join(models)}")

            return model_available

        except Exception as e:
            if config.DEBUG:
                print(f"[DEBUG] Connection test failed: {e}")
            return False
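A quick usage sketch, assuming the class is imported from within the package and an Ollama server is running with the configured vision model already pulled; the screenshot path is illustrative:

# Check connectivity first, then analyze a screenshot.
analyzer = VisionAnalyzer()
if analyzer.test_connection():
    answer = analyzer.analyze_screenshot("/tmp/claude-screenshot.png")
    if answer is not None:
        print(f"Vision model suggests: {answer}")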