Vision-module-auto/claude_vision_auto/vision_analyzer.py
"""
Vision analysis using MiniCPM-V via Ollama
"""
import base64
from typing import Optional

import requests

from . import config
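
# Expected attributes on the sibling config module (inferred from usage in this
# file; the example values are assumptions, not taken from the source):
#   OLLAMA_URL      e.g. "http://localhost:11434/api/generate"
#   VISION_MODEL    e.g. "minicpm-v"
#   VISION_TIMEOUT  request timeout in seconds
#   DEBUG           True to print [DEBUG] diagnostics
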
VISION_PROMPT = """You are analyzing a terminal screenshot showing a Claude Code approval prompt.
Look for:
- Menu options like "1. Yes", "2. Yes, allow all", "3. No"
- Questions asking for approval (create/edit/delete files)
- Yes/No questions
If you see an approval prompt with numbered options:
- Respond ONLY with the number to select "Yes" (usually "1")
- Output format: Just the number, nothing else
If you see a yes/no question:
- Respond with: y
If you don't see any prompt requiring input:
- Respond with: WAIT
Your response (one word only):"""
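

# Illustrative sketch (an assumption, not part of the original module): one way
# a caller could validate the model's one-word reply against the protocol laid
# out in VISION_PROMPT above. The helper name is hypothetical.
def _looks_like_valid_reply(answer: str) -> bool:
    """Return True if `answer` matches the expected one-word protocol."""
    answer = answer.strip()
    return answer in ("y", "WAIT") or answer.isdigit()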


class VisionAnalyzer:
    """Analyzes screenshots using a vision model."""

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize the vision analyzer.

        Args:
            ollama_url: Ollama API URL (default from config)
            model: Vision model name (default from config)
        """
        self.ollama_url = ollama_url or config.OLLAMA_URL
        self.model = model or config.VISION_MODEL

    def analyze_screenshot(self, image_path: str) -> Optional[str]:
        """
        Analyze screenshot and determine what response to give

        Args:
            image_path: Path to screenshot image

        Returns:
            Response to send ("1", "y", "WAIT", etc.) or None on error
        """
        try:
            # Read and encode image
            with open(image_path, 'rb') as f:
                image_data = base64.b64encode(f.read()).decode('utf-8')

            # Send to Ollama
            payload = {
                "model": self.model,
                "prompt": VISION_PROMPT,
                "images": [image_data],
                "stream": False
            }

            if config.DEBUG:
                print(f"[DEBUG] Sending to Ollama: {self.ollama_url}")
                print(f"[DEBUG] Model: {self.model}")

            response = requests.post(
                self.ollama_url,
                json=payload,
                timeout=config.VISION_TIMEOUT
            )
            response.raise_for_status()

            result = response.json()
            answer = result.get('response', '').strip()

            if config.DEBUG:
                print(f"[DEBUG] Vision model response: {answer}")

            return answer

        except requests.Timeout:
            if config.DEBUG:
                print("[DEBUG] Vision analysis timeout")
            return None
        except requests.RequestException as e:
            if config.DEBUG:
                print(f"[DEBUG] Vision API error: {e}")
            return None
        except Exception as e:
            if config.DEBUG:
                print(f"[DEBUG] Unexpected error in vision analysis: {e}")
            return None

    def test_connection(self) -> bool:
        """
        Test if Ollama is accessible and model is available

        Returns:
            True if connection successful, False otherwise
        """
        try:
            # Try to list tags
            tags_url = self.ollama_url.replace('/api/generate', '/api/tags')
            response = requests.get(tags_url, timeout=5)
            response.raise_for_status()

            data = response.json()
            models = [m['name'] for m in data.get('models', [])]

            if config.DEBUG:
                print(f"[DEBUG] Available models: {models}")

            # Check if our model is available
            model_available = any(self.model in m for m in models)
            if not model_available:
                print(f"Warning: Model '{self.model}' not found in Ollama")
                print(f"Available models: {', '.join(models)}")

            return model_available

        except Exception as e:
            if config.DEBUG:
                print(f"[DEBUG] Connection test failed: {e}")
            return False
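

# Illustrative usage sketch (an assumption, not shipped with the original file):
# exercising the class directly against a local Ollama instance. The screenshot
# path below is a placeholder.
if __name__ == "__main__":
    analyzer = VisionAnalyzer()
    if analyzer.test_connection():
        # Hypothetical screenshot path, for demonstration only
        reply = analyzer.analyze_screenshot("/tmp/claude_prompt.png")
        print(f"Vision model reply: {reply!r}")
    else:
        print("Ollama is not reachable or the configured model is missing")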