Add vision support, gRPC reflection toggle, and chat improvements

- Add Vision framework integration for image analysis (OCR, classification)
- Add image attachment support in chat UI with drag & drop
- Add recent images sidebar from Downloads/Desktop
- Add copy to clipboard button for assistant responses
- Add gRPC reflection service with toggle in settings
- Create proper .proto file and generate Swift code
- Add server restart when toggling reflection setting
- Fix port number formatting in settings (remove comma grouping)
- Update gRPC dependencies to v2.x

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 16:18:06 -05:00
parent 62ab635aec
commit 638656e7ca
18 changed files with 2474 additions and 478 deletions
@@ -6,6 +6,7 @@ public enum AppleIntelligenceError: Error, CustomStringConvertible, Sendable {
    case modelNotAvailable
    case generationFailed(String)
    case sessionCreationFailed
+    case imageAnalysisFailed(String)

    public var description: String {
        switch self {
@@ -15,6 +16,8 @@ public enum AppleIntelligenceError: Error, CustomStringConvertible, Sendable {
            return "Generation failed: \(reason)"
        case .sessionCreationFailed:
            return "Failed to create language model session"
+        case .imageAnalysisFailed(let reason):
+            return "Image analysis failed: \(reason)"
        }
    }
 }
@@ -24,6 +27,9 @@ public actor AppleIntelligenceService {
    /// The language model session
    private var session: LanguageModelSession?

+    /// Vision analysis service for image processing
+    private let visionService = VisionAnalysisService()
+
    /// Whether the model is available
    public private(set) var isAvailable: Bool = false

@@ -60,21 +66,42 @@ public actor AppleIntelligenceService {
    }

    /// Generate a completion for the given prompt (non-streaming)
-    public func complete(prompt: String, temperature: Float?, maxTokens: Int?) async throws -> String {
+    ///
+    /// When `images` is non-empty, every image is analyzed by the vision service first and
+    /// the formatted analysis is prepended to `prompt` (separated by a blank line) before
+    /// the prompt is sent to the language model session.
+    ///
+    /// - Parameters:
+    ///   - prompt: The user prompt.
+    ///   - temperature: Accepted for API compatibility; not applied in this method body.
+    ///   - maxTokens: Accepted for API compatibility; not applied in this method body.
+    ///   - images: Raw image data with optional filenames. Defaults to no images.
+    /// - Returns: The response text together with one analysis per supplied image
+    ///   (empty when no images were supplied).
+    /// - Throws: `AppleIntelligenceError.modelNotAvailable` when the model is unavailable or
+    ///   no session exists; `AppleIntelligenceError.imageAnalysisFailed` when image analysis
+    ///   throws; any error thrown by the session's `respond(to:)`.
+    public func complete(
+        prompt: String,
+        temperature: Float?,
+        maxTokens: Int?,
+        images: [(data: Data, filename: String?)] = []
+    ) async throws -> (text: String, analyses: [VisionAnalysisResult]) {
        guard isAvailable, let session = session else {
            throw AppleIntelligenceError.modelNotAvailable
        }

-        let response = try await session.respond(to: prompt)
-        return response.content
+        // Analyze images if provided
+        var analyses: [VisionAnalysisResult] = []
+        var enhancedPrompt = prompt
+
+        if !images.isEmpty {
+            do {
+                analyses = try await visionService.analyzeMultiple(images: images)
+                let analysesWithFilenames = zip(analyses, images).map { (result: $0.0, filename: $0.1.filename) }
+                let context = await visionService.formatAnalysesAsPromptContext(analyses: analysesWithFilenames)
+                enhancedPrompt = context + "\n\n" + prompt
+            } catch let visionError as VisionAnalysisError {
+                // VisionAnalysisError is CustomStringConvertible but not LocalizedError, so
+                // `localizedDescription` would yield a generic system message instead of
+                // the actual reason. Use its own description.
+                throw AppleIntelligenceError.imageAnalysisFailed(visionError.description)
+            } catch {
+                throw AppleIntelligenceError.imageAnalysisFailed(error.localizedDescription)
+            }
+        }
+
+        let response = try await session.respond(to: enhancedPrompt)
+        return (text: response.content, analyses: analyses)
    }

    /// Generate a streaming completion for the given prompt
    public func streamComplete(
        prompt: String,
        temperature: Float?,
-        maxTokens: Int?
-    ) -> AsyncThrowingStream<String, Error> {
+        maxTokens: Int?,
+        images: [(data: Data, filename: String?)] = []
+    ) -> AsyncThrowingStream<(text: String, analyses: [VisionAnalysisResult]?), Error> {
        AsyncThrowingStream { continuation in
            Task {
                guard self.isAvailable, let session = self.session else {
@@ -82,10 +109,33 @@ public actor AppleIntelligenceService {
                    return
                }

+                // Analyze images first if provided
+                var analyses: [VisionAnalysisResult] = []
+                var enhancedPrompt = prompt
+
+                if !images.isEmpty {
+                    do {
+                        analyses = try await self.visionService.analyzeMultiple(images: images)
+                        let analysesWithFilenames = zip(analyses, images).map { (result: $0.0, filename: $0.1.filename) }
+                        let context = await self.visionService.formatAnalysesAsPromptContext(analyses: analysesWithFilenames)
+                        enhancedPrompt = context + "\n\n" + prompt
+                    } catch {
+                        continuation.finish(throwing: AppleIntelligenceError.imageAnalysisFailed(error.localizedDescription))
+                        return
+                    }
+                }
+
                do {
-                    let stream = session.streamResponse(to: prompt)
+                    let stream = session.streamResponse(to: enhancedPrompt)
+                    var isFirst = true
                    for try await partialResponse in stream {
-                        continuation.yield(partialResponse.content)
+                        // Include analyses only in first chunk
+                        if isFirst {
+                            continuation.yield((text: partialResponse.content, analyses: analyses))
+                            isFirst = false
+                        } else {
+                            continuation.yield((text: partialResponse.content, analyses: nil))
+                        }
                    }
                    continuation.finish()
                } catch {
@@ -0,0 +1,243 @@
+import Foundation
+import Vision
+import CoreImage
+
+#if canImport(AppKit)
+import AppKit
+#endif
+
+/// Outcome of running Vision framework analysis over a single image.
+public struct VisionAnalysisResult: Sendable {
+    /// Text recognized in the image; empty when none was found.
+    public let textContent: String
+    /// Classification labels assigned to the image; empty when none were found.
+    public let labels: [String]
+    /// Human-readable summary of the analysis.
+    public let description: String
+
+    /// Creates a result; every field defaults to "nothing detected".
+    public init(textContent: String = "", labels: [String] = [], description: String = "") {
+        self.textContent = textContent
+        self.labels = labels
+        self.description = description
+    }
+
+    /// Renders this result as a single line suitable for LLM prompt context.
+    ///
+    /// - Parameters:
+    ///   - imageIndex: Zero-based position of the image; shown one-based when no
+    ///     filename is available.
+    ///   - filename: Name used to identify the image, if known.
+    /// - Returns: `"<name>: <parts joined by " | ">"`, or `"<name>: No content detected"`
+    ///   when there is neither text nor labels.
+    public func formatAsContext(imageIndex: Int, filename: String?) -> String {
+        let displayName = filename ?? "Image \(imageIndex + 1)"
+
+        var segments: [String] = []
+        if !textContent.isEmpty {
+            segments.append("Text: \"\(textContent)\"")
+        }
+        if !labels.isEmpty {
+            segments.append("Objects: \(labels.joined(separator: ", "))")
+        }
+
+        guard !segments.isEmpty else {
+            return "\(displayName): No content detected"
+        }
+        return "\(displayName): \(segments.joined(separator: " | "))"
+    }
+}
+
+/// Failures that image analysis can report.
+public enum VisionAnalysisError: Error, CustomStringConvertible, Sendable {
+    /// The supplied data could not be turned into an image.
+    case invalidImageData
+    /// A Vision request failed; the payload carries the reason.
+    case analysisFailure(String)
+    /// The image format is not supported.
+    case unsupportedFormat
+
+    /// Human-readable message for this error.
+    public var description: String {
+        let message: String
+        switch self {
+        case .invalidImageData:
+            message = "Invalid or corrupted image data"
+        case .analysisFailure(let reason):
+            message = "Vision analysis failed: \(reason)"
+        case .unsupportedFormat:
+            message = "Unsupported image format"
+        }
+        return message
+    }
+}
+
+/// Service for analyzing images using Apple's Vision framework.
+///
+/// Image data is decoded to a `CGImage` and then, depending on the chosen
+/// `AnalysisOptions`, run through text recognition (OCR) and image classification.
+public actor VisionAnalysisService {
+
+    /// Configuration for which analyses to perform
+    public struct AnalysisOptions: Sendable {
+        /// Whether to run text recognition.
+        public var performOCR: Bool
+        /// Whether to run image classification.
+        public var performClassification: Bool
+
+        public init(performOCR: Bool = true, performClassification: Bool = true) {
+            self.performOCR = performOCR
+            self.performClassification = performClassification
+        }
+
+        /// OCR and classification.
+        public static let all = AnalysisOptions()
+        /// OCR only.
+        public static let textOnly = AnalysisOptions(performOCR: true, performClassification: false)
+    }
+
+    /// Classification observations must exceed this confidence to be reported.
+    private static let minimumLabelConfidence: VNConfidence = 0.3
+    /// Maximum number of classification labels returned per image.
+    private static let maximumLabelCount = 10
+    /// Number of recognized-text characters kept in the summary description.
+    private static let descriptionTextLimit = 200
+    /// Number of labels kept in the summary description.
+    private static let descriptionLabelLimit = 5
+
+    public init() {}
+
+    /// Analyze a single image
+    ///
+    /// - Parameters:
+    ///   - imageData: Encoded image bytes.
+    ///   - options: Which analyses to run. Defaults to `.all`.
+    /// - Returns: Recognized text, classification labels and a short summary.
+    /// - Throws: `VisionAnalysisError.invalidImageData` when the data cannot be decoded;
+    ///   `VisionAnalysisError.analysisFailure` when a Vision request fails.
+    public func analyze(
+        imageData: Data,
+        options: AnalysisOptions = .all
+    ) async throws -> VisionAnalysisResult {
+        guard let cgImage = createCGImage(from: imageData) else {
+            throw VisionAnalysisError.invalidImageData
+        }
+
+        let textContent = options.performOCR
+            ? try performTextRecognition(on: cgImage)
+            : ""
+        let labels = options.performClassification
+            ? try performImageClassification(on: cgImage)
+            : []
+
+        // Build description
+        var descriptionParts: [String] = []
+        if !textContent.isEmpty {
+            let truncatedText = textContent.count > Self.descriptionTextLimit
+                ? String(textContent.prefix(Self.descriptionTextLimit)) + "..."
+                : textContent
+            descriptionParts.append("Contains text: \"\(truncatedText)\"")
+        }
+        if !labels.isEmpty {
+            descriptionParts.append("Shows: \(labels.prefix(Self.descriptionLabelLimit).joined(separator: ", "))")
+        }
+
+        let description = descriptionParts.isEmpty
+            ? "Image with no recognizable content"
+            : descriptionParts.joined(separator: ". ")
+
+        return VisionAnalysisResult(
+            textContent: textContent,
+            labels: labels,
+            description: description
+        )
+    }
+
+    /// Analyze multiple images
+    ///
+    /// Images are processed one after another; the first failure aborts the whole batch.
+    ///
+    /// - Returns: One result per input image, in input order.
+    public func analyzeMultiple(
+        images: [(data: Data, filename: String?)],
+        options: AnalysisOptions = .all
+    ) async throws -> [VisionAnalysisResult] {
+        var results: [VisionAnalysisResult] = []
+        results.reserveCapacity(images.count)
+
+        for image in images {
+            let result = try await analyze(imageData: image.data, options: options)
+            results.append(result)
+        }
+
+        return results
+    }
+
+    /// Format multiple analyses as a combined context string for LLM
+    ///
+    /// - Returns: An empty string for no analyses; otherwise one line per image wrapped
+    ///   between `[Image Analysis]` and `[End Image Analysis]` marker lines.
+    public func formatAnalysesAsPromptContext(
+        analyses: [(result: VisionAnalysisResult, filename: String?)]
+    ) -> String {
+        guard !analyses.isEmpty else { return "" }
+
+        var lines: [String] = ["[Image Analysis]"]
+
+        for (index, analysis) in analyses.enumerated() {
+            lines.append(analysis.result.formatAsContext(
+                imageIndex: index,
+                filename: analysis.filename
+            ))
+        }
+
+        lines.append("[End Image Analysis]")
+
+        return lines.joined(separator: "\n")
+    }
+
+    // MARK: - Private Methods
+
+    /// Decodes image data to a `CGImage`, trying AppKit first (where available) and
+    /// falling back to Core Image.
+    private func createCGImage(from data: Data) -> CGImage? {
+        #if canImport(AppKit)
+        if let nsImage = NSImage(data: data),
+           let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) {
+            return cgImage
+        }
+        #endif
+
+        // Core Image path: the only decoder without AppKit, the fallback with it.
+        guard let ciImage = CIImage(data: data) else { return nil }
+        return CIContext().createCGImage(ciImage, from: ciImage.extent)
+    }
+
+    /// Runs a single Vision request against `image`, mapping any failure to
+    /// `VisionAnalysisError.analysisFailure`.
+    ///
+    /// The request is performed synchronously and callers read `request.results`
+    /// afterwards. This deliberately avoids bridging the request's completion handler
+    /// through a checked continuation: a failing request can surface its error both via
+    /// the completion handler and via the `perform` throw, which would resume the
+    /// continuation twice and trap at runtime.
+    private func perform(_ request: VNRequest, on image: CGImage) throws {
+        let handler = VNImageRequestHandler(cgImage: image, options: [:])
+        do {
+            try handler.perform([request])
+        } catch {
+            throw VisionAnalysisError.analysisFailure(error.localizedDescription)
+        }
+    }
+
+    /// Recognizes text in `image`, returning the top candidate of every observation
+    /// joined by newlines (empty when nothing was recognized).
+    private func performTextRecognition(on image: CGImage) throws -> String {
+        let request = VNRecognizeTextRequest()
+        request.recognitionLevel = .accurate
+        request.usesLanguageCorrection = true
+
+        try perform(request, on: image)
+
+        let observations = request.results ?? []
+        return observations
+            .compactMap { $0.topCandidates(1).first?.string }
+            .joined(separator: "\n")
+    }
+
+    /// Classifies `image`, returning up to `maximumLabelCount` labels whose confidence
+    /// exceeds `minimumLabelConfidence`, with underscores replaced by spaces.
+    private func performImageClassification(on image: CGImage) throws -> [String] {
+        let request = VNClassifyImageRequest()
+
+        try perform(request, on: image)
+
+        let observations = request.results ?? []
+        return observations
+            .filter { $0.confidence > Self.minimumLabelConfidence }
+            .prefix(Self.maximumLabelCount)
+            .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }
+    }
+}