swift-apple-intelligence-grpc/Sources/AppleIntelligenceCore/Services/VisionAnalysisService.swift
import Foundation
import Vision
import CoreImage
#if canImport(AppKit)
import AppKit
#endif

/// Result of Vision framework analysis on an image
public struct VisionAnalysisResult: Sendable {
    public let textContent: String
    public let labels: [String]
    public let description: String

    public init(textContent: String = "", labels: [String] = [], description: String = "") {
        self.textContent = textContent
        self.labels = labels
        self.description = description
    }

    /// Format analysis for LLM context
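    ///
    /// For example (illustrative values), an image with recognized text and labels
    /// produces a line like:
    /// `screenshot.png: Text: "Quarterly report" | Objects: document, table`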
    public func formatAsContext(imageIndex: Int, filename: String?) -> String {
        var parts: [String] = []
        let imageName = filename ?? "Image \(imageIndex + 1)"
        if !textContent.isEmpty {
            parts.append("Text: \"\(textContent)\"")
        }
        if !labels.isEmpty {
            parts.append("Objects: \(labels.joined(separator: ", "))")
        }
        if parts.isEmpty {
            return "\(imageName): No content detected"
        }
        return "\(imageName): \(parts.joined(separator: " | "))"
    }
}

/// Errors from Vision analysis
public enum VisionAnalysisError: Error, CustomStringConvertible, Sendable {
    case invalidImageData
    case analysisFailure(String)
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidImageData:
            return "Invalid or corrupted image data"
        case .analysisFailure(let reason):
            return "Vision analysis failed: \(reason)"
        case .unsupportedFormat:
            return "Unsupported image format"
        }
    }
}

/// Service for analyzing images using Apple's Vision framework
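///
/// Example usage (illustrative; assumes `imageData` was loaded elsewhere,
/// e.g. from a file or a drag-and-drop payload):
///
/// ```swift
/// let service = VisionAnalysisService()
/// let result = try await service.analyze(imageData: imageData)
/// print(result.description)
/// ```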
public actor VisionAnalysisService {
    /// Configuration for which analyses to perform
    public struct AnalysisOptions: Sendable {
        public var performOCR: Bool
        public var performClassification: Bool

        public init(performOCR: Bool = true, performClassification: Bool = true) {
            self.performOCR = performOCR
            self.performClassification = performClassification
        }

        public static let all = AnalysisOptions()
        public static let textOnly = AnalysisOptions(performOCR: true, performClassification: false)
    }

    public init() {}

    /// Analyze a single image
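    ///
    /// - Parameters:
    ///   - imageData: Raw encoded image bytes (e.g. PNG or JPEG data).
    ///   - options: Which analyses to run; defaults to `.all`.
    /// - Returns: A `VisionAnalysisResult` containing recognized text, classification labels,
    ///   and a short summary description.
    /// - Throws: `VisionAnalysisError.invalidImageData` if the data cannot be decoded,
    ///   or `VisionAnalysisError.analysisFailure` if a Vision request fails.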
    public func analyze(
        imageData: Data,
        options: AnalysisOptions = .all
    ) async throws -> VisionAnalysisResult {
        guard let cgImage = createCGImage(from: imageData) else {
            throw VisionAnalysisError.invalidImageData
        }

        var textContent = ""
        var labels: [String] = []

        // Perform OCR
        if options.performOCR {
            textContent = try await performTextRecognition(on: cgImage)
        }

        // Perform image classification
        if options.performClassification {
            labels = try await performImageClassification(on: cgImage)
        }

        // Build description
        var descriptionParts: [String] = []
        if !textContent.isEmpty {
            let truncatedText = textContent.count > 200
                ? String(textContent.prefix(200)) + "..."
                : textContent
            descriptionParts.append("Contains text: \"\(truncatedText)\"")
        }
        if !labels.isEmpty {
            descriptionParts.append("Shows: \(labels.prefix(5).joined(separator: ", "))")
        }
        let description = descriptionParts.isEmpty
            ? "Image with no recognizable content"
            : descriptionParts.joined(separator: ". ")

        return VisionAnalysisResult(
            textContent: textContent,
            labels: labels,
            description: description
        )
    }

    /// Analyze multiple images
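    ///
    /// Images are analyzed sequentially; results are returned in the same order
    /// as the input array.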
    public func analyzeMultiple(
        images: [(data: Data, filename: String?)],
        options: AnalysisOptions = .all
    ) async throws -> [VisionAnalysisResult] {
        var results: [VisionAnalysisResult] = []
        for image in images {
            let result = try await analyze(imageData: image.data, options: options)
            results.append(result)
        }
        return results
    }

    /// Format multiple analyses as a combined context string for LLM
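    ///
    /// The output is a block such as (illustrative values):
    ///
    /// ```
    /// [Image Analysis]
    /// screenshot.png: Text: "Quarterly report" | Objects: document, table
    /// Image 2: No content detected
    /// [End Image Analysis]
    /// ```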
    public func formatAnalysesAsPromptContext(
        analyses: [(result: VisionAnalysisResult, filename: String?)]
    ) -> String {
        guard !analyses.isEmpty else { return "" }

        var lines: [String] = ["[Image Analysis]"]
        for (index, analysis) in analyses.enumerated() {
            lines.append(analysis.result.formatAsContext(
                imageIndex: index,
                filename: analysis.filename
            ))
        }
        lines.append("[End Image Analysis]")
        return lines.joined(separator: "\n")
    }

    // MARK: - Private Methods

    private func createCGImage(from data: Data) -> CGImage? {
        #if canImport(AppKit)
        guard let nsImage = NSImage(data: data),
              let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            // Try CIImage as fallback
            guard let ciImage = CIImage(data: data) else { return nil }
            let context = CIContext()
            return context.createCGImage(ciImage, from: ciImage.extent)
        }
        return cgImage
        #else
        guard let ciImage = CIImage(data: data) else { return nil }
        let context = CIContext()
        return context.createCGImage(ciImage, from: ciImage.extent)
        #endif
    }

    private func performTextRecognition(on image: CGImage) async throws -> String {
        try await withCheckedThrowingContinuation { continuation in
            let request = VNRecognizeTextRequest { request, error in
                if let error = error {
                    continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
                    return
                }
                guard let observations = request.results as? [VNRecognizedTextObservation] else {
                    continuation.resume(returning: "")
                    return
                }
                let recognizedText = observations.compactMap { observation in
                    observation.topCandidates(1).first?.string
                }.joined(separator: "\n")
                continuation.resume(returning: recognizedText)
            }
            request.recognitionLevel = .accurate
            request.usesLanguageCorrection = true

            let handler = VNImageRequestHandler(cgImage: image, options: [:])
            do {
                try handler.perform([request])
            } catch {
                continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
            }
        }
    }

    private func performImageClassification(on image: CGImage) async throws -> [String] {
        try await withCheckedThrowingContinuation { continuation in
            let request = VNClassifyImageRequest { request, error in
                if let error = error {
                    continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
                    return
                }
                guard let observations = request.results as? [VNClassificationObservation] else {
                    continuation.resume(returning: [])
                    return
                }
                // Filter to high-confidence labels and take top 10
                let labels = observations
                    .filter { $0.confidence > 0.3 }
                    .prefix(10)
                    .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }
                continuation.resume(returning: Array(labels))
            }

            let handler = VNImageRequestHandler(cgImage: image, options: [:])
            do {
                try handler.perform([request])
            } catch {
                continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
            }
        }
    }
}