import Foundation
import Vision
import CoreImage
#if canImport(AppKit)
import AppKit
#endif

/// Result of Vision framework analysis on an image
public struct VisionAnalysisResult: Sendable {
    public let textContent: String
    public let labels: [String]
    public let description: String

    public init(textContent: String = "", labels: [String] = [], description: String = "") {
        self.textContent = textContent
        self.labels = labels
        self.description = description
    }

    /// Format analysis for LLM context
    public func formatAsContext(imageIndex: Int, filename: String?) -> String {
        var parts: [String] = []
        let imageName = filename ?? "Image \(imageIndex + 1)"

        if !textContent.isEmpty {
            parts.append("Text: \"\(textContent)\"")
        }
        if !labels.isEmpty {
            parts.append("Objects: \(labels.joined(separator: ", "))")
        }

        if parts.isEmpty {
            return "\(imageName): No content detected"
        }
        return "\(imageName): \(parts.joined(separator: " | "))"
    }
}

/// Errors from Vision analysis
public enum VisionAnalysisError: Error, CustomStringConvertible, Sendable {
    case invalidImageData
    case analysisFailure(String)
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidImageData:
            return "Invalid or corrupted image data"
        case .analysisFailure(let reason):
            return "Vision analysis failed: \(reason)"
        case .unsupportedFormat:
            return "Unsupported image format"
        }
    }
}

/// Service for analyzing images using Apple's Vision framework
public actor VisionAnalysisService {
    /// Configuration for which analyses to perform
    public struct AnalysisOptions: Sendable {
        public var performOCR: Bool
        public var performClassification: Bool

        public init(performOCR: Bool = true, performClassification: Bool = true) {
            self.performOCR = performOCR
            self.performClassification = performClassification
        }

        public static let all = AnalysisOptions()
        public static let textOnly = AnalysisOptions(performOCR: true, performClassification: false)
    }

    public init() {}

    /// Analyze a single image
    public func analyze(
        imageData: Data,
        options: AnalysisOptions = .all
    ) async throws -> VisionAnalysisResult {
        guard let cgImage = createCGImage(from: imageData) else {
            throw VisionAnalysisError.invalidImageData
        }

        var textContent = ""
        var labels: [String] = []

        // Perform OCR
        if options.performOCR {
            textContent = try await performTextRecognition(on: cgImage)
        }

        // Perform image classification
        if options.performClassification {
            labels = try await performImageClassification(on: cgImage)
        }

        // Build description
        var descriptionParts: [String] = []
        if !textContent.isEmpty {
            let truncatedText = textContent.count > 200
                ? String(textContent.prefix(200)) + "..."
                : textContent
            descriptionParts.append("Contains text: \"\(truncatedText)\"")
        }
        if !labels.isEmpty {
            descriptionParts.append("Shows: \(labels.prefix(5).joined(separator: ", "))")
        }
        let description = descriptionParts.isEmpty
            ? "Image with no recognizable content"
            : descriptionParts.joined(separator: ". ")

        return VisionAnalysisResult(
            textContent: textContent,
            labels: labels,
            description: description
        )
    }

    /// Analyze multiple images
    public func analyzeMultiple(
        images: [(data: Data, filename: String?)],
        options: AnalysisOptions = .all
    ) async throws -> [VisionAnalysisResult] {
        var results: [VisionAnalysisResult] = []
        for image in images {
            let result = try await analyze(imageData: image.data, options: options)
            results.append(result)
        }
        return results
    }

    /// Format multiple analyses as a combined context string for LLM
    public func formatAnalysesAsPromptContext(
        analyses: [(result: VisionAnalysisResult, filename: String?)]
    ) -> String {
        guard !analyses.isEmpty else { return "" }

        var lines: [String] = ["[Image Analysis]"]
        for (index, analysis) in analyses.enumerated() {
            lines.append(analysis.result.formatAsContext(
                imageIndex: index,
                filename: analysis.filename
            ))
        }
        lines.append("[End Image Analysis]")
        return lines.joined(separator: "\n")
    }

    // MARK: - Private Methods

    private func createCGImage(from data: Data) -> CGImage? {
        #if canImport(AppKit)
        guard let nsImage = NSImage(data: data),
              let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            // Try CIImage as fallback
            guard let ciImage = CIImage(data: data) else { return nil }
            let context = CIContext()
            return context.createCGImage(ciImage, from: ciImage.extent)
        }
        return cgImage
        #else
        guard let ciImage = CIImage(data: data) else { return nil }
        let context = CIContext()
        return context.createCGImage(ciImage, from: ciImage.extent)
        #endif
    }

    private func performTextRecognition(on image: CGImage) async throws -> String {
        try await withCheckedThrowingContinuation { continuation in
            let request = VNRecognizeTextRequest { request, error in
                if let error = error {
                    continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
                    return
                }
                guard let observations = request.results as? [VNRecognizedTextObservation] else {
                    continuation.resume(returning: "")
                    return
                }
                let recognizedText = observations.compactMap { observation in
                    observation.topCandidates(1).first?.string
                }.joined(separator: "\n")
                continuation.resume(returning: recognizedText)
            }
            request.recognitionLevel = .accurate
            request.usesLanguageCorrection = true

            let handler = VNImageRequestHandler(cgImage: image, options: [:])
            do {
                try handler.perform([request])
            } catch {
                continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
            }
        }
    }

    private func performImageClassification(on image: CGImage) async throws -> [String] {
        try await withCheckedThrowingContinuation { continuation in
            let request = VNClassifyImageRequest { request, error in
                if let error = error {
                    continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
                    return
                }
                guard let observations = request.results as? [VNClassificationObservation] else {
                    continuation.resume(returning: [])
                    return
                }
                // Filter to high-confidence labels and take top 10
                let labels = observations
                    .filter { $0.confidence > 0.3 }
                    .prefix(10)
                    .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }
                continuation.resume(returning: Array(labels))
            }

            let handler = VNImageRequestHandler(cgImage: image, options: [:])
            do {
                try handler.perform([request])
            } catch {
                continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
            }
        }
    }
}