- Add Vision framework integration for image analysis (OCR, classification)
- Add image attachment support in chat UI with drag & drop
- Add recent images sidebar from Downloads/Desktop
- Add copy to clipboard button for assistant responses
- Add gRPC reflection service with toggle in settings
- Create proper .proto file and generate Swift code
- Add server restart when toggling reflection setting
- Fix port number formatting in settings (remove comma grouping)
- Update gRPC dependencies to v2.x

import Foundation
import Vision
import CoreImage

#if canImport(AppKit)
import AppKit
#endif

/// Result of Vision framework analysis on an image
public struct VisionAnalysisResult: Sendable {
    public let textContent: String
    public let labels: [String]
    public let description: String

    public init(textContent: String = "", labels: [String] = [], description: String = "") {
        self.textContent = textContent
        self.labels = labels
        self.description = description
    }

    /// Format analysis for LLM context
    public func formatAsContext(imageIndex: Int, filename: String?) -> String {
        var parts: [String] = []

        let imageName = filename ?? "Image \(imageIndex + 1)"

        if !textContent.isEmpty {
            parts.append("Text: \"\(textContent)\"")
        }

        if !labels.isEmpty {
            parts.append("Objects: \(labels.joined(separator: ", "))")
        }

        if parts.isEmpty {
            return "\(imageName): No content detected"
        }

        return "\(imageName): \(parts.joined(separator: " | "))"
    }
}
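
// Example (hypothetical values; a minimal sketch of the single-image context line):
//
//     VisionAnalysisResult(textContent: "Invoice #1042", labels: ["document", "paper"])
//         .formatAsContext(imageIndex: 0, filename: "scan.png")
//     // -> scan.png: Text: "Invoice #1042" | Objects: document, paper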

/// Errors from Vision analysis
public enum VisionAnalysisError: Error, CustomStringConvertible, Sendable {
    case invalidImageData
    case analysisFailure(String)
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidImageData:
            return "Invalid or corrupted image data"
        case .analysisFailure(let reason):
            return "Vision analysis failed: \(reason)"
        case .unsupportedFormat:
            return "Unsupported image format"
        }
    }
}

/// Service for analyzing images using Apple's Vision framework
public actor VisionAnalysisService {

    /// Configuration for which analyses to perform
    public struct AnalysisOptions: Sendable {
        public var performOCR: Bool
        public var performClassification: Bool

        public init(performOCR: Bool = true, performClassification: Bool = true) {
            self.performOCR = performOCR
            self.performClassification = performClassification
        }

        public static let all = AnalysisOptions()
        public static let textOnly = AnalysisOptions(performOCR: true, performClassification: false)
    }
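
    // Example (hypothetical `service` and `data`; a minimal sketch of using a preset):
    //
    //     let result = try await service.analyze(imageData: data, options: .textOnly)  // OCR only, no classification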

    public init() {}

    /// Analyze a single image
    public func analyze(
        imageData: Data,
        options: AnalysisOptions = .all
    ) async throws -> VisionAnalysisResult {
        guard let cgImage = createCGImage(from: imageData) else {
            throw VisionAnalysisError.invalidImageData
        }

        var textContent = ""
        var labels: [String] = []

        // Perform OCR
        if options.performOCR {
            textContent = try await performTextRecognition(on: cgImage)
        }

        // Perform image classification
        if options.performClassification {
            labels = try await performImageClassification(on: cgImage)
        }

        // Build description
        var descriptionParts: [String] = []
        if !textContent.isEmpty {
            let truncatedText = textContent.count > 200
                ? String(textContent.prefix(200)) + "..."
                : textContent
            descriptionParts.append("Contains text: \"\(truncatedText)\"")
        }
        if !labels.isEmpty {
            descriptionParts.append("Shows: \(labels.prefix(5).joined(separator: ", "))")
        }

        let description = descriptionParts.isEmpty
            ? "Image with no recognizable content"
            : descriptionParts.joined(separator: ". ")

        return VisionAnalysisResult(
            textContent: textContent,
            labels: labels,
            description: description
        )
    }

    /// Analyze multiple images
    public func analyzeMultiple(
        images: [(data: Data, filename: String?)],
        options: AnalysisOptions = .all
    ) async throws -> [VisionAnalysisResult] {
        var results: [VisionAnalysisResult] = []

        for image in images {
            let result = try await analyze(imageData: image.data, options: options)
            results.append(result)
        }

        return results
    }

    /// Format multiple analyses as a combined context string for LLM
    public func formatAnalysesAsPromptContext(
        analyses: [(result: VisionAnalysisResult, filename: String?)]
    ) -> String {
        guard !analyses.isEmpty else { return "" }

        var lines: [String] = ["[Image Analysis]"]

        for (index, analysis) in analyses.enumerated() {
            lines.append(analysis.result.formatAsContext(
                imageIndex: index,
                filename: analysis.filename
            ))
        }

        lines.append("[End Image Analysis]")

        return lines.joined(separator: "\n")
    }
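
    // Example (hypothetical analyses; a minimal sketch of the combined block):
    //
    //     [Image Analysis]
    //     scan.png: Text: "Invoice #1042" | Objects: document, paper
    //     Image 2: No content detected
    //     [End Image Analysis]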

    // MARK: - Private Methods

    private func createCGImage(from data: Data) -> CGImage? {
        #if canImport(AppKit)
        guard let nsImage = NSImage(data: data),
              let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            // Try CIImage as fallback
            guard let ciImage = CIImage(data: data) else { return nil }
            let context = CIContext()
            return context.createCGImage(ciImage, from: ciImage.extent)
        }
        return cgImage
        #else
        guard let ciImage = CIImage(data: data) else { return nil }
        let context = CIContext()
        return context.createCGImage(ciImage, from: ciImage.extent)
        #endif
    }

    private func performTextRecognition(on image: CGImage) async throws -> String {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.usesLanguageCorrection = true

        let handler = VNImageRequestHandler(cgImage: image, options: [:])
        // VNImageRequestHandler.perform(_:) runs synchronously, so we read
        // request.results after it returns rather than wrapping the request's
        // completion handler in a continuation. Vision may invoke the
        // completion handler with an error *and* throw from perform(_:),
        // which would resume a continuation twice.
        do {
            try handler.perform([request])
        } catch {
            throw VisionAnalysisError.analysisFailure(error.localizedDescription)
        }

        let observations = request.results ?? []
        return observations
            .compactMap { $0.topCandidates(1).first?.string }
            .joined(separator: "\n")
    }

    private func performImageClassification(on image: CGImage) async throws -> [String] {
        let request = VNClassifyImageRequest()

        let handler = VNImageRequestHandler(cgImage: image, options: [:])
        // Same synchronous pattern as performTextRecognition(on:) to avoid
        // double-resuming a continuation when perform(_:) throws.
        do {
            try handler.perform([request])
        } catch {
            throw VisionAnalysisError.analysisFailure(error.localizedDescription)
        }

        // Filter to high-confidence labels and take top 10
        let labels = (request.results ?? [])
            .filter { $0.confidence > 0.3 }
            .prefix(10)
            .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }

        return Array(labels)
    }
}
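
// Usage sketch (`scanData`/`photoData` are hypothetical image bytes, not part
// of this file):
//
//     let service = VisionAnalysisService()
//     let images: [(data: Data, filename: String?)] = [
//         (data: scanData, filename: "scan.png"),
//         (data: photoData, filename: nil)
//     ]
//     let results = try await service.analyzeMultiple(images: images)
//     let pairs = zip(results, images).map { (result: $0, filename: $1.filename) }
//     let context = await service.formatAnalysesAsPromptContext(analyses: pairs)
//     // Prepend `context` to the user prompt before sending it to the model.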