import Foundation
import AppKit
import AVFoundation
import Speech
import UniformTypeIdentifiers
import Observation
import AppleIntelligenceCore

@MainActor
@Observable
final class ChatViewModel {
    var messages: [ChatMessage] = []
    var inputText: String = ""
    var isLoading: Bool = false
    var errorMessage: String?

    // Image attachment state
    var pendingImages: [ImageAttachment] = []

    // Voice input/output state
    var isRecording: Bool = false
    var isSpeaking: Bool = false
    var speakingMessageId: UUID?
    var recordingLevel: Float = 0

    private var service: AppleIntelligenceService?
    private var ttsService: TextToSpeechService?
    private var sttService: SpeechToTextService?
    private var currentTask: Task<Void, Never>?

    // Audio recording - multi-language support
    private var audioEngine: AVAudioEngine?
    private var speechRecognizers: [String: SFSpeechRecognizer] = [:]
    private var activeRecognizer: SFSpeechRecognizer?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?

    // Supported speech recognition languages (Canadian English and French)
    private static let supportedLocales = ["en-CA", "fr-CA"]
    var detectedLanguage: String = "en-CA"

    // Audio playback - use direct speech synthesis for reliability
    private var speechSynthesizer: AVSpeechSynthesizer?
    private var speechDelegate: SpeechSynthesizerDelegate?

    // Maximum images per message
    private let maxImagesPerMessage = 5

    // Supported image types
    static let supportedImageTypes: [UTType] = [.png, .jpeg, .gif, .webP, .heic]

    // Recent images from Downloads and Desktop
    var recentImages: [URL] = []

    func initialize() async {
        service = await AppleIntelligenceService()
        ttsService = TextToSpeechService()
        sttService = await SpeechToTextService()

        // Initialize speech recognizers for all supported locales
        for localeId in Self.supportedLocales {
            if let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeId)) {
                speechRecognizers[localeId] = recognizer
            }
        }

        // Default to the system locale if supported, otherwise en-CA.
        // Locale identifiers use underscores (e.g. "en_CA"), so normalize to hyphens first.
        let systemLocale = Locale.current.identifier.replacingOccurrences(of: "_", with: "-")
        if speechRecognizers[systemLocale] != nil {
            detectedLanguage = systemLocale
        } else if systemLocale.starts(with: "fr") {
            detectedLanguage = "fr-CA"
        } else {
            detectedLanguage = "en-CA"
        }
        activeRecognizer = speechRecognizers[detectedLanguage]

        loadRecentImages()
    }

    // MARK: - Recent Images

    func loadRecentImages() {
        let fileManager = FileManager.default
        let homeDir = fileManager.homeDirectoryForCurrentUser
        let folders = [
            homeDir.appendingPathComponent("Downloads"),
            homeDir.appendingPathComponent("Desktop")
        ]
        let imageExtensions = ["png", "jpg", "jpeg", "gif", "webp", "heic", "heif"]

        var allImages: [(url: URL, date: Date)] = []

        for folder in folders {
            guard let contents = try? fileManager.contentsOfDirectory(
                at: folder,
                includingPropertiesForKeys: [.contentModificationDateKey, .isRegularFileKey],
                options: [.skipsHiddenFiles]
            ) else { continue }

            for url in contents {
                let ext = url.pathExtension.lowercased()
                guard imageExtensions.contains(ext) else { continue }

                if let attributes = try? url.resourceValues(forKeys: [.contentModificationDateKey, .isRegularFileKey]),
                   attributes.isRegularFile == true,
                   let modDate = attributes.contentModificationDate {
                    allImages.append((url: url, date: modDate))
                }
            }
        }

        // Sort by date descending and keep the 10 most recent
        recentImages = allImages
            .sorted { $0.date > $1.date }
            .prefix(10)
            .map { $0.url }
    }

    func addRecentImage(_ url: URL) {
        addImage(from: url)
    }
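    // Note (deployment assumption, not enforced here): on macOS, scanning
    // ~/Downloads and ~/Desktop triggers the system "Files and Folders" privacy
    // prompt the first time it runs. If the app is sandboxed, Downloads access
    // additionally requires the com.apple.security.files.downloads.read-only
    // (or read-write) entitlement, and Desktop access is generally only
    // available through user-selected files or security-scoped bookmarks.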
    var isServiceAvailable: Bool {
        get async {
            await service?.isAvailable ?? false
        }
    }

    var canSend: Bool {
        !inputText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || !pendingImages.isEmpty
    }

    // MARK: - Image Handling

    func addImage(from url: URL) {
        guard pendingImages.count < maxImagesPerMessage else {
            errorMessage = "Maximum \(maxImagesPerMessage) images per message"
            return
        }

        do {
            let data = try Data(contentsOf: url)
            let attachment = ImageAttachment(data: data, filename: url.lastPathComponent)
            pendingImages.append(attachment)
            errorMessage = nil
        } catch {
            errorMessage = "Failed to load image: \(error.localizedDescription)"
        }
    }

    func addImageFromPasteboard() {
        guard let image = NSPasteboard.general.readObjects(
            forClasses: [NSImage.self],
            options: nil
        )?.first as? NSImage else { return }

        guard pendingImages.count < maxImagesPerMessage else {
            errorMessage = "Maximum \(maxImagesPerMessage) images per message"
            return
        }

        if let tiffData = image.tiffRepresentation,
           let bitmap = NSBitmapImageRep(data: tiffData),
           let pngData = bitmap.representation(using: .png, properties: [:]) {
            let attachment = ImageAttachment(data: pngData, filename: "pasted_image.png")
            pendingImages.append(attachment)
            errorMessage = nil
        }
    }

    func removePendingImage(_ attachment: ImageAttachment) {
        pendingImages.removeAll { $0.id == attachment.id }
    }

    func clearPendingImages() {
        pendingImages.removeAll()
    }

    // MARK: - Messaging

    func sendMessage() {
        let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !text.isEmpty || !pendingImages.isEmpty else { return }
        guard !isLoading else { return }

        // Capture images before clearing
        let imagesToSend = pendingImages

        // Add user message with images
        let userMessage = ChatMessage(role: .user, content: text, images: imagesToSend)
        messages.append(userMessage)
        inputText = ""
        pendingImages = []
        errorMessage = nil

        // Add placeholder for assistant response
        let assistantMessage = ChatMessage(role: .assistant, content: "", isStreaming: true)
        messages.append(assistantMessage)

        isLoading = true

        currentTask = Task {
            do {
                guard let service = service else {
                    throw AppleIntelligenceError.modelNotAvailable
                }

                // Convert attachments to service format
                let images = imagesToSend.map { attachment in
                    (data: attachment.data, filename: attachment.filename)
                }

                let stream = await service.streamComplete(
                    prompt: text,
                    temperature: nil,
                    maxTokens: nil,
                    images: images
                )

                var fullResponse = ""
                for try await (partialResponse, _) in stream {
                    fullResponse = partialResponse
                    // Update the last message (assistant's response)
                    if let index = messages.lastIndex(where: { $0.role == .assistant }) {
                        messages[index].content = fullResponse
                    }
                }

                // Mark streaming as complete
                if let index = messages.lastIndex(where: { $0.role == .assistant }) {
                    messages[index].isStreaming = false
                }
            } catch {
                errorMessage = error.localizedDescription
                // Remove the empty assistant message on error
                if let index = messages.lastIndex(where: { $0.role == .assistant && $0.content.isEmpty }) {
                    messages.remove(at: index)
                }
            }

            isLoading = false
        }
    }

    func stopGeneration() {
        currentTask?.cancel()
        currentTask = nil
        isLoading = false

        // Mark any streaming message as complete
        if let index = messages.lastIndex(where: { $0.isStreaming }) {
            messages[index].isStreaming = false
        }
    }

    func clearChat() {
        stopGeneration()
        messages.removeAll()
        errorMessage = nil
    }

    // MARK: - Voice Input (Speech-to-Text)

    func toggleRecording() {
        if isRecording {
            stopRecording()
        } else {
            startRecording()
        }
    }
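    // Note: speech recognition and microphone capture require
    // NSSpeechRecognitionUsageDescription and NSMicrophoneUsageDescription
    // entries in Info.plist; a sandboxed app also needs the
    // com.apple.security.device.audio-input entitlement. Without the usage
    // descriptions, the authorization request below fails or the process is
    // terminated when the protected resource is first accessed.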
    func startRecording() {
        Task {
            // Use nonisolated helper to avoid MainActor isolation inheritance in TCC callback
            let status = await Self.requestSpeechAuthorization()
            guard status == .authorized else {
                self.errorMessage = "Speech recognition not authorized"
                return
            }
            self.beginRecording()
        }
    }

    /// Request speech recognition authorization without MainActor isolation.
    /// This prevents Swift 6 strict concurrency from asserting MainActor in the TCC callback.
    private nonisolated static func requestSpeechAuthorization() async -> SFSpeechRecognizerAuthorizationStatus {
        await withCheckedContinuation { continuation in
            SFSpeechRecognizer.requestAuthorization { status in
                continuation.resume(returning: status)
            }
        }
    }

    /// Creates audio tap handler in nonisolated context to avoid MainActor isolation inheritance.
    /// Audio taps run on CoreAudio's RealtimeMessenger queue, not MainActor.
    private nonisolated static func createAudioTapHandler(
        request: SFSpeechAudioBufferRecognitionRequest,
        levelUpdater: RecordingLevelUpdater
    ) -> (AVAudioPCMBuffer, AVAudioTime) -> Void {
        return { buffer, _ in
            request.append(buffer)

            // Calculate audio level for visual feedback
            guard let channelData = buffer.floatChannelData else { return }
            let channelDataValue = channelData.pointee
            let channelDataValueArray = stride(from: 0, to: Int(buffer.frameLength), by: buffer.stride).map {
                channelDataValue[$0]
            }
            let rms = sqrt(channelDataValueArray.map { $0 * $0 }.reduce(0, +) / Float(buffer.frameLength))
            let avgPower = 20 * log10(rms)
            let level = max(0, min(1, (avgPower + 50) / 50))
            levelUpdater.updateLevel(level)
        }
    }

    private func beginRecording() {
        // Try to find an available recognizer
        let recognizer = activeRecognizer ?? speechRecognizers.values.first { $0.isAvailable }
        guard let speechRecognizer = recognizer, speechRecognizer.isAvailable else {
            errorMessage = "Speech recognition not available"
            return
        }

        // Stop any existing recording
        if audioEngine != nil {
            stopRecording()
        }

        audioEngine = AVAudioEngine()
        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()

        guard let audioEngine = audioEngine, let recognitionRequest = recognitionRequest else {
            errorMessage = "Failed to initialize audio engine"
            return
        }

        recognitionRequest.shouldReportPartialResults = true

        // Add automatic punctuation to transcriptions when available
        if #available(macOS 14, *) {
            recognitionRequest.addsPunctuation = true
        }

        let inputNode = audioEngine.inputNode
        let recordingFormat = inputNode.outputFormat(forBus: 0)

        // Use nonisolated static function to create audio tap handler.
        // This breaks MainActor isolation inheritance in the closure.
        let levelUpdater = RecordingLevelUpdater(viewModel: self)
        let audioTapHandler = Self.createAudioTapHandler(request: recognitionRequest, levelUpdater: levelUpdater)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat, block: audioTapHandler)

        audioEngine.prepare()

        do {
            try audioEngine.start()
            isRecording = true

            // Use a sendable wrapper for recognition results
            let resultHandler = RecognitionResultHandler(viewModel: self)
            recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
                resultHandler.handleResult(result: result, error: error)
            }
        } catch {
            errorMessage = "Failed to start recording: \(error.localizedDescription)"
            cleanupRecording()
        }
    }

    /// Switch to a different language for speech recognition
    func switchLanguage(to localeId: String) {
        guard let recognizer = speechRecognizers[localeId] else { return }
        activeRecognizer = recognizer
        detectedLanguage = localeId
    }
    /// Get available languages for speech recognition
    var availableLanguages: [(id: String, name: String)] {
        speechRecognizers.keys.sorted().map { localeId in
            let locale = Locale(identifier: localeId)
            let name = locale.localizedString(forIdentifier: localeId) ?? localeId
            return (id: localeId, name: name)
        }
    }

    func stopRecording() {
        recognitionRequest?.endAudio()
        cleanupRecording()
    }

    fileprivate func cleanupRecording() {
        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)
        audioEngine = nil
        recognitionRequest = nil
        recognitionTask?.cancel()
        recognitionTask = nil
        isRecording = false
        recordingLevel = 0
    }

    // MARK: - Voice Output (Text-to-Speech)

    func speakMessage(_ message: ChatMessage) {
        guard !message.content.isEmpty else { return }

        // If already speaking this message, stop
        if isSpeaking && speakingMessageId == message.id {
            stopSpeaking()
            return
        }

        // Stop any current speech
        stopSpeaking()

        speakingMessageId = message.id
        isSpeaking = true

        // Create utterance
        let utterance = AVSpeechUtterance(string: message.content)
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        utterance.pitchMultiplier = 1.0
        utterance.volume = 1.0

        // Use voice matching current speech recognition language
        if detectedLanguage == "fr-CA" {
            utterance.voice = AVSpeechSynthesisVoice(language: "fr-CA")
        } else {
            utterance.voice = AVSpeechSynthesisVoice(language: "en-CA")
        }

        // Create synthesizer and delegate
        let synthesizer = AVSpeechSynthesizer()
        speechDelegate = SpeechSynthesizerDelegate { [weak self] in
            Task { @MainActor in
                self?.isSpeaking = false
                self?.speakingMessageId = nil
                self?.speechDelegate = nil
                self?.speechSynthesizer = nil
            }
        }
        synthesizer.delegate = speechDelegate
        speechSynthesizer = synthesizer

        // Speak directly
        synthesizer.speak(utterance)
    }

    func stopSpeaking() {
        speechSynthesizer?.stopSpeaking(at: .immediate)
        speechSynthesizer = nil
        speechDelegate = nil
        isSpeaking = false
        speakingMessageId = nil
    }
}

// MARK: - Speech Synthesizer Delegate

private final class SpeechSynthesizerDelegate: NSObject, AVSpeechSynthesizerDelegate, @unchecked Sendable {
    let onFinish: () -> Void

    init(onFinish: @escaping () -> Void) {
        self.onFinish = onFinish
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        onFinish()
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        onFinish()
    }
}

// MARK: - Sendable Wrappers for Audio Callbacks

/// Wrapper to safely update recording level from audio callback thread
private final class RecordingLevelUpdater: @unchecked Sendable {
    private weak var viewModel: ChatViewModel?

    init(viewModel: ChatViewModel) {
        self.viewModel = viewModel
    }

    func updateLevel(_ level: Float) {
        Task { @MainActor [weak viewModel] in
            viewModel?.recordingLevel = level
        }
    }
}

/// Wrapper to safely handle recognition results from Speech framework callback
private final class RecognitionResultHandler: @unchecked Sendable {
    private weak var viewModel: ChatViewModel?

    init(viewModel: ChatViewModel) {
        self.viewModel = viewModel
    }

    func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
        // Extract data before crossing actor boundary (SFSpeechRecognitionResult is not Sendable)
        let transcription = result?.bestTranscription.formattedString
        let isFinal = result?.isFinal ?? false
        let hasError = error != nil

        Task { @MainActor [weak viewModel] in
            if let transcription = transcription {
                viewModel?.inputText = transcription
            }
            if hasError || isFinal {
                viewModel?.cleanupRecording()
            }
        }
    }
}
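
// MARK: - Usage Sketch (illustrative)

// A minimal sketch of how a SwiftUI view might drive ChatViewModel. It is not
// part of the original API surface: the view name and layout are hypothetical,
// and it only relies on members defined above (`messages`, `inputText`,
// `canSend`, `isLoading`, `isRecording`, `sendMessage()`, `toggleRecording()`,
// `initialize()`), plus `ChatMessage.id` and `ChatMessage.content`.
#if DEBUG
import SwiftUI

@MainActor
private struct ChatSketchView: View {
    @State private var viewModel = ChatViewModel()

    var body: some View {
        VStack(spacing: 8) {
            // Render the transcript; ChatMessage.id is used as the row identity.
            List(viewModel.messages, id: \.id) { message in
                Text(message.content)
            }
            HStack {
                TextField("Message", text: $viewModel.inputText)
                    .onSubmit { viewModel.sendMessage() }
                Button(viewModel.isRecording ? "Stop" : "Record") {
                    viewModel.toggleRecording()
                }
                Button("Send") {
                    viewModel.sendMessage()
                }
                .disabled(!viewModel.canSend || viewModel.isLoading)
            }
        }
        .padding()
        // Set up services and speech recognizers before first use.
        .task { await viewModel.initialize() }
    }
}
#endif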