swift-apple-intelligence-grpc/Sources/AppleIntelligenceApp/ViewModels/ChatViewModel.swift

import Foundation
import AppKit
import AVFoundation
import Speech
import UniformTypeIdentifiers
import AppleIntelligenceCore
@MainActor
@Observable
final class ChatViewModel {
var messages: [ChatMessage] = []
var inputText: String = ""
var isLoading: Bool = false
var errorMessage: String?
// Image attachment state
var pendingImages: [ImageAttachment] = []
// Voice input/output state
var isRecording: Bool = false
var isSpeaking: Bool = false
var speakingMessageId: UUID?
var recordingLevel: Float = 0
private var service: AppleIntelligenceService?
private var ttsService: TextToSpeechService?
private var sttService: SpeechToTextService?
private var currentTask: Task<Void, Never>?
// Audio recording - multi-language support
private var audioEngine: AVAudioEngine?
private var speechRecognizers: [String: SFSpeechRecognizer] = [:]
private var activeRecognizer: SFSpeechRecognizer?
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
// Supported speech recognition languages (Canadian English and French)
private static let supportedLocales = ["en-CA", "fr-CA"]
var detectedLanguage: String = "en-CA"
// Audio playback - use direct speech synthesis for reliability
private var speechSynthesizer: AVSpeechSynthesizer?
private var speechDelegate: SpeechSynthesizerDelegate?
// Maximum images per message
private let maxImagesPerMessage = 5
// Supported image types
static let supportedImageTypes: [UTType] = [.png, .jpeg, .gif, .webP, .heic]
// Recent images from Downloads and Desktop
var recentImages: [URL] = []
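/// Creates the Apple Intelligence service client, the TTS/STT services, and one speech
/// recognizer per supported locale. Call once before sending messages or starting voice input.
///
/// Illustrative usage (a minimal sketch; the SwiftUI view hosting this view model is an
/// assumption, not part of this file):
/// ```swift
/// @State private var viewModel = ChatViewModel()
/// // ...
/// .task { await viewModel.initialize() }
/// ```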
func initialize() async {
service = await AppleIntelligenceService()
ttsService = TextToSpeechService()
sttService = await SpeechToTextService()
// Initialize speech recognizers for all supported locales
for localeId in Self.supportedLocales {
if let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeId)) {
speechRecognizers[localeId] = recognizer
}
}
// Prefer the exact system locale when supported; otherwise fall back by language prefix, defaulting to en-CA
let systemLocale = Locale.current.identifier
if speechRecognizers[systemLocale] != nil {
detectedLanguage = systemLocale
} else if systemLocale.starts(with: "fr") {
detectedLanguage = "fr-CA"
} else {
detectedLanguage = "en-CA"
}
activeRecognizer = speechRecognizers[detectedLanguage]
loadRecentImages()
}
// MARK: - Recent Images
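/// Scans the user's Downloads and Desktop folders for image files, sorts them by
/// modification date, and keeps the 10 newest in `recentImages`.
/// Assumes the app can read those folders (not sandboxed, or granted the relevant
/// permissions); an unreadable folder is silently skipped.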
func loadRecentImages() {
let fileManager = FileManager.default
let homeDir = fileManager.homeDirectoryForCurrentUser
let folders = [
homeDir.appendingPathComponent("Downloads"),
homeDir.appendingPathComponent("Desktop")
]
let imageExtensions = ["png", "jpg", "jpeg", "gif", "webp", "heic", "heif"]
var allImages: [(url: URL, date: Date)] = []
for folder in folders {
guard let contents = try? fileManager.contentsOfDirectory(
at: folder,
includingPropertiesForKeys: [.contentModificationDateKey, .isRegularFileKey],
options: [.skipsHiddenFiles]
) else { continue }
for url in contents {
let ext = url.pathExtension.lowercased()
guard imageExtensions.contains(ext) else { continue }
if let attributes = try? url.resourceValues(forKeys: [.contentModificationDateKey, .isRegularFileKey]),
attributes.isRegularFile == true,
let modDate = attributes.contentModificationDate {
allImages.append((url: url, date: modDate))
}
}
}
// Sort by modification date, newest first, and keep the 10 most recent
recentImages = allImages
.sorted { $0.date > $1.date }
.prefix(10)
.map { $0.url }
}
func addRecentImage(_ url: URL) {
addImage(from: url)
}
var isServiceAvailable: Bool {
get async {
await service?.isAvailable ?? false
}
}
var canSend: Bool {
!inputText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || !pendingImages.isEmpty
}
// MARK: - Image Handling
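/// Loads an image file into memory and queues it as a pending attachment for the
/// next message, enforcing the per-message limit.
///
/// Illustrative usage (a sketch; `panel` stands for an `NSOpenPanel` and is an
/// assumption, not part of this file):
/// ```swift
/// if panel.runModal() == .OK, let url = panel.url {
///     viewModel.addImage(from: url)
/// }
/// ```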
func addImage(from url: URL) {
guard pendingImages.count < maxImagesPerMessage else {
errorMessage = "Maximum \(maxImagesPerMessage) images per message"
return
}
do {
let data = try Data(contentsOf: url)
let attachment = ImageAttachment(data: data, filename: url.lastPathComponent)
pendingImages.append(attachment)
errorMessage = nil
} catch {
errorMessage = "Failed to load image: \(error.localizedDescription)"
}
}
func addImageFromPasteboard() {
guard let image = NSPasteboard.general.readObjects(
forClasses: [NSImage.self],
options: nil
)?.first as? NSImage else {
return
}
guard pendingImages.count < maxImagesPerMessage else {
errorMessage = "Maximum \(maxImagesPerMessage) images per message"
return
}
if let tiffData = image.tiffRepresentation,
let bitmap = NSBitmapImageRep(data: tiffData),
let pngData = bitmap.representation(using: .png, properties: [:]) {
let attachment = ImageAttachment(data: pngData, filename: "pasted_image.png")
pendingImages.append(attachment)
errorMessage = nil
}
}
func removePendingImage(_ attachment: ImageAttachment) {
pendingImages.removeAll { $0.id == attachment.id }
}
func clearPendingImages() {
pendingImages.removeAll()
}
// MARK: - Messaging
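/// Sends the trimmed `inputText` plus any pending images, appends a streaming
/// assistant placeholder, and rewrites that placeholder as partial responses arrive.
///
/// Illustrative usage from a send control (a sketch; the button wiring is an
/// assumption, not part of this file):
/// ```swift
/// Button("Send") { viewModel.sendMessage() }
///     .disabled(!viewModel.canSend || viewModel.isLoading)
/// ```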
func sendMessage() {
let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
guard !text.isEmpty || !pendingImages.isEmpty else { return }
guard !isLoading else { return }
// Capture images before clearing
let imagesToSend = pendingImages
// Add user message with images
let userMessage = ChatMessage(role: .user, content: text, images: imagesToSend)
messages.append(userMessage)
inputText = ""
pendingImages = []
errorMessage = nil
// Add placeholder for assistant response
let assistantMessage = ChatMessage(role: .assistant, content: "", isStreaming: true)
messages.append(assistantMessage)
isLoading = true
currentTask = Task {
do {
guard let service = service else {
throw AppleIntelligenceError.modelNotAvailable
}
// Convert attachments to service format
let images = imagesToSend.map { attachment in
(data: attachment.data, filename: attachment.filename)
}
let stream = await service.streamComplete(
prompt: text,
temperature: nil,
maxTokens: nil,
images: images
)
var fullResponse = ""
for try await (partialResponse, _) in stream {
fullResponse = partialResponse
// Update the last message (assistant's response)
if let index = messages.lastIndex(where: { $0.role == .assistant }) {
messages[index].content = fullResponse
}
}
// Mark streaming as complete
if let index = messages.lastIndex(where: { $0.role == .assistant }) {
messages[index].isStreaming = false
}
} catch {
errorMessage = error.localizedDescription
// Remove the empty assistant message on error
if let index = messages.lastIndex(where: { $0.role == .assistant && $0.content.isEmpty }) {
messages.remove(at: index)
}
}
isLoading = false
}
}
func stopGeneration() {
currentTask?.cancel()
currentTask = nil
isLoading = false
// Mark any streaming message as complete
if let index = messages.lastIndex(where: { $0.isStreaming }) {
messages[index].isStreaming = false
}
}
func clearChat() {
stopGeneration()
messages.removeAll()
errorMessage = nil
}
// MARK: - Voice Input (Speech-to-Text)
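/// Starts or stops microphone capture for speech-to-text. While recording,
/// `recordingLevel` is updated from the audio tap and the live transcription is
/// written into `inputText`.
///
/// Illustrative usage (a sketch; the microphone button is an assumption, not part of this file):
/// ```swift
/// Button { viewModel.toggleRecording() } label: {
///     Image(systemName: viewModel.isRecording ? "stop.circle" : "mic")
/// }
/// ```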
func toggleRecording() {
if isRecording {
stopRecording()
} else {
startRecording()
}
}
func startRecording() {
Task {
// Use nonisolated helper to avoid MainActor isolation inheritance in TCC callback
let status = await Self.requestSpeechAuthorization()
guard status == .authorized else {
self.errorMessage = "Speech recognition not authorized"
return
}
self.beginRecording()
}
}
/// Request speech recognition authorization without MainActor isolation.
/// This prevents Swift 6 strict concurrency from asserting MainActor in the TCC callback.
private nonisolated static func requestSpeechAuthorization() async -> SFSpeechRecognizerAuthorizationStatus {
await withCheckedContinuation { continuation in
SFSpeechRecognizer.requestAuthorization { status in
continuation.resume(returning: status)
}
}
}
/// Creates the audio tap handler in a nonisolated context to avoid inheriting MainActor isolation.
/// Audio taps run on CoreAudio's RealtimeMessenger queue, not on the MainActor.
private nonisolated static func createAudioTapHandler(
request: SFSpeechAudioBufferRecognitionRequest,
levelUpdater: RecordingLevelUpdater
) -> (AVAudioPCMBuffer, AVAudioTime) -> Void {
return { buffer, _ in
request.append(buffer)
// Compute a rough input level for the recording indicator:
// RMS of the first channel's samples, converted to dBFS (20 * log10),
// with a -50...0 dBFS window mapped onto 0...1
guard let channelData = buffer.floatChannelData else { return }
let channelDataValue = channelData.pointee
let channelDataValueArray = stride(from: 0, to: Int(buffer.frameLength), by: buffer.stride).map { channelDataValue[$0] }
let rms = sqrt(channelDataValueArray.map { $0 * $0 }.reduce(0, +) / Float(buffer.frameLength))
let avgPower = 20 * log10(rms)
let level = max(0, min(1, (avgPower + 50) / 50))
levelUpdater.updateLevel(level)
}
}
private func beginRecording() {
// Try to find an available recognizer
let recognizer = activeRecognizer ?? speechRecognizers.values.first { $0.isAvailable }
guard let speechRecognizer = recognizer, speechRecognizer.isAvailable else {
errorMessage = "Speech recognition not available"
return
}
// Stop any existing recording
if audioEngine != nil {
stopRecording()
}
audioEngine = AVAudioEngine()
recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
guard let audioEngine = audioEngine,
let recognitionRequest = recognitionRequest else {
errorMessage = "Failed to initialize audio engine"
return
}
recognitionRequest.shouldReportPartialResults = true
// Ask the recognizer to add punctuation to the transcription where supported
if #available(macOS 14, *) {
recognitionRequest.addsPunctuation = true
}
let inputNode = audioEngine.inputNode
let recordingFormat = inputNode.outputFormat(forBus: 0)
// Use a nonisolated static function to create the audio tap handler,
// which breaks MainActor isolation inheritance in the closure
let levelUpdater = RecordingLevelUpdater(viewModel: self)
let audioTapHandler = Self.createAudioTapHandler(request: recognitionRequest, levelUpdater: levelUpdater)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat, block: audioTapHandler)
audioEngine.prepare()
do {
try audioEngine.start()
isRecording = true
// Use a Sendable wrapper so recognition results can be forwarded back to the MainActor
let resultHandler = RecognitionResultHandler(viewModel: self)
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
resultHandler.handleResult(result: result, error: error)
}
} catch {
errorMessage = "Failed to start recording: \(error.localizedDescription)"
cleanupRecording()
}
}
/// Switch to a different language for speech recognition
func switchLanguage(to localeId: String) {
guard let recognizer = speechRecognizers[localeId] else { return }
activeRecognizer = recognizer
detectedLanguage = localeId
}
/// Get available languages for speech recognition
var availableLanguages: [(id: String, name: String)] {
speechRecognizers.keys.sorted().compactMap { localeId in
let locale = Locale(identifier: localeId)
let name = locale.localizedString(forIdentifier: localeId) ?? localeId
return (id: localeId, name: name)
}
}
func stopRecording() {
recognitionRequest?.endAudio()
cleanupRecording()
}
fileprivate func cleanupRecording() {
audioEngine?.stop()
audioEngine?.inputNode.removeTap(onBus: 0)
audioEngine = nil
recognitionRequest = nil
recognitionTask?.cancel()
recognitionTask = nil
isRecording = false
recordingLevel = 0
}
// MARK: - Voice Output (Text-to-Speech)
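/// Speaks an assistant message with `AVSpeechSynthesizer`, picking an en-CA or
/// fr-CA voice to match the current recognition language. Calling it again for
/// the message that is already playing stops playback instead.
///
/// Illustrative usage (a sketch; the per-message speak button is an assumption, not part of this file):
/// ```swift
/// Button { viewModel.speakMessage(message) } label: {
///     Image(systemName: viewModel.speakingMessageId == message.id ? "speaker.slash" : "speaker.wave.2")
/// }
/// ```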
func speakMessage(_ message: ChatMessage) {
guard !message.content.isEmpty else { return }
// If already speaking this message, stop
if isSpeaking && speakingMessageId == message.id {
stopSpeaking()
return
}
// Stop any current speech
stopSpeaking()
speakingMessageId = message.id
isSpeaking = true
// Create utterance
let utterance = AVSpeechUtterance(string: message.content)
utterance.rate = AVSpeechUtteranceDefaultSpeechRate
utterance.pitchMultiplier = 1.0
utterance.volume = 1.0
// Use voice matching current speech recognition language
if detectedLanguage == "fr-CA" {
utterance.voice = AVSpeechSynthesisVoice(language: "fr-CA")
} else {
utterance.voice = AVSpeechSynthesisVoice(language: "en-CA")
}
// Create synthesizer and delegate
let synthesizer = AVSpeechSynthesizer()
speechDelegate = SpeechSynthesizerDelegate { [weak self] in
Task { @MainActor in
self?.isSpeaking = false
self?.speakingMessageId = nil
self?.speechDelegate = nil
self?.speechSynthesizer = nil
}
}
synthesizer.delegate = speechDelegate
speechSynthesizer = synthesizer
// Speak directly
synthesizer.speak(utterance)
}
func stopSpeaking() {
speechSynthesizer?.stopSpeaking(at: .immediate)
speechSynthesizer = nil
speechDelegate = nil
isSpeaking = false
speakingMessageId = nil
}
}
// MARK: - Speech Synthesizer Delegate
private final class SpeechSynthesizerDelegate: NSObject, AVSpeechSynthesizerDelegate, @unchecked Sendable {
let onFinish: () -> Void
init(onFinish: @escaping () -> Void) {
self.onFinish = onFinish
}
func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
onFinish()
}
func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
onFinish()
}
}
// MARK: - Sendable Wrappers for Audio Callbacks
/// Wrapper to safely update recording level from audio callback thread
private final class RecordingLevelUpdater: @unchecked Sendable {
private weak var viewModel: ChatViewModel?
init(viewModel: ChatViewModel) {
self.viewModel = viewModel
}
func updateLevel(_ level: Float) {
Task { @MainActor [weak viewModel] in
viewModel?.recordingLevel = level
}
}
}
/// Wrapper to safely handle recognition results from Speech framework callback
private final class RecognitionResultHandler: @unchecked Sendable {
private weak var viewModel: ChatViewModel?
init(viewModel: ChatViewModel) {
self.viewModel = viewModel
}
func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
// Extract data before crossing actor boundary (SFSpeechRecognitionResult is not Sendable)
let transcription = result?.bestTranscription.formattedString
let isFinal = result?.isFinal ?? false
let hasError = error != nil
Task { @MainActor [weak viewModel] in
if let transcription = transcription {
viewModel?.inputText = transcription
}
if hasError || isFinal {
viewModel?.cleanupRecording()
}
}
}
}