- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

import Foundation
import AppKit
import AVFoundation
import Speech
import UniformTypeIdentifiers
import AppleIntelligenceCore

@MainActor
@Observable
final class ChatViewModel {
    var messages: [ChatMessage] = []
    var inputText: String = ""
    var isLoading: Bool = false
    var errorMessage: String?

    // Image attachment state
    var pendingImages: [ImageAttachment] = []

    // Voice input/output state
    var isRecording: Bool = false
    var isSpeaking: Bool = false
    var speakingMessageId: UUID?
    var recordingLevel: Float = 0

    private var service: AppleIntelligenceService?
    private var ttsService: TextToSpeechService?
    private var sttService: SpeechToTextService?
    private var currentTask: Task<Void, Never>?

    // Audio recording - multi-language support
    private var audioEngine: AVAudioEngine?
    private var speechRecognizers: [String: SFSpeechRecognizer] = [:]
    private var activeRecognizer: SFSpeechRecognizer?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?

    // Supported speech recognition languages (Canadian English and French)
    private static let supportedLocales = ["en-CA", "fr-CA"]
    var detectedLanguage: String = "en-CA"

    // Audio playback - use direct speech synthesis for reliability
    private var speechSynthesizer: AVSpeechSynthesizer?
    private var speechDelegate: SpeechSynthesizerDelegate?

    // Maximum images per message
    private let maxImagesPerMessage = 5

    // Supported image types
    static let supportedImageTypes: [UTType] = [.png, .jpeg, .gif, .webP, .heic]

    // Recent images from Downloads and Desktop
    var recentImages: [URL] = []

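    /// Creates the model and speech services, builds a recognizer for each
    /// supported locale, and picks the default recognition language.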
    func initialize() async {
        service = await AppleIntelligenceService()
        ttsService = TextToSpeechService()
        sttService = await SpeechToTextService()

        // Initialize speech recognizers for all supported locales
        for localeId in Self.supportedLocales {
            if let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeId)) {
                speechRecognizers[localeId] = recognizer
            }
        }

        // Default to the system locale if supported; otherwise fall back to fr-CA
        // for French systems, else en-CA. Locale identifiers use underscores
        // (e.g. "en_CA"), so normalize to BCP 47 hyphens before the lookup.
        let systemLocale = Locale.current.identifier.replacingOccurrences(of: "_", with: "-")
        if speechRecognizers[systemLocale] != nil {
            detectedLanguage = systemLocale
        } else if systemLocale.starts(with: "fr") {
            detectedLanguage = "fr-CA"
        } else {
            detectedLanguage = "en-CA"
        }
        activeRecognizer = speechRecognizers[detectedLanguage]

        loadRecentImages()
    }

    // MARK: - Recent Images

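    /// Scans Downloads and Desktop for the ten most recently modified images
    /// so they can be offered for quick attachment.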
    func loadRecentImages() {
        let fileManager = FileManager.default
        let homeDir = fileManager.homeDirectoryForCurrentUser

        let folders = [
            homeDir.appendingPathComponent("Downloads"),
            homeDir.appendingPathComponent("Desktop")
        ]

        let imageExtensions = ["png", "jpg", "jpeg", "gif", "webp", "heic", "heif"]

        var allImages: [(url: URL, date: Date)] = []

        for folder in folders {
            guard let contents = try? fileManager.contentsOfDirectory(
                at: folder,
                includingPropertiesForKeys: [.contentModificationDateKey, .isRegularFileKey],
                options: [.skipsHiddenFiles]
            ) else { continue }

            for url in contents {
                let ext = url.pathExtension.lowercased()
                guard imageExtensions.contains(ext) else { continue }

                if let attributes = try? url.resourceValues(forKeys: [.contentModificationDateKey, .isRegularFileKey]),
                   attributes.isRegularFile == true,
                   let modDate = attributes.contentModificationDate {
                    allImages.append((url: url, date: modDate))
                }
            }
        }

        // Sort by date descending and keep the 10 most recent
        recentImages = allImages
            .sorted { $0.date > $1.date }
            .prefix(10)
            .map { $0.url }
    }

    func addRecentImage(_ url: URL) {
        addImage(from: url)
    }

    var isServiceAvailable: Bool {
        get async {
            await service?.isAvailable ?? false
        }
    }

    var canSend: Bool {
        !inputText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || !pendingImages.isEmpty
    }

    // MARK: - Image Handling

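    /// Loads image data from a file URL and appends it to the pending
    /// attachments, enforcing the per-message limit.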
    func addImage(from url: URL) {
        guard pendingImages.count < maxImagesPerMessage else {
            errorMessage = "Maximum \(maxImagesPerMessage) images per message"
            return
        }

        do {
            let data = try Data(contentsOf: url)
            let attachment = ImageAttachment(data: data, filename: url.lastPathComponent)
            pendingImages.append(attachment)
            errorMessage = nil
        } catch {
            errorMessage = "Failed to load image: \(error.localizedDescription)"
        }
    }

    func addImageFromPasteboard() {
        guard let image = NSPasteboard.general.readObjects(
            forClasses: [NSImage.self],
            options: nil
        )?.first as? NSImage else {
            return
        }

        guard pendingImages.count < maxImagesPerMessage else {
            errorMessage = "Maximum \(maxImagesPerMessage) images per message"
            return
        }

        // Re-encode the pasteboard image (via its TIFF representation) as PNG
        // so attachments have a stable format
        if let tiffData = image.tiffRepresentation,
           let bitmap = NSBitmapImageRep(data: tiffData),
           let pngData = bitmap.representation(using: .png, properties: [:]) {
            let attachment = ImageAttachment(data: pngData, filename: "pasted_image.png")
            pendingImages.append(attachment)
            errorMessage = nil
        }
    }

    func removePendingImage(_ attachment: ImageAttachment) {
        pendingImages.removeAll { $0.id == attachment.id }
    }

    func clearPendingImages() {
        pendingImages.removeAll()
    }

    // MARK: - Messaging

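    /// Sends the trimmed input text and any pending images, streaming the
    /// assistant's reply into a placeholder message as partial responses arrive.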
    func sendMessage() {
        let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !text.isEmpty || !pendingImages.isEmpty else { return }
        guard !isLoading else { return }

        // Capture images before clearing
        let imagesToSend = pendingImages

        // Add user message with images
        let userMessage = ChatMessage(role: .user, content: text, images: imagesToSend)
        messages.append(userMessage)
        inputText = ""
        pendingImages = []
        errorMessage = nil

        // Add placeholder for assistant response
        let assistantMessage = ChatMessage(role: .assistant, content: "", isStreaming: true)
        messages.append(assistantMessage)

        isLoading = true

        currentTask = Task {
            do {
                guard let service = service else {
                    throw AppleIntelligenceError.modelNotAvailable
                }

                // Convert attachments to service format
                let images = imagesToSend.map { attachment in
                    (data: attachment.data, filename: attachment.filename)
                }

                let stream = await service.streamComplete(
                    prompt: text,
                    temperature: nil,
                    maxTokens: nil,
                    images: images
                )

                var fullResponse = ""
                for try await (partialResponse, _) in stream {
                    fullResponse = partialResponse
                    // Update the last message (assistant's response)
                    if let index = messages.lastIndex(where: { $0.role == .assistant }) {
                        messages[index].content = fullResponse
                    }
                }

                // Mark streaming as complete
                if let index = messages.lastIndex(where: { $0.role == .assistant }) {
                    messages[index].isStreaming = false
                }

            } catch {
                errorMessage = error.localizedDescription
                // Remove the empty assistant message on error
                if let index = messages.lastIndex(where: { $0.role == .assistant && $0.content.isEmpty }) {
                    messages.remove(at: index)
                }
            }

            isLoading = false
        }
    }

    func stopGeneration() {
        currentTask?.cancel()
        currentTask = nil
        isLoading = false

        // Mark any streaming message as complete
        if let index = messages.lastIndex(where: { $0.isStreaming }) {
            messages[index].isStreaming = false
        }
    }

    func clearChat() {
        stopGeneration()
        messages.removeAll()
        errorMessage = nil
    }

    // MARK: - Voice Input (Speech-to-Text)

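    /// Starts or stops microphone capture based on the current recording state.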
    func toggleRecording() {
        if isRecording {
            stopRecording()
        } else {
            startRecording()
        }
    }

    func startRecording() {
        Task {
            // Use nonisolated helper to avoid MainActor isolation inheritance in TCC callback
            let status = await Self.requestSpeechAuthorization()

            guard status == .authorized else {
                self.errorMessage = "Speech recognition not authorized"
                return
            }
            self.beginRecording()
        }
    }

    /// Request speech recognition authorization without MainActor isolation.
    /// This prevents Swift 6 strict concurrency from asserting MainActor in the TCC callback.
    private nonisolated static func requestSpeechAuthorization() async -> SFSpeechRecognizerAuthorizationStatus {
        await withCheckedContinuation { continuation in
            SFSpeechRecognizer.requestAuthorization { status in
                continuation.resume(returning: status)
            }
        }
    }

    /// Creates the audio tap handler in a nonisolated context to avoid MainActor isolation inheritance.
    /// Audio taps run on CoreAudio's RealtimeMessenger queue, not the MainActor.
    private nonisolated static func createAudioTapHandler(
        request: SFSpeechAudioBufferRecognitionRequest,
        levelUpdater: RecordingLevelUpdater
    ) -> (AVAudioPCMBuffer, AVAudioTime) -> Void {
        return { buffer, _ in
            request.append(buffer)

            // Calculate audio level for visual feedback:
            // RMS of the samples -> decibels (20 * log10) -> clamped to 0...1,
            // treating -50 dB as silence and 0 dB as full scale.
            guard let channelData = buffer.floatChannelData else { return }
            let channelDataValue = channelData.pointee
            let channelDataValueArray = stride(from: 0, to: Int(buffer.frameLength), by: buffer.stride).map { channelDataValue[$0] }
            let rms = sqrt(channelDataValueArray.map { $0 * $0 }.reduce(0, +) / Float(buffer.frameLength))
            let avgPower = 20 * log10(rms)
            let level = max(0, min(1, (avgPower + 50) / 50))

            levelUpdater.updateLevel(level)
        }
    }

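    /// Configures the audio engine, installs the microphone tap, and starts a
    /// live recognition task on the active recognizer.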
    private func beginRecording() {
        // Try to find an available recognizer
        let recognizer = activeRecognizer ?? speechRecognizers.values.first { $0.isAvailable }
        guard let speechRecognizer = recognizer, speechRecognizer.isAvailable else {
            errorMessage = "Speech recognition not available"
            return
        }

        // Stop any existing recording
        if audioEngine != nil {
            stopRecording()
        }

        audioEngine = AVAudioEngine()
        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()

        guard let audioEngine = audioEngine,
              let recognitionRequest = recognitionRequest else {
            errorMessage = "Failed to initialize audio engine"
            return
        }

        recognitionRequest.shouldReportPartialResults = true

        // Automatically add punctuation to transcriptions (macOS 13+)
        if #available(macOS 13, *) {
            recognitionRequest.addsPunctuation = true
        }

        let inputNode = audioEngine.inputNode
        let recordingFormat = inputNode.outputFormat(forBus: 0)

        // Use nonisolated static function to create audio tap handler
        // This breaks MainActor isolation inheritance in the closure
        let levelUpdater = RecordingLevelUpdater(viewModel: self)
        let audioTapHandler = Self.createAudioTapHandler(request: recognitionRequest, levelUpdater: levelUpdater)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat, block: audioTapHandler)

        audioEngine.prepare()

        do {
            try audioEngine.start()
            isRecording = true

            // Use a sendable wrapper for recognition results
            let resultHandler = RecognitionResultHandler(viewModel: self)

            recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
                resultHandler.handleResult(result: result, error: error)
            }
        } catch {
            errorMessage = "Failed to start recording: \(error.localizedDescription)"
            cleanupRecording()
        }
    }

    /// Switch to a different language for speech recognition
    func switchLanguage(to localeId: String) {
        guard let recognizer = speechRecognizers[localeId] else { return }
        activeRecognizer = recognizer
        detectedLanguage = localeId
    }

    /// Get available languages for speech recognition
    var availableLanguages: [(id: String, name: String)] {
        speechRecognizers.keys.sorted().compactMap { localeId in
            let locale = Locale(identifier: localeId)
            let name = locale.localizedString(forIdentifier: localeId) ?? localeId
            return (id: localeId, name: name)
        }
    }

    func stopRecording() {
        recognitionRequest?.endAudio()
        cleanupRecording()
    }

    fileprivate func cleanupRecording() {
        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)
        audioEngine = nil
        recognitionRequest = nil
        recognitionTask?.cancel()
        recognitionTask = nil
        isRecording = false
        recordingLevel = 0
    }

    // MARK: - Voice Output (Text-to-Speech)

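    /// Speaks a message with AVSpeechSynthesizer, using a voice that matches the
    /// current recognition language. Invoking it again for the same message stops playback.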
    func speakMessage(_ message: ChatMessage) {
        guard !message.content.isEmpty else { return }

        // If already speaking this message, stop
        if isSpeaking && speakingMessageId == message.id {
            stopSpeaking()
            return
        }

        // Stop any current speech
        stopSpeaking()

        speakingMessageId = message.id
        isSpeaking = true

        // Create utterance
        let utterance = AVSpeechUtterance(string: message.content)
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        utterance.pitchMultiplier = 1.0
        utterance.volume = 1.0

        // Use voice matching current speech recognition language
        if detectedLanguage == "fr-CA" {
            utterance.voice = AVSpeechSynthesisVoice(language: "fr-CA")
        } else {
            utterance.voice = AVSpeechSynthesisVoice(language: "en-CA")
        }

        // Create synthesizer and delegate
        let synthesizer = AVSpeechSynthesizer()
        speechDelegate = SpeechSynthesizerDelegate { [weak self] in
            Task { @MainActor in
                self?.isSpeaking = false
                self?.speakingMessageId = nil
                self?.speechDelegate = nil
                self?.speechSynthesizer = nil
            }
        }
        synthesizer.delegate = speechDelegate
        speechSynthesizer = synthesizer

        // Speak directly
        synthesizer.speak(utterance)
    }

    func stopSpeaking() {
        speechSynthesizer?.stopSpeaking(at: .immediate)
        speechSynthesizer = nil
        speechDelegate = nil
        isSpeaking = false
        speakingMessageId = nil
    }
}

// MARK: - Speech Synthesizer Delegate

/// Forwards AVSpeechSynthesizer finish/cancel callbacks to a single completion closure.
private final class SpeechSynthesizerDelegate: NSObject, AVSpeechSynthesizerDelegate, @unchecked Sendable {
    let onFinish: () -> Void

    init(onFinish: @escaping () -> Void) {
        self.onFinish = onFinish
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        onFinish()
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        onFinish()
    }
}

// MARK: - Sendable Wrappers for Audio Callbacks

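// These wrappers are @unchecked Sendable because they hold only a weak
// reference to the MainActor-isolated view model and always hop to the
// MainActor (via Task { @MainActor in ... }) before touching it, so no
// state is accessed concurrently.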
/// Wrapper to safely update recording level from audio callback thread
private final class RecordingLevelUpdater: @unchecked Sendable {
    private weak var viewModel: ChatViewModel?

    init(viewModel: ChatViewModel) {
        self.viewModel = viewModel
    }

    func updateLevel(_ level: Float) {
        Task { @MainActor [weak viewModel] in
            viewModel?.recordingLevel = level
        }
    }
}

/// Wrapper to safely handle recognition results from Speech framework callback
private final class RecognitionResultHandler: @unchecked Sendable {
    private weak var viewModel: ChatViewModel?

    init(viewModel: ChatViewModel) {
        self.viewModel = viewModel
    }

    func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
        // Extract data before crossing actor boundary (SFSpeechRecognitionResult is not Sendable)
        let transcription = result?.bestTranscription.formattedString
        let isFinal = result?.isFinal ?? false
        let hasError = error != nil

        Task { @MainActor [weak viewModel] in
            if let transcription = transcription {
                viewModel?.inputText = transcription
            }

            if hasError || isFinal {
                viewModel?.cleanupRecording()
            }
        }
    }
}
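
// MARK: - Usage Sketch

// A minimal, hypothetical example of wiring this view model into SwiftUI.
// `VoiceChatExampleView` and its layout are illustrative assumptions, not part
// of the original implementation; they only exercise the public API above.
#if DEBUG
import SwiftUI

@MainActor
struct VoiceChatExampleView: View {
    @State private var viewModel = ChatViewModel()

    var body: some View {
        VStack {
            TextField("Message", text: $viewModel.inputText)

            HStack {
                // Microphone button toggles speech-to-text capture
                Button(viewModel.isRecording ? "Stop" : "Record") {
                    viewModel.toggleRecording()
                }

                // Level meter driven by the audio tap's RMS measurement
                ProgressView(value: Double(viewModel.recordingLevel))

                Button("Send") { viewModel.sendMessage() }
                    .disabled(!viewModel.canSend)
            }
        }
        .padding()
        .task { await viewModel.initialize() }
    }
}
#endif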