Add Text-to-Speech and Speech-to-Text features

- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 02:57:30 -05:00
parent 638656e7ca
commit b754945923
10 changed files with 3151 additions and 8 deletions
@@ -8,11 +8,24 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ
    /// The underlying AI service
    private let service: AppleIntelligenceService

+    /// Text-to-Speech service
+    private let ttsService: TextToSpeechService?
+
+    /// Speech-to-Text service
+    private let sttService: SpeechToTextService?
+
    /// Optional API key for authentication
    private let apiKey: String?

-    public init(service: AppleIntelligenceService, apiKey: String? = nil) {
+    /// Creates a provider backed by the given services.
+    ///
+    /// - Parameters:
+    ///   - service: The underlying AI service.
+    ///   - ttsService: Text-to-Speech service. When `nil` (the default), `textToSpeech`
+    ///     and `listVoices` throw an `RPCError` with code `.unavailable`.
+    ///   - sttService: Speech-to-Text service. When `nil` (the default), `transcribe`
+    ///     and `streamTranscribe` throw an `RPCError` with code `.unavailable`.
+    ///   - apiKey: Optional API key for authentication. Defaults to `nil`.
+    public init(
+        service: AppleIntelligenceService,
+        ttsService: TextToSpeechService? = nil,
+        sttService: SpeechToTextService? = nil,
+        apiKey: String? = nil
+    ) {
        self.service = service
+        self.ttsService = ttsService
+        self.sttService = sttService
        self.apiKey = apiKey
    }

@@ -139,6 +152,213 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ
        return ServerResponse(message: response)
    }

+    // MARK: - Text-to-Speech
+
+    /// Handles the unary `TextToSpeech` RPC by synthesizing audio for the request text.
+    ///
+    /// Voice settings come from the request's `voiceConfig` when present; unset fields
+    /// fall back to a speaking rate of `0.5`, a pitch multiplier of `1.0` and a volume
+    /// of `1.0`. Without a `voiceConfig`, `SpeechConfig.default` is used. An unspecified
+    /// or unrecognized output format is treated as WAV.
+    ///
+    /// - Parameters:
+    ///   - request: The request message together with its metadata.
+    ///   - context: The gRPC server context (not used by this handler).
+    /// - Returns: A response carrying the audio bytes, format, sample rate, channel
+    ///   count and duration reported by the Text-to-Speech service.
+    /// - Throws: Whatever `validateApiKey(metadata:)` throws; an `RPCError` with code
+    ///   `.unavailable` when no Text-to-Speech service is configured; an `RPCError`
+    ///   with code `.internalError` when synthesis fails with a `TextToSpeechError`.
+    ///   Any other error is propagated unchanged.
+    public func textToSpeech(
+        request: GRPCCore.ServerRequest<Appleintelligence_TextToSpeechRequest>,
+        context: GRPCCore.ServerContext
+    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_TextToSpeechResponse> {
+        try validateApiKey(metadata: request.metadata)
+
+        guard let synthesizer = ttsService else {
+            throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
+        }
+
+        let payload = request.message
+
+        // Translate the optional proto voice settings into the service's config type.
+        let speechConfig: SpeechConfig
+        if payload.hasVoiceConfig {
+            let requested = payload.voiceConfig
+            speechConfig = SpeechConfig(
+                voiceIdentifier: requested.voiceIdentifier.isEmpty ? nil : requested.voiceIdentifier,
+                speakingRate: requested.hasSpeakingRate ? requested.speakingRate : 0.5,
+                pitchMultiplier: requested.hasPitchMultiplier ? requested.pitchMultiplier : 1.0,
+                volume: requested.hasVolume ? requested.volume : 1.0
+            )
+        } else {
+            speechConfig = SpeechConfig.default
+        }
+
+        // Only an explicit MP3 request selects MP3; everything else yields WAV.
+        let requestedFormat: AudioOutputFormat
+        switch payload.outputFormat {
+        case .mp3:
+            requestedFormat = .mp3
+        case .wav, .unspecified, .UNRECOGNIZED:
+            requestedFormat = .wav
+        }
+
+        do {
+            let synthesis = try await synthesizer.synthesize(
+                text: payload.text,
+                config: speechConfig,
+                outputFormat: requestedFormat
+            )
+
+            var reply = Appleintelligence_TextToSpeechResponse()
+            reply.audioData = synthesis.audioData
+            if requestedFormat == .wav {
+                reply.format = .wav
+            } else {
+                reply.format = .mp3
+            }
+            reply.sampleRate = Int32(synthesis.sampleRate)
+            reply.channels = Int32(synthesis.channels)
+            reply.durationSeconds = synthesis.durationSeconds
+
+            return ServerResponse(message: reply)
+        } catch let failure as TextToSpeechError {
+            // Surface service-level failures as a gRPC internal error.
+            throw RPCError(code: .internalError, message: failure.description)
+        }
+    }
+
+    /// Handles the unary `ListVoices` RPC by returning the voices known to the
+    /// Text-to-Speech service.
+    ///
+    /// - Parameters:
+    ///   - request: The request message together with its metadata. When the message
+    ///     has a language code, it is passed to the service; otherwise `nil` is passed.
+    ///   - context: The gRPC server context (not used by this handler).
+    /// - Returns: A response with one `Appleintelligence_VoiceInfo` per voice returned
+    ///   by the service.
+    /// - Throws: Whatever `validateApiKey(metadata:)` throws, or an `RPCError` with
+    ///   code `.unavailable` when no Text-to-Speech service is configured.
+    public func listVoices(
+        request: GRPCCore.ServerRequest<Appleintelligence_ListVoicesRequest>,
+        context: GRPCCore.ServerContext
+    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_ListVoicesResponse> {
+        try validateApiKey(metadata: request.metadata)
+
+        guard let synthesizer = ttsService else {
+            throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
+        }
+
+        let payload = request.message
+        let availableVoices = await synthesizer.listVoices(
+            languageCode: payload.hasLanguageCode ? payload.languageCode : nil
+        )
+
+        // Copy each service voice into its proto representation.
+        var protoVoices: [Appleintelligence_VoiceInfo] = []
+        for voice in availableVoices {
+            var info = Appleintelligence_VoiceInfo()
+            info.identifier = voice.identifier
+            info.name = voice.name
+            info.language = voice.language
+            info.isPremium = voice.isPremium
+            info.gender = voice.gender
+            protoVoices.append(info)
+        }
+
+        var reply = Appleintelligence_ListVoicesResponse()
+        reply.voices = protoVoices
+        return ServerResponse(message: reply)
+    }
+
+    // MARK: - Speech-to-Text
+
+    /// Handles the unary `Transcribe` RPC by transcribing the audio in the request.
+    ///
+    /// Transcription settings come from the request's `config` when present; unset
+    /// fields fall back to no language code, punctuation enabled and timestamps
+    /// disabled. Without a `config`, `TranscriptionConfig.default` is used.
+    ///
+    /// - Parameters:
+    ///   - request: The request message together with its metadata.
+    ///   - context: The gRPC server context (not used by this handler).
+    /// - Returns: A response carrying the text, detected language, confidence and
+    ///   segments reported by the Speech-to-Text service.
+    /// - Throws: Whatever `validateApiKey(metadata:)` throws; an `RPCError` with code
+    ///   `.unavailable` when no Speech-to-Text service is configured; `.invalidArgument`
+    ///   when the message has no audio; `.internalError` when transcription fails with
+    ///   a `SpeechToTextError`. Any other error is propagated unchanged.
+    public func transcribe(
+        request: GRPCCore.ServerRequest<Appleintelligence_TranscribeRequest>,
+        context: GRPCCore.ServerContext
+    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_TranscribeResponse> {
+        try validateApiKey(metadata: request.metadata)
+
+        guard let recognizer = sttService else {
+            throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
+        }
+
+        let payload = request.message
+        if !payload.hasAudio {
+            throw RPCError(code: .invalidArgument, message: "Audio data is required")
+        }
+
+        // Translate the optional proto settings into the service's config type.
+        let transcriptionConfig: TranscriptionConfig
+        if payload.hasConfig {
+            let requested = payload.config
+            transcriptionConfig = TranscriptionConfig(
+                languageCode: requested.hasLanguageCode ? requested.languageCode : nil,
+                enablePunctuation: requested.hasEnablePunctuation ? requested.enablePunctuation : true,
+                enableTimestamps: requested.hasEnableTimestamps ? requested.enableTimestamps : false
+            )
+        } else {
+            transcriptionConfig = TranscriptionConfig.default
+        }
+
+        do {
+            let transcript = try await recognizer.transcribe(
+                audioData: payload.audio.data,
+                mimeType: payload.audio.mimeType,
+                config: transcriptionConfig
+            )
+
+            // Copy each service segment into its proto representation.
+            var protoSegments: [Appleintelligence_TranscriptionSegment] = []
+            for piece in transcript.segments {
+                var entry = Appleintelligence_TranscriptionSegment()
+                entry.text = piece.text
+                entry.startTime = piece.startTime
+                entry.endTime = piece.endTime
+                entry.confidence = piece.confidence
+                protoSegments.append(entry)
+            }
+
+            var reply = Appleintelligence_TranscribeResponse()
+            reply.text = transcript.text
+            reply.detectedLanguage = transcript.detectedLanguage
+            reply.confidence = transcript.confidence
+            reply.segments = protoSegments
+
+            return ServerResponse(message: reply)
+        } catch let failure as SpeechToTextError {
+            // Surface service-level failures as a gRPC internal error.
+            throw RPCError(code: .internalError, message: failure.description)
+        }
+    }
+
+    /// Handles the bidirectional `StreamTranscribe` RPC.
+    ///
+    /// Each inbound `config` message starts a transcription stream on the
+    /// Speech-to-Text service, and its updates are forwarded to the client. Each
+    /// inbound `audioChunk` message is fed to the service. Once the client finishes
+    /// sending, the streaming session is ended and the handler waits for the
+    /// remaining updates to be written before closing the response.
+    ///
+    /// - Parameters:
+    ///   - request: The inbound message stream together with the request metadata.
+    ///   - context: The gRPC server context (not used by this handler).
+    /// - Returns: A streaming response that emits transcription updates.
+    /// - Throws: Whatever `validateApiKey(metadata:)` throws, or an `RPCError` with
+    ///   code `.unavailable` when no Speech-to-Text service is configured. Errors
+    ///   raised while reading the request stream, feeding audio, consuming the
+    ///   update stream or writing responses are thrown from the response producer
+    ///   instead of being silently dropped.
+    public func streamTranscribe(
+        request: GRPCCore.StreamingServerRequest<Appleintelligence_StreamingTranscribeRequest>,
+        context: GRPCCore.ServerContext
+    ) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_StreamingTranscribeResponse> {
+        try validateApiKey(metadata: request.metadata)
+
+        guard let sttService = sttService else {
+            throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
+        }
+
+        return StreamingServerResponse { writer in
+            // Task that forwards transcription updates to the client. The handle is
+            // kept so the producer can wait for it (or cancel it) before returning.
+            var forwardingTask: Task<Void, any Error>?
+
+            do {
+                // Process incoming stream
+                for try await message in request.messages {
+                    switch message.request {
+                    case .config(let protoConfig):
+                        // First message should be config
+                        let config = TranscriptionConfig(
+                            languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
+                            enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
+                            enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
+                        )
+
+                        // A repeated config starts a new transcription stream; stop the
+                        // previous forwarder so that only one task writes to the client.
+                        forwardingTask?.cancel()
+
+                        // Start streaming transcription
+                        let stream = await sttService.streamTranscribe(config: config)
+                        forwardingTask = Task {
+                            for try await update in stream {
+                                var response = Appleintelligence_StreamingTranscribeResponse()
+                                response.partialText = update.partialText
+                                response.isFinal = update.isFinal
+                                if let finalText = update.finalText {
+                                    response.finalText = finalText
+                                }
+                                response.segments = update.segments.map { segment in
+                                    var protoSegment = Appleintelligence_TranscriptionSegment()
+                                    protoSegment.text = segment.text
+                                    protoSegment.startTime = segment.startTime
+                                    protoSegment.endTime = segment.endTime
+                                    protoSegment.confidence = segment.confidence
+                                    return protoSegment
+                                }
+                                try await writer.write(response)
+                            }
+                        }
+
+                    case .audioChunk(let chunk):
+                        // Feed audio chunk to service
+                        try await sttService.feedAudioChunk(chunk)
+
+                    case .none:
+                        break
+                    }
+                }
+            } catch {
+                // Reading the request or feeding audio failed: stop forwarding and
+                // release the streaming session before reporting the error.
+                forwardingTask?.cancel()
+                await sttService.endStreamingSession()
+                throw error
+            }
+
+            // End streaming session
+            await sttService.endStreamingSession()
+
+            // Wait for the forwarder so the final updates are written before the
+            // response is closed; forward cancellation of this RPC to it.
+            // NOTE(review): assumes endStreamingSession() causes the update stream to
+            // finish — confirm against SpeechToTextService, otherwise this waits
+            // until the RPC is cancelled.
+            if let pending = forwardingTask {
+                try await withTaskCancellationHandler {
+                    try await pending.value
+                } onCancel: {
+                    pending.cancel()
+                }
+            }
+
+            return [:]
+        }
+    }
+
    // MARK: - Private Helpers

    /// Validate API key if configured