Add Text-to-Speech and Speech-to-Text features

- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Mathias Beaulieu-Duncan
2025-12-31 02:57:30 -05:00
parent 638656e7ca
commit b754945923
10 changed files with 3151 additions and 8 deletions
@@ -8,11 +8,24 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ
/// The underlying AI service
private let service: AppleIntelligenceService
/// Text-to-Speech service.
/// `nil` when no TTS backend was injected; `textToSpeech` and `listVoices` then fail with `.unavailable`.
private let ttsService: TextToSpeechService?
/// Speech-to-Text service.
/// `nil` when no STT backend was injected; `transcribe` and `streamTranscribe` then fail with `.unavailable`.
private let sttService: SpeechToTextService?
/// Optional API key for authentication.
/// NOTE(review): presumably `nil` disables the check in `validateApiKey(metadata:)` — confirm against that helper.
private let apiKey: String?
public init(service: AppleIntelligenceService, apiKey: String? = nil) {
/// Creates a provider backed by the given services.
///
/// - Parameters:
///   - service: The underlying AI service.
///   - ttsService: Text-to-Speech service; defaults to `nil`.
///   - sttService: Speech-to-Text service; defaults to `nil`.
///   - apiKey: Optional API key for authentication; defaults to `nil`.
public init(
    service: AppleIntelligenceService,
    ttsService: TextToSpeechService? = nil,
    sttService: SpeechToTextService? = nil,
    apiKey: String? = nil
) {
    // Plain stored-property assignments; their order is irrelevant.
    self.apiKey = apiKey
    self.sttService = sttService
    self.ttsService = ttsService
    self.service = service
}
@@ -139,6 +152,213 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ
return ServerResponse(message: response)
}
// MARK: - Text-to-Speech
/// Handles the unary Text-to-Speech RPC: synthesizes `request.message.text` into audio.
///
/// When the request carries a voice config, unset fields fall back to a speaking rate of
/// `0.5`, a pitch multiplier of `1.0`, and a volume of `1.0`; an empty voice identifier is
/// passed on as `nil`. Without a voice config, `SpeechConfig.default` is used. An unspecified
/// or unrecognized output format is treated as WAV.
///
/// - Parameters:
///   - request: The incoming request (message plus metadata used for API-key validation).
///   - context: The gRPC server context (not used by this handler).
/// - Returns: A response carrying the audio bytes, format, sample rate, channel count and duration.
/// - Throws: Whatever `validateApiKey(metadata:)` throws; `RPCError` with code `.unavailable`
///   when no TTS service was injected; `RPCError` with code `.internalError` when synthesis
///   fails with a `TextToSpeechError`. Any other error propagates unchanged.
public func textToSpeech(
    request: GRPCCore.ServerRequest<Appleintelligence_TextToSpeechRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TextToSpeechResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let synthesizer = ttsService else {
        throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
    }

    let payload = request.message

    // Translate the proto voice settings into the service-level configuration.
    let speechConfig: SpeechConfig
    if payload.hasVoiceConfig {
        let voice = payload.voiceConfig
        let identifier = voice.voiceIdentifier.isEmpty ? nil : voice.voiceIdentifier
        let rate = voice.hasSpeakingRate ? voice.speakingRate : 0.5
        let pitch = voice.hasPitchMultiplier ? voice.pitchMultiplier : 1.0
        let loudness = voice.hasVolume ? voice.volume : 1.0
        speechConfig = SpeechConfig(
            voiceIdentifier: identifier,
            speakingRate: rate,
            pitchMultiplier: pitch,
            volume: loudness
        )
    } else {
        speechConfig = SpeechConfig.default
    }

    // Translate the proto audio format; everything except MP3 maps to WAV.
    let targetFormat: AudioOutputFormat
    switch payload.outputFormat {
    case .mp3:
        targetFormat = .mp3
    case .wav, .unspecified, .UNRECOGNIZED:
        targetFormat = .wav
    }

    do {
        let synthesized = try await synthesizer.synthesize(
            text: payload.text,
            config: speechConfig,
            outputFormat: targetFormat
        )

        var reply = Appleintelligence_TextToSpeechResponse()
        reply.audioData = synthesized.audioData
        if targetFormat == .wav {
            reply.format = .wav
        } else {
            reply.format = .mp3
        }
        reply.sampleRate = Int32(synthesized.sampleRate)
        reply.channels = Int32(synthesized.channels)
        reply.durationSeconds = synthesized.durationSeconds
        return ServerResponse(message: reply)
    } catch let ttsError as TextToSpeechError {
        throw RPCError(code: .internalError, message: ttsError.description)
    }
}
/// Handles the unary ListVoices RPC: returns the voices reported by the Text-to-Speech service.
///
/// The request's language code is forwarded as a filter only when the field is set;
/// otherwise `nil` is passed to the service.
///
/// - Parameters:
///   - request: The incoming request (message plus metadata used for API-key validation).
///   - context: The gRPC server context (not used by this handler).
/// - Returns: A response listing each voice's identifier, name, language, premium flag and gender.
/// - Throws: Whatever `validateApiKey(metadata:)` throws; `RPCError` with code `.unavailable`
///   when no TTS service was injected.
public func listVoices(
    request: GRPCCore.ServerRequest<Appleintelligence_ListVoicesRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_ListVoicesResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let synthesizer = ttsService else {
        throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
    }

    let payload = request.message
    let languageFilter = payload.hasLanguageCode ? payload.languageCode : nil
    let availableVoices = await synthesizer.listVoices(languageCode: languageFilter)

    // Convert each service-level voice into its proto representation, preserving order.
    var protoVoices: [Appleintelligence_VoiceInfo] = []
    for voice in availableVoices {
        var entry = Appleintelligence_VoiceInfo()
        entry.identifier = voice.identifier
        entry.name = voice.name
        entry.language = voice.language
        entry.isPremium = voice.isPremium
        entry.gender = voice.gender
        protoVoices.append(entry)
    }

    var reply = Appleintelligence_ListVoicesResponse()
    reply.voices = protoVoices
    return ServerResponse(message: reply)
}
// MARK: - Speech-to-Text
/// Handles the unary Transcribe RPC: transcribes a complete audio payload to text.
///
/// When the request carries a config, unset fields fall back to no language code (`nil`),
/// punctuation enabled, and timestamps disabled. Without a config,
/// `TranscriptionConfig.default` is used.
///
/// - Parameters:
///   - request: The incoming request (message plus metadata used for API-key validation).
///   - context: The gRPC server context (not used by this handler).
/// - Returns: A response with the transcribed text, detected language, confidence and segments.
/// - Throws: Whatever `validateApiKey(metadata:)` throws; `RPCError` with code `.unavailable`
///   when no STT service was injected; `.invalidArgument` when the request has no audio;
///   `.internalError` when transcription fails with a `SpeechToTextError`. Any other error
///   propagates unchanged.
public func transcribe(
    request: GRPCCore.ServerRequest<Appleintelligence_TranscribeRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TranscribeResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let recognizer = sttService else {
        throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
    }

    let payload = request.message
    guard payload.hasAudio else {
        throw RPCError(code: .invalidArgument, message: "Audio data is required")
    }

    // Translate the proto settings into the service-level configuration.
    let transcriptionConfig: TranscriptionConfig
    if payload.hasConfig {
        let settings = payload.config
        let language = settings.hasLanguageCode ? settings.languageCode : nil
        let punctuate = settings.hasEnablePunctuation ? settings.enablePunctuation : true
        let timestamps = settings.hasEnableTimestamps ? settings.enableTimestamps : false
        transcriptionConfig = TranscriptionConfig(
            languageCode: language,
            enablePunctuation: punctuate,
            enableTimestamps: timestamps
        )
    } else {
        transcriptionConfig = TranscriptionConfig.default
    }

    do {
        let audio = payload.audio
        let transcript = try await recognizer.transcribe(
            audioData: audio.data,
            mimeType: audio.mimeType,
            config: transcriptionConfig
        )

        // Convert each service-level segment into its proto representation, preserving order.
        var protoSegments: [Appleintelligence_TranscriptionSegment] = []
        for segment in transcript.segments {
            var entry = Appleintelligence_TranscriptionSegment()
            entry.text = segment.text
            entry.startTime = segment.startTime
            entry.endTime = segment.endTime
            entry.confidence = segment.confidence
            protoSegments.append(entry)
        }

        var reply = Appleintelligence_TranscribeResponse()
        reply.text = transcript.text
        reply.detectedLanguage = transcript.detectedLanguage
        reply.confidence = transcript.confidence
        reply.segments = protoSegments
        return ServerResponse(message: reply)
    } catch let sttError as SpeechToTextError {
        throw RPCError(code: .internalError, message: sttError.description)
    }
}
/// Handles the bidirectional streaming Speech-to-Text RPC.
///
/// A `config` message starts a streaming transcription on the Speech-to-Text service and
/// begins forwarding its updates to the client; `audioChunk` messages are fed to the service.
/// Unset config fields fall back to no language code (`nil`), punctuation enabled, and
/// timestamps disabled. When the inbound stream ends, the streaming session is ended and the
/// response only completes after all pending updates have been written to the client.
///
/// - Parameters:
///   - request: The inbound message stream plus metadata used for API-key validation.
///   - context: The gRPC server context (not used by this handler).
/// - Returns: A streaming response that forwards transcription updates and finishes with
///   empty trailing metadata.
/// - Throws: Whatever `validateApiKey(metadata:)` throws; `RPCError` with code `.unavailable`
///   when no STT service was injected. While streaming, a `SpeechToTextError` from the update
///   stream is reported as `RPCError` with code `.internalError`; other errors from the inbound
///   stream, `feedAudioChunk`, or the writer fail the response unchanged.
public func streamTranscribe(
    request: GRPCCore.StreamingServerRequest<Appleintelligence_StreamingTranscribeRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_StreamingTranscribeResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let sttService = sttService else {
        throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
    }

    return StreamingServerResponse { writer in
        // Forwarding runs as a child task of this group rather than an unstructured `Task`,
        // so the producer cannot return (closing the response) while updates are still being
        // written, and the child is cancelled automatically if the RPC fails or is cancelled.
        try await withThrowingTaskGroup(of: Void.self) { group in
            do {
                // Process incoming stream
                for try await message in request.messages {
                    switch message.request {
                    case .config(let protoConfig):
                        // First message should be config
                        let config = TranscriptionConfig(
                            languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
                            enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
                            enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
                        )

                        // Start streaming transcription and forward its updates to the client.
                        let stream = await sttService.streamTranscribe(config: config)
                        group.addTask {
                            do {
                                for try await update in stream {
                                    var response = Appleintelligence_StreamingTranscribeResponse()
                                    response.partialText = update.partialText
                                    response.isFinal = update.isFinal
                                    if let finalText = update.finalText {
                                        response.finalText = finalText
                                    }
                                    response.segments = update.segments.map { segment in
                                        var protoSegment = Appleintelligence_TranscriptionSegment()
                                        protoSegment.text = segment.text
                                        protoSegment.startTime = segment.startTime
                                        protoSegment.endTime = segment.endTime
                                        protoSegment.confidence = segment.confidence
                                        return protoSegment
                                    }
                                    try await writer.write(response)
                                }
                            } catch let error as SpeechToTextError {
                                // Same mapping as the unary `transcribe` handler.
                                throw RPCError(code: .internalError, message: error.description)
                            }
                        }

                    case .audioChunk(let chunk):
                        // Feed audio chunk to service
                        try await sttService.feedAudioChunk(chunk)

                    case .none:
                        break
                    }
                }
            } catch {
                // Always release the streaming session, even when the inbound stream or
                // `feedAudioChunk` fails; the group cancels the forwarding child on exit.
                await sttService.endStreamingSession()
                throw error
            }

            // End streaming session, then wait until every remaining update (including the
            // final one) has been written before completing the response.
            await sttService.endStreamingSession()
            try await group.waitForAll()
        }

        return [:]
    }
}
// MARK: - Private Helpers
/// Validate API key if configured