Add Text-to-Speech and Speech-to-Text features
- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -8,11 +8,24 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ
|
||||
/// The underlying AI service
|
||||
private let service: AppleIntelligenceService
|
||||
|
||||
/// Text-to-Speech service
|
||||
private let ttsService: TextToSpeechService?
|
||||
|
||||
/// Speech-to-Text service
|
||||
private let sttService: SpeechToTextService?
|
||||
|
||||
/// Optional API key for authentication
|
||||
private let apiKey: String?
|
||||
|
||||
public init(service: AppleIntelligenceService, apiKey: String? = nil) {
|
||||
/// Creates a provider backed by the given services.
///
/// - Parameters:
///   - service: The underlying AI service.
///   - ttsService: Text-to-Speech service; defaults to `nil`.
///   - sttService: Speech-to-Text service; defaults to `nil`.
///   - apiKey: Optional API key for authentication; defaults to `nil`.
public init(
    service: AppleIntelligenceService,
    ttsService: TextToSpeechService? = nil,
    sttService: SpeechToTextService? = nil,
    apiKey: String? = nil
) {
    // Authentication first, then the core service, then the optional speech services.
    self.apiKey = apiKey
    self.service = service

    self.sttService = sttService
    self.ttsService = ttsService
}
|
||||
|
||||
@@ -139,6 +152,213 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ
|
||||
return ServerResponse(message: response)
|
||||
}
|
||||
|
||||
// MARK: - Text-to-Speech
|
||||
|
||||
/// Synthesizes speech for the text carried by the request.
///
/// The API key is validated first. When the request carries no voice
/// configuration, `SpeechConfig.default` is used; otherwise each unset field
/// falls back to its literal default (rate 0.5, pitch 1.0, volume 1.0).
/// Unspecified or unrecognized output formats are treated as WAV.
///
/// - Throws: `RPCError` with `.unavailable` when no Text-to-Speech service is
///   configured, or `.internalError` when synthesis fails with a
///   `TextToSpeechError`. Any other error is propagated unchanged.
public func textToSpeech(
    request: GRPCCore.ServerRequest<Appleintelligence_TextToSpeechRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TextToSpeechResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let ttsService else {
        throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
    }

    let payload = request.message

    // Translate the proto voice settings into the service-level configuration.
    let speechConfig: SpeechConfig
    if payload.hasVoiceConfig {
        let proto = payload.voiceConfig
        speechConfig = SpeechConfig(
            voiceIdentifier: proto.voiceIdentifier.isEmpty ? nil : proto.voiceIdentifier,
            speakingRate: proto.hasSpeakingRate ? proto.speakingRate : 0.5,
            pitchMultiplier: proto.hasPitchMultiplier ? proto.pitchMultiplier : 1.0,
            volume: proto.hasVolume ? proto.volume : 1.0
        )
    } else {
        speechConfig = SpeechConfig.default
    }

    // Translate the proto output format; anything that is not MP3 becomes WAV.
    let requestedFormat: AudioOutputFormat = switch payload.outputFormat {
    case .mp3:
        .mp3
    case .wav, .unspecified, .UNRECOGNIZED:
        .wav
    }

    do {
        let synthesized = try await ttsService.synthesize(
            text: payload.text,
            config: speechConfig,
            outputFormat: requestedFormat
        )

        var reply = Appleintelligence_TextToSpeechResponse()
        reply.audioData = synthesized.audioData
        if requestedFormat == .wav {
            reply.format = .wav
        } else {
            reply.format = .mp3
        }
        reply.sampleRate = Int32(synthesized.sampleRate)
        reply.channels = Int32(synthesized.channels)
        reply.durationSeconds = synthesized.durationSeconds

        return ServerResponse(message: reply)
    } catch let ttsError as TextToSpeechError {
        throw RPCError(code: .internalError, message: ttsError.description)
    }
}
|
||||
|
||||
/// Lists the voices offered by the Text-to-Speech service.
///
/// The API key is validated first. The request's language code is passed to
/// the service when set; otherwise `nil` is passed.
///
/// - Throws: `RPCError` with `.unavailable` when no Text-to-Speech service is
///   configured.
public func listVoices(
    request: GRPCCore.ServerRequest<Appleintelligence_ListVoicesRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_ListVoicesResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let ttsService else {
        throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
    }

    let payload = request.message
    let languageFilter = payload.hasLanguageCode ? payload.languageCode : nil

    let availableVoices = await ttsService.listVoices(languageCode: languageFilter)

    // Copy each service-level voice into its proto representation.
    var protoVoices: [Appleintelligence_VoiceInfo] = []
    for voice in availableVoices {
        var info = Appleintelligence_VoiceInfo()
        info.identifier = voice.identifier
        info.name = voice.name
        info.language = voice.language
        info.isPremium = voice.isPremium
        info.gender = voice.gender
        protoVoices.append(info)
    }

    var reply = Appleintelligence_ListVoicesResponse()
    reply.voices = protoVoices

    return ServerResponse(message: reply)
}
|
||||
|
||||
// MARK: - Speech-to-Text
|
||||
|
||||
/// Transcribes a single audio payload.
///
/// The API key is validated first. When the request carries no configuration,
/// `TranscriptionConfig.default` is used; otherwise unset fields fall back to
/// no language code, punctuation enabled, and timestamps disabled.
///
/// - Throws: `RPCError` with `.unavailable` when no Speech-to-Text service is
///   configured, `.invalidArgument` when the request has no audio, or
///   `.internalError` when transcription fails with a `SpeechToTextError`.
///   Any other error is propagated unchanged.
public func transcribe(
    request: GRPCCore.ServerRequest<Appleintelligence_TranscribeRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TranscribeResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let sttService else {
        throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
    }

    let payload = request.message

    guard payload.hasAudio else {
        throw RPCError(code: .invalidArgument, message: "Audio data is required")
    }

    // Translate the proto settings into the service-level configuration.
    let transcriptionConfig: TranscriptionConfig
    if payload.hasConfig {
        let proto = payload.config
        transcriptionConfig = TranscriptionConfig(
            languageCode: proto.hasLanguageCode ? proto.languageCode : nil,
            enablePunctuation: proto.hasEnablePunctuation ? proto.enablePunctuation : true,
            enableTimestamps: proto.hasEnableTimestamps ? proto.enableTimestamps : false
        )
    } else {
        transcriptionConfig = TranscriptionConfig.default
    }

    do {
        let transcript = try await sttService.transcribe(
            audioData: payload.audio.data,
            mimeType: payload.audio.mimeType,
            config: transcriptionConfig
        )

        // Copy each service-level segment into its proto representation.
        var protoSegments: [Appleintelligence_TranscriptionSegment] = []
        for segment in transcript.segments {
            var item = Appleintelligence_TranscriptionSegment()
            item.text = segment.text
            item.startTime = segment.startTime
            item.endTime = segment.endTime
            item.confidence = segment.confidence
            protoSegments.append(item)
        }

        var reply = Appleintelligence_TranscribeResponse()
        reply.text = transcript.text
        reply.detectedLanguage = transcript.detectedLanguage
        reply.confidence = transcript.confidence
        reply.segments = protoSegments

        return ServerResponse(message: reply)
    } catch let sttError as SpeechToTextError {
        throw RPCError(code: .internalError, message: sttError.description)
    }
}
|
||||
|
||||
/// Handles a bidirectional streaming transcription RPC.
///
/// The API key is validated first. Incoming messages are then processed in
/// order: a `config` message starts a streaming transcription on the
/// Speech-to-Text service and begins forwarding its updates to the client,
/// and an `audioChunk` message is fed to the service. When the client
/// finishes sending, the streaming session is ended and the handler waits for
/// all pending updates to be written before it completes the RPC.
///
/// Update forwarding runs as child tasks of a task group rather than an
/// unstructured `Task`, so that:
/// - the RPC is not completed while updates are still being written,
/// - transcription and write failures reach the caller instead of being
///   silently dropped, and
/// - forwarding is cancelled automatically if the request stream fails or
///   the RPC itself is cancelled.
///
/// - Throws: `RPCError` with `.unavailable` when no Speech-to-Text service is
///   configured, or `.internalError` when the service fails with a
///   `SpeechToTextError`. Any other error is propagated unchanged.
public func streamTranscribe(
    request: GRPCCore.StreamingServerRequest<Appleintelligence_StreamingTranscribeRequest>,
    context: GRPCCore.ServerContext
) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_StreamingTranscribeResponse> {
    try validateApiKey(metadata: request.metadata)

    guard let sttService = sttService else {
        throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
    }

    return StreamingServerResponse { writer in
        do {
            try await withThrowingTaskGroup(of: Void.self) { group in
                do {
                    // Process the incoming stream.
                    for try await message in request.messages {
                        switch message.request {
                        case .config(let protoConfig):
                            // The first message should be the config.
                            let config = TranscriptionConfig(
                                languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
                                enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
                                enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
                            )

                            // Start streaming transcription and forward every update to
                            // the client from a child task owned by this handler.
                            let stream = await sttService.streamTranscribe(config: config)
                            group.addTask {
                                for try await update in stream {
                                    var response = Appleintelligence_StreamingTranscribeResponse()
                                    response.partialText = update.partialText
                                    response.isFinal = update.isFinal
                                    if let finalText = update.finalText {
                                        response.finalText = finalText
                                    }
                                    response.segments = update.segments.map { segment in
                                        var protoSegment = Appleintelligence_TranscriptionSegment()
                                        protoSegment.text = segment.text
                                        protoSegment.startTime = segment.startTime
                                        protoSegment.endTime = segment.endTime
                                        protoSegment.confidence = segment.confidence
                                        return protoSegment
                                    }
                                    try await writer.write(response)
                                }
                            }

                        case .audioChunk(let chunk):
                            // Feed the audio chunk to the service.
                            try await sttService.feedAudioChunk(chunk)

                        case .none:
                            break
                        }
                    }
                } catch {
                    // Do not leave a session open when the request stream or an
                    // audio chunk fails. Rethrowing out of the group body cancels
                    // the forwarding child tasks.
                    await sttService.endStreamingSession()
                    throw error
                }

                // End the streaming session, then drain the remaining updates so
                // the RPC is not completed while results are still being written.
                // NOTE(review): assumes endStreamingSession() causes the stream
                // returned by streamTranscribe(config:) to finish; confirm
                // against the service implementation.
                await sttService.endStreamingSession()
                try await group.waitForAll()
            }
        } catch let error as SpeechToTextError {
            // Report service failures the same way the unary transcribe RPC does.
            throw RPCError(code: .internalError, message: error.description)
        }

        return [:]
    }
}
|
||||
|
||||
// MARK: - Private Helpers
|
||||
|
||||
/// Validate API key if configured
|
||||
|
||||
Reference in New Issue
Block a user