swift-apple-intelligence-grpc/Sources/AppleIntelligenceCore/Providers/AppleIntelligenceProvider.swift
Commit b754945923 by Mathias Beaulieu-Duncan: Add Text-to-Speech and Speech-to-Text features
- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints


import Foundation
import GRPCCore
import GRPCProtobuf
import GRPCNIOTransportHTTP2
/// gRPC service provider for Apple Intelligence
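///
/// Example (sketch) of serving this provider over HTTP/2. The `GRPCServer` and
/// `HTTP2ServerTransport.Posix` spellings assume the grpc-swift 2 API, and the
/// `AppleIntelligenceService()` initializer is assumed; adjust both to match
/// this project:
///
/// ```swift
/// let provider = AppleIntelligenceProvider(
///     service: AppleIntelligenceService(),
///     apiKey: "my-secret-key"
/// )
/// let server = GRPCServer(
///     transport: HTTP2ServerTransport.Posix(
///         address: .ipv4(host: "127.0.0.1", port: 50051),
///         transportSecurity: .plaintext
///     ),
///     services: [provider]
/// )
/// try await server.serve()
/// ```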
public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceService.ServiceProtocol {
/// The underlying AI service
private let service: AppleIntelligenceService
/// Text-to-Speech service
private let ttsService: TextToSpeechService?
/// Speech-to-Text service
private let sttService: SpeechToTextService?
/// Optional API key for authentication
private let apiKey: String?
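/// Creates a provider.
/// - Parameters:
///   - service: The Apple Intelligence completion service.
///   - ttsService: Optional Text-to-Speech service; speech RPCs fail with
///     `.unavailable` when `nil`.
///   - sttService: Optional Speech-to-Text service; transcription RPCs fail
///     with `.unavailable` when `nil`.
///   - apiKey: Optional API key; when set, requests must carry a matching
///     `authorization: Bearer <key>` metadata entry.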
public init(
service: AppleIntelligenceService,
ttsService: TextToSpeechService? = nil,
sttService: SpeechToTextService? = nil,
apiKey: String? = nil
) {
self.service = service
self.ttsService = ttsService
self.sttService = sttService
self.apiKey = apiKey
}
// MARK: - ServiceProtocol Implementation
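/// Unary completion: validates the API key, runs the prompt (and any attached
/// images) through the underlying service, and returns the full response in a
/// single message.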
public func complete(
request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_CompletionResponse> {
try validateApiKey(metadata: request.metadata)
let message = request.message
// Convert protobuf images to service format
let images = message.images.map { img in
(data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
}
let (text, analyses) = try await service.complete(
prompt: message.prompt,
temperature: message.hasTemperature ? message.temperature : nil,
maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
images: images
)
var response = Appleintelligence_CompletionResponse()
response.id = UUID().uuidString
response.text = text
response.finishReason = "stop"
// Include analysis results if requested
if message.includeAnalysis {
response.imageAnalyses = analyses.map { analysis in
var protoAnalysis = Appleintelligence_ImageAnalysis()
protoAnalysis.textContent = analysis.textContent
protoAnalysis.labels = analysis.labels
protoAnalysis.description_p = analysis.description
return protoAnalysis
}
}
return ServerResponse(message: response)
}
public func streamComplete(
request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_CompletionChunk> {
try validateApiKey(metadata: request.metadata)
let message = request.message
let completionId = UUID().uuidString
// Convert protobuf images to service format
let images = message.images.map { img in
(data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
}
return StreamingServerResponse { writer in
let stream = await self.service.streamComplete(
prompt: message.prompt,
temperature: message.hasTemperature ? message.temperature : nil,
maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
images: images
)
var lastContent = ""
var isFirstChunk = true
for try await (partialResponse, analyses) in stream {
// Calculate the delta (new text since last response)
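// e.g. partial responses "Hel", "Hello", "Hello world" produce the deltas
// "Hel", "lo", " world"; clients concatenate deltas to rebuild the full text.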
let delta: String
if partialResponse.hasPrefix(lastContent) {
delta = String(partialResponse.dropFirst(lastContent.count))
} else {
delta = partialResponse
}
lastContent = partialResponse
if !delta.isEmpty || isFirstChunk {
var chunk = Appleintelligence_CompletionChunk()
chunk.id = completionId
chunk.delta = delta
chunk.isFinal = false
// Include analyses in first chunk if requested
if isFirstChunk && message.includeAnalysis, let analyses = analyses {
chunk.imageAnalyses = analyses.map { analysis in
var protoAnalysis = Appleintelligence_ImageAnalysis()
protoAnalysis.textContent = analysis.textContent
protoAnalysis.labels = analysis.labels
protoAnalysis.description_p = analysis.description
return protoAnalysis
}
}
try await writer.write(chunk)
isFirstChunk = false
}
}
// Send final chunk
var finalChunk = Appleintelligence_CompletionChunk()
finalChunk.id = completionId
finalChunk.delta = ""
finalChunk.isFinal = true
finalChunk.finishReason = "stop"
try await writer.write(finalChunk)
return [:]
}
}
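/// Health check: reports whether the underlying model is available along with
/// its current status. This endpoint does not require an API key.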
public func health(
request: GRPCCore.ServerRequest<Appleintelligence_HealthRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_HealthResponse> {
let isHealthy = await service.isAvailable
let modelStatus = await service.getModelStatus()
var response = Appleintelligence_HealthResponse()
response.healthy = isHealthy
response.modelStatus = modelStatus
return ServerResponse(message: response)
}
// MARK: - Text-to-Speech
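/// Synthesizes speech for the supplied text and returns the encoded audio
/// (WAV or MP3) in a single response.
///
/// A sketch of the request shape a client might send; the voice identifier
/// below is illustrative only:
///
/// ```swift
/// var request = Appleintelligence_TextToSpeechRequest()
/// request.text = "Bonjour tout le monde"
/// request.voiceConfig.voiceIdentifier = "com.apple.voice.compact.fr-CA.Amelie"
/// request.voiceConfig.speakingRate = 0.5
/// request.outputFormat = .wav
/// ```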
public func textToSpeech(
request: GRPCCore.ServerRequest<Appleintelligence_TextToSpeechRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TextToSpeechResponse> {
try validateApiKey(metadata: request.metadata)
guard let ttsService = ttsService else {
throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
}
let message = request.message
// Convert proto config to service config
var config = SpeechConfig.default
if message.hasVoiceConfig {
let voiceConfig = message.voiceConfig
config = SpeechConfig(
voiceIdentifier: voiceConfig.voiceIdentifier.isEmpty ? nil : voiceConfig.voiceIdentifier,
speakingRate: voiceConfig.hasSpeakingRate ? voiceConfig.speakingRate : 0.5,
pitchMultiplier: voiceConfig.hasPitchMultiplier ? voiceConfig.pitchMultiplier : 1.0,
volume: voiceConfig.hasVolume ? voiceConfig.volume : 1.0
)
}
// Convert proto format to service format
let outputFormat: AudioOutputFormat
switch message.outputFormat {
case .wav, .unspecified:
outputFormat = .wav
case .mp3:
outputFormat = .mp3
case .UNRECOGNIZED:
outputFormat = .wav
}
do {
let result = try await ttsService.synthesize(
text: message.text,
config: config,
outputFormat: outputFormat
)
var response = Appleintelligence_TextToSpeechResponse()
response.audioData = result.audioData
response.format = outputFormat == .wav ? .wav : .mp3
response.sampleRate = Int32(result.sampleRate)
response.channels = Int32(result.channels)
response.durationSeconds = result.durationSeconds
return ServerResponse(message: response)
} catch let error as TextToSpeechError {
throw RPCError(code: .internalError, message: error.description)
}
}
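/// Lists the synthesis voices available on the system, optionally filtered by
/// language code.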
public func listVoices(
request: GRPCCore.ServerRequest<Appleintelligence_ListVoicesRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_ListVoicesResponse> {
try validateApiKey(metadata: request.metadata)
guard let ttsService = ttsService else {
throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
}
let message = request.message
let languageCode = message.hasLanguageCode ? message.languageCode : nil
let voices = await ttsService.listVoices(languageCode: languageCode)
var response = Appleintelligence_ListVoicesResponse()
response.voices = voices.map { voice in
var protoVoice = Appleintelligence_VoiceInfo()
protoVoice.identifier = voice.identifier
protoVoice.name = voice.name
protoVoice.language = voice.language
protoVoice.isPremium = voice.isPremium
protoVoice.gender = voice.gender
return protoVoice
}
return ServerResponse(message: response)
}
// MARK: - Speech-to-Text
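/// Transcribes a complete audio clip and returns the recognized text together
/// with optional per-segment timestamps and confidences.
///
/// A sketch of the request shape a client might send; `wavBytes` is a
/// placeholder and the "audio/wav" MIME type is an assumption about what the
/// STT service accepts:
///
/// ```swift
/// var request = Appleintelligence_TranscribeRequest()
/// request.audio.data = wavBytes
/// request.audio.mimeType = "audio/wav"
/// request.config.languageCode = "fr-CA"
/// request.config.enableTimestamps = true
/// ```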
public func transcribe(
request: GRPCCore.ServerRequest<Appleintelligence_TranscribeRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TranscribeResponse> {
try validateApiKey(metadata: request.metadata)
guard let sttService = sttService else {
throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
}
let message = request.message
guard message.hasAudio else {
throw RPCError(code: .invalidArgument, message: "Audio data is required")
}
// Convert proto config to service config
var config = TranscriptionConfig.default
if message.hasConfig {
let protoConfig = message.config
config = TranscriptionConfig(
languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
)
}
do {
let result = try await sttService.transcribe(
audioData: message.audio.data,
mimeType: message.audio.mimeType,
config: config
)
var response = Appleintelligence_TranscribeResponse()
response.text = result.text
response.detectedLanguage = result.detectedLanguage
response.confidence = result.confidence
response.segments = result.segments.map { segment in
var protoSegment = Appleintelligence_TranscriptionSegment()
protoSegment.text = segment.text
protoSegment.startTime = segment.startTime
protoSegment.endTime = segment.endTime
protoSegment.confidence = segment.confidence
return protoSegment
}
return ServerResponse(message: response)
} catch let error as SpeechToTextError {
throw RPCError(code: .internalError, message: error.description)
}
}
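/// Bidirectional streaming transcription. The client sends a `config` message
/// first, then a sequence of `audioChunk` messages; the server streams back
/// partial transcripts as recognition progresses and the final text and
/// segments once it settles.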
public func streamTranscribe(
request: GRPCCore.StreamingServerRequest<Appleintelligence_StreamingTranscribeRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_StreamingTranscribeResponse> {
try validateApiKey(metadata: request.metadata)
guard let sttService = sttService else {
throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
}
return StreamingServerResponse { writer in
var config = TranscriptionConfig.default
// Task that forwards transcription updates to the response writer, kept so
// the handler can await it before completing the RPC.
var transcriptionTask: Task<Void, Never>?
// Process incoming stream
for try await message in request.messages {
switch message.request {
case .config(let protoConfig):
// First message should be config
config = TranscriptionConfig(
languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
)
// Start streaming transcription and forward updates to the client from a
// child task while this loop keeps feeding audio chunks
let stream = await sttService.streamTranscribe(config: config)
transcriptionTask = Task {
do {
for try await update in stream {
var response = Appleintelligence_StreamingTranscribeResponse()
response.partialText = update.partialText
response.isFinal = update.isFinal
if let finalText = update.finalText {
response.finalText = finalText
}
response.segments = update.segments.map { segment in
var protoSegment = Appleintelligence_TranscriptionSegment()
protoSegment.text = segment.text
protoSegment.startTime = segment.startTime
protoSegment.endTime = segment.endTime
protoSegment.confidence = segment.confidence
return protoSegment
}
try await writer.write(response)
}
} catch {
// Transcription stream ended or failed; updates already written to the
// client remain valid
}
}
case .audioChunk(let chunk):
// Feed audio chunk to service
try await sttService.feedAudioChunk(chunk)
case .none:
break
}
}
// End the streaming session, then wait for the forwarding task to flush any
// remaining updates before the RPC completes
await sttService.endStreamingSession()
await transcriptionTask?.value
return [:]
}
}
// MARK: - Private Helpers
/// Validate API key if configured
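///
/// Clients authenticate by sending `authorization: Bearer <api-key>` in the
/// request metadata; requests with a missing or mismatched key are rejected
/// with `.unauthenticated`.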
private func validateApiKey(metadata: Metadata) throws {
guard let expectedKey = apiKey else {
return // No API key required
}
// Look for the Authorization header in metadata; the values view only offers
// Sequence operations, so take the first element via first(where:)
let authValues = metadata["authorization"]
guard let authHeader = authValues.first(where: { _ in true }),
case .string(let authString) = authHeader,
authString.hasPrefix("Bearer ") else {
throw RPCError(code: .unauthenticated, message: "Missing or invalid Authorization header")
}
let providedKey = String(authString.dropFirst("Bearer ".count))
guard providedKey == expectedKey else {
throw RPCError(code: .unauthenticated, message: "Invalid API key")
}
}
}