- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
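The Swift 6 concurrency fix in the audio callbacks is not part of this file; as a rough sketch of the usual pattern (an illustration, not the actual change), the non-isolated `AVAudioEngine` tap callback can forward samples through an `AsyncStream` instead of mutating shared state:

```swift
import AVFoundation

/// Hypothetical helper (not from this diff): emits a running RMS level from
/// the microphone, suitable for a recording-level indicator. Yielding into
/// an AsyncStream continuation is safe from the non-isolated tap callback,
/// unlike mutating actor or class state directly.
func microphoneLevels(engine: AVAudioEngine) -> AsyncStream<Float> {
    AsyncStream { continuation in
        let input = engine.inputNode
        let format = input.outputFormat(forBus: 0)
        input.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, _ in
            // Root-mean-square of the first channel as a crude level meter.
            guard let samples = buffer.floatChannelData?[0] else { return }
            let count = Int(buffer.frameLength)
            var sum: Float = 0
            for i in 0..<count { sum += samples[i] * samples[i] }
            continuation.yield(count > 0 ? (sum / Float(count)).squareRoot() : 0)
        }
        // Tap removal (input.removeTap(onBus: 0)) is left to the caller,
        // e.g. when the user stops recording.
    }
}
```

A consumer can then `for await level in microphoneLevels(engine: engine)` on the main actor to drive the level indicator.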
384 lines · 15 KiB · Swift
import Foundation
import GRPCCore
import GRPCProtobuf
import GRPCNIOTransportHTTP2

/// gRPC service provider for Apple Intelligence
public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceService.ServiceProtocol {
    /// The underlying AI service
    private let service: AppleIntelligenceService

    /// Text-to-Speech service
    private let ttsService: TextToSpeechService?

    /// Speech-to-Text service
    private let sttService: SpeechToTextService?

    /// Optional API key for authentication
    private let apiKey: String?

    public init(
        service: AppleIntelligenceService,
        ttsService: TextToSpeechService? = nil,
        sttService: SpeechToTextService? = nil,
        apiKey: String? = nil
    ) {
        self.service = service
        self.ttsService = ttsService
        self.sttService = sttService
        self.apiKey = apiKey
    }

    // MARK: - ServiceProtocol Implementation

    public func complete(
        request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_CompletionResponse> {
        try validateApiKey(metadata: request.metadata)

        let message = request.message

        // Convert protobuf images to service format
        let images = message.images.map { img in
            (data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
        }

        let (text, analyses) = try await service.complete(
            prompt: message.prompt,
            temperature: message.hasTemperature ? message.temperature : nil,
            maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
            images: images
        )

        var response = Appleintelligence_CompletionResponse()
        response.id = UUID().uuidString
        response.text = text
        response.finishReason = "stop"

        // Include analysis results if requested
        if message.includeAnalysis {
            response.imageAnalyses = analyses.map { analysis in
                var protoAnalysis = Appleintelligence_ImageAnalysis()
                protoAnalysis.textContent = analysis.textContent
                protoAnalysis.labels = analysis.labels
                protoAnalysis.description_p = analysis.description
                return protoAnalysis
            }
        }

        return ServerResponse(message: response)
    }

    public func streamComplete(
        request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_CompletionChunk> {
        try validateApiKey(metadata: request.metadata)

        let message = request.message
        let completionId = UUID().uuidString

        // Convert protobuf images to service format
        let images = message.images.map { img in
            (data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
        }

        return StreamingServerResponse { writer in
            let stream = await self.service.streamComplete(
                prompt: message.prompt,
                temperature: message.hasTemperature ? message.temperature : nil,
                maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
                images: images
            )

            var lastContent = ""
            var isFirstChunk = true
            for try await (partialResponse, analyses) in stream {
                // Calculate the delta (new text since last response)
                let delta: String
                if partialResponse.hasPrefix(lastContent) {
                    delta = String(partialResponse.dropFirst(lastContent.count))
                } else {
                    // The model rewrote earlier output; send the full snapshot
                    delta = partialResponse
                }
                lastContent = partialResponse

                if !delta.isEmpty || isFirstChunk {
                    var chunk = Appleintelligence_CompletionChunk()
                    chunk.id = completionId
                    chunk.delta = delta
                    chunk.isFinal = false

                    // Include analyses in first chunk if requested
                    if isFirstChunk && message.includeAnalysis, let analyses = analyses {
                        chunk.imageAnalyses = analyses.map { analysis in
                            var protoAnalysis = Appleintelligence_ImageAnalysis()
                            protoAnalysis.textContent = analysis.textContent
                            protoAnalysis.labels = analysis.labels
                            protoAnalysis.description_p = analysis.description
                            return protoAnalysis
                        }
                    }

                    try await writer.write(chunk)
                    isFirstChunk = false
                }
            }

            // Send final chunk
            var finalChunk = Appleintelligence_CompletionChunk()
            finalChunk.id = completionId
            finalChunk.delta = ""
            finalChunk.isFinal = true
            finalChunk.finishReason = "stop"
            try await writer.write(finalChunk)

            return [:]
        }
    }
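
    // Client-side note (a sketch, not part of the generated API): chunks carry
    // incremental deltas, so a consumer normally reassembles the text by
    // concatenation:
    //
    //     var text = ""
    //     for try await chunk in chunkStream {
    //         text += chunk.delta
    //         if chunk.isFinal { break }
    //     }
    //
    // Caveat: when the model rewrites earlier output, the server falls back to
    // sending the full snapshot as `delta` (see the hasPrefix check above), so
    // a robust client should treat a non-appending chunk as a reset rather
    // than appending it blindly.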

    public func health(
        request: GRPCCore.ServerRequest<Appleintelligence_HealthRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_HealthResponse> {
        let isHealthy = await service.isAvailable
        let modelStatus = await service.getModelStatus()

        var response = Appleintelligence_HealthResponse()
        response.healthy = isHealthy
        response.modelStatus = modelStatus

        return ServerResponse(message: response)
    }

    // MARK: - Text-to-Speech

    public func textToSpeech(
        request: GRPCCore.ServerRequest<Appleintelligence_TextToSpeechRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_TextToSpeechResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let ttsService = ttsService else {
            throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
        }

        let message = request.message

        // Convert proto config to service config
        var config = SpeechConfig.default
        if message.hasVoiceConfig {
            let voiceConfig = message.voiceConfig
            config = SpeechConfig(
                voiceIdentifier: voiceConfig.voiceIdentifier.isEmpty ? nil : voiceConfig.voiceIdentifier,
                speakingRate: voiceConfig.hasSpeakingRate ? voiceConfig.speakingRate : 0.5,
                pitchMultiplier: voiceConfig.hasPitchMultiplier ? voiceConfig.pitchMultiplier : 1.0,
                volume: voiceConfig.hasVolume ? voiceConfig.volume : 1.0
            )
        }
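
        // The fallback values above appear to mirror AVFoundation's defaults:
        // AVSpeechUtteranceDefaultSpeechRate is 0.5, and a fresh
        // AVSpeechUtterance uses pitchMultiplier 1.0 and volume 1.0.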

        // Convert proto format to service format
        let outputFormat: AudioOutputFormat
        switch message.outputFormat {
        case .wav, .unspecified:
            outputFormat = .wav
        case .mp3:
            outputFormat = .mp3
        case .UNRECOGNIZED:
            outputFormat = .wav
        }

        do {
            let result = try await ttsService.synthesize(
                text: message.text,
                config: config,
                outputFormat: outputFormat
            )

            var response = Appleintelligence_TextToSpeechResponse()
            response.audioData = result.audioData
            response.format = outputFormat == .wav ? .wav : .mp3
            response.sampleRate = Int32(result.sampleRate)
            response.channels = Int32(result.channels)
            response.durationSeconds = result.durationSeconds

            return ServerResponse(message: response)
        } catch let error as TextToSpeechError {
            throw RPCError(code: .internalError, message: error.description)
        }
    }

    public func listVoices(
        request: GRPCCore.ServerRequest<Appleintelligence_ListVoicesRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_ListVoicesResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let ttsService = ttsService else {
            throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
        }

        let message = request.message
        let languageCode = message.hasLanguageCode ? message.languageCode : nil

        let voices = await ttsService.listVoices(languageCode: languageCode)

        var response = Appleintelligence_ListVoicesResponse()
        response.voices = voices.map { voice in
            var protoVoice = Appleintelligence_VoiceInfo()
            protoVoice.identifier = voice.identifier
            protoVoice.name = voice.name
            protoVoice.language = voice.language
            protoVoice.isPremium = voice.isPremium
            protoVoice.gender = voice.gender
            return protoVoice
        }

        return ServerResponse(message: response)
    }

    // MARK: - Speech-to-Text

    public func transcribe(
        request: GRPCCore.ServerRequest<Appleintelligence_TranscribeRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_TranscribeResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let sttService = sttService else {
            throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
        }

        let message = request.message

        guard message.hasAudio else {
            throw RPCError(code: .invalidArgument, message: "Audio data is required")
        }

        // Convert proto config to service config
        var config = TranscriptionConfig.default
        if message.hasConfig {
            let protoConfig = message.config
            config = TranscriptionConfig(
                languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
                enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
                enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
            )
        }

        do {
            let result = try await sttService.transcribe(
                audioData: message.audio.data,
                mimeType: message.audio.mimeType,
                config: config
            )

            var response = Appleintelligence_TranscribeResponse()
            response.text = result.text
            response.detectedLanguage = result.detectedLanguage
            response.confidence = result.confidence
            response.segments = result.segments.map { segment in
                var protoSegment = Appleintelligence_TranscriptionSegment()
                protoSegment.text = segment.text
                protoSegment.startTime = segment.startTime
                protoSegment.endTime = segment.endTime
                protoSegment.confidence = segment.confidence
                return protoSegment
            }

            return ServerResponse(message: response)
        } catch let error as SpeechToTextError {
            throw RPCError(code: .internalError, message: error.description)
        }
    }

    public func streamTranscribe(
        request: GRPCCore.StreamingServerRequest<Appleintelligence_StreamingTranscribeRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_StreamingTranscribeResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let sttService = sttService else {
            throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
        }

        return StreamingServerResponse { writer in
            var config = TranscriptionConfig.default
            var forwardTask: Task<Void, Never>?

            // Process incoming stream
            for try await message in request.messages {
                switch message.request {
                case .config(let protoConfig):
                    // First message should be config
                    config = TranscriptionConfig(
                        languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
                        enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
                        enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
                    )

                    // Start streaming transcription; forward updates to the
                    // client on a child task while audio chunks keep arriving.
                    let stream = await sttService.streamTranscribe(config: config)
                    forwardTask = Task {
                        do {
                            for try await update in stream {
                                var response = Appleintelligence_StreamingTranscribeResponse()
                                response.partialText = update.partialText
                                response.isFinal = update.isFinal
                                if let finalText = update.finalText {
                                    response.finalText = finalText
                                }
                                response.segments = update.segments.map { segment in
                                    var protoSegment = Appleintelligence_TranscriptionSegment()
                                    protoSegment.text = segment.text
                                    protoSegment.startTime = segment.startTime
                                    protoSegment.endTime = segment.endTime
                                    protoSegment.confidence = segment.confidence
                                    return protoSegment
                                }
                                try await writer.write(response)
                            }
                        } catch {
                            // Stream ended or error occurred
                        }
                    }

                case .audioChunk(let chunk):
                    // Feed audio chunk to service
                    try await sttService.feedAudioChunk(chunk)

                case .none:
                    break
                }
            }

            // End the streaming session, then wait for the forwarding task to
            // drain any remaining updates before completing the RPC.
            await sttService.endStreamingSession()
            await forwardTask?.value

            return [:]
        }
    }
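
    // Protocol note for streamTranscribe: the client is expected to send one
    // `config` message first, then any number of `audioChunk` payloads, and
    // finally half-close its stream; the server then ends the session and
    // drains the remaining transcription updates. Sketch of the send order
    // (field names follow the oneof handled above):
    //
    //     1. StreamingTranscribeRequest { config { ... } }
    //     2. StreamingTranscribeRequest { audio_chunk: <audio bytes> }  // repeated
    //     3. (half-close)
    //
    // Audio chunks that arrive before the config message are fed to the
    // service, but no forwarding task exists yet, so clients should not
    // reorder these messages.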

    // MARK: - Private Helpers

    /// Validate API key if configured
    private func validateApiKey(metadata: Metadata) throws {
        guard let expectedKey = apiKey else {
            return // No API key required
        }

        // Look for the Authorization header in metadata. The value view is a
        // Sequence rather than a Collection, so `first(where:)` with an
        // always-true predicate is used to grab its first entry.
        let authValues = metadata["authorization"]
        guard let authHeader = authValues.first(where: { _ in true }),
              case .string(let authString) = authHeader,
              authString.hasPrefix("Bearer ") else {
            throw RPCError(code: .unauthenticated, message: "Missing or invalid Authorization header")
        }

        let providedKey = String(authString.dropFirst("Bearer ".count))
        guard providedKey == expectedKey else {
            throw RPCError(code: .unauthenticated, message: "Invalid API key")
        }
    }
}
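
// Usage sketch. The transport factory and service initializers below are
// assumptions based on grpc-swift 2 / grpc-swift-nio-transport conventions,
// not something this file defines; adjust to the project's actual setup.
//
//     let provider = AppleIntelligenceProvider(
//         service: aiService,
//         ttsService: TextToSpeechService(),
//         sttService: SpeechToTextService(),
//         apiKey: "my-secret-key"
//     )
//     let server = GRPCServer(
//         transport: .http2NIOPosix(
//             address: .ipv4(host: "127.0.0.1", port: 50051),
//             transportSecurity: .plaintext
//         ),
//         services: [provider]
//     )
//     try await server.serve()
//
// Authenticated callers must send the metadata header that validateApiKey
// checks: `authorization: Bearer <api-key>`.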