Add Text-to-Speech and Speech-to-Text features

- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Mathias Beaulieu-Duncan
2025-12-31 02:57:30 -05:00
parent 638656e7ca
commit b754945923
10 changed files with 3151 additions and 8 deletions
@@ -0,0 +1,337 @@
import Foundation
import Speech
import AVFoundation
// MARK: - Result Types
/// Result of a completed (non-streaming) transcription.
public struct TranscriptionResult: Sendable {
/// Full transcript text.
public let text: String
/// Per-segment details. May be empty: the service in this file only fills it when
/// `TranscriptionConfig.enableTimestamps` is set.
public let segments: [TranscriptionSegmentResult]
/// Language code reported for the transcript.
/// NOTE(review): the service in this file fills this with the *requested* language code
/// (falling back to "en-US"), not an automatically detected language — confirm intent.
public let detectedLanguage: String
/// Overall confidence score for the transcript.
public let confidence: Float
}
/// One recognized segment of a transcript, with timing and confidence.
public struct TranscriptionSegmentResult: Sendable {
/// Text of this segment.
public let text: String
/// Segment start offset (the service in this file derives it from the recognizer's segment timestamp).
public let startTime: Float
/// Segment end offset (start offset plus the recognizer's segment duration).
public let endTime: Float
/// Confidence reported by the recognizer for this segment.
public let confidence: Float
}
/// One incremental update emitted while a streaming transcription is running.
public struct StreamingTranscriptionUpdate: Sendable {
/// Current best transcript text at the time of this update.
public let partialText: String
/// `true` when this is the last update of the session.
public let isFinal: Bool
/// Final transcript text; the service in this file sets it only when `isFinal` is `true`.
public let finalText: String?
/// Per-segment details; empty unless timestamps were requested in the configuration.
public let segments: [TranscriptionSegmentResult]
}
/// Options that control a transcription request.
public struct TranscriptionConfig: Sendable {
    /// Locale identifier of the spoken language (for example "fr-CA").
    /// `nil` leaves the choice of language to the service.
    public var languageCode: String?

    /// Intended to toggle automatic punctuation in the transcript; how it is honored
    /// depends on the service consuming this configuration.
    public var enablePunctuation: Bool

    /// When `true`, per-segment timing information is requested in results.
    public var enableTimestamps: Bool

    /// Creates a configuration.
    /// - Parameters:
    ///   - languageCode: Locale identifier of the spoken language, or `nil`.
    ///   - enablePunctuation: Punctuation preference (defaults to `true`).
    ///   - enableTimestamps: Whether segment timing is requested (defaults to `false`).
    public init(
        languageCode: String? = nil,
        enablePunctuation: Bool = true,
        enableTimestamps: Bool = false
    ) {
        self.languageCode = languageCode
        self.enablePunctuation = enablePunctuation
        self.enableTimestamps = enableTimestamps
    }

    /// Shared default configuration: no explicit language, punctuation on, timestamps off.
    /// Built from the initializer's own defaults so the two can never drift apart.
    public static let `default` = TranscriptionConfig()
}
// MARK: - Errors
/// Failures reported by the speech-to-text service.
public enum SpeechToTextError: Error, CustomStringConvertible, Sendable {
    /// Speech recognition cannot be used on this system.
    case notAvailable
    /// The user or system denied speech-recognition authorization.
    case authorizationDenied
    /// The recognition model is not ready; the payload carries the reason.
    case modelNotReady(String)
    /// Recognition ran but failed; the payload carries the reason.
    case transcriptionFailed(String)
    /// The supplied audio is not in a usable format.
    case invalidAudioFormat
    /// Audio capture or processing failed; the payload carries the reason.
    case audioProcessingFailed(String)
    /// The supplied MIME type is not supported; the payload is the offending type.
    case unsupportedMimeType(String)

    /// Human-readable message for this error.
    public var description: String {
        switch self {
        case .notAvailable:
            "Speech recognition not available on this system"
        case .authorizationDenied:
            "Speech recognition authorization denied"
        case .modelNotReady(let detail):
            "Speech model not ready: \(detail)"
        case .transcriptionFailed(let detail):
            "Transcription failed: \(detail)"
        case .invalidAudioFormat:
            "Invalid audio format"
        case .audioProcessingFailed(let detail):
            "Audio processing failed: \(detail)"
        case .unsupportedMimeType(let mime):
            "Unsupported audio MIME type: \(mime)"
        }
    }
}
// MARK: - Service Actor
/// Actor wrapping `SFSpeechRecognizer` for file-based and live (microphone) transcription.
///
/// Availability is resolved once, during `init`, from the speech-recognition
/// authorization status and the list of supported locales.
public actor SpeechToTextService {
    /// Language used whenever a `TranscriptionConfig` carries no `languageCode`.
    private static let fallbackLanguageCode = "en-US"

    /// Service availability status
    public private(set) var isAvailable: Bool = false

    /// `true` while a streaming session started by `streamTranscribe` is running.
    private var isStreamingActive: Bool = false

    /// Creates the service and resolves `isAvailable`, requesting authorization if it
    /// has not been determined yet.
    public init() async {
        await checkAvailability()
    }

    // MARK: - Public API

    /// Transcribe audio data (file-based).
    ///
    /// The data is written to a temporary file, handed to the recognizer, and the file
    /// is removed again on every exit path.
    /// - Parameters:
    ///   - audioData: Encoded audio bytes.
    ///   - mimeType: MIME type of `audioData`; used only to pick the temp file extension.
    ///   - config: Language, punctuation and timestamp options.
    /// - Returns: The final transcript.
    /// - Throws: `SpeechToTextError.notAvailable` when recognition is unavailable,
    ///   `SpeechToTextError.transcriptionFailed` when the recognizer reports an error,
    ///   or a file-system error if the temp file cannot be written.
    public func transcribe(
        audioData: Data,
        mimeType: String,
        config: TranscriptionConfig = .default
    ) async throws -> TranscriptionResult {
        guard isAvailable else {
            throw SpeechToTextError.notAvailable
        }
        let tempURL = try createTempAudioFile(data: audioData, mimeType: mimeType)
        defer { try? FileManager.default.removeItem(at: tempURL) }
        return try await transcribeWithSFSpeechRecognizer(url: tempURL, config: config)
    }

    /// Start a live transcription session fed by the default audio input.
    ///
    /// The session ends when the recognizer delivers a final result or an error, when
    /// `endStreamingSession()` is called, or when the consumer stops iterating the
    /// stream. In every case the audio engine is stopped and the input tap removed.
    /// - Parameter config: Language, punctuation and timestamp options.
    /// - Returns: A stream of partial updates followed by one final update.
    public func streamTranscribe(
        config: TranscriptionConfig = .default
    ) -> AsyncThrowingStream<StreamingTranscriptionUpdate, Error> {
        AsyncThrowingStream { continuation in
            let session = Task {
                guard self.isAvailable else {
                    continuation.finish(throwing: SpeechToTextError.notAvailable)
                    return
                }
                do {
                    try await self.startStreamingWithSFSpeechRecognizer(config: config, continuation: continuation)
                } catch {
                    // No-op if the stream has already been finished or cancelled.
                    continuation.finish(throwing: error)
                }
            }
            // Whenever the stream terminates (finished, failed, or abandoned by the
            // consumer) cancel the session task so its wait loop exits and the
            // microphone is released instead of running forever.
            continuation.onTermination = { _ in
                session.cancel()
            }
        }
    }

    /// Validate that a streaming session is active.
    ///
    /// NOTE: the chunk itself is currently not forwarded anywhere — streaming sessions
    /// capture audio directly from the input device. This method only checks session state.
    /// - Throws: `SpeechToTextError.transcriptionFailed` when no session is active.
    public func feedAudioChunk(_ chunk: Data) async throws {
        guard isStreamingActive else {
            throw SpeechToTextError.transcriptionFailed("No active streaming session")
        }
    }

    /// End streaming session. The running session notices the flag on its next poll,
    /// stops capturing, and signals end-of-audio to the recognizer.
    public func endStreamingSession() async {
        isStreamingActive = false
    }

    /// Get status information
    public func getStatus() -> String {
        isAvailable ? "SFSpeechRecognizer available" : "Speech recognition not available"
    }

    // MARK: - Private Implementation

    /// Resolve `isAvailable` from the authorization status, prompting when undetermined.
    /// Both authorized paths additionally require at least one supported locale.
    private func checkAvailability() async {
        let hasLocales = !SFSpeechRecognizer.supportedLocales().isEmpty
        switch SFSpeechRecognizer.authorizationStatus() {
        case .authorized:
            isAvailable = hasLocales
        case .notDetermined:
            let granted = await withCheckedContinuation { continuation in
                SFSpeechRecognizer.requestAuthorization { newStatus in
                    continuation.resume(returning: newStatus == .authorized)
                }
            }
            isAvailable = granted && hasLocales
        default:
            isAvailable = false
        }
    }

    /// Build an available recognizer for the configured (or fallback) language.
    /// - Throws: `SpeechToTextError.notAvailable` when none can be created or it is offline.
    private func makeRecognizer(for config: TranscriptionConfig) throws -> SFSpeechRecognizer {
        let locale = Locale(identifier: config.languageCode ?? Self.fallbackLanguageCode)
        guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else {
            throw SpeechToTextError.notAvailable
        }
        return recognizer
    }

    /// Apply `config.enablePunctuation` to a request on OS versions that support it.
    private static func applyPunctuationPreference(
        _ config: TranscriptionConfig,
        to request: SFSpeechRecognitionRequest
    ) {
        if #available(macOS 13.0, iOS 16.0, *) {
            request.addsPunctuation = config.enablePunctuation
        }
    }

    /// Convert the recognizer's segments into the service's segment type.
    private static func makeSegments(from transcription: SFTranscription) -> [TranscriptionSegmentResult] {
        transcription.segments.map { segment in
            TranscriptionSegmentResult(
                text: segment.substring,
                startTime: Float(segment.timestamp),
                endTime: Float(segment.timestamp + segment.duration),
                confidence: segment.confidence
            )
        }
    }

    /// Create temporary audio file from data
    private func createTempAudioFile(data: Data, mimeType: String) throws -> URL {
        let fileName = UUID().uuidString + "." + extensionForMimeType(mimeType)
        let fileURL = FileManager.default.temporaryDirectory.appendingPathComponent(fileName)
        try data.write(to: fileURL)
        return fileURL
    }

    /// Get file extension for MIME type.
    ///
    /// Parameters after `;` (e.g. `audio/mpeg; codecs=mp3`) and surrounding whitespace
    /// are ignored. Unrecognized types fall back to "wav".
    private func extensionForMimeType(_ mimeType: String) -> String {
        let essence = mimeType
            .split(separator: ";", maxSplits: 1, omittingEmptySubsequences: false)
            .first
            .map { $0.trimmingCharacters(in: .whitespaces).lowercased() } ?? ""
        switch essence {
        case "audio/wav", "audio/wave", "audio/x-wav":
            return "wav"
        case "audio/mp3", "audio/mpeg":
            return "mp3"
        case "audio/m4a", "audio/mp4", "audio/x-m4a":
            return "m4a"
        case "audio/aac":
            return "aac"
        case "audio/flac":
            return "flac"
        default:
            return "wav"
        }
    }

    /// Transcribe an audio file with `SFSpeechRecognizer` and return the final result.
    private func transcribeWithSFSpeechRecognizer(
        url: URL,
        config: TranscriptionConfig
    ) async throws -> TranscriptionResult {
        let recognizer = try makeRecognizer(for: config)
        let request = SFSpeechURLRecognitionRequest(url: url)
        request.shouldReportPartialResults = false
        Self.applyPunctuationPreference(config, to: request)
        let languageCode = config.languageCode ?? Self.fallbackLanguageCode

        return try await withCheckedThrowingContinuation { continuation in
            // The handler may fire more than once; the continuation must resume exactly once.
            var hasResumed = false
            recognizer.recognitionTask(with: request) { result, error in
                guard !hasResumed else { return }
                if let error {
                    hasResumed = true
                    continuation.resume(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
                    return
                }
                guard let result, result.isFinal else { return }
                hasResumed = true

                let transcription = result.bestTranscription
                // Confidence is averaged over every recognized segment regardless of the
                // timestamp option; previously it was a hard-coded 1.0 whenever
                // timestamps were disabled (the default).
                let allSegments = SpeechToTextService.makeSegments(from: transcription)
                let confidence: Float = allSegments.isEmpty
                    ? 0
                    : allSegments.reduce(0) { $0 + $1.confidence } / Float(allSegments.count)

                continuation.resume(returning: TranscriptionResult(
                    text: transcription.formattedString,
                    segments: config.enableTimestamps ? allSegments : [],
                    detectedLanguage: languageCode,
                    confidence: confidence
                ))
            }
        }
    }

    /// Run one live transcription session until it is ended or cancelled.
    ///
    /// Returns (or throws `CancellationError`) only after the audio engine has been
    /// stopped, the input tap removed and end-of-audio signalled to the recognizer.
    private func startStreamingWithSFSpeechRecognizer(
        config: TranscriptionConfig,
        continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
    ) async throws {
        let recognizer = try makeRecognizer(for: config)

        let audioEngine = AVAudioEngine()
        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        Self.applyPunctuationPreference(config, to: request)

        let inputNode = audioEngine.inputNode
        let recordingFormat = inputNode.outputFormat(forBus: 0)
        // Installing a tap with an invalid format raises an Objective-C exception that
        // Swift cannot catch, so reject it up front (e.g. no input device present).
        guard recordingFormat.sampleRate > 0, recordingFormat.channelCount > 0 else {
            throw SpeechToTextError.audioProcessingFailed("No usable audio input device")
        }

        isStreamingActive = true
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
            request.append(buffer)
        }

        var recognitionTask: SFSpeechRecognitionTask?
        // Single cleanup point for every exit: engine start failure, normal end,
        // recognizer error, or task cancellation.
        defer {
            audioEngine.stop()
            inputNode.removeTap(onBus: 0)
            request.endAudio()
            if Task.isCancelled {
                recognitionTask?.cancel()
            }
            isStreamingActive = false
        }

        audioEngine.prepare()
        try audioEngine.start()

        recognitionTask = recognizer.recognitionTask(with: request) { result, error in
            if let error {
                // Finishing the stream triggers its termination handler, which cancels
                // this session and runs the cleanup above.
                continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
                return
            }
            guard let result else { return }

            let transcription = result.bestTranscription
            let segments = config.enableTimestamps
                ? SpeechToTextService.makeSegments(from: transcription)
                : []
            continuation.yield(StreamingTranscriptionUpdate(
                partialText: transcription.formattedString,
                isFinal: result.isFinal,
                finalText: result.isFinal ? transcription.formattedString : nil,
                segments: segments
            ))
            if result.isFinal {
                continuation.finish()
            }
        }

        // Wait until `endStreamingSession()` clears the flag; `Task.sleep` throws as
        // soon as the session task is cancelled, which ends the wait immediately.
        while isStreamingActive {
            try await Task.sleep(for: .milliseconds(100))
        }
    }
}
@@ -0,0 +1,280 @@
import Foundation
import AVFoundation
// MARK: - Result Types
/// Result of text-to-speech synthesis
public struct TextToSpeechResult: Sendable {
/// Encoded audio bytes (container header plus samples).
public let audioData: Data
/// Format tag associated with `audioData`.
public let format: AudioOutputFormat
/// Sample rate of the audio, in Hz.
public let sampleRate: Int
/// Number of audio channels.
public let channels: Int
/// Playback duration in seconds (the service in this file derives it from sample count and sample rate).
public let durationSeconds: Float
}
/// Supported output formats
public enum AudioOutputFormat: Sendable {
/// RIFF/WAVE container.
case wav
/// MP3. NOTE: the synthesis service in this file has no MP3 encoder and produces WAV bytes
/// when this case is requested.
case mp3
}
/// Description of one installed synthesis voice.
public struct VoiceDescription: Sendable {
/// Voice identifier, usable as `SpeechConfig.voiceIdentifier`.
public let identifier: String
/// Display name of the voice.
public let name: String
/// Language code of the voice.
public let language: String
/// `true` when the voice quality is enhanced or premium (as mapped by the service in this file).
public let isPremium: Bool
/// Gender label; the service in this file emits "male", "female", "unspecified" or "unknown".
public let gender: String
}
/// Options that control how an utterance is synthesized.
public struct SpeechConfig: Sendable {
    /// Identifier of the voice to use, or `nil` to let the service choose.
    public var voiceIdentifier: String?

    /// Speaking rate, 0.0 - 1.0.
    public var speakingRate: Float

    /// Pitch multiplier, 0.5 - 2.0.
    public var pitchMultiplier: Float

    /// Output volume, 0.0 - 1.0.
    public var volume: Float

    /// Creates a configuration.
    /// - Parameters:
    ///   - voiceIdentifier: Voice identifier, or `nil` (default).
    ///   - speakingRate: Speaking rate (default 0.5).
    ///   - pitchMultiplier: Pitch multiplier (default 1.0).
    ///   - volume: Output volume (default 1.0).
    public init(
        voiceIdentifier: String? = nil,
        speakingRate: Float = 0.5,
        pitchMultiplier: Float = 1.0,
        volume: Float = 1.0
    ) {
        self.voiceIdentifier = voiceIdentifier
        self.speakingRate = speakingRate
        self.pitchMultiplier = pitchMultiplier
        self.volume = volume
    }

    /// Shared default configuration: no explicit voice, rate 0.5, pitch 1.0, volume 1.0.
    /// Built from the initializer's own defaults so the two can never drift apart.
    public static let `default` = SpeechConfig()
}
// MARK: - Errors
/// Failures reported by the text-to-speech service.
public enum TextToSpeechError: Error, CustomStringConvertible, Sendable {
    /// No voice exists for the given identifier (payload: the identifier).
    case invalidVoice(String)
    /// Synthesis failed (payload: the reason).
    case synthesisFailure(String)
    /// Encoding the audio failed (payload: the reason).
    case encodingFailure(String)
    /// Synthesis completed without producing any audio.
    case noAudioGenerated
    /// The requested output format is not supported.
    case unsupportedFormat

    /// Human-readable message for this error.
    public var description: String {
        switch self {
        case .invalidVoice(let identifier):
            "Invalid voice identifier: \(identifier)"
        case .synthesisFailure(let detail):
            "Speech synthesis failed: \(detail)"
        case .encodingFailure(let detail):
            "Audio encoding failed: \(detail)"
        case .noAudioGenerated:
            "No audio was generated"
        case .unsupportedFormat:
            "Unsupported audio format"
        }
    }
}
// MARK: - Service Actor
/// Actor that renders text to 16-bit PCM WAV audio with `AVSpeechSynthesizer`.
public actor TextToSpeechService {
    /// Strong references to every synthesizer with a synthesis in flight.
    ///
    /// Keyed per synthesizer because the actor is reentrant across `await`: with a single
    /// slot, the first of two overlapping calls to finish would release the other's synthesizer.
    private var activeSynthesizers: [ObjectIdentifier: AVSpeechSynthesizer] = [:]

    public init() {}

    // MARK: - Public API

    /// Synthesize text to speech.
    /// - Parameters:
    ///   - text: Text to speak.
    ///   - config: Voice, rate, pitch and volume options.
    ///   - outputFormat: Requested container. MP3 encoding is not available, so `.mp3`
    ///     requests produce WAV data and the result's `format` reports `.wav`.
    /// - Returns: Encoded audio plus its sample rate, channel count and duration.
    /// - Throws: `TextToSpeechError.invalidVoice` for an unknown voice identifier,
    ///   `TextToSpeechError.noAudioGenerated` when synthesis yields no samples.
    public func synthesize(
        text: String,
        config: SpeechConfig = .default,
        outputFormat: AudioOutputFormat = .wav
    ) async throws -> TextToSpeechResult {
        let utterance = AVSpeechUtterance(string: text)

        if let voiceId = config.voiceIdentifier {
            guard let voice = AVSpeechSynthesisVoice(identifier: voiceId) else {
                throw TextToSpeechError.invalidVoice(voiceId)
            }
            utterance.voice = voice
        } else {
            // Use default English voice
            utterance.voice = AVSpeechSynthesisVoice(language: "en-US")
        }

        utterance.rate = config.speakingRate
        utterance.pitchMultiplier = config.pitchMultiplier
        utterance.volume = config.volume

        let pcmData = try await collectPCMData(utterance: utterance)

        // Both requests currently yield WAV bytes; report the format that was actually
        // produced so callers do not label WAV data as MP3.
        let audioData: Data
        let producedFormat: AudioOutputFormat
        switch outputFormat {
        case .wav, .mp3:
            audioData = createWAVData(from: pcmData)
            producedFormat = .wav
        }

        // Samples are stored as interleaved Int16, so one frame is 2 bytes per channel.
        let bytesPerFrame = MemoryLayout<Int16>.size * pcmData.channelCount
        let frameCount = bytesPerFrame > 0 ? pcmData.samples.count / bytesPerFrame : 0
        let duration: Float = pcmData.sampleRate > 0
            ? Float(Double(frameCount) / pcmData.sampleRate)
            : 0

        return TextToSpeechResult(
            audioData: audioData,
            format: producedFormat,
            sampleRate: Int(pcmData.sampleRate),
            channels: pcmData.channelCount,
            durationSeconds: duration
        )
    }

    /// List available voices.
    /// - Parameter languageCode: Optional language prefix filter (e.g. "fr" or "fr-CA").
    /// - Returns: Descriptions of the matching installed voices.
    public func listVoices(languageCode: String? = nil) -> [VoiceDescription] {
        var voices = AVSpeechSynthesisVoice.speechVoices()
        if let languageCode {
            voices = voices.filter { $0.language.hasPrefix(languageCode) }
        }
        return voices.map { voice in
            VoiceDescription(
                identifier: voice.identifier,
                name: voice.name,
                language: voice.language,
                isPremium: voice.quality == .enhanced || voice.quality == .premium,
                gender: genderString(for: voice)
            )
        }
    }

    // MARK: - Private Implementation

    /// Interleaved little-endian Int16 PCM plus the format it was captured in.
    private struct PCMBufferData: Sendable {
        let samples: Data
        let sampleRate: Double
        let channelCount: Int
    }

    /// Collect PCM data from the synthesizer's write callback.
    ///
    /// Completion is signalled either by a non-PCM buffer or by a PCM buffer with zero
    /// frames; both resume the continuation exactly once.
    private func collectPCMData(
        utterance: AVSpeechUtterance
    ) async throws -> PCMBufferData {
        // Keep the synthesizer alive for the whole synthesis (see `activeSynthesizers`).
        let synthesizer = AVSpeechSynthesizer()
        let key = ObjectIdentifier(synthesizer)
        activeSynthesizers[key] = synthesizer
        defer { activeSynthesizers[key] = nil }

        return try await withCheckedThrowingContinuation { continuation in
            var pcmData = Data()
            var sampleRate: Double = 0
            var channelCount: Int = 0
            var hasResumed = false

            synthesizer.write(utterance) { buffer in
                guard !hasResumed else { return }

                guard let pcmBuffer = buffer as? AVAudioPCMBuffer, pcmBuffer.frameLength > 0 else {
                    // End of audio.
                    hasResumed = true
                    if pcmData.isEmpty {
                        continuation.resume(throwing: TextToSpeechError.noAudioGenerated)
                    } else {
                        continuation.resume(returning: PCMBufferData(
                            samples: pcmData,
                            sampleRate: sampleRate,
                            channelCount: channelCount
                        ))
                    }
                    return
                }

                // Store format from first buffer
                if sampleRate == 0 {
                    sampleRate = pcmBuffer.format.sampleRate
                    channelCount = Int(pcmBuffer.format.channelCount)
                }

                // Voices may deliver float, Int16 or Int32 PCM; normalize all to Int16.
                if let floatChannels = pcmBuffer.floatChannelData {
                    TextToSpeechService.appendSamples(
                        from: floatChannels, in: pcmBuffer, channelCount: channelCount, to: &pcmData
                    ) { sample in
                        let clamped = max(-1.0, min(1.0, sample))
                        return Int16(clamped * Float(Int16.max))
                    }
                } else if let int16Channels = pcmBuffer.int16ChannelData {
                    TextToSpeechService.appendSamples(
                        from: int16Channels, in: pcmBuffer, channelCount: channelCount, to: &pcmData
                    ) { $0 }
                } else if let int32Channels = pcmBuffer.int32ChannelData {
                    TextToSpeechService.appendSamples(
                        from: int32Channels, in: pcmBuffer, channelCount: channelCount, to: &pcmData
                    ) { Int16(truncatingIfNeeded: $0 >> 16) }
                }
            }
        }
    }

    /// Append one buffer's samples to `output` as interleaved little-endian Int16.
    ///
    /// Handles both planar buffers (one pointer per channel) and interleaved buffers
    /// (a single pointer whose frames are `stride` samples apart).
    private static func appendSamples<Sample>(
        from channels: UnsafePointer<UnsafeMutablePointer<Sample>>,
        in buffer: AVAudioPCMBuffer,
        channelCount: Int,
        to output: inout Data,
        convert: (Sample) -> Int16
    ) {
        let frameCount = Int(buffer.frameLength)
        let isInterleaved = buffer.format.isInterleaved
        let stride = buffer.stride
        output.reserveCapacity(output.count + frameCount * channelCount * MemoryLayout<Int16>.size)

        for frame in 0..<frameCount {
            for channel in 0..<channelCount {
                let raw = isInterleaved
                    ? channels[0][frame * stride + channel]
                    : channels[channel][frame]
                let sample = convert(raw).littleEndian
                withUnsafeBytes(of: sample) { output.append(contentsOf: $0) }
            }
        }
    }

    /// Create WAV data (44-byte canonical PCM header followed by the samples).
    private func createWAVData(from pcmData: PCMBufferData) -> Data {
        let bitsPerSample = 16
        let sampleRate = Int(pcmData.sampleRate)
        let channels = pcmData.channelCount
        let dataSize = pcmData.samples.count

        var header = Data()
        header.reserveCapacity(44)

        /// Append an integer in little-endian byte order.
        func appendLE<T: FixedWidthInteger>(_ value: T) {
            withUnsafeBytes(of: value.littleEndian) { header.append(contentsOf: $0) }
        }

        // RIFF header
        header.append(contentsOf: "RIFF".utf8)
        appendLE(UInt32(dataSize + 36))
        header.append(contentsOf: "WAVE".utf8)

        // fmt subchunk
        header.append(contentsOf: "fmt ".utf8)
        appendLE(UInt32(16))                                        // subchunk size
        appendLE(UInt16(1))                                         // audio format: PCM
        appendLE(UInt16(channels))
        appendLE(UInt32(sampleRate))
        appendLE(UInt32(sampleRate * channels * bitsPerSample / 8)) // byte rate
        appendLE(UInt16(channels * bitsPerSample / 8))              // block align
        appendLE(UInt16(bitsPerSample))

        // data subchunk
        header.append(contentsOf: "data".utf8)
        appendLE(UInt32(dataSize))

        return header + pcmData.samples
    }

    /// Get gender string for voice
    private func genderString(for voice: AVSpeechSynthesisVoice) -> String {
        switch voice.gender {
        case .male: return "male"
        case .female: return "female"
        case .unspecified: return "unspecified"
        @unknown default: return "unknown"
        }
    }
}