// TextToSpeechService.swift
//
// - Add TTS service using AVSpeechSynthesizer for voice output
// - Add STT service using SpeechAnalyzer (macOS 26) for transcription
// - Add voice input (microphone) button in chat with recording level indicator
// - Add speak button on assistant messages for TTS playback
// - Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
// - Fix Swift 6 strict concurrency issues in audio callbacks
// - Update proto schema with TTS/STT message types and RPCs
// - Update gRPC provider with speech service endpoints
import Foundation
import AVFoundation
// MARK: - Result Types

/// Result of text-to-speech synthesis.
public struct TextToSpeechResult: Sendable {
    /// Encoded audio payload (see `format` for the container).
    public let audioData: Data
    /// Container format the caller requested (note: `.mp3` currently
    /// yields WAV-encoded bytes — TTS service has no MP3 encoder).
    public let format: AudioOutputFormat
    /// Sample rate of the audio in Hz.
    public let sampleRate: Int
    /// Number of interleaved channels.
    public let channels: Int
    /// Total playback duration in seconds.
    public let durationSeconds: Float

    /// Public memberwise initializer. Without it the synthesized
    /// memberwise init is `internal`, so code outside the module
    /// (tests, mocks) cannot construct a result.
    public init(
        audioData: Data,
        format: AudioOutputFormat,
        sampleRate: Int,
        channels: Int,
        durationSeconds: Float
    ) {
        self.audioData = audioData
        self.format = format
        self.sampleRate = sampleRate
        self.channels = channels
        self.durationSeconds = durationSeconds
    }
}
/// Supported output formats.
///
/// Raw values double as conventional file extensions. Note that MP3
/// encoding is not currently implemented by `TextToSpeechService`;
/// requesting `.mp3` produces WAV data as a fallback.
public enum AudioOutputFormat: String, Sendable, CaseIterable {
    /// Uncompressed 16-bit PCM in a RIFF/WAV container.
    case wav
    /// MPEG-1 Layer III (currently served as WAV fallback).
    case mp3
}
/// Voice information describing one installed system voice.
public struct VoiceDescription: Sendable {
    /// Stable system identifier (e.g. "com.apple.voice.compact.en-US.Samantha").
    public let identifier: String
    /// Human-readable voice name.
    public let name: String
    /// BCP-47 language tag (e.g. "en-US", "fr-CA").
    public let language: String
    /// True for enhanced/premium-quality voices.
    public let isPremium: Bool
    /// "male", "female", "unspecified", or "unknown".
    public let gender: String

    /// Public memberwise initializer. Without it the synthesized
    /// memberwise init is `internal`, so code outside the module
    /// (tests, mocks) cannot construct a description.
    public init(
        identifier: String,
        name: String,
        language: String,
        isPremium: Bool,
        gender: String
    ) {
        self.identifier = identifier
        self.name = name
        self.language = language
        self.isPremium = isPremium
        self.gender = gender
    }
}
/// Configuration for speech synthesis.
public struct SpeechConfig: Sendable {
    /// Identifier of the voice to use; `nil` selects the service's default voice.
    public var voiceIdentifier: String?
    /// Speaking rate, 0.0 - 1.0 (0.5 is the platform-normal rate).
    public var speakingRate: Float
    /// Pitch multiplier, 0.5 - 2.0 (1.0 = unmodified pitch).
    public var pitchMultiplier: Float
    /// Output volume, 0.0 - 1.0.
    public var volume: Float

    /// Creates a configuration; every parameter has a sensible default,
    /// so `SpeechConfig()` yields the standard settings.
    public init(
        voiceIdentifier: String? = nil,
        speakingRate: Float = 0.5,
        pitchMultiplier: Float = 1.0,
        volume: Float = 1.0
    ) {
        self.voiceIdentifier = voiceIdentifier
        self.speakingRate = speakingRate
        self.pitchMultiplier = pitchMultiplier
        self.volume = volume
    }

    /// Default configuration: system voice, normal rate, pitch, and volume.
    /// Expressed via the initializer's default arguments so the two can
    /// never drift apart.
    public static let `default` = SpeechConfig()
}
// MARK: - Errors

/// Errors thrown by `TextToSpeechService`.
public enum TextToSpeechError: Error, CustomStringConvertible, LocalizedError, Sendable {
    /// The supplied voice identifier does not resolve to an installed voice.
    case invalidVoice(String)
    /// The synthesizer failed to produce audio.
    case synthesisFailure(String)
    /// The PCM data could not be encoded into the requested container.
    case encodingFailure(String)
    /// Synthesis completed but produced zero audio samples.
    case noAudioGenerated
    /// The requested output format is not supported.
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidVoice(let id): return "Invalid voice identifier: \(id)"
        case .synthesisFailure(let reason): return "Speech synthesis failed: \(reason)"
        case .encodingFailure(let reason): return "Audio encoding failed: \(reason)"
        case .noAudioGenerated: return "No audio was generated"
        case .unsupportedFormat: return "Unsupported audio format"
        }
    }

    /// `LocalizedError` conformance so `error.localizedDescription`
    /// surfaces the human-readable message above instead of the
    /// generic NSError-bridged text.
    public var errorDescription: String? { description }
}
// MARK: - Service Actor

/// Actor that wraps `AVSpeechSynthesizer` to turn text into encoded
/// audio data (16-bit PCM WAV), one utterance at a time.
public actor TextToSpeechService {
    /// Keep strong reference to synthesizer during synthesis;
    /// `AVSpeechSynthesizer` must stay alive until its write callback
    /// delivers the final buffer.
    private var activeSynthesizer: AVSpeechSynthesizer?

    public init() {}

    // MARK: - Public API

    /// Synthesize text to speech.
    ///
    /// - Parameters:
    ///   - text: The text to speak.
    ///   - config: Voice and prosody settings (rate, pitch, volume).
    ///   - outputFormat: Requested container. MP3 encoding requires an
    ///     external library and is not implemented; `.mp3` currently
    ///     yields WAV data (the result's `format` still echoes the request).
    /// - Returns: Encoded audio plus sample-rate/channel/duration metadata.
    /// - Throws: `TextToSpeechError.invalidVoice` when
    ///   `config.voiceIdentifier` does not resolve;
    ///   `TextToSpeechError.noAudioGenerated` when synthesis produces
    ///   zero samples.
    public func synthesize(
        text: String,
        config: SpeechConfig = .default,
        outputFormat: AudioOutputFormat = .wav
    ) async throws -> TextToSpeechResult {
        // Create utterance
        let utterance = AVSpeechUtterance(string: text)

        // Configure voice: explicit identifier wins; otherwise fall back
        // to the default English voice.
        if let voiceId = config.voiceIdentifier {
            guard let voice = AVSpeechSynthesisVoice(identifier: voiceId) else {
                throw TextToSpeechError.invalidVoice(voiceId)
            }
            utterance.voice = voice
        } else {
            utterance.voice = AVSpeechSynthesisVoice(language: "en-US")
        }

        // Configure speech parameters
        utterance.rate = config.speakingRate
        utterance.pitchMultiplier = config.pitchMultiplier
        utterance.volume = config.volume

        // Collect raw PCM from the synthesizer callback.
        let pcmData = try await collectPCMData(utterance: utterance)

        // Convert to requested format. MP3 falls back to WAV (no encoder).
        let audioData: Data
        switch outputFormat {
        case .wav:
            audioData = createWAVData(from: pcmData)
        case .mp3:
            // Use WAV as fallback (MP3 encoding requires external library)
            audioData = createWAVData(from: pcmData)
        }

        // Duration: bytes / 2 (Int16) / channels = frames; frames / rate = s.
        // channelCount >= 1 here because collectPCMData throws on empty audio.
        let bytesPerSample = 2 // Int16
        let totalSamples = pcmData.samples.count / bytesPerSample / pcmData.channelCount
        let duration = Float(totalSamples) / Float(pcmData.sampleRate)

        return TextToSpeechResult(
            audioData: audioData,
            format: outputFormat,
            sampleRate: Int(pcmData.sampleRate),
            channels: pcmData.channelCount,
            durationSeconds: duration
        )
    }

    /// List available system voices.
    ///
    /// - Parameter languageCode: Optional BCP-47 prefix filter
    ///   (e.g. "en" or "fr-CA"); `nil` returns every installed voice.
    public func listVoices(languageCode: String? = nil) -> [VoiceDescription] {
        let voices = AVSpeechSynthesisVoice.speechVoices()

        let filtered: [AVSpeechSynthesisVoice]
        if let lang = languageCode {
            filtered = voices.filter { $0.language.hasPrefix(lang) }
        } else {
            filtered = voices
        }

        return filtered.map { voice in
            VoiceDescription(
                identifier: voice.identifier,
                name: voice.name,
                language: voice.language,
                isPremium: voice.quality == .enhanced || voice.quality == .premium,
                gender: genderString(for: voice)
            )
        }
    }

    // MARK: - Private Implementation

    /// PCM buffer data for internal processing.
    private struct PCMBufferData: Sendable {
        /// Interleaved little-endian Int16 samples.
        let samples: Data
        /// Sample rate in Hz, taken from the first delivered buffer.
        let sampleRate: Double
        /// Channel count, taken from the first delivered buffer.
        let channelCount: Int
    }

    /// Collect PCM data from the synthesizer using its write callback.
    ///
    /// `AVSpeechSynthesizer.write(_:toBufferCallback:)` signals completion
    /// by delivering a final `AVAudioPCMBuffer` whose `frameLength` is 0.
    /// A failed cast alone is therefore not a reliable end-of-stream
    /// marker; resuming only on that path can leave the checked
    /// continuation suspended forever. Both conditions (zero-length PCM
    /// buffer, or an unexpected non-PCM buffer) are treated as completion.
    private func collectPCMData(
        utterance: AVSpeechUtterance
    ) async throws -> PCMBufferData {
        // Create and store synthesizer to keep strong reference during synthesis
        let synthesizer = AVSpeechSynthesizer()
        self.activeSynthesizer = synthesizer

        defer { self.activeSynthesizer = nil }

        return try await withCheckedThrowingContinuation { continuation in
            var pcmData = Data()
            var sampleRate: Double = 0
            var channelCount: Int = 0
            var hasResumed = false

            synthesizer.write(utterance) { buffer in
                // End of stream: zero-length PCM buffer (the documented
                // completion signal), or defensively any non-PCM buffer.
                guard let pcmBuffer = buffer as? AVAudioPCMBuffer,
                      pcmBuffer.frameLength > 0 else {
                    if !hasResumed {
                        hasResumed = true
                        if pcmData.isEmpty {
                            continuation.resume(throwing: TextToSpeechError.noAudioGenerated)
                        } else {
                            continuation.resume(returning: PCMBufferData(
                                samples: pcmData,
                                sampleRate: sampleRate,
                                channelCount: channelCount
                            ))
                        }
                    }
                    return
                }

                // Store format from first buffer
                if sampleRate == 0 {
                    sampleRate = pcmBuffer.format.sampleRate
                    channelCount = Int(pcmBuffer.format.channelCount)
                }

                // Convert float samples to interleaved little-endian Int16
                // PCM, clamping to [-1, 1] before scaling to avoid overflow.
                if let channelData = pcmBuffer.floatChannelData {
                    let frameCount = Int(pcmBuffer.frameLength)
                    for frame in 0..<frameCount {
                        for channel in 0..<channelCount {
                            let sample = channelData[channel][frame]
                            let clampedSample = max(-1.0, min(1.0, sample))
                            let int16Sample = Int16(clampedSample * Float(Int16.max))
                            withUnsafeBytes(of: int16Sample.littleEndian) { bytes in
                                pcmData.append(contentsOf: bytes)
                            }
                        }
                    }
                }
            }
        }
    }

    /// Create WAV data from PCM buffer data: a 44-byte RIFF/fmt/data
    /// header (PCM format tag 1, little-endian fields) followed by the
    /// raw Int16 samples.
    private func createWAVData(from pcmData: PCMBufferData) -> Data {
        let bitsPerSample = 16
        let sampleRate = Int(pcmData.sampleRate)
        let channels = pcmData.channelCount
        let dataSize = pcmData.samples.count

        var header = Data()

        // RIFF header: chunk size = payload + 36 bytes of header after "RIFF<size>".
        header.append(contentsOf: "RIFF".utf8)
        let fileSize = UInt32(dataSize + 36)
        withUnsafeBytes(of: fileSize.littleEndian) { header.append(contentsOf: $0) }
        header.append(contentsOf: "WAVE".utf8)

        // fmt subchunk (16 bytes, linear PCM).
        header.append(contentsOf: "fmt ".utf8)
        let subchunk1Size = UInt32(16)
        withUnsafeBytes(of: subchunk1Size.littleEndian) { header.append(contentsOf: $0) }
        let audioFormat = UInt16(1) // PCM
        withUnsafeBytes(of: audioFormat.littleEndian) { header.append(contentsOf: $0) }
        let numChannels = UInt16(channels)
        withUnsafeBytes(of: numChannels.littleEndian) { header.append(contentsOf: $0) }
        let sampleRateU32 = UInt32(sampleRate)
        withUnsafeBytes(of: sampleRateU32.littleEndian) { header.append(contentsOf: $0) }
        let byteRate = UInt32(sampleRate * channels * bitsPerSample / 8)
        withUnsafeBytes(of: byteRate.littleEndian) { header.append(contentsOf: $0) }
        let blockAlign = UInt16(channels * bitsPerSample / 8)
        withUnsafeBytes(of: blockAlign.littleEndian) { header.append(contentsOf: $0) }
        let bitsPerSampleU16 = UInt16(bitsPerSample)
        withUnsafeBytes(of: bitsPerSampleU16.littleEndian) { header.append(contentsOf: $0) }

        // data subchunk
        header.append(contentsOf: "data".utf8)
        let dataU32 = UInt32(dataSize)
        withUnsafeBytes(of: dataU32.littleEndian) { header.append(contentsOf: $0) }

        return header + pcmData.samples
    }

    /// Map a voice's `AVSpeechSynthesisVoiceGender` to a plain string.
    private func genderString(for voice: AVSpeechSynthesisVoice) -> String {
        switch voice.gender {
        case .male: return "male"
        case .female: return "female"
        case .unspecified: return "unspecified"
        @unknown default: return "unknown"
        }
    }
}