import Foundation
import AVFoundation

// MARK: - Result Types

/// Result of text-to-speech synthesis.
public struct TextToSpeechResult: Sendable {
    /// Encoded audio bytes (see `format`).
    public let audioData: Data
    /// Container format of `audioData`. NOTE: `.mp3` currently yields WAV
    /// bytes as a fallback (MP3 encoding requires an external library).
    public let format: AudioOutputFormat
    public let sampleRate: Int
    public let channels: Int
    public let durationSeconds: Float
}

/// Supported output formats.
public enum AudioOutputFormat: Sendable {
    case wav
    case mp3
}

/// Describes one installed system voice.
public struct VoiceDescription: Sendable {
    public let identifier: String
    public let name: String
    public let language: String
    public let isPremium: Bool
    public let gender: String
}

/// Configuration for speech synthesis.
public struct SpeechConfig: Sendable {
    /// System voice identifier; `nil` selects a default "en-US" voice.
    public var voiceIdentifier: String?
    public var speakingRate: Float    // 0.0 - 1.0
    public var pitchMultiplier: Float // 0.5 - 2.0
    public var volume: Float          // 0.0 - 1.0

    public static let `default` = SpeechConfig(
        voiceIdentifier: nil,
        speakingRate: 0.5,
        pitchMultiplier: 1.0,
        volume: 1.0
    )

    public init(
        voiceIdentifier: String? = nil,
        speakingRate: Float = 0.5,
        pitchMultiplier: Float = 1.0,
        volume: Float = 1.0
    ) {
        self.voiceIdentifier = voiceIdentifier
        self.speakingRate = speakingRate
        self.pitchMultiplier = pitchMultiplier
        self.volume = volume
    }
}

// MARK: - Errors

/// Errors thrown by `TextToSpeechService`.
public enum TextToSpeechError: Error, CustomStringConvertible, Sendable {
    case invalidVoice(String)
    case synthesisFailure(String)
    case encodingFailure(String)
    case noAudioGenerated
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidVoice(let id):
            return "Invalid voice identifier: \(id)"
        case .synthesisFailure(let reason):
            return "Speech synthesis failed: \(reason)"
        case .encodingFailure(let reason):
            return "Audio encoding failed: \(reason)"
        case .noAudioGenerated:
            return "No audio was generated"
        case .unsupportedFormat:
            return "Unsupported audio format"
        }
    }
}

// MARK: - Service Actor

public actor TextToSpeechService {

    /// Strong reference to the synthesizer for the duration of a synthesis
    /// call, so it is not deallocated while buffers are still being delivered.
    private var activeSynthesizer: AVSpeechSynthesizer?

    public init() {}

    // MARK: - Public API

    /// Synthesize `text` to audio data.
    ///
    /// - Parameters:
    ///   - text: The text to speak.
    ///   - config: Voice / rate / pitch / volume settings.
    ///   - outputFormat: Requested container. `.mp3` falls back to WAV bytes
    ///     (MP3 encoding requires an external library); the returned
    ///     `TextToSpeechResult.format` still reports the requested format for
    ///     backward compatibility.
    /// - Returns: The encoded audio plus its sample rate, channel count, and duration.
    /// - Throws: `TextToSpeechError.invalidVoice` if `config.voiceIdentifier`
    ///   does not resolve to an installed voice;
    ///   `TextToSpeechError.noAudioGenerated` if synthesis produced no samples.
    public func synthesize(
        text: String,
        config: SpeechConfig = .default,
        outputFormat: AudioOutputFormat = .wav
    ) async throws -> TextToSpeechResult {
        let utterance = AVSpeechUtterance(string: text)

        // Resolve the voice up front so an invalid identifier fails fast.
        if let voiceId = config.voiceIdentifier {
            guard let voice = AVSpeechSynthesisVoice(identifier: voiceId) else {
                throw TextToSpeechError.invalidVoice(voiceId)
            }
            utterance.voice = voice
        } else {
            // Default English voice when none was requested.
            utterance.voice = AVSpeechSynthesisVoice(language: "en-US")
        }

        utterance.rate = config.speakingRate
        utterance.pitchMultiplier = config.pitchMultiplier
        utterance.volume = config.volume

        // Collect raw 16-bit PCM from the synthesizer.
        let pcmData = try await collectPCMData(utterance: utterance)

        let audioData: Data
        switch outputFormat {
        case .wav:
            audioData = createWAVData(from: pcmData)
        case .mp3:
            // WAV fallback: MP3 encoding requires an external library.
            audioData = createWAVData(from: pcmData)
        }

        // Duration = frames / sampleRate. Guard against a zero channel count
        // or sample rate (possible if no valid format was captured) so we
        // never divide by zero.
        let bytesPerSample = 2 // Int16
        let duration: Float
        if pcmData.channelCount > 0 && pcmData.sampleRate > 0 {
            let totalFrames = pcmData.samples.count / bytesPerSample / pcmData.channelCount
            duration = Float(totalFrames) / Float(pcmData.sampleRate)
        } else {
            duration = 0
        }

        return TextToSpeechResult(
            audioData: audioData,
            format: outputFormat,
            sampleRate: Int(pcmData.sampleRate),
            channels: pcmData.channelCount,
            durationSeconds: duration
        )
    }

    /// List available system voices.
    ///
    /// - Parameter languageCode: When non-nil, only voices whose language
    ///   starts with this prefix (e.g. "en" matches "en-US" and "en-GB").
    /// - Returns: Descriptions of the matching voices.
    public func listVoices(languageCode: String? = nil) -> [VoiceDescription] {
        let voices = AVSpeechSynthesisVoice.speechVoices()

        let filtered: [AVSpeechSynthesisVoice]
        if let lang = languageCode {
            filtered = voices.filter { $0.language.hasPrefix(lang) }
        } else {
            filtered = voices
        }

        return filtered.map { voice in
            VoiceDescription(
                identifier: voice.identifier,
                name: voice.name,
                language: voice.language,
                isPremium: voice.quality == .enhanced || voice.quality == .premium,
                gender: genderString(for: voice)
            )
        }
    }

    // MARK: - Private Implementation

    /// PCM buffer data for internal processing.
    private struct PCMBufferData: Sendable {
        /// Interleaved little-endian Int16 samples.
        let samples: Data
        let sampleRate: Double
        let channelCount: Int
    }

    /// Collect PCM data from the synthesizer using the write callback.
    ///
    /// - Throws: `TextToSpeechError.noAudioGenerated` when synthesis completes
    ///   without producing any samples.
    private func collectPCMData(
        utterance: AVSpeechUtterance
    ) async throws -> PCMBufferData {
        // Keep a strong reference for the duration of synthesis.
        let synthesizer = AVSpeechSynthesizer()
        self.activeSynthesizer = synthesizer
        defer { self.activeSynthesizer = nil }

        return try await withCheckedThrowingContinuation { continuation in
            var pcmData = Data()
            var sampleRate: Double = 0
            var channelCount: Int = 0
            var hasResumed = false

            synthesizer.write(utterance) { buffer in
                // End of stream: AVSpeechSynthesizer signals completion with a
                // *zero-length* AVAudioPCMBuffer. (A plain `as?` check alone
                // would miss it and the continuation would never resume.)
                guard let pcmBuffer = buffer as? AVAudioPCMBuffer,
                      pcmBuffer.frameLength > 0 else {
                    if !hasResumed {
                        hasResumed = true
                        if pcmData.isEmpty {
                            continuation.resume(throwing: TextToSpeechError.noAudioGenerated)
                        } else {
                            continuation.resume(returning: PCMBufferData(
                                samples: pcmData,
                                sampleRate: sampleRate,
                                channelCount: channelCount
                            ))
                        }
                    }
                    return
                }

                // Capture the stream format from the first non-empty buffer.
                if sampleRate == 0 {
                    sampleRate = pcmBuffer.format.sampleRate
                    channelCount = Int(pcmBuffer.format.channelCount)
                }

                // Convert float samples to interleaved little-endian Int16.
                // Clamp to [-1, 1] first: Int16(x * 32767) traps on
                // out-of-range floats, which synthesizers can produce.
                if let channelData = pcmBuffer.floatChannelData {
                    let frameCount = Int(pcmBuffer.frameLength)
                    for frame in 0..<frameCount {
                        for channel in 0..<channelCount {
                            let raw = channelData[channel][frame]
                            let clamped = max(-1.0, min(1.0, raw))
                            let sample = Int16(clamped * Float(Int16.max))
                            withUnsafeBytes(of: sample.littleEndian) {
                                pcmData.append(contentsOf: $0)
                            }
                        }
                    }
                }
            }
        }
    }

    /// Wrap raw Int16 PCM samples in a standard 44-byte RIFF/WAVE header.
    private func createWAVData(from pcmData: PCMBufferData) -> Data {
        let bitsPerSample = 16
        let sampleRate = Int(pcmData.sampleRate)
        let channels = pcmData.channelCount
        let dataSize = pcmData.samples.count

        var header = Data()

        // RIFF header: chunk size is file size minus the 8-byte RIFF preamble
        // (i.e. dataSize + 36 for the standard 44-byte header).
        header.append(contentsOf: "RIFF".utf8)
        let fileSize = UInt32(dataSize + 36)
        withUnsafeBytes(of: fileSize.littleEndian) { header.append(contentsOf: $0) }
        header.append(contentsOf: "WAVE".utf8)

        // fmt subchunk (16 bytes, PCM).
        header.append(contentsOf: "fmt ".utf8)
        let subchunk1Size = UInt32(16)
        withUnsafeBytes(of: subchunk1Size.littleEndian) { header.append(contentsOf: $0) }
        let audioFormat = UInt16(1) // 1 = uncompressed PCM
        withUnsafeBytes(of: audioFormat.littleEndian) { header.append(contentsOf: $0) }
        let numChannels = UInt16(channels)
        withUnsafeBytes(of: numChannels.littleEndian) { header.append(contentsOf: $0) }
        let sampleRateU32 = UInt32(sampleRate)
        withUnsafeBytes(of: sampleRateU32.littleEndian) { header.append(contentsOf: $0) }
        let byteRate = UInt32(sampleRate * channels * bitsPerSample / 8)
        withUnsafeBytes(of: byteRate.littleEndian) { header.append(contentsOf: $0) }
        let blockAlign = UInt16(channels * bitsPerSample / 8)
        withUnsafeBytes(of: blockAlign.littleEndian) { header.append(contentsOf: $0) }
        let bitsPerSampleU16 = UInt16(bitsPerSample)
        withUnsafeBytes(of: bitsPerSampleU16.littleEndian) { header.append(contentsOf: $0) }

        // data subchunk.
        header.append(contentsOf: "data".utf8)
        let dataU32 = UInt32(dataSize)
        withUnsafeBytes(of: dataU32.littleEndian) { header.append(contentsOf: $0) }

        return header + pcmData.samples
    }

    /// Map a voice's gender to the string exposed in `VoiceDescription`.
    private func genderString(for voice: AVSpeechSynthesisVoice) -> String {
        switch voice.gender {
        case .male: return "male"
        case .female: return "female"
        case .unspecified: return "unspecified"
        @unknown default: return "unknown"
        }
    }
}