// swift-apple-intelligence-grpc/Sources/AppleIntelligenceCore/Services/TextToSpeechService.swift
import Foundation
import AVFoundation

// MARK: - Result Types

/// Result of text-to-speech synthesis
public struct TextToSpeechResult: Sendable {
    public let audioData: Data
    public let format: AudioOutputFormat
    public let sampleRate: Int
    public let channels: Int
    public let durationSeconds: Float
}

/// Supported output formats
public enum AudioOutputFormat: Sendable {
    case wav
    case mp3
}

/// Voice information
public struct VoiceDescription: Sendable {
    public let identifier: String
    public let name: String
    public let language: String
    public let isPremium: Bool
    public let gender: String
}

/// Configuration for speech synthesis
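///
/// Example (illustrative):
/// ```swift
/// // Slightly faster, higher-pitched speech with the default voice
/// let config = SpeechConfig(speakingRate: 0.55, pitchMultiplier: 1.2)
/// ```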
public struct SpeechConfig: Sendable {
    public var voiceIdentifier: String?
    public var speakingRate: Float    // 0.0 - 1.0
    public var pitchMultiplier: Float // 0.5 - 2.0
    public var volume: Float          // 0.0 - 1.0

    public static let `default` = SpeechConfig(
        voiceIdentifier: nil,
        speakingRate: 0.5,
        pitchMultiplier: 1.0,
        volume: 1.0
    )

    public init(
        voiceIdentifier: String? = nil,
        speakingRate: Float = 0.5,
        pitchMultiplier: Float = 1.0,
        volume: Float = 1.0
    ) {
        self.voiceIdentifier = voiceIdentifier
        self.speakingRate = speakingRate
        self.pitchMultiplier = pitchMultiplier
        self.volume = volume
    }
}

// MARK: - Errors

public enum TextToSpeechError: Error, CustomStringConvertible, Sendable {
    case invalidVoice(String)
    case synthesisFailure(String)
    case encodingFailure(String)
    case noAudioGenerated
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidVoice(let id): return "Invalid voice identifier: \(id)"
        case .synthesisFailure(let reason): return "Speech synthesis failed: \(reason)"
        case .encodingFailure(let reason): return "Audio encoding failed: \(reason)"
        case .noAudioGenerated: return "No audio was generated"
        case .unsupportedFormat: return "Unsupported audio format"
        }
    }
}

// MARK: - Service Actor

public actor TextToSpeechService {
    /// Keep strong reference to synthesizer during synthesis
    private var activeSynthesizer: AVSpeechSynthesizer?

    public init() {}

    // MARK: - Public API

    /// Synthesize text to speech
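    ///
    /// Example (illustrative):
    /// ```swift
    /// let tts = TextToSpeechService()
    /// let result = try await tts.synthesize(text: "Hello, world!")
    /// // result.audioData holds a complete WAV file
    /// ```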
    public func synthesize(
        text: String,
        config: SpeechConfig = .default,
        outputFormat: AudioOutputFormat = .wav
    ) async throws -> TextToSpeechResult {
        // Create utterance
        let utterance = AVSpeechUtterance(string: text)

        // Configure voice
        if let voiceId = config.voiceIdentifier {
            if let voice = AVSpeechSynthesisVoice(identifier: voiceId) {
                utterance.voice = voice
            } else {
                throw TextToSpeechError.invalidVoice(voiceId)
            }
        } else {
            // Use default English voice
            utterance.voice = AVSpeechSynthesisVoice(language: "en-US")
        }

        // Configure speech parameters
        utterance.rate = config.speakingRate
        utterance.pitchMultiplier = config.pitchMultiplier
        utterance.volume = config.volume

        // Collect PCM data
        let pcmData = try await collectPCMData(utterance: utterance)

        // Convert to requested format. MP3 encoding requires an external
        // library, so MP3 requests fall back to WAV; the result reports the
        // format actually produced.
        let audioData: Data
        let actualFormat: AudioOutputFormat
        switch outputFormat {
        case .wav:
            audioData = createWAVData(from: pcmData)
            actualFormat = .wav
        case .mp3:
            // Fallback: emit WAV until an MP3 encoder is integrated
            audioData = createWAVData(from: pcmData)
            actualFormat = .wav
        }

        // Calculate duration from the 16-bit interleaved sample count
        let bytesPerSample = 2 // Int16
        let totalSamples = pcmData.samples.count / bytesPerSample / pcmData.channelCount
        let duration = Float(totalSamples) / Float(pcmData.sampleRate)

        return TextToSpeechResult(
            audioData: audioData,
            format: actualFormat,
            sampleRate: Int(pcmData.sampleRate),
            channels: pcmData.channelCount,
            durationSeconds: duration
        )
    }

    /// List available voices
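    ///
    /// Example (illustrative):
    /// ```swift
    /// // All Canadian French voices installed on the system
    /// let voices = await tts.listVoices(languageCode: "fr-CA")
    /// ```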
    public func listVoices(languageCode: String? = nil) -> [VoiceDescription] {
        let voices = AVSpeechSynthesisVoice.speechVoices()
        let filtered: [AVSpeechSynthesisVoice]
        if let lang = languageCode {
            filtered = voices.filter { $0.language.hasPrefix(lang) }
        } else {
            filtered = voices
        }
        return filtered.map { voice in
            VoiceDescription(
                identifier: voice.identifier,
                name: voice.name,
                language: voice.language,
                isPremium: voice.quality == .enhanced || voice.quality == .premium,
                gender: genderString(for: voice)
            )
        }
    }

    // MARK: - Private Implementation

    /// PCM buffer data for internal processing
    private struct PCMBufferData: Sendable {
        let samples: Data
        let sampleRate: Double
        let channelCount: Int
    }

    /// Collect PCM data from synthesizer using write callback
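    ///
    /// `AVSpeechSynthesizer.write(_:toBufferCallback:)` delivers audio as a
    /// series of `AVAudioPCMBuffer`s and signals completion with a final
    /// buffer whose frame length is zero; that sentinel resumes the
    /// continuation below.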
    private func collectPCMData(
        utterance: AVSpeechUtterance
    ) async throws -> PCMBufferData {
        // Create and store synthesizer to keep strong reference during synthesis
        let synthesizer = AVSpeechSynthesizer()
        self.activeSynthesizer = synthesizer
        defer { self.activeSynthesizer = nil }

        return try await withCheckedThrowingContinuation { continuation in
            var pcmData = Data()
            var sampleRate: Double = 0
            var channelCount: Int = 0
            var hasResumed = false

            // Resume the continuation exactly once with whatever was collected
            func finish() {
                guard !hasResumed else { return }
                hasResumed = true
                if pcmData.isEmpty {
                    continuation.resume(throwing: TextToSpeechError.noAudioGenerated)
                } else {
                    continuation.resume(returning: PCMBufferData(
                        samples: pcmData,
                        sampleRate: sampleRate,
                        channelCount: channelCount
                    ))
                }
            }

            synthesizer.write(utterance) { buffer in
                // End of audio: the final callback delivers either a non-PCM
                // buffer or a PCM buffer with zero frames
                guard let pcmBuffer = buffer as? AVAudioPCMBuffer,
                      pcmBuffer.frameLength > 0 else {
                    finish()
                    return
                }

                // Store format from first buffer
                if sampleRate == 0 {
                    sampleRate = pcmBuffer.format.sampleRate
                    channelCount = Int(pcmBuffer.format.channelCount)
                }

                // Convert float samples to interleaved Int16 PCM
                if let channelData = pcmBuffer.floatChannelData {
                    let frameCount = Int(pcmBuffer.frameLength)
                    for frame in 0..<frameCount {
                        for channel in 0..<channelCount {
                            let sample = channelData[channel][frame]
                            let clampedSample = max(-1.0, min(1.0, sample))
                            let int16Sample = Int16(clampedSample * Float(Int16.max))
                            withUnsafeBytes(of: int16Sample.littleEndian) { bytes in
                                pcmData.append(contentsOf: bytes)
                            }
                        }
                    }
                }
            }
        }
    }

    /// Create WAV data from PCM buffer data
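    ///
    /// Produces the canonical 44-byte RIFF/WAVE header ("RIFF" + size,
    /// "WAVE", a 16-byte "fmt " subchunk describing 16-bit little-endian
    /// PCM, and a "data" subchunk) followed by the raw samples.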
    private func createWAVData(from pcmData: PCMBufferData) -> Data {
        let bitsPerSample = 16
        let sampleRate = Int(pcmData.sampleRate)
        let channels = pcmData.channelCount
        let dataSize = pcmData.samples.count

        var header = Data()

        // RIFF header
        header.append(contentsOf: "RIFF".utf8)
        let fileSize = UInt32(dataSize + 36) // 36 = header bytes after this field
        withUnsafeBytes(of: fileSize.littleEndian) { header.append(contentsOf: $0) }
        header.append(contentsOf: "WAVE".utf8)

        // fmt subchunk
        header.append(contentsOf: "fmt ".utf8)
        let subchunk1Size = UInt32(16)
        withUnsafeBytes(of: subchunk1Size.littleEndian) { header.append(contentsOf: $0) }
        let audioFormat = UInt16(1) // PCM
        withUnsafeBytes(of: audioFormat.littleEndian) { header.append(contentsOf: $0) }
        let numChannels = UInt16(channels)
        withUnsafeBytes(of: numChannels.littleEndian) { header.append(contentsOf: $0) }
        let sampleRateU32 = UInt32(sampleRate)
        withUnsafeBytes(of: sampleRateU32.littleEndian) { header.append(contentsOf: $0) }
        let byteRate = UInt32(sampleRate * channels * bitsPerSample / 8)
        withUnsafeBytes(of: byteRate.littleEndian) { header.append(contentsOf: $0) }
        let blockAlign = UInt16(channels * bitsPerSample / 8)
        withUnsafeBytes(of: blockAlign.littleEndian) { header.append(contentsOf: $0) }
        let bitsPerSampleU16 = UInt16(bitsPerSample)
        withUnsafeBytes(of: bitsPerSampleU16.littleEndian) { header.append(contentsOf: $0) }

        // data subchunk
        header.append(contentsOf: "data".utf8)
        let dataU32 = UInt32(dataSize)
        withUnsafeBytes(of: dataU32.littleEndian) { header.append(contentsOf: $0) }

        return header + pcmData.samples
    }

    /// Get gender string for voice
    private func genderString(for voice: AVSpeechSynthesisVoice) -> String {
        switch voice.gender {
        case .male: return "male"
        case .female: return "female"
        case .unspecified: return "unspecified"
        @unknown default: return "unknown"
        }
    }
}