import Foundation import Speech import AVFoundation // MARK: - Result Types /// Transcription result public struct TranscriptionResult: Sendable { public let text: String public let segments: [TranscriptionSegmentResult] public let detectedLanguage: String public let confidence: Float } /// Individual transcription segment public struct TranscriptionSegmentResult: Sendable { public let text: String public let startTime: Float public let endTime: Float public let confidence: Float } /// Streaming transcription update public struct StreamingTranscriptionUpdate: Sendable { public let partialText: String public let isFinal: Bool public let finalText: String? public let segments: [TranscriptionSegmentResult] } /// Transcription configuration public struct TranscriptionConfig: Sendable { public var languageCode: String? public var enablePunctuation: Bool public var enableTimestamps: Bool public static let `default` = TranscriptionConfig( languageCode: nil, enablePunctuation: true, enableTimestamps: false ) public init( languageCode: String? = nil, enablePunctuation: Bool = true, enableTimestamps: Bool = false ) { self.languageCode = languageCode self.enablePunctuation = enablePunctuation self.enableTimestamps = enableTimestamps } } // MARK: - Errors public enum SpeechToTextError: Error, CustomStringConvertible, Sendable { case notAvailable case authorizationDenied case modelNotReady(String) case transcriptionFailed(String) case invalidAudioFormat case audioProcessingFailed(String) case unsupportedMimeType(String) public var description: String { switch self { case .notAvailable: return "Speech recognition not available on this system" case .authorizationDenied: return "Speech recognition authorization denied" case .modelNotReady(let reason): return "Speech model not ready: \(reason)" case .transcriptionFailed(let reason): return "Transcription failed: \(reason)" case .invalidAudioFormat: return "Invalid audio format" case .audioProcessingFailed(let reason): return "Audio processing failed: \(reason)" case .unsupportedMimeType(let type): return "Unsupported audio MIME type: \(type)" } } } // MARK: - Service Actor public actor SpeechToTextService { /// Service availability status public private(set) var isAvailable: Bool = false /// Streaming session state private var isStreamingActive: Bool = false public init() async { await checkAvailability() } // MARK: - Public API /// Transcribe audio data (file-based) public func transcribe( audioData: Data, mimeType: String, config: TranscriptionConfig = .default ) async throws -> TranscriptionResult { guard isAvailable else { throw SpeechToTextError.notAvailable } // Convert audio data to file URL for processing let tempURL = try createTempAudioFile(data: audioData, mimeType: mimeType) defer { try? FileManager.default.removeItem(at: tempURL) } return try await transcribeWithSFSpeechRecognizer(url: tempURL, config: config) } /// Stream transcription from audio chunks public func streamTranscribe( config: TranscriptionConfig = .default ) -> AsyncThrowingStream { AsyncThrowingStream { continuation in Task { guard self.isAvailable else { continuation.finish(throwing: SpeechToTextError.notAvailable) return } do { try await self.startStreamingWithSFSpeechRecognizer(config: config, continuation: continuation) } catch { continuation.finish(throwing: error) } } } } /// Feed audio chunk for streaming transcription public func feedAudioChunk(_ chunk: Data) async throws { guard isStreamingActive else { throw SpeechToTextError.transcriptionFailed("No active streaming session") } // Audio chunk handling implemented in streaming methods } /// End streaming session public func endStreamingSession() async { isStreamingActive = false } /// Get status information public func getStatus() -> String { if isAvailable { return "SFSpeechRecognizer available" } else { return "Speech recognition not available" } } // MARK: - Private Implementation private func checkAvailability() async { // Check SFSpeechRecognizer availability let status = SFSpeechRecognizer.authorizationStatus() switch status { case .authorized: isAvailable = SFSpeechRecognizer.supportedLocales().count > 0 case .notDetermined: // Request authorization isAvailable = await withCheckedContinuation { continuation in SFSpeechRecognizer.requestAuthorization { newStatus in continuation.resume(returning: newStatus == .authorized) } } default: isAvailable = false } } /// Create temporary audio file from data private func createTempAudioFile(data: Data, mimeType: String) throws -> URL { let ext = extensionForMimeType(mimeType) let tempDir = FileManager.default.temporaryDirectory let fileName = UUID().uuidString + "." + ext let fileURL = tempDir.appendingPathComponent(fileName) try data.write(to: fileURL) return fileURL } /// Get file extension for MIME type private func extensionForMimeType(_ mimeType: String) -> String { switch mimeType.lowercased() { case "audio/wav", "audio/wave", "audio/x-wav": return "wav" case "audio/mp3", "audio/mpeg": return "mp3" case "audio/m4a", "audio/mp4", "audio/x-m4a": return "m4a" case "audio/aac": return "aac" case "audio/flac": return "flac" default: return "wav" } } /// Transcribe using SFSpeechRecognizer private func transcribeWithSFSpeechRecognizer( url: URL, config: TranscriptionConfig ) async throws -> TranscriptionResult { let locale = Locale(identifier: config.languageCode ?? "en-US") guard let recognizer = SFSpeechRecognizer(locale: locale) else { throw SpeechToTextError.notAvailable } guard recognizer.isAvailable else { throw SpeechToTextError.notAvailable } let request = SFSpeechURLRecognitionRequest(url: url) request.shouldReportPartialResults = false return try await withCheckedThrowingContinuation { continuation in var hasResumed = false recognizer.recognitionTask(with: request) { result, error in guard !hasResumed else { return } if let error = error { hasResumed = true continuation.resume(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription)) return } guard let result = result, result.isFinal else { return } hasResumed = true let transcription = result.bestTranscription var segments: [TranscriptionSegmentResult] = [] if config.enableTimestamps { for segment in transcription.segments { segments.append(TranscriptionSegmentResult( text: segment.substring, startTime: Float(segment.timestamp), endTime: Float(segment.timestamp + segment.duration), confidence: segment.confidence )) } } let transcriptionResult = TranscriptionResult( text: transcription.formattedString, segments: segments, detectedLanguage: config.languageCode ?? "en-US", confidence: segments.isEmpty ? 1.0 : segments.reduce(0) { $0 + $1.confidence } / Float(segments.count) ) continuation.resume(returning: transcriptionResult) } } } /// Start streaming with SFSpeechRecognizer private func startStreamingWithSFSpeechRecognizer( config: TranscriptionConfig, continuation: AsyncThrowingStream.Continuation ) async throws { let locale = Locale(identifier: config.languageCode ?? "en-US") guard let recognizer = SFSpeechRecognizer(locale: locale) else { throw SpeechToTextError.notAvailable } guard recognizer.isAvailable else { throw SpeechToTextError.notAvailable } isStreamingActive = true let audioEngine = AVAudioEngine() let request = SFSpeechAudioBufferRecognitionRequest() request.shouldReportPartialResults = true let inputNode = audioEngine.inputNode let recordingFormat = inputNode.outputFormat(forBus: 0) inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in request.append(buffer) } audioEngine.prepare() try audioEngine.start() recognizer.recognitionTask(with: request) { result, error in if let error = error { continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription)) return } guard let result = result else { return } let transcription = result.bestTranscription var segments: [TranscriptionSegmentResult] = [] if config.enableTimestamps { for segment in transcription.segments { segments.append(TranscriptionSegmentResult( text: segment.substring, startTime: Float(segment.timestamp), endTime: Float(segment.timestamp + segment.duration), confidence: segment.confidence )) } } let update = StreamingTranscriptionUpdate( partialText: transcription.formattedString, isFinal: result.isFinal, finalText: result.isFinal ? transcription.formattedString : nil, segments: segments ) continuation.yield(update) if result.isFinal { audioEngine.stop() inputNode.removeTap(onBus: 0) continuation.finish() } } // Wait for streaming to end while isStreamingActive { try await Task.sleep(for: .milliseconds(100)) } audioEngine.stop() inputNode.removeTap(onBus: 0) request.endAudio() } }