From b7549459236ea47895e5b36997399e90cd71e699 Mon Sep 17 00:00:00 2001 From: Mathias Beaulieu-Duncan Date: Wed, 31 Dec 2025 02:57:30 -0500 Subject: [PATCH] Add Text-to-Speech and Speech-to-Text features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add TTS service using AVSpeechSynthesizer for voice output - Add STT service using SpeechAnalyzer (macOS 26) for transcription - Add voice input (microphone) button in chat with recording level indicator - Add speak button on assistant messages for TTS playback - Add language toggle (EN-CA/FR-CA) for bilingual speech recognition - Fix Swift 6 strict concurrency issues in audio callbacks - Update proto schema with TTS/STT message types and RPCs - Update gRPC provider with speech service endpoints 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- Proto/apple_intelligence.proto | 115 ++ .../ViewModels/ChatViewModel.swift | 304 ++++++ .../AppleIntelligenceApp/Views/ChatView.swift | 68 +- .../Generated/apple_intelligence.grpc.swift | 819 ++++++++++++++- .../Generated/apple_intelligence.pb.swift | 988 ++++++++++++++++++ .../Providers/AppleIntelligenceProvider.swift | 222 +++- .../Resources/apple_intelligence.pb | Bin 3259 -> 3906 bytes .../Services/SpeechToTextService.swift | 337 ++++++ .../Services/TextToSpeechService.swift | 280 +++++ Sources/AppleIntelligenceServer/main.swift | 26 +- 10 files changed, 3151 insertions(+), 8 deletions(-) create mode 100644 Sources/AppleIntelligenceCore/Services/SpeechToTextService.swift create mode 100644 Sources/AppleIntelligenceCore/Services/TextToSpeechService.swift diff --git a/Proto/apple_intelligence.proto b/Proto/apple_intelligence.proto index c73947a..d144f91 100644 --- a/Proto/apple_intelligence.proto +++ b/Proto/apple_intelligence.proto @@ -51,6 +51,113 @@ message HealthResponse { string model_status = 2; } +// ============ TEXT-TO-SPEECH ============ + +// Audio format enumeration +enum AudioFormat { + AUDIO_FORMAT_UNSPECIFIED = 0; + AUDIO_FORMAT_WAV = 1; + AUDIO_FORMAT_MP3 = 2; +} + +// Voice configuration for TTS +message VoiceConfig { + string voice_identifier = 1; + optional float speaking_rate = 2; // 0.0-1.0, default 0.5 + optional float pitch_multiplier = 3; // 0.5-2.0, default 1.0 + optional float volume = 4; // 0.0-1.0, default 1.0 +} + +// TTS Request +message TextToSpeechRequest { + string text = 1; + AudioFormat output_format = 2; + optional VoiceConfig voice_config = 3; +} + +// TTS Response +message TextToSpeechResponse { + bytes audio_data = 1; + AudioFormat format = 2; + int32 sample_rate = 3; + int32 channels = 4; + float duration_seconds = 5; +} + +// List available voices request +message ListVoicesRequest { + optional string language_code = 1; +} + +// Voice information +message VoiceInfo { + string identifier = 1; + string name = 2; + string language = 3; + bool is_premium = 4; + string gender = 5; +} + +// List voices response +message ListVoicesResponse { + repeated VoiceInfo voices = 1; +} + +// ============ SPEECH-TO-TEXT ============ + +// STT Configuration +message TranscriptionConfig { + optional string language_code = 1; + optional bool enable_punctuation = 2; // default true + optional bool enable_timestamps = 3; // default false +} + +// Audio data for STT +message AudioInput { + bytes data = 1; + string mime_type = 2; // "audio/wav", "audio/mp3", "audio/m4a" + optional int32 sample_rate = 3; + optional int32 channels = 4; +} + +// File-based transcription request +message 
TranscribeRequest { + AudioInput audio = 1; + optional TranscriptionConfig config = 2; +} + +// Transcription segment with timing +message TranscriptionSegment { + string text = 1; + float start_time = 2; + float end_time = 3; + float confidence = 4; +} + +// Transcription response +message TranscribeResponse { + string text = 1; + repeated TranscriptionSegment segments = 2; + string detected_language = 3; + float confidence = 4; +} + +// Streaming STT request chunk +message StreamingTranscribeRequest { + oneof request { + TranscriptionConfig config = 1; // Send first to configure + bytes audio_chunk = 2; // Subsequent audio chunks + } +} + +// Streaming STT response +message StreamingTranscribeResponse { + string partial_text = 1; + bool is_final = 2; + string final_text = 3; + repeated TranscriptionSegment segments = 4; +} + // Apple Intelligence Service service AppleIntelligenceService { // Single completion request @@ -61,4 +168,12 @@ service AppleIntelligenceService { // Health check rpc Health(HealthRequest) returns (HealthResponse); + + // Text-to-Speech + rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse); + rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse); + + // Speech-to-Text + rpc Transcribe(TranscribeRequest) returns (TranscribeResponse); + rpc StreamTranscribe(stream StreamingTranscribeRequest) returns (stream StreamingTranscribeResponse); } diff --git a/Sources/AppleIntelligenceApp/ViewModels/ChatViewModel.swift b/Sources/AppleIntelligenceApp/ViewModels/ChatViewModel.swift index cf14370..1ed74b4 100644 --- a/Sources/AppleIntelligenceApp/ViewModels/ChatViewModel.swift +++ b/Sources/AppleIntelligenceApp/ViewModels/ChatViewModel.swift @@ -1,5 +1,7 @@ import Foundation import AppKit +import AVFoundation +import Speech import UniformTypeIdentifiers import AppleIntelligenceCore @@ -14,9 +16,32 @@ final class ChatViewModel { // Image attachment state var pendingImages: [ImageAttachment] = [] + // Voice input/output state + var isRecording: Bool = false + var isSpeaking: Bool = false + var speakingMessageId: UUID? + var recordingLevel: Float = 0 + private var service: AppleIntelligenceService? + private var ttsService: TextToSpeechService? + private var sttService: SpeechToTextService? private var currentTask: Task? + // Audio recording - multi-language support + private var audioEngine: AVAudioEngine? + private var speechRecognizers: [String: SFSpeechRecognizer] = [:] + private var activeRecognizer: SFSpeechRecognizer? + private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? + private var recognitionTask: SFSpeechRecognitionTask? + + // Supported speech recognition languages (Canadian English and French) + private static let supportedLocales = ["en-CA", "fr-CA"] + var detectedLanguage: String = "en-CA" + + // Audio playback - use direct speech synthesis for reliability + private var speechSynthesizer: AVSpeechSynthesizer? + private var speechDelegate: SpeechSynthesizerDelegate? 
+ // Maximum images per message private let maxImagesPerMessage = 5 @@ -28,6 +53,27 @@ final class ChatViewModel { func initialize() async { service = await AppleIntelligenceService() + ttsService = TextToSpeechService() + sttService = await SpeechToTextService() + + // Initialize speech recognizers for all supported locales + for localeId in Self.supportedLocales { + if let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeId)) { + speechRecognizers[localeId] = recognizer + } + } + + // Default to the system locale if supported (normalize "en_CA" -> "en-CA" so it matches recognizer keys), otherwise en-CA + let systemLocale = Locale.current.identifier.replacingOccurrences(of: "_", with: "-") + if speechRecognizers[systemLocale] != nil { + detectedLanguage = systemLocale + } else if systemLocale.starts(with: "fr") { + detectedLanguage = "fr-CA" + } else { + detectedLanguage = "en-CA" + } + activeRecognizer = speechRecognizers[detectedLanguage] + loadRecentImages() } @@ -217,4 +263,262 @@ final class ChatViewModel { messages.removeAll() errorMessage = nil } + + // MARK: - Voice Input (Speech-to-Text) + + func toggleRecording() { + if isRecording { + stopRecording() + } else { + startRecording() + } + } + + func startRecording() { + Task { + // Use nonisolated helper to avoid MainActor isolation inheritance in TCC callback + let status = await Self.requestSpeechAuthorization() + + guard status == .authorized else { + self.errorMessage = "Speech recognition not authorized" + return + } + self.beginRecording() + } + } + + /// Request speech recognition authorization without MainActor isolation. + /// This prevents Swift 6 strict concurrency from asserting MainActor in the TCC callback. + private nonisolated static func requestSpeechAuthorization() async -> SFSpeechRecognizerAuthorizationStatus { + await withCheckedContinuation { continuation in + SFSpeechRecognizer.requestAuthorization { status in + continuation.resume(returning: status) + } + } + } + + /// Creates audio tap handler in nonisolated context to avoid MainActor isolation inheritance. + /// Audio taps run on CoreAudio's RealtimeMessenger queue, not MainActor. + private nonisolated static func createAudioTapHandler( + request: SFSpeechAudioBufferRecognitionRequest, + levelUpdater: RecordingLevelUpdater + ) -> (AVAudioPCMBuffer, AVAudioTime) -> Void { + return { buffer, _ in + request.append(buffer) + + // Calculate audio level for visual feedback + guard let channelData = buffer.floatChannelData else { return } + let channelDataValue = channelData.pointee + let channelDataValueArray = stride(from: 0, to: Int(buffer.frameLength), by: buffer.stride).map { channelDataValue[$0] } + let rms = sqrt(channelDataValueArray.map { $0 * $0 }.reduce(0, +) / Float(buffer.frameLength)) + let avgPower = 20 * log10(rms) + let level = max(0, min(1, (avgPower + 50) / 50)) + + levelUpdater.updateLevel(level) + } + } + + private func beginRecording() { + // Try to find an available recognizer + let recognizer = activeRecognizer ?? 
speechRecognizers.values.first { $0.isAvailable } + guard let speechRecognizer = recognizer, speechRecognizer.isAvailable else { + errorMessage = "Speech recognition not available" + return + } + + // Stop any existing recording + if audioEngine != nil { + stopRecording() + } + + audioEngine = AVAudioEngine() + recognitionRequest = SFSpeechAudioBufferRecognitionRequest() + + guard let audioEngine = audioEngine, + let recognitionRequest = recognitionRequest else { + errorMessage = "Failed to initialize audio engine" + return + } + + recognitionRequest.shouldReportPartialResults = true + + // Enable automatic punctuation in transcriptions where available (macOS 14+) + if #available(macOS 14, *) { + recognitionRequest.addsPunctuation = true + } + + let inputNode = audioEngine.inputNode + let recordingFormat = inputNode.outputFormat(forBus: 0) + + // Use nonisolated static function to create audio tap handler + // This breaks MainActor isolation inheritance in the closure + let levelUpdater = RecordingLevelUpdater(viewModel: self) + let audioTapHandler = Self.createAudioTapHandler(request: recognitionRequest, levelUpdater: levelUpdater) + inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat, block: audioTapHandler) + + audioEngine.prepare() + + do { + try audioEngine.start() + isRecording = true + + // Use a sendable wrapper to hand recognition results back to the MainActor + let resultHandler = RecognitionResultHandler(viewModel: self) + + recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in + resultHandler.handleResult(result: result, error: error) + } + } catch { + errorMessage = "Failed to start recording: \(error.localizedDescription)" + cleanupRecording() + } + } + + /// Switch to a different language for speech recognition + func switchLanguage(to localeId: String) { + guard let recognizer = speechRecognizers[localeId] else { return } + activeRecognizer = recognizer + detectedLanguage = localeId + } + + /// Get available languages for speech recognition + var availableLanguages: [(id: String, name: String)] { + speechRecognizers.keys.sorted().map { localeId in + let locale = Locale(identifier: localeId) + let name = locale.localizedString(forIdentifier: localeId) ?? 
localeId + return (id: localeId, name: name) + } + } + + func stopRecording() { + recognitionRequest?.endAudio() + cleanupRecording() + } + + fileprivate func cleanupRecording() { + audioEngine?.stop() + audioEngine?.inputNode.removeTap(onBus: 0) + audioEngine = nil + recognitionRequest = nil + recognitionTask?.cancel() + recognitionTask = nil + isRecording = false + recordingLevel = 0 + } + + // MARK: - Voice Output (Text-to-Speech) + + func speakMessage(_ message: ChatMessage) { + guard !message.content.isEmpty else { return } + + // If already speaking this message, stop + if isSpeaking && speakingMessageId == message.id { + stopSpeaking() + return + } + + // Stop any current speech + stopSpeaking() + + speakingMessageId = message.id + isSpeaking = true + + // Create utterance + let utterance = AVSpeechUtterance(string: message.content) + utterance.rate = AVSpeechUtteranceDefaultSpeechRate + utterance.pitchMultiplier = 1.0 + utterance.volume = 1.0 + + // Use voice matching current speech recognition language + if detectedLanguage == "fr-CA" { + utterance.voice = AVSpeechSynthesisVoice(language: "fr-CA") + } else { + utterance.voice = AVSpeechSynthesisVoice(language: "en-CA") + } + + // Create synthesizer and delegate + let synthesizer = AVSpeechSynthesizer() + speechDelegate = SpeechSynthesizerDelegate { [weak self] in + Task { @MainActor in + self?.isSpeaking = false + self?.speakingMessageId = nil + self?.speechDelegate = nil + self?.speechSynthesizer = nil + } + } + synthesizer.delegate = speechDelegate + speechSynthesizer = synthesizer + + // Speak directly + synthesizer.speak(utterance) + } + + func stopSpeaking() { + speechSynthesizer?.stopSpeaking(at: .immediate) + speechSynthesizer = nil + speechDelegate = nil + isSpeaking = false + speakingMessageId = nil + } +} + +// MARK: - Speech Synthesizer Delegate + +private final class SpeechSynthesizerDelegate: NSObject, AVSpeechSynthesizerDelegate, @unchecked Sendable { + let onFinish: () -> Void + + init(onFinish: @escaping () -> Void) { + self.onFinish = onFinish + } + + func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) { + onFinish() + } + + func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) { + onFinish() + } +} + +// MARK: - Sendable Wrappers for Audio Callbacks + +/// Wrapper to safely update recording level from audio callback thread +private final class RecordingLevelUpdater: @unchecked Sendable { + private weak var viewModel: ChatViewModel? + + init(viewModel: ChatViewModel) { + self.viewModel = viewModel + } + + func updateLevel(_ level: Float) { + Task { @MainActor [weak viewModel] in + viewModel?.recordingLevel = level + } + } +} + +/// Wrapper to safely handle recognition results from Speech framework callback +private final class RecognitionResultHandler: @unchecked Sendable { + private weak var viewModel: ChatViewModel? + + init(viewModel: ChatViewModel) { + self.viewModel = viewModel + } + + func handleResult(result: SFSpeechRecognitionResult?, error: Error?) { + // Extract data before crossing actor boundary (SFSpeechRecognitionResult is not Sendable) + let transcription = result?.bestTranscription.formattedString + let isFinal = result?.isFinal ?? 
false + let hasError = error != nil + + Task { @MainActor [weak viewModel] in + if let transcription = transcription { + viewModel?.inputText = transcription + } + + if hasError || isFinal { + viewModel?.cleanupRecording() + } + } + } } diff --git a/Sources/AppleIntelligenceApp/Views/ChatView.swift b/Sources/AppleIntelligenceApp/Views/ChatView.swift index c678407..495e9e8 100644 --- a/Sources/AppleIntelligenceApp/Views/ChatView.swift +++ b/Sources/AppleIntelligenceApp/Views/ChatView.swift @@ -23,7 +23,11 @@ struct ChatView: View { ScrollView { LazyVStack(spacing: 12) { ForEach(viewModel.messages) { message in - MessageBubble(message: message) + MessageBubble( + message: message, + isSpeaking: viewModel.speakingMessageId == message.id, + onSpeak: { viewModel.speakMessage(message) } + ) .id(message.id) } } @@ -286,6 +290,45 @@ struct ChatView: View { .buttonStyle(.plain) .help("Paste image from clipboard") + // Language toggle for speech recognition + Button { + // Toggle between en-CA and fr-CA + let newLang = viewModel.detectedLanguage == "en-CA" ? "fr-CA" : "en-CA" + viewModel.switchLanguage(to: newLang) + } label: { + Text(viewModel.detectedLanguage == "fr-CA" ? "FR" : "EN") + .font(.caption.bold()) + .foregroundStyle(.secondary) + .frame(width: 24, height: 24) + .background( + RoundedRectangle(cornerRadius: 4) + .fill(Color.secondary.opacity(0.1)) + ) + } + .buttonStyle(.plain) + .help("Speech language: \(viewModel.detectedLanguage) (click to toggle)") + + // Microphone button for voice input + Button { + viewModel.toggleRecording() + } label: { + ZStack { + if viewModel.isRecording { + // Recording indicator with level + Circle() + .fill(Color.red.opacity(0.3)) + .frame(width: 28 + CGFloat(viewModel.recordingLevel) * 10, + height: 28 + CGFloat(viewModel.recordingLevel) * 10) + .animation(.easeInOut(duration: 0.1), value: viewModel.recordingLevel) + } + Image(systemName: viewModel.isRecording ? "mic.fill" : "mic") + .font(.title3) + .foregroundStyle(viewModel.isRecording ? .red : .secondary) + } + } + .buttonStyle(.plain) + .help(viewModel.isRecording ? "Stop recording" : "Voice input") + TextField("Message...", text: $viewModel.inputText, axis: .vertical) .textFieldStyle(.plain) .lineLimit(1...5) @@ -386,6 +429,8 @@ struct RecentImageThumbnail: View { struct MessageBubble: View { let message: ChatMessage + var isSpeaking: Bool = false + var onSpeak: (() -> Void)? = nil @State private var showCopied = false var body: some View { @@ -419,10 +464,23 @@ struct MessageBubble: View { } } - // Copy button for assistant messages + // Action buttons for assistant messages if message.role == .assistant && !message.content.isEmpty && !message.isStreaming { - HStack { - Spacer() + HStack(spacing: 12) { + // Speaker button for TTS + Button { + onSpeak?() + } label: { + HStack(spacing: 4) { + Image(systemName: isSpeaking ? "stop.fill" : "speaker.wave.2") + Text(isSpeaking ? "Stop" : "Speak") + } + .font(.caption) + .foregroundStyle(isSpeaking ? 
.red : .secondary) + } + .buttonStyle(.plain) + + // Copy button Button { NSPasteboard.general.clearContents() NSPasteboard.general.setString(message.content, forType: .string) @@ -439,6 +497,8 @@ struct MessageBubble: View { .foregroundStyle(.secondary) } .buttonStyle(.plain) + + Spacer() } .padding(.top, 2) } diff --git a/Sources/AppleIntelligenceCore/Generated/apple_intelligence.grpc.swift b/Sources/AppleIntelligenceCore/Generated/apple_intelligence.grpc.swift index b3a4668..5cebb4e 100644 --- a/Sources/AppleIntelligenceCore/Generated/apple_intelligence.grpc.swift +++ b/Sources/AppleIntelligenceCore/Generated/apple_intelligence.grpc.swift @@ -56,11 +56,63 @@ public enum Appleintelligence_AppleIntelligenceService: Sendable { method: "Health" ) } + /// Namespace for "TextToSpeech" metadata. + public enum TextToSpeech: Sendable { + /// Request type for "TextToSpeech". + public typealias Input = Appleintelligence_TextToSpeechRequest + /// Response type for "TextToSpeech". + public typealias Output = Appleintelligence_TextToSpeechResponse + /// Descriptor for "TextToSpeech". + public static let descriptor = GRPCCore.MethodDescriptor( + service: GRPCCore.ServiceDescriptor(fullyQualifiedService: "appleintelligence.AppleIntelligenceService"), + method: "TextToSpeech" + ) + } + /// Namespace for "ListVoices" metadata. + public enum ListVoices: Sendable { + /// Request type for "ListVoices". + public typealias Input = Appleintelligence_ListVoicesRequest + /// Response type for "ListVoices". + public typealias Output = Appleintelligence_ListVoicesResponse + /// Descriptor for "ListVoices". + public static let descriptor = GRPCCore.MethodDescriptor( + service: GRPCCore.ServiceDescriptor(fullyQualifiedService: "appleintelligence.AppleIntelligenceService"), + method: "ListVoices" + ) + } + /// Namespace for "Transcribe" metadata. + public enum Transcribe: Sendable { + /// Request type for "Transcribe". + public typealias Input = Appleintelligence_TranscribeRequest + /// Response type for "Transcribe". + public typealias Output = Appleintelligence_TranscribeResponse + /// Descriptor for "Transcribe". + public static let descriptor = GRPCCore.MethodDescriptor( + service: GRPCCore.ServiceDescriptor(fullyQualifiedService: "appleintelligence.AppleIntelligenceService"), + method: "Transcribe" + ) + } + /// Namespace for "StreamTranscribe" metadata. + public enum StreamTranscribe: Sendable { + /// Request type for "StreamTranscribe". + public typealias Input = Appleintelligence_StreamingTranscribeRequest + /// Response type for "StreamTranscribe". + public typealias Output = Appleintelligence_StreamingTranscribeResponse + /// Descriptor for "StreamTranscribe". + public static let descriptor = GRPCCore.MethodDescriptor( + service: GRPCCore.ServiceDescriptor(fullyQualifiedService: "appleintelligence.AppleIntelligenceService"), + method: "StreamTranscribe" + ) + } /// Descriptors for all methods in the "appleintelligence.AppleIntelligenceService" service. public static let descriptors: [GRPCCore.MethodDescriptor] = [ Complete.descriptor, StreamComplete.descriptor, - Health.descriptor + Health.descriptor, + TextToSpeech.descriptor, + ListVoices.descriptor, + Transcribe.descriptor, + StreamTranscribe.descriptor ] } } @@ -143,6 +195,70 @@ extension Appleintelligence_AppleIntelligenceService { request: GRPCCore.StreamingServerRequest, context: GRPCCore.ServerContext ) async throws -> GRPCCore.StreamingServerResponse + + /// Handle the "TextToSpeech" method. 
+ /// + /// > Source IDL Documentation: + /// > + /// > Text-to-Speech + /// + /// - Parameters: + /// - request: A streaming request of `Appleintelligence_TextToSpeechRequest` messages. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A streaming response of `Appleintelligence_TextToSpeechResponse` messages. + func textToSpeech( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse + + /// Handle the "ListVoices" method. + /// + /// - Parameters: + /// - request: A streaming request of `Appleintelligence_ListVoicesRequest` messages. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A streaming response of `Appleintelligence_ListVoicesResponse` messages. + func listVoices( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse + + /// Handle the "Transcribe" method. + /// + /// > Source IDL Documentation: + /// > + /// > Speech-to-Text + /// + /// - Parameters: + /// - request: A streaming request of `Appleintelligence_TranscribeRequest` messages. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A streaming response of `Appleintelligence_TranscribeResponse` messages. + func transcribe( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse + + /// Handle the "StreamTranscribe" method. + /// + /// - Parameters: + /// - request: A streaming request of `Appleintelligence_StreamingTranscribeRequest` messages. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A streaming response of `Appleintelligence_StreamingTranscribeResponse` messages. + func streamTranscribe( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse } /// Service protocol for the "appleintelligence.AppleIntelligenceService" service. @@ -210,6 +326,70 @@ extension Appleintelligence_AppleIntelligenceService { request: GRPCCore.ServerRequest, context: GRPCCore.ServerContext ) async throws -> GRPCCore.ServerResponse + + /// Handle the "TextToSpeech" method. + /// + /// > Source IDL Documentation: + /// > + /// > Text-to-Speech + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TextToSpeechRequest` message. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. 
All other errors are converted + /// to an internal error. + /// - Returns: A response containing a single `Appleintelligence_TextToSpeechResponse` message. + func textToSpeech( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse + + /// Handle the "ListVoices" method. + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_ListVoicesRequest` message. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A response containing a single `Appleintelligence_ListVoicesResponse` message. + func listVoices( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse + + /// Handle the "Transcribe" method. + /// + /// > Source IDL Documentation: + /// > + /// > Speech-to-Text + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TranscribeRequest` message. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A response containing a single `Appleintelligence_TranscribeResponse` message. + func transcribe( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse + + /// Handle the "StreamTranscribe" method. + /// + /// - Parameters: + /// - request: A streaming request of `Appleintelligence_StreamingTranscribeRequest` messages. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A streaming response of `Appleintelligence_StreamingTranscribeResponse` messages. + func streamTranscribe( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse } /// Simple service protocol for the "appleintelligence.AppleIntelligenceService" service. @@ -276,6 +456,71 @@ extension Appleintelligence_AppleIntelligenceService { request: Appleintelligence_HealthRequest, context: GRPCCore.ServerContext ) async throws -> Appleintelligence_HealthResponse + + /// Handle the "TextToSpeech" method. + /// + /// > Source IDL Documentation: + /// > + /// > Text-to-Speech + /// + /// - Parameters: + /// - request: A `Appleintelligence_TextToSpeechRequest` message. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A `Appleintelligence_TextToSpeechResponse` to respond with. + func textToSpeech( + request: Appleintelligence_TextToSpeechRequest, + context: GRPCCore.ServerContext + ) async throws -> Appleintelligence_TextToSpeechResponse + + /// Handle the "ListVoices" method. + /// + /// - Parameters: + /// - request: A `Appleintelligence_ListVoicesRequest` message. 
+ /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A `Appleintelligence_ListVoicesResponse` to respond with. + func listVoices( + request: Appleintelligence_ListVoicesRequest, + context: GRPCCore.ServerContext + ) async throws -> Appleintelligence_ListVoicesResponse + + /// Handle the "Transcribe" method. + /// + /// > Source IDL Documentation: + /// > + /// > Speech-to-Text + /// + /// - Parameters: + /// - request: A `Appleintelligence_TranscribeRequest` message. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + /// - Returns: A `Appleintelligence_TranscribeResponse` to respond with. + func transcribe( + request: Appleintelligence_TranscribeRequest, + context: GRPCCore.ServerContext + ) async throws -> Appleintelligence_TranscribeResponse + + /// Handle the "StreamTranscribe" method. + /// + /// - Parameters: + /// - request: A stream of `Appleintelligence_StreamingTranscribeRequest` messages. + /// - response: A response stream of `Appleintelligence_StreamingTranscribeResponse` messages. + /// - context: Context providing information about the RPC. + /// - Throws: Any error which occurred during the processing of the request. Thrown errors + /// of type `RPCError` are mapped to appropriate statuses. All other errors are converted + /// to an internal error. + func streamTranscribe( + request: GRPCCore.RPCAsyncSequence, + response: GRPCCore.RPCWriter, + context: GRPCCore.ServerContext + ) async throws } } @@ -316,6 +561,50 @@ extension Appleintelligence_AppleIntelligenceService.StreamingServiceProtocol { ) } ) + router.registerHandler( + forMethod: Appleintelligence_AppleIntelligenceService.Method.TextToSpeech.descriptor, + deserializer: GRPCProtobuf.ProtobufDeserializer(), + serializer: GRPCProtobuf.ProtobufSerializer(), + handler: { request, context in + try await self.textToSpeech( + request: request, + context: context + ) + } + ) + router.registerHandler( + forMethod: Appleintelligence_AppleIntelligenceService.Method.ListVoices.descriptor, + deserializer: GRPCProtobuf.ProtobufDeserializer(), + serializer: GRPCProtobuf.ProtobufSerializer(), + handler: { request, context in + try await self.listVoices( + request: request, + context: context + ) + } + ) + router.registerHandler( + forMethod: Appleintelligence_AppleIntelligenceService.Method.Transcribe.descriptor, + deserializer: GRPCProtobuf.ProtobufDeserializer(), + serializer: GRPCProtobuf.ProtobufSerializer(), + handler: { request, context in + try await self.transcribe( + request: request, + context: context + ) + } + ) + router.registerHandler( + forMethod: Appleintelligence_AppleIntelligenceService.Method.StreamTranscribe.descriptor, + deserializer: GRPCProtobuf.ProtobufDeserializer(), + serializer: GRPCProtobuf.ProtobufSerializer(), + handler: { request, context in + try await self.streamTranscribe( + request: request, + context: context + ) + } + ) } } @@ -354,6 +643,39 @@ extension Appleintelligence_AppleIntelligenceService.ServiceProtocol { ) return GRPCCore.StreamingServerResponse(single: response) } + + public func textToSpeech( + request: 
GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse { + let response = try await self.textToSpeech( + request: GRPCCore.ServerRequest(stream: request), + context: context + ) + return GRPCCore.StreamingServerResponse(single: response) + } + + public func listVoices( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse { + let response = try await self.listVoices( + request: GRPCCore.ServerRequest(stream: request), + context: context + ) + return GRPCCore.StreamingServerResponse(single: response) + } + + public func transcribe( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse { + let response = try await self.transcribe( + request: GRPCCore.ServerRequest(stream: request), + context: context + ) + return GRPCCore.StreamingServerResponse(single: response) + } } // Default implementation of methods from 'ServiceProtocol'. @@ -401,6 +723,62 @@ extension Appleintelligence_AppleIntelligenceService.SimpleServiceProtocol { metadata: [:] ) } + + public func textToSpeech( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse { + return GRPCCore.ServerResponse( + message: try await self.textToSpeech( + request: request.message, + context: context + ), + metadata: [:] + ) + } + + public func listVoices( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse { + return GRPCCore.ServerResponse( + message: try await self.listVoices( + request: request.message, + context: context + ), + metadata: [:] + ) + } + + public func transcribe( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse { + return GRPCCore.ServerResponse( + message: try await self.transcribe( + request: request.message, + context: context + ), + metadata: [:] + ) + } + + public func streamTranscribe( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse { + return GRPCCore.StreamingServerResponse( + metadata: [:], + producer: { writer in + try await self.streamTranscribe( + request: request.messages, + response: writer, + context: context + ) + return [:] + } + ) + } } // MARK: appleintelligence.AppleIntelligenceService (client) @@ -484,6 +862,90 @@ extension Appleintelligence_AppleIntelligenceService { options: GRPCCore.CallOptions, onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result ) async throws -> Result where Result: Sendable + + /// Call the "TextToSpeech" method. + /// + /// > Source IDL Documentation: + /// > + /// > Text-to-Speech + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TextToSpeechRequest` message. + /// - serializer: A serializer for `Appleintelligence_TextToSpeechRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_TextToSpeechResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. 
+ func textToSpeech( + request: GRPCCore.ClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result + ) async throws -> Result where Result: Sendable + + /// Call the "ListVoices" method. + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_ListVoicesRequest` message. + /// - serializer: A serializer for `Appleintelligence_ListVoicesRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_ListVoicesResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + func listVoices( + request: GRPCCore.ClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result + ) async throws -> Result where Result: Sendable + + /// Call the "Transcribe" method. + /// + /// > Source IDL Documentation: + /// > + /// > Speech-to-Text + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TranscribeRequest` message. + /// - serializer: A serializer for `Appleintelligence_TranscribeRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_TranscribeResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + func transcribe( + request: GRPCCore.ClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result + ) async throws -> Result where Result: Sendable + + /// Call the "StreamTranscribe" method. + /// + /// - Parameters: + /// - request: A streaming request producing `Appleintelligence_StreamingTranscribeRequest` messages. + /// - serializer: A serializer for `Appleintelligence_StreamingTranscribeRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_StreamingTranscribeResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + func streamTranscribe( + request: GRPCCore.StreamingClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions, + onResponse handleResponse: @Sendable @escaping (GRPCCore.StreamingClientResponse) async throws -> Result + ) async throws -> Result where Result: Sendable } /// Generated client for the "appleintelligence.AppleIntelligenceService" service. 
@@ -605,6 +1067,132 @@ extension Appleintelligence_AppleIntelligenceService { onResponse: handleResponse ) } + + /// Call the "TextToSpeech" method. + /// + /// > Source IDL Documentation: + /// > + /// > Text-to-Speech + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TextToSpeechRequest` message. + /// - serializer: A serializer for `Appleintelligence_TextToSpeechRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_TextToSpeechResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func textToSpeech( + request: GRPCCore.ClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + try await self.client.unary( + request: request, + descriptor: Appleintelligence_AppleIntelligenceService.Method.TextToSpeech.descriptor, + serializer: serializer, + deserializer: deserializer, + options: options, + onResponse: handleResponse + ) + } + + /// Call the "ListVoices" method. + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_ListVoicesRequest` message. + /// - serializer: A serializer for `Appleintelligence_ListVoicesRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_ListVoicesResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func listVoices( + request: GRPCCore.ClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + try await self.client.unary( + request: request, + descriptor: Appleintelligence_AppleIntelligenceService.Method.ListVoices.descriptor, + serializer: serializer, + deserializer: deserializer, + options: options, + onResponse: handleResponse + ) + } + + /// Call the "Transcribe" method. + /// + /// > Source IDL Documentation: + /// > + /// > Speech-to-Text + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TranscribeRequest` message. + /// - serializer: A serializer for `Appleintelligence_TranscribeRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_TranscribeResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. 
+ public func transcribe( + request: GRPCCore.ClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + try await self.client.unary( + request: request, + descriptor: Appleintelligence_AppleIntelligenceService.Method.Transcribe.descriptor, + serializer: serializer, + deserializer: deserializer, + options: options, + onResponse: handleResponse + ) + } + + /// Call the "StreamTranscribe" method. + /// + /// - Parameters: + /// - request: A streaming request producing `Appleintelligence_StreamingTranscribeRequest` messages. + /// - serializer: A serializer for `Appleintelligence_StreamingTranscribeRequest` messages. + /// - deserializer: A deserializer for `Appleintelligence_StreamingTranscribeResponse` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func streamTranscribe( + request: GRPCCore.StreamingClientRequest, + serializer: some GRPCCore.MessageSerializer, + deserializer: some GRPCCore.MessageDeserializer, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.StreamingClientResponse) async throws -> Result + ) async throws -> Result where Result: Sendable { + try await self.client.bidirectionalStreaming( + request: request, + descriptor: Appleintelligence_AppleIntelligenceService.Method.StreamTranscribe.descriptor, + serializer: serializer, + deserializer: deserializer, + options: options, + onResponse: handleResponse + ) + } } } @@ -695,6 +1283,112 @@ extension Appleintelligence_AppleIntelligenceService.ClientProtocol { onResponse: handleResponse ) } + + /// Call the "TextToSpeech" method. + /// + /// > Source IDL Documentation: + /// > + /// > Text-to-Speech + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TextToSpeechRequest` message. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func textToSpeech( + request: GRPCCore.ClientRequest, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + try await self.textToSpeech( + request: request, + serializer: GRPCProtobuf.ProtobufSerializer(), + deserializer: GRPCProtobuf.ProtobufDeserializer(), + options: options, + onResponse: handleResponse + ) + } + + /// Call the "ListVoices" method. + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_ListVoicesRequest` message. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. 
+ /// - Returns: The result of `handleResponse`. + public func listVoices( + request: GRPCCore.ClientRequest, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + try await self.listVoices( + request: request, + serializer: GRPCProtobuf.ProtobufSerializer(), + deserializer: GRPCProtobuf.ProtobufDeserializer(), + options: options, + onResponse: handleResponse + ) + } + + /// Call the "Transcribe" method. + /// + /// > Source IDL Documentation: + /// > + /// > Speech-to-Text + /// + /// - Parameters: + /// - request: A request containing a single `Appleintelligence_TranscribeRequest` message. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func transcribe( + request: GRPCCore.ClientRequest, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + try await self.transcribe( + request: request, + serializer: GRPCProtobuf.ProtobufSerializer(), + deserializer: GRPCProtobuf.ProtobufDeserializer(), + options: options, + onResponse: handleResponse + ) + } + + /// Call the "StreamTranscribe" method. + /// + /// - Parameters: + /// - request: A streaming request producing `Appleintelligence_StreamingTranscribeRequest` messages. + /// - options: Options to apply to this RPC. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func streamTranscribe( + request: GRPCCore.StreamingClientRequest, + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.StreamingClientResponse) async throws -> Result + ) async throws -> Result where Result: Sendable { + try await self.streamTranscribe( + request: request, + serializer: GRPCProtobuf.ProtobufSerializer(), + deserializer: GRPCProtobuf.ProtobufDeserializer(), + options: options, + onResponse: handleResponse + ) + } } // Helpers providing sugared APIs for 'ClientProtocol' methods. @@ -796,4 +1490,127 @@ extension Appleintelligence_AppleIntelligenceService.ClientProtocol { onResponse: handleResponse ) } + + /// Call the "TextToSpeech" method. + /// + /// > Source IDL Documentation: + /// > + /// > Text-to-Speech + /// + /// - Parameters: + /// - message: request message to send. + /// - metadata: Additional metadata to send, defaults to empty. + /// - options: Options to apply to this RPC, defaults to `.defaults`. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. 
+ public func textToSpeech( + _ message: Appleintelligence_TextToSpeechRequest, + metadata: GRPCCore.Metadata = [:], + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + let request = GRPCCore.ClientRequest( + message: message, + metadata: metadata + ) + return try await self.textToSpeech( + request: request, + options: options, + onResponse: handleResponse + ) + } + + /// Call the "ListVoices" method. + /// + /// - Parameters: + /// - message: request message to send. + /// - metadata: Additional metadata to send, defaults to empty. + /// - options: Options to apply to this RPC, defaults to `.defaults`. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func listVoices( + _ message: Appleintelligence_ListVoicesRequest, + metadata: GRPCCore.Metadata = [:], + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + let request = GRPCCore.ClientRequest( + message: message, + metadata: metadata + ) + return try await self.listVoices( + request: request, + options: options, + onResponse: handleResponse + ) + } + + /// Call the "Transcribe" method. + /// + /// > Source IDL Documentation: + /// > + /// > Speech-to-Text + /// + /// - Parameters: + /// - message: request message to send. + /// - metadata: Additional metadata to send, defaults to empty. + /// - options: Options to apply to this RPC, defaults to `.defaults`. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. + public func transcribe( + _ message: Appleintelligence_TranscribeRequest, + metadata: GRPCCore.Metadata = [:], + options: GRPCCore.CallOptions = .defaults, + onResponse handleResponse: @Sendable @escaping (GRPCCore.ClientResponse) async throws -> Result = { response in + try response.message + } + ) async throws -> Result where Result: Sendable { + let request = GRPCCore.ClientRequest( + message: message, + metadata: metadata + ) + return try await self.transcribe( + request: request, + options: options, + onResponse: handleResponse + ) + } + + /// Call the "StreamTranscribe" method. + /// + /// - Parameters: + /// - metadata: Additional metadata to send, defaults to empty. + /// - options: Options to apply to this RPC, defaults to `.defaults`. + /// - producer: A closure producing request messages to send to the server. The request + /// stream is closed when the closure returns. + /// - handleResponse: A closure which handles the response, the result of which is + /// returned to the caller. Returning from the closure will cancel the RPC if it + /// hasn't already finished. + /// - Returns: The result of `handleResponse`. 
+ public func streamTranscribe( + metadata: GRPCCore.Metadata = [:], + options: GRPCCore.CallOptions = .defaults, + requestProducer producer: @Sendable @escaping (GRPCCore.RPCWriter) async throws -> Void, + onResponse handleResponse: @Sendable @escaping (GRPCCore.StreamingClientResponse) async throws -> Result + ) async throws -> Result where Result: Sendable { + let request = GRPCCore.StreamingClientRequest( + metadata: metadata, + producer: producer + ) + return try await self.streamTranscribe( + request: request, + options: options, + onResponse: handleResponse + ) + } } \ No newline at end of file diff --git a/Sources/AppleIntelligenceCore/Generated/apple_intelligence.pb.swift b/Sources/AppleIntelligenceCore/Generated/apple_intelligence.pb.swift index f3d512a..7129892 100644 --- a/Sources/AppleIntelligenceCore/Generated/apple_intelligence.pb.swift +++ b/Sources/AppleIntelligenceCore/Generated/apple_intelligence.pb.swift @@ -21,6 +21,45 @@ fileprivate struct _GeneratedWithProtocGenSwiftVersion: SwiftProtobuf.ProtobufAP typealias Version = _2 } +/// Audio format enumeration +public enum Appleintelligence_AudioFormat: SwiftProtobuf.Enum, Swift.CaseIterable { + public typealias RawValue = Int + case unspecified // = 0 + case wav // = 1 + case mp3 // = 2 + case UNRECOGNIZED(Int) + + public init() { + self = .unspecified + } + + public init?(rawValue: Int) { + switch rawValue { + case 0: self = .unspecified + case 1: self = .wav + case 2: self = .mp3 + default: self = .UNRECOGNIZED(rawValue) + } + } + + public var rawValue: Int { + switch self { + case .unspecified: return 0 + case .wav: return 1 + case .mp3: return 2 + case .UNRECOGNIZED(let i): return i + } + } + + // The compiler won't synthesize support with the UNRECOGNIZED case. + public static let allCases: [Appleintelligence_AudioFormat] = [ + .unspecified, + .wav, + .mp3, + ] + +} + /// Image data for vision requests public struct Appleintelligence_ImageData: Sendable { // SwiftProtobuf.Message conformance is added in an extension below. See the @@ -159,10 +198,373 @@ public struct Appleintelligence_HealthResponse: Sendable { public init() {} } +/// Voice configuration for TTS +public struct Appleintelligence_VoiceConfig: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var voiceIdentifier: String = String() + + /// 0.0-1.0, default 0.5 + public var speakingRate: Float { + get {return _speakingRate ?? 0} + set {_speakingRate = newValue} + } + /// Returns true if `speakingRate` has been explicitly set. + public var hasSpeakingRate: Bool {return self._speakingRate != nil} + /// Clears the value of `speakingRate`. Subsequent reads from it will return its default value. + public mutating func clearSpeakingRate() {self._speakingRate = nil} + + /// 0.5-2.0, default 1.0 + public var pitchMultiplier: Float { + get {return _pitchMultiplier ?? 0} + set {_pitchMultiplier = newValue} + } + /// Returns true if `pitchMultiplier` has been explicitly set. + public var hasPitchMultiplier: Bool {return self._pitchMultiplier != nil} + /// Clears the value of `pitchMultiplier`. Subsequent reads from it will return its default value. + public mutating func clearPitchMultiplier() {self._pitchMultiplier = nil} + + /// 0.0-1.0, default 1.0 + public var volume: Float { + get {return _volume ?? 0} + set {_volume = newValue} + } + /// Returns true if `volume` has been explicitly set. 
+ public var hasVolume: Bool {return self._volume != nil} + /// Clears the value of `volume`. Subsequent reads from it will return its default value. + public mutating func clearVolume() {self._volume = nil} + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} + + fileprivate var _speakingRate: Float? = nil + fileprivate var _pitchMultiplier: Float? = nil + fileprivate var _volume: Float? = nil +} + +/// TTS Request +public struct Appleintelligence_TextToSpeechRequest: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var text: String = String() + + public var outputFormat: Appleintelligence_AudioFormat = .unspecified + + public var voiceConfig: Appleintelligence_VoiceConfig { + get {return _voiceConfig ?? Appleintelligence_VoiceConfig()} + set {_voiceConfig = newValue} + } + /// Returns true if `voiceConfig` has been explicitly set. + public var hasVoiceConfig: Bool {return self._voiceConfig != nil} + /// Clears the value of `voiceConfig`. Subsequent reads from it will return its default value. + public mutating func clearVoiceConfig() {self._voiceConfig = nil} + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} + + fileprivate var _voiceConfig: Appleintelligence_VoiceConfig? = nil +} + +/// TTS Response +public struct Appleintelligence_TextToSpeechResponse: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var audioData: Data = Data() + + public var format: Appleintelligence_AudioFormat = .unspecified + + public var sampleRate: Int32 = 0 + + public var channels: Int32 = 0 + + public var durationSeconds: Float = 0 + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} +} + +/// List available voices request +public struct Appleintelligence_ListVoicesRequest: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var languageCode: String { + get {return _languageCode ?? String()} + set {_languageCode = newValue} + } + /// Returns true if `languageCode` has been explicitly set. + public var hasLanguageCode: Bool {return self._languageCode != nil} + /// Clears the value of `languageCode`. Subsequent reads from it will return its default value. + public mutating func clearLanguageCode() {self._languageCode = nil} + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} + + fileprivate var _languageCode: String? = nil +} + +/// Voice information +public struct Appleintelligence_VoiceInfo: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. 
+ + public var identifier: String = String() + + public var name: String = String() + + public var language: String = String() + + public var isPremium: Bool = false + + public var gender: String = String() + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} +} + +/// List voices response +public struct Appleintelligence_ListVoicesResponse: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var voices: [Appleintelligence_VoiceInfo] = [] + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} +} + +/// STT Configuration +public struct Appleintelligence_TranscriptionConfig: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var languageCode: String { + get {return _languageCode ?? String()} + set {_languageCode = newValue} + } + /// Returns true if `languageCode` has been explicitly set. + public var hasLanguageCode: Bool {return self._languageCode != nil} + /// Clears the value of `languageCode`. Subsequent reads from it will return its default value. + public mutating func clearLanguageCode() {self._languageCode = nil} + + /// default true + public var enablePunctuation: Bool { + get {return _enablePunctuation ?? false} + set {_enablePunctuation = newValue} + } + /// Returns true if `enablePunctuation` has been explicitly set. + public var hasEnablePunctuation: Bool {return self._enablePunctuation != nil} + /// Clears the value of `enablePunctuation`. Subsequent reads from it will return its default value. + public mutating func clearEnablePunctuation() {self._enablePunctuation = nil} + + /// default false + public var enableTimestamps: Bool { + get {return _enableTimestamps ?? false} + set {_enableTimestamps = newValue} + } + /// Returns true if `enableTimestamps` has been explicitly set. + public var hasEnableTimestamps: Bool {return self._enableTimestamps != nil} + /// Clears the value of `enableTimestamps`. Subsequent reads from it will return its default value. + public mutating func clearEnableTimestamps() {self._enableTimestamps = nil} + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} + + fileprivate var _languageCode: String? = nil + fileprivate var _enablePunctuation: Bool? = nil + fileprivate var _enableTimestamps: Bool? = nil +} + +/// Audio data for STT +public struct Appleintelligence_AudioInput: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var data: Data = Data() + + /// "audio/wav", "audio/mp3", "audio/m4a" + public var mimeType: String = String() + + public var sampleRate: Int32 { + get {return _sampleRate ?? 0} + set {_sampleRate = newValue} + } + /// Returns true if `sampleRate` has been explicitly set. + public var hasSampleRate: Bool {return self._sampleRate != nil} + /// Clears the value of `sampleRate`. Subsequent reads from it will return its default value. + public mutating func clearSampleRate() {self._sampleRate = nil} + + public var channels: Int32 { + get {return _channels ?? 0} + set {_channels = newValue} + } + /// Returns true if `channels` has been explicitly set. 
+ public var hasChannels: Bool {return self._channels != nil} + /// Clears the value of `channels`. Subsequent reads from it will return its default value. + public mutating func clearChannels() {self._channels = nil} + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} + + fileprivate var _sampleRate: Int32? = nil + fileprivate var _channels: Int32? = nil +} + +/// File-based transcription request +public struct Appleintelligence_TranscribeRequest: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var audio: Appleintelligence_AudioInput { + get {return _audio ?? Appleintelligence_AudioInput()} + set {_audio = newValue} + } + /// Returns true if `audio` has been explicitly set. + public var hasAudio: Bool {return self._audio != nil} + /// Clears the value of `audio`. Subsequent reads from it will return its default value. + public mutating func clearAudio() {self._audio = nil} + + public var config: Appleintelligence_TranscriptionConfig { + get {return _config ?? Appleintelligence_TranscriptionConfig()} + set {_config = newValue} + } + /// Returns true if `config` has been explicitly set. + public var hasConfig: Bool {return self._config != nil} + /// Clears the value of `config`. Subsequent reads from it will return its default value. + public mutating func clearConfig() {self._config = nil} + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} + + fileprivate var _audio: Appleintelligence_AudioInput? = nil + fileprivate var _config: Appleintelligence_TranscriptionConfig? = nil +} + +/// Transcription segment with timing +public struct Appleintelligence_TranscriptionSegment: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var text: String = String() + + public var startTime: Float = 0 + + public var endTime: Float = 0 + + public var confidence: Float = 0 + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} +} + +/// Transcription response +public struct Appleintelligence_TranscribeResponse: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var text: String = String() + + public var segments: [Appleintelligence_TranscriptionSegment] = [] + + public var detectedLanguage: String = String() + + public var confidence: Float = 0 + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} +} + +/// Streaming STT request chunk +public struct Appleintelligence_StreamingTranscribeRequest: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var request: Appleintelligence_StreamingTranscribeRequest.OneOf_Request? = nil + + /// Send first to configure + public var config: Appleintelligence_TranscriptionConfig { + get { + if case .config(let v)? 
= request {return v} + return Appleintelligence_TranscriptionConfig() + } + set {request = .config(newValue)} + } + + /// Subsequent audio chunks + public var audioChunk: Data { + get { + if case .audioChunk(let v)? = request {return v} + return Data() + } + set {request = .audioChunk(newValue)} + } + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public enum OneOf_Request: Equatable, Sendable { + /// Send first to configure + case config(Appleintelligence_TranscriptionConfig) + /// Subsequent audio chunks + case audioChunk(Data) + + } + + public init() {} +} + +/// Streaming STT response +public struct Appleintelligence_StreamingTranscribeResponse: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var partialText: String = String() + + public var isFinal: Bool = false + + public var finalText: String = String() + + public var segments: [Appleintelligence_TranscriptionSegment] = [] + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} +} + // MARK: - Code below here is support for the SwiftProtobuf runtime. fileprivate let _protobuf_package = "appleintelligence" +extension Appleintelligence_AudioFormat: SwiftProtobuf._ProtoNameProviding { + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{2}\0AUDIO_FORMAT_UNSPECIFIED\0\u{1}AUDIO_FORMAT_WAV\0\u{1}AUDIO_FORMAT_MP3\0") +} + extension Appleintelligence_ImageData: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { public static let protoMessageName: String = _protobuf_package + ".ImageData" public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}data\0\u{1}filename\0\u{3}mime_type\0") @@ -445,3 +847,589 @@ extension Appleintelligence_HealthResponse: SwiftProtobuf.Message, SwiftProtobuf return true } } + +extension Appleintelligence_VoiceConfig: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".VoiceConfig" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{3}voice_identifier\0\u{3}speaking_rate\0\u{3}pitch_multiplier\0\u{1}volume\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.voiceIdentifier) }() + case 2: try { try decoder.decodeSingularFloatField(value: &self._speakingRate) }() + case 3: try { try decoder.decodeSingularFloatField(value: &self._pitchMultiplier) }() + case 4: try { try decoder.decodeSingularFloatField(value: &self._volume) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + if !self.voiceIdentifier.isEmpty { + try visitor.visitSingularStringField(value: self.voiceIdentifier, fieldNumber: 1) + } + try { if let v = self._speakingRate { + try visitor.visitSingularFloatField(value: v, fieldNumber: 2) + } }() + try { if let v = self._pitchMultiplier { + try visitor.visitSingularFloatField(value: v, fieldNumber: 3) + } }() + try { if let v = self._volume { + try visitor.visitSingularFloatField(value: v, fieldNumber: 4) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_VoiceConfig, rhs: Appleintelligence_VoiceConfig) -> Bool { + if lhs.voiceIdentifier != rhs.voiceIdentifier {return false} + if lhs._speakingRate != rhs._speakingRate {return false} + if lhs._pitchMultiplier != rhs._pitchMultiplier {return false} + if lhs._volume != rhs._volume {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_TextToSpeechRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".TextToSpeechRequest" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}text\0\u{3}output_format\0\u{3}voice_config\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.text) }() + case 2: try { try decoder.decodeSingularEnumField(value: &self.outputFormat) }() + case 3: try { try decoder.decodeSingularMessageField(value: &self._voiceConfig) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + if !self.text.isEmpty { + try visitor.visitSingularStringField(value: self.text, fieldNumber: 1) + } + if self.outputFormat != .unspecified { + try visitor.visitSingularEnumField(value: self.outputFormat, fieldNumber: 2) + } + try { if let v = self._voiceConfig { + try visitor.visitSingularMessageField(value: v, fieldNumber: 3) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_TextToSpeechRequest, rhs: Appleintelligence_TextToSpeechRequest) -> Bool { + if lhs.text != rhs.text {return false} + if lhs.outputFormat != rhs.outputFormat {return false} + if lhs._voiceConfig != rhs._voiceConfig {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_TextToSpeechResponse: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".TextToSpeechResponse" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{3}audio_data\0\u{1}format\0\u{3}sample_rate\0\u{1}channels\0\u{3}duration_seconds\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.audioData) }() + case 2: try { try decoder.decodeSingularEnumField(value: &self.format) }() + case 3: try { try decoder.decodeSingularInt32Field(value: &self.sampleRate) }() + case 4: try { try decoder.decodeSingularInt32Field(value: &self.channels) }() + case 5: try { try decoder.decodeSingularFloatField(value: &self.durationSeconds) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + if !self.audioData.isEmpty { + try visitor.visitSingularBytesField(value: self.audioData, fieldNumber: 1) + } + if self.format != .unspecified { + try visitor.visitSingularEnumField(value: self.format, fieldNumber: 2) + } + if self.sampleRate != 0 { + try visitor.visitSingularInt32Field(value: self.sampleRate, fieldNumber: 3) + } + if self.channels != 0 { + try visitor.visitSingularInt32Field(value: self.channels, fieldNumber: 4) + } + if self.durationSeconds.bitPattern != 0 { + try visitor.visitSingularFloatField(value: self.durationSeconds, fieldNumber: 5) + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_TextToSpeechResponse, rhs: Appleintelligence_TextToSpeechResponse) -> Bool { + if lhs.audioData != rhs.audioData {return false} + if lhs.format != rhs.format {return false} + if lhs.sampleRate != rhs.sampleRate {return false} + if lhs.channels != rhs.channels {return false} + if lhs.durationSeconds != rhs.durationSeconds {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_ListVoicesRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".ListVoicesRequest" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{3}language_code\0") 
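+
+  // Note on presence: because `language_code` is a proto3 `optional` field, the
+  // struct keeps it in a nullable `_languageCode` backing property so explicit
+  // presence is preserved; `hasLanguageCode`/`clearLanguageCode` expose that bit,
+  // and `traverse(visitor:)` below only emits the field when the value is non-nil.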
+ + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self._languageCode) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._languageCode { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_ListVoicesRequest, rhs: Appleintelligence_ListVoicesRequest) -> Bool { + if lhs._languageCode != rhs._languageCode {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_VoiceInfo: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".VoiceInfo" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}identifier\0\u{1}name\0\u{1}language\0\u{3}is_premium\0\u{1}gender\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.identifier) }() + case 2: try { try decoder.decodeSingularStringField(value: &self.name) }() + case 3: try { try decoder.decodeSingularStringField(value: &self.language) }() + case 4: try { try decoder.decodeSingularBoolField(value: &self.isPremium) }() + case 5: try { try decoder.decodeSingularStringField(value: &self.gender) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + if !self.identifier.isEmpty { + try visitor.visitSingularStringField(value: self.identifier, fieldNumber: 1) + } + if !self.name.isEmpty { + try visitor.visitSingularStringField(value: self.name, fieldNumber: 2) + } + if !self.language.isEmpty { + try visitor.visitSingularStringField(value: self.language, fieldNumber: 3) + } + if self.isPremium != false { + try visitor.visitSingularBoolField(value: self.isPremium, fieldNumber: 4) + } + if !self.gender.isEmpty { + try visitor.visitSingularStringField(value: self.gender, fieldNumber: 5) + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_VoiceInfo, rhs: Appleintelligence_VoiceInfo) -> Bool { + if lhs.identifier != rhs.identifier {return false} + if lhs.name != rhs.name {return false} + if lhs.language != rhs.language {return false} + if lhs.isPremium != rhs.isPremium {return false} + if lhs.gender != rhs.gender {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_ListVoicesResponse: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".ListVoicesResponse" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}voices\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeRepeatedMessageField(value: &self.voices) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + if !self.voices.isEmpty { + try visitor.visitRepeatedMessageField(value: self.voices, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_ListVoicesResponse, rhs: Appleintelligence_ListVoicesResponse) -> Bool { + if lhs.voices != rhs.voices {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_TranscriptionConfig: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".TranscriptionConfig" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{3}language_code\0\u{3}enable_punctuation\0\u{3}enable_timestamps\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self._languageCode) }() + case 2: try { try decoder.decodeSingularBoolField(value: &self._enablePunctuation) }() + case 3: try { try decoder.decodeSingularBoolField(value: &self._enableTimestamps) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._languageCode { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } }() + try { if let v = self._enablePunctuation { + try visitor.visitSingularBoolField(value: v, fieldNumber: 2) + } }() + try { if let v = self._enableTimestamps { + try visitor.visitSingularBoolField(value: v, fieldNumber: 3) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_TranscriptionConfig, rhs: Appleintelligence_TranscriptionConfig) -> Bool { + if lhs._languageCode != rhs._languageCode {return false} + if lhs._enablePunctuation != rhs._enablePunctuation {return false} + if lhs._enableTimestamps != rhs._enableTimestamps {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_AudioInput: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".AudioInput" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}data\0\u{3}mime_type\0\u{3}sample_rate\0\u{1}channels\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.data) }() + case 2: try { try decoder.decodeSingularStringField(value: &self.mimeType) }() + case 3: try { try decoder.decodeSingularInt32Field(value: &self._sampleRate) }() + case 4: try { try decoder.decodeSingularInt32Field(value: &self._channels) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + if !self.data.isEmpty { + try visitor.visitSingularBytesField(value: self.data, fieldNumber: 1) + } + if !self.mimeType.isEmpty { + try visitor.visitSingularStringField(value: self.mimeType, fieldNumber: 2) + } + try { if let v = self._sampleRate { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 3) + } }() + try { if let v = self._channels { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 4) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_AudioInput, rhs: Appleintelligence_AudioInput) -> Bool { + if lhs.data != rhs.data {return false} + if lhs.mimeType != rhs.mimeType {return false} + if lhs._sampleRate != rhs._sampleRate {return false} + if lhs._channels != rhs._channels {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_TranscribeRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".TranscribeRequest" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}audio\0\u{1}config\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularMessageField(value: &self._audio) }() + case 2: try { try decoder.decodeSingularMessageField(value: &self._config) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._audio { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } }() + try { if let v = self._config { + try visitor.visitSingularMessageField(value: v, fieldNumber: 2) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_TranscribeRequest, rhs: Appleintelligence_TranscribeRequest) -> Bool { + if lhs._audio != rhs._audio {return false} + if lhs._config != rhs._config {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_TranscriptionSegment: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".TranscriptionSegment" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}text\0\u{3}start_time\0\u{3}end_time\0\u{1}confidence\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.text) }() + case 2: try { try decoder.decodeSingularFloatField(value: &self.startTime) }() + case 3: try { try decoder.decodeSingularFloatField(value: &self.endTime) }() + case 4: try { try decoder.decodeSingularFloatField(value: &self.confidence) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + if !self.text.isEmpty { + try visitor.visitSingularStringField(value: self.text, fieldNumber: 1) + } + if self.startTime.bitPattern != 0 { + try visitor.visitSingularFloatField(value: self.startTime, fieldNumber: 2) + } + if self.endTime.bitPattern != 0 { + try visitor.visitSingularFloatField(value: self.endTime, fieldNumber: 3) + } + if self.confidence.bitPattern != 0 { + try visitor.visitSingularFloatField(value: self.confidence, fieldNumber: 4) + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_TranscriptionSegment, rhs: Appleintelligence_TranscriptionSegment) -> Bool { + if lhs.text != rhs.text {return false} + if lhs.startTime != rhs.startTime {return false} + if lhs.endTime != rhs.endTime {return false} + if lhs.confidence != rhs.confidence {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_TranscribeResponse: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".TranscribeResponse" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}text\0\u{1}segments\0\u{3}detected_language\0\u{1}confidence\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.text) }() + case 2: try { try decoder.decodeRepeatedMessageField(value: &self.segments) }() + case 3: try { try decoder.decodeSingularStringField(value: &self.detectedLanguage) }() + case 4: try { try decoder.decodeSingularFloatField(value: &self.confidence) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + if !self.text.isEmpty { + try visitor.visitSingularStringField(value: self.text, fieldNumber: 1) + } + if !self.segments.isEmpty { + try visitor.visitRepeatedMessageField(value: self.segments, fieldNumber: 2) + } + if !self.detectedLanguage.isEmpty { + try visitor.visitSingularStringField(value: self.detectedLanguage, fieldNumber: 3) + } + if self.confidence.bitPattern != 0 { + try visitor.visitSingularFloatField(value: self.confidence, fieldNumber: 4) + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_TranscribeResponse, rhs: Appleintelligence_TranscribeResponse) -> Bool { + if lhs.text != rhs.text {return false} + if lhs.segments != rhs.segments {return false} + if lhs.detectedLanguage != rhs.detectedLanguage {return false} + if lhs.confidence != rhs.confidence {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_StreamingTranscribeRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".StreamingTranscribeRequest" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{1}config\0\u{3}audio_chunk\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { + var v: Appleintelligence_TranscriptionConfig? + var hadOneofValue = false + if let current = self.request { + hadOneofValue = true + if case .config(let m) = current {v = m} + } + try decoder.decodeSingularMessageField(value: &v) + if let v = v { + if hadOneofValue {try decoder.handleConflictingOneOf()} + self.request = .config(v) + } + }() + case 2: try { + var v: Data? + try decoder.decodeSingularBytesField(value: &v) + if let v = v { + if self.request != nil {try decoder.handleConflictingOneOf()} + self.request = .audioChunk(v) + } + }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + switch self.request { + case .config?: try { + guard case .config(let v)? = self.request else { preconditionFailure() } + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + }() + case .audioChunk?: try { + guard case .audioChunk(let v)? 
= self.request else { preconditionFailure() } + try visitor.visitSingularBytesField(value: v, fieldNumber: 2) + }() + case nil: break + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_StreamingTranscribeRequest, rhs: Appleintelligence_StreamingTranscribeRequest) -> Bool { + if lhs.request != rhs.request {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Appleintelligence_StreamingTranscribeResponse: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".StreamingTranscribeResponse" + public static let _protobuf_nameMap = SwiftProtobuf._NameMap(bytecode: "\0\u{3}partial_text\0\u{3}is_final\0\u{3}final_text\0\u{1}segments\0") + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.partialText) }() + case 2: try { try decoder.decodeSingularBoolField(value: &self.isFinal) }() + case 3: try { try decoder.decodeSingularStringField(value: &self.finalText) }() + case 4: try { try decoder.decodeRepeatedMessageField(value: &self.segments) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + if !self.partialText.isEmpty { + try visitor.visitSingularStringField(value: self.partialText, fieldNumber: 1) + } + if self.isFinal != false { + try visitor.visitSingularBoolField(value: self.isFinal, fieldNumber: 2) + } + if !self.finalText.isEmpty { + try visitor.visitSingularStringField(value: self.finalText, fieldNumber: 3) + } + if !self.segments.isEmpty { + try visitor.visitRepeatedMessageField(value: self.segments, fieldNumber: 4) + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Appleintelligence_StreamingTranscribeResponse, rhs: Appleintelligence_StreamingTranscribeResponse) -> Bool { + if lhs.partialText != rhs.partialText {return false} + if lhs.isFinal != rhs.isFinal {return false} + if lhs.finalText != rhs.finalText {return false} + if lhs.segments != rhs.segments {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} diff --git a/Sources/AppleIntelligenceCore/Providers/AppleIntelligenceProvider.swift b/Sources/AppleIntelligenceCore/Providers/AppleIntelligenceProvider.swift index 742f5d0..369c5f9 100644 --- a/Sources/AppleIntelligenceCore/Providers/AppleIntelligenceProvider.swift +++ b/Sources/AppleIntelligenceCore/Providers/AppleIntelligenceProvider.swift @@ -8,11 +8,24 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ /// The underlying AI service private let service: AppleIntelligenceService + /// Text-to-Speech service + private let ttsService: TextToSpeechService? + + /// Speech-to-Text service + private let sttService: SpeechToTextService? + /// Optional API key for authentication private let apiKey: String? - public init(service: AppleIntelligenceService, apiKey: String? = nil) { + public init( + service: AppleIntelligenceService, + ttsService: TextToSpeechService? = nil, + sttService: SpeechToTextService? = nil, + apiKey: String? 
= nil + ) { self.service = service + self.ttsService = ttsService + self.sttService = sttService self.apiKey = apiKey } @@ -139,6 +152,213 @@ public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceServ return ServerResponse(message: response) } + // MARK: - Text-to-Speech + + public func textToSpeech( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse { + try validateApiKey(metadata: request.metadata) + + guard let ttsService = ttsService else { + throw RPCError(code: .unavailable, message: "Text-to-Speech service not available") + } + + let message = request.message + + // Convert proto config to service config + var config = SpeechConfig.default + if message.hasVoiceConfig { + let voiceConfig = message.voiceConfig + config = SpeechConfig( + voiceIdentifier: voiceConfig.voiceIdentifier.isEmpty ? nil : voiceConfig.voiceIdentifier, + speakingRate: voiceConfig.hasSpeakingRate ? voiceConfig.speakingRate : 0.5, + pitchMultiplier: voiceConfig.hasPitchMultiplier ? voiceConfig.pitchMultiplier : 1.0, + volume: voiceConfig.hasVolume ? voiceConfig.volume : 1.0 + ) + } + + // Convert proto format to service format + let outputFormat: AudioOutputFormat + switch message.outputFormat { + case .wav, .unspecified: + outputFormat = .wav + case .mp3: + outputFormat = .mp3 + case .UNRECOGNIZED: + outputFormat = .wav + } + + do { + let result = try await ttsService.synthesize( + text: message.text, + config: config, + outputFormat: outputFormat + ) + + var response = Appleintelligence_TextToSpeechResponse() + response.audioData = result.audioData + response.format = outputFormat == .wav ? .wav : .mp3 + response.sampleRate = Int32(result.sampleRate) + response.channels = Int32(result.channels) + response.durationSeconds = result.durationSeconds + + return ServerResponse(message: response) + } catch let error as TextToSpeechError { + throw RPCError(code: .internalError, message: error.description) + } + } + + public func listVoices( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse { + try validateApiKey(metadata: request.metadata) + + guard let ttsService = ttsService else { + throw RPCError(code: .unavailable, message: "Text-to-Speech service not available") + } + + let message = request.message + let languageCode = message.hasLanguageCode ? 
message.languageCode : nil + + let voices = await ttsService.listVoices(languageCode: languageCode) + + var response = Appleintelligence_ListVoicesResponse() + response.voices = voices.map { voice in + var protoVoice = Appleintelligence_VoiceInfo() + protoVoice.identifier = voice.identifier + protoVoice.name = voice.name + protoVoice.language = voice.language + protoVoice.isPremium = voice.isPremium + protoVoice.gender = voice.gender + return protoVoice + } + + return ServerResponse(message: response) + } + + // MARK: - Speech-to-Text + + public func transcribe( + request: GRPCCore.ServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.ServerResponse { + try validateApiKey(metadata: request.metadata) + + guard let sttService = sttService else { + throw RPCError(code: .unavailable, message: "Speech-to-Text service not available") + } + + let message = request.message + + guard message.hasAudio else { + throw RPCError(code: .invalidArgument, message: "Audio data is required") + } + + // Convert proto config to service config + var config = TranscriptionConfig.default + if message.hasConfig { + let protoConfig = message.config + config = TranscriptionConfig( + languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil, + enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true, + enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false + ) + } + + do { + let result = try await sttService.transcribe( + audioData: message.audio.data, + mimeType: message.audio.mimeType, + config: config + ) + + var response = Appleintelligence_TranscribeResponse() + response.text = result.text + response.detectedLanguage = result.detectedLanguage + response.confidence = result.confidence + response.segments = result.segments.map { segment in + var protoSegment = Appleintelligence_TranscriptionSegment() + protoSegment.text = segment.text + protoSegment.startTime = segment.startTime + protoSegment.endTime = segment.endTime + protoSegment.confidence = segment.confidence + return protoSegment + } + + return ServerResponse(message: response) + } catch let error as SpeechToTextError { + throw RPCError(code: .internalError, message: error.description) + } + } + + public func streamTranscribe( + request: GRPCCore.StreamingServerRequest, + context: GRPCCore.ServerContext + ) async throws -> GRPCCore.StreamingServerResponse { + try validateApiKey(metadata: request.metadata) + + guard let sttService = sttService else { + throw RPCError(code: .unavailable, message: "Speech-to-Text service not available") + } + + return StreamingServerResponse { writer in + var config = TranscriptionConfig.default + + // Process incoming stream + for try await message in request.messages { + switch message.request { + case .config(let protoConfig): + // First message should be config + config = TranscriptionConfig( + languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil, + enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true, + enableTimestamps: protoConfig.hasEnableTimestamps ? 
protoConfig.enableTimestamps : false + ) + + // Start streaming transcription + let stream = await sttService.streamTranscribe(config: config) + Task { + do { + for try await update in stream { + var response = Appleintelligence_StreamingTranscribeResponse() + response.partialText = update.partialText + response.isFinal = update.isFinal + if let finalText = update.finalText { + response.finalText = finalText + } + response.segments = update.segments.map { segment in + var protoSegment = Appleintelligence_TranscriptionSegment() + protoSegment.text = segment.text + protoSegment.startTime = segment.startTime + protoSegment.endTime = segment.endTime + protoSegment.confidence = segment.confidence + return protoSegment + } + try await writer.write(response) + } + } catch { + // Stream ended or error occurred + } + } + + case .audioChunk(let chunk): + // Feed audio chunk to service + try await sttService.feedAudioChunk(chunk) + + case .none: + break + } + } + + // End streaming session + await sttService.endStreamingSession() + + return [:] + } + } + // MARK: - Private Helpers /// Validate API key if configured diff --git a/Sources/AppleIntelligenceCore/Resources/apple_intelligence.pb b/Sources/AppleIntelligenceCore/Resources/apple_intelligence.pb index eacd6689b5cd1334505fc5ee0f094482f155ecc5..9c86d15179bc6a90adcdda5d7d55aa8d7b1c3e80 100644 GIT binary patch literal 3906 zcmbVPOLH4V5Y8%*E%i#WJT{bY-b(@o3Rg;@IAbX>78MiM%1NM#s;#v&wzsk`w7aR~ z&Vd6bPF%TgfGU0iXS#c4SLOo$ zwYUt;zzd|4zKCV-qS_?j0N0ieZI-+py8esAOT6N$* z_-qi%nVV)aWmj$Y0_jqK-y4Ro00Q^9lSWTvnDDFhUBhl7cEAvw`2?0ca$wTCgesf; zD_aXr-YPt7>$Bnm=z8JA&!*CGwSRhbbYR0?(Rl8*K?hn+Nv}a0OlPi@^@{Q|TiE71Pv0yaNp{aZoGQ=O8rfWiQ#G<)R6fn1ydL zq0XT?w+SoI-j%MOo@jnqd(g=N?e`umpYV|v?0i&JFNyv^gw{DpimW4sQ!<&1jTy53 zGQfNsc@v5GaO52mOP$J!{^Z-*Zq3$f+#GbF_-bKoj~ zOX@hKV%lVctL&$+bS`uBX>lFQ#H9+S0!%Th+n^u{E|u!Zi5rGgKPgsqyI2#IrWq0g zi=&fBv|yTW!Kv9RIX+gf^$=DccuC6MNb*XG{PNxKIKxVZ_D`kaA-buR!$U-KO5_+H zwSF|9$q7yXhtS3ZNE% zEpUn?r}~O5MXjtaK$NZ2p~GsmYJ?F5CLq0N^|ug_x6-)558VK#vRgL?CWe5EvO$qA;0t{Z$sD0}39K71R(* zi7+ss>Eyh(e!@8*55=L(*?Sl2oPW$(vw!t<1%wT=)mbrwC6(zM2ebcP#jX4)o=a5J z(IS11l%>k2q*&iky(qI}ERO@M>y@PsS&b2$r5syUpk_B2p^#EbL{h6wF5dPsu6Yad ziyy?^Fwr&JE+N)$I56aDkcTw$^8w*>{cR*lM>UX{{rBJG18tLCTofO{>Qtt3lFF%* zXM;}SZoctAmli4xXDa`L0UP5K$CUsZUD*mX?|oK;jLi5SA;T_hs@grF0fimkB91c1 zLkop;rshlQr2&^K)zMDHo*!cnJ=ez(4yP~WTvKDzJWdTalrW{?206e(kGiYo>n0<7 za|nf#_3U~6=zRDDOipRtEU!Rs`_aA8Lucoqy|;bfJo<9He}6dI8Qs6vy&%@1dusQ| z_T#RxfbQ+z?biCg)=?Dw5gC>Khm7Uy8CFd31R8o2l;TzerRH;@&ftyPZ#bnL{ta|g zJkBZJI#WEy{f;5_U`dU8qH1)^Cqbs-#?1ASw&Y;;6Jy8Rk>x^aEc5 z%zIp=qvzXaIl6pmE_$ARm7?d{XNfMK(xNlysu<4q#K#rhtV}%R{H_0bTs^sI+%XR6 JzhN5P{TDiP-rN8H literal 3259 zcmb7G+iu%N5G9usMXqkvS{I7G(Mse?Zit;WHDJ_1lA=ipp zKj>%lso&7YenP*d&pordq?N>N5WkpbXXebz+1Xw3KRetX&gbK35Kq%+JdR&S(@}J1 zzL=#mQJ0owL-%LyJ(~<)N8b(8p%C0Y!++ll&AoD8ev5#+ui|ku9Zn*jVcqNJSgdhx z5>KK*`hFhymNL@93*hdUS0wYp>2Ul$i4(ES%W3p39gJqEY?{g}UcW?ZkJXZBac4Yy z8I2R9vXDw65%SVml#CYfJdI~looach`;W=%k7pA!P9FW}*NZ4gi4UDm=2BU0z#hw~eOTPGZrfPiklrq?ss9#wgHuvFKh3NH;VtLpw?agSPj8ia5x__Cxc3J4-^NZ=5PpWM2!dW!N1gc3>Ky<@p}xFc-Ccwlmun`L1chM+GdEM^4! 
z7H760?1JH$K^V$&7t)m$zM3tsJ5Y{L{*!L9QWw{3$N!g5%hmwACe zflx6+aCF>qhAg1kiVlOR$XeQDBCN>V43W*tC{2VF8%-UCqFvV3jtpUL9>x&ndLpwS z%x$)K5AB)4E*mRm&mFF2B?@G~1os%z_-1YOoxN1Le2TArH{)VxM!Ls+ft z^AKG!VXqn;GiZdDbDiy++Bk|`bycN|8f-8{1b`$BodgJu4X-AX03k_3tjgLLaWolD zCiWWu4K{UyKp<(BRx%rT*4HxxlI9j(B@7K?Xt2dXZDJs4K#36j?4 zmL>sc=Oh+L+Bu2k0F|=lEReLD8Hq)4ISImimKA9ux6d4%4Rd4pMKx@U*$DH4HGTl8 zEn%-4+h)-FP-=DH!oBHidOMN#Lp*)Gue7Hc*4;psj&moh4W@(wK-LYdJrEo>yppT| z2w69(H5~@iCQ}n`NwqigFdC;#ucE`~&}^=#BWjV_TTHbNfNWcN80ov^sdES$!nQhD zkpLmPr5Q=zQ0r?k(l;!r;ROOo*xAiIDelHiz6YUfVP7|Hn8A9e?n~$qj`H6e;fS6z zr4NaH-EGT8ZPKTw^#MS|JzX&n9D7h{_h(1 z%%J_TzZzJz0S*H0x-BV*LmP0Ql>|V(11$*<91n_0Ng$MUpp|q;twVij0wC;AO9DhT zeQ5$A>=2hGhQz_^b04o0hD5&P;qp@+>bG3h=uqnWZipz`kuT2?dl0OJrx>eov^-e) z!IHX?C*qDASu!E)_fbvWK7>%o(amr9P00zzWA-fAyn0?gqNSFO9eK-uh#u7A<_=95 zA#COG!8aK~lH;Q%d`%{S;|cpZsD7{=S*@hui6b9P(9qdEsmN!D5NJ+%_g>PE@^p6Z EUj@-V)Bpeg diff --git a/Sources/AppleIntelligenceCore/Services/SpeechToTextService.swift b/Sources/AppleIntelligenceCore/Services/SpeechToTextService.swift new file mode 100644 index 0000000..0122cc3 --- /dev/null +++ b/Sources/AppleIntelligenceCore/Services/SpeechToTextService.swift @@ -0,0 +1,337 @@ +import Foundation +import Speech +import AVFoundation + +// MARK: - Result Types + +/// Transcription result +public struct TranscriptionResult: Sendable { + public let text: String + public let segments: [TranscriptionSegmentResult] + public let detectedLanguage: String + public let confidence: Float +} + +/// Individual transcription segment +public struct TranscriptionSegmentResult: Sendable { + public let text: String + public let startTime: Float + public let endTime: Float + public let confidence: Float +} + +/// Streaming transcription update +public struct StreamingTranscriptionUpdate: Sendable { + public let partialText: String + public let isFinal: Bool + public let finalText: String? + public let segments: [TranscriptionSegmentResult] +} + +/// Transcription configuration +public struct TranscriptionConfig: Sendable { + public var languageCode: String? + public var enablePunctuation: Bool + public var enableTimestamps: Bool + + public static let `default` = TranscriptionConfig( + languageCode: nil, + enablePunctuation: true, + enableTimestamps: false + ) + + public init( + languageCode: String? 
= nil, + enablePunctuation: Bool = true, + enableTimestamps: Bool = false + ) { + self.languageCode = languageCode + self.enablePunctuation = enablePunctuation + self.enableTimestamps = enableTimestamps + } +} + +// MARK: - Errors + +public enum SpeechToTextError: Error, CustomStringConvertible, Sendable { + case notAvailable + case authorizationDenied + case modelNotReady(String) + case transcriptionFailed(String) + case invalidAudioFormat + case audioProcessingFailed(String) + case unsupportedMimeType(String) + + public var description: String { + switch self { + case .notAvailable: return "Speech recognition not available on this system" + case .authorizationDenied: return "Speech recognition authorization denied" + case .modelNotReady(let reason): return "Speech model not ready: \(reason)" + case .transcriptionFailed(let reason): return "Transcription failed: \(reason)" + case .invalidAudioFormat: return "Invalid audio format" + case .audioProcessingFailed(let reason): return "Audio processing failed: \(reason)" + case .unsupportedMimeType(let type): return "Unsupported audio MIME type: \(type)" + } + } +} + +// MARK: - Service Actor + +public actor SpeechToTextService { + + /// Service availability status + public private(set) var isAvailable: Bool = false + + /// Streaming session state + private var isStreamingActive: Bool = false + + public init() async { + await checkAvailability() + } + + // MARK: - Public API + + /// Transcribe audio data (file-based) + public func transcribe( + audioData: Data, + mimeType: String, + config: TranscriptionConfig = .default + ) async throws -> TranscriptionResult { + guard isAvailable else { + throw SpeechToTextError.notAvailable + } + + // Convert audio data to file URL for processing + let tempURL = try createTempAudioFile(data: audioData, mimeType: mimeType) + defer { try? 
FileManager.default.removeItem(at: tempURL) } + + return try await transcribeWithSFSpeechRecognizer(url: tempURL, config: config) + } + + /// Stream transcription from audio chunks + public func streamTranscribe( + config: TranscriptionConfig = .default + ) -> AsyncThrowingStream { + AsyncThrowingStream { continuation in + Task { + guard self.isAvailable else { + continuation.finish(throwing: SpeechToTextError.notAvailable) + return + } + + do { + try await self.startStreamingWithSFSpeechRecognizer(config: config, continuation: continuation) + } catch { + continuation.finish(throwing: error) + } + } + } + } + + /// Feed audio chunk for streaming transcription + public func feedAudioChunk(_ chunk: Data) async throws { + guard isStreamingActive else { + throw SpeechToTextError.transcriptionFailed("No active streaming session") + } + // Audio chunk handling implemented in streaming methods + } + + /// End streaming session + public func endStreamingSession() async { + isStreamingActive = false + } + + /// Get status information + public func getStatus() -> String { + if isAvailable { + return "SFSpeechRecognizer available" + } else { + return "Speech recognition not available" + } + } + + // MARK: - Private Implementation + + private func checkAvailability() async { + // Check SFSpeechRecognizer availability + let status = SFSpeechRecognizer.authorizationStatus() + switch status { + case .authorized: + isAvailable = SFSpeechRecognizer.supportedLocales().count > 0 + case .notDetermined: + // Request authorization + isAvailable = await withCheckedContinuation { continuation in + SFSpeechRecognizer.requestAuthorization { newStatus in + continuation.resume(returning: newStatus == .authorized) + } + } + default: + isAvailable = false + } + } + + /// Create temporary audio file from data + private func createTempAudioFile(data: Data, mimeType: String) throws -> URL { + let ext = extensionForMimeType(mimeType) + let tempDir = FileManager.default.temporaryDirectory + let fileName = UUID().uuidString + "." + ext + let fileURL = tempDir.appendingPathComponent(fileName) + + try data.write(to: fileURL) + return fileURL + } + + /// Get file extension for MIME type + private func extensionForMimeType(_ mimeType: String) -> String { + switch mimeType.lowercased() { + case "audio/wav", "audio/wave", "audio/x-wav": + return "wav" + case "audio/mp3", "audio/mpeg": + return "mp3" + case "audio/m4a", "audio/mp4", "audio/x-m4a": + return "m4a" + case "audio/aac": + return "aac" + case "audio/flac": + return "flac" + default: + return "wav" + } + } + + /// Transcribe using SFSpeechRecognizer + private func transcribeWithSFSpeechRecognizer( + url: URL, + config: TranscriptionConfig + ) async throws -> TranscriptionResult { + let locale = Locale(identifier: config.languageCode ?? 
"en-US") + guard let recognizer = SFSpeechRecognizer(locale: locale) else { + throw SpeechToTextError.notAvailable + } + + guard recognizer.isAvailable else { + throw SpeechToTextError.notAvailable + } + + let request = SFSpeechURLRecognitionRequest(url: url) + request.shouldReportPartialResults = false + + return try await withCheckedThrowingContinuation { continuation in + var hasResumed = false + + recognizer.recognitionTask(with: request) { result, error in + guard !hasResumed else { return } + + if let error = error { + hasResumed = true + continuation.resume(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription)) + return + } + + guard let result = result, result.isFinal else { return } + + hasResumed = true + + let transcription = result.bestTranscription + var segments: [TranscriptionSegmentResult] = [] + + if config.enableTimestamps { + for segment in transcription.segments { + segments.append(TranscriptionSegmentResult( + text: segment.substring, + startTime: Float(segment.timestamp), + endTime: Float(segment.timestamp + segment.duration), + confidence: segment.confidence + )) + } + } + + let transcriptionResult = TranscriptionResult( + text: transcription.formattedString, + segments: segments, + detectedLanguage: config.languageCode ?? "en-US", + confidence: segments.isEmpty ? 1.0 : segments.reduce(0) { $0 + $1.confidence } / Float(segments.count) + ) + + continuation.resume(returning: transcriptionResult) + } + } + } + + /// Start streaming with SFSpeechRecognizer + private func startStreamingWithSFSpeechRecognizer( + config: TranscriptionConfig, + continuation: AsyncThrowingStream.Continuation + ) async throws { + let locale = Locale(identifier: config.languageCode ?? "en-US") + guard let recognizer = SFSpeechRecognizer(locale: locale) else { + throw SpeechToTextError.notAvailable + } + + guard recognizer.isAvailable else { + throw SpeechToTextError.notAvailable + } + + isStreamingActive = true + + let audioEngine = AVAudioEngine() + let request = SFSpeechAudioBufferRecognitionRequest() + request.shouldReportPartialResults = true + + let inputNode = audioEngine.inputNode + let recordingFormat = inputNode.outputFormat(forBus: 0) + + inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in + request.append(buffer) + } + + audioEngine.prepare() + try audioEngine.start() + + recognizer.recognitionTask(with: request) { result, error in + if let error = error { + continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription)) + return + } + + guard let result = result else { return } + + let transcription = result.bestTranscription + var segments: [TranscriptionSegmentResult] = [] + + if config.enableTimestamps { + for segment in transcription.segments { + segments.append(TranscriptionSegmentResult( + text: segment.substring, + startTime: Float(segment.timestamp), + endTime: Float(segment.timestamp + segment.duration), + confidence: segment.confidence + )) + } + } + + let update = StreamingTranscriptionUpdate( + partialText: transcription.formattedString, + isFinal: result.isFinal, + finalText: result.isFinal ? 
transcription.formattedString : nil, + segments: segments + ) + continuation.yield(update) + + if result.isFinal { + audioEngine.stop() + inputNode.removeTap(onBus: 0) + continuation.finish() + } + } + + // Wait for streaming to end + while isStreamingActive { + try await Task.sleep(for: .milliseconds(100)) + } + + audioEngine.stop() + inputNode.removeTap(onBus: 0) + request.endAudio() + } +} diff --git a/Sources/AppleIntelligenceCore/Services/TextToSpeechService.swift b/Sources/AppleIntelligenceCore/Services/TextToSpeechService.swift new file mode 100644 index 0000000..ad3658f --- /dev/null +++ b/Sources/AppleIntelligenceCore/Services/TextToSpeechService.swift @@ -0,0 +1,280 @@ +import Foundation +import AVFoundation + +// MARK: - Result Types + +/// Result of text-to-speech synthesis +public struct TextToSpeechResult: Sendable { + public let audioData: Data + public let format: AudioOutputFormat + public let sampleRate: Int + public let channels: Int + public let durationSeconds: Float +} + +/// Supported output formats +public enum AudioOutputFormat: Sendable { + case wav + case mp3 +} + +/// Voice information +public struct VoiceDescription: Sendable { + public let identifier: String + public let name: String + public let language: String + public let isPremium: Bool + public let gender: String +} + +/// Configuration for speech synthesis +public struct SpeechConfig: Sendable { + public var voiceIdentifier: String? + public var speakingRate: Float // 0.0 - 1.0 + public var pitchMultiplier: Float // 0.5 - 2.0 + public var volume: Float // 0.0 - 1.0 + + public static let `default` = SpeechConfig( + voiceIdentifier: nil, + speakingRate: 0.5, + pitchMultiplier: 1.0, + volume: 1.0 + ) + + public init( + voiceIdentifier: String? = nil, + speakingRate: Float = 0.5, + pitchMultiplier: Float = 1.0, + volume: Float = 1.0 + ) { + self.voiceIdentifier = voiceIdentifier + self.speakingRate = speakingRate + self.pitchMultiplier = pitchMultiplier + self.volume = volume + } +} + +// MARK: - Errors + +public enum TextToSpeechError: Error, CustomStringConvertible, Sendable { + case invalidVoice(String) + case synthesisFailure(String) + case encodingFailure(String) + case noAudioGenerated + case unsupportedFormat + + public var description: String { + switch self { + case .invalidVoice(let id): return "Invalid voice identifier: \(id)" + case .synthesisFailure(let reason): return "Speech synthesis failed: \(reason)" + case .encodingFailure(let reason): return "Audio encoding failed: \(reason)" + case .noAudioGenerated: return "No audio was generated" + case .unsupportedFormat: return "Unsupported audio format" + } + } +} + +// MARK: - Service Actor + +public actor TextToSpeechService { + /// Keep strong reference to synthesizer during synthesis + private var activeSynthesizer: AVSpeechSynthesizer? 
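+
+    // Minimal usage sketch (illustrative only; the real call site is the gRPC provider):
+    //
+    //   let tts = TextToSpeechService()
+    //   let result = try await tts.synthesize(text: "Hello from Apple Intelligence")
+    //   // `result.audioData` is a complete 16-bit PCM WAV file; the provider returns
+    //   // it unchanged in `Appleintelligence_TextToSpeechResponse.audioData`.
+    //   // Requesting `.mp3` currently falls back to WAV (see `synthesize` below).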
+ + public init() {} + + // MARK: - Public API + + /// Synthesize text to speech + public func synthesize( + text: String, + config: SpeechConfig = .default, + outputFormat: AudioOutputFormat = .wav + ) async throws -> TextToSpeechResult { + // Create utterance + let utterance = AVSpeechUtterance(string: text) + + // Configure voice + if let voiceId = config.voiceIdentifier { + if let voice = AVSpeechSynthesisVoice(identifier: voiceId) { + utterance.voice = voice + } else { + throw TextToSpeechError.invalidVoice(voiceId) + } + } else { + // Use default English voice + utterance.voice = AVSpeechSynthesisVoice(language: "en-US") + } + + // Configure speech parameters + utterance.rate = config.speakingRate + utterance.pitchMultiplier = config.pitchMultiplier + utterance.volume = config.volume + + // Collect PCM data + let pcmData = try await collectPCMData(utterance: utterance) + + // Convert to requested format + let audioData: Data + switch outputFormat { + case .wav: + audioData = createWAVData(from: pcmData) + case .mp3: + // Use WAV as fallback (MP3 encoding requires external library) + audioData = createWAVData(from: pcmData) + } + + // Calculate duration + let bytesPerSample = 2 // Int16 + let totalSamples = pcmData.samples.count / bytesPerSample / pcmData.channelCount + let duration = Float(totalSamples) / Float(pcmData.sampleRate) + + return TextToSpeechResult( + audioData: audioData, + format: outputFormat, + sampleRate: Int(pcmData.sampleRate), + channels: pcmData.channelCount, + durationSeconds: duration + ) + } + + /// List available voices + public func listVoices(languageCode: String? = nil) -> [VoiceDescription] { + let voices = AVSpeechSynthesisVoice.speechVoices() + + let filtered: [AVSpeechSynthesisVoice] + if let lang = languageCode { + filtered = voices.filter { $0.language.hasPrefix(lang) } + } else { + filtered = voices + } + + return filtered.map { voice in + VoiceDescription( + identifier: voice.identifier, + name: voice.name, + language: voice.language, + isPremium: voice.quality == .enhanced || voice.quality == .premium, + gender: genderString(for: voice) + ) + } + } + + // MARK: - Private Implementation + + /// PCM buffer data for internal processing + private struct PCMBufferData: Sendable { + let samples: Data + let sampleRate: Double + let channelCount: Int + } + + /// Collect PCM data from synthesizer using write callback + private func collectPCMData( + utterance: AVSpeechUtterance + ) async throws -> PCMBufferData { + // Create and store synthesizer to keep strong reference during synthesis + let synthesizer = AVSpeechSynthesizer() + self.activeSynthesizer = synthesizer + + defer { self.activeSynthesizer = nil } + + return try await withCheckedThrowingContinuation { continuation in + var pcmData = Data() + var sampleRate: Double = 0 + var channelCount: Int = 0 + var hasResumed = false + + synthesizer.write(utterance) { buffer in + guard let pcmBuffer = buffer as? 
AVAudioPCMBuffer else {
+                    // End of audio - empty buffer signals completion
+                    if !hasResumed {
+                        hasResumed = true
+                        if pcmData.isEmpty {
+                            continuation.resume(throwing: TextToSpeechError.noAudioGenerated)
+                        } else {
+                            continuation.resume(returning: PCMBufferData(
+                                samples: pcmData,
+                                sampleRate: sampleRate,
+                                channelCount: channelCount
+                            ))
+                        }
+                    }
+                    return
+                }
+
+                if pcmBuffer.frameLength > 0 {
+                    // Store format from first buffer
+                    if sampleRate == 0 {
+                        sampleRate = pcmBuffer.format.sampleRate
+                        channelCount = Int(pcmBuffer.format.channelCount)
+                    }
+
+                    // Convert float samples to Int16 PCM
+                    if let channelData = pcmBuffer.floatChannelData {
+                        let frameCount = Int(pcmBuffer.frameLength)
+                        for frame in 0..<frameCount {
+                            for channel in 0..<channelCount {
+                                // Clamp to [-1, 1], scale to Int16 range, append little-endian bytes
+                                let sample = max(-1.0, min(1.0, channelData[channel][frame]))
+                                let intSample = Int16(sample * Float(Int16.max))
+                                withUnsafeBytes(of: intSample.littleEndian) { pcmData.append(contentsOf: $0) }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// Create WAV file data from interleaved 16-bit PCM samples
+    private func createWAVData(from pcmData: PCMBufferData) -> Data {
+        let bitsPerSample = 16
+        let sampleRate = Int(pcmData.sampleRate)
+        let channels = pcmData.channelCount
+        let dataSize = pcmData.samples.count
+
+        var header = Data()
+
+        // RIFF header
+        header.append(contentsOf: "RIFF".utf8)
+        let fileSize = UInt32(dataSize + 36)
+        withUnsafeBytes(of: fileSize.littleEndian) { header.append(contentsOf: $0) }
+        header.append(contentsOf: "WAVE".utf8)
+
+        // fmt subchunk
+        header.append(contentsOf: "fmt ".utf8)
+        let subchunk1Size = UInt32(16)
+        withUnsafeBytes(of: subchunk1Size.littleEndian) { header.append(contentsOf: $0) }
+        let audioFormat = UInt16(1) // PCM
+        withUnsafeBytes(of: audioFormat.littleEndian) { header.append(contentsOf: $0) }
+        let numChannels = UInt16(channels)
+        withUnsafeBytes(of: numChannels.littleEndian) { header.append(contentsOf: $0) }
+        let sampleRateU32 = UInt32(sampleRate)
+        withUnsafeBytes(of: sampleRateU32.littleEndian) { header.append(contentsOf: $0) }
+        let byteRate = UInt32(sampleRate * channels * bitsPerSample / 8)
+        withUnsafeBytes(of: byteRate.littleEndian) { header.append(contentsOf: $0) }
+        let blockAlign = UInt16(channels * bitsPerSample / 8)
+        withUnsafeBytes(of: blockAlign.littleEndian) { header.append(contentsOf: $0) }
+        let bitsPerSampleU16 = UInt16(bitsPerSample)
+        withUnsafeBytes(of: bitsPerSampleU16.littleEndian) { header.append(contentsOf: $0) }
+
+        // data subchunk
+        header.append(contentsOf: "data".utf8)
+        let dataU32 = UInt32(dataSize)
+        withUnsafeBytes(of: dataU32.littleEndian) { header.append(contentsOf: $0) }
+
+        return header + pcmData.samples
+    }
+
+    /// Get gender string for voice
+    private func genderString(for voice: AVSpeechSynthesisVoice) -> String {
+        switch voice.gender {
+        case .male: return "male"
+        case .female: return "female"
+        case .unspecified: return "unspecified"
+        @unknown default: return "unknown"
+        }
+    }
+}
diff --git a/Sources/AppleIntelligenceServer/main.swift b/Sources/AppleIntelligenceServer/main.swift
index 13980c9..e31eb13 100644
--- a/Sources/AppleIntelligenceServer/main.swift
+++ b/Sources/AppleIntelligenceServer/main.swift
@@ -37,7 +37,21 @@ struct AppleIntelligenceServer: AsyncParsableCommand {
             throw ExitCode.failure
         }
 
-        let provider = AppleIntelligenceProvider(service: service, apiKey: config.apiKey)
+        // Initialize speech services
+        print("Initializing Text-to-Speech service...")
+        let ttsService = TextToSpeechService()
+
+        print("Initializing Speech-to-Text service...")
+        let sttService = await SpeechToTextService()
+        let sttStatus = await sttService.getStatus()
+        print("Speech-to-Text status: \(sttStatus)")
+
+        let provider = AppleIntelligenceProvider(
+            service: service,
+            ttsService: ttsService,
+            sttService: sttService,
+            apiKey: config.apiKey
+        )
 
         let transport = HTTP2ServerTransport.Posix(
             address: .ipv4(host: bindHost, port: bindPort),
@@ -52,7 +66,15 @@
struct AppleIntelligenceServer: AsyncParsableCommand { print("API key authentication is enabled") } print("Server is ready to accept connections") - print("Health check: grpcurl -plaintext \(bindHost):\(bindPort) appleintelligence.AppleIntelligence/Health") + print("") + print("Available services:") + print(" - Complete/StreamComplete: Text generation with Apple Intelligence") + print(" - TextToSpeech: Convert text to spoken audio") + print(" - ListVoices: List available TTS voices") + print(" - Transcribe: Convert audio file to text") + print(" - StreamTranscribe: Real-time speech-to-text") + print("") + print("Health check: grpcurl -plaintext \(bindHost):\(bindPort) appleintelligence.AppleIntelligenceService/Health") print("Press Ctrl+C to stop the server") try await server.serve()
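
For quick local verification, a minimal usage sketch of the TextToSpeechService actor added above, assuming only the API visible in this diff (synthesize(text:config:outputFormat:), listVoices(languageCode:), and the TextToSpeechResult fields). The output path and the sample text are illustrative, not part of the patch.

    import Foundation
    import AppleIntelligenceCore

    // Synthesize a short phrase to a WAV file with the default voice settings.
    let tts = TextToSpeechService()
    let result = try await tts.synthesize(
        text: "Bonjour, ceci est un test.",
        config: .default,
        outputFormat: .wav
    )
    // Hypothetical output location for a local smoke test.
    try result.audioData.write(to: URL(fileURLWithPath: "/tmp/tts-test.wav"))
    print("Wrote \(result.durationSeconds)s of audio at \(result.sampleRate) Hz, \(result.channels) channel(s)")

    // List Canadian French voices; listVoices filters by language-code prefix.
    let voices = await tts.listVoices(languageCode: "fr-CA")
    for voice in voices {
        print("\(voice.identifier) [\(voice.language)] premium=\(voice.isPremium)")
    }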