syntax = "proto3";

package appleintelligence;

// ============ COMPLETION ============

// Image data for vision requests.
message ImageData {
  // Raw image bytes.
  bytes data = 1;
  // File name associated with the image.
  string filename = 2;
  // MIME type of `data`.
  // NOTE(review): accepted values are not defined in this file; confirm
  // against the server implementation.
  string mime_type = 3;
}

// Vision analysis results.
message ImageAnalysis {
  // Text content reported by the analysis.
  // NOTE(review): presumably text recognized in the image; confirm.
  string text_content = 1;
  // Labels reported by the analysis.
  repeated string labels = 2;
  // Description reported by the analysis.
  string description = 3;
}

// Completion request. Used by both Complete and StreamComplete.
message CompletionRequest {
  // Prompt text.
  string prompt = 1;
  // Explicit presence: an unset value is distinguishable from 0.
  // NOTE(review): valid range and server default are not defined here.
  optional float temperature = 2;
  // Explicit presence: an unset value is distinguishable from 0.
  // NOTE(review): presumably an upper bound on generated tokens; confirm.
  optional int32 max_tokens = 3;
  // Images attached to the request.
  repeated ImageData images = 4;
  // NOTE(review): presumably asks the server to populate the
  // `image_analyses` field of the response; confirm against the server.
  bool include_analysis = 5;
}

// Completion response (non-streaming).
message CompletionResponse {
  // Identifier of the completion.
  string id = 1;
  // Completion text.
  string text = 2;
  // Reason the completion finished.
  // NOTE(review): the set of possible values is not defined in this file.
  string finish_reason = 3;
  // Image analysis results.
  // NOTE(review): presumably one entry per request image; confirm.
  repeated ImageAnalysis image_analyses = 4;
}

// Streaming completion chunk.
message CompletionChunk {
  // Identifier of the completion.
  string id = 1;
  // Text carried by this chunk.
  // NOTE(review): presumably incremental text to append; confirm.
  string delta = 2;
  // NOTE(review): presumably true only on the last chunk of the stream.
  bool is_final = 3;
  // Reason the completion finished.
  // NOTE(review): the set of possible values is not defined in this file.
  string finish_reason = 4;
  // Image analysis results.
  repeated ImageAnalysis image_analyses = 5;
}

// Health check request. Currently carries no fields.
message HealthRequest {}

// Health check response.
message HealthResponse {
  // Health flag reported by the server.
  bool healthy = 1;
  // Model status string.
  // NOTE(review): the format is not defined in this file.
  string model_status = 2;
}

// ============ TEXT-TO-SPEECH ============

// Audio format enumeration.
enum AudioFormat {
  // Zero value; also what an unset AudioFormat field reads as.
  AUDIO_FORMAT_UNSPECIFIED = 0;
  AUDIO_FORMAT_WAV = 1;
  AUDIO_FORMAT_MP3 = 2;
}

// Voice configuration for TTS.
message VoiceConfig {
  // Identifier of the voice to use.
  // NOTE(review): presumably a VoiceInfo.identifier value; confirm.
  string voice_identifier = 1;
  // 0.0-1.0, default 0.5
  optional float speaking_rate = 2;
  // 0.5-2.0, default 1.0
  optional float pitch_multiplier = 3;
  // 0.0-1.0, default 1.0
  optional float volume = 4;
}

// TTS request.
message TextToSpeechRequest {
  // Text to synthesize.
  string text = 1;
  // Requested audio format.
  AudioFormat output_format = 2;
  // Voice settings; may be left unset.
  optional VoiceConfig voice_config = 3;
}

// TTS response.
message TextToSpeechResponse {
  // Audio bytes.
  // NOTE(review): presumably encoded as described by `format`; confirm.
  bytes audio_data = 1;
  // Audio format.
  AudioFormat format = 2;
  // Sample rate of the audio.
  // NOTE(review): unit is not stated here; presumably Hz.
  int32 sample_rate = 3;
  // Number of audio channels.
  int32 channels = 4;
  // Duration of the audio, in seconds.
  float duration_seconds = 5;
}

// List available voices request.
message ListVoicesRequest {
  // NOTE(review): presumably filters the result by language when set;
  // the code format (e.g. BCP 47) is not defined in this file.
  optional string language_code = 1;
}

// Voice information.
message VoiceInfo {
  // Voice identifier.
  string identifier = 1;
  // Voice name.
  string name = 2;
  // Voice language.
  // NOTE(review): the format is not defined in this file.
  string language = 3;
  // Premium flag for the voice.
  bool is_premium = 4;
  // Voice gender.
  // NOTE(review): the set of possible values is not defined in this file.
  string gender = 5;
}

// List voices response.
message ListVoicesResponse {
  // Voices returned by the server.
  repeated VoiceInfo voices = 1;
}

// ============ SPEECH-TO-TEXT ============

// STT configuration.
message TranscriptionConfig {
  // NOTE(review): presumably the expected language of the audio; the code
  // format is not defined in this file.
  optional string language_code = 1;
  // default true
  optional bool enable_punctuation = 2;
  // default false
  optional bool enable_timestamps = 3;
}

// Audio data for STT.
message AudioInput {
  // Audio bytes.
  bytes data = 1;
  // "audio/wav", "audio/mp3", "audio/m4a"
  string mime_type = 2;
  // Sample rate of the audio.
  // NOTE(review): unit is not stated here; presumably Hz.
  optional int32 sample_rate = 3;
  // Number of audio channels.
  optional int32 channels = 4;
}

// File-based transcription request.
message TranscribeRequest {
  // Audio to transcribe.
  AudioInput audio = 1;
  // Transcription settings; may be left unset.
  optional TranscriptionConfig config = 2;
}

// Transcription segment with timing.
message TranscriptionSegment {
  // Text of the segment.
  string text = 1;
  // Start of the segment.
  // NOTE(review): unit is not stated here; presumably seconds.
  float start_time = 2;
  // End of the segment.
  // NOTE(review): unit is not stated here; presumably seconds.
  float end_time = 3;
  // Confidence for the segment.
  // NOTE(review): the scale is not defined in this file.
  float confidence = 4;
}

// Transcription response.
message TranscribeResponse {
  // Transcribed text.
  string text = 1;
  // Transcription segments.
  repeated TranscriptionSegment segments = 2;
  // Detected language.
  // NOTE(review): the format is not defined in this file.
  string detected_language = 3;
  // Confidence for the transcription.
  // NOTE(review): the scale is not defined in this file.
  float confidence = 4;
}

// Streaming STT request chunk.
message StreamingTranscribeRequest {
  // Exactly one member can be set per message.
  oneof request {
    // Send first to configure
    TranscriptionConfig config = 1;
    // Subsequent audio chunks
    bytes audio_chunk = 2;
  }
}

// Streaming STT response.
message StreamingTranscribeResponse {
  // Partial transcription text.
  string partial_text = 1;
  // NOTE(review): presumably marks the message that carries `final_text`.
  bool is_final = 2;
  // Final transcription text.
  string final_text = 3;
  // Transcription segments.
  repeated TranscriptionSegment segments = 4;
}

// Apple Intelligence Service
service AppleIntelligenceService {
  // Single completion request (unary).
  rpc Complete(CompletionRequest) returns (CompletionResponse);

  // Streaming completion request (server-streaming).
  rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);

  // Health check.
  rpc Health(HealthRequest) returns (HealthResponse);

  // Text-to-Speech: synthesize audio for the given text.
  rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);

  // Text-to-Speech: list available voices.
  rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);

  // Speech-to-Text: file-based transcription (unary).
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);

  // Speech-to-Text: streaming transcription (bidirectional streaming).
  rpc StreamTranscribe(stream StreamingTranscribeRequest)
      returns (stream StreamingTranscribeResponse);
}