// swift-apple-intelligence-grpc/Proto/apple_intelligence.proto

syntax = "proto3";

package appleintelligence;

// A single image attached to a vision-enabled request.
// Carried in CompletionRequest.images.
message ImageData {
  // Image bytes. NOTE(review): presumably the encoded file contents rather
  // than decoded pixels -- confirm against the server implementation.
  bytes data = 1;
  // Filename associated with the image.
  string filename = 2;
  // MIME type describing `data`. The set of accepted values is not defined
  // in this file.
  string mime_type = 3;
}

// Vision analysis results for an image.
// Returned in CompletionResponse.image_analyses and
// CompletionChunk.image_analyses.
message ImageAnalysis {
  // Text content reported for the image.
  // NOTE(review): presumably text recognized inside the image -- confirm.
  string text_content = 1;
  // Labels reported for the image; may be empty.
  repeated string labels = 2;
  // Free-form description of the image.
  string description = 3;
}

// Request message shared by the Complete and StreamComplete RPCs.
message CompletionRequest {
  // Prompt text to complete.
  string prompt = 1;
  // Sampling temperature. Declared `optional`, so an unset value can be
  // distinguished from an explicit 0.0. Valid range is not defined here.
  optional float temperature = 2;
  // Token limit for the completion. Declared `optional`, so an unset value
  // can be distinguished from an explicit 0.
  optional int32 max_tokens = 3;
  // Images accompanying the prompt; may be empty.
  repeated ImageData images = 4;
  // NOTE(review): presumably asks the server to populate `image_analyses`
  // in the response -- confirm against the server implementation.
  bool include_analysis = 5;
}

// Response message of the non-streaming Complete RPC.
message CompletionResponse {
  // Identifier of this completion.
  string id = 1;
  // Completion text.
  string text = 2;
  // Reason the completion finished. The set of possible values is not
  // defined in this file.
  string finish_reason = 3;
  // Image analysis results; may be empty.
  // NOTE(review): presumably one entry per request image when
  // CompletionRequest.include_analysis is set -- confirm.
  repeated ImageAnalysis image_analyses = 4;
}

// One message in the response stream of the StreamComplete RPC.
message CompletionChunk {
  // Identifier of the completion.
  // NOTE(review): presumably identical across all chunks of one stream --
  // confirm.
  string id = 1;
  // Text delta carried by this chunk.
  string delta = 2;
  // Whether this is the final chunk.
  bool is_final = 3;
  // Reason the completion finished. The set of possible values is not
  // defined in this file.
  // NOTE(review): presumably only meaningful when `is_final` is true.
  string finish_reason = 4;
  // Image analysis results; may be empty.
  repeated ImageAnalysis image_analyses = 5;
}

// Request message of the Health RPC. Currently carries no fields.
message HealthRequest {}

// Response message of the Health RPC.
message HealthResponse {
  // Whether the service reports itself as healthy.
  bool healthy = 1;
  // Status of the model as a string. The set of possible values is not
  // defined in this file.
  string model_status = 2;
}

// ============ TEXT-TO-SPEECH ============

// Audio container/encoding formats.
// Used by TextToSpeechRequest.output_format and TextToSpeechResponse.format.
enum AudioFormat {
  // Zero value; this is what a reader sees when the field was never set.
  AUDIO_FORMAT_UNSPECIFIED = 0;
  // WAV audio.
  AUDIO_FORMAT_WAV = 1;
  // MP3 audio.
  AUDIO_FORMAT_MP3 = 2;
}

// Voice configuration for text-to-speech.
// Carried in TextToSpeechRequest.voice_config. The numeric fields are
// declared `optional`, so unset can be distinguished from an explicit value.
message VoiceConfig {
  // Identifier of the voice to use.
  // NOTE(review): presumably one of the VoiceInfo.identifier values returned
  // by ListVoices -- confirm.
  string voice_identifier = 1;
  // Speaking rate; range and default are given in the trailing comment.
  optional float speaking_rate = 2;    // 0.0-1.0, default 0.5
  // Pitch multiplier; range and default are given in the trailing comment.
  optional float pitch_multiplier = 3; // 0.5-2.0, default 1.0
  // Output volume; range and default are given in the trailing comment.
  optional float volume = 4;           // 0.0-1.0, default 1.0
}

// Request message of the TextToSpeech RPC.
message TextToSpeechRequest {
  // Text to synthesize.
  string text = 1;
  // Requested audio format of the synthesized output. Reads as
  // AUDIO_FORMAT_UNSPECIFIED when not set; how the server treats that value
  // is not defined in this file.
  AudioFormat output_format = 2;
  // Voice settings for the synthesis; may be left unset.
  optional VoiceConfig voice_config = 3;
}

// Response message of the TextToSpeech RPC. The complete audio is returned
// in a single message (the RPC is unary).
message TextToSpeechResponse {
  // Synthesized audio bytes, in the format given by `format`.
  bytes audio_data = 1;
  // Format of `audio_data`.
  AudioFormat format = 2;
  // Sample rate of the audio.
  // NOTE(review): presumably in Hz -- confirm.
  int32 sample_rate = 3;
  // Number of audio channels.
  int32 channels = 4;
  // Duration of the audio, in seconds.
  float duration_seconds = 5;
}

// Request message of the ListVoices RPC.
message ListVoicesRequest {
  // Language code to list voices for; may be left unset.
  // NOTE(review): presumably unset means "all languages", and the code
  // format (e.g. BCP 47) is not defined in this file -- confirm both.
  optional string language_code = 1;
}

// Description of one available voice.
// Returned in ListVoicesResponse.voices.
message VoiceInfo {
  // Identifier of the voice.
  // NOTE(review): presumably the value to pass as
  // VoiceConfig.voice_identifier -- confirm.
  string identifier = 1;
  // Name of the voice.
  string name = 2;
  // Language of the voice. The code format is not defined in this file.
  string language = 3;
  // Whether the voice is a premium voice.
  bool is_premium = 4;
  // Gender of the voice as a string. The set of possible values is not
  // defined in this file.
  string gender = 5;
}

// Response message of the ListVoices RPC.
message ListVoicesResponse {
  // Available voices; may be empty.
  repeated VoiceInfo voices = 1;
}

// ============ SPEECH-TO-TEXT ============

// Speech-to-text configuration.
// Used by TranscribeRequest.config and as the `config` member of
// StreamingTranscribeRequest. All fields are declared `optional`, so unset
// can be distinguished from an explicit value.
message TranscriptionConfig {
  // Language code of the audio; may be left unset.
  // NOTE(review): presumably unset lets the server detect the language (see
  // TranscribeResponse.detected_language) -- confirm.
  optional string language_code = 1;
  // Whether to include punctuation in the transcript.
  optional bool enable_punctuation = 2;  // default true
  // Whether to include timing information in the transcript.
  optional bool enable_timestamps = 3;   // default false
}

// Audio payload for speech-to-text.
// Carried in TranscribeRequest.audio.
message AudioInput {
  // Audio bytes.
  bytes data = 1;
  // MIME type describing `data`; example values are in the trailing comment.
  string mime_type = 2;           // "audio/wav", "audio/mp3", "audio/m4a"
  // Sample rate of the audio; may be left unset.
  // NOTE(review): presumably in Hz -- confirm.
  optional int32 sample_rate = 3;
  // Number of audio channels; may be left unset.
  optional int32 channels = 4;
}

// Request message of the unary Transcribe RPC: the whole audio payload is
// sent in one message.
message TranscribeRequest {
  // Audio to transcribe.
  AudioInput audio = 1;
  // Transcription settings; may be left unset.
  optional TranscriptionConfig config = 2;
}

// A portion of a transcript together with its timing.
// Returned in TranscribeResponse.segments and
// StreamingTranscribeResponse.segments.
message TranscriptionSegment {
  // Transcribed text of this segment.
  string text = 1;
  // Start time of the segment.
  // NOTE(review): presumably seconds from the start of the audio -- confirm.
  float start_time = 2;
  // End time of the segment.
  // NOTE(review): presumably seconds from the start of the audio -- confirm.
  float end_time = 3;
  // Confidence value for this segment. The value range is not defined in
  // this file.
  float confidence = 4;
}

// Response message of the unary Transcribe RPC.
message TranscribeResponse {
  // Transcribed text.
  string text = 1;
  // Transcript segments with timing; may be empty.
  // NOTE(review): presumably populated only when
  // TranscriptionConfig.enable_timestamps is set -- confirm.
  repeated TranscriptionSegment segments = 2;
  // Language detected in the audio. The code format is not defined in this
  // file.
  string detected_language = 3;
  // Confidence value for the transcript. The value range is not defined in
  // this file.
  float confidence = 4;
}

// One message in the request stream of the StreamTranscribe RPC.
// Each message carries at most one member of the `request` oneof: the
// configuration or a chunk of audio.
message StreamingTranscribeRequest {
  oneof request {
    // Transcription settings for the stream.
    TranscriptionConfig config = 1;  // Send first to configure
    // A chunk of audio bytes. This message carries no format metadata
    // (MIME type, sample rate, channels) alongside the chunk.
    // NOTE(review): the expected audio encoding is not defined in this
    // file -- confirm against the server implementation.
    bytes audio_chunk = 2;           // Subsequent audio chunks
  }
}

// One message in the response stream of the StreamTranscribe RPC.
message StreamingTranscribeResponse {
  // Partial transcript text.
  // NOTE(review): presumably an interim result that later messages may
  // revise -- confirm.
  string partial_text = 1;
  // Whether this message is a final result.
  bool is_final = 2;
  // Final transcript text.
  // NOTE(review): presumably only populated when `is_final` is true --
  // confirm.
  string final_text = 3;
  // Transcript segments with timing; may be empty.
  repeated TranscriptionSegment segments = 4;
}

// Apple Intelligence gRPC service: text completion, health check,
// text-to-speech and speech-to-text.
service AppleIntelligenceService {
  // Unary completion: one CompletionRequest in, one CompletionResponse out.
  rpc Complete(CompletionRequest) returns (CompletionResponse);

  // Server-streaming completion: one CompletionRequest in, a stream of
  // CompletionChunk messages out.
  rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);

  // Health check: returns the service's health and model status.
  rpc Health(HealthRequest) returns (HealthResponse);

  // Text-to-Speech
  // Unary synthesis: the full audio is returned in one TextToSpeechResponse.
  rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);
  // Lists the voices available for synthesis.
  rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);

  // Speech-to-Text
  // Unary transcription: the full audio is sent in one TranscribeRequest.
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);
  // Bidirectional-streaming transcription: the client streams
  // StreamingTranscribeRequest messages (configuration first, then audio
  // chunks) and the server streams StreamingTranscribeResponse messages.
  rpc StreamTranscribe(stream StreamingTranscribeRequest) returns (stream StreamingTranscribeResponse);
}