swift-apple-intelligence-grpc/Proto/apple_intelligence.proto
Mathias Beaulieu-Duncan b754945923 Add Text-to-Speech and Speech-to-Text features
- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 02:57:30 -05:00

180 lines
4.1 KiB
Protocol Buffer

// Schema for the Apple Intelligence gRPC API, written in proto3.
syntax = "proto3";
// Package that scopes every message, enum, and service in this file.
package appleintelligence;
// An image attached to a completion request for vision input.
message ImageData {
  // Image bytes; `mime_type` describes their encoding.
  bytes data = 1;

  // File name associated with the image.
  string filename = 2;

  // MIME type of `data`.
  string mime_type = 3;
}
// Result of vision analysis for an image.
message ImageAnalysis {
  // Text content reported for the image.
  // NOTE(review): presumably text recognized inside the image - confirm
  // against the server implementation.
  string text_content = 1;

  // Labels reported for the image.
  repeated string labels = 2;

  // Description of the image.
  string description = 3;
}
// Input to the Complete and StreamComplete RPCs.
message CompletionRequest {
  // Prompt text to complete.
  string prompt = 1;

  // Temperature for generation. Declared `optional`, so an unset value can
  // be told apart from an explicit 0.
  optional float temperature = 2;

  // Token limit for generation. Declared `optional`, so an unset value can
  // be told apart from an explicit 0.
  optional int32 max_tokens = 3;

  // Images sent along with the prompt.
  repeated ImageData images = 4;

  // Whether image analysis should be included.
  // NOTE(review): presumably controls whether `image_analyses` is filled
  // in on the response - confirm against the server implementation.
  bool include_analysis = 5;
}
// Full result of the non-streaming Complete RPC.
message CompletionResponse {
  // Identifier of this completion.
  string id = 1;

  // Completion text.
  string text = 2;

  // Reason generation finished. The set of possible values is not defined
  // in this schema.
  string finish_reason = 3;

  // Vision analysis results.
  // NOTE(review): presumably one entry per request image - confirm.
  repeated ImageAnalysis image_analyses = 4;
}
// One element of the stream returned by the StreamComplete RPC.
message CompletionChunk {
  // Identifier of the completion this chunk belongs to.
  string id = 1;

  // Text carried by this chunk.
  string delta = 2;

  // Marks the final chunk.
  bool is_final = 3;

  // Reason generation finished. The set of possible values is not defined
  // in this schema.
  // NOTE(review): presumably only meaningful on the final chunk - confirm.
  string finish_reason = 4;

  // Vision analysis results.
  repeated ImageAnalysis image_analyses = 5;
}
// Input to the Health RPC. Currently carries no fields.
message HealthRequest {}
// Result of the Health RPC.
message HealthResponse {
  // Whether the service reports itself healthy.
  bool healthy = 1;

  // Status of the model as free-form text; the format is not defined in
  // this schema.
  string model_status = 2;
}
// ============ TEXT-TO-SPEECH ============
// Audio encodings used for text-to-speech output.
enum AudioFormat {
  // Zero value; no format was specified.
  AUDIO_FORMAT_UNSPECIFIED = 0;

  // WAV audio.
  AUDIO_FORMAT_WAV = 1;

  // MP3 audio.
  AUDIO_FORMAT_MP3 = 2;
}
// Voice settings applied to a text-to-speech request.
message VoiceConfig {
  // Identifier of the voice to use.
  string voice_identifier = 1;

  // Speaking rate. Range 0.0-1.0, default 0.5.
  optional float speaking_rate = 2;

  // Pitch multiplier. Range 0.5-2.0, default 1.0.
  optional float pitch_multiplier = 3;

  // Volume. Range 0.0-1.0, default 1.0.
  optional float volume = 4;
}
// Input to the TextToSpeech RPC.
message TextToSpeechRequest {
  // Text to convert to speech.
  string text = 1;

  // Requested audio format for the output.
  AudioFormat output_format = 2;

  // Voice settings for this request; may be left unset.
  optional VoiceConfig voice_config = 3;
}
// Result of the TextToSpeech RPC.
message TextToSpeechResponse {
  // Audio bytes, encoded as indicated by `format`.
  bytes audio_data = 1;

  // Format of `audio_data`.
  AudioFormat format = 2;

  // Sample rate of the audio.
  // NOTE(review): presumably in Hz - confirm.
  int32 sample_rate = 3;

  // Number of audio channels.
  int32 channels = 4;

  // Length of the audio in seconds.
  float duration_seconds = 5;
}
// Input to the ListVoices RPC.
message ListVoicesRequest {
  // Language code for the listing; may be left unset.
  // NOTE(review): presumably restricts the result to voices for that
  // language - confirm against the server implementation.
  optional string language_code = 1;
}
// Describes one available voice.
message VoiceInfo {
  // Identifier of the voice.
  // NOTE(review): presumably the value to pass as
  // VoiceConfig.voice_identifier - confirm.
  string identifier = 1;

  // Name of the voice.
  string name = 2;

  // Language of the voice; the code format is not defined in this schema.
  string language = 3;

  // Whether the voice is a premium voice.
  bool is_premium = 4;

  // Gender of the voice as free-form text; the set of possible values is
  // not defined in this schema.
  string gender = 5;
}
// Result of the ListVoices RPC.
message ListVoicesResponse {
  // The voices being listed.
  repeated VoiceInfo voices = 1;
}
// ============ SPEECH-TO-TEXT ============
// Speech-to-text settings, used by both the file-based and the streaming
// transcription requests.
message TranscriptionConfig {
  // Language code for transcription; may be left unset.
  optional string language_code = 1;

  // Enables punctuation. Default: true.
  optional bool enable_punctuation = 2;

  // Enables timestamps. Default: false.
  optional bool enable_timestamps = 3;
}
// Audio payload submitted for speech-to-text.
message AudioInput {
  // Audio bytes; `mime_type` describes their encoding.
  bytes data = 1;

  // MIME type of `data`, e.g. "audio/wav", "audio/mp3", "audio/m4a".
  string mime_type = 2;

  // Sample rate of the audio; may be left unset.
  // NOTE(review): presumably in Hz - confirm.
  optional int32 sample_rate = 3;

  // Number of audio channels; may be left unset.
  optional int32 channels = 4;
}
// Input to the Transcribe RPC (file-based transcription).
message TranscribeRequest {
  // Audio to transcribe.
  AudioInput audio = 1;

  // Transcription settings; may be left unset.
  optional TranscriptionConfig config = 2;
}
// A piece of transcribed text together with its timing.
message TranscriptionSegment {
  // Text of this segment.
  string text = 1;

  // Start time of the segment.
  // NOTE(review): units are not stated; presumably seconds from the start
  // of the audio - confirm.
  float start_time = 2;

  // End time of the segment, in the same units as `start_time`.
  float end_time = 3;

  // Confidence for this segment; the value range is not defined in this
  // schema.
  float confidence = 4;
}
// Result of the Transcribe RPC.
message TranscribeResponse {
  // Transcribed text.
  string text = 1;

  // Segments of the transcription with timing.
  repeated TranscriptionSegment segments = 2;

  // Language detected in the audio; the code format is not defined in
  // this schema.
  string detected_language = 3;

  // Confidence for the transcription; the value range is not defined in
  // this schema.
  float confidence = 4;
}
// One element of the client stream sent to the StreamTranscribe RPC.
// Each message carries either the configuration or a chunk of audio,
// never both.
message StreamingTranscribeRequest {
  oneof request {
    // Transcription settings. Send this first to configure the stream.
    TranscriptionConfig config = 1;

    // Audio bytes. Sent in the messages that follow the configuration.
    bytes audio_chunk = 2;
  }
}
// One element of the server stream returned by the StreamTranscribe RPC.
message StreamingTranscribeResponse {
  // Partial transcription text.
  string partial_text = 1;

  // Marks a final result.
  bool is_final = 2;

  // Final transcription text.
  // NOTE(review): presumably only populated when `is_final` is true -
  // confirm against the server implementation.
  string final_text = 3;

  // Segments of the transcription with timing.
  repeated TranscriptionSegment segments = 4;
}
// gRPC service exposing text completion, health checking, text-to-speech,
// and speech-to-text.
service AppleIntelligenceService {
  // Unary completion: one request, one complete response.
  rpc Complete(CompletionRequest) returns (CompletionResponse);

  // Server-streaming completion: one request, a stream of chunks.
  rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);

  // Health check.
  rpc Health(HealthRequest) returns (HealthResponse);

  // Text-to-speech: converts text to audio in a single response.
  rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);

  // Text-to-speech: lists available voices.
  rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);

  // Speech-to-text: transcribes audio supplied in a single request.
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);

  // Speech-to-text: bidirectional streaming transcription. The client
  // streams requests and the server streams responses.
  rpc StreamTranscribe(stream StreamingTranscribeRequest)
      returns (stream StreamingTranscribeResponse);
}