- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
180 lines
4.1 KiB
Protocol Buffer
180 lines
4.1 KiB
Protocol Buffer
// This schema uses proto3 syntax.
syntax = "proto3";

// Package that scopes every message, enum and service in this file.
package appleintelligence;
|
|
|
|
// Image data attached to a vision request.
message ImageData {
  // Image bytes.
  bytes data = 1;

  // File name associated with the image.
  string filename = 2;

  // MIME type of the image.
  // NOTE(review): the accepted MIME types are not listed here — confirm
  // with the server implementation.
  string mime_type = 3;
}
|
|
|
|
// Vision analysis results.
// NOTE(review): presumably one ImageAnalysis per submitted image — confirm.
message ImageAnalysis {
  // Text content reported by the analysis.
  // NOTE(review): presumably text recognized in the image — confirm.
  string text_content = 1;

  // Labels reported by the analysis.
  repeated string labels = 2;

  // Description reported by the analysis.
  string description = 3;
}
|
|
|
|
// Request for a completion. Used by both the single-response and the
// streaming completion RPCs.
message CompletionRequest {
  // Prompt text.
  string prompt = 1;

  // Sampling temperature. Declared `optional` so that an unset value can be
  // told apart from an explicit 0.
  optional float temperature = 2;

  // Token limit. Declared `optional` so that an unset value can be told
  // apart from an explicit 0.
  optional int32 max_tokens = 3;

  // Images that accompany the prompt.
  repeated ImageData images = 4;

  // NOTE(review): presumably asks the server to return per-image analysis
  // (`image_analyses`) alongside the completion — confirm.
  bool include_analysis = 5;
}
|
|
|
|
// Response for a non-streaming completion.
message CompletionResponse {
  // Identifier of this completion.
  string id = 1;

  // Completion text.
  string text = 2;

  // Reason the completion finished.
  // NOTE(review): the set of possible values is not defined in this file.
  string finish_reason = 3;

  // Vision analysis results for the request's images.
  repeated ImageAnalysis image_analyses = 4;
}
|
|
|
|
// One chunk of a streaming completion.
message CompletionChunk {
  // Identifier of the completion.
  string id = 1;

  // Text delta carried by this chunk.
  // NOTE(review): presumably the text to append to what was already
  // received — confirm.
  string delta = 2;

  // Whether this is the final chunk.
  bool is_final = 3;

  // Reason the completion finished.
  // NOTE(review): presumably only meaningful when `is_final` is true —
  // confirm.
  string finish_reason = 4;

  // Vision analysis results for the request's images.
  repeated ImageAnalysis image_analyses = 5;
}
|
|
|
|
// Request for the health check RPC. Currently carries no fields.
message HealthRequest {
}
|
|
|
|
// Response for the health check RPC.
message HealthResponse {
  // Whether the service reports itself as healthy.
  bool healthy = 1;

  // Status of the model, as a string.
  // NOTE(review): the possible values are not defined in this file.
  string model_status = 2;
}
|
|
|
|
// ============ TEXT-TO-SPEECH ============
|
|
|
|
// Audio formats referenced by the text-to-speech messages.
enum AudioFormat {
  // Zero value; no format was specified.
  AUDIO_FORMAT_UNSPECIFIED = 0;

  // WAV audio.
  AUDIO_FORMAT_WAV = 1;

  // MP3 audio.
  AUDIO_FORMAT_MP3 = 2;
}
|
|
|
|
// Voice settings for a text-to-speech request.
message VoiceConfig {
  // Identifier of the voice to use.
  // NOTE(review): presumably matches `VoiceInfo.identifier` — confirm.
  string voice_identifier = 1;

  // Speaking rate. Range 0.0-1.0, default 0.5.
  optional float speaking_rate = 2;

  // Pitch multiplier. Range 0.5-2.0, default 1.0.
  optional float pitch_multiplier = 3;

  // Volume. Range 0.0-1.0, default 1.0.
  optional float volume = 4;
}
|
|
|
|
// Text-to-speech request.
message TextToSpeechRequest {
  // Text to speak.
  string text = 1;

  // Requested format of the output audio.
  // NOTE(review): behaviour for AUDIO_FORMAT_UNSPECIFIED is not defined in
  // this file — confirm with the server.
  AudioFormat output_format = 2;

  // Voice settings for this request; may be left unset.
  optional VoiceConfig voice_config = 3;
}
|
|
|
|
// Text-to-speech response.
message TextToSpeechResponse {
  // Audio bytes.
  bytes audio_data = 1;

  // Format of `audio_data`.
  AudioFormat format = 2;

  // Sample rate of the audio.
  // NOTE(review): presumably in Hz — confirm.
  int32 sample_rate = 3;

  // Number of audio channels.
  int32 channels = 4;

  // Duration of the audio, in seconds.
  float duration_seconds = 5;
}
|
|
|
|
// Request to list the available voices.
message ListVoicesRequest {
  // Language code; may be left unset.
  // NOTE(review): presumably restricts the result to voices for this
  // language, and presumably a tag such as "en-CA" — confirm.
  optional string language_code = 1;
}
|
|
|
|
// Information about one voice.
message VoiceInfo {
  // Identifier of the voice.
  string identifier = 1;

  // Name of the voice.
  string name = 2;

  // Language of the voice.
  // NOTE(review): the format (e.g. language tag vs. display name) is not
  // defined in this file — confirm.
  string language = 3;

  // Whether the voice is a premium voice.
  bool is_premium = 4;

  // Gender of the voice, as a free-form string.
  // NOTE(review): the possible values are not defined in this file.
  string gender = 5;
}
|
|
|
|
// Response listing voices.
message ListVoicesResponse {
  // The voices returned by the server.
  repeated VoiceInfo voices = 1;
}
|
|
|
|
// ============ SPEECH-TO-TEXT ============
|
|
|
|
// Speech-to-text configuration.
message TranscriptionConfig {
  // Language code of the audio; may be left unset.
  // NOTE(review): presumably a tag such as "en-CA" or "fr-CA" — confirm.
  optional string language_code = 1;

  // Enables punctuation. Default true.
  optional bool enable_punctuation = 2;

  // Enables timestamps. Default false.
  optional bool enable_timestamps = 3;
}
|
|
|
|
// Audio payload for speech-to-text.
message AudioInput {
  // Audio bytes.
  bytes data = 1;

  // MIME type of `data`: "audio/wav", "audio/mp3", "audio/m4a".
  string mime_type = 2;

  // Sample rate of the audio; may be left unset.
  // NOTE(review): presumably in Hz — confirm.
  optional int32 sample_rate = 3;

  // Number of audio channels; may be left unset.
  optional int32 channels = 4;
}
|
|
|
|
// Request for file-based (non-streaming) transcription.
message TranscribeRequest {
  // Audio to transcribe.
  AudioInput audio = 1;

  // Transcription settings; may be left unset.
  optional TranscriptionConfig config = 2;
}
|
|
|
|
// A segment of transcribed text together with its timing.
message TranscriptionSegment {
  // Text of the segment.
  string text = 1;

  // Start time of the segment.
  // NOTE(review): unit and reference point are not stated — presumably
  // seconds from the start of the audio; confirm.
  float start_time = 2;

  // End time of the segment, in the same unit as `start_time`.
  float end_time = 3;

  // Confidence for the segment.
  // NOTE(review): the range is not stated — presumably 0.0-1.0; confirm.
  float confidence = 4;
}
|
|
|
|
// Response for file-based transcription.
message TranscribeResponse {
  // Transcribed text.
  string text = 1;

  // Segments of the transcription with timing.
  // NOTE(review): presumably only populated when timestamps are enabled in
  // the request config — confirm.
  repeated TranscriptionSegment segments = 2;

  // Language detected for the audio.
  string detected_language = 3;

  // Confidence for the transcription.
  // NOTE(review): the range is not stated — presumably 0.0-1.0; confirm.
  float confidence = 4;
}
|
|
|
|
// One message on the request stream of streaming speech-to-text. Each
// message carries exactly one of the `request` alternatives.
message StreamingTranscribeRequest {
  oneof request {
    // Configuration. Send this first to configure the stream.
    TranscriptionConfig config = 1;

    // A chunk of audio. Sent in the messages that follow the configuration.
    bytes audio_chunk = 2;
  }
}
|
|
|
|
// One message on the response stream of streaming speech-to-text.
message StreamingTranscribeResponse {
  // Partial transcription text.
  string partial_text = 1;

  // Whether this response is final.
  bool is_final = 2;

  // Final transcription text.
  // NOTE(review): presumably only set when `is_final` is true — confirm.
  string final_text = 3;

  // Segments of the transcription with timing.
  repeated TranscriptionSegment segments = 4;
}
|
|
|
|
// Apple Intelligence service: completion, health check, text-to-speech and
// speech-to-text RPCs.
service AppleIntelligenceService {
  // Single completion: one request, one response.
  rpc Complete(CompletionRequest) returns (CompletionResponse);

  // Streaming completion: one request, a stream of chunks in response.
  rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);

  // Health check.
  rpc Health(HealthRequest) returns (HealthResponse);

  // Text-to-speech: converts the request text to audio.
  rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);

  // Text-to-speech: lists the available voices.
  rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);

  // Speech-to-text: file-based transcription, one request, one response.
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);

  // Speech-to-text: bidirectional streaming transcription.
  rpc StreamTranscribe(stream StreamingTranscribeRequest)
      returns (stream StreamingTranscribeResponse);
}
|