Compare commits

5 Commits

| Author | SHA1 | Date |
|---|---|---|
| | `851d6fef2b` | |
| | `f7b8fbfa36` | |
| | `7655f1f0b8` | |
| | `b754945923` | |
| | `638656e7ca` | |
**Package.resolved**

```diff
@@ -1,13 +1,22 @@
 {
-  "originHash" : "73128af91f020c013de06bf6af5d06131ff05e38285118f5ff904ee06a3a6e24",
+  "originHash" : "1d1344dab64c4f153b2a1af227098e02f62d2c1f627c95dcad4304f1c16a97a3",
   "pins" : [
     {
-      "identity" : "grpc-swift",
+      "identity" : "grpc-swift-2",
       "kind" : "remoteSourceControl",
-      "location" : "https://github.com/grpc/grpc-swift.git",
+      "location" : "https://github.com/grpc/grpc-swift-2.git",
       "state" : {
-        "revision" : "adc18c3e1c55027d0ce43893897ac448e3f27ebe",
-        "version" : "2.2.3"
+        "revision" : "531924b28fde0cf7585123c781c6f55cc35ef7fc",
+        "version" : "2.2.1"
+      }
+    },
+    {
+      "identity" : "grpc-swift-extras",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/grpc/grpc-swift-extras.git",
+      "state" : {
+        "revision" : "7ab4a690ac09696689a9c4b99320af7ef809bb3d",
+        "version" : "2.1.1"
       }
     },
     {
@@ -15,8 +24,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/grpc/grpc-swift-nio-transport.git",
       "state" : {
-        "revision" : "ca2303eb7f3df556beafbba33a143ffa30d5b786",
-        "version" : "1.2.3"
+        "revision" : "dcfa8dc858bba5ded7a3760cede8c5fc03558a42",
+        "version" : "2.4.0"
       }
     },
     {
@@ -24,8 +33,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/grpc/grpc-swift-protobuf.git",
       "state" : {
-        "revision" : "53e89e3a5d417307f70a721c7b83e564fefb1e1c",
-        "version" : "1.3.1"
+        "revision" : "a1aa982cb2a276c72b478433eb75a4ec6508a277",
+        "version" : "2.1.2"
       }
     },
     {
@@ -100,6 +109,15 @@
         "version" : "4.2.0"
       }
     },
+    {
+      "identity" : "swift-distributed-tracing",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-distributed-tracing.git",
+      "state" : {
+        "revision" : "baa932c1336f7894145cbaafcd34ce2dd0b77c97",
+        "version" : "1.3.1"
+      }
+    },
     {
       "identity" : "swift-http-structured-headers",
       "kind" : "remoteSourceControl",
@@ -190,6 +208,15 @@
         "version" : "1.33.3"
       }
     },
+    {
+      "identity" : "swift-service-context",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-service-context.git",
+      "state" : {
+        "revision" : "1983448fefc717a2bc2ebde5490fe99873c5b8a6",
+        "version" : "1.2.1"
+      }
+    },
     {
       "identity" : "swift-service-lifecycle",
       "kind" : "remoteSourceControl",
```
**Package.swift**

```diff
@@ -11,9 +11,10 @@ let package = Package(
         .executable(name: "AppleIntelligenceApp", targets: ["AppleIntelligenceApp"]),
     ],
     dependencies: [
-        .package(url: "https://github.com/grpc/grpc-swift.git", from: "2.0.0"),
+        .package(url: "https://github.com/grpc/grpc-swift-2.git", from: "2.0.0"),
-        .package(url: "https://github.com/grpc/grpc-swift-nio-transport.git", from: "1.0.0"),
+        .package(url: "https://github.com/grpc/grpc-swift-nio-transport.git", from: "2.0.0"),
-        .package(url: "https://github.com/grpc/grpc-swift-protobuf.git", from: "1.0.0"),
+        .package(url: "https://github.com/grpc/grpc-swift-protobuf.git", from: "2.0.0"),
+        .package(url: "https://github.com/grpc/grpc-swift-extras.git", from: "2.0.0"),
         .package(url: "https://github.com/apple/swift-protobuf.git", from: "1.28.0"),
         .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"),
     ],
@@ -22,11 +23,15 @@ let package = Package(
         .target(
             name: "AppleIntelligenceCore",
             dependencies: [
-                .product(name: "GRPCCore", package: "grpc-swift"),
+                .product(name: "GRPCCore", package: "grpc-swift-2"),
                 .product(name: "GRPCNIOTransportHTTP2", package: "grpc-swift-nio-transport"),
                 .product(name: "GRPCProtobuf", package: "grpc-swift-protobuf"),
+                .product(name: "GRPCReflectionService", package: "grpc-swift-extras"),
                 .product(name: "SwiftProtobuf", package: "swift-protobuf"),
             ],
+            resources: [
+                .copy("Resources/apple_intelligence.pb")
+            ],
             swiftSettings: [
                 .unsafeFlags(["-Xfrontend", "-suppress-warnings"])
             ]
```
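The new `resources: [.copy("Resources/apple_intelligence.pb")]` entry bundles the serialized descriptor set that the reflection service loads at runtime (see the `ServerManager` change further down, which calls `AppleIntelligenceResources.descriptorSetURL`). That helper is not part of this diff; a minimal sketch of a plausible implementation would be:

```swift
// Sketch only — the real helper isn't shown in this diff.
import Foundation

enum AppleIntelligenceResources {
    /// URL of the descriptor set bundled by the `resources:` entry above.
    static var descriptorSetURL: URL? {
        Bundle.module.url(forResource: "apple_intelligence", withExtension: "pb")
    }
}
```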
**Proto/apple_intelligence.proto** (new file, 179 lines)

```protobuf
syntax = "proto3";

package appleintelligence;

// Image data for vision requests
message ImageData {
  bytes data = 1;
  string filename = 2;
  string mime_type = 3;
}

// Vision analysis results
message ImageAnalysis {
  string text_content = 1;
  repeated string labels = 2;
  string description = 3;
}

// Completion request
message CompletionRequest {
  string prompt = 1;
  optional float temperature = 2;
  optional int32 max_tokens = 3;
  repeated ImageData images = 4;
  bool include_analysis = 5;
}

// Completion response (non-streaming)
message CompletionResponse {
  string id = 1;
  string text = 2;
  string finish_reason = 3;
  repeated ImageAnalysis image_analyses = 4;
}

// Streaming completion chunk
message CompletionChunk {
  string id = 1;
  string delta = 2;
  bool is_final = 3;
  string finish_reason = 4;
  repeated ImageAnalysis image_analyses = 5;
}

// Health check request
message HealthRequest {}

// Health check response
message HealthResponse {
  bool healthy = 1;
  string model_status = 2;
}

// ============ TEXT-TO-SPEECH ============

// Audio format enumeration
enum AudioFormat {
  AUDIO_FORMAT_UNSPECIFIED = 0;
  AUDIO_FORMAT_WAV = 1;
  AUDIO_FORMAT_MP3 = 2;
}

// Voice configuration for TTS
message VoiceConfig {
  string voice_identifier = 1;
  optional float speaking_rate = 2;    // 0.0-1.0, default 0.5
  optional float pitch_multiplier = 3; // 0.5-2.0, default 1.0
  optional float volume = 4;           // 0.0-1.0, default 1.0
}

// TTS Request
message TextToSpeechRequest {
  string text = 1;
  AudioFormat output_format = 2;
  optional VoiceConfig voice_config = 3;
}

// TTS Response
message TextToSpeechResponse {
  bytes audio_data = 1;
  AudioFormat format = 2;
  int32 sample_rate = 3;
  int32 channels = 4;
  float duration_seconds = 5;
}

// List available voices request
message ListVoicesRequest {
  optional string language_code = 1;
}

// Voice information
message VoiceInfo {
  string identifier = 1;
  string name = 2;
  string language = 3;
  bool is_premium = 4;
  string gender = 5;
}

// List voices response
message ListVoicesResponse {
  repeated VoiceInfo voices = 1;
}

// ============ SPEECH-TO-TEXT ============

// STT Configuration
message TranscriptionConfig {
  optional string language_code = 1;
  optional bool enable_punctuation = 2; // default true
  optional bool enable_timestamps = 3;  // default false
}

// Audio data for STT
message AudioInput {
  bytes data = 1;
  string mime_type = 2; // "audio/wav", "audio/mp3", "audio/m4a"
  optional int32 sample_rate = 3;
  optional int32 channels = 4;
}

// File-based transcription request
message TranscribeRequest {
  AudioInput audio = 1;
  optional TranscriptionConfig config = 2;
}

// Transcription segment with timing
message TranscriptionSegment {
  string text = 1;
  float start_time = 2;
  float end_time = 3;
  float confidence = 4;
}

// Transcription response
message TranscribeResponse {
  string text = 1;
  repeated TranscriptionSegment segments = 2;
  string detected_language = 3;
  float confidence = 4;
}

// Streaming STT request chunk
message StreamingTranscribeRequest {
  oneof request {
    TranscriptionConfig config = 1; // Send first to configure
    bytes audio_chunk = 2;          // Subsequent audio chunks
  }
}

// Streaming STT response
message StreamingTranscribeResponse {
  string partial_text = 1;
  bool is_final = 2;
  string final_text = 3;
  repeated TranscriptionSegment segments = 4;
}

// Apple Intelligence Service
service AppleIntelligenceService {
  // Single completion request
  rpc Complete(CompletionRequest) returns (CompletionResponse);

  // Streaming completion request
  rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);

  // Health check
  rpc Health(HealthRequest) returns (HealthResponse);

  // Text-to-Speech
  rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);
  rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);

  // Speech-to-Text
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);
  rpc StreamTranscribe(stream StreamingTranscribeRequest) returns (stream StreamingTranscribeResponse);
}
```
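SwiftProtobuf generates Swift types for this file prefixed with the proto package name (for example `Appleintelligence_CompletionRequest`); the exact names depend on the generator options, so treat the following as a sketch under that assumption:

```swift
// Sketch: building a vision-enabled completion request with the (assumed) generated types.
let request = Appleintelligence_CompletionRequest.with {
    $0.prompt = "Describe this image"
    $0.temperature = 0.7
    $0.images = [Appleintelligence_ImageData.with {
        $0.data = imageBytes          // raw PNG/JPEG bytes (placeholder)
        $0.filename = "photo.png"
        $0.mimeType = "image/png"
    }]
    $0.includeAnalysis = true
}
```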
**README.md** (174 changed lines)

````diff
@@ -6,8 +6,11 @@ A Swift-based gRPC server that exposes Apple Intelligence (Foundation Models) ov

 - **gRPC API** - Standard gRPC interface accessible from any language
 - **Streaming Support** - Real-time token streaming for responsive UX
+- **Vision Analysis** - Analyze images with text extraction, labeling, and descriptions
+- **Text-to-Speech** - Convert text to audio (WAV/MP3) with multiple voices
+- **Speech-to-Text** - Transcribe audio files or stream audio in real-time
 - **Menu Bar App** - Native macOS app with system tray integration
-- **Built-in Chat UI** - Test the AI directly from the app
+- **Built-in Chat UI** - Test the AI directly from the app with voice input/output
 - **API Key Auth** - Optional bearer token authentication
 - **Auto-Start** - Launch at login and auto-start server options

@@ -45,7 +48,7 @@ swift build -c release --product AppleIntelligenceServer
 1. Launch **Apple Intelligence Server** from Applications
 2. Click the brain icon in the menu bar
 3. Toggle **Start Server** to begin accepting connections
-4. Use **Chat** to test the AI directly
+4. Use **Chat** to test the AI directly (supports voice input/output)
 5. Configure host, port, and API key in **Settings**

 ### CLI Server
@@ -63,10 +66,19 @@ GRPC_HOST=127.0.0.1 GRPC_PORT=8080 API_KEY=secret .build/release/AppleIntelligen
 ### Service Definition

 ```protobuf
-service AppleIntelligence {
+service AppleIntelligenceService {
+  // AI Completion
   rpc Health(HealthRequest) returns (HealthResponse);
   rpc Complete(CompletionRequest) returns (CompletionResponse);
   rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);
+
+  // Text-to-Speech
+  rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);
+  rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);
+
+  // Speech-to-Text
+  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);
+  rpc StreamTranscribe(stream StreamingTranscribeRequest) returns (stream StreamingTranscribeResponse);
 }
 ```

@@ -75,24 +87,134 @@ service AppleIntelligence {
 | Method | Type | Description |
 |--------|------|-------------|
 | `Health` | Unary | Check server and model availability |
-| `Complete` | Unary | Generate complete response |
+| `Complete` | Unary | Generate complete response (supports images) |
 | `StreamComplete` | Server Streaming | Stream tokens as they're generated |
+| `TextToSpeech` | Unary | Convert text to audio |
+| `ListVoices` | Unary | List available TTS voices |
+| `Transcribe` | Unary | Transcribe audio file to text |
+| `StreamTranscribe` | Bidirectional | Real-time audio transcription |
+
+### Vision Support
+
+The `Complete` and `StreamComplete` methods support image analysis:
+
+```protobuf
+message CompletionRequest {
+  string prompt = 1;
+  optional float temperature = 2;
+  optional int32 max_tokens = 3;
+  repeated ImageData images = 4;  // Attach images for analysis
+  bool include_analysis = 5;      // Return detailed analysis
+}
+
+message ImageData {
+  bytes data = 1;
+  string filename = 2;
+  string mime_type = 3;  // image/png, image/jpeg, etc.
+}
+```
+
+**Supported Image Formats:** PNG, JPEG, GIF, WebP, HEIC
+
+### Text-to-Speech
+
+```protobuf
+message TextToSpeechRequest {
+  string text = 1;
+  AudioFormat output_format = 2;  // WAV or MP3
+  optional VoiceConfig voice_config = 3;
+}
+
+message VoiceConfig {
+  string voice_identifier = 1;         // Voice ID from ListVoices
+  optional float speaking_rate = 2;    // 0.0-1.0, default 0.5
+  optional float pitch_multiplier = 3; // 0.5-2.0, default 1.0
+  optional float volume = 4;           // 0.0-1.0, default 1.0
+}
+```
+
+**Output Formats:** WAV, MP3
+
+### Speech-to-Text
+
+#### File-based Transcription
+
+```protobuf
+message TranscribeRequest {
+  AudioInput audio = 1;
+  optional TranscriptionConfig config = 2;
+}
+
+message AudioInput {
+  bytes data = 1;
+  string mime_type = 2;  // audio/wav, audio/mp3, etc.
+  optional int32 sample_rate = 3;
+  optional int32 channels = 4;
+}
+
+message TranscriptionConfig {
+  optional string language_code = 1;  // e.g., "en-US", "fr-CA"
+  optional bool enable_punctuation = 2;
+  optional bool enable_timestamps = 3;
+}
+```
+
+**Supported Audio Formats:** WAV, MP3, M4A, AAC, FLAC
+
+#### Streaming Transcription
+
+For real-time transcription, use bidirectional streaming:
+
+1. Send `TranscriptionConfig` first to configure the session
+2. Send `audio_chunk` messages with PCM audio data (16-bit, 16kHz, mono)
+3. Receive `StreamingTranscribeResponse` with partial and final results
+
+```protobuf
+message StreamingTranscribeRequest {
+  oneof request {
+    TranscriptionConfig config = 1;  // Send first
+    bytes audio_chunk = 2;           // Then audio chunks
+  }
+}
+
+message StreamingTranscribeResponse {
+  string partial_text = 1;
+  bool is_final = 2;
+  string final_text = 3;
+  repeated TranscriptionSegment segments = 4;
+}
+```
+
````
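On the client side the same two-phase pattern applies: send one request carrying the config, then requests carrying raw audio. A minimal sketch, assuming SwiftProtobuf's generated names for this proto package and a placeholder `pcmData` buffer:

```swift
// Sketch only — generated type names and pcmData are assumptions, not part of the diff.
var first = Appleintelligence_StreamingTranscribeRequest()
first.config = Appleintelligence_TranscriptionConfig.with {
    $0.languageCode = "en-US"
    $0.enablePunctuation = true
}

var chunk = Appleintelligence_StreamingTranscribeRequest()
chunk.audioChunk = pcmData  // 16-bit, 16 kHz, mono PCM bytes
```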
````diff
 ### Quick Test with grpcurl

 ```bash
 # Health check
-grpcurl -plaintext localhost:50051 appleintelligence.AppleIntelligence/Health
+grpcurl -plaintext localhost:50051 appleintelligence.AppleIntelligenceService/Health

-# Non-streaming completion
+# Text completion
 grpcurl -plaintext \
   -d '{"prompt": "What is 2 + 2?"}' \
-  localhost:50051 appleintelligence.AppleIntelligence/Complete
+  localhost:50051 appleintelligence.AppleIntelligenceService/Complete

 # Streaming completion
 grpcurl -plaintext \
   -d '{"prompt": "Tell me a short story"}' \
-  localhost:50051 appleintelligence.AppleIntelligence/StreamComplete
+  localhost:50051 appleintelligence.AppleIntelligenceService/StreamComplete
+
+# List TTS voices
+grpcurl -plaintext \
+  -d '{"language_code": "en-US"}' \
+  localhost:50051 appleintelligence.AppleIntelligenceService/ListVoices
+
+# Text-to-Speech (base64 encode the response audio_data)
+grpcurl -plaintext \
+  -d '{"text": "Hello world", "output_format": 1}' \
+  localhost:50051 appleintelligence.AppleIntelligenceService/TextToSpeech
+
+# Transcribe audio file (base64 encode audio data)
+grpcurl -plaintext \
+  -d '{"audio": {"data": "'$(base64 -i audio.wav)'", "mime_type": "audio/wav"}}' \
+  localhost:50051 appleintelligence.AppleIntelligenceService/Transcribe
 ```

 ## Configuration
@@ -103,6 +225,21 @@ grpcurl -plaintext \
 | `GRPC_PORT` | `50051` | Port to listen on |
 | `API_KEY` | *none* | Optional API key for authentication |
+
+## Supported Languages
+
+### Speech Recognition (STT)
+- English (US, CA, GB, AU, IN, IE, ZA)
+- French (CA, FR)
+- Spanish (ES, MX)
+- German, Italian, Portuguese, Japanese, Korean, Chinese
+- And many more via macOS Speech framework
+
+### Text-to-Speech (TTS)
+All voices available in macOS System Settings, including:
+- Premium voices (highest quality, requires download)
+- Enhanced voices (good quality)
+- Default/Compact voices (pre-installed)

 ## Client Libraries

 Connect from any language with gRPC support:
@@ -120,15 +257,21 @@ See [docs/grpc-client-guide.md](docs/grpc-client-guide.md) for detailed examples
 ```
 apple-intelligence-grpc/
 ├── Package.swift
+├── Proto/
+│   └── apple_intelligence.proto       # gRPC service definition
 ├── Sources/
 │   ├── AppleIntelligenceCore/         # Shared gRPC service code
 │   │   ├── Config.swift
 │   │   ├── Services/
-│   │   │   └── AppleIntelligenceService.swift
+│   │   │   ├── AppleIntelligenceService.swift
+│   │   │   ├── TextToSpeechService.swift
+│   │   │   ├── SpeechToTextService.swift
+│   │   │   └── VisionAnalysisService.swift
 │   │   ├── Providers/
 │   │   │   └── AppleIntelligenceProvider.swift
 │   │   └── Generated/
-│   │       └── AppleIntelligence.pb.swift
+│   │       ├── apple_intelligence.pb.swift
+│   │       └── apple_intelligence.grpc.swift
 │   ├── AppleIntelligenceServer/       # CLI executable
 │   │   └── main.swift
 │   └── AppleIntelligenceApp/          # Menu bar app
@@ -182,6 +325,17 @@ See [docs/pipeline-configuration.md](docs/pipeline-configuration.md) for setup i
 - Include the API key in the Authorization header: `Bearer YOUR_API_KEY`
 - Verify the key matches what's configured in Settings
+
+### Speech Recognition Not Working
+
+- Grant microphone permission when prompted
+- Check System Settings → Privacy & Security → Speech Recognition
+- Ensure the language is supported
+
+### TTS Voice Quality
+
+- Download Premium/Enhanced voices from System Settings → Accessibility → Read & Speak
+- Premium voices are larger (~150-500MB) but sound more natural

 ## License

 MIT
````
**AppleIntelligenceApp**

```diff
@@ -34,7 +34,7 @@ struct AppleIntelligenceApp: App {
         .defaultSize(width: 500, height: 600)

         Window("Settings", id: "settings") {
-            SettingsView(settings: settings)
+            SettingsView(settings: settings, serverManager: serverManager)
         }
         .windowResizability(.contentSize)
     }
```
**AppSettings**

```diff
@@ -19,6 +19,10 @@ final class AppSettings {
         didSet { UserDefaults.standard.set(autoStartServer, forKey: "auto_start_server") }
     }

+    var enableReflection: Bool {
+        didSet { UserDefaults.standard.set(enableReflection, forKey: "enable_reflection") }
+    }
+
     var launchAtLogin: Bool {
         didSet {
             do {
@@ -39,6 +43,12 @@ final class AppSettings {
         self.port = savedPort == 0 ? 50051 : savedPort
         self.apiKey = UserDefaults.standard.string(forKey: "api_key") ?? ""
         self.autoStartServer = UserDefaults.standard.bool(forKey: "auto_start_server")
+        // Default to true if not set
+        if UserDefaults.standard.object(forKey: "enable_reflection") == nil {
+            self.enableReflection = true
+        } else {
+            self.enableReflection = UserDefaults.standard.bool(forKey: "enable_reflection")
+        }
         self.launchAtLogin = SMAppService.mainApp.status == .enabled
     }

@@ -47,6 +57,7 @@ final class AppSettings {
         port = 50051
         apiKey = ""
         autoStartServer = false
+        enableReflection = true
         launchAtLogin = false
     }
 }
```
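The nil-check above hand-rolls a "default to true" behavior. An equivalent sketch using `UserDefaults.register(defaults:)` — not what the diff does, just an alternative design — would be:

```swift
// Alternative sketch: register a default so bool(forKey:) reads true
// until the user explicitly changes the setting.
UserDefaults.standard.register(defaults: ["enable_reflection": true])
self.enableReflection = UserDefaults.standard.bool(forKey: "enable_reflection")
```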
**ChatMessage**

```diff
@@ -1,4 +1,62 @@
 import Foundation
+import AppKit
+
+/// Represents an attached image in a chat message
+struct ImageAttachment: Identifiable, Equatable {
+    let id: UUID
+    let data: Data
+    let filename: String?
+    let thumbnail: NSImage?
+    let mimeType: String
+
+    init(data: Data, filename: String? = nil) {
+        self.id = UUID()
+        self.data = data
+        self.filename = filename
+        self.thumbnail = Self.generateThumbnail(from: data)
+        self.mimeType = Self.detectMimeType(from: data)
+    }
+
+    private static func generateThumbnail(from data: Data) -> NSImage? {
+        guard let image = NSImage(data: data) else { return nil }
+
+        let maxSize: CGFloat = 100
+        let ratio = min(maxSize / image.size.width, maxSize / image.size.height, 1.0)
+        let newSize = NSSize(
+            width: image.size.width * ratio,
+            height: image.size.height * ratio
+        )
+
+        let thumbnail = NSImage(size: newSize)
+        thumbnail.lockFocus()
+        image.draw(
+            in: NSRect(origin: .zero, size: newSize),
+            from: NSRect(origin: .zero, size: image.size),
+            operation: .copy,
+            fraction: 1.0
+        )
+        thumbnail.unlockFocus()
+        return thumbnail
+    }
+
+    private static func detectMimeType(from data: Data) -> String {
+        guard data.count >= 4 else { return "application/octet-stream" }
+        let bytes = [UInt8](data.prefix(4))
+
+        if bytes[0] == 0x89 && bytes[1] == 0x50 && bytes[2] == 0x4E && bytes[3] == 0x47 {
+            return "image/png"
+        } else if bytes[0] == 0xFF && bytes[1] == 0xD8 {
+            return "image/jpeg"
+        } else if bytes[0] == 0x47 && bytes[1] == 0x49 && bytes[2] == 0x46 {
+            return "image/gif"
+        }
+        return "image/png" // Default to PNG
+    }
+
+    static func == (lhs: ImageAttachment, rhs: ImageAttachment) -> Bool {
+        lhs.id == rhs.id
+    }
+}
+
 struct ChatMessage: Identifiable, Equatable {
     let id: UUID
@@ -6,17 +64,19 @@ struct ChatMessage: Identifiable, Equatable {
     var content: String
     let timestamp: Date
     var isStreaming: Bool
+    var images: [ImageAttachment]

     enum Role: Equatable {
         case user
         case assistant
     }

-    init(role: Role, content: String, isStreaming: Bool = false) {
+    init(role: Role, content: String, isStreaming: Bool = false, images: [ImageAttachment] = []) {
         self.id = UUID()
         self.role = role
         self.content = content
         self.timestamp = Date()
         self.isStreaming = isStreaming
+        self.images = images
     }
 }
```
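A quick usage sketch of the two types above (the file path is a hypothetical placeholder):

```swift
// Sketch: build a user message carrying one attached image.
if let pngData = try? Data(contentsOf: URL(fileURLWithPath: "/tmp/screenshot.png")) {
    let attachment = ImageAttachment(data: pngData, filename: "screenshot.png")
    let message = ChatMessage(role: .user, content: "What does this screenshot show?", images: [attachment])
    // message.images.first?.thumbnail now holds a <=100pt preview,
    // and mimeType was sniffed from the PNG magic bytes.
    print(message.images.count, attachment.mimeType)
}
```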
**ServerManager**

```diff
@@ -2,6 +2,7 @@ import Foundation
 import AppleIntelligenceCore
 import GRPCCore
 import GRPCNIOTransportHTTP2
+import GRPCReflectionService

 @MainActor
 @Observable
@@ -51,6 +52,7 @@ final class ServerManager {
         let host = settings.host
         let port = settings.port
         let apiKey = settings.apiKey.isEmpty ? nil : settings.apiKey
+        let enableReflection = settings.enableReflection

         serverTask = Task {
             do {
@@ -82,7 +84,16 @@ final class ServerManager {
                     config: .defaults
                 )
-                let server = GRPCServer(transport: transport, services: [provider])
+
+                // Build services list with optional reflection
+                var services: [any RegistrableRPCService] = [provider]
+                if enableReflection {
+                    if let descriptorURL = AppleIntelligenceResources.descriptorSetURL {
+                        let reflectionService = try ReflectionService(descriptorSetFileURLs: [descriptorURL])
+                        services.append(reflectionService)
+                    }
+                }
+
+                let server = GRPCServer(transport: transport, services: services)

                 await MainActor.run {
                     self.state = .running(host: host, port: port)
@@ -113,6 +124,19 @@ final class ServerManager {
         state = .stopped
     }

+    func restart() {
+        guard state.isRunning else { return }
+
+        // Stop the current server
+        stop()
+        state = .starting
+
+        // Start again after a short delay to allow port release
+        DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) { [weak self] in
+            self?.start()
+        }
+    }
+
     func toggle() {
         if state.isRunning {
             stop()
```
**ChatViewModel**

```diff
@@ -1,4 +1,8 @@
 import Foundation
+import AppKit
+import AVFoundation
+import Speech
+import UniformTypeIdentifiers
 import AppleIntelligenceCore

 @MainActor
@@ -9,11 +13,113 @@ final class ChatViewModel {
     var isLoading: Bool = false
     var errorMessage: String?

+    // Image attachment state
+    var pendingImages: [ImageAttachment] = []
+
+    // Voice input/output state
+    var isRecording: Bool = false
+    var isSpeaking: Bool = false
+    var speakingMessageId: UUID?
+    var recordingLevel: Float = 0
+
     private var service: AppleIntelligenceService?
+    private var ttsService: TextToSpeechService?
+    private var sttService: SpeechToTextService?
     private var currentTask: Task<Void, Never>?

+    // Audio recording - multi-language support
+    private var audioEngine: AVAudioEngine?
+    private var speechRecognizers: [String: SFSpeechRecognizer] = [:]
+    private var activeRecognizer: SFSpeechRecognizer?
+    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
+    private var recognitionTask: SFSpeechRecognitionTask?
+
+    // Supported speech recognition languages (Canadian English and French)
+    private static let supportedLocales = ["en-CA", "fr-CA"]
+    var detectedLanguage: String = "en-CA"
+
+    // Audio playback - use direct speech synthesis for reliability
+    private var speechSynthesizer: AVSpeechSynthesizer?
+    private var speechDelegate: SpeechSynthesizerDelegate?
+
+    // Maximum images per message
+    private let maxImagesPerMessage = 5
+
+    // Supported image types
+    static let supportedImageTypes: [UTType] = [.png, .jpeg, .gif, .webP, .heic]
+
+    // Recent images from Downloads and Desktop
+    var recentImages: [URL] = []
+
     func initialize() async {
         service = await AppleIntelligenceService()
+        ttsService = TextToSpeechService()
+        sttService = await SpeechToTextService()
+
+        // Initialize speech recognizers for all supported locales
+        for localeId in Self.supportedLocales {
+            if let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeId)) {
+                speechRecognizers[localeId] = recognizer
+            }
+        }
+
+        // Default to system locale if supported, otherwise en-CA
+        let systemLocale = Locale.current.identifier
+        if speechRecognizers[systemLocale] != nil {
+            detectedLanguage = systemLocale
+        } else if systemLocale.starts(with: "fr") {
+            detectedLanguage = "fr-CA"
+        } else {
+            detectedLanguage = "en-CA"
+        }
+        activeRecognizer = speechRecognizers[detectedLanguage]
+
+        loadRecentImages()
+    }
+
+    // MARK: - Recent Images
+
+    func loadRecentImages() {
+        let fileManager = FileManager.default
+        let homeDir = fileManager.homeDirectoryForCurrentUser
+
+        let folders = [
+            homeDir.appendingPathComponent("Downloads"),
+            homeDir.appendingPathComponent("Desktop")
+        ]
+
+        let imageExtensions = ["png", "jpg", "jpeg", "gif", "webp", "heic", "heif"]
+
+        var allImages: [(url: URL, date: Date)] = []
+
+        for folder in folders {
+            guard let contents = try? fileManager.contentsOfDirectory(
+                at: folder,
+                includingPropertiesForKeys: [.contentModificationDateKey, .isRegularFileKey],
+                options: [.skipsHiddenFiles]
+            ) else { continue }
+
+            for url in contents {
+                let ext = url.pathExtension.lowercased()
+                guard imageExtensions.contains(ext) else { continue }
+
+                if let attributes = try? url.resourceValues(forKeys: [.contentModificationDateKey, .isRegularFileKey]),
+                   attributes.isRegularFile == true,
+                   let modDate = attributes.contentModificationDate {
+                    allImages.append((url: url, date: modDate))
+                }
+            }
+        }
+
+        // Sort by date descending and take last 10
+        recentImages = allImages
+            .sorted { $0.date > $1.date }
+            .prefix(10)
+            .map { $0.url }
+    }
+
+    func addRecentImage(_ url: URL) {
+        addImage(from: url)
     }

     var isServiceAvailable: Bool {
@@ -22,19 +128,77 @@ final class ChatViewModel {
         }
     }

+    var canSend: Bool {
+        !inputText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || !pendingImages.isEmpty
+    }
+
+    // MARK: - Image Handling
+
+    func addImage(from url: URL) {
+        guard pendingImages.count < maxImagesPerMessage else {
+            errorMessage = "Maximum \(maxImagesPerMessage) images per message"
+            return
+        }
+
+        do {
+            let data = try Data(contentsOf: url)
+            let attachment = ImageAttachment(data: data, filename: url.lastPathComponent)
+            pendingImages.append(attachment)
+            errorMessage = nil
+        } catch {
+            errorMessage = "Failed to load image: \(error.localizedDescription)"
+        }
+    }
+
+    func addImageFromPasteboard() {
+        guard let image = NSPasteboard.general.readObjects(
+            forClasses: [NSImage.self],
+            options: nil
+        )?.first as? NSImage else {
+            return
+        }
+
+        guard pendingImages.count < maxImagesPerMessage else {
+            errorMessage = "Maximum \(maxImagesPerMessage) images per message"
+            return
+        }
+
+        if let tiffData = image.tiffRepresentation,
+           let bitmap = NSBitmapImageRep(data: tiffData),
+           let pngData = bitmap.representation(using: .png, properties: [:]) {
+            let attachment = ImageAttachment(data: pngData, filename: "pasted_image.png")
+            pendingImages.append(attachment)
+            errorMessage = nil
+        }
+    }
+
+    func removePendingImage(_ attachment: ImageAttachment) {
+        pendingImages.removeAll { $0.id == attachment.id }
+    }
+
+    func clearPendingImages() {
+        pendingImages.removeAll()
+    }
+
+    // MARK: - Messaging
+
     func sendMessage() {
         let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
-        guard !text.isEmpty else { return }
+        guard !text.isEmpty || !pendingImages.isEmpty else { return }
         guard !isLoading else { return }

-        // Add user message
-        let userMessage = ChatMessage(role: .user, content: text)
+        // Capture images before clearing
+        let imagesToSend = pendingImages
+
+        // Add user message with images
+        let userMessage = ChatMessage(role: .user, content: text, images: imagesToSend)
         messages.append(userMessage)
         inputText = ""
+        pendingImages = []
         errorMessage = nil

         // Add placeholder for assistant response
-        var assistantMessage = ChatMessage(role: .assistant, content: "", isStreaming: true)
+        let assistantMessage = ChatMessage(role: .assistant, content: "", isStreaming: true)
         messages.append(assistantMessage)

         isLoading = true
@@ -45,14 +209,20 @@ final class ChatViewModel {
                     throw AppleIntelligenceError.modelNotAvailable
                 }

+                // Convert attachments to service format
+                let images = imagesToSend.map { attachment in
+                    (data: attachment.data, filename: attachment.filename)
+                }
+
                 let stream = await service.streamComplete(
                     prompt: text,
                     temperature: nil,
-                    maxTokens: nil
+                    maxTokens: nil,
+                    images: images
                 )

                 var fullResponse = ""
-                for try await partialResponse in stream {
+                for try await (partialResponse, _) in stream {
                     fullResponse = partialResponse
                     // Update the last message (assistant's response)
                     if let index = messages.lastIndex(where: { $0.role == .assistant }) {
@@ -93,4 +263,279 @@ final class ChatViewModel {
         messages.removeAll()
         errorMessage = nil
     }
+
+    // MARK: - Voice Input (Speech-to-Text)
+
+    func toggleRecording() {
+        if isRecording {
+            stopRecording()
+        } else {
+            startRecording()
+        }
+    }
+
+    func startRecording() {
+        Task {
+            // Use nonisolated helper to avoid MainActor isolation inheritance in TCC callback
+            let status = await Self.requestSpeechAuthorization()
+
+            guard status == .authorized else {
+                self.errorMessage = "Speech recognition not authorized"
+                return
+            }
+            self.beginRecording()
+        }
+    }
+
+    /// Request speech recognition authorization without MainActor isolation.
+    /// This prevents Swift 6 strict concurrency from asserting MainActor in the TCC callback.
+    private nonisolated static func requestSpeechAuthorization() async -> SFSpeechRecognizerAuthorizationStatus {
+        await withCheckedContinuation { continuation in
+            SFSpeechRecognizer.requestAuthorization { status in
+                continuation.resume(returning: status)
+            }
+        }
+    }
+
+    /// Creates audio tap handler in nonisolated context to avoid MainActor isolation inheritance.
+    /// Audio taps run on CoreAudio's RealtimeMessenger queue, not MainActor.
+    private nonisolated static func createAudioTapHandler(
+        request: SFSpeechAudioBufferRecognitionRequest,
+        levelUpdater: RecordingLevelUpdater
+    ) -> (AVAudioPCMBuffer, AVAudioTime) -> Void {
+        return { buffer, _ in
+            request.append(buffer)
+
+            // Calculate audio level for visual feedback
+            guard let channelData = buffer.floatChannelData else { return }
+            let channelDataValue = channelData.pointee
+            let channelDataValueArray = stride(from: 0, to: Int(buffer.frameLength), by: buffer.stride).map { channelDataValue[$0] }
+            let rms = sqrt(channelDataValueArray.map { $0 * $0 }.reduce(0, +) / Float(buffer.frameLength))
+            let avgPower = 20 * log10(rms)
+            let level = max(0, min(1, (avgPower + 50) / 50))
+
+            levelUpdater.updateLevel(level)
+        }
+    }
+
+    private func beginRecording() {
+        // Try to find an available recognizer
+        let recognizer = activeRecognizer ?? speechRecognizers.values.first { $0.isAvailable }
+        guard let speechRecognizer = recognizer, speechRecognizer.isAvailable else {
+            errorMessage = "Speech recognition not available"
+            return
+        }
+
+        // Stop any existing recording
+        if audioEngine != nil {
+            stopRecording()
+        }
+
+        audioEngine = AVAudioEngine()
+        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
+
+        guard let audioEngine = audioEngine,
+              let recognitionRequest = recognitionRequest else {
+            errorMessage = "Failed to initialize audio engine"
+            return
+        }
+
+        recognitionRequest.shouldReportPartialResults = true
+
+        // Enable automatic language detection if available (macOS 14+)
+        if #available(macOS 14, *) {
+            recognitionRequest.addsPunctuation = true
+        }
+
+        let inputNode = audioEngine.inputNode
+        let recordingFormat = inputNode.outputFormat(forBus: 0)
+
+        // Use nonisolated static function to create audio tap handler
+        // This breaks MainActor isolation inheritance in the closure
+        let levelUpdater = RecordingLevelUpdater(viewModel: self)
+        let audioTapHandler = Self.createAudioTapHandler(request: recognitionRequest, levelUpdater: levelUpdater)
+        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat, block: audioTapHandler)
+
+        audioEngine.prepare()
+
+        do {
+            try audioEngine.start()
+            isRecording = true
+
+            // Use a sendable wrapper for recognition results with language detection
+            let resultHandler = RecognitionResultHandler(viewModel: self)
+
+            recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
+                resultHandler.handleResult(result: result, error: error)
+            }
+        } catch {
+            errorMessage = "Failed to start recording: \(error.localizedDescription)"
+            cleanupRecording()
+        }
+    }
+
+    /// Switch to a different language for speech recognition
+    func switchLanguage(to localeId: String) {
+        guard let recognizer = speechRecognizers[localeId] else { return }
+        activeRecognizer = recognizer
+        detectedLanguage = localeId
+    }
+
+    /// Get available languages for speech recognition
+    var availableLanguages: [(id: String, name: String)] {
+        speechRecognizers.keys.sorted().compactMap { localeId in
+            let locale = Locale(identifier: localeId)
+            let name = locale.localizedString(forIdentifier: localeId) ?? localeId
+            return (id: localeId, name: name)
+        }
+    }
+
+    func stopRecording() {
+        recognitionRequest?.endAudio()
+        cleanupRecording()
+    }
+
+    fileprivate func cleanupRecording() {
+        audioEngine?.stop()
+        audioEngine?.inputNode.removeTap(onBus: 0)
+        audioEngine = nil
+        recognitionRequest = nil
+        recognitionTask?.cancel()
+        recognitionTask = nil
+        isRecording = false
+        recordingLevel = 0
+    }
+
+    // MARK: - Voice Output (Text-to-Speech)
+
+    func speakMessage(_ message: ChatMessage) {
+        guard !message.content.isEmpty else { return }
+
+        // If already speaking this message, stop
+        if isSpeaking && speakingMessageId == message.id {
+            stopSpeaking()
+            return
+        }
+
+        // Stop any current speech
+        stopSpeaking()
+
+        speakingMessageId = message.id
+        isSpeaking = true
+
+        // Create utterance
+        let utterance = AVSpeechUtterance(string: message.content)
+        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
+        utterance.pitchMultiplier = 1.0
+        utterance.volume = 1.0
+
+        // Detect message language and use appropriate voice
+        let isFrench = Self.detectFrench(message.content)
+        let language = isFrench ? "fr-CA" : "en-US"
+        utterance.voice = AVSpeechSynthesisVoice(language: language)
+
+        // Create synthesizer and delegate
+        let synthesizer = AVSpeechSynthesizer()
+        speechDelegate = SpeechSynthesizerDelegate { [weak self] in
+            Task { @MainActor in
+                self?.isSpeaking = false
+                self?.speakingMessageId = nil
+                self?.speechDelegate = nil
+                self?.speechSynthesizer = nil
+            }
+        }
+        synthesizer.delegate = speechDelegate
+        speechSynthesizer = synthesizer
+
+        // Speak directly
+        synthesizer.speak(utterance)
+    }
+
+    func stopSpeaking() {
+        speechSynthesizer?.stopSpeaking(at: .immediate)
+        speechSynthesizer = nil
+        speechDelegate = nil
+        isSpeaking = false
+        speakingMessageId = nil
+    }
+
+    /// Detect if text is likely French based on common words
+    private static func detectFrench(_ text: String) -> Bool {
+        let lowercased = text.lowercased()
+        let frenchIndicators = [
+            " le ", " la ", " les ", " un ", " une ", " des ",
+            " je ", " tu ", " il ", " elle ", " nous ", " vous ", " ils ", " elles ",
+            " est ", " sont ", " avoir ", " être ", " fait ", " faire ",
+            " que ", " qui ", " quoi ", " dans ", " pour ", " avec ", " sur ",
+            " ce ", " cette ", " ces ", " mon ", " ma ", " mes ",
+            " pas ", " plus ", " très ", " bien ", " aussi ",
+            "bonjour", "merci", "salut", "oui", "non", "peut",
+            " et ", " ou ", " mais ", " donc ", " car ",
+            "c'est", "j'ai", "qu'est", "n'est", "d'un", "l'on"
+        ]
+
+        let frenchCount = frenchIndicators.filter { lowercased.contains($0) }.count
+        return frenchCount >= 2
+    }
+}
+
+// MARK: - Speech Synthesizer Delegate
+
+private final class SpeechSynthesizerDelegate: NSObject, AVSpeechSynthesizerDelegate, @unchecked Sendable {
+    let onFinish: () -> Void
+
+    init(onFinish: @escaping () -> Void) {
+        self.onFinish = onFinish
+    }
+
+    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
+        onFinish()
+    }
+
+    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
+        onFinish()
+    }
+}
+
+// MARK: - Sendable Wrappers for Audio Callbacks
+
+/// Wrapper to safely update recording level from audio callback thread
+private final class RecordingLevelUpdater: @unchecked Sendable {
+    private weak var viewModel: ChatViewModel?
+
+    init(viewModel: ChatViewModel) {
+        self.viewModel = viewModel
+    }
+
+    func updateLevel(_ level: Float) {
+        Task { @MainActor [weak viewModel] in
+            viewModel?.recordingLevel = level
+        }
+    }
+}
+
+/// Wrapper to safely handle recognition results from Speech framework callback
+private final class RecognitionResultHandler: @unchecked Sendable {
+    private weak var viewModel: ChatViewModel?
+
+    init(viewModel: ChatViewModel) {
+        self.viewModel = viewModel
+    }
+
+    func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
+        // Extract data before crossing actor boundary (SFSpeechRecognitionResult is not Sendable)
+        let transcription = result?.bestTranscription.formattedString
+        let isFinal = result?.isFinal ?? false
+        let hasError = error != nil
+
+        Task { @MainActor [weak viewModel] in
+            if let transcription = transcription {
+                viewModel?.inputText = transcription
+            }
+
+            if hasError || isFinal {
+                viewModel?.cleanupRecording()
+            }
+        }
+    }
 }
```
**ChatView**

```diff
@@ -1,17 +1,33 @@
 import SwiftUI
+import UniformTypeIdentifiers
+
 struct ChatView: View {
     @Bindable var viewModel: ChatViewModel
     @FocusState private var isInputFocused: Bool
+    @State private var isShowingFilePicker = false
+    @State private var isDragOver = false
+    @State private var previewImageURL: URL?

     var body: some View {
+        HStack(spacing: 0) {
+            // Recent images sidebar
+            if !viewModel.recentImages.isEmpty {
+                recentImagesSidebar
+                Divider()
+            }
+
+            // Main chat area
             VStack(spacing: 0) {
                 // Messages list
                 ScrollViewReader { proxy in
                     ScrollView {
                         LazyVStack(spacing: 12) {
                             ForEach(viewModel.messages) { message in
-                                MessageBubble(message: message)
+                                MessageBubble(
+                                    message: message,
+                                    isSpeaking: viewModel.speakingMessageId == message.id,
+                                    onSpeak: { viewModel.speakMessage(message) }
+                                )
                                     .id(message.id)
                             }
                         }
@@ -55,14 +71,260 @@ struct ChatView: View {

                 Divider()

+                // Pending images preview
+                if !viewModel.pendingImages.isEmpty {
+                    pendingImagesView
+                }
+
                 // Input area
-                HStack(spacing: 12) {
+                inputArea
+            }
+            .onDrop(of: [.fileURL, .image], isTargeted: $isDragOver) { providers in
+                handleDrop(providers: providers)
+                return true
+            }
+            .overlay {
+                if isDragOver {
+                    RoundedRectangle(cornerRadius: 8)
+                        .stroke(Color.accentColor, lineWidth: 3)
+                        .background(Color.accentColor.opacity(0.1))
+                        .padding(4)
+                }
+            }
+        }
+        .frame(minWidth: 500, minHeight: 500)
+        .toolbar {
+            ToolbarItem(placement: .primaryAction) {
+                Button {
+                    viewModel.loadRecentImages()
+                } label: {
+                    Image(systemName: "arrow.clockwise")
+                }
+                .help("Refresh recent images")
+            }
+            ToolbarItem(placement: .primaryAction) {
+                Button {
+                    viewModel.clearChat()
+                } label: {
+                    Image(systemName: "trash")
+                }
+                .help("Clear chat")
+                .disabled(viewModel.messages.isEmpty)
+            }
+        }
+        .task {
+            await viewModel.initialize()
+        }
+        .onAppear {
+            NSApp.setActivationPolicy(.regular)
+            NSApp.activate(ignoringOtherApps: true)
+
+            DispatchQueue.main.asyncAfter(deadline: .now() + 0.2) {
+                if let window = NSApp.windows.first(where: { $0.title == "Chat" }) {
+                    window.makeKeyAndOrderFront(nil)
+                }
+                isInputFocused = true
+            }
+        }
+        .onDisappear {
+            if NSApp.windows.filter({ $0.isVisible && $0.title != "" }).isEmpty {
+                NSApp.setActivationPolicy(.accessory)
+            }
+        }
+        .fileImporter(
+            isPresented: $isShowingFilePicker,
+            allowedContentTypes: ChatViewModel.supportedImageTypes,
+            allowsMultipleSelection: true
+        ) { result in
+            switch result {
+            case .success(let urls):
+                for url in urls {
+                    if url.startAccessingSecurityScopedResource() {
+                        viewModel.addImage(from: url)
+                        url.stopAccessingSecurityScopedResource()
+                    }
+                }
+            case .failure(let error):
+                viewModel.errorMessage = error.localizedDescription
+            }
+        }
+        .sheet(item: $previewImageURL) { url in
+            ImagePreviewSheet(url: url) {
+                viewModel.addRecentImage(url)
+                previewImageURL = nil
+            } onCancel: {
+                previewImageURL = nil
+            }
+        }
+    }
+
+    // MARK: - Drag & Drop Handler
+
+    private func handleDrop(providers: [NSItemProvider]) {
+        for provider in providers {
+            // Try to load as file URL first
+            if provider.hasItemConformingToTypeIdentifier(UTType.fileURL.identifier) {
+                provider.loadItem(forTypeIdentifier: UTType.fileURL.identifier, options: nil) { item, error in
+                    guard error == nil else { return }
+
+                    if let data = item as? Data,
+                       let url = URL(dataRepresentation: data, relativeTo: nil) {
+                        DispatchQueue.main.async {
+                            viewModel.addImage(from: url)
+                        }
+                    } else if let url = item as? URL {
+                        DispatchQueue.main.async {
+                            viewModel.addImage(from: url)
+                        }
+                    }
+                }
+            }
+            // Try to load as image data
+            else if provider.hasItemConformingToTypeIdentifier(UTType.image.identifier) {
+                provider.loadDataRepresentation(forTypeIdentifier: UTType.image.identifier) { data, error in
+                    guard let data = data, error == nil else { return }
+                    DispatchQueue.main.async {
+                        let attachment = ImageAttachment(data: data, filename: "dropped_image.png")
+                        if viewModel.pendingImages.count < 5 {
+                            viewModel.pendingImages.append(attachment)
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // MARK: - Recent Images Sidebar
+
+    private var recentImagesSidebar: some View {
+        VStack(alignment: .leading, spacing: 8) {
+            Text("Recent")
+                .font(.headline)
+                .foregroundStyle(.secondary)
+                .padding(.horizontal, 8)
+                .padding(.top, 8)
+
+            ScrollView {
+                LazyVStack(spacing: 8) {
+                    ForEach(viewModel.recentImages, id: \.self) { url in
+                        RecentImageThumbnail(url: url) {
+                            previewImageURL = url
+                        }
+                    }
+                }
+                .padding(.horizontal, 8)
+                .padding(.bottom, 8)
+            }
+        }
+        .frame(width: 100)
+        .background(Color(nsColor: .controlBackgroundColor).opacity(0.5))
+    }
+
+    // MARK: - Pending Images Preview
+
+    private var pendingImagesView: some View {
+        ScrollView(.horizontal, showsIndicators: false) {
+            HStack(spacing: 8) {
+                ForEach(viewModel.pendingImages) { attachment in
+                    pendingImageThumbnail(attachment)
+                }
+            }
+            .padding(.horizontal)
+            .padding(.vertical, 8)
+        }
+        .background(Color(nsColor: .controlBackgroundColor))
+    }
+
+    private func pendingImageThumbnail(_ attachment: ImageAttachment) -> some View {
+        ZStack(alignment: .topTrailing) {
+            if let thumbnail = attachment.thumbnail {
+                Image(nsImage: thumbnail)
```
|
||||||
|
.resizable()
|
||||||
|
.aspectRatio(contentMode: .fill)
|
||||||
|
.frame(width: 60, height: 60)
|
||||||
|
.clipShape(RoundedRectangle(cornerRadius: 8))
|
||||||
|
} else {
|
||||||
|
RoundedRectangle(cornerRadius: 8)
|
||||||
|
.fill(Color.gray.opacity(0.3))
|
||||||
|
.frame(width: 60, height: 60)
|
||||||
|
.overlay {
|
||||||
|
Image(systemName: "photo")
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Button {
|
||||||
|
viewModel.removePendingImage(attachment)
|
||||||
|
} label: {
|
||||||
|
Image(systemName: "xmark.circle.fill")
|
||||||
|
.font(.system(size: 16))
|
||||||
|
.foregroundStyle(.white)
|
||||||
|
.background(Circle().fill(.black.opacity(0.6)).frame(width: 18, height: 18))
|
||||||
|
}
|
||||||
|
.buttonStyle(.plain)
|
||||||
|
.offset(x: 6, y: -6)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// MARK: - Input Area
|
||||||
|
|
||||||
|
private var inputArea: some View {
|
||||||
|
HStack(spacing: 8) {
|
||||||
|
Button {
|
||||||
|
isShowingFilePicker = true
|
||||||
|
} label: {
|
||||||
|
Image(systemName: "photo.badge.plus")
|
||||||
|
.font(.title3)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
}
|
||||||
|
.buttonStyle(.plain)
|
||||||
|
.help("Add image (or paste with ⌘V)")
|
||||||
|
|
||||||
|
// Language toggle for speech recognition
|
||||||
|
Button {
|
||||||
|
// Toggle between en-CA and fr-CA
|
||||||
|
let newLang = viewModel.detectedLanguage == "en-CA" ? "fr-CA" : "en-CA"
|
||||||
|
viewModel.switchLanguage(to: newLang)
|
||||||
|
} label: {
|
||||||
|
Text(viewModel.detectedLanguage == "fr-CA" ? "FR" : "EN")
|
||||||
|
.font(.caption.bold())
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
.frame(width: 24, height: 24)
|
||||||
|
.background(
|
||||||
|
RoundedRectangle(cornerRadius: 4)
|
||||||
|
.fill(Color.secondary.opacity(0.1))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
.buttonStyle(.plain)
|
||||||
|
.help("Speech language: \(viewModel.detectedLanguage) (click to toggle)")
|
||||||
|
|
||||||
|
// Microphone button for voice input
|
||||||
|
Button {
|
||||||
|
viewModel.toggleRecording()
|
||||||
|
} label: {
|
||||||
|
ZStack {
|
||||||
|
if viewModel.isRecording {
|
||||||
|
// Recording indicator with level
|
||||||
|
Circle()
|
||||||
|
.fill(Color.red.opacity(0.3))
|
||||||
|
.frame(width: 28 + CGFloat(viewModel.recordingLevel) * 10,
|
||||||
|
height: 28 + CGFloat(viewModel.recordingLevel) * 10)
|
||||||
|
.animation(.easeInOut(duration: 0.1), value: viewModel.recordingLevel)
|
||||||
|
}
|
||||||
|
Image(systemName: viewModel.isRecording ? "mic.fill" : "mic")
|
||||||
|
.font(.title3)
|
||||||
|
.foregroundStyle(viewModel.isRecording ? .red : .secondary)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.buttonStyle(.plain)
|
||||||
|
.help(viewModel.isRecording ? "Stop recording" : "Voice input")
|
||||||
|
|
||||||
TextField("Message...", text: $viewModel.inputText, axis: .vertical)
|
TextField("Message...", text: $viewModel.inputText, axis: .vertical)
|
||||||
.textFieldStyle(.plain)
|
.textFieldStyle(.plain)
|
||||||
.lineLimit(1...5)
|
.lineLimit(1...5)
|
||||||
.focused($isInputFocused)
|
.focused($isInputFocused)
|
||||||
.onSubmit {
|
.onSubmit {
|
||||||
if !viewModel.inputText.isEmpty {
|
if viewModel.canSend {
|
||||||
viewModel.sendMessage()
|
viewModel.sendMessage()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -82,53 +344,104 @@ struct ChatView: View {
|
|||||||
} label: {
|
} label: {
|
||||||
Image(systemName: "arrow.up.circle.fill")
|
Image(systemName: "arrow.up.circle.fill")
|
||||||
.font(.title2)
|
.font(.title2)
|
||||||
.foregroundStyle(viewModel.inputText.isEmpty ? .gray : .accentColor)
|
.foregroundStyle(viewModel.canSend ? Color.accentColor : Color.gray)
|
||||||
}
|
}
|
||||||
.buttonStyle(.plain)
|
.buttonStyle(.plain)
|
||||||
.disabled(viewModel.inputText.isEmpty)
|
.disabled(!viewModel.canSend)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.padding()
|
.padding()
|
||||||
}
|
.onPasteCommand(of: [.image, .png, .jpeg, .tiff]) { providers in
|
||||||
.frame(minWidth: 400, minHeight: 500)
|
for provider in providers {
|
||||||
.toolbar {
|
// Try to load as image
|
||||||
ToolbarItem(placement: .primaryAction) {
|
if provider.hasItemConformingToTypeIdentifier(UTType.image.identifier) {
|
||||||
Button {
|
provider.loadDataRepresentation(forTypeIdentifier: UTType.image.identifier) { data, _ in
|
||||||
viewModel.clearChat()
|
if let data = data {
|
||||||
} label: {
|
DispatchQueue.main.async {
|
||||||
Image(systemName: "trash")
|
let attachment = ImageAttachment(data: data, filename: "pasted_image.png")
|
||||||
}
|
if viewModel.pendingImages.count < 5 {
|
||||||
.help("Clear chat")
|
viewModel.pendingImages.append(attachment)
|
||||||
.disabled(viewModel.messages.isEmpty)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.task {
|
|
||||||
await viewModel.initialize()
|
|
||||||
}
|
|
||||||
.onAppear {
|
|
||||||
// Force the app to become active and accept keyboard input
|
|
||||||
NSApp.setActivationPolicy(.regular)
|
|
||||||
NSApp.activate(ignoringOtherApps: true)
|
|
||||||
|
|
||||||
DispatchQueue.main.asyncAfter(deadline: .now() + 0.2) {
|
|
||||||
// Make sure the window is key
|
|
||||||
if let window = NSApp.windows.first(where: { $0.title == "Chat" }) {
|
|
||||||
window.makeKeyAndOrderFront(nil)
|
|
||||||
}
|
|
||||||
isInputFocused = true
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.onDisappear {
|
return
|
||||||
// Return to accessory mode when chat is closed
|
|
||||||
if NSApp.windows.filter({ $0.isVisible && $0.title != "" }).isEmpty {
|
|
||||||
NSApp.setActivationPolicy(.accessory)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Fallback to pasteboard check
|
||||||
|
viewModel.addImageFromPasteboard()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MARK: - Recent Image Thumbnail
|
||||||
|
|
||||||
|
struct RecentImageThumbnail: View {
|
||||||
|
let url: URL
|
||||||
|
let onTap: () -> Void
|
||||||
|
|
||||||
|
@State private var thumbnail: NSImage?
|
||||||
|
|
||||||
|
var body: some View {
|
||||||
|
Button(action: onTap) {
|
||||||
|
ZStack {
|
||||||
|
if let thumbnail = thumbnail {
|
||||||
|
Image(nsImage: thumbnail)
|
||||||
|
.resizable()
|
||||||
|
.aspectRatio(contentMode: .fill)
|
||||||
|
.frame(width: 80, height: 80)
|
||||||
|
.clipShape(RoundedRectangle(cornerRadius: 8))
|
||||||
|
} else {
|
||||||
|
RoundedRectangle(cornerRadius: 8)
|
||||||
|
.fill(Color.gray.opacity(0.3))
|
||||||
|
.frame(width: 80, height: 80)
|
||||||
|
.overlay {
|
||||||
|
ProgressView()
|
||||||
|
.scaleEffect(0.6)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.buttonStyle(.plain)
|
||||||
|
.help(url.lastPathComponent)
|
||||||
|
.task {
|
||||||
|
await loadThumbnail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private func loadThumbnail() async {
|
||||||
|
guard let image = NSImage(contentsOf: url) else { return }
|
||||||
|
|
||||||
|
let maxSize: CGFloat = 80
|
||||||
|
let ratio = min(maxSize / image.size.width, maxSize / image.size.height, 1.0)
|
||||||
|
let newSize = NSSize(
|
||||||
|
width: image.size.width * ratio,
|
||||||
|
height: image.size.height * ratio
|
||||||
|
)
|
||||||
|
|
||||||
|
let thumb = NSImage(size: newSize)
|
||||||
|
thumb.lockFocus()
|
||||||
|
image.draw(
|
||||||
|
in: NSRect(origin: .zero, size: newSize),
|
||||||
|
from: NSRect(origin: .zero, size: image.size),
|
||||||
|
operation: .copy,
|
||||||
|
fraction: 1.0
|
||||||
|
)
|
||||||
|
thumb.unlockFocus()
|
||||||
|
|
||||||
|
await MainActor.run {
|
||||||
|
thumbnail = thumb
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// MARK: - Message Bubble
|
||||||
|
|
||||||
struct MessageBubble: View {
|
struct MessageBubble: View {
|
||||||
let message: ChatMessage
|
let message: ChatMessage
|
||||||
|
var isSpeaking: Bool = false
|
||||||
|
var onSpeak: (() -> Void)? = nil
|
||||||
|
@State private var showCopied = false
|
||||||
|
|
||||||
var body: some View {
|
var body: some View {
|
||||||
HStack {
|
HStack {
|
||||||
@ -137,6 +450,11 @@ struct MessageBubble: View {
|
|||||||
}
|
}
|
||||||
|
|
||||||
VStack(alignment: message.role == .user ? .trailing : .leading, spacing: 4) {
|
VStack(alignment: message.role == .user ? .trailing : .leading, spacing: 4) {
|
||||||
|
if !message.images.isEmpty {
|
||||||
|
imageGrid
|
||||||
|
}
|
||||||
|
|
||||||
|
if !message.content.isEmpty {
|
||||||
Text(message.content)
|
Text(message.content)
|
||||||
.textSelection(.enabled)
|
.textSelection(.enabled)
|
||||||
.padding(.horizontal, 12)
|
.padding(.horizontal, 12)
|
||||||
@ -144,6 +462,7 @@ struct MessageBubble: View {
|
|||||||
.background(bubbleColor)
|
.background(bubbleColor)
|
||||||
.foregroundStyle(message.role == .user ? .white : .primary)
|
.foregroundStyle(message.role == .user ? .white : .primary)
|
||||||
.clipShape(RoundedRectangle(cornerRadius: 16))
|
.clipShape(RoundedRectangle(cornerRadius: 16))
|
||||||
|
}
|
||||||
|
|
||||||
if message.isStreaming {
|
if message.isStreaming {
|
||||||
HStack(spacing: 4) {
|
HStack(spacing: 4) {
|
||||||
@ -154,6 +473,45 @@ struct MessageBubble: View {
|
|||||||
.foregroundStyle(.secondary)
|
.foregroundStyle(.secondary)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Action buttons for assistant messages
|
||||||
|
if message.role == .assistant && !message.content.isEmpty && !message.isStreaming {
|
||||||
|
HStack(spacing: 12) {
|
||||||
|
// Speaker button for TTS
|
||||||
|
Button {
|
||||||
|
onSpeak?()
|
||||||
|
} label: {
|
||||||
|
HStack(spacing: 4) {
|
||||||
|
Image(systemName: isSpeaking ? "stop.fill" : "speaker.wave.2")
|
||||||
|
Text(isSpeaking ? "Stop" : "Speak")
|
||||||
|
}
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(isSpeaking ? .red : .secondary)
|
||||||
|
}
|
||||||
|
.buttonStyle(.plain)
|
||||||
|
|
||||||
|
// Copy button
|
||||||
|
Button {
|
||||||
|
NSPasteboard.general.clearContents()
|
||||||
|
NSPasteboard.general.setString(message.content, forType: .string)
|
||||||
|
showCopied = true
|
||||||
|
DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) {
|
||||||
|
showCopied = false
|
||||||
|
}
|
||||||
|
} label: {
|
||||||
|
HStack(spacing: 4) {
|
||||||
|
Image(systemName: showCopied ? "checkmark" : "doc.on.doc")
|
||||||
|
Text(showCopied ? "Copied" : "Copy")
|
||||||
|
}
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
}
|
||||||
|
.buttonStyle(.plain)
|
||||||
|
|
||||||
|
Spacer()
|
||||||
|
}
|
||||||
|
.padding(.top, 2)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if message.role == .assistant {
|
if message.role == .assistant {
|
||||||
@ -162,6 +520,32 @@ struct MessageBubble: View {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ViewBuilder
|
||||||
|
private var imageGrid: some View {
|
||||||
|
let columns = min(message.images.count, 3)
|
||||||
|
LazyVGrid(
|
||||||
|
columns: Array(repeating: GridItem(.flexible(), spacing: 4), count: columns),
|
||||||
|
spacing: 4
|
||||||
|
) {
|
||||||
|
ForEach(message.images) { attachment in
|
||||||
|
if let thumbnail = attachment.thumbnail {
|
||||||
|
Image(nsImage: thumbnail)
|
||||||
|
.resizable()
|
||||||
|
.aspectRatio(contentMode: .fill)
|
||||||
|
.frame(width: 80, height: 80)
|
||||||
|
.clipShape(RoundedRectangle(cornerRadius: 8))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.padding(4)
|
||||||
|
.background(
|
||||||
|
message.role == .user
|
||||||
|
? Color.accentColor.opacity(0.8)
|
||||||
|
: Color(nsColor: .controlBackgroundColor)
|
||||||
|
)
|
||||||
|
.clipShape(RoundedRectangle(cornerRadius: 12))
|
||||||
|
}
|
||||||
|
|
||||||
private var bubbleColor: Color {
|
private var bubbleColor: Color {
|
||||||
switch message.role {
|
switch message.role {
|
||||||
case .user:
|
case .user:
|
||||||
@ -171,3 +555,65 @@ struct MessageBubble: View {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MARK: - Image Preview Sheet
|
||||||
|
|
||||||
|
struct ImagePreviewSheet: View {
|
||||||
|
let url: URL
|
||||||
|
let onConfirm: () -> Void
|
||||||
|
let onCancel: () -> Void
|
||||||
|
|
||||||
|
@State private var image: NSImage?
|
||||||
|
|
||||||
|
var body: some View {
|
||||||
|
VStack(spacing: 16) {
|
||||||
|
Text("Add Image")
|
||||||
|
.font(.headline)
|
||||||
|
|
||||||
|
if let image = image {
|
||||||
|
Image(nsImage: image)
|
||||||
|
.resizable()
|
||||||
|
.aspectRatio(contentMode: .fit)
|
||||||
|
.frame(maxWidth: 500, maxHeight: 400)
|
||||||
|
.clipShape(RoundedRectangle(cornerRadius: 8))
|
||||||
|
.shadow(radius: 4)
|
||||||
|
} else {
|
||||||
|
RoundedRectangle(cornerRadius: 8)
|
||||||
|
.fill(Color.gray.opacity(0.2))
|
||||||
|
.frame(width: 300, height: 200)
|
||||||
|
.overlay {
|
||||||
|
ProgressView()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Text(url.lastPathComponent)
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
.lineLimit(1)
|
||||||
|
|
||||||
|
HStack(spacing: 16) {
|
||||||
|
Button("Cancel") {
|
||||||
|
onCancel()
|
||||||
|
}
|
||||||
|
.keyboardShortcut(.cancelAction)
|
||||||
|
|
||||||
|
Button("Add to Message") {
|
||||||
|
onConfirm()
|
||||||
|
}
|
||||||
|
.keyboardShortcut(.defaultAction)
|
||||||
|
.buttonStyle(.borderedProminent)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.padding(24)
|
||||||
|
.frame(minWidth: 400, minHeight: 300)
|
||||||
|
.task {
|
||||||
|
image = NSImage(contentsOf: url)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// MARK: - URL Identifiable Extension
|
||||||
|
|
||||||
|
extension URL: @retroactive Identifiable {
|
||||||
|
public var id: String { absoluteString }
|
||||||
|
}
|
||||||
|
|||||||
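The retroactive Identifiable conformance above is what lets a plain URL drive `.sheet(item:)` in this view. A minimal self-contained sketch of the same pattern, relying on that extension being in scope; the `selection` property and the example URL are illustrative only, not part of the diff:

import SwiftUI

struct URLSheetDemo: View {
    // Hypothetical state used only for this demo; the app uses previewImageURL.
    @State private var selection: URL?

    var body: some View {
        Button("Preview") {
            selection = URL(string: "https://example.com/image.png")
        }
        // .sheet(item:) requires Identifiable, provided by the URL extension above.
        .sheet(item: $selection) { url in
            Text(url.lastPathComponent)
        }
    }
}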
@@ -2,6 +2,7 @@ import SwiftUI
struct SettingsView: View {
    @Bindable var settings: AppSettings
    var serverManager: ServerManager?
    @Environment(\.dismiss) private var dismiss

    var body: some View {
@@ -10,7 +11,7 @@ struct SettingsView: View {
                TextField("Host", text: $settings.host)
                    .textFieldStyle(.roundedBorder)

                TextField("Port", value: $settings.port, format: .number.grouping(.never))
                    .textFieldStyle(.roundedBorder)

                SecureField("API Key (optional)", text: $settings.apiKey)
@@ -22,6 +23,13 @@ struct SettingsView: View {
                Toggle("Auto-start server on launch", isOn: $settings.autoStartServer)
            }

            Section("API") {
                Toggle("Enable gRPC reflection", isOn: $settings.enableReflection)
                    .onChange(of: settings.enableReflection) { _, _ in
                        serverManager?.restart()
                    }
            }

            Section {
                HStack {
                    Button("Reset to Defaults") {
@@ -38,7 +46,7 @@ struct SettingsView: View {
                }
            }
        }
        .formStyle(.grouped)
        .frame(width: 400, height: 380)
        .fixedSize()
        .onAppear {
            NSApp.setActivationPolicy(.regular)
@@ -1,238 +0,0 @@
// DO NOT EDIT.
// swift-format-ignore-file
// swiftlint:disable all
//
// Generated protocol buffer code for apple_intelligence.proto

import Foundation
import SwiftProtobuf

// MARK: - Messages

struct Appleintelligence_CompletionRequest: Sendable, SwiftProtobuf.Message {
    static let protoMessageName: String = "appleintelligence.CompletionRequest"

    var prompt: String = ""
    var temperature: Float = 0
    var maxTokens: Int32 = 0

    var hasTemperature: Bool = false
    var hasMaxTokens: Bool = false

    var unknownFields = SwiftProtobuf.UnknownStorage()

    init() {}

    init(prompt: String, temperature: Float? = nil, maxTokens: Int32? = nil) {
        self.prompt = prompt
        if let temp = temperature {
            self.temperature = temp
            self.hasTemperature = true
        }
        if let tokens = maxTokens {
            self.maxTokens = tokens
            self.hasMaxTokens = true
        }
    }

    mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
        while let fieldNumber = try decoder.nextFieldNumber() {
            switch fieldNumber {
            case 1: try decoder.decodeSingularStringField(value: &prompt)
            case 2:
                try decoder.decodeSingularFloatField(value: &temperature)
                hasTemperature = true
            case 3:
                try decoder.decodeSingularInt32Field(value: &maxTokens)
                hasMaxTokens = true
            default: break
            }
        }
    }

    func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
        if !prompt.isEmpty {
            try visitor.visitSingularStringField(value: prompt, fieldNumber: 1)
        }
        if hasTemperature {
            try visitor.visitSingularFloatField(value: temperature, fieldNumber: 2)
        }
        if hasMaxTokens {
            try visitor.visitSingularInt32Field(value: maxTokens, fieldNumber: 3)
        }
        try unknownFields.traverse(visitor: &visitor)
    }

    static func ==(lhs: Self, rhs: Self) -> Bool {
        lhs.prompt == rhs.prompt && lhs.temperature == rhs.temperature && lhs.maxTokens == rhs.maxTokens && lhs.unknownFields == rhs.unknownFields
    }

    func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
        guard let other = message as? Self else { return false }
        return self == other
    }
}

struct Appleintelligence_CompletionResponse: Sendable, SwiftProtobuf.Message {
    static let protoMessageName: String = "appleintelligence.CompletionResponse"

    var id: String = ""
    var text: String = ""
    var finishReason: String = ""

    var unknownFields = SwiftProtobuf.UnknownStorage()

    init() {}

    mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
        while let fieldNumber = try decoder.nextFieldNumber() {
            switch fieldNumber {
            case 1: try decoder.decodeSingularStringField(value: &id)
            case 2: try decoder.decodeSingularStringField(value: &text)
            case 3: try decoder.decodeSingularStringField(value: &finishReason)
            default: break
            }
        }
    }

    func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
        if !id.isEmpty {
            try visitor.visitSingularStringField(value: id, fieldNumber: 1)
        }
        if !text.isEmpty {
            try visitor.visitSingularStringField(value: text, fieldNumber: 2)
        }
        if !finishReason.isEmpty {
            try visitor.visitSingularStringField(value: finishReason, fieldNumber: 3)
        }
        try unknownFields.traverse(visitor: &visitor)
    }

    static func ==(lhs: Self, rhs: Self) -> Bool {
        lhs.id == rhs.id && lhs.text == rhs.text && lhs.finishReason == rhs.finishReason && lhs.unknownFields == rhs.unknownFields
    }

    func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
        guard let other = message as? Self else { return false }
        return self == other
    }
}

struct Appleintelligence_CompletionChunk: Sendable, SwiftProtobuf.Message {
    static let protoMessageName: String = "appleintelligence.CompletionChunk"

    var id: String = ""
    var delta: String = ""
    var isFinal: Bool = false
    var finishReason: String = ""

    var hasFinishReason: Bool {
        !finishReason.isEmpty
    }

    var unknownFields = SwiftProtobuf.UnknownStorage()

    init() {}

    mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
        while let fieldNumber = try decoder.nextFieldNumber() {
            switch fieldNumber {
            case 1: try decoder.decodeSingularStringField(value: &id)
            case 2: try decoder.decodeSingularStringField(value: &delta)
            case 3: try decoder.decodeSingularBoolField(value: &isFinal)
            case 4: try decoder.decodeSingularStringField(value: &finishReason)
            default: break
            }
        }
    }

    func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
        if !id.isEmpty {
            try visitor.visitSingularStringField(value: id, fieldNumber: 1)
        }
        if !delta.isEmpty {
            try visitor.visitSingularStringField(value: delta, fieldNumber: 2)
        }
        if isFinal {
            try visitor.visitSingularBoolField(value: isFinal, fieldNumber: 3)
        }
        if !finishReason.isEmpty {
            try visitor.visitSingularStringField(value: finishReason, fieldNumber: 4)
        }
        try unknownFields.traverse(visitor: &visitor)
    }

    static func ==(lhs: Self, rhs: Self) -> Bool {
        lhs.id == rhs.id && lhs.delta == rhs.delta && lhs.isFinal == rhs.isFinal && lhs.finishReason == rhs.finishReason && lhs.unknownFields == rhs.unknownFields
    }

    func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
        guard let other = message as? Self else { return false }
        return self == other
    }
}

struct Appleintelligence_HealthRequest: Sendable, SwiftProtobuf.Message {
    static let protoMessageName: String = "appleintelligence.HealthRequest"

    var unknownFields = SwiftProtobuf.UnknownStorage()

    init() {}

    mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
        while let _ = try decoder.nextFieldNumber() {}
    }

    func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
        try unknownFields.traverse(visitor: &visitor)
    }

    static func ==(lhs: Self, rhs: Self) -> Bool {
        lhs.unknownFields == rhs.unknownFields
    }

    func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
        guard let other = message as? Self else { return false }
        return self == other
    }
}

struct Appleintelligence_HealthResponse: Sendable, SwiftProtobuf.Message {
    static let protoMessageName: String = "appleintelligence.HealthResponse"

    var healthy: Bool = false
    var modelStatus: String = ""

    var unknownFields = SwiftProtobuf.UnknownStorage()

    init() {}

    mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
        while let fieldNumber = try decoder.nextFieldNumber() {
            switch fieldNumber {
            case 1: try decoder.decodeSingularBoolField(value: &healthy)
            case 2: try decoder.decodeSingularStringField(value: &modelStatus)
            default: break
            }
        }
    }

    func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
        if healthy {
            try visitor.visitSingularBoolField(value: healthy, fieldNumber: 1)
        }
        if !modelStatus.isEmpty {
            try visitor.visitSingularStringField(value: modelStatus, fieldNumber: 2)
        }
        try unknownFields.traverse(visitor: &visitor)
    }

    static func ==(lhs: Self, rhs: Self) -> Bool {
        lhs.healthy == rhs.healthy && lhs.modelStatus == rhs.modelStatus && lhs.unknownFields == rhs.unknownFields
    }

    func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
        guard let other = message as? Self else { return false }
        return self == other
    }
}
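These hand-written message structs (removed in this compare in favour of generated code) round-trip through SwiftProtobuf's standard binary encoding. A minimal sketch of that round trip, not part of the diff; it only uses the struct defined above plus SwiftProtobuf's serializedData()/init(serializedData:) helpers:

import Foundation
import SwiftProtobuf

// Encode a request to protobuf wire format and decode it back.
func roundTripExample() throws {
    let request = Appleintelligence_CompletionRequest(prompt: "Hello", temperature: 0.7, maxTokens: 256)
    let wire = try request.serializedData()
    let decoded = try Appleintelligence_CompletionRequest(serializedData: wire)
    // hasTemperature survives because traverse() emits field 2 and decodeMessage() sets the flag.
    assert(decoded.prompt == "Hello" && decoded.hasTemperature)
}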
Sources/AppleIntelligenceCore/Generated/apple_intelligence.pb.swift — new file, 1435 lines (diff suppressed because it is too large)
@@ -4,63 +4,51 @@ import GRPCProtobuf
import GRPCNIOTransportHTTP2

/// gRPC service provider for Apple Intelligence
public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceService.ServiceProtocol {
    /// The underlying AI service
    private let service: AppleIntelligenceService

    /// Text-to-Speech service
    private let ttsService: TextToSpeechService?

    /// Speech-to-Text service
    private let sttService: SpeechToTextService?

    /// Optional API key for authentication
    private let apiKey: String?

    public init(
        service: AppleIntelligenceService,
        ttsService: TextToSpeechService? = nil,
        sttService: SpeechToTextService? = nil,
        apiKey: String? = nil
    ) {
        self.service = service
        self.ttsService = ttsService
        self.sttService = sttService
        self.apiKey = apiKey
    }

    // MARK: - ServiceProtocol Implementation

    public func complete(
        request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_CompletionResponse> {
        try validateApiKey(metadata: request.metadata)

        let message = request.message

        // Convert protobuf images to service format
        let images = message.images.map { img in
            (data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
        }

        let (text, analyses) = try await service.complete(
            prompt: message.prompt,
            temperature: message.hasTemperature ? message.temperature : nil,
            maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
            images: images
        )

        var response = Appleintelligence_CompletionResponse()
@@ -68,42 +56,45 @@ public struct AppleIntelligenceProvider: RegistrableRPCService {
        response.text = text
        response.finishReason = "stop"

        // Include analysis results if requested
        if message.includeAnalysis {
            response.imageAnalyses = analyses.map { analysis in
                var protoAnalysis = Appleintelligence_ImageAnalysis()
                protoAnalysis.textContent = analysis.textContent
                protoAnalysis.labels = analysis.labels
                protoAnalysis.description_p = analysis.description
                return protoAnalysis
            }
        }

        return ServerResponse(message: response)
    }

    public func streamComplete(
        request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_CompletionChunk> {
        try validateApiKey(metadata: request.metadata)

        let message = request.message
        let completionId = UUID().uuidString

        // Convert protobuf images to service format
        let images = message.images.map { img in
            (data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
        }

        return StreamingServerResponse { writer in
            let stream = await self.service.streamComplete(
                prompt: message.prompt,
                temperature: message.hasTemperature ? message.temperature : nil,
                maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
                images: images
            )

            var lastContent = ""
            var isFirstChunk = true
            for try await (partialResponse, analyses) in stream {
                // Calculate the delta (new text since last response)
                let delta: String
                if partialResponse.hasPrefix(lastContent) {
@@ -113,12 +104,25 @@ public struct AppleIntelligenceProvider: RegistrableRPCService {
                }
                lastContent = partialResponse

                if !delta.isEmpty || isFirstChunk {
                    var chunk = Appleintelligence_CompletionChunk()
                    chunk.id = completionId
                    chunk.delta = delta
                    chunk.isFinal = false

                    // Include analyses in first chunk if requested
                    if isFirstChunk && message.includeAnalysis, let analyses = analyses {
                        chunk.imageAnalyses = analyses.map { analysis in
                            var protoAnalysis = Appleintelligence_ImageAnalysis()
                            protoAnalysis.textContent = analysis.textContent
                            protoAnalysis.labels = analysis.labels
                            protoAnalysis.description_p = analysis.description
                            return protoAnalysis
                        }
                    }

                    try await writer.write(chunk)
                    isFirstChunk = false
                }
            }
@@ -134,26 +138,229 @@ public struct AppleIntelligenceProvider: RegistrableRPCService {
        }
    }

    public func health(
        request: GRPCCore.ServerRequest<Appleintelligence_HealthRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_HealthResponse> {
        let isHealthy = await service.isAvailable
        let modelStatus = await service.getModelStatus()

        var response = Appleintelligence_HealthResponse()
        response.healthy = isHealthy
        response.modelStatus = modelStatus

        return ServerResponse(message: response)
    }

    // MARK: - Text-to-Speech

    public func textToSpeech(
        request: GRPCCore.ServerRequest<Appleintelligence_TextToSpeechRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_TextToSpeechResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let ttsService = ttsService else {
            throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
        }

        let message = request.message

        // Convert proto config to service config
        var config = SpeechConfig.default
        if message.hasVoiceConfig {
            let voiceConfig = message.voiceConfig
            config = SpeechConfig(
                voiceIdentifier: voiceConfig.voiceIdentifier.isEmpty ? nil : voiceConfig.voiceIdentifier,
                speakingRate: voiceConfig.hasSpeakingRate ? voiceConfig.speakingRate : 0.5,
                pitchMultiplier: voiceConfig.hasPitchMultiplier ? voiceConfig.pitchMultiplier : 1.0,
                volume: voiceConfig.hasVolume ? voiceConfig.volume : 1.0
            )
        }

        // Convert proto format to service format
        let outputFormat: AudioOutputFormat
        switch message.outputFormat {
        case .wav, .unspecified:
            outputFormat = .wav
        case .mp3:
            outputFormat = .mp3
        case .UNRECOGNIZED:
            outputFormat = .wav
        }

        do {
            let result = try await ttsService.synthesize(
                text: message.text,
                config: config,
                outputFormat: outputFormat
            )

            var response = Appleintelligence_TextToSpeechResponse()
            response.audioData = result.audioData
            response.format = outputFormat == .wav ? .wav : .mp3
            response.sampleRate = Int32(result.sampleRate)
            response.channels = Int32(result.channels)
            response.durationSeconds = result.durationSeconds

            return ServerResponse(message: response)
        } catch let error as TextToSpeechError {
            throw RPCError(code: .internalError, message: error.description)
        }
    }

    public func listVoices(
        request: GRPCCore.ServerRequest<Appleintelligence_ListVoicesRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_ListVoicesResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let ttsService = ttsService else {
            throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
        }

        let message = request.message
        let languageCode = message.hasLanguageCode ? message.languageCode : nil

        let voices = await ttsService.listVoices(languageCode: languageCode)

        var response = Appleintelligence_ListVoicesResponse()
        response.voices = voices.map { voice in
            var protoVoice = Appleintelligence_VoiceInfo()
            protoVoice.identifier = voice.identifier
            protoVoice.name = voice.name
            protoVoice.language = voice.language
            protoVoice.isPremium = voice.isPremium
            protoVoice.gender = voice.gender
            return protoVoice
        }

        return ServerResponse(message: response)
    }

    // MARK: - Speech-to-Text

    public func transcribe(
        request: GRPCCore.ServerRequest<Appleintelligence_TranscribeRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.ServerResponse<Appleintelligence_TranscribeResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let sttService = sttService else {
            throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
        }

        let message = request.message

        guard message.hasAudio else {
            throw RPCError(code: .invalidArgument, message: "Audio data is required")
        }

        // Convert proto config to service config
        var config = TranscriptionConfig.default
        if message.hasConfig {
            let protoConfig = message.config
            config = TranscriptionConfig(
                languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
                enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
                enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
            )
        }

        do {
            let result = try await sttService.transcribe(
                audioData: message.audio.data,
                mimeType: message.audio.mimeType,
                config: config
            )

            var response = Appleintelligence_TranscribeResponse()
            response.text = result.text
            response.detectedLanguage = result.detectedLanguage
            response.confidence = result.confidence
            response.segments = result.segments.map { segment in
                var protoSegment = Appleintelligence_TranscriptionSegment()
                protoSegment.text = segment.text
                protoSegment.startTime = segment.startTime
                protoSegment.endTime = segment.endTime
                protoSegment.confidence = segment.confidence
                return protoSegment
            }

            return ServerResponse(message: response)
        } catch let error as SpeechToTextError {
            throw RPCError(code: .internalError, message: error.description)
        }
    }

    public func streamTranscribe(
        request: GRPCCore.StreamingServerRequest<Appleintelligence_StreamingTranscribeRequest>,
        context: GRPCCore.ServerContext
    ) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_StreamingTranscribeResponse> {
        try validateApiKey(metadata: request.metadata)

        guard let sttService = sttService else {
            throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
        }

        return StreamingServerResponse { writer in
            var config = TranscriptionConfig.default

            // Process incoming stream
            for try await message in request.messages {
                switch message.request {
                case .config(let protoConfig):
                    // First message should be config
                    config = TranscriptionConfig(
                        languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
                        enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
                        enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
                    )

                    // Start streaming transcription
                    let stream = await sttService.streamTranscribe(config: config)
                    Task {
                        do {
                            for try await update in stream {
                                var response = Appleintelligence_StreamingTranscribeResponse()
                                response.partialText = update.partialText
                                response.isFinal = update.isFinal
                                if let finalText = update.finalText {
                                    response.finalText = finalText
                                }
                                response.segments = update.segments.map { segment in
                                    var protoSegment = Appleintelligence_TranscriptionSegment()
                                    protoSegment.text = segment.text
                                    protoSegment.startTime = segment.startTime
                                    protoSegment.endTime = segment.endTime
                                    protoSegment.confidence = segment.confidence
                                    return protoSegment
                                }
                                try await writer.write(response)
                            }
                        } catch {
                            // Stream ended or error occurred
                        }
                    }

                case .audioChunk(let chunk):
                    // Feed audio chunk to service
                    try await sttService.feedAudioChunk(chunk)

                case .none:
                    break
                }
            }

            // End streaming session
            await sttService.endStreamingSession()

            return [:]
        }
    }

    // MARK: - Private Helpers

    /// Validate API key if configured
    private func validateApiKey(metadata: Metadata) throws {
        guard let expectedKey = apiKey else {
Sources/AppleIntelligenceCore/Resources.swift — new file, 9 lines
@@ -0,0 +1,9 @@
import Foundation

/// Helper for accessing bundled resources
public enum AppleIntelligenceResources {
    /// URL to the protobuf descriptor set file for reflection
    public static var descriptorSetURL: URL? {
        Bundle.module.url(forResource: "apple_intelligence", withExtension: "pb")
    }
}
Sources/AppleIntelligenceCore/Resources/apple_intelligence.pb — new binary file (not shown)
@@ -6,6 +6,7 @@ public enum AppleIntelligenceError: Error, CustomStringConvertible, Sendable {
    case modelNotAvailable
    case generationFailed(String)
    case sessionCreationFailed
    case imageAnalysisFailed(String)

    public var description: String {
        switch self {
@@ -15,6 +16,8 @@ public enum AppleIntelligenceError: Error, CustomStringConvertible, Sendable {
            return "Generation failed: \(reason)"
        case .sessionCreationFailed:
            return "Failed to create language model session"
        case .imageAnalysisFailed(let reason):
            return "Image analysis failed: \(reason)"
        }
    }
}
@@ -24,6 +27,9 @@ public actor AppleIntelligenceService {
    /// The language model session
    private var session: LanguageModelSession?

    /// Vision analysis service for image processing
    private let visionService = VisionAnalysisService()

    /// Whether the model is available
    public private(set) var isAvailable: Bool = false
@@ -60,21 +66,42 @@ public actor AppleIntelligenceService {
    }

    /// Generate a completion for the given prompt (non-streaming)
    public func complete(
        prompt: String,
        temperature: Float?,
        maxTokens: Int?,
        images: [(data: Data, filename: String?)] = []
    ) async throws -> (text: String, analyses: [VisionAnalysisResult]) {
        guard isAvailable, let session = session else {
            throw AppleIntelligenceError.modelNotAvailable
        }

        // Analyze images if provided
        var analyses: [VisionAnalysisResult] = []
        var enhancedPrompt = prompt

        if !images.isEmpty {
            do {
                analyses = try await visionService.analyzeMultiple(images: images)
                let analysesWithFilenames = zip(analyses, images).map { (result: $0.0, filename: $0.1.filename) }
                let context = await visionService.formatAnalysesAsPromptContext(analyses: analysesWithFilenames)
                enhancedPrompt = context + "\n\n" + prompt
            } catch {
                throw AppleIntelligenceError.imageAnalysisFailed(error.localizedDescription)
            }
        }

        let response = try await session.respond(to: enhancedPrompt)
        return (text: response.content, analyses: analyses)
    }

    /// Generate a streaming completion for the given prompt
    public func streamComplete(
        prompt: String,
        temperature: Float?,
        maxTokens: Int?,
        images: [(data: Data, filename: String?)] = []
    ) -> AsyncThrowingStream<(text: String, analyses: [VisionAnalysisResult]?), Error> {
        AsyncThrowingStream { continuation in
            Task {
                guard self.isAvailable, let session = self.session else {
@@ -82,10 +109,33 @@ public actor AppleIntelligenceService {
                    return
                }

                // Analyze images first if provided
                var analyses: [VisionAnalysisResult] = []
                var enhancedPrompt = prompt

                if !images.isEmpty {
                    do {
                        analyses = try await self.visionService.analyzeMultiple(images: images)
                        let analysesWithFilenames = zip(analyses, images).map { (result: $0.0, filename: $0.1.filename) }
                        let context = await self.visionService.formatAnalysesAsPromptContext(analyses: analysesWithFilenames)
                        enhancedPrompt = context + "\n\n" + prompt
                    } catch {
                        continuation.finish(throwing: AppleIntelligenceError.imageAnalysisFailed(error.localizedDescription))
                        return
                    }
                }

                do {
                    let stream = session.streamResponse(to: enhancedPrompt)
                    var isFirst = true
                    for try await partialResponse in stream {
                        // Include analyses only in first chunk
                        if isFirst {
                            continuation.yield((text: partialResponse.content, analyses: analyses))
                            isFirst = false
                        } else {
                            continuation.yield((text: partialResponse.content, analyses: nil))
                        }
                    }
                    continuation.finish()
                } catch {
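A minimal sketch of how a caller might drive the updated complete(prompt:temperature:maxTokens:images:) API shown above. The wrapper function and the image path are illustrative assumptions, not part of the diff; only the service signature comes from the change itself:

import Foundation

// Hypothetical call site: load one image from disk and request a completion
// that folds the Vision analysis into the prompt.
func runOnce(service: AppleIntelligenceService) async throws {
    let imageURL = URL(fileURLWithPath: "/tmp/example.png")   // illustrative path
    let imageData = try Data(contentsOf: imageURL)

    let (text, analyses) = try await service.complete(
        prompt: "Describe this image in one sentence.",
        temperature: 0.7,
        maxTokens: 128,
        images: [(data: imageData, filename: "example.png")]
    )

    print("Model reply:", text)
    print("Vision analyses returned:", analyses.count)
}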
387
Sources/AppleIntelligenceCore/Services/SpeechToTextService.swift
Normal file
387
Sources/AppleIntelligenceCore/Services/SpeechToTextService.swift
Normal file
@ -0,0 +1,387 @@
import Foundation
import Speech
import AVFoundation

// MARK: - Result Types

/// Transcription result
public struct TranscriptionResult: Sendable {
    public let text: String
    public let segments: [TranscriptionSegmentResult]
    public let detectedLanguage: String
    public let confidence: Float
}

/// Individual transcription segment
public struct TranscriptionSegmentResult: Sendable {
    public let text: String
    public let startTime: Float
    public let endTime: Float
    public let confidence: Float
}

/// Streaming transcription update
public struct StreamingTranscriptionUpdate: Sendable {
    public let partialText: String
    public let isFinal: Bool
    public let finalText: String?
    public let segments: [TranscriptionSegmentResult]
}

/// Transcription configuration
public struct TranscriptionConfig: Sendable {
    public var languageCode: String?
    public var enablePunctuation: Bool
    public var enableTimestamps: Bool

    public static let `default` = TranscriptionConfig(
        languageCode: nil,
        enablePunctuation: true,
        enableTimestamps: false
    )

    public init(
        languageCode: String? = nil,
        enablePunctuation: Bool = true,
        enableTimestamps: Bool = false
    ) {
        self.languageCode = languageCode
        self.enablePunctuation = enablePunctuation
        self.enableTimestamps = enableTimestamps
    }
}

// MARK: - Errors

public enum SpeechToTextError: Error, CustomStringConvertible, Sendable {
    case notAvailable
    case authorizationDenied
    case modelNotReady(String)
    case transcriptionFailed(String)
    case invalidAudioFormat
    case audioProcessingFailed(String)
    case unsupportedMimeType(String)

    public var description: String {
        switch self {
        case .notAvailable: return "Speech recognition not available on this system"
        case .authorizationDenied: return "Speech recognition authorization denied"
        case .modelNotReady(let reason): return "Speech model not ready: \(reason)"
        case .transcriptionFailed(let reason): return "Transcription failed: \(reason)"
        case .invalidAudioFormat: return "Invalid audio format"
        case .audioProcessingFailed(let reason): return "Audio processing failed: \(reason)"
        case .unsupportedMimeType(let type): return "Unsupported audio MIME type: \(type)"
        }
    }
}

// MARK: - Service Actor

public actor SpeechToTextService {

    /// Service availability status
    public private(set) var isAvailable: Bool = false

    /// Streaming session state
    private var isStreamingActive: Bool = false
    private var streamingRequest: SFSpeechAudioBufferRecognitionRequest?
    private var streamingRecognizer: SFSpeechRecognizer?
    private var streamingTask: SFSpeechRecognitionTask?
    private var streamingContinuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation?

    public init() async {
        await checkAvailability()
    }

    // MARK: - Public API

    /// Transcribe audio data (file-based)
    public func transcribe(
        audioData: Data,
        mimeType: String,
        config: TranscriptionConfig = .default
    ) async throws -> TranscriptionResult {
        guard isAvailable else {
            throw SpeechToTextError.notAvailable
        }

        // Convert audio data to file URL for processing
        let tempURL = try createTempAudioFile(data: audioData, mimeType: mimeType)
        defer { try? FileManager.default.removeItem(at: tempURL) }

        return try await transcribeWithSFSpeechRecognizer(url: tempURL, config: config)
    }

    /// Stream transcription from audio chunks sent via gRPC
    public func streamTranscribe(
        config: TranscriptionConfig = .default
    ) -> AsyncThrowingStream<StreamingTranscriptionUpdate, Error> {
        AsyncThrowingStream { continuation in
            Task {
                guard await self.isAvailable else {
                    continuation.finish(throwing: SpeechToTextError.notAvailable)
                    return
                }

                do {
                    try await self.startStreamingSession(config: config, continuation: continuation)
                } catch {
                    continuation.finish(throwing: error)
                }
            }
        }
    }

    /// Feed audio chunk for streaming transcription (PCM audio data)
    public func feedAudioChunk(_ chunk: Data) async throws {
        guard isStreamingActive, let request = streamingRequest else {
            throw SpeechToTextError.transcriptionFailed("No active streaming session")
        }

        // Convert raw PCM data to audio buffer
        // Assuming 16-bit PCM, mono, 16kHz (common format for speech)
        let audioFormat = AVAudioFormat(
            commonFormat: .pcmFormatInt16,
            sampleRate: 16000,
            channels: 1,
            interleaved: true
        )!

        let frameCount = UInt32(chunk.count / 2) // 2 bytes per Int16 sample
        guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameCount) else {
            throw SpeechToTextError.audioProcessingFailed("Failed to create audio buffer")
        }

        buffer.frameLength = frameCount

        // Copy data into buffer
        chunk.withUnsafeBytes { rawPtr in
            if let int16Ptr = rawPtr.baseAddress?.assumingMemoryBound(to: Int16.self) {
                buffer.int16ChannelData?[0].update(from: int16Ptr, count: Int(frameCount))
            }
        }

        request.append(buffer)
    }

    /// End streaming session
    public func endStreamingSession() async {
        streamingRequest?.endAudio()
        isStreamingActive = false
        streamingRequest = nil
        streamingTask = nil
        streamingRecognizer = nil
        streamingContinuation = nil
    }

    /// Get status information
    public func getStatus() -> String {
        if isAvailable {
            return "SFSpeechRecognizer available"
        } else {
            return "Speech recognition not available"
        }
    }

    // MARK: - Private Implementation

    private func checkAvailability() async {
        // Check SFSpeechRecognizer availability
        let status = SFSpeechRecognizer.authorizationStatus()
        switch status {
        case .authorized:
            isAvailable = SFSpeechRecognizer.supportedLocales().count > 0
        case .notDetermined:
            // Request authorization
            isAvailable = await withCheckedContinuation { continuation in
                SFSpeechRecognizer.requestAuthorization { newStatus in
                    continuation.resume(returning: newStatus == .authorized)
                }
            }
        default:
            isAvailable = false
        }
    }

    /// Create temporary audio file from data
    private func createTempAudioFile(data: Data, mimeType: String) throws -> URL {
        let ext = extensionForMimeType(mimeType)
        let tempDir = FileManager.default.temporaryDirectory
        let fileName = UUID().uuidString + "." + ext
        let fileURL = tempDir.appendingPathComponent(fileName)

        try data.write(to: fileURL)
        return fileURL
    }

    /// Get file extension for MIME type
    private func extensionForMimeType(_ mimeType: String) -> String {
        switch mimeType.lowercased() {
        case "audio/wav", "audio/wave", "audio/x-wav":
            return "wav"
        case "audio/mp3", "audio/mpeg":
            return "mp3"
        case "audio/m4a", "audio/mp4", "audio/x-m4a":
            return "m4a"
        case "audio/aac":
            return "aac"
        case "audio/flac":
            return "flac"
        default:
            return "wav"
        }
    }

    /// Transcribe using SFSpeechRecognizer
    private func transcribeWithSFSpeechRecognizer(
        url: URL,
        config: TranscriptionConfig
    ) async throws -> TranscriptionResult {
        let locale = Locale(identifier: config.languageCode ?? "en-US")
        guard let recognizer = SFSpeechRecognizer(locale: locale) else {
            throw SpeechToTextError.notAvailable
        }

        guard recognizer.isAvailable else {
            throw SpeechToTextError.notAvailable
        }

        let request = SFSpeechURLRecognitionRequest(url: url)
        request.shouldReportPartialResults = false

        return try await withCheckedThrowingContinuation { continuation in
            var hasResumed = false

            recognizer.recognitionTask(with: request) { result, error in
                guard !hasResumed else { return }

                if let error = error {
                    hasResumed = true
                    continuation.resume(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
                    return
                }

                guard let result = result, result.isFinal else { return }

                hasResumed = true

                let transcription = result.bestTranscription
                var segments: [TranscriptionSegmentResult] = []

                if config.enableTimestamps {
                    for segment in transcription.segments {
                        segments.append(TranscriptionSegmentResult(
                            text: segment.substring,
                            startTime: Float(segment.timestamp),
                            endTime: Float(segment.timestamp + segment.duration),
                            confidence: segment.confidence
                        ))
                    }
                }

                let transcriptionResult = TranscriptionResult(
                    text: transcription.formattedString,
                    segments: segments,
                    detectedLanguage: config.languageCode ?? "en-US",
                    confidence: segments.isEmpty ? 1.0 : segments.reduce(0) { $0 + $1.confidence } / Float(segments.count)
                )

                continuation.resume(returning: transcriptionResult)
            }
        }
    }

    /// Start streaming session for gRPC audio chunks
    private func startStreamingSession(
        config: TranscriptionConfig,
        continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
    ) async throws {
        let locale = Locale(identifier: config.languageCode ?? "en-US")
        guard let recognizer = SFSpeechRecognizer(locale: locale) else {
            throw SpeechToTextError.notAvailable
        }

        guard recognizer.isAvailable else {
            throw SpeechToTextError.notAvailable
        }

        // Set up streaming state
        isStreamingActive = true
        streamingRecognizer = recognizer
        streamingContinuation = continuation

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        streamingRequest = request

        // Create wrapper to handle results safely
        let service = self
        let resultHandler = StreamingResultHandler(
            config: config,
            continuation: continuation,
            onFinish: {
                Task { await service.endStreamingSession() }
            }
        )

        streamingTask = recognizer.recognitionTask(with: request) { result, error in
            resultHandler.handleResult(result: result, error: error)
        }
    }
}

// MARK: - Streaming Result Handler

/// Wrapper to safely handle streaming recognition results
private final class StreamingResultHandler: @unchecked Sendable {
    private let config: TranscriptionConfig
    private let continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
    private let onFinish: () -> Void

    init(
        config: TranscriptionConfig,
        continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation,
        onFinish: @escaping () -> Void
    ) {
        self.config = config
        self.continuation = continuation
        self.onFinish = onFinish
    }

    func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
        if let error = error {
            continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
            onFinish()
            return
        }

        guard let result = result else { return }

        let transcription = result.bestTranscription
        var segments: [TranscriptionSegmentResult] = []

        if config.enableTimestamps {
            for segment in transcription.segments {
                segments.append(TranscriptionSegmentResult(
                    text: segment.substring,
                    startTime: Float(segment.timestamp),
                    endTime: Float(segment.timestamp + segment.duration),
                    confidence: segment.confidence
                ))
            }
        }

        let update = StreamingTranscriptionUpdate(
            partialText: transcription.formattedString,
            isFinal: result.isFinal,
            finalText: result.isFinal ? transcription.formattedString : nil,
            segments: segments
        )
        continuation.yield(update)

        if result.isFinal {
            continuation.finish()
            onFinish()
        }
    }
}
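Usage note (illustrative, not part of this file): a minimal sketch of calling the service, assuming speech recognition is authorized on the machine; the file path and the pcmChunks source are placeholders:

func demoSpeechToText() async throws {
    let stt = await SpeechToTextService()

    // File-based transcription (any WAV/MP3/M4A the system can decode)
    let wavData = try Data(contentsOf: URL(fileURLWithPath: "/tmp/sample.wav")) // placeholder path
    let result = try await stt.transcribe(audioData: wavData, mimeType: "audio/wav")
    print(result.text)

    // Streaming: feed 16 kHz mono Int16 PCM chunks while consuming updates
    let updates = await stt.streamTranscribe()
    let pcmChunks: [Data] = [] // placeholder; real code would pull chunks from a mic or gRPC stream
    Task {
        for chunk in pcmChunks {
            try await stt.feedAudioChunk(chunk)
        }
        await stt.endStreamingSession()
    }
    for try await update in updates {
        print(update.isFinal ? "final: \(update.finalText ?? "")" : "partial: \(update.partialText)")
    }
}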
280 Sources/AppleIntelligenceCore/Services/TextToSpeechService.swift Normal file
@ -0,0 +1,280 @@
import Foundation
import AVFoundation

// MARK: - Result Types

/// Result of text-to-speech synthesis
public struct TextToSpeechResult: Sendable {
    public let audioData: Data
    public let format: AudioOutputFormat
    public let sampleRate: Int
    public let channels: Int
    public let durationSeconds: Float
}

/// Supported output formats
public enum AudioOutputFormat: Sendable {
    case wav
    case mp3
}

/// Voice information
public struct VoiceDescription: Sendable {
    public let identifier: String
    public let name: String
    public let language: String
    public let isPremium: Bool
    public let gender: String
}

/// Configuration for speech synthesis
public struct SpeechConfig: Sendable {
    public var voiceIdentifier: String?
    public var speakingRate: Float // 0.0 - 1.0
    public var pitchMultiplier: Float // 0.5 - 2.0
    public var volume: Float // 0.0 - 1.0

    public static let `default` = SpeechConfig(
        voiceIdentifier: nil,
        speakingRate: 0.5,
        pitchMultiplier: 1.0,
        volume: 1.0
    )

    public init(
        voiceIdentifier: String? = nil,
        speakingRate: Float = 0.5,
        pitchMultiplier: Float = 1.0,
        volume: Float = 1.0
    ) {
        self.voiceIdentifier = voiceIdentifier
        self.speakingRate = speakingRate
        self.pitchMultiplier = pitchMultiplier
        self.volume = volume
    }
}

// MARK: - Errors

public enum TextToSpeechError: Error, CustomStringConvertible, Sendable {
    case invalidVoice(String)
    case synthesisFailure(String)
    case encodingFailure(String)
    case noAudioGenerated
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidVoice(let id): return "Invalid voice identifier: \(id)"
        case .synthesisFailure(let reason): return "Speech synthesis failed: \(reason)"
        case .encodingFailure(let reason): return "Audio encoding failed: \(reason)"
        case .noAudioGenerated: return "No audio was generated"
        case .unsupportedFormat: return "Unsupported audio format"
        }
    }
}

// MARK: - Service Actor

public actor TextToSpeechService {
    /// Keep strong reference to synthesizer during synthesis
    private var activeSynthesizer: AVSpeechSynthesizer?

    public init() {}

    // MARK: - Public API

    /// Synthesize text to speech
    public func synthesize(
        text: String,
        config: SpeechConfig = .default,
        outputFormat: AudioOutputFormat = .wav
    ) async throws -> TextToSpeechResult {
        // Create utterance
        let utterance = AVSpeechUtterance(string: text)

        // Configure voice
        if let voiceId = config.voiceIdentifier {
            if let voice = AVSpeechSynthesisVoice(identifier: voiceId) {
                utterance.voice = voice
            } else {
                throw TextToSpeechError.invalidVoice(voiceId)
            }
        } else {
            // Use default English voice
            utterance.voice = AVSpeechSynthesisVoice(language: "en-US")
        }

        // Configure speech parameters
        utterance.rate = config.speakingRate
        utterance.pitchMultiplier = config.pitchMultiplier
        utterance.volume = config.volume

        // Collect PCM data
        let pcmData = try await collectPCMData(utterance: utterance)

        // Convert to requested format
        let audioData: Data
        switch outputFormat {
        case .wav:
            audioData = createWAVData(from: pcmData)
        case .mp3:
            // Use WAV as fallback (MP3 encoding requires external library)
            audioData = createWAVData(from: pcmData)
        }

        // Calculate duration
        let bytesPerSample = 2 // Int16
        let totalSamples = pcmData.samples.count / bytesPerSample / pcmData.channelCount
        let duration = Float(totalSamples) / Float(pcmData.sampleRate)

        return TextToSpeechResult(
            audioData: audioData,
            format: outputFormat,
            sampleRate: Int(pcmData.sampleRate),
            channels: pcmData.channelCount,
            durationSeconds: duration
        )
    }

    /// List available voices
    public func listVoices(languageCode: String? = nil) -> [VoiceDescription] {
        let voices = AVSpeechSynthesisVoice.speechVoices()

        let filtered: [AVSpeechSynthesisVoice]
        if let lang = languageCode {
            filtered = voices.filter { $0.language.hasPrefix(lang) }
        } else {
            filtered = voices
        }

        return filtered.map { voice in
            VoiceDescription(
                identifier: voice.identifier,
                name: voice.name,
                language: voice.language,
                isPremium: voice.quality == .enhanced || voice.quality == .premium,
                gender: genderString(for: voice)
            )
        }
    }

    // MARK: - Private Implementation

    /// PCM buffer data for internal processing
    private struct PCMBufferData: Sendable {
        let samples: Data
        let sampleRate: Double
        let channelCount: Int
    }

    /// Collect PCM data from synthesizer using write callback
    private func collectPCMData(
        utterance: AVSpeechUtterance
    ) async throws -> PCMBufferData {
        // Create and store synthesizer to keep strong reference during synthesis
        let synthesizer = AVSpeechSynthesizer()
        self.activeSynthesizer = synthesizer

        defer { self.activeSynthesizer = nil }

        return try await withCheckedThrowingContinuation { continuation in
            var pcmData = Data()
            var sampleRate: Double = 0
            var channelCount: Int = 0
            var hasResumed = false

            synthesizer.write(utterance) { buffer in
                guard let pcmBuffer = buffer as? AVAudioPCMBuffer else {
                    // End of audio - empty buffer signals completion
                    if !hasResumed {
                        hasResumed = true
                        if pcmData.isEmpty {
                            continuation.resume(throwing: TextToSpeechError.noAudioGenerated)
                        } else {
                            continuation.resume(returning: PCMBufferData(
                                samples: pcmData,
                                sampleRate: sampleRate,
                                channelCount: channelCount
                            ))
                        }
                    }
                    return
                }

                if pcmBuffer.frameLength > 0 {
                    // Store format from first buffer
                    if sampleRate == 0 {
                        sampleRate = pcmBuffer.format.sampleRate
                        channelCount = Int(pcmBuffer.format.channelCount)
                    }

                    // Convert float samples to Int16 PCM
                    if let channelData = pcmBuffer.floatChannelData {
                        let frameCount = Int(pcmBuffer.frameLength)
                        for frame in 0..<frameCount {
                            for channel in 0..<channelCount {
                                let sample = channelData[channel][frame]
                                let clampedSample = max(-1.0, min(1.0, sample))
                                let int16Sample = Int16(clampedSample * Float(Int16.max))
                                withUnsafeBytes(of: int16Sample.littleEndian) { bytes in
                                    pcmData.append(contentsOf: bytes)
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    /// Create WAV data from PCM buffer data
    private func createWAVData(from pcmData: PCMBufferData) -> Data {
        let bitsPerSample = 16
        let sampleRate = Int(pcmData.sampleRate)
        let channels = pcmData.channelCount
        let dataSize = pcmData.samples.count

        var header = Data()

        // RIFF header
        header.append(contentsOf: "RIFF".utf8)
        let fileSize = UInt32(dataSize + 36)
        withUnsafeBytes(of: fileSize.littleEndian) { header.append(contentsOf: $0) }
        header.append(contentsOf: "WAVE".utf8)

        // fmt subchunk
        header.append(contentsOf: "fmt ".utf8)
        let subchunk1Size = UInt32(16)
        withUnsafeBytes(of: subchunk1Size.littleEndian) { header.append(contentsOf: $0) }
        let audioFormat = UInt16(1) // PCM
        withUnsafeBytes(of: audioFormat.littleEndian) { header.append(contentsOf: $0) }
        let numChannels = UInt16(channels)
        withUnsafeBytes(of: numChannels.littleEndian) { header.append(contentsOf: $0) }
        let sampleRateU32 = UInt32(sampleRate)
        withUnsafeBytes(of: sampleRateU32.littleEndian) { header.append(contentsOf: $0) }
        let byteRate = UInt32(sampleRate * channels * bitsPerSample / 8)
        withUnsafeBytes(of: byteRate.littleEndian) { header.append(contentsOf: $0) }
        let blockAlign = UInt16(channels * bitsPerSample / 8)
        withUnsafeBytes(of: blockAlign.littleEndian) { header.append(contentsOf: $0) }
        let bitsPerSampleU16 = UInt16(bitsPerSample)
        withUnsafeBytes(of: bitsPerSampleU16.littleEndian) { header.append(contentsOf: $0) }

        // data subchunk
        header.append(contentsOf: "data".utf8)
        let dataU32 = UInt32(dataSize)
        withUnsafeBytes(of: dataU32.littleEndian) { header.append(contentsOf: $0) }

        return header + pcmData.samples
    }

    /// Get gender string for voice
    private func genderString(for voice: AVSpeechSynthesisVoice) -> String {
        switch voice.gender {
        case .male: return "male"
        case .female: return "female"
        case .unspecified: return "unspecified"
        @unknown default: return "unknown"
        }
    }
}
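Usage note (illustrative, not part of this file): a minimal synthesis sketch; the output path is a placeholder, and MP3 output currently falls back to WAV as noted in the code above:

func demoTextToSpeech() async throws {
    let tts = TextToSpeechService()
    let result = try await tts.synthesize(
        text: "Hello from Apple Intelligence",
        config: SpeechConfig(speakingRate: 0.5),
        outputFormat: .wav
    )
    // result.audioData is a complete WAV file (Int16 PCM with RIFF header)
    try result.audioData.write(to: URL(fileURLWithPath: "/tmp/hello.wav")) // placeholder path
    print("Generated \(result.durationSeconds)s of audio at \(result.sampleRate) Hz")

    // List English voices
    for voice in await tts.listVoices(languageCode: "en") {
        print(voice.identifier, voice.name, voice.isPremium ? "(enhanced)" : "")
    }
}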
@ -0,0 +1,243 @@
import Foundation
import Vision
import CoreImage

#if canImport(AppKit)
import AppKit
#endif

/// Result of Vision framework analysis on an image
public struct VisionAnalysisResult: Sendable {
    public let textContent: String
    public let labels: [String]
    public let description: String

    public init(textContent: String = "", labels: [String] = [], description: String = "") {
        self.textContent = textContent
        self.labels = labels
        self.description = description
    }

    /// Format analysis for LLM context
    public func formatAsContext(imageIndex: Int, filename: String?) -> String {
        var parts: [String] = []

        let imageName = filename ?? "Image \(imageIndex + 1)"

        if !textContent.isEmpty {
            parts.append("Text: \"\(textContent)\"")
        }

        if !labels.isEmpty {
            parts.append("Objects: \(labels.joined(separator: ", "))")
        }

        if parts.isEmpty {
            return "\(imageName): No content detected"
        }

        return "\(imageName): \(parts.joined(separator: " | "))"
    }
}

/// Errors from Vision analysis
public enum VisionAnalysisError: Error, CustomStringConvertible, Sendable {
    case invalidImageData
    case analysisFailure(String)
    case unsupportedFormat

    public var description: String {
        switch self {
        case .invalidImageData:
            return "Invalid or corrupted image data"
        case .analysisFailure(let reason):
            return "Vision analysis failed: \(reason)"
        case .unsupportedFormat:
            return "Unsupported image format"
        }
    }
}

/// Service for analyzing images using Apple's Vision framework
public actor VisionAnalysisService {

    /// Configuration for which analyses to perform
    public struct AnalysisOptions: Sendable {
        public var performOCR: Bool
        public var performClassification: Bool

        public init(performOCR: Bool = true, performClassification: Bool = true) {
            self.performOCR = performOCR
            self.performClassification = performClassification
        }

        public static let all = AnalysisOptions()
        public static let textOnly = AnalysisOptions(performOCR: true, performClassification: false)
    }

    public init() {}

    /// Analyze a single image
    public func analyze(
        imageData: Data,
        options: AnalysisOptions = .all
    ) async throws -> VisionAnalysisResult {
        guard let cgImage = createCGImage(from: imageData) else {
            throw VisionAnalysisError.invalidImageData
        }

        var textContent = ""
        var labels: [String] = []

        // Perform OCR
        if options.performOCR {
            textContent = try await performTextRecognition(on: cgImage)
        }

        // Perform image classification
        if options.performClassification {
            labels = try await performImageClassification(on: cgImage)
        }

        // Build description
        var descriptionParts: [String] = []
        if !textContent.isEmpty {
            let truncatedText = textContent.count > 200
                ? String(textContent.prefix(200)) + "..."
                : textContent
            descriptionParts.append("Contains text: \"\(truncatedText)\"")
        }
        if !labels.isEmpty {
            descriptionParts.append("Shows: \(labels.prefix(5).joined(separator: ", "))")
        }

        let description = descriptionParts.isEmpty
            ? "Image with no recognizable content"
            : descriptionParts.joined(separator: ". ")

        return VisionAnalysisResult(
            textContent: textContent,
            labels: labels,
            description: description
        )
    }

    /// Analyze multiple images
    public func analyzeMultiple(
        images: [(data: Data, filename: String?)],
        options: AnalysisOptions = .all
    ) async throws -> [VisionAnalysisResult] {
        var results: [VisionAnalysisResult] = []

        for image in images {
            let result = try await analyze(imageData: image.data, options: options)
            results.append(result)
        }

        return results
    }

    /// Format multiple analyses as a combined context string for LLM
    public func formatAnalysesAsPromptContext(
        analyses: [(result: VisionAnalysisResult, filename: String?)]
    ) -> String {
        guard !analyses.isEmpty else { return "" }

        var lines: [String] = ["[Image Analysis]"]

        for (index, analysis) in analyses.enumerated() {
            lines.append(analysis.result.formatAsContext(
                imageIndex: index,
                filename: analysis.filename
            ))
        }

        lines.append("[End Image Analysis]")

        return lines.joined(separator: "\n")
    }

    // MARK: - Private Methods

    private func createCGImage(from data: Data) -> CGImage? {
        #if canImport(AppKit)
        guard let nsImage = NSImage(data: data),
              let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            // Try CIImage as fallback
            guard let ciImage = CIImage(data: data) else { return nil }
            let context = CIContext()
            return context.createCGImage(ciImage, from: ciImage.extent)
        }
        return cgImage
        #else
        guard let ciImage = CIImage(data: data) else { return nil }
        let context = CIContext()
        return context.createCGImage(ciImage, from: ciImage.extent)
        #endif
    }

    private func performTextRecognition(on image: CGImage) async throws -> String {
        try await withCheckedThrowingContinuation { continuation in
            let request = VNRecognizeTextRequest { request, error in
                if let error = error {
                    continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
                    return
                }

                guard let observations = request.results as? [VNRecognizedTextObservation] else {
                    continuation.resume(returning: "")
                    return
                }

                let recognizedText = observations.compactMap { observation in
                    observation.topCandidates(1).first?.string
                }.joined(separator: "\n")

                continuation.resume(returning: recognizedText)
            }

            request.recognitionLevel = .accurate
            request.usesLanguageCorrection = true

            let handler = VNImageRequestHandler(cgImage: image, options: [:])

            do {
                try handler.perform([request])
            } catch {
                continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
            }
        }
    }

    private func performImageClassification(on image: CGImage) async throws -> [String] {
        try await withCheckedThrowingContinuation { continuation in
            let request = VNClassifyImageRequest { request, error in
                if let error = error {
                    continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
                    return
                }

                guard let observations = request.results as? [VNClassificationObservation] else {
                    continuation.resume(returning: [])
                    return
                }

                // Filter to high-confidence labels and take top 10
                let labels = observations
                    .filter { $0.confidence > 0.3 }
                    .prefix(10)
                    .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }

                continuation.resume(returning: Array(labels))
            }

            let handler = VNImageRequestHandler(cgImage: image, options: [:])

            do {
                try handler.perform([request])
            } catch {
                continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
            }
        }
    }
}
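Usage note (illustrative, not part of this file): a minimal sketch of analyzing one image and formatting it as prompt context for the language model; the input path and filename are placeholders:

func demoVision() async throws {
    let vision = VisionAnalysisService()
    let imageData = try Data(contentsOf: URL(fileURLWithPath: "/tmp/receipt.png")) // placeholder path
    let result = try await vision.analyze(imageData: imageData, options: .all)
    print(result.description)

    // Build the "[Image Analysis] ... [End Image Analysis]" block used to enhance prompts
    let analyses: [(result: VisionAnalysisResult, filename: String?)] = [(result: result, filename: "receipt.png")]
    let context = await vision.formatAnalysesAsPromptContext(analyses: analyses)
    print(context)
}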
@ -37,7 +37,21 @@ struct AppleIntelligenceServer: AsyncParsableCommand {
            throw ExitCode.failure
        }

        // Initialize speech services
        print("Initializing Text-to-Speech service...")
        let ttsService = TextToSpeechService()

        print("Initializing Speech-to-Text service...")
        let sttService = await SpeechToTextService()
        let sttStatus = await sttService.getStatus()
        print("Speech-to-Text status: \(sttStatus)")

        let provider = AppleIntelligenceProvider(
            service: service,
            ttsService: ttsService,
            sttService: sttService,
            apiKey: config.apiKey
        )

        let transport = HTTP2ServerTransport.Posix(
            address: .ipv4(host: bindHost, port: bindPort),
@ -52,7 +66,15 @@ struct AppleIntelligenceServer: AsyncParsableCommand {
            print("API key authentication is enabled")
        }
        print("Server is ready to accept connections")
        print("")
        print("Available services:")
        print(" - Complete/StreamComplete: Text generation with Apple Intelligence")
        print(" - TextToSpeech: Convert text to spoken audio")
        print(" - ListVoices: List available TTS voices")
        print(" - Transcribe: Convert audio file to text")
        print(" - StreamTranscribe: Real-time speech-to-text")
        print("")
        print("Health check: grpcurl -plaintext \(bindHost):\(bindPort) appleintelligence.AppleIntelligenceService/Health")
        print("Press Ctrl+C to stop the server")

        try await server.serve()