Compare commits

...

5 Commits

Author SHA1 Message Date
Mathias Beaulieu-Duncan
851d6fef2b Update README with TTS, STT, and Vision documentation
- Document all gRPC API methods including new speech services
- Add Vision support section with image formats
- Add Text-to-Speech section with voice configuration
- Add Speech-to-Text section with file and streaming support
- Document supported audio formats (WAV, MP3, M4A, AAC, FLAC)
- Add streaming transcription protocol details
- Update grpcurl examples for all endpoints
- Add supported languages section
- Update project structure with new services
- Add troubleshooting for speech features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 03:41:59 -05:00
Mathias Beaulieu-Duncan
f7b8fbfa36 Fix STT streaming to receive audio from gRPC client
- Fix streaming STT to accept audio chunks from gRPC stream instead of local microphone
- Add proper PCM audio buffer conversion for 16-bit, 16kHz, mono audio
- Add StreamingResultHandler for safe callback handling
- Properly manage streaming session state and cleanup

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 03:40:46 -05:00
Mathias Beaulieu-Duncan
7655f1f0b8 Improve chat UX: paste images and language-aware TTS
- Remove clipboard button, add ⌘V paste support for images
- Add automatic language detection for TTS (French/English)
- Use appropriate voice based on message language
- Simplify TTS to use system default voices

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 03:38:36 -05:00
Mathias Beaulieu-Duncan
b754945923 Add Text-to-Speech and Speech-to-Text features
- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 02:57:30 -05:00
Mathias Beaulieu-Duncan
638656e7ca Add vision support, gRPC reflection toggle, and chat improvements
- Add Vision framework integration for image analysis (OCR, classification)
- Add image attachment support in chat UI with drag & drop
- Add recent images sidebar from Downloads/Desktop
- Add copy to clipboard button for assistant responses
- Add gRPC reflection service with toggle in settings
- Create proper .proto file and generate Swift code
- Add server restart when toggling reflection setting
- Fix port number formatting in settings (remove comma grouping)
- Update gRPC dependencies to v2.x

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 16:18:06 -05:00
22 changed files with 5869 additions and 499 deletions

View File

@@ -1,13 +1,22 @@
{
"originHash" : "73128af91f020c013de06bf6af5d06131ff05e38285118f5ff904ee06a3a6e24",
"originHash" : "1d1344dab64c4f153b2a1af227098e02f62d2c1f627c95dcad4304f1c16a97a3",
"pins" : [
{
"identity" : "grpc-swift",
"identity" : "grpc-swift-2",
"kind" : "remoteSourceControl",
"location" : "https://github.com/grpc/grpc-swift.git",
"location" : "https://github.com/grpc/grpc-swift-2.git",
"state" : {
"revision" : "adc18c3e1c55027d0ce43893897ac448e3f27ebe",
"version" : "2.2.3"
"revision" : "531924b28fde0cf7585123c781c6f55cc35ef7fc",
"version" : "2.2.1"
}
},
{
"identity" : "grpc-swift-extras",
"kind" : "remoteSourceControl",
"location" : "https://github.com/grpc/grpc-swift-extras.git",
"state" : {
"revision" : "7ab4a690ac09696689a9c4b99320af7ef809bb3d",
"version" : "2.1.1"
}
},
{
@@ -15,8 +24,8 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/grpc/grpc-swift-nio-transport.git",
"state" : {
"revision" : "ca2303eb7f3df556beafbba33a143ffa30d5b786",
"version" : "1.2.3"
"revision" : "dcfa8dc858bba5ded7a3760cede8c5fc03558a42",
"version" : "2.4.0"
}
},
{
@@ -24,8 +33,8 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/grpc/grpc-swift-protobuf.git",
"state" : {
"revision" : "53e89e3a5d417307f70a721c7b83e564fefb1e1c",
"version" : "1.3.1"
"revision" : "a1aa982cb2a276c72b478433eb75a4ec6508a277",
"version" : "2.1.2"
}
},
{
@@ -100,6 +109,15 @@
"version" : "4.2.0"
}
},
{
"identity" : "swift-distributed-tracing",
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-distributed-tracing.git",
"state" : {
"revision" : "baa932c1336f7894145cbaafcd34ce2dd0b77c97",
"version" : "1.3.1"
}
},
{
"identity" : "swift-http-structured-headers",
"kind" : "remoteSourceControl",
@@ -190,6 +208,15 @@
"version" : "1.33.3"
}
},
{
"identity" : "swift-service-context",
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-service-context.git",
"state" : {
"revision" : "1983448fefc717a2bc2ebde5490fe99873c5b8a6",
"version" : "1.2.1"
}
},
{
"identity" : "swift-service-lifecycle",
"kind" : "remoteSourceControl",

View File

@@ -11,9 +11,10 @@ let package = Package(
.executable(name: "AppleIntelligenceApp", targets: ["AppleIntelligenceApp"]),
],
dependencies: [
.package(url: "https://github.com/grpc/grpc-swift.git", from: "2.0.0"),
.package(url: "https://github.com/grpc/grpc-swift-nio-transport.git", from: "1.0.0"),
.package(url: "https://github.com/grpc/grpc-swift-protobuf.git", from: "1.0.0"),
.package(url: "https://github.com/grpc/grpc-swift-2.git", from: "2.0.0"),
.package(url: "https://github.com/grpc/grpc-swift-nio-transport.git", from: "2.0.0"),
.package(url: "https://github.com/grpc/grpc-swift-protobuf.git", from: "2.0.0"),
.package(url: "https://github.com/grpc/grpc-swift-extras.git", from: "2.0.0"),
.package(url: "https://github.com/apple/swift-protobuf.git", from: "1.28.0"),
.package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"),
],
@@ -22,11 +23,15 @@ let package = Package(
.target(
name: "AppleIntelligenceCore",
dependencies: [
.product(name: "GRPCCore", package: "grpc-swift"),
.product(name: "GRPCCore", package: "grpc-swift-2"),
.product(name: "GRPCNIOTransportHTTP2", package: "grpc-swift-nio-transport"),
.product(name: "GRPCProtobuf", package: "grpc-swift-protobuf"),
.product(name: "GRPCReflectionService", package: "grpc-swift-extras"),
.product(name: "SwiftProtobuf", package: "swift-protobuf"),
],
resources: [
.copy("Resources/apple_intelligence.pb")
],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-suppress-warnings"])
]

View File

@@ -0,0 +1,179 @@
syntax = "proto3";
package appleintelligence;
// Image data for vision requests
message ImageData {
bytes data = 1;
string filename = 2;
string mime_type = 3;
}
// Vision analysis results
message ImageAnalysis {
string text_content = 1;
repeated string labels = 2;
string description = 3;
}
// Completion request
message CompletionRequest {
string prompt = 1;
optional float temperature = 2;
optional int32 max_tokens = 3;
repeated ImageData images = 4;
bool include_analysis = 5;
}
// Completion response (non-streaming)
message CompletionResponse {
string id = 1;
string text = 2;
string finish_reason = 3;
repeated ImageAnalysis image_analyses = 4;
}
// Streaming completion chunk
message CompletionChunk {
string id = 1;
string delta = 2;
bool is_final = 3;
string finish_reason = 4;
repeated ImageAnalysis image_analyses = 5;
}
// Health check request
message HealthRequest {}
// Health check response
message HealthResponse {
bool healthy = 1;
string model_status = 2;
}
// ============ TEXT-TO-SPEECH ============
// Audio format enumeration
enum AudioFormat {
AUDIO_FORMAT_UNSPECIFIED = 0;
AUDIO_FORMAT_WAV = 1;
AUDIO_FORMAT_MP3 = 2;
}
// Voice configuration for TTS
message VoiceConfig {
string voice_identifier = 1;
optional float speaking_rate = 2; // 0.0-1.0, default 0.5
optional float pitch_multiplier = 3; // 0.5-2.0, default 1.0
optional float volume = 4; // 0.0-1.0, default 1.0
}
// TTS Request
message TextToSpeechRequest {
string text = 1;
AudioFormat output_format = 2;
optional VoiceConfig voice_config = 3;
}
// TTS Response
message TextToSpeechResponse {
bytes audio_data = 1;
AudioFormat format = 2;
int32 sample_rate = 3;
int32 channels = 4;
float duration_seconds = 5;
}
// List available voices request
message ListVoicesRequest {
optional string language_code = 1;
}
// Voice information
message VoiceInfo {
string identifier = 1;
string name = 2;
string language = 3;
bool is_premium = 4;
string gender = 5;
}
// List voices response
message ListVoicesResponse {
repeated VoiceInfo voices = 1;
}
// ============ SPEECH-TO-TEXT ============
// STT Configuration
message TranscriptionConfig {
optional string language_code = 1;
optional bool enable_punctuation = 2; // default true
optional bool enable_timestamps = 3; // default false
}
// Audio data for STT
message AudioInput {
bytes data = 1;
string mime_type = 2; // "audio/wav", "audio/mp3", "audio/m4a"
optional int32 sample_rate = 3;
optional int32 channels = 4;
}
// File-based transcription request
message TranscribeRequest {
AudioInput audio = 1;
optional TranscriptionConfig config = 2;
}
// Transcription segment with timing
message TranscriptionSegment {
string text = 1;
float start_time = 2;
float end_time = 3;
float confidence = 4;
}
// Transcription response
message TranscribeResponse {
string text = 1;
repeated TranscriptionSegment segments = 2;
string detected_language = 3;
float confidence = 4;
}
// Streaming STT request chunk
message StreamingTranscribeRequest {
oneof request {
TranscriptionConfig config = 1; // Send first to configure
bytes audio_chunk = 2; // Subsequent audio chunks
}
}
// Streaming STT response
message StreamingTranscribeResponse {
string partial_text = 1;
bool is_final = 2;
string final_text = 3;
repeated TranscriptionSegment segments = 4;
}
// Apple Intelligence Service
service AppleIntelligenceService {
// Single completion request
rpc Complete(CompletionRequest) returns (CompletionResponse);
// Streaming completion request
rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);
// Health check
rpc Health(HealthRequest) returns (HealthResponse);
// Text-to-Speech
rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);
rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);
// Speech-to-Text
rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);
rpc StreamTranscribe(stream StreamingTranscribeRequest) returns (stream StreamingTranscribeResponse);
}

README.md
View File

@@ -6,8 +6,11 @@ A Swift-based gRPC server that exposes Apple Intelligence (Foundation Models) ov
- **gRPC API** - Standard gRPC interface accessible from any language
- **Streaming Support** - Real-time token streaming for responsive UX
- **Vision Analysis** - Analyze images with text extraction, labeling, and descriptions
- **Text-to-Speech** - Convert text to audio (WAV/MP3) with multiple voices
- **Speech-to-Text** - Transcribe audio files or stream audio in real-time
- **Menu Bar App** - Native macOS app with system tray integration
- **Built-in Chat UI** - Test the AI directly from the app
- **Built-in Chat UI** - Test the AI directly from the app with voice input/output
- **API Key Auth** - Optional bearer token authentication
- **Auto-Start** - Launch at login and auto-start server options
@@ -45,7 +48,7 @@ swift build -c release --product AppleIntelligenceServer
1. Launch **Apple Intelligence Server** from Applications
2. Click the brain icon in the menu bar
3. Toggle **Start Server** to begin accepting connections
4. Use **Chat** to test the AI directly
4. Use **Chat** to test the AI directly (supports voice input/output)
5. Configure host, port, and API key in **Settings**
### CLI Server
@@ -63,10 +66,19 @@ GRPC_HOST=127.0.0.1 GRPC_PORT=8080 API_KEY=secret .build/release/AppleIntelligen
### Service Definition
```protobuf
service AppleIntelligence {
service AppleIntelligenceService {
// AI Completion
rpc Health(HealthRequest) returns (HealthResponse);
rpc Complete(CompletionRequest) returns (CompletionResponse);
rpc StreamComplete(CompletionRequest) returns (stream CompletionChunk);
// Text-to-Speech
rpc TextToSpeech(TextToSpeechRequest) returns (TextToSpeechResponse);
rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse);
// Speech-to-Text
rpc Transcribe(TranscribeRequest) returns (TranscribeResponse);
rpc StreamTranscribe(stream StreamingTranscribeRequest) returns (stream StreamingTranscribeResponse);
}
```
@@ -75,24 +87,134 @@ service AppleIntelligence {
| Method | Type | Description |
|--------|------|-------------|
| `Health` | Unary | Check server and model availability |
| `Complete` | Unary | Generate complete response |
| `Complete` | Unary | Generate complete response (supports images) |
| `StreamComplete` | Server Streaming | Stream tokens as they're generated |
| `TextToSpeech` | Unary | Convert text to audio |
| `ListVoices` | Unary | List available TTS voices |
| `Transcribe` | Unary | Transcribe audio file to text |
| `StreamTranscribe` | Bidirectional | Real-time audio transcription |
### Vision Support
The `Complete` and `StreamComplete` methods support image analysis:
```protobuf
message CompletionRequest {
string prompt = 1;
optional float temperature = 2;
optional int32 max_tokens = 3;
repeated ImageData images = 4; // Attach images for analysis
bool include_analysis = 5; // Return detailed analysis
}
message ImageData {
bytes data = 1;
string filename = 2;
string mime_type = 3; // image/png, image/jpeg, etc.
}
```
**Supported Image Formats:** PNG, JPEG, GIF, WebP, HEIC
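The `VisionAnalysisService` that produces the analysis fields is not included in this diff. As a rough sketch only (the `analyzeImage` helper is hypothetical, not the repository's implementation), the `text_content` and `labels` fields of `ImageAnalysis` can be derived with Apple's Vision framework roughly like this:
```swift
import Foundation
import Vision

/// Sketch: derive ImageAnalysis-style fields (text_content, labels) from raw image bytes.
/// Illustrative only; the project's VisionAnalysisService is not shown in this diff.
func analyzeImage(_ data: Data) throws -> (textContent: String, labels: [String]) {
    let handler = VNImageRequestHandler(data: data, options: [:])

    let textRequest = VNRecognizeTextRequest()
    textRequest.recognitionLevel = .accurate       // OCR for text_content

    let classifyRequest = VNClassifyImageRequest() // classification for labels

    try handler.perform([textRequest, classifyRequest])

    // Best candidate string per detected text region, joined line by line.
    let text = (textRequest.results ?? [])
        .compactMap { $0.topCandidates(1).first?.string }
        .joined(separator: "\n")

    // Keep reasonably confident classification identifiers only.
    let labels = (classifyRequest.results ?? [])
        .filter { $0.confidence > 0.3 }
        .map { $0.identifier }

    return (text, labels)
}
```
When `include_analysis` is set on the request, results of this shape come back in the `image_analyses` field of the response.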
### Text-to-Speech
```protobuf
message TextToSpeechRequest {
string text = 1;
AudioFormat output_format = 2; // WAV or MP3
optional VoiceConfig voice_config = 3;
}
message VoiceConfig {
string voice_identifier = 1; // Voice ID from ListVoices
optional float speaking_rate = 2; // 0.0-1.0, default 0.5
optional float pitch_multiplier = 3; // 0.5-2.0, default 1.0
optional float volume = 4; // 0.0-1.0, default 1.0
}
```
**Output Formats:** WAV, MP3
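The commits above note that TTS is built on `AVSpeechSynthesizer`; the service source itself is not part of this diff. Below is a minimal sketch of rendering an utterance to a PCM audio file with `AVSpeechSynthesizer.write(_:toBufferCallback:)`. The `SpeechFileRenderer` name and output URL handling are illustrative assumptions, and MP3 output would need an extra encoding step on top of this.
```swift
import AVFoundation

/// Sketch: render text to an audio file (e.g. a .wav or .caf URL) via AVSpeechSynthesizer.
/// Illustrative only; this is not the repository's TextToSpeechService.
final class SpeechFileRenderer {
    // Keep the synthesizer alive for the duration of the render.
    private let synthesizer = AVSpeechSynthesizer()
    private var outputFile: AVAudioFile?

    func render(text: String, voiceIdentifier: String? = nil, to url: URL,
                completion: @escaping (Error?) -> Void) {
        let utterance = AVSpeechUtterance(string: text)
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        if let id = voiceIdentifier {
            utterance.voice = AVSpeechSynthesisVoice(identifier: id)
        }

        synthesizer.write(utterance) { [weak self] buffer in
            guard let self, let pcm = buffer as? AVAudioPCMBuffer else { return }
            // A zero-length buffer marks the end of synthesis.
            guard pcm.frameLength > 0 else {
                completion(nil)
                return
            }
            do {
                if self.outputFile == nil {
                    // Create the output file lazily, reusing the synthesizer's PCM settings.
                    self.outputFile = try AVAudioFile(forWriting: url, settings: pcm.format.settings)
                }
                try self.outputFile?.write(from: pcm)
            } catch {
                completion(error)
            }
        }
    }
}
```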
### Speech-to-Text
#### File-based Transcription
```protobuf
message TranscribeRequest {
AudioInput audio = 1;
optional TranscriptionConfig config = 2;
}
message AudioInput {
bytes data = 1;
string mime_type = 2; // audio/wav, audio/mp3, etc.
optional int32 sample_rate = 3;
optional int32 channels = 4;
}
message TranscriptionConfig {
optional string language_code = 1; // e.g., "en-US", "fr-CA"
optional bool enable_punctuation = 2;
optional bool enable_timestamps = 3;
}
```
**Supported Audio Formats:** WAV, MP3, M4A, AAC, FLAC
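For a Swift client, the same request can be assembled with the SwiftProtobuf-generated message types. The `Appleintelligence_` prefix matches the generated code elsewhere in this diff, but the property names below assume the standard SwiftProtobuf field mapping and should be checked against `apple_intelligence.pb.swift`; the `makeTranscribeRequest` helper is purely illustrative.
```swift
import Foundation
// Assumes the SwiftProtobuf types generated from Proto/apple_intelligence.proto.

/// Sketch: build a file-based transcription request for the Transcribe RPC.
func makeTranscribeRequest(fileURL: URL, languageCode: String = "en-CA") throws -> Appleintelligence_TranscribeRequest {
    var audio = Appleintelligence_AudioInput()
    audio.data = try Data(contentsOf: fileURL)
    // Minimal MIME detection based on the file extension.
    audio.mimeType = fileURL.pathExtension.lowercased() == "mp3" ? "audio/mp3" : "audio/wav"

    var config = Appleintelligence_TranscriptionConfig()
    config.languageCode = languageCode
    config.enablePunctuation = true

    var request = Appleintelligence_TranscribeRequest()
    request.audio = audio
    request.config = config
    return request
}
```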
#### Streaming Transcription
For real-time transcription, use bidirectional streaming:
1. Send `TranscriptionConfig` first to configure the session
2. Send `audio_chunk` messages with PCM audio data (16-bit, 16kHz, mono); a conversion sketch follows the message definitions below
3. Receive `StreamingTranscribeResponse` with partial and final results
```protobuf
message StreamingTranscribeRequest {
oneof request {
TranscriptionConfig config = 1; // Send first
bytes audio_chunk = 2; // Then audio chunks
}
}
message StreamingTranscribeResponse {
string partial_text = 1;
bool is_final = 2;
string final_text = 3;
repeated TranscriptionSegment segments = 4;
}
```
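A client usually captures audio as 32-bit float at the hardware sample rate and has to downconvert it before sending `audio_chunk`s. The sketch below shows one way to do that with `AVAudioConverter`, plus how the two `StreamingTranscribeRequest` cases can be built. The helper names and the generated-type property names are assumptions (standard SwiftProtobuf mapping), not the repository's exact code.
```swift
import AVFoundation
import Foundation

/// Wire format expected by StreamTranscribe: 16-bit signed PCM, 16 kHz, mono.
let wireFormat = AVAudioFormat(commonFormat: .pcmFormatInt16,
                               sampleRate: 16_000,
                               channels: 1,
                               interleaved: true)!

/// Sketch: downconvert a captured buffer (e.g. from an AVAudioEngine input tap)
/// to the wire format and return the raw Int16 samples as Data.
func pcmChunkData(from buffer: AVAudioPCMBuffer) -> Data? {
    guard let converter = AVAudioConverter(from: buffer.format, to: wireFormat) else { return nil }

    let ratio = wireFormat.sampleRate / buffer.format.sampleRate
    let capacity = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 1
    guard let output = AVAudioPCMBuffer(pcmFormat: wireFormat, frameCapacity: capacity) else { return nil }

    var consumed = false
    var conversionError: NSError?
    let status = converter.convert(to: output, error: &conversionError) { _, inputStatus in
        // Feed the single input buffer once, then report that no more data is available.
        if consumed {
            inputStatus.pointee = .noDataNow
            return nil
        }
        consumed = true
        inputStatus.pointee = .haveData
        return buffer
    }
    guard status != .error, conversionError == nil, let samples = output.int16ChannelData else { return nil }

    // Interleaved mono: frameLength Int16 samples.
    return Data(bytes: samples[0], count: Int(output.frameLength) * MemoryLayout<Int16>.size)
}

/// Sketch: the first message carries the config, every later message an audio chunk.
func configMessage(language: String) -> Appleintelligence_StreamingTranscribeRequest {
    var config = Appleintelligence_TranscriptionConfig()
    config.languageCode = language
    var first = Appleintelligence_StreamingTranscribeRequest()
    first.config = config          // oneof case: config
    return first
}

func chunkMessage(_ samples: Data) -> Appleintelligence_StreamingTranscribeRequest {
    var next = Appleintelligence_StreamingTranscribeRequest()
    next.audioChunk = samples      // oneof case: audio_chunk
    return next
}
```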
### Quick Test with grpcurl
```bash
# Health check
grpcurl -plaintext localhost:50051 appleintelligence.AppleIntelligence/Health
grpcurl -plaintext localhost:50051 appleintelligence.AppleIntelligenceService/Health
# Non-streaming completion
# Text completion
grpcurl -plaintext \
-d '{"prompt": "What is 2 + 2?"}' \
localhost:50051 appleintelligence.AppleIntelligence/Complete
localhost:50051 appleintelligence.AppleIntelligenceService/Complete
# Streaming completion
grpcurl -plaintext \
-d '{"prompt": "Tell me a short story"}' \
localhost:50051 appleintelligence.AppleIntelligence/StreamComplete
localhost:50051 appleintelligence.AppleIntelligenceService/StreamComplete
# List TTS voices
grpcurl -plaintext \
-d '{"language_code": "en-US"}' \
localhost:50051 appleintelligence.AppleIntelligenceService/ListVoices
# Text-to-Speech (the audio_data field in the response comes back base64-encoded)
grpcurl -plaintext \
-d '{"text": "Hello world", "output_format": 1}' \
localhost:50051 appleintelligence.AppleIntelligenceService/TextToSpeech
# Transcribe audio file (base64 encode audio data)
grpcurl -plaintext \
-d '{"audio": {"data": "'$(base64 -i audio.wav)'", "mime_type": "audio/wav"}}' \
localhost:50051 appleintelligence.AppleIntelligenceService/Transcribe
```
## Configuration
@@ -103,6 +225,21 @@ grpcurl -plaintext \
| `GRPC_PORT` | `50051` | Port to listen on |
| `API_KEY` | *none* | Optional API key for authentication |
## Supported Languages
### Speech Recognition (STT)
- English (US, CA, GB, AU, IN, IE, ZA)
- French (CA, FR)
- Spanish (ES, MX)
- German, Italian, Portuguese, Japanese, Korean, Chinese
- And many more via macOS Speech framework
### Text-to-Speech (TTS)
All voices available in macOS System Settings, including:
- Premium voices (highest quality, require a download)
- Enhanced voices (good quality)
- Default/Compact voices (pre-installed)
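On the client side, the voices returned by `ListVoices` correspond to what `AVSpeechSynthesisVoice` reports locally. A short sketch of enumerating installed voices and their quality tier (the `installedVoices` helper is illustrative; the `.premium` quality value requires macOS 13 or later):
```swift
import AVFoundation

/// Sketch: enumerate installed voices, optionally filtered by a BCP-47 language prefix.
/// The quality tier maps roughly onto the is_premium flag exposed by ListVoices.
func installedVoices(languagePrefix: String? = nil)
    -> [(identifier: String, name: String, language: String, isPremium: Bool)] {
    AVSpeechSynthesisVoice.speechVoices()
        .filter { voice in
            guard let prefix = languagePrefix else { return true }
            return voice.language.hasPrefix(prefix)
        }
        .map { voice in
            (identifier: voice.identifier,
             name: voice.name,
             language: voice.language,
             isPremium: voice.quality == .premium)
        }
}

// Example: print the English voices available on this Mac.
// installedVoices(languagePrefix: "en").forEach { print($0.name, $0.language, $0.isPremium) }
```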
## Client Libraries
Connect from any language with gRPC support:
@@ -120,15 +257,21 @@ See [docs/grpc-client-guide.md](docs/grpc-client-guide.md) for detailed examples
```
apple-intelligence-grpc/
├── Package.swift
├── Proto/
│ └── apple_intelligence.proto # gRPC service definition
├── Sources/
│ ├── AppleIntelligenceCore/ # Shared gRPC service code
│ │ ├── Config.swift
│ │ ├── Services/
│ │ │ └── AppleIntelligenceService.swift
│ │ │ ├── AppleIntelligenceService.swift
│ │ │ ├── TextToSpeechService.swift
│ │ │ ├── SpeechToTextService.swift
│ │ │ └── VisionAnalysisService.swift
│ │ ├── Providers/
│ │ │ └── AppleIntelligenceProvider.swift
│ │ └── Generated/
│ │ └── AppleIntelligence.pb.swift
│ │ ├── apple_intelligence.pb.swift
│ │ └── apple_intelligence.grpc.swift
│ ├── AppleIntelligenceServer/ # CLI executable
│ │ └── main.swift
│ └── AppleIntelligenceApp/ # Menu bar app
@@ -182,6 +325,17 @@ See [docs/pipeline-configuration.md](docs/pipeline-configuration.md) for setup i
- Include the API key in the Authorization header: `Bearer YOUR_API_KEY`
- Verify the key matches what's configured in Settings
### Speech Recognition Not Working
- Grant microphone permission when prompted
- Check System Settings → Privacy & Security → Speech Recognition
- Ensure the language is supported
### TTS Voice Quality
- Download Premium/Enhanced voices from System Settings → Accessibility → Read & Speak
- Premium voices are larger (~150-500MB) but sound more natural
## License
MIT

View File

@@ -34,7 +34,7 @@ struct AppleIntelligenceApp: App {
.defaultSize(width: 500, height: 600)
Window("Settings", id: "settings") {
SettingsView(settings: settings)
SettingsView(settings: settings, serverManager: serverManager)
}
.windowResizability(.contentSize)
}

View File

@@ -19,6 +19,10 @@ final class AppSettings {
didSet { UserDefaults.standard.set(autoStartServer, forKey: "auto_start_server") }
}
var enableReflection: Bool {
didSet { UserDefaults.standard.set(enableReflection, forKey: "enable_reflection") }
}
var launchAtLogin: Bool {
didSet {
do {
@@ -39,6 +43,12 @@ final class AppSettings {
self.port = savedPort == 0 ? 50051 : savedPort
self.apiKey = UserDefaults.standard.string(forKey: "api_key") ?? ""
self.autoStartServer = UserDefaults.standard.bool(forKey: "auto_start_server")
// Default to true if not set
if UserDefaults.standard.object(forKey: "enable_reflection") == nil {
self.enableReflection = true
} else {
self.enableReflection = UserDefaults.standard.bool(forKey: "enable_reflection")
}
self.launchAtLogin = SMAppService.mainApp.status == .enabled
}
@@ -47,6 +57,7 @@ final class AppSettings {
port = 50051
apiKey = ""
autoStartServer = false
enableReflection = true
launchAtLogin = false
}
}

View File

@@ -1,4 +1,62 @@
import Foundation
import AppKit
/// Represents an attached image in a chat message
struct ImageAttachment: Identifiable, Equatable {
let id: UUID
let data: Data
let filename: String?
let thumbnail: NSImage?
let mimeType: String
init(data: Data, filename: String? = nil) {
self.id = UUID()
self.data = data
self.filename = filename
self.thumbnail = Self.generateThumbnail(from: data)
self.mimeType = Self.detectMimeType(from: data)
}
private static func generateThumbnail(from data: Data) -> NSImage? {
guard let image = NSImage(data: data) else { return nil }
let maxSize: CGFloat = 100
let ratio = min(maxSize / image.size.width, maxSize / image.size.height, 1.0)
let newSize = NSSize(
width: image.size.width * ratio,
height: image.size.height * ratio
)
let thumbnail = NSImage(size: newSize)
thumbnail.lockFocus()
image.draw(
in: NSRect(origin: .zero, size: newSize),
from: NSRect(origin: .zero, size: image.size),
operation: .copy,
fraction: 1.0
)
thumbnail.unlockFocus()
return thumbnail
}
private static func detectMimeType(from data: Data) -> String {
guard data.count >= 4 else { return "application/octet-stream" }
let bytes = [UInt8](data.prefix(4))
if bytes[0] == 0x89 && bytes[1] == 0x50 && bytes[2] == 0x4E && bytes[3] == 0x47 {
return "image/png"
} else if bytes[0] == 0xFF && bytes[1] == 0xD8 {
return "image/jpeg"
} else if bytes[0] == 0x47 && bytes[1] == 0x49 && bytes[2] == 0x46 {
return "image/gif"
}
return "image/png" // Default to PNG
}
static func == (lhs: ImageAttachment, rhs: ImageAttachment) -> Bool {
lhs.id == rhs.id
}
}
struct ChatMessage: Identifiable, Equatable {
let id: UUID
@@ -6,17 +64,19 @@ struct ChatMessage: Identifiable, Equatable {
var content: String
let timestamp: Date
var isStreaming: Bool
var images: [ImageAttachment]
enum Role: Equatable {
case user
case assistant
}
init(role: Role, content: String, isStreaming: Bool = false) {
init(role: Role, content: String, isStreaming: Bool = false, images: [ImageAttachment] = []) {
self.id = UUID()
self.role = role
self.content = content
self.timestamp = Date()
self.isStreaming = isStreaming
self.images = images
}
}

View File

@@ -2,6 +2,7 @@ import Foundation
import AppleIntelligenceCore
import GRPCCore
import GRPCNIOTransportHTTP2
import GRPCReflectionService
@MainActor
@Observable
@@ -51,6 +52,7 @@ final class ServerManager {
let host = settings.host
let port = settings.port
let apiKey = settings.apiKey.isEmpty ? nil : settings.apiKey
let enableReflection = settings.enableReflection
serverTask = Task {
do {
@@ -82,7 +84,16 @@
config: .defaults
)
let server = GRPCServer(transport: transport, services: [provider])
// Build services list with optional reflection
var services: [any RegistrableRPCService] = [provider]
if enableReflection {
if let descriptorURL = AppleIntelligenceResources.descriptorSetURL {
let reflectionService = try ReflectionService(descriptorSetFileURLs: [descriptorURL])
services.append(reflectionService)
}
}
let server = GRPCServer(transport: transport, services: services)
await MainActor.run {
self.state = .running(host: host, port: port)
@@ -113,6 +124,19 @@
state = .stopped
}
func restart() {
guard state.isRunning else { return }
// Stop the current server
stop()
state = .starting
// Start again after a short delay to allow port release
DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) { [weak self] in
self?.start()
}
}
func toggle() {
if state.isRunning {
stop()

View File

@@ -1,4 +1,8 @@
import Foundation
import AppKit
import AVFoundation
import Speech
import UniformTypeIdentifiers
import AppleIntelligenceCore
@MainActor
@@ -9,11 +13,113 @@ final class ChatViewModel {
var isLoading: Bool = false
var errorMessage: String?
// Image attachment state
var pendingImages: [ImageAttachment] = []
// Voice input/output state
var isRecording: Bool = false
var isSpeaking: Bool = false
var speakingMessageId: UUID?
var recordingLevel: Float = 0
private var service: AppleIntelligenceService?
private var ttsService: TextToSpeechService?
private var sttService: SpeechToTextService?
private var currentTask: Task<Void, Never>?
// Audio recording - multi-language support
private var audioEngine: AVAudioEngine?
private var speechRecognizers: [String: SFSpeechRecognizer] = [:]
private var activeRecognizer: SFSpeechRecognizer?
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
// Supported speech recognition languages (Canadian English and French)
private static let supportedLocales = ["en-CA", "fr-CA"]
var detectedLanguage: String = "en-CA"
// Audio playback - use direct speech synthesis for reliability
private var speechSynthesizer: AVSpeechSynthesizer?
private var speechDelegate: SpeechSynthesizerDelegate?
// Maximum images per message
private let maxImagesPerMessage = 5
// Supported image types
static let supportedImageTypes: [UTType] = [.png, .jpeg, .gif, .webP, .heic]
// Recent images from Downloads and Desktop
var recentImages: [URL] = []
func initialize() async {
service = await AppleIntelligenceService()
ttsService = TextToSpeechService()
sttService = await SpeechToTextService()
// Initialize speech recognizers for all supported locales
for localeId in Self.supportedLocales {
if let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeId)) {
speechRecognizers[localeId] = recognizer
}
}
// Default to system locale if supported, otherwise en-CA
let systemLocale = Locale.current.identifier
if speechRecognizers[systemLocale] != nil {
detectedLanguage = systemLocale
} else if systemLocale.starts(with: "fr") {
detectedLanguage = "fr-CA"
} else {
detectedLanguage = "en-CA"
}
activeRecognizer = speechRecognizers[detectedLanguage]
loadRecentImages()
}
// MARK: - Recent Images
func loadRecentImages() {
let fileManager = FileManager.default
let homeDir = fileManager.homeDirectoryForCurrentUser
let folders = [
homeDir.appendingPathComponent("Downloads"),
homeDir.appendingPathComponent("Desktop")
]
let imageExtensions = ["png", "jpg", "jpeg", "gif", "webp", "heic", "heif"]
var allImages: [(url: URL, date: Date)] = []
for folder in folders {
guard let contents = try? fileManager.contentsOfDirectory(
at: folder,
includingPropertiesForKeys: [.contentModificationDateKey, .isRegularFileKey],
options: [.skipsHiddenFiles]
) else { continue }
for url in contents {
let ext = url.pathExtension.lowercased()
guard imageExtensions.contains(ext) else { continue }
if let attributes = try? url.resourceValues(forKeys: [.contentModificationDateKey, .isRegularFileKey]),
attributes.isRegularFile == true,
let modDate = attributes.contentModificationDate {
allImages.append((url: url, date: modDate))
}
}
}
// Sort by date descending and take last 10
recentImages = allImages
.sorted { $0.date > $1.date }
.prefix(10)
.map { $0.url }
}
func addRecentImage(_ url: URL) {
addImage(from: url)
}
var isServiceAvailable: Bool {
@@ -22,19 +128,77 @@ final class ChatViewModel {
}
}
var canSend: Bool {
!inputText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || !pendingImages.isEmpty
}
// MARK: - Image Handling
func addImage(from url: URL) {
guard pendingImages.count < maxImagesPerMessage else {
errorMessage = "Maximum \(maxImagesPerMessage) images per message"
return
}
do {
let data = try Data(contentsOf: url)
let attachment = ImageAttachment(data: data, filename: url.lastPathComponent)
pendingImages.append(attachment)
errorMessage = nil
} catch {
errorMessage = "Failed to load image: \(error.localizedDescription)"
}
}
func addImageFromPasteboard() {
guard let image = NSPasteboard.general.readObjects(
forClasses: [NSImage.self],
options: nil
)?.first as? NSImage else {
return
}
guard pendingImages.count < maxImagesPerMessage else {
errorMessage = "Maximum \(maxImagesPerMessage) images per message"
return
}
if let tiffData = image.tiffRepresentation,
let bitmap = NSBitmapImageRep(data: tiffData),
let pngData = bitmap.representation(using: .png, properties: [:]) {
let attachment = ImageAttachment(data: pngData, filename: "pasted_image.png")
pendingImages.append(attachment)
errorMessage = nil
}
}
func removePendingImage(_ attachment: ImageAttachment) {
pendingImages.removeAll { $0.id == attachment.id }
}
func clearPendingImages() {
pendingImages.removeAll()
}
// MARK: - Messaging
func sendMessage() {
let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
guard !text.isEmpty else { return }
guard !text.isEmpty || !pendingImages.isEmpty else { return }
guard !isLoading else { return }
// Add user message
let userMessage = ChatMessage(role: .user, content: text)
// Capture images before clearing
let imagesToSend = pendingImages
// Add user message with images
let userMessage = ChatMessage(role: .user, content: text, images: imagesToSend)
messages.append(userMessage)
inputText = ""
pendingImages = []
errorMessage = nil
// Add placeholder for assistant response
var assistantMessage = ChatMessage(role: .assistant, content: "", isStreaming: true)
let assistantMessage = ChatMessage(role: .assistant, content: "", isStreaming: true)
messages.append(assistantMessage)
isLoading = true
@@ -45,14 +209,20 @@
throw AppleIntelligenceError.modelNotAvailable
}
// Convert attachments to service format
let images = imagesToSend.map { attachment in
(data: attachment.data, filename: attachment.filename)
}
let stream = await service.streamComplete(
prompt: text,
temperature: nil,
maxTokens: nil
maxTokens: nil,
images: images
)
var fullResponse = ""
for try await partialResponse in stream {
for try await (partialResponse, _) in stream {
fullResponse = partialResponse
// Update the last message (assistant's response)
if let index = messages.lastIndex(where: { $0.role == .assistant }) {
@@ -93,4 +263,279 @@
messages.removeAll()
errorMessage = nil
}
// MARK: - Voice Input (Speech-to-Text)
func toggleRecording() {
if isRecording {
stopRecording()
} else {
startRecording()
}
}
func startRecording() {
Task {
// Use nonisolated helper to avoid MainActor isolation inheritance in TCC callback
let status = await Self.requestSpeechAuthorization()
guard status == .authorized else {
self.errorMessage = "Speech recognition not authorized"
return
}
self.beginRecording()
}
}
/// Request speech recognition authorization without MainActor isolation.
/// This prevents Swift 6 strict concurrency from asserting MainActor in the TCC callback.
private nonisolated static func requestSpeechAuthorization() async -> SFSpeechRecognizerAuthorizationStatus {
await withCheckedContinuation { continuation in
SFSpeechRecognizer.requestAuthorization { status in
continuation.resume(returning: status)
}
}
}
/// Creates audio tap handler in nonisolated context to avoid MainActor isolation inheritance.
/// Audio taps run on CoreAudio's RealtimeMessenger queue, not MainActor.
private nonisolated static func createAudioTapHandler(
request: SFSpeechAudioBufferRecognitionRequest,
levelUpdater: RecordingLevelUpdater
) -> (AVAudioPCMBuffer, AVAudioTime) -> Void {
return { buffer, _ in
request.append(buffer)
// Calculate audio level for visual feedback
guard let channelData = buffer.floatChannelData else { return }
let channelDataValue = channelData.pointee
let channelDataValueArray = stride(from: 0, to: Int(buffer.frameLength), by: buffer.stride).map { channelDataValue[$0] }
let rms = sqrt(channelDataValueArray.map { $0 * $0 }.reduce(0, +) / Float(buffer.frameLength))
let avgPower = 20 * log10(rms)
let level = max(0, min(1, (avgPower + 50) / 50))
levelUpdater.updateLevel(level)
}
}
private func beginRecording() {
// Try to find an available recognizer
let recognizer = activeRecognizer ?? speechRecognizers.values.first { $0.isAvailable }
guard let speechRecognizer = recognizer, speechRecognizer.isAvailable else {
errorMessage = "Speech recognition not available"
return
}
// Stop any existing recording
if audioEngine != nil {
stopRecording()
}
audioEngine = AVAudioEngine()
recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
guard let audioEngine = audioEngine,
let recognitionRequest = recognitionRequest else {
errorMessage = "Failed to initialize audio engine"
return
}
recognitionRequest.shouldReportPartialResults = true
// Enable automatic language detection if available (macOS 14+)
if #available(macOS 14, *) {
recognitionRequest.addsPunctuation = true
}
let inputNode = audioEngine.inputNode
let recordingFormat = inputNode.outputFormat(forBus: 0)
// Use nonisolated static function to create audio tap handler
// This breaks MainActor isolation inheritance in the closure
let levelUpdater = RecordingLevelUpdater(viewModel: self)
let audioTapHandler = Self.createAudioTapHandler(request: recognitionRequest, levelUpdater: levelUpdater)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat, block: audioTapHandler)
audioEngine.prepare()
do {
try audioEngine.start()
isRecording = true
// Use a sendable wrapper for recognition results with language detection
let resultHandler = RecognitionResultHandler(viewModel: self)
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
resultHandler.handleResult(result: result, error: error)
}
} catch {
errorMessage = "Failed to start recording: \(error.localizedDescription)"
cleanupRecording()
}
}
/// Switch to a different language for speech recognition
func switchLanguage(to localeId: String) {
guard let recognizer = speechRecognizers[localeId] else { return }
activeRecognizer = recognizer
detectedLanguage = localeId
}
/// Get available languages for speech recognition
var availableLanguages: [(id: String, name: String)] {
speechRecognizers.keys.sorted().compactMap { localeId in
let locale = Locale(identifier: localeId)
let name = locale.localizedString(forIdentifier: localeId) ?? localeId
return (id: localeId, name: name)
}
}
func stopRecording() {
recognitionRequest?.endAudio()
cleanupRecording()
}
fileprivate func cleanupRecording() {
audioEngine?.stop()
audioEngine?.inputNode.removeTap(onBus: 0)
audioEngine = nil
recognitionRequest = nil
recognitionTask?.cancel()
recognitionTask = nil
isRecording = false
recordingLevel = 0
}
// MARK: - Voice Output (Text-to-Speech)
func speakMessage(_ message: ChatMessage) {
guard !message.content.isEmpty else { return }
// If already speaking this message, stop
if isSpeaking && speakingMessageId == message.id {
stopSpeaking()
return
}
// Stop any current speech
stopSpeaking()
speakingMessageId = message.id
isSpeaking = true
// Create utterance
let utterance = AVSpeechUtterance(string: message.content)
utterance.rate = AVSpeechUtteranceDefaultSpeechRate
utterance.pitchMultiplier = 1.0
utterance.volume = 1.0
// Detect message language and use appropriate voice
let isFrench = Self.detectFrench(message.content)
let language = isFrench ? "fr-CA" : "en-US"
utterance.voice = AVSpeechSynthesisVoice(language: language)
// Create synthesizer and delegate
let synthesizer = AVSpeechSynthesizer()
speechDelegate = SpeechSynthesizerDelegate { [weak self] in
Task { @MainActor in
self?.isSpeaking = false
self?.speakingMessageId = nil
self?.speechDelegate = nil
self?.speechSynthesizer = nil
}
}
synthesizer.delegate = speechDelegate
speechSynthesizer = synthesizer
// Speak directly
synthesizer.speak(utterance)
}
func stopSpeaking() {
speechSynthesizer?.stopSpeaking(at: .immediate)
speechSynthesizer = nil
speechDelegate = nil
isSpeaking = false
speakingMessageId = nil
}
/// Detect if text is likely French based on common words
private static func detectFrench(_ text: String) -> Bool {
let lowercased = text.lowercased()
let frenchIndicators = [
" le ", " la ", " les ", " un ", " une ", " des ",
" je ", " tu ", " il ", " elle ", " nous ", " vous ", " ils ", " elles ",
" est ", " sont ", " avoir ", " être ", " fait ", " faire ",
" que ", " qui ", " quoi ", " dans ", " pour ", " avec ", " sur ",
" ce ", " cette ", " ces ", " mon ", " ma ", " mes ",
" pas ", " plus ", " très ", " bien ", " aussi ",
"bonjour", "merci", "salut", "oui", "non", "peut",
" et ", " ou ", " mais ", " donc ", " car ",
"c'est", "j'ai", "qu'est", "n'est", "d'un", "l'on"
]
let frenchCount = frenchIndicators.filter { lowercased.contains($0) }.count
return frenchCount >= 2
}
}
// MARK: - Speech Synthesizer Delegate
private final class SpeechSynthesizerDelegate: NSObject, AVSpeechSynthesizerDelegate, @unchecked Sendable {
let onFinish: () -> Void
init(onFinish: @escaping () -> Void) {
self.onFinish = onFinish
}
func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
onFinish()
}
func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
onFinish()
}
}
// MARK: - Sendable Wrappers for Audio Callbacks
/// Wrapper to safely update recording level from audio callback thread
private final class RecordingLevelUpdater: @unchecked Sendable {
private weak var viewModel: ChatViewModel?
init(viewModel: ChatViewModel) {
self.viewModel = viewModel
}
func updateLevel(_ level: Float) {
Task { @MainActor [weak viewModel] in
viewModel?.recordingLevel = level
}
}
}
/// Wrapper to safely handle recognition results from Speech framework callback
private final class RecognitionResultHandler: @unchecked Sendable {
private weak var viewModel: ChatViewModel?
init(viewModel: ChatViewModel) {
self.viewModel = viewModel
}
func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
// Extract data before crossing actor boundary (SFSpeechRecognitionResult is not Sendable)
let transcription = result?.bestTranscription.formattedString
let isFinal = result?.isFinal ?? false
let hasError = error != nil
Task { @MainActor [weak viewModel] in
if let transcription = transcription {
viewModel?.inputText = transcription
}
if hasError || isFinal {
viewModel?.cleanupRecording()
}
}
}
}

View File

@@ -1,17 +1,33 @@
import SwiftUI
import UniformTypeIdentifiers
struct ChatView: View {
@Bindable var viewModel: ChatViewModel
@FocusState private var isInputFocused: Bool
@State private var isShowingFilePicker = false
@State private var isDragOver = false
@State private var previewImageURL: URL?
var body: some View {
HStack(spacing: 0) {
// Recent images sidebar
if !viewModel.recentImages.isEmpty {
recentImagesSidebar
Divider()
}
// Main chat area
VStack(spacing: 0) {
// Messages list
ScrollViewReader { proxy in
ScrollView {
LazyVStack(spacing: 12) {
ForEach(viewModel.messages) { message in
MessageBubble(message: message)
MessageBubble(
message: message,
isSpeaking: viewModel.speakingMessageId == message.id,
onSpeak: { viewModel.speakMessage(message) }
)
.id(message.id)
}
}
@@ -55,14 +71,260 @@ struct ChatView: View {
Divider()
// Pending images preview
if !viewModel.pendingImages.isEmpty {
pendingImagesView
}
// Input area
HStack(spacing: 12) {
inputArea
}
.onDrop(of: [.fileURL, .image], isTargeted: $isDragOver) { providers in
handleDrop(providers: providers)
return true
}
.overlay {
if isDragOver {
RoundedRectangle(cornerRadius: 8)
.stroke(Color.accentColor, lineWidth: 3)
.background(Color.accentColor.opacity(0.1))
.padding(4)
}
}
}
.frame(minWidth: 500, minHeight: 500)
.toolbar {
ToolbarItem(placement: .primaryAction) {
Button {
viewModel.loadRecentImages()
} label: {
Image(systemName: "arrow.clockwise")
}
.help("Refresh recent images")
}
ToolbarItem(placement: .primaryAction) {
Button {
viewModel.clearChat()
} label: {
Image(systemName: "trash")
}
.help("Clear chat")
.disabled(viewModel.messages.isEmpty)
}
}
.task {
await viewModel.initialize()
}
.onAppear {
NSApp.setActivationPolicy(.regular)
NSApp.activate(ignoringOtherApps: true)
DispatchQueue.main.asyncAfter(deadline: .now() + 0.2) {
if let window = NSApp.windows.first(where: { $0.title == "Chat" }) {
window.makeKeyAndOrderFront(nil)
}
isInputFocused = true
}
}
.onDisappear {
if NSApp.windows.filter({ $0.isVisible && $0.title != "" }).isEmpty {
NSApp.setActivationPolicy(.accessory)
}
}
.fileImporter(
isPresented: $isShowingFilePicker,
allowedContentTypes: ChatViewModel.supportedImageTypes,
allowsMultipleSelection: true
) { result in
switch result {
case .success(let urls):
for url in urls {
if url.startAccessingSecurityScopedResource() {
viewModel.addImage(from: url)
url.stopAccessingSecurityScopedResource()
}
}
case .failure(let error):
viewModel.errorMessage = error.localizedDescription
}
}
.sheet(item: $previewImageURL) { url in
ImagePreviewSheet(url: url) {
viewModel.addRecentImage(url)
previewImageURL = nil
} onCancel: {
previewImageURL = nil
}
}
}
// MARK: - Drag & Drop Handler
private func handleDrop(providers: [NSItemProvider]) {
for provider in providers {
// Try to load as file URL first
if provider.hasItemConformingToTypeIdentifier(UTType.fileURL.identifier) {
provider.loadItem(forTypeIdentifier: UTType.fileURL.identifier, options: nil) { item, error in
guard error == nil else { return }
if let data = item as? Data,
let url = URL(dataRepresentation: data, relativeTo: nil) {
DispatchQueue.main.async {
viewModel.addImage(from: url)
}
} else if let url = item as? URL {
DispatchQueue.main.async {
viewModel.addImage(from: url)
}
}
}
}
// Try to load as image data
else if provider.hasItemConformingToTypeIdentifier(UTType.image.identifier) {
provider.loadDataRepresentation(forTypeIdentifier: UTType.image.identifier) { data, error in
guard let data = data, error == nil else { return }
DispatchQueue.main.async {
let attachment = ImageAttachment(data: data, filename: "dropped_image.png")
if viewModel.pendingImages.count < 5 {
viewModel.pendingImages.append(attachment)
}
}
}
}
}
}
// MARK: - Recent Images Sidebar
private var recentImagesSidebar: some View {
VStack(alignment: .leading, spacing: 8) {
Text("Recent")
.font(.headline)
.foregroundStyle(.secondary)
.padding(.horizontal, 8)
.padding(.top, 8)
ScrollView {
LazyVStack(spacing: 8) {
ForEach(viewModel.recentImages, id: \.self) { url in
RecentImageThumbnail(url: url) {
previewImageURL = url
}
}
}
.padding(.horizontal, 8)
.padding(.bottom, 8)
}
}
.frame(width: 100)
.background(Color(nsColor: .controlBackgroundColor).opacity(0.5))
}
// MARK: - Pending Images Preview
private var pendingImagesView: some View {
ScrollView(.horizontal, showsIndicators: false) {
HStack(spacing: 8) {
ForEach(viewModel.pendingImages) { attachment in
pendingImageThumbnail(attachment)
}
}
.padding(.horizontal)
.padding(.vertical, 8)
}
.background(Color(nsColor: .controlBackgroundColor))
}
private func pendingImageThumbnail(_ attachment: ImageAttachment) -> some View {
ZStack(alignment: .topTrailing) {
if let thumbnail = attachment.thumbnail {
Image(nsImage: thumbnail)
.resizable()
.aspectRatio(contentMode: .fill)
.frame(width: 60, height: 60)
.clipShape(RoundedRectangle(cornerRadius: 8))
} else {
RoundedRectangle(cornerRadius: 8)
.fill(Color.gray.opacity(0.3))
.frame(width: 60, height: 60)
.overlay {
Image(systemName: "photo")
.foregroundStyle(.secondary)
}
}
Button {
viewModel.removePendingImage(attachment)
} label: {
Image(systemName: "xmark.circle.fill")
.font(.system(size: 16))
.foregroundStyle(.white)
.background(Circle().fill(.black.opacity(0.6)).frame(width: 18, height: 18))
}
.buttonStyle(.plain)
.offset(x: 6, y: -6)
}
}
// MARK: - Input Area
private var inputArea: some View {
HStack(spacing: 8) {
Button {
isShowingFilePicker = true
} label: {
Image(systemName: "photo.badge.plus")
.font(.title3)
.foregroundStyle(.secondary)
}
.buttonStyle(.plain)
.help("Add image (or paste with ⌘V)")
// Language toggle for speech recognition
Button {
// Toggle between en-CA and fr-CA
let newLang = viewModel.detectedLanguage == "en-CA" ? "fr-CA" : "en-CA"
viewModel.switchLanguage(to: newLang)
} label: {
Text(viewModel.detectedLanguage == "fr-CA" ? "FR" : "EN")
.font(.caption.bold())
.foregroundStyle(.secondary)
.frame(width: 24, height: 24)
.background(
RoundedRectangle(cornerRadius: 4)
.fill(Color.secondary.opacity(0.1))
)
}
.buttonStyle(.plain)
.help("Speech language: \(viewModel.detectedLanguage) (click to toggle)")
// Microphone button for voice input
Button {
viewModel.toggleRecording()
} label: {
ZStack {
if viewModel.isRecording {
// Recording indicator with level
Circle()
.fill(Color.red.opacity(0.3))
.frame(width: 28 + CGFloat(viewModel.recordingLevel) * 10,
height: 28 + CGFloat(viewModel.recordingLevel) * 10)
.animation(.easeInOut(duration: 0.1), value: viewModel.recordingLevel)
}
Image(systemName: viewModel.isRecording ? "mic.fill" : "mic")
.font(.title3)
.foregroundStyle(viewModel.isRecording ? .red : .secondary)
}
}
.buttonStyle(.plain)
.help(viewModel.isRecording ? "Stop recording" : "Voice input")
TextField("Message...", text: $viewModel.inputText, axis: .vertical)
.textFieldStyle(.plain)
.lineLimit(1...5)
.focused($isInputFocused)
.onSubmit {
if !viewModel.inputText.isEmpty {
if viewModel.canSend {
viewModel.sendMessage()
}
}
@@ -82,53 +344,104 @@
} label: {
Image(systemName: "arrow.up.circle.fill")
.font(.title2)
.foregroundStyle(viewModel.inputText.isEmpty ? .gray : .accentColor)
.foregroundStyle(viewModel.canSend ? Color.accentColor : Color.gray)
}
.buttonStyle(.plain)
.disabled(viewModel.inputText.isEmpty)
.disabled(!viewModel.canSend)
}
}
.padding()
}
.frame(minWidth: 400, minHeight: 500)
.toolbar {
ToolbarItem(placement: .primaryAction) {
Button {
viewModel.clearChat()
} label: {
Image(systemName: "trash")
}
.help("Clear chat")
.disabled(viewModel.messages.isEmpty)
.onPasteCommand(of: [.image, .png, .jpeg, .tiff]) { providers in
for provider in providers {
// Try to load as image
if provider.hasItemConformingToTypeIdentifier(UTType.image.identifier) {
provider.loadDataRepresentation(forTypeIdentifier: UTType.image.identifier) { data, _ in
if let data = data {
DispatchQueue.main.async {
let attachment = ImageAttachment(data: data, filename: "pasted_image.png")
if viewModel.pendingImages.count < 5 {
viewModel.pendingImages.append(attachment)
}
}
}
}
return
}
}
// Fallback to pasteboard check
viewModel.addImageFromPasteboard()
}
}
.task {
await viewModel.initialize()
}
.onAppear {
// Force the app to become active and accept keyboard input
NSApp.setActivationPolicy(.regular)
NSApp.activate(ignoringOtherApps: true)
DispatchQueue.main.asyncAfter(deadline: .now() + 0.2) {
// Make sure the window is key
if let window = NSApp.windows.first(where: { $0.title == "Chat" }) {
window.makeKeyAndOrderFront(nil)
}
isInputFocused = true
}
}
.onDisappear {
// Return to accessory mode when chat is closed
if NSApp.windows.filter({ $0.isVisible && $0.title != "" }).isEmpty {
NSApp.setActivationPolicy(.accessory)
// MARK: - Recent Image Thumbnail
struct RecentImageThumbnail: View {
let url: URL
let onTap: () -> Void
@State private var thumbnail: NSImage?
var body: some View {
Button(action: onTap) {
ZStack {
if let thumbnail = thumbnail {
Image(nsImage: thumbnail)
.resizable()
.aspectRatio(contentMode: .fill)
.frame(width: 80, height: 80)
.clipShape(RoundedRectangle(cornerRadius: 8))
} else {
RoundedRectangle(cornerRadius: 8)
.fill(Color.gray.opacity(0.3))
.frame(width: 80, height: 80)
.overlay {
ProgressView()
.scaleEffect(0.6)
}
}
}
}
.buttonStyle(.plain)
.help(url.lastPathComponent)
.task {
await loadThumbnail()
}
}
private func loadThumbnail() async {
guard let image = NSImage(contentsOf: url) else { return }
let maxSize: CGFloat = 80
let ratio = min(maxSize / image.size.width, maxSize / image.size.height, 1.0)
let newSize = NSSize(
width: image.size.width * ratio,
height: image.size.height * ratio
)
let thumb = NSImage(size: newSize)
thumb.lockFocus()
image.draw(
in: NSRect(origin: .zero, size: newSize),
from: NSRect(origin: .zero, size: image.size),
operation: .copy,
fraction: 1.0
)
thumb.unlockFocus()
await MainActor.run {
thumbnail = thumb
}
}
}
// MARK: - Message Bubble
struct MessageBubble: View {
let message: ChatMessage
var isSpeaking: Bool = false
var onSpeak: (() -> Void)? = nil
@State private var showCopied = false
var body: some View {
HStack {
@@ -137,6 +450,11 @@ struct MessageBubble: View {
}
VStack(alignment: message.role == .user ? .trailing : .leading, spacing: 4) {
if !message.images.isEmpty {
imageGrid
}
if !message.content.isEmpty {
Text(message.content)
.textSelection(.enabled)
.padding(.horizontal, 12)
@@ -144,6 +462,7 @@
.background(bubbleColor)
.foregroundStyle(message.role == .user ? .white : .primary)
.clipShape(RoundedRectangle(cornerRadius: 16))
}
if message.isStreaming {
HStack(spacing: 4) {
@@ -154,6 +473,45 @@
.foregroundStyle(.secondary)
}
}
// Action buttons for assistant messages
if message.role == .assistant && !message.content.isEmpty && !message.isStreaming {
HStack(spacing: 12) {
// Speaker button for TTS
Button {
onSpeak?()
} label: {
HStack(spacing: 4) {
Image(systemName: isSpeaking ? "stop.fill" : "speaker.wave.2")
Text(isSpeaking ? "Stop" : "Speak")
}
.font(.caption)
.foregroundStyle(isSpeaking ? .red : .secondary)
}
.buttonStyle(.plain)
// Copy button
Button {
NSPasteboard.general.clearContents()
NSPasteboard.general.setString(message.content, forType: .string)
showCopied = true
DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) {
showCopied = false
}
} label: {
HStack(spacing: 4) {
Image(systemName: showCopied ? "checkmark" : "doc.on.doc")
Text(showCopied ? "Copied" : "Copy")
}
.font(.caption)
.foregroundStyle(.secondary)
}
.buttonStyle(.plain)
Spacer()
}
.padding(.top, 2)
}
}
if message.role == .assistant {
@@ -162,6 +520,32 @@
}
}
@ViewBuilder
private var imageGrid: some View {
let columns = min(message.images.count, 3)
LazyVGrid(
columns: Array(repeating: GridItem(.flexible(), spacing: 4), count: columns),
spacing: 4
) {
ForEach(message.images) { attachment in
if let thumbnail = attachment.thumbnail {
Image(nsImage: thumbnail)
.resizable()
.aspectRatio(contentMode: .fill)
.frame(width: 80, height: 80)
.clipShape(RoundedRectangle(cornerRadius: 8))
}
}
}
.padding(4)
.background(
message.role == .user
? Color.accentColor.opacity(0.8)
: Color(nsColor: .controlBackgroundColor)
)
.clipShape(RoundedRectangle(cornerRadius: 12))
}
private var bubbleColor: Color {
switch message.role {
case .user:
@@ -171,3 +555,65 @@ struct MessageBubble: View {
}
}
}
// MARK: - Image Preview Sheet
struct ImagePreviewSheet: View {
let url: URL
let onConfirm: () -> Void
let onCancel: () -> Void
@State private var image: NSImage?
var body: some View {
VStack(spacing: 16) {
Text("Add Image")
.font(.headline)
if let image = image {
Image(nsImage: image)
.resizable()
.aspectRatio(contentMode: .fit)
.frame(maxWidth: 500, maxHeight: 400)
.clipShape(RoundedRectangle(cornerRadius: 8))
.shadow(radius: 4)
} else {
RoundedRectangle(cornerRadius: 8)
.fill(Color.gray.opacity(0.2))
.frame(width: 300, height: 200)
.overlay {
ProgressView()
}
}
Text(url.lastPathComponent)
.font(.caption)
.foregroundStyle(.secondary)
.lineLimit(1)
HStack(spacing: 16) {
Button("Cancel") {
onCancel()
}
.keyboardShortcut(.cancelAction)
Button("Add to Message") {
onConfirm()
}
.keyboardShortcut(.defaultAction)
.buttonStyle(.borderedProminent)
}
}
.padding(24)
.frame(minWidth: 400, minHeight: 300)
.task {
image = NSImage(contentsOf: url)
}
}
}
// MARK: - URL Identifiable Extension
extension URL: @retroactive Identifiable {
public var id: String { absoluteString }
}

View File

@@ -2,6 +2,7 @@ import SwiftUI
struct SettingsView: View {
@Bindable var settings: AppSettings
var serverManager: ServerManager?
@Environment(\.dismiss) private var dismiss
var body: some View {
@@ -10,7 +11,7 @@ struct SettingsView: View {
TextField("Host", text: $settings.host)
.textFieldStyle(.roundedBorder)
TextField("Port", value: $settings.port, format: .number)
TextField("Port", value: $settings.port, format: .number.grouping(.never))
.textFieldStyle(.roundedBorder)
SecureField("API Key (optional)", text: $settings.apiKey)
@@ -22,6 +23,13 @@
Toggle("Auto-start server on launch", isOn: $settings.autoStartServer)
}
Section("API") {
Toggle("Enable gRPC reflection", isOn: $settings.enableReflection)
.onChange(of: settings.enableReflection) { _, _ in
serverManager?.restart()
}
}
Section {
HStack {
Button("Reset to Defaults") {
@@ -38,7 +46,7 @@
}
}
.formStyle(.grouped)
.frame(width: 400, height: 310)
.frame(width: 400, height: 380)
.fixedSize()
.onAppear {
NSApp.setActivationPolicy(.regular)

View File

@@ -1,238 +0,0 @@
// DO NOT EDIT.
// swift-format-ignore-file
// swiftlint:disable all
//
// Generated protocol buffer code for apple_intelligence.proto
import Foundation
import SwiftProtobuf
// MARK: - Messages
struct Appleintelligence_CompletionRequest: Sendable, SwiftProtobuf.Message {
static let protoMessageName: String = "appleintelligence.CompletionRequest"
var prompt: String = ""
var temperature: Float = 0
var maxTokens: Int32 = 0
var hasTemperature: Bool = false
var hasMaxTokens: Bool = false
var unknownFields = SwiftProtobuf.UnknownStorage()
init() {}
init(prompt: String, temperature: Float? = nil, maxTokens: Int32? = nil) {
self.prompt = prompt
if let temp = temperature {
self.temperature = temp
self.hasTemperature = true
}
if let tokens = maxTokens {
self.maxTokens = tokens
self.hasMaxTokens = true
}
}
mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
while let fieldNumber = try decoder.nextFieldNumber() {
switch fieldNumber {
case 1: try decoder.decodeSingularStringField(value: &prompt)
case 2:
try decoder.decodeSingularFloatField(value: &temperature)
hasTemperature = true
case 3:
try decoder.decodeSingularInt32Field(value: &maxTokens)
hasMaxTokens = true
default: break
}
}
}
func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
if !prompt.isEmpty {
try visitor.visitSingularStringField(value: prompt, fieldNumber: 1)
}
if hasTemperature {
try visitor.visitSingularFloatField(value: temperature, fieldNumber: 2)
}
if hasMaxTokens {
try visitor.visitSingularInt32Field(value: maxTokens, fieldNumber: 3)
}
try unknownFields.traverse(visitor: &visitor)
}
static func ==(lhs: Self, rhs: Self) -> Bool {
lhs.prompt == rhs.prompt && lhs.temperature == rhs.temperature && lhs.maxTokens == rhs.maxTokens && lhs.unknownFields == rhs.unknownFields
}
func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
guard let other = message as? Self else { return false }
return self == other
}
}
struct Appleintelligence_CompletionResponse: Sendable, SwiftProtobuf.Message {
static let protoMessageName: String = "appleintelligence.CompletionResponse"
var id: String = ""
var text: String = ""
var finishReason: String = ""
var unknownFields = SwiftProtobuf.UnknownStorage()
init() {}
mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
while let fieldNumber = try decoder.nextFieldNumber() {
switch fieldNumber {
case 1: try decoder.decodeSingularStringField(value: &id)
case 2: try decoder.decodeSingularStringField(value: &text)
case 3: try decoder.decodeSingularStringField(value: &finishReason)
default: break
}
}
}
func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
if !id.isEmpty {
try visitor.visitSingularStringField(value: id, fieldNumber: 1)
}
if !text.isEmpty {
try visitor.visitSingularStringField(value: text, fieldNumber: 2)
}
if !finishReason.isEmpty {
try visitor.visitSingularStringField(value: finishReason, fieldNumber: 3)
}
try unknownFields.traverse(visitor: &visitor)
}
static func ==(lhs: Self, rhs: Self) -> Bool {
lhs.id == rhs.id && lhs.text == rhs.text && lhs.finishReason == rhs.finishReason && lhs.unknownFields == rhs.unknownFields
}
func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
guard let other = message as? Self else { return false }
return self == other
}
}
struct Appleintelligence_CompletionChunk: Sendable, SwiftProtobuf.Message {
static let protoMessageName: String = "appleintelligence.CompletionChunk"
var id: String = ""
var delta: String = ""
var isFinal: Bool = false
var finishReason: String = ""
var hasFinishReason: Bool {
!finishReason.isEmpty
}
var unknownFields = SwiftProtobuf.UnknownStorage()
init() {}
mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
while let fieldNumber = try decoder.nextFieldNumber() {
switch fieldNumber {
case 1: try decoder.decodeSingularStringField(value: &id)
case 2: try decoder.decodeSingularStringField(value: &delta)
case 3: try decoder.decodeSingularBoolField(value: &isFinal)
case 4: try decoder.decodeSingularStringField(value: &finishReason)
default: break
}
}
}
func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
if !id.isEmpty {
try visitor.visitSingularStringField(value: id, fieldNumber: 1)
}
if !delta.isEmpty {
try visitor.visitSingularStringField(value: delta, fieldNumber: 2)
}
if isFinal {
try visitor.visitSingularBoolField(value: isFinal, fieldNumber: 3)
}
if !finishReason.isEmpty {
try visitor.visitSingularStringField(value: finishReason, fieldNumber: 4)
}
try unknownFields.traverse(visitor: &visitor)
}
static func ==(lhs: Self, rhs: Self) -> Bool {
lhs.id == rhs.id && lhs.delta == rhs.delta && lhs.isFinal == rhs.isFinal && lhs.finishReason == rhs.finishReason && lhs.unknownFields == rhs.unknownFields
}
func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
guard let other = message as? Self else { return false }
return self == other
}
}
struct Appleintelligence_HealthRequest: Sendable, SwiftProtobuf.Message {
static let protoMessageName: String = "appleintelligence.HealthRequest"
var unknownFields = SwiftProtobuf.UnknownStorage()
init() {}
mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
while let _ = try decoder.nextFieldNumber() {}
}
func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
try unknownFields.traverse(visitor: &visitor)
}
static func ==(lhs: Self, rhs: Self) -> Bool {
lhs.unknownFields == rhs.unknownFields
}
func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
guard let other = message as? Self else { return false }
return self == other
}
}
struct Appleintelligence_HealthResponse: Sendable, SwiftProtobuf.Message {
static let protoMessageName: String = "appleintelligence.HealthResponse"
var healthy: Bool = false
var modelStatus: String = ""
var unknownFields = SwiftProtobuf.UnknownStorage()
init() {}
mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
while let fieldNumber = try decoder.nextFieldNumber() {
switch fieldNumber {
case 1: try decoder.decodeSingularBoolField(value: &healthy)
case 2: try decoder.decodeSingularStringField(value: &modelStatus)
default: break
}
}
}
func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
if healthy {
try visitor.visitSingularBoolField(value: healthy, fieldNumber: 1)
}
if !modelStatus.isEmpty {
try visitor.visitSingularStringField(value: modelStatus, fieldNumber: 2)
}
try unknownFields.traverse(visitor: &visitor)
}
static func ==(lhs: Self, rhs: Self) -> Bool {
lhs.healthy == rhs.healthy && lhs.modelStatus == rhs.modelStatus && lhs.unknownFields == rhs.unknownFields
}
func isEqualTo(message: any SwiftProtobuf.Message) -> Bool {
guard let other = message as? Self else { return false }
return self == other
}
}
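
Since these hand-written types conform to SwiftProtobuf.Message, the standard binary coding APIs apply to them. A minimal, illustrative round trip (not part of this change; the function name is made up for the sketch):

import Foundation
import SwiftProtobuf

// Illustrative only: encode a response via traverse(visitor:) above and
// decode it back via decodeMessage(decoder:).
func roundTripDemo() throws {
    var response = Appleintelligence_CompletionResponse()
    response.id = "demo-1"
    response.text = "Hello from Apple Intelligence"
    response.finishReason = "stop"

    let bytes = try response.serializedData()
    let decoded = try Appleintelligence_CompletionResponse(serializedData: bytes)
    precondition(decoded == response)
}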

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -4,63 +4,51 @@ import GRPCProtobuf
import GRPCNIOTransportHTTP2
/// gRPC service provider for Apple Intelligence
public struct AppleIntelligenceProvider: RegistrableRPCService {
/// Service descriptor
public static let serviceDescriptor = ServiceDescriptor(
fullyQualifiedService: "appleintelligence.AppleIntelligence"
)
/// Method descriptors
enum Methods {
static let complete = MethodDescriptor(
service: AppleIntelligenceProvider.serviceDescriptor,
method: "Complete"
)
static let streamComplete = MethodDescriptor(
service: AppleIntelligenceProvider.serviceDescriptor,
method: "StreamComplete"
)
static let health = MethodDescriptor(
service: AppleIntelligenceProvider.serviceDescriptor,
method: "Health"
)
}
public struct AppleIntelligenceProvider: Appleintelligence_AppleIntelligenceService.ServiceProtocol {
/// The underlying AI service
private let service: AppleIntelligenceService
/// Text-to-Speech service
private let ttsService: TextToSpeechService?
/// Speech-to-Text service
private let sttService: SpeechToTextService?
/// Optional API key for authentication
private let apiKey: String?
public init(service: AppleIntelligenceService, apiKey: String? = nil) {
public init(
service: AppleIntelligenceService,
ttsService: TextToSpeechService? = nil,
sttService: SpeechToTextService? = nil,
apiKey: String? = nil
) {
self.service = service
self.ttsService = ttsService
self.sttService = sttService
self.apiKey = apiKey
}
public func registerMethods<Transport: ServerTransport>(with router: inout RPCRouter<Transport>) {
// Register Complete method (unary)
router.registerHandler(
forMethod: Methods.complete,
deserializer: ProtobufDeserializer<Appleintelligence_CompletionRequest>(),
serializer: ProtobufSerializer<Appleintelligence_CompletionResponse>()
) { request, context in
try self.validateApiKey(metadata: request.metadata)
// MARK: - ServiceProtocol Implementation
// Collect the single message from the request stream
var requestMessage: Appleintelligence_CompletionRequest?
for try await message in request.messages {
requestMessage = message
break
public func complete(
request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_CompletionResponse> {
try validateApiKey(metadata: request.metadata)
let message = request.message
// Convert protobuf images to service format
let images = message.images.map { img in
(data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
}
guard let message = requestMessage else {
throw RPCError(code: .invalidArgument, message: "No request message received")
}
let text = try await self.service.complete(
let (text, analyses) = try await service.complete(
prompt: message.prompt,
temperature: message.hasTemperature ? message.temperature : nil,
maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil
maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
images: images
)
var response = Appleintelligence_CompletionResponse()
@ -68,42 +56,45 @@ public struct AppleIntelligenceProvider: RegistrableRPCService {
response.text = text
response.finishReason = "stop"
return StreamingServerResponse(single: ServerResponse(message: response))
// Include analysis results if requested
if message.includeAnalysis {
response.imageAnalyses = analyses.map { analysis in
var protoAnalysis = Appleintelligence_ImageAnalysis()
protoAnalysis.textContent = analysis.textContent
protoAnalysis.labels = analysis.labels
protoAnalysis.description_p = analysis.description
return protoAnalysis
}
}
// Register StreamComplete method (server streaming)
router.registerHandler(
forMethod: Methods.streamComplete,
deserializer: ProtobufDeserializer<Appleintelligence_CompletionRequest>(),
serializer: ProtobufSerializer<Appleintelligence_CompletionChunk>()
) { request, context in
try self.validateApiKey(metadata: request.metadata)
// Collect the single message from the request stream
var requestMessage: Appleintelligence_CompletionRequest?
for try await message in request.messages {
requestMessage = message
break
return ServerResponse(message: response)
}
guard let message = requestMessage else {
throw RPCError(code: .invalidArgument, message: "No request message received")
}
public func streamComplete(
request: GRPCCore.ServerRequest<Appleintelligence_CompletionRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_CompletionChunk> {
try validateApiKey(metadata: request.metadata)
let message = request.message
let completionId = UUID().uuidString
let prompt = message.prompt
let temperature = message.hasTemperature ? message.temperature : nil
let maxTokens = message.hasMaxTokens ? Int(message.maxTokens) : nil
// Convert protobuf images to service format
let images = message.images.map { img in
(data: img.data, filename: img.filename.isEmpty ? nil : img.filename)
}
return StreamingServerResponse { writer in
let stream = await self.service.streamComplete(
prompt: prompt,
temperature: temperature,
maxTokens: maxTokens
prompt: message.prompt,
temperature: message.hasTemperature ? message.temperature : nil,
maxTokens: message.hasMaxTokens ? Int(message.maxTokens) : nil,
images: images
)
var lastContent = ""
for try await partialResponse in stream {
var isFirstChunk = true
for try await (partialResponse, analyses) in stream {
// Calculate the delta (new text since last response)
let delta: String
if partialResponse.hasPrefix(lastContent) {
@ -113,12 +104,25 @@ public struct AppleIntelligenceProvider: RegistrableRPCService {
}
lastContent = partialResponse
if !delta.isEmpty {
if !delta.isEmpty || isFirstChunk {
var chunk = Appleintelligence_CompletionChunk()
chunk.id = completionId
chunk.delta = delta
chunk.isFinal = false
// Include analyses in first chunk if requested
if isFirstChunk && message.includeAnalysis, let analyses = analyses {
chunk.imageAnalyses = analyses.map { analysis in
var protoAnalysis = Appleintelligence_ImageAnalysis()
protoAnalysis.textContent = analysis.textContent
protoAnalysis.labels = analysis.labels
protoAnalysis.description_p = analysis.description
return protoAnalysis
}
}
try await writer.write(chunk)
isFirstChunk = false
}
}
@ -134,26 +138,229 @@ public struct AppleIntelligenceProvider: RegistrableRPCService {
}
}
// Register Health method (unary)
router.registerHandler(
forMethod: Methods.health,
deserializer: ProtobufDeserializer<Appleintelligence_HealthRequest>(),
serializer: ProtobufSerializer<Appleintelligence_HealthResponse>()
) { request, context in
// Consume request messages (empty for health check)
for try await _ in request.messages {}
let isHealthy = await self.service.isAvailable
let modelStatus = await self.service.getModelStatus()
public func health(
request: GRPCCore.ServerRequest<Appleintelligence_HealthRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_HealthResponse> {
let isHealthy = await service.isAvailable
let modelStatus = await service.getModelStatus()
var response = Appleintelligence_HealthResponse()
response.healthy = isHealthy
response.modelStatus = modelStatus
return StreamingServerResponse(single: ServerResponse(message: response))
return ServerResponse(message: response)
}
// MARK: - Text-to-Speech
public func textToSpeech(
request: GRPCCore.ServerRequest<Appleintelligence_TextToSpeechRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TextToSpeechResponse> {
try validateApiKey(metadata: request.metadata)
guard let ttsService = ttsService else {
throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
}
let message = request.message
// Convert proto config to service config
var config = SpeechConfig.default
if message.hasVoiceConfig {
let voiceConfig = message.voiceConfig
config = SpeechConfig(
voiceIdentifier: voiceConfig.voiceIdentifier.isEmpty ? nil : voiceConfig.voiceIdentifier,
speakingRate: voiceConfig.hasSpeakingRate ? voiceConfig.speakingRate : 0.5,
pitchMultiplier: voiceConfig.hasPitchMultiplier ? voiceConfig.pitchMultiplier : 1.0,
volume: voiceConfig.hasVolume ? voiceConfig.volume : 1.0
)
}
// Convert proto format to service format
let outputFormat: AudioOutputFormat
switch message.outputFormat {
case .wav, .unspecified:
outputFormat = .wav
case .mp3:
outputFormat = .mp3
case .UNRECOGNIZED:
outputFormat = .wav
}
do {
let result = try await ttsService.synthesize(
text: message.text,
config: config,
outputFormat: outputFormat
)
var response = Appleintelligence_TextToSpeechResponse()
response.audioData = result.audioData
response.format = outputFormat == .wav ? .wav : .mp3
response.sampleRate = Int32(result.sampleRate)
response.channels = Int32(result.channels)
response.durationSeconds = result.durationSeconds
return ServerResponse(message: response)
} catch let error as TextToSpeechError {
throw RPCError(code: .internalError, message: error.description)
}
}
public func listVoices(
request: GRPCCore.ServerRequest<Appleintelligence_ListVoicesRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_ListVoicesResponse> {
try validateApiKey(metadata: request.metadata)
guard let ttsService = ttsService else {
throw RPCError(code: .unavailable, message: "Text-to-Speech service not available")
}
let message = request.message
let languageCode = message.hasLanguageCode ? message.languageCode : nil
let voices = await ttsService.listVoices(languageCode: languageCode)
var response = Appleintelligence_ListVoicesResponse()
response.voices = voices.map { voice in
var protoVoice = Appleintelligence_VoiceInfo()
protoVoice.identifier = voice.identifier
protoVoice.name = voice.name
protoVoice.language = voice.language
protoVoice.isPremium = voice.isPremium
protoVoice.gender = voice.gender
return protoVoice
}
return ServerResponse(message: response)
}
// MARK: - Speech-to-Text
public func transcribe(
request: GRPCCore.ServerRequest<Appleintelligence_TranscribeRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.ServerResponse<Appleintelligence_TranscribeResponse> {
try validateApiKey(metadata: request.metadata)
guard let sttService = sttService else {
throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
}
let message = request.message
guard message.hasAudio else {
throw RPCError(code: .invalidArgument, message: "Audio data is required")
}
// Convert proto config to service config
var config = TranscriptionConfig.default
if message.hasConfig {
let protoConfig = message.config
config = TranscriptionConfig(
languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
)
}
do {
let result = try await sttService.transcribe(
audioData: message.audio.data,
mimeType: message.audio.mimeType,
config: config
)
var response = Appleintelligence_TranscribeResponse()
response.text = result.text
response.detectedLanguage = result.detectedLanguage
response.confidence = result.confidence
response.segments = result.segments.map { segment in
var protoSegment = Appleintelligence_TranscriptionSegment()
protoSegment.text = segment.text
protoSegment.startTime = segment.startTime
protoSegment.endTime = segment.endTime
protoSegment.confidence = segment.confidence
return protoSegment
}
return ServerResponse(message: response)
} catch let error as SpeechToTextError {
throw RPCError(code: .internalError, message: error.description)
}
}
public func streamTranscribe(
request: GRPCCore.StreamingServerRequest<Appleintelligence_StreamingTranscribeRequest>,
context: GRPCCore.ServerContext
) async throws -> GRPCCore.StreamingServerResponse<Appleintelligence_StreamingTranscribeResponse> {
try validateApiKey(metadata: request.metadata)
guard let sttService = sttService else {
throw RPCError(code: .unavailable, message: "Speech-to-Text service not available")
}
return StreamingServerResponse { writer in
var config = TranscriptionConfig.default
// Process incoming stream
for try await message in request.messages {
switch message.request {
case .config(let protoConfig):
// First message should be config
config = TranscriptionConfig(
languageCode: protoConfig.hasLanguageCode ? protoConfig.languageCode : nil,
enablePunctuation: protoConfig.hasEnablePunctuation ? protoConfig.enablePunctuation : true,
enableTimestamps: protoConfig.hasEnableTimestamps ? protoConfig.enableTimestamps : false
)
// Start streaming transcription
let stream = await sttService.streamTranscribe(config: config)
Task {
do {
for try await update in stream {
var response = Appleintelligence_StreamingTranscribeResponse()
response.partialText = update.partialText
response.isFinal = update.isFinal
if let finalText = update.finalText {
response.finalText = finalText
}
response.segments = update.segments.map { segment in
var protoSegment = Appleintelligence_TranscriptionSegment()
protoSegment.text = segment.text
protoSegment.startTime = segment.startTime
protoSegment.endTime = segment.endTime
protoSegment.confidence = segment.confidence
return protoSegment
}
try await writer.write(response)
}
} catch {
// Stream ended or error occurred
}
}
case .audioChunk(let chunk):
// Feed audio chunk to service
try await sttService.feedAudioChunk(chunk)
case .none:
break
}
}
// End streaming session
await sttService.endStreamingSession()
return [:]
}
}
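// Client protocol for StreamTranscribe, as implemented above:
//   1. The first StreamingTranscribeRequest must carry the `.config` case; it starts the
//      recognition session and the response stream of partial/final transcripts.
//   2. Every following message carries the `.audioChunk` case with raw PCM audio
//      (16-bit little-endian, mono, 16 kHz -- see SpeechToTextService.feedAudioChunk).
//   3. Closing the request stream ends the session and completes the response stream.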
// MARK: - Private Helpers
/// Validate API key if configured
private func validateApiKey(metadata: Metadata) throws {
guard let expectedKey = apiKey else {


@ -0,0 +1,9 @@
import Foundation
/// Helper for accessing bundled resources
public enum AppleIntelligenceResources {
/// URL to the protobuf descriptor set file for reflection
public static var descriptorSetURL: URL? {
Bundle.module.url(forResource: "apple_intelligence", withExtension: "pb")
}
}
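
A possible call site (assumed, not in the diff) that reads the bundled descriptor set into memory, e.g. to back the gRPC reflection service:

import Foundation

// Assumed helper: load the descriptor set bytes for reflection at startup.
func loadDescriptorSet() throws -> Data {
    guard let url = AppleIntelligenceResources.descriptorSetURL else {
        throw CocoaError(.fileReadNoSuchFile)
    }
    return try Data(contentsOf: url)
}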


@ -6,6 +6,7 @@ public enum AppleIntelligenceError: Error, CustomStringConvertible, Sendable {
case modelNotAvailable
case generationFailed(String)
case sessionCreationFailed
case imageAnalysisFailed(String)
public var description: String {
switch self {
@ -15,6 +16,8 @@ public enum AppleIntelligenceError: Error, CustomStringConvertible, Sendable {
return "Generation failed: \(reason)"
case .sessionCreationFailed:
return "Failed to create language model session"
case .imageAnalysisFailed(let reason):
return "Image analysis failed: \(reason)"
}
}
}
@ -24,6 +27,9 @@ public actor AppleIntelligenceService {
/// The language model session
private var session: LanguageModelSession?
/// Vision analysis service for image processing
private let visionService = VisionAnalysisService()
/// Whether the model is available
public private(set) var isAvailable: Bool = false
@ -60,21 +66,42 @@ public actor AppleIntelligenceService {
}
/// Generate a completion for the given prompt (non-streaming)
public func complete(prompt: String, temperature: Float?, maxTokens: Int?) async throws -> String {
public func complete(
prompt: String,
temperature: Float?,
maxTokens: Int?,
images: [(data: Data, filename: String?)] = []
) async throws -> (text: String, analyses: [VisionAnalysisResult]) {
guard isAvailable, let session = session else {
throw AppleIntelligenceError.modelNotAvailable
}
let response = try await session.respond(to: prompt)
return response.content
// Analyze images if provided
var analyses: [VisionAnalysisResult] = []
var enhancedPrompt = prompt
if !images.isEmpty {
do {
analyses = try await visionService.analyzeMultiple(images: images)
let analysesWithFilenames = zip(analyses, images).map { (result: $0.0, filename: $0.1.filename) }
let context = await visionService.formatAnalysesAsPromptContext(analyses: analysesWithFilenames)
enhancedPrompt = context + "\n\n" + prompt
} catch {
throw AppleIntelligenceError.imageAnalysisFailed(error.localizedDescription)
}
}
let response = try await session.respond(to: enhancedPrompt)
return (text: response.content, analyses: analyses)
}
/// Generate a streaming completion for the given prompt
public func streamComplete(
prompt: String,
temperature: Float?,
maxTokens: Int?
) -> AsyncThrowingStream<String, Error> {
maxTokens: Int?,
images: [(data: Data, filename: String?)] = []
) -> AsyncThrowingStream<(text: String, analyses: [VisionAnalysisResult]?), Error> {
AsyncThrowingStream { continuation in
Task {
guard self.isAvailable, let session = self.session else {
@ -82,10 +109,33 @@ public actor AppleIntelligenceService {
return
}
// Analyze images first if provided
var analyses: [VisionAnalysisResult] = []
var enhancedPrompt = prompt
if !images.isEmpty {
do {
let stream = session.streamResponse(to: prompt)
analyses = try await self.visionService.analyzeMultiple(images: images)
let analysesWithFilenames = zip(analyses, images).map { (result: $0.0, filename: $0.1.filename) }
let context = await self.visionService.formatAnalysesAsPromptContext(analyses: analysesWithFilenames)
enhancedPrompt = context + "\n\n" + prompt
} catch {
continuation.finish(throwing: AppleIntelligenceError.imageAnalysisFailed(error.localizedDescription))
return
}
}
do {
let stream = session.streamResponse(to: enhancedPrompt)
var isFirst = true
for try await partialResponse in stream {
continuation.yield(partialResponse.content)
// Include analyses only in first chunk
if isFirst {
continuation.yield((text: partialResponse.content, analyses: analyses))
isFirst = false
} else {
continuation.yield((text: partialResponse.content, analyses: nil))
}
}
continuation.finish()
} catch {


@ -0,0 +1,387 @@
import Foundation
import Speech
import AVFoundation
// MARK: - Result Types
/// Transcription result
public struct TranscriptionResult: Sendable {
public let text: String
public let segments: [TranscriptionSegmentResult]
public let detectedLanguage: String
public let confidence: Float
}
/// Individual transcription segment
public struct TranscriptionSegmentResult: Sendable {
public let text: String
public let startTime: Float
public let endTime: Float
public let confidence: Float
}
/// Streaming transcription update
public struct StreamingTranscriptionUpdate: Sendable {
public let partialText: String
public let isFinal: Bool
public let finalText: String?
public let segments: [TranscriptionSegmentResult]
}
/// Transcription configuration
public struct TranscriptionConfig: Sendable {
public var languageCode: String?
public var enablePunctuation: Bool
public var enableTimestamps: Bool
public static let `default` = TranscriptionConfig(
languageCode: nil,
enablePunctuation: true,
enableTimestamps: false
)
public init(
languageCode: String? = nil,
enablePunctuation: Bool = true,
enableTimestamps: Bool = false
) {
self.languageCode = languageCode
self.enablePunctuation = enablePunctuation
self.enableTimestamps = enableTimestamps
}
}
// MARK: - Errors
public enum SpeechToTextError: Error, CustomStringConvertible, Sendable {
case notAvailable
case authorizationDenied
case modelNotReady(String)
case transcriptionFailed(String)
case invalidAudioFormat
case audioProcessingFailed(String)
case unsupportedMimeType(String)
public var description: String {
switch self {
case .notAvailable: return "Speech recognition not available on this system"
case .authorizationDenied: return "Speech recognition authorization denied"
case .modelNotReady(let reason): return "Speech model not ready: \(reason)"
case .transcriptionFailed(let reason): return "Transcription failed: \(reason)"
case .invalidAudioFormat: return "Invalid audio format"
case .audioProcessingFailed(let reason): return "Audio processing failed: \(reason)"
case .unsupportedMimeType(let type): return "Unsupported audio MIME type: \(type)"
}
}
}
// MARK: - Service Actor
public actor SpeechToTextService {
/// Service availability status
public private(set) var isAvailable: Bool = false
/// Streaming session state
private var isStreamingActive: Bool = false
private var streamingRequest: SFSpeechAudioBufferRecognitionRequest?
private var streamingRecognizer: SFSpeechRecognizer?
private var streamingTask: SFSpeechRecognitionTask?
private var streamingContinuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation?
public init() async {
await checkAvailability()
}
// MARK: - Public API
/// Transcribe audio data (file-based)
public func transcribe(
audioData: Data,
mimeType: String,
config: TranscriptionConfig = .default
) async throws -> TranscriptionResult {
guard isAvailable else {
throw SpeechToTextError.notAvailable
}
// Convert audio data to file URL for processing
let tempURL = try createTempAudioFile(data: audioData, mimeType: mimeType)
defer { try? FileManager.default.removeItem(at: tempURL) }
return try await transcribeWithSFSpeechRecognizer(url: tempURL, config: config)
}
/// Stream transcription from audio chunks sent via gRPC
public func streamTranscribe(
config: TranscriptionConfig = .default
) -> AsyncThrowingStream<StreamingTranscriptionUpdate, Error> {
AsyncThrowingStream { continuation in
Task {
guard await self.isAvailable else {
continuation.finish(throwing: SpeechToTextError.notAvailable)
return
}
do {
try await self.startStreamingSession(config: config, continuation: continuation)
} catch {
continuation.finish(throwing: error)
}
}
}
}
/// Feed audio chunk for streaming transcription (PCM audio data)
public func feedAudioChunk(_ chunk: Data) async throws {
guard isStreamingActive, let request = streamingRequest else {
throw SpeechToTextError.transcriptionFailed("No active streaming session")
}
// Convert raw PCM data to audio buffer
// Assuming 16-bit PCM, mono, 16kHz (common format for speech)
let audioFormat = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: 16000,
channels: 1,
interleaved: true
)!
let frameCount = UInt32(chunk.count / 2) // 2 bytes per Int16 sample
guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameCount) else {
throw SpeechToTextError.audioProcessingFailed("Failed to create audio buffer")
}
buffer.frameLength = frameCount
// Copy data into buffer
chunk.withUnsafeBytes { rawPtr in
if let int16Ptr = rawPtr.baseAddress?.assumingMemoryBound(to: Int16.self) {
buffer.int16ChannelData?[0].update(from: int16Ptr, count: Int(frameCount))
}
}
request.append(buffer)
}
/// End streaming session
public func endStreamingSession() async {
streamingRequest?.endAudio()
isStreamingActive = false
streamingRequest = nil
streamingTask = nil
streamingRecognizer = nil
streamingContinuation = nil
}
/// Get status information
public func getStatus() -> String {
if isAvailable {
return "SFSpeechRecognizer available"
} else {
return "Speech recognition not available"
}
}
// MARK: - Private Implementation
private func checkAvailability() async {
// Check SFSpeechRecognizer availability
let status = SFSpeechRecognizer.authorizationStatus()
switch status {
case .authorized:
isAvailable = SFSpeechRecognizer.supportedLocales().count > 0
case .notDetermined:
// Request authorization
isAvailable = await withCheckedContinuation { continuation in
SFSpeechRecognizer.requestAuthorization { newStatus in
continuation.resume(returning: newStatus == .authorized)
}
}
default:
isAvailable = false
}
}
/// Create temporary audio file from data
private func createTempAudioFile(data: Data, mimeType: String) throws -> URL {
let ext = extensionForMimeType(mimeType)
let tempDir = FileManager.default.temporaryDirectory
let fileName = UUID().uuidString + "." + ext
let fileURL = tempDir.appendingPathComponent(fileName)
try data.write(to: fileURL)
return fileURL
}
/// Get file extension for MIME type
private func extensionForMimeType(_ mimeType: String) -> String {
switch mimeType.lowercased() {
case "audio/wav", "audio/wave", "audio/x-wav":
return "wav"
case "audio/mp3", "audio/mpeg":
return "mp3"
case "audio/m4a", "audio/mp4", "audio/x-m4a":
return "m4a"
case "audio/aac":
return "aac"
case "audio/flac":
return "flac"
default:
return "wav"
}
}
/// Transcribe using SFSpeechRecognizer
private func transcribeWithSFSpeechRecognizer(
url: URL,
config: TranscriptionConfig
) async throws -> TranscriptionResult {
let locale = Locale(identifier: config.languageCode ?? "en-US")
guard let recognizer = SFSpeechRecognizer(locale: locale) else {
throw SpeechToTextError.notAvailable
}
guard recognizer.isAvailable else {
throw SpeechToTextError.notAvailable
}
let request = SFSpeechURLRecognitionRequest(url: url)
request.shouldReportPartialResults = false
return try await withCheckedThrowingContinuation { continuation in
var hasResumed = false
recognizer.recognitionTask(with: request) { result, error in
guard !hasResumed else { return }
if let error = error {
hasResumed = true
continuation.resume(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
return
}
guard let result = result, result.isFinal else { return }
hasResumed = true
let transcription = result.bestTranscription
var segments: [TranscriptionSegmentResult] = []
if config.enableTimestamps {
for segment in transcription.segments {
segments.append(TranscriptionSegmentResult(
text: segment.substring,
startTime: Float(segment.timestamp),
endTime: Float(segment.timestamp + segment.duration),
confidence: segment.confidence
))
}
}
let transcriptionResult = TranscriptionResult(
text: transcription.formattedString,
segments: segments,
detectedLanguage: config.languageCode ?? "en-US",
confidence: segments.isEmpty ? 1.0 : segments.reduce(0) { $0 + $1.confidence } / Float(segments.count)
)
continuation.resume(returning: transcriptionResult)
}
}
}
/// Start streaming session for gRPC audio chunks
private func startStreamingSession(
config: TranscriptionConfig,
continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
) async throws {
let locale = Locale(identifier: config.languageCode ?? "en-US")
guard let recognizer = SFSpeechRecognizer(locale: locale) else {
throw SpeechToTextError.notAvailable
}
guard recognizer.isAvailable else {
throw SpeechToTextError.notAvailable
}
// Set up streaming state
isStreamingActive = true
streamingRecognizer = recognizer
streamingContinuation = continuation
let request = SFSpeechAudioBufferRecognitionRequest()
request.shouldReportPartialResults = true
streamingRequest = request
// Create wrapper to handle results safely
let service = self
let resultHandler = StreamingResultHandler(
config: config,
continuation: continuation,
onFinish: {
Task { await service.endStreamingSession() }
}
)
streamingTask = recognizer.recognitionTask(with: request) { result, error in
resultHandler.handleResult(result: result, error: error)
}
}
}
// MARK: - Streaming Result Handler
/// Wrapper to safely handle streaming recognition results
private final class StreamingResultHandler: @unchecked Sendable {
private let config: TranscriptionConfig
private let continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
private let onFinish: () -> Void
init(
config: TranscriptionConfig,
continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation,
onFinish: @escaping () -> Void
) {
self.config = config
self.continuation = continuation
self.onFinish = onFinish
}
func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
if let error = error {
continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
onFinish()
return
}
guard let result = result else { return }
let transcription = result.bestTranscription
var segments: [TranscriptionSegmentResult] = []
if config.enableTimestamps {
for segment in transcription.segments {
segments.append(TranscriptionSegmentResult(
text: segment.substring,
startTime: Float(segment.timestamp),
endTime: Float(segment.timestamp + segment.duration),
confidence: segment.confidence
))
}
}
let update = StreamingTranscriptionUpdate(
partialText: transcription.formattedString,
isFinal: result.isFinal,
finalText: result.isFinal ? transcription.formattedString : nil,
segments: segments
)
continuation.yield(update)
if result.isFinal {
continuation.finish()
onFinish()
}
}
}
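
For callers of feedAudioChunk, a minimal sketch (assumptions: samples are already mono Float values in -1...1 at 16 kHz; the helper name is illustrative) of packing them into the 16-bit little-endian PCM layout the service expects:

import Foundation

// Illustrative helper, not part of the diff: pack mono Float samples (-1...1, 16 kHz)
// into 16-bit little-endian PCM Data as consumed by SpeechToTextService.feedAudioChunk.
func pcm16Data(from samples: [Float]) -> Data {
    var data = Data(capacity: samples.count * MemoryLayout<Int16>.size)
    for sample in samples {
        let clamped = max(-1.0, min(1.0, sample))
        let value = Int16(clamped * Float(Int16.max))
        withUnsafeBytes(of: value.littleEndian) { data.append(contentsOf: $0) }
    }
    return data
}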


@ -0,0 +1,280 @@
import Foundation
import AVFoundation
// MARK: - Result Types
/// Result of text-to-speech synthesis
public struct TextToSpeechResult: Sendable {
public let audioData: Data
public let format: AudioOutputFormat
public let sampleRate: Int
public let channels: Int
public let durationSeconds: Float
}
/// Supported output formats
public enum AudioOutputFormat: Sendable {
case wav
case mp3
}
/// Voice information
public struct VoiceDescription: Sendable {
public let identifier: String
public let name: String
public let language: String
public let isPremium: Bool
public let gender: String
}
/// Configuration for speech synthesis
public struct SpeechConfig: Sendable {
public var voiceIdentifier: String?
public var speakingRate: Float // 0.0 - 1.0
public var pitchMultiplier: Float // 0.5 - 2.0
public var volume: Float // 0.0 - 1.0
public static let `default` = SpeechConfig(
voiceIdentifier: nil,
speakingRate: 0.5,
pitchMultiplier: 1.0,
volume: 1.0
)
public init(
voiceIdentifier: String? = nil,
speakingRate: Float = 0.5,
pitchMultiplier: Float = 1.0,
volume: Float = 1.0
) {
self.voiceIdentifier = voiceIdentifier
self.speakingRate = speakingRate
self.pitchMultiplier = pitchMultiplier
self.volume = volume
}
}
// MARK: - Errors
public enum TextToSpeechError: Error, CustomStringConvertible, Sendable {
case invalidVoice(String)
case synthesisFailure(String)
case encodingFailure(String)
case noAudioGenerated
case unsupportedFormat
public var description: String {
switch self {
case .invalidVoice(let id): return "Invalid voice identifier: \(id)"
case .synthesisFailure(let reason): return "Speech synthesis failed: \(reason)"
case .encodingFailure(let reason): return "Audio encoding failed: \(reason)"
case .noAudioGenerated: return "No audio was generated"
case .unsupportedFormat: return "Unsupported audio format"
}
}
}
// MARK: - Service Actor
public actor TextToSpeechService {
/// Keep strong reference to synthesizer during synthesis
private var activeSynthesizer: AVSpeechSynthesizer?
public init() {}
// MARK: - Public API
/// Synthesize text to speech
public func synthesize(
text: String,
config: SpeechConfig = .default,
outputFormat: AudioOutputFormat = .wav
) async throws -> TextToSpeechResult {
// Create utterance
let utterance = AVSpeechUtterance(string: text)
// Configure voice
if let voiceId = config.voiceIdentifier {
if let voice = AVSpeechSynthesisVoice(identifier: voiceId) {
utterance.voice = voice
} else {
throw TextToSpeechError.invalidVoice(voiceId)
}
} else {
// Use default English voice
utterance.voice = AVSpeechSynthesisVoice(language: "en-US")
}
// Configure speech parameters
utterance.rate = config.speakingRate
utterance.pitchMultiplier = config.pitchMultiplier
utterance.volume = config.volume
// Collect PCM data
let pcmData = try await collectPCMData(utterance: utterance)
// Convert to requested format
let audioData: Data
switch outputFormat {
case .wav:
audioData = createWAVData(from: pcmData)
case .mp3:
// Use WAV as fallback (MP3 encoding requires external library)
audioData = createWAVData(from: pcmData)
}
// Calculate duration
let bytesPerSample = 2 // Int16
let totalSamples = pcmData.samples.count / bytesPerSample / pcmData.channelCount
let duration = Float(totalSamples) / Float(pcmData.sampleRate)
return TextToSpeechResult(
audioData: audioData,
format: outputFormat,
sampleRate: Int(pcmData.sampleRate),
channels: pcmData.channelCount,
durationSeconds: duration
)
}
/// List available voices
public func listVoices(languageCode: String? = nil) -> [VoiceDescription] {
let voices = AVSpeechSynthesisVoice.speechVoices()
let filtered: [AVSpeechSynthesisVoice]
if let lang = languageCode {
filtered = voices.filter { $0.language.hasPrefix(lang) }
} else {
filtered = voices
}
return filtered.map { voice in
VoiceDescription(
identifier: voice.identifier,
name: voice.name,
language: voice.language,
isPremium: voice.quality == .enhanced || voice.quality == .premium,
gender: genderString(for: voice)
)
}
}
// MARK: - Private Implementation
/// PCM buffer data for internal processing
private struct PCMBufferData: Sendable {
let samples: Data
let sampleRate: Double
let channelCount: Int
}
/// Collect PCM data from synthesizer using write callback
private func collectPCMData(
utterance: AVSpeechUtterance
) async throws -> PCMBufferData {
// Create and store synthesizer to keep strong reference during synthesis
let synthesizer = AVSpeechSynthesizer()
self.activeSynthesizer = synthesizer
defer { self.activeSynthesizer = nil }
return try await withCheckedThrowingContinuation { continuation in
var pcmData = Data()
var sampleRate: Double = 0
var channelCount: Int = 0
var hasResumed = false
synthesizer.write(utterance) { buffer in
// The synthesizer delivers a zero-length (or non-PCM) buffer when synthesis
// finishes, so treat either as the end-of-audio signal and resume exactly once.
guard let pcmBuffer = buffer as? AVAudioPCMBuffer, pcmBuffer.frameLength > 0 else {
if !hasResumed {
hasResumed = true
if pcmData.isEmpty {
continuation.resume(throwing: TextToSpeechError.noAudioGenerated)
} else {
continuation.resume(returning: PCMBufferData(
samples: pcmData,
sampleRate: sampleRate,
channelCount: channelCount
))
}
}
return
}
// Store format from first buffer
if sampleRate == 0 {
sampleRate = pcmBuffer.format.sampleRate
channelCount = Int(pcmBuffer.format.channelCount)
}
// Convert float samples to Int16 PCM
if let channelData = pcmBuffer.floatChannelData {
let frameCount = Int(pcmBuffer.frameLength)
for frame in 0..<frameCount {
for channel in 0..<channelCount {
let sample = channelData[channel][frame]
let clampedSample = max(-1.0, min(1.0, sample))
let int16Sample = Int16(clampedSample * Float(Int16.max))
withUnsafeBytes(of: int16Sample.littleEndian) { bytes in
pcmData.append(contentsOf: bytes)
}
}
}
}
}
}
}
/// Create WAV data from PCM buffer data
private func createWAVData(from pcmData: PCMBufferData) -> Data {
let bitsPerSample = 16
let sampleRate = Int(pcmData.sampleRate)
let channels = pcmData.channelCount
let dataSize = pcmData.samples.count
var header = Data()
// RIFF header
header.append(contentsOf: "RIFF".utf8)
let fileSize = UInt32(dataSize + 36)
withUnsafeBytes(of: fileSize.littleEndian) { header.append(contentsOf: $0) }
header.append(contentsOf: "WAVE".utf8)
// fmt subchunk
header.append(contentsOf: "fmt ".utf8)
let subchunk1Size = UInt32(16)
withUnsafeBytes(of: subchunk1Size.littleEndian) { header.append(contentsOf: $0) }
let audioFormat = UInt16(1) // PCM
withUnsafeBytes(of: audioFormat.littleEndian) { header.append(contentsOf: $0) }
let numChannels = UInt16(channels)
withUnsafeBytes(of: numChannels.littleEndian) { header.append(contentsOf: $0) }
let sampleRateU32 = UInt32(sampleRate)
withUnsafeBytes(of: sampleRateU32.littleEndian) { header.append(contentsOf: $0) }
let byteRate = UInt32(sampleRate * channels * bitsPerSample / 8)
withUnsafeBytes(of: byteRate.littleEndian) { header.append(contentsOf: $0) }
let blockAlign = UInt16(channels * bitsPerSample / 8)
withUnsafeBytes(of: blockAlign.littleEndian) { header.append(contentsOf: $0) }
let bitsPerSampleU16 = UInt16(bitsPerSample)
withUnsafeBytes(of: bitsPerSampleU16.littleEndian) { header.append(contentsOf: $0) }
// data subchunk
header.append(contentsOf: "data".utf8)
let dataU32 = UInt32(dataSize)
withUnsafeBytes(of: dataU32.littleEndian) { header.append(contentsOf: $0) }
return header + pcmData.samples
}
/// Get gender string for voice
private func genderString(for voice: AVSpeechSynthesisVoice) -> String {
switch voice.gender {
case .male: return "male"
case .female: return "female"
case .unspecified: return "unspecified"
@unknown default: return "unknown"
}
}
}
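
A possible call site (assumed, not part of this change) that exercises synthesize end to end and writes the resulting WAV bytes to a temporary file:

import Foundation

// Illustrative usage of TextToSpeechService with the default configuration.
func demoSynthesis() async throws {
    let tts = TextToSpeechService()
    let result = try await tts.synthesize(
        text: "Hello from the Apple Intelligence server",
        config: .default,
        outputFormat: .wav
    )
    let url = FileManager.default.temporaryDirectory.appendingPathComponent("demo.wav")
    try result.audioData.write(to: url)
    print("Wrote \(result.durationSeconds)s of audio (\(result.sampleRate) Hz) to \(url.path)")
}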


@ -0,0 +1,243 @@
import Foundation
import Vision
import CoreImage
#if canImport(AppKit)
import AppKit
#endif
/// Result of Vision framework analysis on an image
public struct VisionAnalysisResult: Sendable {
public let textContent: String
public let labels: [String]
public let description: String
public init(textContent: String = "", labels: [String] = [], description: String = "") {
self.textContent = textContent
self.labels = labels
self.description = description
}
/// Format analysis for LLM context
public func formatAsContext(imageIndex: Int, filename: String?) -> String {
var parts: [String] = []
let imageName = filename ?? "Image \(imageIndex + 1)"
if !textContent.isEmpty {
parts.append("Text: \"\(textContent)\"")
}
if !labels.isEmpty {
parts.append("Objects: \(labels.joined(separator: ", "))")
}
if parts.isEmpty {
return "\(imageName): No content detected"
}
return "\(imageName): \(parts.joined(separator: " | "))"
}
}
/// Errors from Vision analysis
public enum VisionAnalysisError: Error, CustomStringConvertible, Sendable {
case invalidImageData
case analysisFailure(String)
case unsupportedFormat
public var description: String {
switch self {
case .invalidImageData:
return "Invalid or corrupted image data"
case .analysisFailure(let reason):
return "Vision analysis failed: \(reason)"
case .unsupportedFormat:
return "Unsupported image format"
}
}
}
/// Service for analyzing images using Apple's Vision framework
public actor VisionAnalysisService {
/// Configuration for which analyses to perform
public struct AnalysisOptions: Sendable {
public var performOCR: Bool
public var performClassification: Bool
public init(performOCR: Bool = true, performClassification: Bool = true) {
self.performOCR = performOCR
self.performClassification = performClassification
}
public static let all = AnalysisOptions()
public static let textOnly = AnalysisOptions(performOCR: true, performClassification: false)
}
public init() {}
/// Analyze a single image
public func analyze(
imageData: Data,
options: AnalysisOptions = .all
) async throws -> VisionAnalysisResult {
guard let cgImage = createCGImage(from: imageData) else {
throw VisionAnalysisError.invalidImageData
}
var textContent = ""
var labels: [String] = []
// Perform OCR
if options.performOCR {
textContent = try await performTextRecognition(on: cgImage)
}
// Perform image classification
if options.performClassification {
labels = try await performImageClassification(on: cgImage)
}
// Build description
var descriptionParts: [String] = []
if !textContent.isEmpty {
let truncatedText = textContent.count > 200
? String(textContent.prefix(200)) + "..."
: textContent
descriptionParts.append("Contains text: \"\(truncatedText)\"")
}
if !labels.isEmpty {
descriptionParts.append("Shows: \(labels.prefix(5).joined(separator: ", "))")
}
let description = descriptionParts.isEmpty
? "Image with no recognizable content"
: descriptionParts.joined(separator: ". ")
return VisionAnalysisResult(
textContent: textContent,
labels: labels,
description: description
)
}
/// Analyze multiple images
public func analyzeMultiple(
images: [(data: Data, filename: String?)],
options: AnalysisOptions = .all
) async throws -> [VisionAnalysisResult] {
var results: [VisionAnalysisResult] = []
for image in images {
let result = try await analyze(imageData: image.data, options: options)
results.append(result)
}
return results
}
/// Format multiple analyses as a combined context string for LLM
public func formatAnalysesAsPromptContext(
analyses: [(result: VisionAnalysisResult, filename: String?)]
) -> String {
guard !analyses.isEmpty else { return "" }
var lines: [String] = ["[Image Analysis]"]
for (index, analysis) in analyses.enumerated() {
lines.append(analysis.result.formatAsContext(
imageIndex: index,
filename: analysis.filename
))
}
lines.append("[End Image Analysis]")
return lines.joined(separator: "\n")
}
// MARK: - Private Methods
private func createCGImage(from data: Data) -> CGImage? {
#if canImport(AppKit)
guard let nsImage = NSImage(data: data),
let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
// Try CIImage as fallback
guard let ciImage = CIImage(data: data) else { return nil }
let context = CIContext()
return context.createCGImage(ciImage, from: ciImage.extent)
}
return cgImage
#else
guard let ciImage = CIImage(data: data) else { return nil }
let context = CIContext()
return context.createCGImage(ciImage, from: ciImage.extent)
#endif
}
private func performTextRecognition(on image: CGImage) async throws -> String {
try await withCheckedThrowingContinuation { continuation in
let request = VNRecognizeTextRequest { request, error in
if let error = error {
continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
return
}
guard let observations = request.results as? [VNRecognizedTextObservation] else {
continuation.resume(returning: "")
return
}
let recognizedText = observations.compactMap { observation in
observation.topCandidates(1).first?.string
}.joined(separator: "\n")
continuation.resume(returning: recognizedText)
}
request.recognitionLevel = .accurate
request.usesLanguageCorrection = true
let handler = VNImageRequestHandler(cgImage: image, options: [:])
do {
try handler.perform([request])
} catch {
continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
}
}
}
private func performImageClassification(on image: CGImage) async throws -> [String] {
try await withCheckedThrowingContinuation { continuation in
let request = VNClassifyImageRequest { request, error in
if let error = error {
continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
return
}
guard let observations = request.results as? [VNClassificationObservation] else {
continuation.resume(returning: [])
return
}
// Filter to high-confidence labels and take top 10
let labels = observations
.filter { $0.confidence > 0.3 }
.prefix(10)
.map { $0.identifier.replacingOccurrences(of: "_", with: " ") }
continuation.resume(returning: Array(labels))
}
let handler = VNImageRequestHandler(cgImage: image, options: [:])
do {
try handler.perform([request])
} catch {
continuation.resume(throwing: VisionAnalysisError.analysisFailure(error.localizedDescription))
}
}
}
}
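
A small, assumed usage sketch (not in the diff) showing how the analysis output becomes the prompt context that the language-model service prepends to the user prompt:

import Foundation

// Illustrative only: run OCR + classification on an image file and print the
// [Image Analysis] context block produced by formatAnalysesAsPromptContext.
func demoVision(path: String) async throws {
    let data = try Data(contentsOf: URL(fileURLWithPath: path))
    let vision = VisionAnalysisService()
    let result = try await vision.analyze(imageData: data, options: .all)
    let context = await vision.formatAnalysesAsPromptContext(
        analyses: [(result: result, filename: (path as NSString).lastPathComponent)]
    )
    print(context)
}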


@ -37,7 +37,21 @@ struct AppleIntelligenceServer: AsyncParsableCommand {
throw ExitCode.failure
}
let provider = AppleIntelligenceProvider(service: service, apiKey: config.apiKey)
// Initialize speech services
print("Initializing Text-to-Speech service...")
let ttsService = TextToSpeechService()
print("Initializing Speech-to-Text service...")
let sttService = await SpeechToTextService()
let sttStatus = await sttService.getStatus()
print("Speech-to-Text status: \(sttStatus)")
let provider = AppleIntelligenceProvider(
service: service,
ttsService: ttsService,
sttService: sttService,
apiKey: config.apiKey
)
let transport = HTTP2ServerTransport.Posix(
address: .ipv4(host: bindHost, port: bindPort),
@ -52,7 +66,15 @@ struct AppleIntelligenceServer: AsyncParsableCommand {
print("API key authentication is enabled")
}
print("Server is ready to accept connections")
print("Health check: grpcurl -plaintext \(bindHost):\(bindPort) appleintelligence.AppleIntelligence/Health")
print("")
print("Available services:")
print(" - Complete/StreamComplete: Text generation with Apple Intelligence")
print(" - TextToSpeech: Convert text to spoken audio")
print(" - ListVoices: List available TTS voices")
print(" - Transcribe: Convert audio file to text")
print(" - StreamTranscribe: Real-time speech-to-text")
print("")
print("Health check: grpcurl -plaintext \(bindHost):\(bindPort) appleintelligence.AppleIntelligenceService/Health")
print("Press Ctrl+C to stop the server")
try await server.serve()