Add Text-to-Speech and Speech-to-Text features

- Add TTS service using AVSpeechSynthesizer for voice output
- Add STT service using SpeechAnalyzer (macOS 26) for transcription
- Add voice input (microphone) button in chat with recording level indicator
- Add speak button on assistant messages for TTS playback
- Add language toggle (EN-CA/FR-CA) for bilingual speech recognition
- Fix Swift 6 strict concurrency issues in audio callbacks
- Update proto schema with TTS/STT message types and RPCs
- Update gRPC provider with speech service endpoints

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Mathias Beaulieu-Duncan
2025-12-31 02:57:30 -05:00
parent 638656e7ca
commit b754945923
10 changed files with 3151 additions and 8 deletions
@@ -23,7 +23,11 @@ struct ChatView: View {
ScrollView {
LazyVStack(spacing: 12) {
ForEach(viewModel.messages) { message in
MessageBubble(message: message)
MessageBubble(
message: message,
isSpeaking: viewModel.speakingMessageId == message.id,
onSpeak: { viewModel.speakMessage(message) }
)
.id(message.id)
}
}
@@ -286,6 +290,45 @@ struct ChatView: View {
.buttonStyle(.plain)
.help("Paste image from clipboard")
// Language toggle for speech recognition
Button {
// Toggle between en-CA and fr-CA (any value other than en-CA switches to en-CA)
let newLang = viewModel.detectedLanguage == "en-CA" ? "fr-CA" : "en-CA"
viewModel.switchLanguage(to: newLang)
} label: {
Text(viewModel.detectedLanguage == "fr-CA" ? "FR" : "EN")
.font(.caption.bold())
.foregroundStyle(.secondary)
.frame(width: 24, height: 24)
.background(
RoundedRectangle(cornerRadius: 4)
.fill(Color.secondary.opacity(0.1))
)
}
.buttonStyle(.plain)
.help("Speech language: \(viewModel.detectedLanguage) (click to toggle)")
// Microphone button for voice input
Button {
viewModel.toggleRecording()
} label: {
ZStack {
if viewModel.isRecording {
// Recording indicator: translucent red halo whose diameter is 28 + 10 × recordingLevel
Circle()
.fill(Color.red.opacity(0.3))
.frame(width: 28 + CGFloat(viewModel.recordingLevel) * 10,
height: 28 + CGFloat(viewModel.recordingLevel) * 10)
.animation(.easeInOut(duration: 0.1), value: viewModel.recordingLevel)
}
Image(systemName: viewModel.isRecording ? "mic.fill" : "mic")
.font(.title3)
.foregroundStyle(viewModel.isRecording ? .red : .secondary)
}
}
.buttonStyle(.plain)
.help(viewModel.isRecording ? "Stop recording" : "Voice input")
TextField("Message...", text: $viewModel.inputText, axis: .vertical)
.textFieldStyle(.plain)
.lineLimit(1...5)
@@ -386,6 +429,8 @@ struct RecentImageThumbnail: View {
struct MessageBubble: View {
let message: ChatMessage
var isSpeaking: Bool = false
var onSpeak: (() -> Void)? = nil
@State private var showCopied = false
var body: some View {
@@ -419,10 +464,23 @@ struct MessageBubble: View {
}
}
// Copy button for assistant messages
// Action buttons for assistant messages
if message.role == .assistant && !message.content.isEmpty && !message.isStreaming {
HStack {
Spacer()
HStack(spacing: 12) {
// Speaker button for TTS
Button {
onSpeak?()
} label: {
HStack(spacing: 4) {
Image(systemName: isSpeaking ? "stop.fill" : "speaker.wave.2")
Text(isSpeaking ? "Stop" : "Speak")
}
.font(.caption)
.foregroundStyle(isSpeaking ? .red : .secondary)
}
.buttonStyle(.plain)
// Copy button
Button {
NSPasteboard.general.clearContents()
NSPasteboard.general.setString(message.content, forType: .string)
@@ -439,6 +497,8 @@ struct MessageBubble: View {
.foregroundStyle(.secondary)
}
.buttonStyle(.plain)
Spacer()
}
.padding(.top, 2)
}