Fix STT streaming to receive audio from gRPC client

- Fix streaming STT to accept audio chunks from the gRPC stream instead of the local microphone
- Add PCM-to-AVAudioPCMBuffer conversion for 16-bit, 16 kHz mono audio
- Add StreamingResultHandler for safe callback handling
- Properly manage streaming session state and cleanup

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Mathias Beaulieu-Duncan 2025-12-31 03:40:46 -05:00
parent 7655f1f0b8
commit f7b8fbfa36

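As a rough usage sketch for reviewers (the handler name and `requestStream` are hypothetical and not part of this commit; `streamTranscribe`, `feedAudioChunk`, and `endStreamingSession` are the `SpeechToTextService` API shown in the diff below), a server-side gRPC handler might drive the new streaming path like this:

```swift
import Foundation

// Hypothetical gRPC handler sketch -- not part of this commit.
// Assumes the client streams raw 16-bit / 16 kHz mono PCM chunks as Data.
func handleStreamingRecognize(
    requestStream: AsyncStream<Data>,
    stt: SpeechToTextService
) async throws {
    // Open the recognition session and consume updates concurrently.
    let updates = await stt.streamTranscribe()
    let consumer = Task {
        for try await update in updates {
            // Forward partial/final transcripts to the gRPC client here.
            print(update.partialText, update.isFinal)
        }
    }

    // Feed every incoming PCM chunk into the active session.
    for await chunk in requestStream {
        try await stt.feedAudioChunk(chunk)
    }

    // Client closed its stream: end audio so the recognizer can finalize.
    await stt.endStreamingSession()
    try await consumer.value
}
```

The consumer task mirrors how the server would relay `StreamingTranscriptionUpdate` values back over the gRPC response stream while chunks keep arriving.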

@@ -84,6 +84,10 @@ public actor SpeechToTextService {
/// Streaming session state
private var isStreamingActive: Bool = false
private var streamingRequest: SFSpeechAudioBufferRecognitionRequest?
private var streamingRecognizer: SFSpeechRecognizer?
private var streamingTask: SFSpeechRecognitionTask?
private var streamingContinuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation?
public init() async {
await checkAvailability()
@@ -108,19 +112,19 @@ public actor SpeechToTextService {
return try await transcribeWithSFSpeechRecognizer(url: tempURL, config: config)
}
/// Stream transcription from audio chunks
/// Stream transcription from audio chunks sent via gRPC
public func streamTranscribe(
config: TranscriptionConfig = .default
) -> AsyncThrowingStream<StreamingTranscriptionUpdate, Error> {
AsyncThrowingStream { continuation in
Task {
guard self.isAvailable else {
guard await self.isAvailable else {
continuation.finish(throwing: SpeechToTextError.notAvailable)
return
}
do {
try await self.startStreamingWithSFSpeechRecognizer(config: config, continuation: continuation)
try await self.startStreamingSession(config: config, continuation: continuation)
} catch {
continuation.finish(throwing: error)
}
@@ -128,17 +132,46 @@ public actor SpeechToTextService {
}
}
/// Feed audio chunk for streaming transcription
/// Feed audio chunk for streaming transcription (PCM audio data)
public func feedAudioChunk(_ chunk: Data) async throws {
guard isStreamingActive else {
guard isStreamingActive, let request = streamingRequest else {
throw SpeechToTextError.transcriptionFailed("No active streaming session")
}
// Audio chunk handling implemented in streaming methods
// Convert raw PCM data to audio buffer
// Assuming 16-bit PCM, mono, 16kHz (common format for speech)
let audioFormat = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: 16000,
channels: 1,
interleaved: true
)!
let frameCount = UInt32(chunk.count / 2) // 2 bytes per Int16 sample
guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameCount) else {
throw SpeechToTextError.audioProcessingFailed("Failed to create audio buffer")
}
buffer.frameLength = frameCount
// Copy data into buffer
chunk.withUnsafeBytes { rawPtr in
if let int16Ptr = rawPtr.baseAddress?.assumingMemoryBound(to: Int16.self) {
buffer.int16ChannelData?[0].update(from: int16Ptr, count: Int(frameCount))
}
}
request.append(buffer)
}
/// End streaming session
public func endStreamingSession() async {
streamingRequest?.endAudio()
isStreamingActive = false
streamingRequest = nil
streamingTask = nil
streamingRecognizer = nil
streamingContinuation = nil
}
/// Get status information
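Reviewer note on the hunk above: `feedAudioChunk` interprets the raw bytes as native-endian Int16 samples, mono, 16 kHz, so the sending side has to produce chunks in exactly that layout. A minimal sender-side sketch, assuming the client starts from Float32 samples already resampled to 16 kHz (the helper name is illustrative, not part of this diff):

```swift
import Foundation

// Sketch: pack Float32 samples in [-1, 1] into the 16-bit PCM layout
// that feedAudioChunk expects (native-endian Int16, mono, 16 kHz).
func makePCM16Chunk(from samples: [Float]) -> Data {
    var data = Data(capacity: samples.count * MemoryLayout<Int16>.size)
    for sample in samples {
        // Clamp and scale to the Int16 range.
        let clamped = max(-1.0, min(1.0, sample))
        let value = Int16(clamped * Float(Int16.max))
        withUnsafeBytes(of: value) { data.append(contentsOf: $0) }
    }
    return data
}
```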
@@ -258,8 +291,8 @@ public actor SpeechToTextService {
}
}
/// Start streaming with SFSpeechRecognizer
private func startStreamingWithSFSpeechRecognizer(
/// Start streaming session for gRPC audio chunks
private func startStreamingSession(
config: TranscriptionConfig,
continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
) async throws {
@@ -272,66 +305,83 @@ public actor SpeechToTextService {
throw SpeechToTextError.notAvailable
}
// Set up streaming state
isStreamingActive = true
streamingRecognizer = recognizer
streamingContinuation = continuation
let audioEngine = AVAudioEngine()
let request = SFSpeechAudioBufferRecognitionRequest()
request.shouldReportPartialResults = true
streamingRequest = request
let inputNode = audioEngine.inputNode
let recordingFormat = inputNode.outputFormat(forBus: 0)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
request.append(buffer)
}
audioEngine.prepare()
try audioEngine.start()
recognizer.recognitionTask(with: request) { result, error in
if let error = error {
continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
return
// Create wrapper to handle results safely
let service = self
let resultHandler = StreamingResultHandler(
config: config,
continuation: continuation,
onFinish: {
Task { await service.endStreamingSession() }
}
)
guard let result = result else { return }
let transcription = result.bestTranscription
var segments: [TranscriptionSegmentResult] = []
if config.enableTimestamps {
for segment in transcription.segments {
segments.append(TranscriptionSegmentResult(
text: segment.substring,
startTime: Float(segment.timestamp),
endTime: Float(segment.timestamp + segment.duration),
confidence: segment.confidence
))
}
}
let update = StreamingTranscriptionUpdate(
partialText: transcription.formattedString,
isFinal: result.isFinal,
finalText: result.isFinal ? transcription.formattedString : nil,
segments: segments
)
continuation.yield(update)
if result.isFinal {
audioEngine.stop()
inputNode.removeTap(onBus: 0)
continuation.finish()
}
streamingTask = recognizer.recognitionTask(with: request) { result, error in
resultHandler.handleResult(result: result, error: error)
}
}
}
// MARK: - Streaming Result Handler
/// Wrapper to safely handle streaming recognition results
private final class StreamingResultHandler: @unchecked Sendable {
private let config: TranscriptionConfig
private let continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
private let onFinish: () -> Void
init(
config: TranscriptionConfig,
continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation,
onFinish: @escaping () -> Void
) {
self.config = config
self.continuation = continuation
self.onFinish = onFinish
}
func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
if let error = error {
continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
onFinish()
return
}
guard let result = result else { return }
let transcription = result.bestTranscription
var segments: [TranscriptionSegmentResult] = []
if config.enableTimestamps {
for segment in transcription.segments {
segments.append(TranscriptionSegmentResult(
text: segment.substring,
startTime: Float(segment.timestamp),
endTime: Float(segment.timestamp + segment.duration),
confidence: segment.confidence
))
}
}
let update = StreamingTranscriptionUpdate(
partialText: transcription.formattedString,
isFinal: result.isFinal,
finalText: result.isFinal ? transcription.formattedString : nil,
segments: segments
)
continuation.yield(update)
if result.isFinal {
continuation.finish()
onFinish()
}
// Wait for streaming to end
while isStreamingActive {
try await Task.sleep(for: .milliseconds(100))
}
audioEngine.stop()
inputNode.removeTap(onBus: 0)
request.endAudio()
}
}
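One caller-side note on the new lifecycle: `StreamingResultHandler` finishes the update stream on errors and on the final result, but if the gRPC client disconnects before any final result arrives, the session may stay active until `endStreamingSession()` is called. A defensive sketch of the feed loop (the pumping function is hypothetical, not part of this commit):

```swift
import Foundation

// Sketch: always tear the session down, even if the client drops mid-stream.
func pumpAudio(
    _ chunks: AsyncThrowingStream<Data, Error>,
    into stt: SpeechToTextService
) async {
    do {
        for try await chunk in chunks {
            try await stt.feedAudioChunk(chunk)
        }
    } catch {
        // Client disconnected or the feed failed; fall through to cleanup.
    }
    // Ends the audio request so SFSpeechRecognizer can emit a final result
    // and clears the actor's streaming state.
    await stt.endStreamingSession()
}
```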