Fix STT streaming to receive audio from gRPC client
- Fix streaming STT to accept audio chunks from gRPC stream instead of local microphone
- Add proper PCM audio buffer conversion for 16-bit, 16kHz, mono audio
- Add StreamingResultHandler for safe callback handling
- Properly manage streaming session state and cleanup

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in: parent 7655f1f0b8, commit f7b8fbfa36
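For context, the new actor surface is meant to be driven from a gRPC stream handler roughly as sketched below. This is only a sketch: `handleRecognizeStream`, the `AsyncStream<Data>` of request payloads, and the `emit` callback are assumptions; only `SpeechToTextService`, `streamTranscribe`, `feedAudioChunk`, and `endStreamingSession` come from this change.

```swift
import Foundation

// Hypothetical gRPC-side driver. "audioChunks" stands in for the request
// stream's 16-bit, 16 kHz, mono PCM payloads; "emit" stands in for writing
// updates back to the response stream. Both names are assumptions.
func handleRecognizeStream(
    audioChunks: AsyncStream<Data>,
    emit: @escaping (StreamingTranscriptionUpdate) async -> Void
) async throws {
    let stt = await SpeechToTextService()
    let updates = await stt.streamTranscribe(config: .default)

    // Feed incoming chunks to the recognizer while results are consumed below.
    let feeder = Task {
        for await chunk in audioChunks {
            try await stt.feedAudioChunk(chunk)
        }
        // Client closed its side: signal endAudio() and tear down session state.
        await stt.endStreamingSession()
    }

    // Forward partial and final transcription updates to the client.
    for try await update in updates {
        await emit(update)
    }
    feeder.cancel()
}
```

Note that `streamTranscribe` only activates the session from an inner `Task`, so a real handler may need to tolerate or retry an early `feedAudioChunk` call that throws "No active streaming session".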
@@ -84,6 +84,10 @@ public actor SpeechToTextService {
+    /// Streaming session state
     private var isStreamingActive: Bool = false
+    private var streamingRequest: SFSpeechAudioBufferRecognitionRequest?
+    private var streamingRecognizer: SFSpeechRecognizer?
+    private var streamingTask: SFSpeechRecognitionTask?
     private var streamingContinuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation?
 
     public init() async {
         await checkAvailability()
@@ -108,19 +112,19 @@ public actor SpeechToTextService {
         return try await transcribeWithSFSpeechRecognizer(url: tempURL, config: config)
     }
 
-    /// Stream transcription from audio chunks
+    /// Stream transcription from audio chunks sent via gRPC
    public func streamTranscribe(
        config: TranscriptionConfig = .default
    ) -> AsyncThrowingStream<StreamingTranscriptionUpdate, Error> {
        AsyncThrowingStream { continuation in
            Task {
-                guard self.isAvailable else {
+                guard await self.isAvailable else {
                     continuation.finish(throwing: SpeechToTextError.notAvailable)
                     return
                 }
 
                 do {
-                    try await self.startStreamingWithSFSpeechRecognizer(config: config, continuation: continuation)
+                    try await self.startStreamingSession(config: config, continuation: continuation)
                 } catch {
                     continuation.finish(throwing: error)
                 }
@@ -128,17 +132,46 @@ public actor SpeechToTextService {
             }
         }
     }
 
-    /// Feed audio chunk for streaming transcription
+    /// Feed audio chunk for streaming transcription (PCM audio data)
     public func feedAudioChunk(_ chunk: Data) async throws {
-        guard isStreamingActive else {
+        guard isStreamingActive, let request = streamingRequest else {
             throw SpeechToTextError.transcriptionFailed("No active streaming session")
         }
-        // Audio chunk handling implemented in streaming methods
+
+        // Convert raw PCM data to audio buffer
+        // Assuming 16-bit PCM, mono, 16kHz (common format for speech)
+        let audioFormat = AVAudioFormat(
+            commonFormat: .pcmFormatInt16,
+            sampleRate: 16000,
+            channels: 1,
+            interleaved: true
+        )!
+
+        let frameCount = UInt32(chunk.count / 2) // 2 bytes per Int16 sample
+        guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameCount) else {
+            throw SpeechToTextError.audioProcessingFailed("Failed to create audio buffer")
+        }
+
+        buffer.frameLength = frameCount
+
+        // Copy data into buffer
+        chunk.withUnsafeBytes { rawPtr in
+            if let int16Ptr = rawPtr.baseAddress?.assumingMemoryBound(to: Int16.self) {
+                buffer.int16ChannelData?[0].update(from: int16Ptr, count: Int(frameCount))
+            }
+        }
+
+        request.append(buffer)
     }
 
     /// End streaming session
     public func endStreamingSession() async {
+        streamingRequest?.endAudio()
         isStreamingActive = false
+        streamingRequest = nil
+        streamingTask = nil
+        streamingRecognizer = nil
         streamingContinuation = nil
     }
 
     /// Get status information
@@ -258,8 +291,8 @@ public actor SpeechToTextService {
         }
     }
 
-    /// Start streaming with SFSpeechRecognizer
-    private func startStreamingWithSFSpeechRecognizer(
+    /// Start streaming session for gRPC audio chunks
+    private func startStreamingSession(
         config: TranscriptionConfig,
         continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
     ) async throws {
@@ -272,25 +305,53 @@ public actor SpeechToTextService {
             throw SpeechToTextError.notAvailable
         }
 
+        // Set up streaming state
         isStreamingActive = true
+        streamingRecognizer = recognizer
+        streamingContinuation = continuation
 
-        let audioEngine = AVAudioEngine()
         let request = SFSpeechAudioBufferRecognitionRequest()
         request.shouldReportPartialResults = true
+        streamingRequest = request
 
-        let inputNode = audioEngine.inputNode
-        let recordingFormat = inputNode.outputFormat(forBus: 0)
+        // Create wrapper to handle results safely
+        let service = self
+        let resultHandler = StreamingResultHandler(
+            config: config,
+            continuation: continuation,
+            onFinish: {
+                Task { await service.endStreamingSession() }
+            }
+        )
 
-        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
-            request.append(buffer)
+        streamingTask = recognizer.recognitionTask(with: request) { result, error in
+            resultHandler.handleResult(result: result, error: error)
         }
+    }
+}
+
+// MARK: - Streaming Result Handler
+
+/// Wrapper to safely handle streaming recognition results
+private final class StreamingResultHandler: @unchecked Sendable {
+    private let config: TranscriptionConfig
+    private let continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation
+    private let onFinish: () -> Void
+
+    init(
+        config: TranscriptionConfig,
+        continuation: AsyncThrowingStream<StreamingTranscriptionUpdate, Error>.Continuation,
+        onFinish: @escaping () -> Void
+    ) {
+        self.config = config
+        self.continuation = continuation
+        self.onFinish = onFinish
+    }
 
-        audioEngine.prepare()
-        try audioEngine.start()
-
-        recognizer.recognitionTask(with: request) { result, error in
+    func handleResult(result: SFSpeechRecognitionResult?, error: Error?) {
         if let error = error {
             continuation.finish(throwing: SpeechToTextError.transcriptionFailed(error.localizedDescription))
+            onFinish()
             return
         }
@@ -319,19 +380,8 @@ public actor SpeechToTextService {
         continuation.yield(update)
 
         if result.isFinal {
-            audioEngine.stop()
-            inputNode.removeTap(onBus: 0)
             continuation.finish()
+            onFinish()
         }
     }
-
-        // Wait for streaming to end
-        while isStreamingActive {
-            try await Task.sleep(for: .milliseconds(100))
-        }
-
-        audioEngine.stop()
-        inputNode.removeTap(onBus: 0)
-        request.endAudio()
-    }
 }
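One practical note on the format assumption in `feedAudioChunk(_:)`: the receiving side builds its `AVAudioPCMBuffer` as 16-bit, 16 kHz, mono, interleaved PCM, so a sender capturing Float32 audio at 44.1/48 kHz has to downconvert before putting bytes on the wire. A minimal sender-side sketch, assuming `AVAudioConverter` and a helper name that is not part of this commit:

```swift
import AVFoundation

/// Converts a captured buffer (e.g. Float32 @ 48 kHz from an input tap) into the
/// 16-bit, 16 kHz, mono PCM bytes that feedAudioChunk(_:) expects.
/// The helper name and the per-call converter are illustrative choices only.
func pcm16kMonoData(from buffer: AVAudioPCMBuffer) -> Data? {
    guard let targetFormat = AVAudioFormat(
        commonFormat: .pcmFormatInt16,
        sampleRate: 16_000,
        channels: 1,
        interleaved: true
    ), let converter = AVAudioConverter(from: buffer.format, to: targetFormat) else {
        return nil
    }

    // Size the output for the sample-rate ratio, with a little headroom.
    let ratio = targetFormat.sampleRate / buffer.format.sampleRate
    let capacity = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 16
    guard let converted = AVAudioPCMBuffer(pcmFormat: targetFormat, frameCapacity: capacity) else {
        return nil
    }

    // Hand the converter exactly one input buffer, then report "no more data".
    var consumed = false
    let status = converter.convert(to: converted, error: nil) { _, outStatus in
        if consumed {
            outStatus.pointee = .noDataNow
            return nil
        }
        consumed = true
        outStatus.pointee = .haveData
        return buffer
    }

    guard status != .error, let samples = converted.int16ChannelData else { return nil }
    return Data(bytes: samples[0], count: Int(converted.frameLength) * MemoryLayout<Int16>.size)
}
```

Reusing one converter per connection (rather than per chunk) avoids resampler discontinuities between chunks; per-call construction just keeps the sketch short.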