svrnty-hermes-webui-plugin/routes/transcribe.py
Svrnty 37123f570b
All checks were successful
plugin-tests / test (push) Successful in 8s
feat(plugin): STT migration via audio_attachment_processor hook (L1-L6)
Closes Phase 2.A. STT now lives entirely in the plugin via the new public-API
method `api.register_audio_attachment_processor` added to the loader hook
(Rule 1 — extended API, no forced-internal). The fork patch stays minimal
(streaming.py gains a small loop that calls registered processors; loader
adds the 1 new method).

Plugin additions:
  routes/transcribe.py            POST /api/transcribe + audio_attachment_processor
                                  - _external_stt_transcribe: multipart POST to STT endpoint
                                  - _handle_transcribe: one-shot transcription route
                                  - _transcribe_audio_attachments: voice-message processor
                                  - _parse_multipart_file: stdlib email-based multipart
                                    (Python 3.13 dropped cgi per PEP 594)
  tests/unit/test_transcribe.py   8 tests (register, processor, route, multipart parser)
  tests/evals/test_features.py    + 1 eval (audio processor signature contract)

Config (read at call time, never persisted):
  HERMES_WEBUI_STT_URL  external STT endpoint (OpenAI or WhisperX shape)
  HERMES_WEBUI_STT_KEY  optional bearer token

CONNECTION-MAP regenerated: 9 public-API · 0 forced-internal · 1 frontend.
20/20 tests PASS.

Loader API extended in hermes-webui (next commit there) — 7th method:
register_audio_attachment_processor. Streaming.py gets a small loop that
calls registered processors before _build_native_multimodal_message.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 10:14:29 -04:00

188 lines
7.4 KiB
Python

"""POST /api/transcribe + voice-message audio processor.
Migrated from hermes-webui fork commit 014b9eef (now reverted) per Phase 2.1
of the SVRNTY-HERMES Plugin Protocol. Uses the loader's new public API method
`api.register_audio_attachment_processor` so streaming.py can pull transcripts
of voice-message attachments into the agent-visible text WITHOUT any further
fork patch.
Configuration (read at call time, never persisted):
HERMES_WEBUI_STT_URL external STT endpoint (OpenAI-shape or WhisperX)
HERMES_WEBUI_STT_KEY optional bearer token
Endpoints + processors:
POST /api/transcribe direct one-shot transcription
audio_attachment_processor called by streaming.py before agent receives msg
Public API surface used: register_route, register_audio_attachment_processor, logger.
No forced internal dependencies.
"""
import email
import email.parser
import email.policy
import io
import json
import mimetypes
import os
import re
import tempfile
import urllib.request
import uuid
# File extensions that classify an attachment as audio.
# `_transcribe_audio_attachments` compares the lower-cased extension of the
# attachment name against this tuple, so every entry is lower-case and includes
# the leading dot.
_VOICE_MSG_AUDIO_EXTS = ('.m4a', '.aac', '.oga', '.opus', '.wav', '.mp3', '.flac', '.ogg', '.webm')
def register(api):
    """Plugin entry point: hook the transcription features into the loader.

    Registers `_handle_transcribe` for ``POST /api/transcribe`` and
    `_transcribe_audio_attachments` as an audio-attachment processor, then
    logs a confirmation line through the logger obtained from ``api``.

    Args:
        api: loader API object; this function calls its ``logger``,
            ``register_route`` and ``register_audio_attachment_processor``
            methods.
    """
    logger = api.logger("svrnty.routes.transcribe")

    # Route first, processor second — same order as before.
    api.register_route("/api/transcribe", "POST", _handle_transcribe)
    api.register_audio_attachment_processor(_transcribe_audio_attachments)

    logger.info("transcribe endpoint + audio processor registered")
def _external_stt_transcribe(audio_path: str, url: str, api_key: str) -> str:
    """POST audio to an external STT endpoint and return the transcript text.

    The audio file is sent as the multipart field ``file`` together with a
    ``model`` field fixed to ``whisper-1``. Two response shapes are understood:
    OpenAI-shaped servers (top-level ``text``) and WhisperX-style servers
    (``segments[].text``). Stdlib only.

    Args:
        audio_path: path of the audio file to upload; read fully into memory.
        url: STT endpoint URL.
        api_key: bearer token; when empty no ``Authorization`` header is sent.

    Returns:
        The stripped transcript, or ``''`` when the response carries no text.

    Raises:
        OSError: the audio file cannot be read.
        urllib.error.URLError: the request fails (includes HTTP error statuses).
        ValueError: the response is not valid JSON or is not a JSON object.
    """
    boundary = '----webui' + uuid.uuid4().hex
    fname = os.path.basename(audio_path) or 'audio.webm'
    ctype = mimetypes.guess_type(fname)[0] or 'application/octet-stream'

    # The filename lands inside a quoted header parameter. Percent-escape the
    # characters that would terminate the quote or the header line (the same
    # escaping browsers apply to multipart/form-data file names); otherwise an
    # attachment name containing `"` or a line break corrupts the request.
    safe_fname = (
        fname.replace('"', '%22').replace('\r', '%0D').replace('\n', '%0A')
    )

    with open(audio_path, 'rb') as f:
        audio = f.read()

    body = b''.join([
        ('--' + boundary + '\r\n'
         'Content-Disposition: form-data; name="file"; filename="' + safe_fname + '"\r\n'
         'Content-Type: ' + ctype + '\r\n\r\n').encode(),
        audio,
        ('\r\n--' + boundary + '\r\n'
         'Content-Disposition: form-data; name="model"\r\n\r\nwhisper-1').encode(),
        ('\r\n--' + boundary + '--\r\n').encode(),
    ])

    headers = {'Content-Type': 'multipart/form-data; boundary=' + boundary}
    if api_key:
        headers['Authorization'] = 'Bearer ' + api_key

    req = urllib.request.Request(url, data=body, headers=headers)
    with urllib.request.urlopen(req, timeout=300) as resp:
        data = json.loads(resp.read())

    # Fail with a readable message instead of an AttributeError on `.get`.
    if not isinstance(data, dict):
        raise ValueError(
            'unexpected STT response: expected a JSON object, got '
            + type(data).__name__
        )

    text = str(data.get('text') or '').strip()
    if text:
        return text

    # WhisperX shape: join the per-segment texts. Malformed entries and empty
    # segments are skipped so they cannot crash the call or leave "None" /
    # double spaces in the transcript.
    segs = data.get('segments')
    if not isinstance(segs, list):
        return ''
    pieces = (
        str(seg.get('text') or '').strip()
        for seg in segs
        if isinstance(seg, dict)
    )
    return ' '.join(piece for piece in pieces if piece)
def _transcribe_audio_attachments(attachments) -> str:
    """Audio-attachment processor — registered via the loader.

    Scans ``attachments`` for audio files, transcribes each one through the
    STT endpoint configured in ``HERMES_WEBUI_STT_URL`` (optionally
    authenticated with ``HERMES_WEBUI_STT_KEY``), and returns one text block
    meant to be prepended to the agent-visible message.

    An attachment counts as audio when its base name starts with
    ``voice-message``, its ``mime`` starts with ``audio/``, or its extension is
    listed in ``_VOICE_MSG_AUDIO_EXTS``. Non-dict entries and entries without a
    ``path`` are ignored.

    Transcription is best-effort: a failure for one attachment is reported on
    stdout (including the exception, so the cause can be diagnosed) and the
    remaining attachments are still processed.

    Args:
        attachments: iterable of attachment dicts; the keys read here are
            ``path``, ``mime`` and ``name``. May be ``None`` or empty.

    Returns:
        ``'[Voice message transcript]\\n'`` followed by the transcripts joined
        with blank lines, or ``''`` when STT is not configured or nothing was
        transcribed.
    """
    stt_url = os.environ.get('HERMES_WEBUI_STT_URL', '').strip()
    if not stt_url or not attachments:
        return ''
    stt_key = os.environ.get('HERMES_WEBUI_STT_KEY', '').strip()

    transcripts = []
    for att in attachments:
        if not isinstance(att, dict):
            continue
        path = str(att.get('path') or '')
        if not path:
            # Nothing on disk to upload.
            continue
        mime = str(att.get('mime') or '').lower()
        name = str(att.get('name') or '') or path
        is_audio = (
            os.path.basename(name).startswith('voice-message')
            or mime.startswith('audio/')
            or os.path.splitext(name)[1].lower() in _VOICE_MSG_AUDIO_EXTS
        )
        if not is_audio:
            continue
        try:
            text = _external_stt_transcribe(path, stt_url, stt_key)
        except Exception as exc:
            # Deliberately broad: a failed transcription must never block the
            # message. Include the exception so the failure is diagnosable.
            print(
                f'[svrnty] voice-message transcription failed for {name}: {exc!r}',
                flush=True,
            )
            continue
        if text:
            transcripts.append(text)

    if not transcripts:
        return ''
    return '[Voice message transcript]\n' + '\n\n'.join(transcripts)
def _handle_transcribe(handler, parsed):
    """POST /api/transcribe — direct one-shot transcription.

    Reads a multipart form with field `file` (the recorded audio blob), writes
    it to a temp file, sends it to the configured STT endpoint, and responds
    with `{"ok": true, "transcript": "..."}`. The temp file is always removed.

    Error responses (all `{"ok": false, "error": ...}`):
        503 — `HERMES_WEBUI_STT_URL` is not configured.
        400 — request is not multipart, `Content-Length` is missing, zero,
              negative or not a number, or the `file` field is absent.
        500 — writing the temp file or calling the STT endpoint failed.

    Args:
        handler: request handler object; `headers` and `rfile` are read here
            and the response is written via `_send_json`.
        parsed: unused by this handler (presumably the parsed request URL —
            confirm against the loader's route-callback signature).

    Returns:
        The value returned by `_send_json` (True) once a response is written.
    """
    stt_url = os.environ.get('HERMES_WEBUI_STT_URL', '').strip()
    if not stt_url:
        return _send_json(handler, {'ok': False, 'error': 'HERMES_WEBUI_STT_URL not configured'}, 503)

    ctype = handler.headers.get('Content-Type', '')
    if 'multipart' not in ctype.lower():
        return _send_json(handler, {'ok': False, 'error': 'multipart/form-data required'}, 400)

    # A client-controlled header: a non-numeric value must yield a 400 rather
    # than an unhandled ValueError, and a negative value must never reach
    # rfile.read(), where it would mean "read until EOF" and could block.
    try:
        length = int(handler.headers.get('Content-Length', '0') or 0)
    except (TypeError, ValueError):
        return _send_json(handler, {'ok': False, 'error': 'invalid Content-Length'}, 400)
    if length < 0:
        return _send_json(handler, {'ok': False, 'error': 'invalid Content-Length'}, 400)
    if not length:
        return _send_json(handler, {'ok': False, 'error': 'empty body'}, 400)

    body = handler.rfile.read(length)
    file_bytes, fname = _parse_multipart_file(body, ctype, field_name='file')
    if file_bytes is None:
        return _send_json(handler, {'ok': False, 'error': "missing 'file' field"}, 400)

    fname = fname or 'audio.webm'
    # Keep the uploaded extension on the temp file name; the STT client uses
    # the file name to guess the MIME type of the upload.
    suffix = os.path.splitext(fname)[1] or '.webm'
    temp_path = None
    try:
        # delete=False: the file is closed before the STT call re-opens it by
        # path, and is removed explicitly in the `finally` below.
        with tempfile.NamedTemporaryFile(prefix='svrnty-stt-', suffix=suffix, delete=False) as tmp:
            temp_path = tmp.name
            tmp.write(file_bytes)
        transcript = _external_stt_transcribe(
            temp_path, stt_url, os.environ.get('HERMES_WEBUI_STT_KEY', '').strip())
        return _send_json(handler, {'ok': True, 'transcript': transcript}, 200)
    except Exception as e:
        # Route boundary: report any failure as a JSON 500.
        return _send_json(handler, {'ok': False, 'error': str(e)}, 500)
    finally:
        if temp_path:
            try:
                os.remove(temp_path)
            except OSError:
                # Already gone or not removable; nothing useful to do here.
                pass
def _parse_multipart_file(body: bytes, content_type: str, field_name: str = 'file'):
    """Parse a multipart body and return (file_bytes, filename) for the named field.

    Stdlib only. cgi.FieldStorage was removed in Python 3.13 (PEP 594), so we
    parse via the email module which is the documented replacement.

    The field name and filename are read through the email API's parameter
    parsing rather than a regex over the raw header. A regex for `name="..."`
    also matches the tail of `filename="..."`, so a part that lists `filename`
    before `name` would be missed.

    Args:
        body: raw request body.
        content_type: value of the request's Content-Type header (carries the
            multipart boundary).
        field_name: form field to extract.

    Returns:
        `(payload_bytes, filename)` for the first part whose Content-Disposition
        `name` equals `field_name`; `filename` is None when the part has none.
        `(None, None)` when the body is not multipart or the field is absent.
    """
    # Construct a fake email message so email.parser handles the multipart split.
    full = b'Content-Type: ' + content_type.encode() + b'\r\n\r\n' + body
    msg = email.parser.BytesParser(policy=email.policy.default).parsebytes(full)
    if not msg.is_multipart():
        return None, None

    for part in msg.iter_parts():
        # get_param returns None when the header or the parameter is missing.
        if part.get_param('name', header='content-disposition') != field_name:
            continue
        # Normalise an empty filename ("") to None.
        filename = part.get_filename() or None
        return part.get_payload(decode=True), filename
    return None, None
def _send_json(handler, payload: dict, status: int) -> bool:
    """Write `payload` as a complete JSON HTTP response on `handler`.

    Sends the status line, the Content-Type / Content-Length / Cache-Control
    headers, and the UTF-8 encoded JSON body.

    Args:
        handler: request handler exposing `send_response`, `send_header`,
            `end_headers` and a writable `wfile`.
        payload: JSON-serialisable response object.
        status: HTTP status code.

    Returns:
        Always True, so route functions can `return _send_json(...)`.
    """
    encoded = json.dumps(payload).encode('utf-8')
    response_headers = (
        ('Content-Type', 'application/json; charset=utf-8'),
        ('Content-Length', str(len(encoded))),
        # Responses must not be cached by the client.
        ('Cache-Control', 'no-store'),
    )

    handler.send_response(status)
    for header_name, header_value in response_headers:
        handler.send_header(header_name, header_value)
    handler.end_headers()
    handler.wfile.write(encoded)
    return True