diff --git a/CONNECTION-MAP.md b/CONNECTION-MAP.md index 00027e4..151c04a 100644 --- a/CONNECTION-MAP.md +++ b/CONNECTION-MAP.md @@ -2,7 +2,7 @@ **Upstream version:** v0.51.117 **Plugin version:** 0.1.0 -**Total dependencies:** 7 (6 public API · 0 forced internal · 1 frontend) +**Total dependencies:** 10 (9 public API · 0 forced internal · 1 frontend) > **Auto-generated by `scripts/ast-connection-map.py`. Do not hand-edit.** > To change a justification, edit the `# CONNECTION:` comment above the @@ -18,6 +18,9 @@ | `plugin.py:34` | `api.register_static` | `api.register_static(STATIC_PREFIX, str(STATIC_DIR))` | | `plugin.py:35` | `api.inject_stylesheet` | `api.inject_stylesheet(f"/plugins/{STATIC_PREFIX}/app.css")` | | `plugin.py:36` | `api.inject_script` | `api.inject_script(f"/plugins/{STATIC_PREFIX}/app.js")` | +| `routes/transcribe.py:37` | `api.logger` | `log = api.logger("svrnty.routes.transcribe")` | +| `routes/transcribe.py:38` | `api.register_route` | `api.register_route("/api/transcribe", "POST", _handle_transcribe)` | +| `routes/transcribe.py:39` | `api.register_audio_attachment_processor` | `api.register_audio_attachment_processor(_transcribe_audio_attachments)` | | `routes/vault_status.py:19` | `api.logger` | `log = api.logger("svrnty.routes.vault_status")` | | `routes/vault_status.py:20` | `api.register_route` | `api.register_route("/api/vault/status", "GET", _handle_vault_status)` | diff --git a/plugin.py b/plugin.py index 9ec0205..0e15475 100644 --- a/plugin.py +++ b/plugin.py @@ -59,6 +59,6 @@ def _phase2_routes(): ImportError is logged + swallowed so the plugin loads cleanly. 
""" return [ - # "transcribe", # P2.A — STT (deferred — needs streaming.py integration refactor) + "transcribe", # P2.A — STT + voice-message audio processor ✓ "vault_status", # P2.B — vault connections status ✓ ] diff --git a/routes/transcribe.py b/routes/transcribe.py index 16079ab..2a6e681 100644 --- a/routes/transcribe.py +++ b/routes/transcribe.py @@ -1,37 +1,187 @@ -"""GET /api/transcribe — STT route — DEFERRED MIGRATION (P2.A). +"""POST /api/transcribe + voice-message audio processor. -The STT feature in the original fork commit 014b9eef touches THREE upstream -modules: +Migrated from hermes-webui fork commit 014b9eef (now reverted) per Phase 2.1 +of the SVRNTY-HERMES Plugin Protocol. Uses the loader's new public API method +`api.register_audio_attachment_processor` so streaming.py can pull transcripts +of voice-message attachments into the agent-visible text WITHOUT any further +fork patch. - 1. api/upload.py — handle_transcribe() + _external_stt_transcribe() - 2. api/streaming.py — _transcribe_audio_attachments() injects transcripts - into the agent-visible message during streaming - 3. static/boot.js — mic button + MediaRecorder fallback (iOS WKWebView) +Configuration (read at call time, never persisted): + HERMES_WEBUI_STT_URL external STT endpoint (OpenAI-shape or WhisperX) + HERMES_WEBUI_STT_KEY optional bearer token -Migration #1 is straightforward (route + helper move cleanly). Migrations #2 -and #3 cross-cut the streaming engine and the bootstrap JS — refactoring them -to live in the plugin requires either: +Endpoints + processors: + POST /api/transcribe direct one-shot transcription + audio_attachment_processor called by streaming.py before agent receives msg - (a) New public-API hooks: api.streaming_hook(name, callback) so the plugin - can register an attachment processor that runs inside the streaming - pipeline. Adds ~50 LOC to the loader + amends Protocol PRD §5.1. - (b) Accept STT as a forced-internal dependency. 
"""POST /api/transcribe route and voice-message audio-attachment processor.

Configuration is read from the environment on every call and never persisted:

    HERMES_WEBUI_STT_URL   external STT endpoint (OpenAI-shape or WhisperX)
    HERMES_WEBUI_STT_KEY   optional bearer token

Registered through the plugin API:

    POST /api/transcribe          direct one-shot transcription
    audio attachment processor    turns audio attachments into a text block

Public API surface used: register_route, register_audio_attachment_processor,
logger. No forced internal dependencies.
"""
import email
import email.parser
import email.policy
import email.utils
import io
import json
import logging
import mimetypes
import os
import re
import tempfile
import urllib.parse
import urllib.request
import uuid

_VOICE_MSG_AUDIO_EXTS = ('.m4a', '.aac', '.oga', '.opus', '.wav', '.mp3', '.flac', '.ogg', '.webm')

# Safety cap on a single upload. The request body is read fully into memory,
# so an unbounded Content-Length would let one client exhaust the process.
_MAX_UPLOAD_BYTES = 64 * 1024 * 1024

# A client-supplied extension is only reused when it is a plain ".ext".
_SAFE_SUFFIX_RE = re.compile(r'\.[A-Za-z0-9]{1,16}')

_LOG = logging.getLogger('svrnty.routes.transcribe')


def register(api):
    """Register the transcribe route and the audio-attachment processor on *api*."""
    log = api.logger("svrnty.routes.transcribe")
    api.register_route("/api/transcribe", "POST", _handle_transcribe)
    api.register_audio_attachment_processor(_transcribe_audio_attachments)
    log.info("transcribe endpoint + audio processor registered")


def _escape_form_value(value: str) -> str:
    """Percent-escape the characters that would terminate a quoted multipart parameter."""
    return value.replace('"', '%22').replace('\r', '%0D').replace('\n', '%0A')


def _extract_transcript(data) -> str:
    """Return the transcript from a decoded STT JSON reply, or '' if none is present.

    Accepts a top-level ``text`` (OpenAI shape) or a list of ``segments`` each
    carrying ``text`` (WhisperX shape). Anything else yields ''.
    """
    if isinstance(data, str):
        return data.strip()
    if not isinstance(data, dict):
        return ''
    text = str(data.get('text') or '').strip()
    if text:
        return text
    segments = data.get('segments') or []
    if not isinstance(segments, list):
        return ''
    pieces = (
        str(seg.get('text') or '').strip()
        for seg in segments
        if isinstance(seg, dict)
    )
    # Empty pieces are dropped so they do not leave doubled spaces behind.
    return ' '.join(piece for piece in pieces if piece)


def _external_stt_transcribe(audio_path: str, url: str, api_key: str) -> str:
    """POST the audio file at *audio_path* to the STT endpoint *url*.

    The request is a multipart form with a ``file`` part and a ``model`` part
    (``whisper-1``). When *api_key* is non-empty it is sent as a bearer token.
    Stdlib only.

    Returns:
        The stripped transcript, or '' when the reply contains none.

    Raises:
        ValueError: *url* is not an http(s) URL.
        OSError: the audio file cannot be read, or the request fails
            (``urllib.error.URLError`` is an ``OSError``).
        json.JSONDecodeError: the reply body is not JSON.
    """
    # urlopen would otherwise also honour schemes such as file:// or ftp://.
    scheme = urllib.parse.urlsplit(url).scheme.lower()
    if scheme not in ('http', 'https'):
        raise ValueError(f'unsupported STT URL scheme: {scheme!r}')

    boundary = '----webui' + uuid.uuid4().hex
    fname = os.path.basename(audio_path) or 'audio.webm'
    ctype = mimetypes.guess_type(fname)[0] or 'application/octet-stream'
    with open(audio_path, 'rb') as f:
        audio = f.read()

    # The file name ends up inside a quoted header parameter; a raw quote or
    # line break in it would corrupt (or inject into) the part headers.
    safe_name = _escape_form_value(fname)
    body = b''.join([
        ('--' + boundary + '\r\n'
         'Content-Disposition: form-data; name="file"; filename="' + safe_name + '"\r\n'
         'Content-Type: ' + ctype + '\r\n\r\n').encode(),
        audio,
        ('\r\n--' + boundary + '\r\n'
         'Content-Disposition: form-data; name="model"\r\n\r\nwhisper-1').encode(),
        ('\r\n--' + boundary + '--\r\n').encode(),
    ])
    headers = {'Content-Type': 'multipart/form-data; boundary=' + boundary}
    if api_key:
        headers['Authorization'] = 'Bearer ' + api_key
    req = urllib.request.Request(url, data=body, headers=headers)
    with urllib.request.urlopen(req, timeout=300) as resp:
        data = json.loads(resp.read())
    return _extract_transcript(data)


def _is_voice_audio(name: str, mime: str) -> bool:
    """Tell whether an attachment should be transcribed, by name prefix, MIME type or extension."""
    return (
        os.path.basename(name).startswith('voice-message')
        or mime.startswith('audio/')
        or os.path.splitext(name)[1].lower() in _VOICE_MSG_AUDIO_EXTS
    )


def _transcribe_audio_attachments(attachments) -> str:
    """Transcribe the audio attachments in *attachments* into one text block.

    Each attachment is expected to be a dict; the keys read are ``path``,
    ``mime`` and ``name``. Non-dict entries, entries without a path and
    non-audio entries are skipped. A failed transcription is logged and
    skipped so one bad file does not lose the others.

    Returns:
        '[Voice message transcript]' followed by the transcripts separated by
        blank lines, or '' when HERMES_WEBUI_STT_URL is unset, there are no
        attachments, or nothing could be transcribed.
    """
    stt_url = os.environ.get('HERMES_WEBUI_STT_URL', '').strip()
    if not stt_url or not attachments:
        return ''
    stt_key = os.environ.get('HERMES_WEBUI_STT_KEY', '').strip()

    transcripts = []
    for att in attachments:
        if not isinstance(att, dict):
            continue
        path = str(att.get('path') or '')
        if not path:
            continue
        mime = str(att.get('mime') or '').lower()
        name = str(att.get('name') or '') or path
        if not _is_voice_audio(name, mime):
            continue
        try:
            text = _external_stt_transcribe(path, stt_url, stt_key)
        except Exception:
            # Best effort by design: the message is still delivered without
            # this transcript, but the cause is kept in the log.
            _LOG.warning('voice-message transcription failed for %s', name, exc_info=True)
            continue
        if text:
            transcripts.append(text)

    if not transcripts:
        return ''
    return '[Voice message transcript]\n' + '\n\n'.join(transcripts)


def _handle_transcribe(handler, parsed):
    """Handle POST /api/transcribe: one-shot transcription of an uploaded blob.

    Reads a multipart form with field ``file``, writes it to a temp file, sends
    it to the configured STT endpoint and answers
    ``{"ok": true, "transcript": "..."}``. *parsed* is accepted for the route
    signature and not used.

    Error replies are ``{"ok": false, "error": "..."}`` with status 503 (STT
    URL not configured), 400 (not multipart, bad or empty Content-Length,
    missing ``file`` field), 413 (body larger than ``_MAX_UPLOAD_BYTES``) or
    500 (transcription failed).

    Returns:
        True, as returned by ``_send_json``.
    """
    stt_url = os.environ.get('HERMES_WEBUI_STT_URL', '').strip()
    if not stt_url:
        return _send_json(handler, {'ok': False, 'error': 'HERMES_WEBUI_STT_URL not configured'}, 503)

    ctype = handler.headers.get('Content-Type', '') or ''
    if 'multipart' not in ctype.lower():
        return _send_json(handler, {'ok': False, 'error': 'multipart/form-data required'}, 400)

    try:
        length = int(handler.headers.get('Content-Length', '0') or 0)
    except (TypeError, ValueError):
        return _send_json(handler, {'ok': False, 'error': 'invalid Content-Length'}, 400)
    # A negative length must never reach rfile.read(): read(-1) means "until EOF".
    if length <= 0:
        return _send_json(handler, {'ok': False, 'error': 'empty body'}, 400)
    if length > _MAX_UPLOAD_BYTES:
        return _send_json(handler, {'ok': False, 'error': 'upload too large'}, 413)

    body = handler.rfile.read(length)
    file_bytes, fname = _parse_multipart_file(body, ctype, field_name='file')
    if file_bytes is None:
        return _send_json(handler, {'ok': False, 'error': "missing 'file' field"}, 400)
    fname = fname or 'audio.webm'

    # The extension comes from the client; anything unusual falls back to .webm
    # so it can neither confuse tempfile nor travel on into the STT request.
    ext = os.path.splitext(fname)[1]
    suffix = ext if _SAFE_SUFFIX_RE.fullmatch(ext) else '.webm'

    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(prefix='svrnty-stt-', suffix=suffix, delete=False) as tmp:
            temp_path = tmp.name
            tmp.write(file_bytes)
        transcript = _external_stt_transcribe(
            temp_path, stt_url, os.environ.get('HERMES_WEBUI_STT_KEY', '').strip())
    except Exception as exc:
        _LOG.warning('transcription request failed', exc_info=True)
        return _send_json(handler, {'ok': False, 'error': str(exc)}, 500)
    finally:
        if temp_path:
            try:
                os.remove(temp_path)
            except OSError:
                # Cleanup is best effort; a leftover temp file must not turn
                # a finished request into a failure.
                pass
    # Sent outside the try block so a failed write to the client is not
    # answered with a second (500) response on the same connection.
    return _send_json(handler, {'ok': True, 'transcript': transcript}, 200)


def _parse_multipart_file(body: bytes, content_type: str, field_name: str = 'file'):
    """Return ``(file_bytes, filename)`` for form field *field_name* in *body*.

    Stdlib only: the body is parsed with the ``email`` package by prefixing it
    with a synthetic Content-Type header built from *content_type*.

    Returns:
        ``(payload, filename)`` for the first part whose Content-Disposition
        ``name`` parameter equals *field_name*; ``filename`` is None when the
        part carries none. ``(None, None)`` when the body is not multipart or
        the field is absent.
    """
    # CR/LF are removed so the header value cannot introduce extra header lines.
    header_value = content_type.replace('\r', ' ').replace('\n', ' ')
    full = b'Content-Type: ' + header_value.encode('utf-8', 'replace') + b'\r\n\r\n' + body
    msg = email.parser.BytesParser(policy=email.policy.default).parsebytes(full)
    if not msg.is_multipart():
        return None, None
    for part in msg.iter_parts():
        # Look the parameter up by key: a regex for name="..." would also match
        # the tail of filename="..." whenever that parameter comes first.
        name = part.get_param('name', header='content-disposition')
        if name is None:
            continue
        if email.utils.collapse_rfc2231_value(name) != field_name:
            continue
        return part.get_payload(decode=True), (part.get_filename() or None)
    return None, None


def _send_json(handler, payload: dict, status: int) -> bool:
    """Write *payload* as a JSON response with HTTP status *status*; returns True.

    Sets Content-Type, Content-Length and ``Cache-Control: no-store``.
    """
    body = json.dumps(payload).encode('utf-8')
    handler.send_response(status)
    handler.send_header('Content-Type', 'application/json; charset=utf-8')
    handler.send_header('Content-Length', str(len(body)))
    handler.send_header('Cache-Control', 'no-store')
    handler.end_headers()
    handler.wfile.write(body)
    return True
integration env. import pytest pytest.skip("hermes-webui fork not adjacent; loader contract eval skipped") api = _PluginAPI() required = {"register_route", "register_static", "inject_script", - "inject_stylesheet", "config_get", "logger"} + "inject_stylesheet", "config_get", "logger", + "register_audio_attachment_processor"} actual = {m for m in dir(api) if not m.startswith("_")} assert required == actual, ( f"public API drift: expected {required}, got {actual}. " @@ -29,6 +29,13 @@ def test_eval_loader_contract_unchanged(): ) +def test_eval_audio_processor_signature_unchanged(): + """The audio_attachment_processor takes attachments → str. Loader hook + plugin agree.""" + from routes import transcribe + out = transcribe._transcribe_audio_attachments([]) + assert isinstance(out, str), f"audio processor must return str, got {type(out).__name__}" + + def test_eval_vault_status_payload_shape(): """Vault status returns {'secrets': [{'name': ...}, ...]} — schema lock.""" import json diff --git a/tests/unit/test_transcribe.py b/tests/unit/test_transcribe.py new file mode 100644 index 0000000..282cd68 --- /dev/null +++ b/tests/unit/test_transcribe.py @@ -0,0 +1,117 @@ +"""Unit tests for routes/transcribe.py (P3.B + L6). + +Cover the route handler shape + the audio_attachment_processor contract. +Network calls to the external STT endpoint are mocked. 
"""Unit tests for routes/transcribe.py.

Exercise the route handler's status codes, the multipart parser and the
audio-attachment processor. The external STT call is always patched out.
"""
import json
import os
from unittest.mock import MagicMock, patch

from routes import transcribe


class _BodySink:
    """Write target that appends every chunk to its owner's ``body_out``."""

    def __init__(self, owner):
        self._owner = owner

    def write(self, chunk):
        self._owner.body_out += chunk


class _FakeHandler:
    """Minimal stand-in for the request handler passed to the route."""

    def __init__(self, body=b"", headers=None):
        self.status = None
        self.headers = headers or {}
        self.body_out = b""
        reader = MagicMock()
        reader.read.return_value = body
        self.rfile = reader

    def send_response(self, code):
        self.status = code

    def send_header(self, k, v):
        pass

    def end_headers(self):
        pass

    @property
    def wfile(self):
        # A fresh sink per access; all of them write into this handler.
        return _BodySink(self)


def test_register_wires_route_and_processor():
    """register() adds exactly one route and one audio processor."""
    fake_api = MagicMock()
    fake_api.logger.return_value = MagicMock()

    transcribe.register(fake_api)

    fake_api.register_route.assert_called_once_with(
        "/api/transcribe", "POST", transcribe._handle_transcribe)
    fake_api.register_audio_attachment_processor.assert_called_once_with(
        transcribe._transcribe_audio_attachments)


def test_processor_returns_empty_when_stt_url_unset():
    """With no STT URL the processor yields '' even for audio input."""
    audio = [{"path": "/tmp/foo.webm", "mime": "audio/webm"}]
    with patch.dict(os.environ, {"HERMES_WEBUI_STT_URL": ""}, clear=False):
        result = transcribe._transcribe_audio_attachments(audio)
    assert result == ""


def test_processor_returns_empty_when_no_audio_attachments():
    """An empty list or a non-audio attachment yields ''."""
    env = {"HERMES_WEBUI_STT_URL": "http://stt:8000/transcribe"}
    with patch.dict(os.environ, env):
        assert transcribe._transcribe_audio_attachments([]) == ""
        pdf_only = [{"path": "/tmp/doc.pdf", "mime": "application/pdf"}]
        assert transcribe._transcribe_audio_attachments(pdf_only) == ""


def test_processor_transcribes_audio_attachments():
    """Audio attachment -> STT call -> transcript block."""
    attachments = [{
        "path": "/tmp/voice-message-123.webm",
        "mime": "audio/webm",
        "name": "voice-message-123.webm",
    }]
    env = {"HERMES_WEBUI_STT_URL": "http://stt:8000/v1/audio/transcriptions"}
    with patch.dict(os.environ, env), \
            patch.object(transcribe, "_external_stt_transcribe",
                         return_value="hello world"):
        out = transcribe._transcribe_audio_attachments(attachments)
    assert out.startswith("[Voice message transcript]")
    assert "hello world" in out


def test_processor_detects_audio_by_filename_prefix():
    """A voice-message-* name triggers transcription even with a non-audio mime."""
    attachments = [{
        "path": "/tmp/voice-message-abc.mp4",
        "mime": "video/mp4",  # browser may upload as video/* per upload handler
        "name": "voice-message-abc.mp4",
    }]
    with patch.dict(os.environ, {"HERMES_WEBUI_STT_URL": "http://stt:8000/v1"}), \
            patch.object(transcribe, "_external_stt_transcribe",
                         return_value="hi"):
        out = transcribe._transcribe_audio_attachments(attachments)
        assert "hi" in out


def test_handle_transcribe_503_when_stt_url_missing():
    """The route answers 503 when no STT URL is configured."""
    with patch.dict(os.environ, {"HERMES_WEBUI_STT_URL": ""}, clear=False):
        handler = _FakeHandler()
        transcribe._handle_transcribe(handler, None)
        assert handler.status == 503


def test_handle_transcribe_400_on_non_multipart():
    """The route answers 400 when the request is not multipart."""
    request_headers = {"Content-Type": "application/json", "Content-Length": "10"}
    with patch.dict(os.environ, {"HERMES_WEBUI_STT_URL": "http://stt:8000/v1"}):
        handler = _FakeHandler(headers=request_headers)
        transcribe._handle_transcribe(handler, None)
        assert handler.status == 400


def test_multipart_parser_extracts_file_field():
    """_parse_multipart_file returns the named field's bytes and filename."""
    boundary = "----boundary"
    lines = [
        f"--{boundary}\r\n",
        'Content-Disposition: form-data; name="file"; filename="hello.wav"\r\n',
        "Content-Type: audio/wav\r\n\r\n",
        "FAKEAUDIO\r\n",
        f"--{boundary}--\r\n",
    ]
    body = "".join(lines).encode()
    data, fname = transcribe._parse_multipart_file(
        body, f"multipart/form-data; boundary={boundary}", "file")
    assert data == b"FAKEAUDIO"
    assert fname == "hello.wav"