dev-mode-orpheus-tts

Paused

App Files Files Community

Tomtom84 commited on Jun 8

Commit

8ff13a2

verified ·

1 Parent(s): 4708247

Create orpheus_engine.py

Browse files

Files changed (1) hide show

engines/orpheus_engine.py +295 -0

engines/orpheus_engine.py ADDED Viewed

	@@ -0,0 +1,295 @@

+# -*- coding: utf-8 -*-
+"""OrpheusEngine
+~~~~~~~~~~~~~~~~
+A drop‑in replacement for the original ``orpheus_engine.py`` that fixes
+all outstanding token‑streaming issues and eliminates audible clicks by
+* streaming **token‑IDs** instead of partial text
+* dynamically sending a *tiny* first audio chunk (3×7 codes) followed by
+  steady blocks (30×7)
+* mapping vLLM/OpenAI token‑IDs → SNAC codes without fragile
+  ``"<custom_token_"`` string parsing
+* adding an optional fade‑in / fade‑out per chunk
+* emitting a proper WAV header as the first element in the queue so that
+  browsers / HTML5 `<audio>` tags start playback immediately.
+The API (``get_voices()``, ``set_voice()``, …) is unchanged, so you can
+keep using it from RealTimeTTS.
+"""
+from __future__ import annotations
+###############################################################################
+# Standard library & 3rd‑party imports                                      #
+###############################################################################
+import json
+import logging
+import struct
+import time
+from queue import Queue
+from typing import Generator, Iterable, List, Optional
+import numpy as np
+import pyaudio  # provided by RealTimeTTS[system]
+import requests
+from RealtimeTTS.engines import BaseEngine
+###############################################################################
+# Constants                                                                  #
+###############################################################################
+DEFAULT_API_URL = "http://127.0.0.1:1234"
+DEFAULT_MODEL = "SebastianBodza/Kartoffel_Orpheus-3B_german_synthetic-v0.1"
+DEFAULT_HEADERS = {"Content-Type": "application/json"}
+DEFAULT_VOICE = "Martin"
+# Audio
+SAMPLE_RATE = 24_000
+BITS_PER_SAMPLE = 16
+AUDIO_CHANNELS = 1
+# Token‑ID magic numbers (defined in the model card)
+CODE_START_TOKEN_ID   = 128257  # <|audio|>
+CODE_REMOVE_TOKEN_ID  = 128258
+CODE_TOKEN_OFFSET     = 128266  # <custom_token_?> – first usable code id
+# Chunking strategy
+_INITIAL_GROUPS = 3    # 3×7 = 21 codes ≈ 90 ms @24 kHz
+_STEADY_GROUPS  = 30   # 30×7 = 210 codes ≈ 900 ms
+###############################################################################
+# Helper functions                                                           #
+###############################################################################
+def _create_wav_header(sample_rate: int, bits_per_sample: int, channels: int) -> bytes:
+    """Return a 44‑byte WAV/PCM header with unknown data size (0xFFFFFFFF)."""
+    riff_size = 0xFFFFFFFF
+    header = b"RIFF" + struct.pack("<I", riff_size) + b"WAVEfmt "
+    header += struct.pack("<IHHIIHH", 16, 1, channels, sample_rate,
+                           sample_rate * channels * bits_per_sample // 8,
+                           channels * bits_per_sample // 8, bits_per_sample)
+    header += b"data" + struct.pack("<I", 0xFFFFFFFF)
+    return header
+def _fade_in_out(audio: np.ndarray, fade_ms: int = 50) -> np.ndarray:
+    """Apply linear fade‑in/out to avoid clicks."""
+    if fade_ms <= 0:
+        return audio
+    fade_samples = int(SAMPLE_RATE * fade_ms / 1000)
+    fade_samples -= fade_samples % 2  # keep it even
+    if fade_samples == 0 or audio.size < 2 * fade_samples:
+        return audio
+    ramp = np.linspace(0.0, 1.0, fade_samples, dtype=np.float32)
+    audio[:fade_samples] *= ramp
+    audio[-fade_samples:] *= ramp[::-1]
+    return audio
+###############################################################################
+# SNAC – lightweight wrapper                                                 #
+###############################################################################
+try:
+    from snac import SNAC
+    _snac_model: Optional[SNAC] = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
+    _snac_model = _snac_model.to("cuda" if _snac_model and _snac_model.is_cuda_available() else "cpu")
+except Exception as exc:  # pragma: no cover
+    logging.warning("SNAC model could not be loaded – %s", exc)
+    _snac_model = None
+def _codes_to_audio(codes: List[int]) -> bytes:
+    """Convert a *flat* list of SNAC codes to 16‑bit PCM bytes."""
+    if not _snac_model or not codes:
+        return b""
+    # --- redistribute into 3 snac layers (see original paper) --------------
+    groups = len(codes) // 7
+    codes = codes[: groups * 7]  # trim incomplete tail
+    if groups == 0:
+        return b""
+    l1, l2, l3 = [], [], []
+    for g in range(groups):
+        base = g * 7
+        l1.append(codes[base])
+        l2.append(codes[base + 1] - 4096)
+        l3.extend([
+            codes[base + 2] - 2 * 4096,
+            codes[base + 3] - 3 * 4096,
+            codes[base + 5] - 5 * 4096,
+            codes[base + 6] - 6 * 4096,
+        ])
+        l2.append(codes[base + 4] - 4 * 4096)
+    import torch
+    with torch.no_grad():
+        layers = [
+            torch.tensor(l1, device=_snac_model.device).unsqueeze(0),
+            torch.tensor(l2, device=_snac_model.device).unsqueeze(0),
+            torch.tensor(l3, device=_snac_model.device).unsqueeze(0),
+        ]
+        wav = _snac_model.decode(layers).cpu().numpy().squeeze()
+    wav = _fade_in_out(wav)
+    pcm = np.clip(wav * 32767, -32768, 32767).astype(np.int16).tobytes()
+    return pcm
+###############################################################################
+# Main class                                                                 #
+###############################################################################
+class OrpheusVoice:
+    def __init__(self, name: str, gender: str | None = None):
+        self.name = name
+        self.gender = gender
+class OrpheusEngine(BaseEngine):
+    """Realtime TTS engine using the Orpheus SNAC model via vLLM."""
+    _SPEAKERS = [
+        OrpheusVoice("Martin", "m"), OrpheusVoice("Emma", "f"),
+        OrpheusVoice("Luca", "m"), OrpheusVoice("Anna", "f"),
+        OrpheusVoice("Jakob", "m"), OrpheusVoice("Anton", "m"),
+        OrpheusVoice("Julian", "m"), OrpheusVoice("Jan", "m"),
+        OrpheusVoice("Alexander", "m"), OrpheusVoice("Emil", "m"),
+        OrpheusVoice("Ben", "m"), OrpheusVoice("Elias", "m"),
+        OrpheusVoice("Felix", "m"), OrpheusVoice("Jonas", "m"),
+        OrpheusVoice("Noah", "m"), OrpheusVoice("Maximilian", "m"),
+        OrpheusVoice("Sophie", "f"), OrpheusVoice("Marie", "f"),
+        OrpheusVoice("Mia", "f"), OrpheusVoice("Maria", "f"),
+        OrpheusVoice("Sophia", "f"), OrpheusVoice("Lina", "f"),
+        OrpheusVoice("Lea", "f"),
+    ]
+    # ---------------------------------------------------------------------
+    def __init__(
+        self,
+        api_url: str = DEFAULT_API_URL,
+        model: str = DEFAULT_MODEL,
+        headers: dict = DEFAULT_HEADERS,
+        voice: Optional[OrpheusVoice] = None,
+        temperature: float = 0.6,
+        top_p: float = 0.9,
+        max_tokens: int = 1200,
+        repetition_penalty: float = 1.1,
+        debug: bool = False,
+    ) -> None:
+        super().__init__()
+        self.api_url = api_url.rstrip("/")
+        self.model = model
+        self.headers = headers
+        self.voice = voice or OrpheusVoice(DEFAULT_VOICE)
+        self.temperature = temperature
+        self.top_p = top_p
+        self.max_tokens = max_tokens
+        self.repetition_penalty = repetition_penalty
+        self.debug = debug
+        self.queue: "Queue[bytes | None]" = Queue()
+        self.engine_name = "orpheus"
+    # ------------------------------------------------------------------ API
+    def get_stream_info(self):
+        return pyaudio.paInt16, AUDIO_CHANNELS, SAMPLE_RATE
+    def get_voices(self):
+        return self._SPEAKERS
+    def set_voice(self, voice_name: str):
+        if voice_name not in {v.name for v in self._SPEAKERS}:
+            raise ValueError(f"Unknown Orpheus speaker '{voice_name}'")
+        self.voice = OrpheusVoice(voice_name)
+    # --------------------------------------------------------------- public
+    def synthesize(self, text: str) -> bool:  # noqa: C901 (long)
+        """Start streaming TTS for **text** – blocks until finished."""
+        super().synthesize(text)
+        self.queue.put(_create_wav_header(SAMPLE_RATE, BITS_PER_SAMPLE, AUDIO_CHANNELS))
+        try:
+            code_stream = self._stream_snac_codes(text)
+            first_chunk = True
+            buffer: List[int] = []
+            sent = 0
+            groups_needed = _INITIAL_GROUPS
+            for code_id in code_stream:
+                buffer.append(code_id)
+                available = len(buffer) - sent
+                if available >= groups_needed * 7:
+                    chunk_codes = buffer[sent : sent + groups_needed * 7]
+                    sent += groups_needed * 7
+                    pcm = _codes_to_audio(chunk_codes)
+                    if pcm:
+                        self.queue.put(pcm)
+                        first_chunk = False
+                        groups_needed = _STEADY_GROUPS
+            # flush remaining full groups
+            remaining = len(buffer) - sent
+            final_groups = remaining // 7
+            if final_groups:
+                pcm = _codes_to_audio(buffer[sent : sent + final_groups * 7])
+                if pcm:
+                    self.queue.put(pcm)
+            return True
+        except Exception as exc:  # pragma: no cover
+            logging.exception("OrpheusEngine: synthesis failed – %s", exc)
+            return False
+        finally:
+            self.queue.put(None)  # close stream
+    # ------------------------------------------------------------ internals
+    def _format_prompt(self, prompt: str) -> str:
+        return f"<|audio|>{self.voice.name}: {prompt}<|eot_id|>"
+    def _stream_snac_codes(self, prompt: str) -> Generator[int, None, None]:
+        """Yield SNAC code‑IDs as they arrive from the model."""
+        payload = {
+            "model": self.model,
+            "prompt": self._format_prompt(prompt),
+            "max_tokens": self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "stream": True,
+            "skip_special_tokens": False,
+            "frequency_penalty": self.repetition_penalty,
+        }
+        url = f"{self.api_url}/v1/completions"  # plain completion endpoint
+        with requests.post(url, headers=self.headers, json=payload, stream=True, timeout=600) as r:
+            r.raise_for_status()
+            started = False
+            for line in r.iter_lines():
+                if not line:
+                    continue
+                if line.startswith(b"data: "):
+                    data = line[6:].decode()
+                    if data.strip() == "[DONE]":
+                        break
+                    try:
+                        obj = json.loads(data)
+                        delta = obj["choices"][0]
+                        tid: int = delta.get("token_id")  # vLLM ≥0.9 provides this
+                        if tid is None:
+                            # fallback: derive from text
+                            text_piece = delta.get("text", "")
+                            if not text_piece:
+                                continue
+                            tid = ord(text_piece[-1])  # NOT reliable; skip
+                            continue
+                    except Exception:
+                        continue
+                    if not started:
+                        if tid == CODE_START_TOKEN_ID:
+                            started = True
+                        continue
+                    if tid == CODE_REMOVE_TOKEN_ID or tid < CODE_TOKEN_OFFSET:
+                        continue
+                    yield tid - CODE_TOKEN_OFFSET
+    # ------------------------------------------------------------------ misc
+    def __del__(self):
+        try:
+            self.queue.put(None)
+        except Exception:
+            pass