Commit 05b45a5 · Michael Hu committed
1 Parent(s): e55a2a8

initial check in
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- Dockerfile +66 -0
- api/__init__.py +1 -0
- api/src/builds/v1_0/config.json +172 -0
- api/src/core/__init__.py +3 -0
- api/src/core/config.py +85 -0
- api/src/core/don_quixote.txt +9 -0
- api/src/core/model_config.py +50 -0
- api/src/core/openai_mappings.json +18 -0
- api/src/core/paths.py +413 -0
- api/src/inference/__init__.py +12 -0
- api/src/inference/base.py +127 -0
- api/src/inference/kokoro_v1.py +370 -0
- api/src/inference/model_manager.py +171 -0
- api/src/inference/voice_manager.py +115 -0
- api/src/main.py +152 -0
- api/src/models/v1_0/config.json +150 -0
- api/src/routers/__init__.py +1 -0
- api/src/routers/debug.py +209 -0
- api/src/routers/development.py +408 -0
- api/src/routers/openai_compatible.py +662 -0
- api/src/routers/web_player.py +49 -0
- api/src/services/__init__.py +3 -0
- api/src/services/audio.py +248 -0
- api/src/services/streaming_audio_writer.py +100 -0
- api/src/services/temp_manager.py +170 -0
- api/src/services/text_processing/__init__.py +21 -0
- api/src/services/text_processing/normalizer.py +415 -0
- api/src/services/text_processing/phonemizer.py +102 -0
- api/src/services/text_processing/text_processor.py +276 -0
- api/src/services/text_processing/vocabulary.py +40 -0
- api/src/services/tts_service.py +459 -0
- api/src/structures/__init__.py +17 -0
- api/src/structures/custom_responses.py +50 -0
- api/src/structures/model_schemas.py +16 -0
- api/src/structures/schemas.py +158 -0
- api/src/structures/text_schemas.py +41 -0
- api/tests/__init__.py +1 -0
- api/tests/conftest.py +71 -0
- api/tests/test_audio_service.py +256 -0
- api/tests/test_data/generate_test_data.py +23 -0
- api/tests/test_data/test_audio.npy +0 -0
- api/tests/test_development.py +34 -0
- api/tests/test_kokoro_v1.py +165 -0
- api/tests/test_normalizer.py +179 -0
- api/tests/test_openai_endpoints.py +499 -0
- api/tests/test_paths.py +138 -0
- api/tests/test_text_processor.py +105 -0
- api/tests/test_tts_service.py +126 -0
- charts/kokoro-fastapi/.helmignore +23 -0
- charts/kokoro-fastapi/Chart.yaml +12 -0
Dockerfile
ADDED
@@ -0,0 +1,66 @@
FROM python:3.10-slim

# Install dependencies and check espeak location
RUN apt-get update && apt-get install -y \
    espeak-ng \
    espeak-ng-data \
    git \
    libsndfile1 \
    curl \
    ffmpeg \
    g++ \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/espeak-ng-data \
    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/ && \
    mv /root/.local/bin/uvx /usr/local/bin/

# Create non-root user and set up directories and permissions
RUN useradd -m -u 1000 appuser && \
    mkdir -p /app/api/src/models/v1_0 && \
    chown -R appuser:appuser /app

USER appuser
WORKDIR /app

# Copy dependency files
COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

# Install Rust (required to build sudachipy and pyopenjtalk-plus)
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/home/appuser/.cargo/bin:$PATH"

# Install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
    uv venv --python 3.10 && \
    uv sync --extra cpu

# Copy project files including models
COPY --chown=appuser:appuser api ./api
COPY --chown=appuser:appuser web ./web
COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app:/app/api \
    PATH="/app/.venv/bin:$PATH" \
    UV_LINK_MODE=copy \
    USE_GPU=false \
    PHONEMIZER_ESPEAK_PATH=/usr/bin \
    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

ENV DOWNLOAD_MODEL=true
# Download model if enabled
RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
    python download_model.py --output api/src/models/v1_0; \
    fi

ENV DEVICE="cpu"
# Run FastAPI server through entrypoint.sh
CMD ["./entrypoint.sh"]
api/__init__.py
ADDED
@@ -0,0 +1 @@
# Make api directory a Python package
api/src/builds/v1_0/config.json
ADDED
@@ -0,0 +1,172 @@
{
    "istftnet": {
        "upsample_kernel_sizes": [
            20,
            12
        ],
        "upsample_rates": [
            10,
            6
        ],
        "gen_istft_hop_size": 5,
        "gen_istft_n_fft": 20,
        "resblock_dilation_sizes": [
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ]
        ],
        "resblock_kernel_sizes": [
            3,
            7,
            11
        ],
        "upsample_initial_channel": 512
    },
    "dim_in": 64,
    "dropout": 0.2,
    "hidden_dim": 512,
    "max_conv_dim": 512,
    "max_dur": 50,
    "multispeaker": true,
    "n_layer": 3,
    "n_mels": 80,
    "n_token": 178,
    "style_dim": 128,
    "text_encoder_kernel_size": 5,
    "plbert": {
        "hidden_size": 768,
        "num_attention_heads": 12,
        "intermediate_size": 2048,
        "max_position_embeddings": 512,
        "num_hidden_layers": 12,
        "dropout": 0.1
    },
    "vocab": {
        ";": 1,
        ":": 2,
        ",": 3,
        ".": 4,
        "!": 5,
        "?": 6,
        "—": 9,
        "…": 10,
        "\"": 11,
        "(": 12,
        ")": 13,
        "“": 14,
        "”": 15,
        " ": 16,
        "̃": 17,
        "ʣ": 18,
        "ʥ": 19,
        "ʦ": 20,
        "ʨ": 21,
        "ᵝ": 22,
        "ꭧ": 23,
        "A": 24,
        "I": 25,
        "O": 31,
        "Q": 33,
        "S": 35,
        "T": 36,
        "W": 39,
        "Y": 41,
        "ᵊ": 42,
        "a": 43,
        "b": 44,
        "c": 45,
        "d": 46,
        "e": 47,
        "f": 48,
        "h": 50,
        "i": 51,
        "j": 52,
        "k": 53,
        "l": 54,
        "m": 55,
        "n": 56,
        "o": 57,
        "p": 58,
        "q": 59,
        "r": 60,
        "s": 61,
        "t": 62,
        "u": 63,
        "v": 64,
        "w": 65,
        "x": 66,
        "y": 67,
        "z": 68,
        "ɑ": 69,
        "ɐ": 70,
        "ɒ": 71,
        "æ": 72,
        "β": 75,
        "ɔ": 76,
        "ɕ": 77,
        "ç": 78,
        "ɖ": 80,
        "ð": 81,
        "ʤ": 82,
        "ə": 83,
        "ɚ": 85,
        "ɛ": 86,
        "ɜ": 87,
        "ɟ": 90,
        "ɡ": 92,
        "ɥ": 99,
        "ɨ": 101,
        "ɪ": 102,
        "ʝ": 103,
        "ɯ": 110,
        "ɰ": 111,
        "ŋ": 112,
        "ɳ": 113,
        "ɲ": 114,
        "ɴ": 115,
        "ø": 116,
        "ɸ": 118,
        "θ": 119,
        "œ": 120,
        "ɹ": 123,
        "ɾ": 125,
        "ɻ": 126,
        "ʁ": 128,
        "ɽ": 129,
        "ʂ": 130,
        "ʃ": 131,
        "ʈ": 132,
        "ʧ": 133,
        "ʊ": 135,
        "ʋ": 136,
        "ʌ": 138,
        "ɣ": 139,
        "ɤ": 140,
        "χ": 142,
        "ʎ": 143,
        "ʒ": 147,
        "ʔ": 148,
        "ˈ": 156,
        "ˌ": 157,
        "ː": 158,
        "ʰ": 162,
        "ʲ": 164,
        "↓": 169,
        "→": 171,
        "↗": 172,
        "↘": 173,
        "ᵻ": 177
    }
}
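
An assumed reading, not stated in the diff: the "vocab" block maps the model's phoneme alphabet to integer token ids, so a phoneme string is tokenized by a per-character lookup. A minimal sketch:

import json

# Hypothetical illustration of applying the vocab to a phoneme string.
with open("api/src/builds/v1_0/config.json", encoding="utf-8") as f:
    vocab = json.load(f)["vocab"]

phonemes = "həlˈoʊ"  # example IPA-style string
ids = [vocab[ch] for ch in phonemes if ch in vocab]
print(ids)  # [50, 83, 54, 156, 57, 135]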
api/src/core/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .config import settings

__all__ = ["settings"]
api/src/core/config.py
ADDED
@@ -0,0 +1,85 @@
import torch
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    # API Settings
    api_title: str = "Kokoro TTS API"
    api_description: str = "API for text-to-speech generation using Kokoro"
    api_version: str = "1.0.0"
    host: str = "0.0.0.0"
    port: int = 8880

    # Application Settings
    output_dir: str = "output"
    output_dir_size_limit_mb: float = 500.0  # Maximum size of output directory in MB
    default_voice: str = "af_heart"
    default_voice_code: str | None = (
        None  # If set, overrides the first letter of voice name, though api call param still takes precedence
    )
    use_gpu: bool = True  # Whether to use GPU acceleration if available
    device_type: str | None = (
        None  # Will be auto-detected if None, can be "cuda", "mps", or "cpu"
    )
    allow_local_voice_saving: bool = (
        False  # Whether to allow saving combined voices locally
    )

    # Container absolute paths
    model_dir: str = "/app/api/src/models"  # Absolute path in container
    voices_dir: str = "/app/api/src/voices/v1_0"  # Absolute path in container

    # Audio Settings
    sample_rate: int = 24000
    # Text Processing Settings
    target_min_tokens: int = 175  # Target minimum tokens per chunk
    target_max_tokens: int = 250  # Target maximum tokens per chunk
    absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
    advanced_text_normalization: bool = True  # Preprocesses the text before misaki
    voice_weight_normalization: bool = (
        True  # Normalize the voice weights so they add up to 1
    )

    gap_trim_ms: int = (
        1  # Base amount to trim from streaming chunk ends in milliseconds
    )
    dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
    dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {
        ".": 1,
        "!": 0.9,
        "?": 1,
        ",": 0.8,
    }

    # Web Player Settings
    enable_web_player: bool = True  # Whether to serve the web player UI
    web_player_path: str = "web"  # Path to web player static files
    cors_origins: list[str] = ["*"]  # CORS origins for web player
    cors_enabled: bool = True  # Whether to enable CORS

    # Temp File Settings for Web UI
    temp_file_dir: str = "api/temp_files"  # Directory for temporary audio files (relative to project root)
    max_temp_dir_size_mb: int = 2048  # Maximum size of temp directory (2GB)
    max_temp_dir_age_hours: int = 1  # Remove temp files older than 1 hour
    max_temp_dir_count: int = 3  # Maximum number of temp files to keep

    class Config:
        env_file = ".env"

    def get_device(self) -> str:
        """Get the appropriate device based on settings and availability"""
        if not self.use_gpu:
            return "cpu"

        if self.device_type:
            return self.device_type

        # Auto-detect device
        if torch.backends.mps.is_available():
            return "mps"
        elif torch.cuda.is_available():
            return "cuda"
        return "cpu"


settings = Settings()
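
A minimal usage sketch (not part of the commit): Settings is a pydantic-settings model, so fields can be overridden through environment variables or the .env file, and get_device() resolves the runtime device. The import path assumes /app is on PYTHONPATH as the Dockerfile sets.

import os

# Hypothetical overrides; pydantic-settings matches env var names to fields case-insensitively.
os.environ["USE_GPU"] = "false"
os.environ["DEFAULT_VOICE"] = "af_heart"

from api.src.core.config import Settings

settings = Settings()            # values come from env / .env, falling back to the defaults above
print(settings.default_voice)    # "af_heart"
print(settings.get_device())     # "cpu", because USE_GPU=false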
api/src/core/don_quixote.txt
ADDED
@@ -0,0 +1,9 @@
In a village of La Mancha, the name of which I have no desire to call
to mind, there lived not long since one of those gentlemen that keep a
lance in the lance-rack, an old buckler, a lean hack, and a greyhound
for coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
extra on Sundays, made away with three-quarters of his income. The rest
of it went in a doublet of fine cloth and velvet breeches and shoes to
match for holidays, while on week-days he made a brave figure in his
best homespun.
api/src/core/model_config.py
ADDED
@@ -0,0 +1,50 @@
"""Model configuration for Kokoro V1.

This module provides model-specific configuration settings that complement the application-level
settings in config.py. While config.py handles general application settings (API, paths, etc.),
this module focuses on memory management and model file paths.
"""

from pydantic import BaseModel, Field


class KokoroV1Config(BaseModel):
    """Kokoro V1 configuration."""

    languages: list[str] = ["en"]

    class Config:
        frozen = True


class PyTorchConfig(BaseModel):
    """PyTorch backend configuration."""

    memory_threshold: float = Field(0.8, description="Memory threshold for cleanup")
    retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors")

    class Config:
        frozen = True


class ModelConfig(BaseModel):
    """Kokoro V1 model configuration."""

    # General settings
    cache_voices: bool = Field(True, description="Whether to cache voice tensors")
    voice_cache_size: int = Field(2, description="Maximum number of cached voices")

    # Model filename
    pytorch_kokoro_v1_file: str = Field(
        "v1_0/kokoro-v1_0.pth", description="PyTorch Kokoro V1 model filename"
    )

    # Backend config
    pytorch_gpu: PyTorchConfig = Field(default_factory=PyTorchConfig)

    class Config:
        frozen = True


# Global instance
model_config = ModelConfig()
api/src/core/openai_mappings.json
ADDED
@@ -0,0 +1,18 @@
{
    "models": {
        "tts-1": "kokoro-v1_0",
        "tts-1-hd": "kokoro-v1_0",
        "kokoro": "kokoro-v1_0"
    },
    "voices": {
        "alloy": "am_v0adam",
        "ash": "af_v0nicole",
        "coral": "bf_v0emma",
        "echo": "af_v0bella",
        "fable": "af_sarah",
        "onyx": "bm_george",
        "nova": "bf_isabella",
        "sage": "am_michael",
        "shimmer": "af_sky"
    }
}
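
A rough sketch of how this mapping might be consumed; the routers that actually use it appear later in the diff, and the helper below is hypothetical.

import json

with open("api/src/core/openai_mappings.json", encoding="utf-8") as f:
    mappings = json.load(f)

def resolve_voice(openai_voice: str) -> str:
    # Translate an OpenAI-style voice name to a Kokoro voice id,
    # falling back to the requested name if it already is one.
    return mappings["voices"].get(openai_voice, openai_voice)

print(resolve_voice("alloy"))   # am_v0adam
print(resolve_voice("af_sky"))  # af_sky (already a Kokoro voice)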
api/src/core/paths.py
ADDED
@@ -0,0 +1,413 @@
"""Async file and path operations."""

import io
import json
import os
from pathlib import Path
from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Set

import aiofiles
import aiofiles.os
import torch
from loguru import logger

from .config import settings


async def _find_file(
    filename: str,
    search_paths: List[str],
    filter_fn: Optional[Callable[[str], bool]] = None,
) -> str:
    """Find file in search paths.

    Args:
        filename: Name of file to find
        search_paths: List of paths to search in
        filter_fn: Optional function to filter files

    Returns:
        Absolute path to file

    Raises:
        RuntimeError: If file not found
    """
    if os.path.isabs(filename) and await aiofiles.os.path.exists(filename):
        return filename

    for path in search_paths:
        full_path = os.path.join(path, filename)
        if await aiofiles.os.path.exists(full_path):
            if filter_fn is None or filter_fn(full_path):
                return full_path

    raise FileNotFoundError(f"File not found: {filename} in paths: {search_paths}")


async def _scan_directories(
    search_paths: List[str], filter_fn: Optional[Callable[[str], bool]] = None
) -> Set[str]:
    """Scan directories for files.

    Args:
        search_paths: List of paths to scan
        filter_fn: Optional function to filter files

    Returns:
        Set of matching filenames
    """
    results = set()

    for path in search_paths:
        if not await aiofiles.os.path.exists(path):
            continue

        try:
            # Get directory entries first
            entries = await aiofiles.os.scandir(path)
            # Then process entries after await completes
            for entry in entries:
                if filter_fn is None or filter_fn(entry.name):
                    results.add(entry.name)
        except Exception as e:
            logger.warning(f"Error scanning {path}: {e}")

    return results


async def get_model_path(model_name: str) -> str:
    """Get path to model file.

    Args:
        model_name: Name of model file

    Returns:
        Absolute path to model file

    Raises:
        RuntimeError: If model not found
    """
    # Get api directory path (two levels up from core)
    api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))

    # Construct model directory path relative to api directory
    model_dir = os.path.join(api_dir, settings.model_dir)

    # Ensure model directory exists
    os.makedirs(model_dir, exist_ok=True)

    # Search in model directory
    search_paths = [model_dir]
    logger.debug(f"Searching for model in path: {model_dir}")

    return await _find_file(model_name, search_paths)


async def get_voice_path(voice_name: str) -> str:
    """Get path to voice file.

    Args:
        voice_name: Name of voice file (without .pt extension)

    Returns:
        Absolute path to voice file

    Raises:
        RuntimeError: If voice not found
    """
    # Get api directory path
    api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))

    # Construct voice directory path relative to api directory
    voice_dir = os.path.join(api_dir, settings.voices_dir)

    # Ensure voice directory exists
    os.makedirs(voice_dir, exist_ok=True)

    voice_file = f"{voice_name}.pt"

    # Search in voice directory
    search_paths = [voice_dir]
    logger.debug(f"Searching for voice in path: {voice_dir}")

    return await _find_file(voice_file, search_paths)


async def list_voices() -> List[str]:
    """List available voice files.

    Returns:
        List of voice names (without .pt extension)
    """
    # Get api directory path
    api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))

    # Construct voice directory path relative to api directory
    voice_dir = os.path.join(api_dir, settings.voices_dir)

    # Ensure voice directory exists
    os.makedirs(voice_dir, exist_ok=True)

    # Search in voice directory
    search_paths = [voice_dir]
    logger.debug(f"Scanning for voices in path: {voice_dir}")

    def filter_voice_files(name: str) -> bool:
        return name.endswith(".pt")

    voices = await _scan_directories(search_paths, filter_voice_files)
    return sorted([name[:-3] for name in voices])  # Remove .pt extension


async def load_voice_tensor(
    voice_path: str, device: str = "cpu", weights_only=False
) -> torch.Tensor:
    """Load voice tensor from file.

    Args:
        voice_path: Path to voice file
        device: Device to load tensor to

    Returns:
        Voice tensor

    Raises:
        RuntimeError: If file cannot be read
    """
    try:
        async with aiofiles.open(voice_path, "rb") as f:
            data = await f.read()
            return torch.load(
                io.BytesIO(data), map_location=device, weights_only=weights_only
            )
    except Exception as e:
        raise RuntimeError(f"Failed to load voice tensor from {voice_path}: {e}")


async def save_voice_tensor(tensor: torch.Tensor, voice_path: str) -> None:
    """Save voice tensor to file.

    Args:
        tensor: Voice tensor to save
        voice_path: Path to save voice file

    Raises:
        RuntimeError: If file cannot be written
    """
    try:
        buffer = io.BytesIO()
        torch.save(tensor, buffer)
        async with aiofiles.open(voice_path, "wb") as f:
            await f.write(buffer.getvalue())
    except Exception as e:
        raise RuntimeError(f"Failed to save voice tensor to {voice_path}: {e}")


async def load_json(path: str) -> dict:
    """Load JSON file asynchronously.

    Args:
        path: Path to JSON file

    Returns:
        Parsed JSON data

    Raises:
        RuntimeError: If file cannot be read or parsed
    """
    try:
        async with aiofiles.open(path, "r", encoding="utf-8") as f:
            content = await f.read()
            return json.loads(content)
    except Exception as e:
        raise RuntimeError(f"Failed to load JSON file {path}: {e}")


async def load_model_weights(path: str, device: str = "cpu") -> dict:
    """Load model weights asynchronously.

    Args:
        path: Path to model file (.pth or .onnx)
        device: Device to load model to

    Returns:
        Model weights

    Raises:
        RuntimeError: If file cannot be read
    """
    try:
        async with aiofiles.open(path, "rb") as f:
            data = await f.read()
            return torch.load(io.BytesIO(data), map_location=device, weights_only=True)
    except Exception as e:
        raise RuntimeError(f"Failed to load model weights from {path}: {e}")


async def read_file(path: str) -> str:
    """Read text file asynchronously.

    Args:
        path: Path to file

    Returns:
        File contents as string

    Raises:
        RuntimeError: If file cannot be read
    """
    try:
        async with aiofiles.open(path, "r", encoding="utf-8") as f:
            return await f.read()
    except Exception as e:
        raise RuntimeError(f"Failed to read file {path}: {e}")


async def read_bytes(path: str) -> bytes:
    """Read file as bytes asynchronously.

    Args:
        path: Path to file

    Returns:
        File contents as bytes

    Raises:
        RuntimeError: If file cannot be read
    """
    try:
        async with aiofiles.open(path, "rb") as f:
            return await f.read()
    except Exception as e:
        raise RuntimeError(f"Failed to read file {path}: {e}")


async def get_web_file_path(filename: str) -> str:
    """Get path to web static file.

    Args:
        filename: Name of file in web directory

    Returns:
        Absolute path to file

    Raises:
        RuntimeError: If file not found
    """
    # Get project root directory (four levels up from core to get to project root)
    root_dir = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    )

    # Construct web directory path relative to project root
    web_dir = os.path.join("/app", settings.web_player_path)

    # Search in web directory
    search_paths = [web_dir]
    logger.debug(f"Searching for web file in path: {web_dir}")

    return await _find_file(filename, search_paths)


async def get_content_type(path: str) -> str:
    """Get content type for file.

    Args:
        path: Path to file

    Returns:
        Content type string
    """
    ext = os.path.splitext(path)[1].lower()
    return {
        ".html": "text/html",
        ".js": "application/javascript",
        ".css": "text/css",
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".svg": "image/svg+xml",
        ".ico": "image/x-icon",
    }.get(ext, "application/octet-stream")


async def verify_model_path(model_path: str) -> bool:
    """Verify model file exists at path."""
    return await aiofiles.os.path.exists(model_path)


async def cleanup_temp_files() -> None:
    """Clean up old temp files on startup"""
    try:
        if not await aiofiles.os.path.exists(settings.temp_file_dir):
            await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)
            return

        entries = await aiofiles.os.scandir(settings.temp_file_dir)
        for entry in entries:
            if entry.is_file():
                stat = await aiofiles.os.stat(entry.path)
                max_age = stat.st_mtime + (settings.max_temp_dir_age_hours * 3600)
                if max_age < stat.st_mtime:
                    try:
                        await aiofiles.os.remove(entry.path)
                        logger.info(f"Cleaned up old temp file: {entry.name}")
                    except Exception as e:
                        logger.warning(
                            f"Failed to delete old temp file {entry.name}: {e}"
                        )
    except Exception as e:
        logger.warning(f"Error cleaning temp files: {e}")


async def get_temp_file_path(filename: str) -> str:
    """Get path to temporary audio file.

    Args:
        filename: Name of temp file

    Returns:
        Absolute path to temp file

    Raises:
        RuntimeError: If temp directory does not exist
    """
    temp_path = os.path.join(settings.temp_file_dir, filename)

    # Ensure temp directory exists
    if not await aiofiles.os.path.exists(settings.temp_file_dir):
        await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)

    return temp_path


async def list_temp_files() -> List[str]:
    """List temporary audio files.

    Returns:
        List of temp file names
    """
    if not await aiofiles.os.path.exists(settings.temp_file_dir):
        return []

    entries = await aiofiles.os.scandir(settings.temp_file_dir)
    return [entry.name for entry in entries if entry.is_file()]


async def get_temp_dir_size() -> int:
    """Get total size of temp directory in bytes.

    Returns:
        Size in bytes
    """
    if not await aiofiles.os.path.exists(settings.temp_file_dir):
        return 0

    total = 0
    entries = await aiofiles.os.scandir(settings.temp_file_dir)
    for entry in entries:
        if entry.is_file():
            stat = await aiofiles.os.stat(entry.path)
            total += stat.st_size
    return total
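
For illustration only (not part of the commit), these async helpers are meant to be awaited from request handlers; a standalone sketch, assuming voices exist under settings.voices_dir and /app is on PYTHONPATH as in the Dockerfile:

import asyncio

from api.src.core import paths

async def main():
    voices = await paths.list_voices()              # e.g. ["af_heart", "af_sky", ...]
    if voices:
        voice_path = await paths.get_voice_path(voices[0])
        tensor = await paths.load_voice_tensor(voice_path, device="cpu")
        print(voices[0], tuple(tensor.shape))

asyncio.run(main())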
api/src/inference/__init__.py
ADDED
@@ -0,0 +1,12 @@
"""Model inference package."""

from .base import BaseModelBackend
from .kokoro_v1 import KokoroV1
from .model_manager import ModelManager, get_manager

__all__ = [
    "BaseModelBackend",
    "ModelManager",
    "get_manager",
    "KokoroV1",
]
api/src/inference/base.py
ADDED
@@ -0,0 +1,127 @@
"""Base interface for Kokoro inference."""

from abc import ABC, abstractmethod
from typing import AsyncGenerator, List, Optional, Tuple, Union

import numpy as np
import torch


class AudioChunk:
    """Class for audio chunks returned by model backends"""

    def __init__(
        self,
        audio: np.ndarray,
        word_timestamps: Optional[List] = [],
        output: Optional[Union[bytes, np.ndarray]] = b"",
    ):
        self.audio = audio
        self.word_timestamps = word_timestamps
        self.output = output

    @staticmethod
    def combine(audio_chunk_list: List):
        output = AudioChunk(
            audio_chunk_list[0].audio, audio_chunk_list[0].word_timestamps
        )

        for audio_chunk in audio_chunk_list[1:]:
            output.audio = np.concatenate(
                (output.audio, audio_chunk.audio), dtype=np.int16
            )
            if output.word_timestamps is not None:
                output.word_timestamps += audio_chunk.word_timestamps

        return output


class ModelBackend(ABC):
    """Abstract base class for model inference backend."""

    @abstractmethod
    async def load_model(self, path: str) -> None:
        """Load model from path.

        Args:
            path: Path to model file

        Raises:
            RuntimeError: If model loading fails
        """
        pass

    @abstractmethod
    async def generate(
        self,
        text: str,
        voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
        speed: float = 1.0,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Generate audio from text.

        Args:
            text: Input text to synthesize
            voice: Either a voice path or tuple of (name, tensor/path)
            speed: Speed multiplier

        Yields:
            Generated audio chunks

        Raises:
            RuntimeError: If generation fails
        """
        pass

    @abstractmethod
    def unload(self) -> None:
        """Unload model and free resources."""
        pass

    @property
    @abstractmethod
    def is_loaded(self) -> bool:
        """Check if model is loaded.

        Returns:
            True if model is loaded, False otherwise
        """
        pass

    @property
    @abstractmethod
    def device(self) -> str:
        """Get device model is running on.

        Returns:
            Device string ('cpu' or 'cuda')
        """
        pass


class BaseModelBackend(ModelBackend):
    """Base implementation of model backend."""

    def __init__(self):
        """Initialize base backend."""
        self._model: Optional[torch.nn.Module] = None
        self._device: str = "cpu"

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._model is not None

    @property
    def device(self) -> str:
        """Get device model is running on."""
        return self._device

    def unload(self) -> None:
        """Unload model and free resources."""
        if self._model is not None:
            del self._model
            self._model = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
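
A small illustration (assumed, not from the diff) of how AudioChunk.combine stitches streamed chunks back into one buffer; the import path assumes the project layout above with its dependencies installed.

import numpy as np

from api.src.inference.base import AudioChunk

# Two fake int16 chunks standing in for streamed model output.
a = AudioChunk(np.zeros(24000, dtype=np.int16), word_timestamps=[])
b = AudioChunk(np.ones(12000, dtype=np.int16), word_timestamps=[])

merged = AudioChunk.combine([a, b])
print(merged.audio.shape)  # (36000,) - one contiguous buffer, timestamps concatenated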
api/src/inference/kokoro_v1.py
ADDED
@@ -0,0 +1,370 @@
"""Clean Kokoro implementation with controlled resource management."""

import os
from typing import AsyncGenerator, Dict, Optional, Tuple, Union

import numpy as np
import torch
from kokoro import KModel, KPipeline
from loguru import logger

from ..core import paths
from ..core.config import settings
from ..core.model_config import model_config
from ..structures.schemas import WordTimestamp
from .base import AudioChunk, BaseModelBackend


class KokoroV1(BaseModelBackend):
    """Kokoro backend with controlled resource management."""

    def __init__(self):
        """Initialize backend with environment-based configuration."""
        super().__init__()
        # Strictly respect settings.use_gpu
        self._device = settings.get_device()
        self._model: Optional[KModel] = None
        self._pipelines: Dict[str, KPipeline] = {}  # Store pipelines by lang_code

    async def load_model(self, path: str) -> None:
        """Load pre-baked model.

        Args:
            path: Path to model file

        Raises:
            RuntimeError: If model loading fails
        """
        try:
            # Get verified model path
            model_path = await paths.get_model_path(path)
            config_path = os.path.join(os.path.dirname(model_path), "config.json")

            if not os.path.exists(config_path):
                raise RuntimeError(f"Config file not found: {config_path}")

            logger.info(f"Loading Kokoro model on {self._device}")
            logger.info(f"Config path: {config_path}")
            logger.info(f"Model path: {model_path}")

            # Load model and let KModel handle device mapping
            self._model = KModel(config=config_path, model=model_path).eval()
            # For MPS, manually move ISTFT layers to CPU while keeping rest on MPS
            if self._device == "mps":
                logger.info(
                    "Moving model to MPS device with CPU fallback for unsupported operations"
                )
                self._model = self._model.to(torch.device("mps"))
            elif self._device == "cuda":
                self._model = self._model.cuda()
            else:
                self._model = self._model.cpu()

        except FileNotFoundError as e:
            raise e
        except Exception as e:
            raise RuntimeError(f"Failed to load Kokoro model: {e}")

    def _get_pipeline(self, lang_code: str) -> KPipeline:
        """Get or create pipeline for language code.

        Args:
            lang_code: Language code to use

        Returns:
            KPipeline instance for the language
        """
        if not self._model:
            raise RuntimeError("Model not loaded")

        if lang_code not in self._pipelines:
            logger.info(f"Creating new pipeline for language code: {lang_code}")
            self._pipelines[lang_code] = KPipeline(
                lang_code=lang_code, model=self._model, device=self._device
            )
        return self._pipelines[lang_code]

    async def generate_from_tokens(
        self,
        tokens: str,
        voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
        speed: float = 1.0,
        lang_code: Optional[str] = None,
    ) -> AsyncGenerator[np.ndarray, None]:
        """Generate audio from phoneme tokens.

        Args:
            tokens: Input phoneme tokens to synthesize
            voice: Either a voice path string or a tuple of (voice_name, voice_tensor/path)
            speed: Speed multiplier
            lang_code: Optional language code override

        Yields:
            Generated audio chunks

        Raises:
            RuntimeError: If generation fails
        """
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")

        try:
            # Memory management for GPU
            if self._device == "cuda":
                if self._check_memory():
                    self._clear_memory()

            # Handle voice input
            voice_path: str
            voice_name: str
            if isinstance(voice, tuple):
                voice_name, voice_data = voice
                if isinstance(voice_data, str):
                    voice_path = voice_data
                else:
                    # Save tensor to temporary file
                    import tempfile

                    temp_dir = tempfile.gettempdir()
                    voice_path = os.path.join(temp_dir, f"{voice_name}.pt")
                    # Save tensor with CPU mapping for portability
                    torch.save(voice_data.cpu(), voice_path)
            else:
                voice_path = voice
                voice_name = os.path.splitext(os.path.basename(voice_path))[0]

            # Load voice tensor with proper device mapping
            voice_tensor = await paths.load_voice_tensor(
                voice_path, device=self._device
            )
            # Save back to a temporary file with proper device mapping
            import tempfile

            temp_dir = tempfile.gettempdir()
            temp_path = os.path.join(
                temp_dir, f"temp_voice_{os.path.basename(voice_path)}"
            )
            await paths.save_voice_tensor(voice_tensor, temp_path)
            voice_path = temp_path

            # Use provided lang_code, settings voice code override, or first letter of voice name
            if lang_code:  # api is given priority
                pipeline_lang_code = lang_code
            elif settings.default_voice_code:  # settings is next priority
                pipeline_lang_code = settings.default_voice_code
            else:  # voice name is default/fallback
                pipeline_lang_code = voice_name[0].lower()

            pipeline = self._get_pipeline(pipeline_lang_code)

            logger.debug(
                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
            )
            for result in pipeline.generate_from_tokens(
                tokens=tokens, voice=voice_path, speed=speed, model=self._model
            ):
                if result.audio is not None:
                    logger.debug(f"Got audio chunk with shape: {result.audio.shape}")
                    yield result.audio.numpy()
                else:
                    logger.warning("No audio in chunk")

        except Exception as e:
            logger.error(f"Generation failed: {e}")
            if (
                self._device == "cuda"
                and model_config.pytorch_gpu.retry_on_oom
                and "out of memory" in str(e).lower()
            ):
                self._clear_memory()
                async for chunk in self.generate_from_tokens(
                    tokens, voice, speed, lang_code
                ):
                    yield chunk
            raise

    async def generate(
        self,
        text: str,
        voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
        speed: float = 1.0,
        lang_code: Optional[str] = None,
        return_timestamps: Optional[bool] = False,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Generate audio using model.

        Args:
            text: Input text to synthesize
            voice: Either a voice path string or a tuple of (voice_name, voice_tensor/path)
            speed: Speed multiplier
            lang_code: Optional language code override

        Yields:
            Generated audio chunks

        Raises:
            RuntimeError: If generation fails
        """
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")
        try:
            # Memory management for GPU
            if self._device == "cuda":
                if self._check_memory():
                    self._clear_memory()

            # Handle voice input
            voice_path: str
            voice_name: str
            if isinstance(voice, tuple):
                voice_name, voice_data = voice
                if isinstance(voice_data, str):
                    voice_path = voice_data
                else:
                    # Save tensor to temporary file
                    import tempfile

                    temp_dir = tempfile.gettempdir()
                    voice_path = os.path.join(temp_dir, f"{voice_name}.pt")
                    # Save tensor with CPU mapping for portability
                    torch.save(voice_data.cpu(), voice_path)
            else:
                voice_path = voice
                voice_name = os.path.splitext(os.path.basename(voice_path))[0]

            # Load voice tensor with proper device mapping
            voice_tensor = await paths.load_voice_tensor(
                voice_path, device=self._device
            )
            # Save back to a temporary file with proper device mapping
            import tempfile

            temp_dir = tempfile.gettempdir()
            temp_path = os.path.join(
                temp_dir, f"temp_voice_{os.path.basename(voice_path)}"
            )
            await paths.save_voice_tensor(voice_tensor, temp_path)
            voice_path = temp_path

            # Use provided lang_code, settings voice code override, or first letter of voice name
            pipeline_lang_code = (
                lang_code
                if lang_code
                else (
                    settings.default_voice_code
                    if settings.default_voice_code
                    else voice_name[0].lower()
                )
            )
            pipeline = self._get_pipeline(pipeline_lang_code)

            logger.debug(
                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
            )
            for result in pipeline(
                text, voice=voice_path, speed=speed, model=self._model
            ):
                if result.audio is not None:
                    logger.debug(f"Got audio chunk with shape: {result.audio.shape}")
                    word_timestamps = None
                    if (
                        return_timestamps
                        and hasattr(result, "tokens")
                        and result.tokens
                    ):
                        word_timestamps = []
                        current_offset = 0.0
                        logger.debug(
                            f"Processing chunk timestamps with {len(result.tokens)} tokens"
                        )
                        if result.pred_dur is not None:
                            try:
                                # Add timestamps with offset
                                for token in result.tokens:
                                    if not all(
                                        hasattr(token, attr)
                                        for attr in [
                                            "text",
                                            "start_ts",
                                            "end_ts",
                                        ]
                                    ):
                                        continue
                                    if not token.text or not token.text.strip():
                                        continue

                                    start_time = float(token.start_ts) + current_offset
                                    end_time = float(token.end_ts) + current_offset
                                    word_timestamps.append(
                                        WordTimestamp(
                                            word=str(token.text).strip(),
                                            start_time=start_time,
                                            end_time=end_time,
                                        )
                                    )
                                    logger.debug(
                                        f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
                                    )

                            except Exception as e:
                                logger.error(
                                    f"Failed to process timestamps for chunk: {e}"
                                )

                    yield AudioChunk(
                        result.audio.numpy(), word_timestamps=word_timestamps
                    )
                else:
                    logger.warning("No audio in chunk")

        except Exception as e:
            logger.error(f"Generation failed: {e}")
            if (
                self._device == "cuda"
                and model_config.pytorch_gpu.retry_on_oom
                and "out of memory" in str(e).lower()
            ):
                self._clear_memory()
                async for chunk in self.generate(text, voice, speed, lang_code):
                    yield chunk
            raise

    def _check_memory(self) -> bool:
        """Check if memory usage is above threshold."""
        if self._device == "cuda":
            memory_gb = torch.cuda.memory_allocated() / 1e9
            return memory_gb > model_config.pytorch_gpu.memory_threshold
        # MPS doesn't provide memory management APIs
        return False

    def _clear_memory(self) -> None:
        """Clear device memory."""
        if self._device == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        elif self._device == "mps":
            # Empty cache if available (future-proofing)
            if hasattr(torch.mps, "empty_cache"):
                torch.mps.empty_cache()

    def unload(self) -> None:
        """Unload model and free resources."""
        if self._model is not None:
            del self._model
            self._model = None
        for pipeline in self._pipelines.values():
            del pipeline
        self._pipelines.clear()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._model is not None

    @property
    def device(self) -> str:
        """Get device model is running on."""
        return self._device
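
A rough end-to-end sketch of driving this backend directly (hypothetical; in the service it is wrapped by ModelManager below, and the model/voice files are assumed to be present):

import asyncio

from api.src.core import paths
from api.src.inference.kokoro_v1 import KokoroV1

async def main():
    backend = KokoroV1()
    await backend.load_model("v1_0/kokoro-v1_0.pth")     # filename from model_config.py
    voice_path = await paths.get_voice_path("af_heart")  # default voice from config.py
    async for chunk in backend.generate("Hello from Kokoro.", ("af_heart", voice_path)):
        print("chunk:", chunk.audio.shape)

asyncio.run(main())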
api/src/inference/model_manager.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Kokoro V1 model management."""
|
2 |
+
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
from loguru import logger
|
6 |
+
|
7 |
+
from ..core import paths
|
8 |
+
from ..core.config import settings
|
9 |
+
from ..core.model_config import ModelConfig, model_config
|
10 |
+
from .base import BaseModelBackend
|
11 |
+
from .kokoro_v1 import KokoroV1
|
12 |
+
|
13 |
+
|
14 |
+
class ModelManager:
|
15 |
+
"""Manages Kokoro V1 model loading and inference."""
|
16 |
+
|
17 |
+
# Singleton instance
|
18 |
+
_instance = None
|
19 |
+
|
20 |
+
def __init__(self, config: Optional[ModelConfig] = None):
|
21 |
+
"""Initialize manager.
|
22 |
+
|
23 |
+
Args:
|
24 |
+
config: Optional model configuration override
|
25 |
+
"""
|
26 |
+
self._config = config or model_config
|
27 |
+
self._backend: Optional[KokoroV1] = None # Explicitly type as KokoroV1
|
28 |
+
self._device: Optional[str] = None
|
29 |
+
|
30 |
+
def _determine_device(self) -> str:
|
31 |
+
"""Determine device based on settings."""
|
32 |
+
return "cuda" if settings.use_gpu else "cpu"
|
33 |
+
|
34 |
+
async def initialize(self) -> None:
|
35 |
+
"""Initialize Kokoro V1 backend."""
|
36 |
+
try:
|
37 |
+
self._device = self._determine_device()
|
38 |
+
logger.info(f"Initializing Kokoro V1 on {self._device}")
|
39 |
+
self._backend = KokoroV1()
|
40 |
+
|
41 |
+
except Exception as e:
|
42 |
+
raise RuntimeError(f"Failed to initialize Kokoro V1: {e}")
|
43 |
+
|
44 |
+
async def initialize_with_warmup(self, voice_manager) -> tuple[str, str, int]:
|
45 |
+
"""Initialize and warm up model.
|
46 |
+
|
47 |
+
Args:
|
48 |
+
voice_manager: Voice manager instance for warmup
|
49 |
+
|
50 |
+
Returns:
|
51 |
+
Tuple of (device, backend type, voice count)
|
52 |
+
|
53 |
+
Raises:
|
54 |
+
RuntimeError: If initialization fails
|
55 |
+
"""
|
56 |
+
import time
|
57 |
+
|
58 |
+
start = time.perf_counter()
|
59 |
+
|
        try:
            # Initialize backend
            await self.initialize()

            # Load model
            model_path = self._config.pytorch_kokoro_v1_file
            await self.load_model(model_path)

            # Use paths module to get voice path
            try:
                voices = await paths.list_voices()
                voice_path = await paths.get_voice_path(settings.default_voice)

                # Warm up with short text
                warmup_text = "Warmup text for initialization."
                # Use default voice name for warmup
                voice_name = settings.default_voice
                logger.debug(f"Using default voice '{voice_name}' for warmup")
                async for _ in self.generate(warmup_text, (voice_name, voice_path)):
                    pass
            except Exception as e:
                raise RuntimeError(f"Failed to get default voice: {e}")

            ms = int((time.perf_counter() - start) * 1000)
            logger.info(f"Warmup completed in {ms}ms")

            return self._device, "kokoro_v1", len(voices)
        except FileNotFoundError as e:
            logger.error("""
Model files not found! You need to download the Kokoro V1 model:

1. Download model using the script:
   python docker/scripts/download_model.py --output api/src/models/v1_0

2. Or set environment variable in docker-compose:
   DOWNLOAD_MODEL=true
""")
            exit(0)
        except Exception as e:
            raise RuntimeError(f"Warmup failed: {e}")

    def get_backend(self) -> BaseModelBackend:
        """Get initialized backend.

        Returns:
            Initialized backend instance

        Raises:
            RuntimeError: If backend not initialized
        """
        if not self._backend:
            raise RuntimeError("Backend not initialized")
        return self._backend

    async def load_model(self, path: str) -> None:
        """Load model using initialized backend.

        Args:
            path: Path to model file

        Raises:
            RuntimeError: If loading fails
        """
        if not self._backend:
            raise RuntimeError("Backend not initialized")

        try:
            await self._backend.load_model(path)
        except FileNotFoundError as e:
            raise e
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {e}")

    async def generate(self, *args, **kwargs):
        """Generate audio using initialized backend.

        Raises:
            RuntimeError: If generation fails
        """
        if not self._backend:
            raise RuntimeError("Backend not initialized")

        try:
            async for chunk in self._backend.generate(*args, **kwargs):
                yield chunk
        except Exception as e:
            raise RuntimeError(f"Generation failed: {e}")

    def unload_all(self) -> None:
        """Unload model and free resources."""
        if self._backend:
            self._backend.unload()
            self._backend = None

    @property
    def current_backend(self) -> str:
        """Get current backend type."""
        return "kokoro_v1"


async def get_manager(config: Optional[ModelConfig] = None) -> ModelManager:
    """Get model manager instance.

    Args:
        config: Optional configuration override

    Returns:
        ModelManager instance
    """
    if ModelManager._instance is None:
        ModelManager._instance = ModelManager(config)
    return ModelManager._instance
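The `get_manager` helper above returns a process-wide singleton. A minimal usage sketch (illustrative only, not part of this commit; it assumes the package is importable as `api.src` and that the model files are already downloaded):

# Illustrative sketch, not part of the commit.
import asyncio
from api.src.inference.model_manager import get_manager

async def check_singleton():
    first = await get_manager()
    second = await get_manager()
    assert first is second  # ModelManager._instance is reused

asyncio.run(check_singleton())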
api/src/inference/voice_manager.py
ADDED
@@ -0,0 +1,115 @@
"""Voice management with controlled resource handling."""

from typing import Dict, List, Optional

import aiofiles
import torch
from loguru import logger

from ..core import paths
from ..core.config import settings


class VoiceManager:
    """Manages voice loading and caching with controlled resource usage."""

    # Singleton instance
    _instance = None

    def __init__(self):
        """Initialize voice manager."""
        # Strictly respect settings.use_gpu
        self._device = settings.get_device()
        self._voices: Dict[str, torch.Tensor] = {}

    async def get_voice_path(self, voice_name: str) -> str:
        """Get path to voice file.

        Args:
            voice_name: Name of voice

        Returns:
            Path to voice file

        Raises:
            RuntimeError: If voice not found
        """
        return await paths.get_voice_path(voice_name)

    async def load_voice(
        self, voice_name: str, device: Optional[str] = None
    ) -> torch.Tensor:
        """Load voice tensor.

        Args:
            voice_name: Name of voice to load
            device: Optional override for target device

        Returns:
            Voice tensor

        Raises:
            RuntimeError: If voice not found
        """
        try:
            voice_path = await self.get_voice_path(voice_name)
            target_device = device or self._device
            voice = await paths.load_voice_tensor(voice_path, target_device)
            self._voices[voice_name] = voice
            return voice
        except Exception as e:
            raise RuntimeError(f"Failed to load voice {voice_name}: {e}")

    async def combine_voices(
        self, voices: List[str], device: Optional[str] = None
    ) -> torch.Tensor:
        """Combine multiple voices.

        Args:
            voices: List of voice names to combine
            device: Optional override for target device

        Returns:
            Combined voice tensor

        Raises:
            RuntimeError: If any voice not found
        """
        if len(voices) < 2:
            raise ValueError("Need at least 2 voices to combine")

        target_device = device or self._device
        voice_tensors = []
        for name in voices:
            voice = await self.load_voice(name, target_device)
            voice_tensors.append(voice)

        combined = torch.mean(torch.stack(voice_tensors), dim=0)
        return combined

    async def list_voices(self) -> List[str]:
        """List available voice names.

        Returns:
            List of voice names
        """
        return await paths.list_voices()

    def cache_info(self) -> Dict[str, int]:
        """Get cache statistics.

        Returns:
            Dict with cache statistics
        """
        return {"loaded_voices": len(self._voices), "device": self._device}


async def get_manager() -> VoiceManager:
    """Get voice manager instance.

    Returns:
        VoiceManager instance
    """
    if VoiceManager._instance is None:
        VoiceManager._instance = VoiceManager()
    return VoiceManager._instance
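For illustration (not part of the commit): `combine_voices` blends voices by taking an element-wise mean over the stacked voice tensors, so a two-voice combination is an equal-weight mix. The tensor shape below is a placeholder, not the real voice-pack shape:

# Illustrative sketch of the same averaging used by VoiceManager.combine_voices.
import torch

voice_a = torch.randn(510, 1, 256)  # placeholder shape, for demonstration only
voice_b = torch.randn(510, 1, 256)
blend = torch.mean(torch.stack([voice_a, voice_b]), dim=0)  # equal 50/50 mix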
api/src/main.py
ADDED
@@ -0,0 +1,152 @@
"""
FastAPI OpenAI Compatible API
"""

import os
import sys
from contextlib import asynccontextmanager
from pathlib import Path

import torch
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger

from .core.config import settings
from .routers.debug import router as debug_router
from .routers.development import router as dev_router
from .routers.openai_compatible import router as openai_router
from .routers.web_player import router as web_router


def setup_logger():
    """Configure loguru logger with custom formatting"""
    config = {
        "handlers": [
            {
                "sink": sys.stdout,
                "format": "<fg #2E8B57>{time:hh:mm:ss A}</fg #2E8B57> | "
                "{level: <8} | "
                "<fg #4169E1>{module}:{line}</fg #4169E1> | "
                "{message}",
                "colorize": True,
                "level": "DEBUG",
            },
        ],
    }
    logger.remove()
    logger.configure(**config)
    logger.level("ERROR", color="<red>")


# Configure logger
setup_logger()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for model initialization"""
    from .inference.model_manager import get_manager
    from .inference.voice_manager import get_manager as get_voice_manager
    from .services.temp_manager import cleanup_temp_files

    # Clean old temp files on startup
    await cleanup_temp_files()

    logger.info("Loading TTS model and voice packs...")

    try:
        # Initialize managers
        model_manager = await get_manager()
        voice_manager = await get_voice_manager()

        # Initialize model with warmup and get status
        device, model, voicepack_count = await model_manager.initialize_with_warmup(
            voice_manager
        )

    except Exception as e:
        logger.error(f"Failed to initialize model: {e}")
        raise

    boundary = "░" * 2 * 12
    startup_msg = f"""

{boundary}

    ╔═╗┌─┐┌─┐┌┬┐
    ╠╣ ├─┤└─┐ │
    ╚  ┴ ┴└─┘ ┴
    ╦╔═┌─┐┬┌─┌─┐
    ╠╩╗│ │├┴┐│ │
    ╩ ╩└─┘┴ ┴└─┘

{boundary}
"""
    startup_msg += f"\nModel warmed up on {device}: {model}"
    if device == "mps":
        startup_msg += "\nUsing Apple Metal Performance Shaders (MPS)"
    elif device == "cuda":
        startup_msg += f"\nCUDA: {torch.cuda.is_available()}"
    else:
        startup_msg += "\nRunning on CPU"
    startup_msg += f"\n{voicepack_count} voice packs loaded"

    # Add web player info if enabled
    if settings.enable_web_player:
        startup_msg += (
            f"\n\nBeta Web Player: http://{settings.host}:{settings.port}/web/"
        )
        startup_msg += f"\nor http://localhost:{settings.port}/web/"
    else:
        startup_msg += "\n\nWeb Player: disabled"

    startup_msg += f"\n{boundary}\n"
    logger.info(startup_msg)

    yield


# Initialize FastAPI app
app = FastAPI(
    title=settings.api_title,
    description=settings.api_description,
    version=settings.api_version,
    lifespan=lifespan,
    openapi_url="/openapi.json",  # Explicitly enable OpenAPI schema
)

# Add CORS middleware if enabled
if settings.cors_enabled:
    app.add_middleware(
        CORSMiddleware,
        allow_origins=settings.cors_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

# Include routers
app.include_router(openai_router, prefix="/v1")
app.include_router(dev_router)  # Development endpoints
app.include_router(debug_router)  # Debug endpoints
if settings.enable_web_player:
    app.include_router(web_router, prefix="/web")  # Web player static files


# Health check endpoint
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy"}


@app.get("/v1/test")
async def test_endpoint():
    """Test endpoint to verify routing"""
    return {"status": "ok"}


if __name__ == "__main__":
    uvicorn.run("api.src.main:app", host=settings.host, port=settings.port, reload=True)
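Once this app is running (for example via `uvicorn api.src.main:app`), the OpenAI-compatible route mounted under `/v1` can be exercised with a small client script. The host, port, voice name, and output filename below are assumptions for illustration, not values fixed by this commit:

# Illustrative client sketch; host/port, voice, and default mp3 format are assumptions.
import requests

resp = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={"model": "kokoro", "input": "Hello world", "voice": "af_heart", "stream": False},
)
resp.raise_for_status()
with open("speech.mp3", "wb") as f:
    f.write(resp.content)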
api/src/models/v1_0/config.json
ADDED
@@ -0,0 +1,150 @@
{
    "istftnet": {
        "upsample_kernel_sizes": [20, 12],
        "upsample_rates": [10, 6],
        "gen_istft_hop_size": 5,
        "gen_istft_n_fft": 20,
        "resblock_dilation_sizes": [
            [1, 3, 5],
            [1, 3, 5],
            [1, 3, 5]
        ],
        "resblock_kernel_sizes": [3, 7, 11],
        "upsample_initial_channel": 512
    },
    "dim_in": 64,
    "dropout": 0.2,
    "hidden_dim": 512,
    "max_conv_dim": 512,
    "max_dur": 50,
    "multispeaker": true,
    "n_layer": 3,
    "n_mels": 80,
    "n_token": 178,
    "style_dim": 128,
    "text_encoder_kernel_size": 5,
    "plbert": {
        "hidden_size": 768,
        "num_attention_heads": 12,
        "intermediate_size": 2048,
        "max_position_embeddings": 512,
        "num_hidden_layers": 12,
        "dropout": 0.1
    },
    "vocab": {
        ";": 1,
        ":": 2,
        ",": 3,
        ".": 4,
        "!": 5,
        "?": 6,
        "—": 9,
        "…": 10,
        "\"": 11,
        "(": 12,
        ")": 13,
        "“": 14,
        "”": 15,
        " ": 16,
        "\u0303": 17,
        "ʣ": 18,
        "ʥ": 19,
        "ʦ": 20,
        "ʨ": 21,
        "ᵝ": 22,
        "\uAB67": 23,
        "A": 24,
        "I": 25,
        "O": 31,
        "Q": 33,
        "S": 35,
        "T": 36,
        "W": 39,
        "Y": 41,
        "ᵊ": 42,
        "a": 43,
        "b": 44,
        "c": 45,
        "d": 46,
        "e": 47,
        "f": 48,
        "h": 50,
        "i": 51,
        "j": 52,
        "k": 53,
        "l": 54,
        "m": 55,
        "n": 56,
        "o": 57,
        "p": 58,
        "q": 59,
        "r": 60,
        "s": 61,
        "t": 62,
        "u": 63,
        "v": 64,
        "w": 65,
        "x": 66,
        "y": 67,
        "z": 68,
        "ɑ": 69,
        "ɐ": 70,
        "ɒ": 71,
        "æ": 72,
        "β": 75,
        "ɔ": 76,
        "ɕ": 77,
        "ç": 78,
        "ɖ": 80,
        "ð": 81,
        "ʤ": 82,
        "ə": 83,
        "ɚ": 85,
        "ɛ": 86,
        "ɜ": 87,
        "ɟ": 90,
        "ɡ": 92,
        "ɥ": 99,
        "ɨ": 101,
        "ɪ": 102,
        "ʝ": 103,
        "ɯ": 110,
        "ɰ": 111,
        "ŋ": 112,
        "ɳ": 113,
        "ɲ": 114,
        "ɴ": 115,
        "ø": 116,
        "ɸ": 118,
        "θ": 119,
        "œ": 120,
        "ɹ": 123,
        "ɾ": 125,
        "ɻ": 126,
        "ʁ": 128,
        "ɽ": 129,
        "ʂ": 130,
        "ʃ": 131,
        "ʈ": 132,
        "ʧ": 133,
        "ʊ": 135,
        "ʋ": 136,
        "ʌ": 138,
        "ɣ": 139,
        "ɤ": 140,
        "χ": 142,
        "ʎ": 143,
        "ʒ": 147,
        "ʔ": 148,
        "ˈ": 156,
        "ˌ": 157,
        "ː": 158,
        "ʰ": 162,
        "ʲ": 164,
        "↓": 169,
        "→": 171,
        "↗": 172,
        "↘": 173,
        "ᵻ": 177
    }
}
api/src/routers/__init__.py
ADDED
@@ -0,0 +1 @@
#
api/src/routers/debug.py
ADDED
@@ -0,0 +1,209 @@
import threading
import time
from datetime import datetime

import psutil
import torch
from fastapi import APIRouter

try:
    import GPUtil

    GPU_AVAILABLE = True
except ImportError:
    GPU_AVAILABLE = False

router = APIRouter(tags=["debug"])


@router.get("/debug/threads")
async def get_thread_info():
    process = psutil.Process()
    current_threads = threading.enumerate()

    # Get per-thread CPU times
    thread_details = []
    for thread in current_threads:
        thread_info = {
            "name": thread.name,
            "id": thread.ident,
            "alive": thread.is_alive(),
            "daemon": thread.daemon,
        }
        thread_details.append(thread_info)

    return {
        "total_threads": process.num_threads(),
        "active_threads": len(current_threads),
        "thread_names": [t.name for t in current_threads],
        "thread_details": thread_details,
        "memory_mb": process.memory_info().rss / 1024 / 1024,
    }


@router.get("/debug/storage")
async def get_storage_info():
    # Get disk partitions
    partitions = psutil.disk_partitions()
    storage_info = []

    for partition in partitions:
        try:
            usage = psutil.disk_usage(partition.mountpoint)
            storage_info.append(
                {
                    "device": partition.device,
                    "mountpoint": partition.mountpoint,
                    "fstype": partition.fstype,
                    "total_gb": usage.total / (1024**3),
                    "used_gb": usage.used / (1024**3),
                    "free_gb": usage.free / (1024**3),
                    "percent_used": usage.percent,
                }
            )
        except PermissionError:
            continue

    return {"storage_info": storage_info}


@router.get("/debug/system")
async def get_system_info():
    process = psutil.Process()

    # CPU Info
    cpu_info = {
        "cpu_count": psutil.cpu_count(),
        "cpu_percent": psutil.cpu_percent(interval=1),
        "per_cpu_percent": psutil.cpu_percent(interval=1, percpu=True),
        "load_avg": psutil.getloadavg(),
    }

    # Memory Info
    virtual_memory = psutil.virtual_memory()
    swap_memory = psutil.swap_memory()
    memory_info = {
        "virtual": {
            "total_gb": virtual_memory.total / (1024**3),
            "available_gb": virtual_memory.available / (1024**3),
            "used_gb": virtual_memory.used / (1024**3),
            "percent": virtual_memory.percent,
        },
        "swap": {
            "total_gb": swap_memory.total / (1024**3),
            "used_gb": swap_memory.used / (1024**3),
            "free_gb": swap_memory.free / (1024**3),
            "percent": swap_memory.percent,
        },
    }

    # Process Info
    process_info = {
        "pid": process.pid,
        "status": process.status(),
        "create_time": datetime.fromtimestamp(process.create_time()).isoformat(),
        "cpu_percent": process.cpu_percent(),
        "memory_percent": process.memory_percent(),
    }

    # Network Info
    network_info = {
        "connections": len(process.net_connections()),
        "network_io": psutil.net_io_counters()._asdict(),
    }

    # GPU Info if available
    gpu_info = None
    if torch.backends.mps.is_available():
        gpu_info = {
            "type": "MPS",
            "available": True,
            "device": "Apple Silicon",
            "backend": "Metal",
        }
    elif GPU_AVAILABLE:
        try:
            gpus = GPUtil.getGPUs()
            gpu_info = [
                {
                    "id": gpu.id,
                    "name": gpu.name,
                    "load": gpu.load,
                    "memory": {
                        "total": gpu.memoryTotal,
                        "used": gpu.memoryUsed,
                        "free": gpu.memoryFree,
                        "percent": (gpu.memoryUsed / gpu.memoryTotal) * 100,
                    },
                    "temperature": gpu.temperature,
                }
                for gpu in gpus
            ]
        except Exception:
            gpu_info = "GPU information unavailable"

    return {
        "cpu": cpu_info,
        "memory": memory_info,
        "process": process_info,
        "network": network_info,
        "gpu": gpu_info,
    }


@router.get("/debug/session_pools")
async def get_session_pool_info():
    """Get information about ONNX session pools."""
    from ..inference.model_manager import get_manager

    manager = await get_manager()
    pools = manager._session_pools
    current_time = time.time()

    pool_info = {}

    # Get CPU pool info
    if "onnx_cpu" in pools:
        cpu_pool = pools["onnx_cpu"]
        pool_info["cpu"] = {
            "active_sessions": len(cpu_pool._sessions),
            "max_sessions": cpu_pool._max_size,
            "sessions": [
                {"model": path, "age_seconds": current_time - info.last_used}
                for path, info in cpu_pool._sessions.items()
            ],
        }

    # Get GPU pool info
    if "onnx_gpu" in pools:
        gpu_pool = pools["onnx_gpu"]
        pool_info["gpu"] = {
            "active_sessions": len(gpu_pool._sessions),
            "max_streams": gpu_pool._max_size,
            "available_streams": len(gpu_pool._available_streams),
            "sessions": [
                {
                    "model": path,
                    "age_seconds": current_time - info.last_used,
                    "stream_id": info.stream_id,
                }
                for path, info in gpu_pool._sessions.items()
            ],
        }

    # Add GPU memory info if available
    if GPU_AVAILABLE:
        try:
            gpus = GPUtil.getGPUs()
            if gpus:
                gpu = gpus[0]  # Assume first GPU
                pool_info["gpu"]["memory"] = {
                    "total_mb": gpu.memoryTotal,
                    "used_mb": gpu.memoryUsed,
                    "free_mb": gpu.memoryFree,
                    "percent_used": (gpu.memoryUsed / gpu.memoryTotal) * 100,
                }
        except Exception:
            pass

    return pool_info
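These debug endpoints are plain GET routes, so (assuming the server above is reachable on localhost:8880, which is an assumption, not a value fixed by this commit) they can be polled with a few lines of Python:

# Illustrative only; the base URL is an assumption.
import requests

base = "http://localhost:8880"
print(requests.get(f"{base}/debug/system").json()["cpu"])
print(requests.get(f"{base}/debug/threads").json()["total_threads"])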
api/src/routers/development.py
ADDED
@@ -0,0 +1,408 @@
import base64
import json
import os
import re
from pathlib import Path
from typing import AsyncGenerator, List, Tuple, Union

import numpy as np
import torch
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from kokoro import KPipeline
from loguru import logger

from ..core.config import settings
from ..inference.base import AudioChunk
from ..services.audio import AudioNormalizer, AudioService
from ..services.streaming_audio_writer import StreamingAudioWriter
from ..services.temp_manager import TempFileWriter
from ..services.text_processing import smart_split
from ..services.tts_service import TTSService
from ..structures import CaptionedSpeechRequest, CaptionedSpeechResponse, WordTimestamp
from ..structures.custom_responses import JSONStreamingResponse
from ..structures.text_schemas import (
    GenerateFromPhonemesRequest,
    PhonemeRequest,
    PhonemeResponse,
)
from .openai_compatible import process_and_validate_voices, stream_audio_chunks

router = APIRouter(tags=["text processing"])


async def get_tts_service() -> TTSService:
    """Dependency to get TTSService instance"""
    return (
        await TTSService.create()
    )  # Create service with properly initialized managers


@router.post("/dev/phonemize", response_model=PhonemeResponse)
async def phonemize_text(request: PhonemeRequest) -> PhonemeResponse:
    """Convert text to phonemes using Kokoro's quiet mode.

    Args:
        request: Request containing text and language

    Returns:
        Phonemes and token IDs
    """
    try:
        if not request.text:
            raise ValueError("Text cannot be empty")

        # Initialize Kokoro pipeline in quiet mode (no model)
        pipeline = KPipeline(lang_code=request.language, model=False)

        # Get first result from pipeline (we only need one since we're not chunking)
        for result in pipeline(request.text):
            # result.graphemes = original text
            # result.phonemes = phonemized text
            # result.tokens = token objects (if available)
            return PhonemeResponse(phonemes=result.phonemes, tokens=[])

        raise ValueError("Failed to generate phonemes")
    except ValueError as e:
        logger.error(f"Error in phoneme generation: {str(e)}")
        raise HTTPException(
            status_code=500, detail={"error": "Server error", "message": str(e)}
        )
    except Exception as e:
        logger.error(f"Error in phoneme generation: {str(e)}")
        raise HTTPException(
            status_code=500, detail={"error": "Server error", "message": str(e)}
        )


@router.post("/dev/generate_from_phonemes")
async def generate_from_phonemes(
    request: GenerateFromPhonemesRequest,
    client_request: Request,
    tts_service: TTSService = Depends(get_tts_service),
) -> StreamingResponse:
    """Generate audio directly from phonemes using Kokoro's phoneme format"""
    try:
        # Basic validation
        if not isinstance(request.phonemes, str):
            raise ValueError("Phonemes must be a string")
        if not request.phonemes:
            raise ValueError("Phonemes cannot be empty")

        # Create streaming audio writer and normalizer
        writer = StreamingAudioWriter(format="wav", sample_rate=24000, channels=1)
        normalizer = AudioNormalizer()

        async def generate_chunks():
            try:
                # Generate audio from phonemes
                chunk_audio, _ = await tts_service.generate_from_phonemes(
                    phonemes=request.phonemes,  # Pass complete phoneme string
                    voice=request.voice,
                    speed=1.0,
                )

                if chunk_audio is not None:
                    # Normalize audio before writing
                    normalized_audio = await normalizer.normalize(chunk_audio)
                    # Write chunk and yield bytes
                    chunk_bytes = writer.write_chunk(normalized_audio)
                    if chunk_bytes:
                        yield chunk_bytes

                    # Finalize and yield remaining bytes
                    final_bytes = writer.write_chunk(finalize=True)
                    if final_bytes:
                        yield final_bytes
                else:
                    raise ValueError("Failed to generate audio data")

            except Exception as e:
                logger.error(f"Error in audio generation: {str(e)}")
                # Clean up writer on error
                writer.close()
                # Re-raise the original exception
                raise

        return StreamingResponse(
            generate_chunks(),
            media_type="audio/wav",
            headers={
                "Content-Disposition": "attachment; filename=speech.wav",
                "X-Accel-Buffering": "no",
                "Cache-Control": "no-cache",
                "Transfer-Encoding": "chunked",
            },
        )

    except ValueError as e:
        logger.error(f"Error generating audio: {str(e)}")
        raise HTTPException(
            status_code=400,
            detail={
                "error": "validation_error",
                "message": str(e),
                "type": "invalid_request_error",
            },
        )
    except Exception as e:
        logger.error(f"Error generating audio: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )


@router.post("/dev/captioned_speech")
async def create_captioned_speech(
    request: CaptionedSpeechRequest,
    client_request: Request,
    x_raw_response: str = Header(None, alias="x-raw-response"),
    tts_service: TTSService = Depends(get_tts_service),
):
    """Generate audio with word-level timestamps using streaming approach"""

    try:
        # model_name = get_model_name(request.model)
        tts_service = await get_tts_service()
        voice_name = await process_and_validate_voices(request.voice, tts_service)

        # Set content type based on format
        content_type = {
            "mp3": "audio/mpeg",
            "opus": "audio/opus",
            "m4a": "audio/mp4",
            "flac": "audio/flac",
            "wav": "audio/wav",
            "pcm": "audio/pcm",
        }.get(request.response_format, f"audio/{request.response_format}")

        writer = StreamingAudioWriter(request.response_format, sample_rate=24000)
        # Check if streaming is requested (default for OpenAI client)
        if request.stream:
            # Create generator but don't start it yet
            generator = stream_audio_chunks(
                tts_service, request, client_request, writer
            )

            # If download link requested, wrap generator with temp file writer
            if request.return_download_link:
                from ..services.temp_manager import TempFileWriter

                temp_writer = TempFileWriter(request.response_format)
                await temp_writer.__aenter__()  # Initialize temp file

                # Get download path immediately after temp file creation
                download_path = temp_writer.download_path

                # Create response headers with download path
                headers = {
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "X-Accel-Buffering": "no",
                    "Cache-Control": "no-cache",
                    "Transfer-Encoding": "chunked",
                    "X-Download-Path": download_path,
                }

                # Create async generator for streaming
                async def dual_output():
                    try:
                        # Write chunks to temp file and stream
                        async for chunk_data in generator:
                            # The timestamp acumulator is only used when word level time stamps are generated but no audio is returned.
                            timestamp_acumulator = []

                            if chunk_data.output:  # Skip empty chunks
                                await temp_writer.write(chunk_data.output)
                                base64_chunk = base64.b64encode(
                                    chunk_data.output
                                ).decode("utf-8")

                                # Add any chunks that may be in the acumulator into the return word_timestamps
                                chunk_data.word_timestamps = (
                                    timestamp_acumulator + chunk_data.word_timestamps
                                )
                                timestamp_acumulator = []

                                yield CaptionedSpeechResponse(
                                    audio=base64_chunk,
                                    audio_format=content_type,
                                    timestamps=chunk_data.word_timestamps,
                                )
                            else:
                                if (
                                    chunk_data.word_timestamps is not None
                                    and len(chunk_data.word_timestamps) > 0
                                ):
                                    timestamp_acumulator += chunk_data.word_timestamps

                        # Finalize the temp file
                        await temp_writer.finalize()
                    except Exception as e:
                        logger.error(f"Error in dual output streaming: {e}")
                        await temp_writer.__aexit__(type(e), e, e.__traceback__)
                        raise
                    finally:
                        # Ensure temp writer is closed
                        if not temp_writer._finalized:
                            await temp_writer.__aexit__(None, None, None)
                        writer.close()

                # Stream with temp file writing
                return JSONStreamingResponse(
                    dual_output(), media_type="application/json", headers=headers
                )

            async def single_output():
                try:
                    # The timestamp acumulator is only used when word level time stamps are generated but no audio is returned.
                    timestamp_acumulator = []

                    # Stream chunks
                    async for chunk_data in generator:
                        if chunk_data.output:  # Skip empty chunks
                            # Encode the chunk bytes into base 64
                            base64_chunk = base64.b64encode(chunk_data.output).decode(
                                "utf-8"
                            )

                            # Add any chunks that may be in the acumulator into the return word_timestamps
                            if chunk_data.word_timestamps != None:
                                chunk_data.word_timestamps = (
                                    timestamp_acumulator + chunk_data.word_timestamps
                                )
                            else:
                                chunk_data.word_timestamps = []
                            timestamp_acumulator = []

                            yield CaptionedSpeechResponse(
                                audio=base64_chunk,
                                audio_format=content_type,
                                timestamps=chunk_data.word_timestamps,
                            )
                        else:
                            if (
                                chunk_data.word_timestamps is not None
                                and len(chunk_data.word_timestamps) > 0
                            ):
                                timestamp_acumulator += chunk_data.word_timestamps

                except Exception as e:
                    logger.error(f"Error in single output streaming: {e}")
                    writer.close()
                    raise

            # Standard streaming without download link
            return JSONStreamingResponse(
                single_output(),
                media_type="application/json",
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "X-Accel-Buffering": "no",
                    "Cache-Control": "no-cache",
                    "Transfer-Encoding": "chunked",
                },
            )
        else:
            # Generate complete audio using public interface
            audio_data = await tts_service.generate_audio(
                text=request.input,
                voice=voice_name,
                writer=writer,
                speed=request.speed,
                return_timestamps=request.return_timestamps,
                normalization_options=request.normalization_options,
                lang_code=request.lang_code,
            )

            audio_data = await AudioService.convert_audio(
                audio_data,
                request.response_format,
                writer,
                is_last_chunk=False,
                trim_audio=False,
            )

            # Convert to requested format with proper finalization
            final = await AudioService.convert_audio(
                AudioChunk(np.array([], dtype=np.int16)),
                request.response_format,
                writer,
                is_last_chunk=True,
            )
            output = audio_data.output + final.output

            base64_output = base64.b64encode(output).decode("utf-8")

            content = CaptionedSpeechResponse(
                audio=base64_output,
                audio_format=content_type,
                timestamps=audio_data.word_timestamps,
            ).model_dump()

            writer.close()

            return JSONResponse(
                content=content,
                media_type="application/json",
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "Cache-Control": "no-cache",  # Prevent caching
                },
            )

    except ValueError as e:
        # Handle validation errors
        logger.warning(f"Invalid request: {str(e)}")

        try:
            writer.close()
        except:
            pass

        raise HTTPException(
            status_code=400,
            detail={
                "error": "validation_error",
                "message": str(e),
                "type": "invalid_request_error",
            },
        )
    except RuntimeError as e:
        # Handle runtime/processing errors
        logger.error(f"Processing error: {str(e)}")

        try:
            writer.close()
        except:
            pass

        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )
    except Exception as e:
        # Handle unexpected errors
        logger.error(f"Unexpected error in captioned speech generation: {str(e)}")

        try:
            writer.close()
        except:
            pass

        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )
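A hedged sketch of calling the captioned-speech route in non-streaming mode and decoding the base64 audio it returns (request fields are taken from the handler above; the host, port, and voice name are assumptions):

# Illustrative only; decodes the base64 audio from /dev/captioned_speech.
import base64
import requests

resp = requests.post(
    "http://localhost:8880/dev/captioned_speech",
    json={
        "model": "kokoro",
        "input": "Words with timestamps",
        "voice": "af_heart",
        "response_format": "wav",
        "stream": False,
    },
).json()

with open("captioned.wav", "wb") as f:
    f.write(base64.b64decode(resp["audio"]))
print(resp["timestamps"][:3])  # first few word-level timing entries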
api/src/routers/openai_compatible.py
ADDED
@@ -0,0 +1,662 @@
"""OpenAI-compatible router for text-to-speech"""

import io
import json
import os
import re
import tempfile
from typing import AsyncGenerator, Dict, List, Tuple, Union
from urllib import response

import aiofiles
import numpy as np
import torch
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
from fastapi.responses import FileResponse, StreamingResponse
from loguru import logger

from ..core.config import settings
from ..inference.base import AudioChunk
from ..services.audio import AudioService
from ..services.streaming_audio_writer import StreamingAudioWriter
from ..services.tts_service import TTSService
from ..structures import OpenAISpeechRequest
from ..structures.schemas import CaptionedSpeechRequest


# Load OpenAI mappings
def load_openai_mappings() -> Dict:
    """Load OpenAI voice and model mappings from JSON"""
    api_dir = os.path.dirname(os.path.dirname(__file__))
    mapping_path = os.path.join(api_dir, "core", "openai_mappings.json")
    try:
        with open(mapping_path, "r") as f:
            return json.load(f)
    except Exception as e:
        logger.error(f"Failed to load OpenAI mappings: {e}")
        return {"models": {}, "voices": {}}


# Global mappings
_openai_mappings = load_openai_mappings()


router = APIRouter(
    tags=["OpenAI Compatible TTS"],
    responses={404: {"description": "Not found"}},
)

# Global TTSService instance with lock
_tts_service = None
_init_lock = None


async def get_tts_service() -> TTSService:
    """Get global TTSService instance"""
    global _tts_service, _init_lock

    # Create lock if needed
    if _init_lock is None:
        import asyncio

        _init_lock = asyncio.Lock()

    # Initialize service if needed
    if _tts_service is None:
        async with _init_lock:
            # Double check pattern
            if _tts_service is None:
                _tts_service = await TTSService.create()
                logger.info("Created global TTSService instance")

    return _tts_service


def get_model_name(model: str) -> str:
    """Get internal model name from OpenAI model name"""
    base_name = _openai_mappings["models"].get(model)
    if not base_name:
        raise ValueError(f"Unsupported model: {model}")
    return base_name + ".pth"


async def process_and_validate_voices(
    voice_input: Union[str, List[str]], tts_service: TTSService
) -> str:
    """Process voice input, handling both string and list formats

    Returns:
        Voice name to use (with weights if specified)
    """
    voices = []
    # Convert input to list of voices
    if isinstance(voice_input, str):
        voice_input = voice_input.replace(" ", "").strip()

        if voice_input[-1] in "+-" or voice_input[0] in "+-":
            raise ValueError(f"Voice combination contains empty combine items")

        if re.search(r"[+-]{2,}", voice_input) is not None:
            raise ValueError(f"Voice combination contains empty combine items")
        voices = re.split(r"([-+])", voice_input)
    else:
        voices = [[item, "+"] for item in voice_input][:-1]

    available_voices = await tts_service.list_voices()

    for voice_index in range(0, len(voices), 2):
        mapped_voice = voices[voice_index].split("(")
        mapped_voice = list(map(str.strip, mapped_voice))

        if len(mapped_voice) > 2:
            raise ValueError(
                f"Voice '{voices[voice_index]}' contains too many weight items"
            )

        if mapped_voice.count(")") > 1:
            raise ValueError(
                f"Voice '{voices[voice_index]}' contains too many weight items"
            )

        mapped_voice[0] = _openai_mappings["voices"].get(
            mapped_voice[0], mapped_voice[0]
        )

        if mapped_voice[0] not in available_voices:
            raise ValueError(
                f"Voice '{mapped_voice[0]}' not found. Available voices: {', '.join(sorted(available_voices))}"
            )

        voices[voice_index] = "(".join(mapped_voice)

    return "".join(voices)


async def stream_audio_chunks(
    tts_service: TTSService,
    request: Union[OpenAISpeechRequest, CaptionedSpeechRequest],
    client_request: Request,
    writer: StreamingAudioWriter,
) -> AsyncGenerator[AudioChunk, None]:
    """Stream audio chunks as they're generated with client disconnect handling"""
    voice_name = await process_and_validate_voices(request.voice, tts_service)
    unique_properties = {"return_timestamps": False}
    if hasattr(request, "return_timestamps"):
        unique_properties["return_timestamps"] = request.return_timestamps

    try:
        async for chunk_data in tts_service.generate_audio_stream(
            text=request.input,
            voice=voice_name,
            writer=writer,
            speed=request.speed,
            output_format=request.response_format,
            lang_code=request.lang_code,
            normalization_options=request.normalization_options,
            return_timestamps=unique_properties["return_timestamps"],
        ):
            # Check if client is still connected
            is_disconnected = client_request.is_disconnected
            if callable(is_disconnected):
                is_disconnected = await is_disconnected()
            if is_disconnected:
                logger.info("Client disconnected, stopping audio generation")
                break

            yield chunk_data
    except Exception as e:
        logger.error(f"Error in audio streaming: {str(e)}")
        # Let the exception propagate to trigger cleanup
        raise


@router.post("/audio/speech")
async def create_speech(
    request: OpenAISpeechRequest,
    client_request: Request,
    x_raw_response: str = Header(None, alias="x-raw-response"),
):
    """OpenAI-compatible endpoint for text-to-speech"""
    # Validate model before processing request
    if request.model not in _openai_mappings["models"]:
        raise HTTPException(
            status_code=400,
            detail={
                "error": "invalid_model",
                "message": f"Unsupported model: {request.model}",
                "type": "invalid_request_error",
            },
        )

    try:
        # model_name = get_model_name(request.model)
        tts_service = await get_tts_service()
        voice_name = await process_and_validate_voices(request.voice, tts_service)

        # Set content type based on format
        content_type = {
            "mp3": "audio/mpeg",
            "opus": "audio/opus",
            "aac": "audio/aac",
            "flac": "audio/flac",
            "wav": "audio/wav",
            "pcm": "audio/pcm",
        }.get(request.response_format, f"audio/{request.response_format}")

        writer = StreamingAudioWriter(request.response_format, sample_rate=24000)

        # Check if streaming is requested (default for OpenAI client)
        if request.stream:
            # Create generator but don't start it yet
            generator = stream_audio_chunks(
                tts_service, request, client_request, writer
            )

            # If download link requested, wrap generator with temp file writer
            if request.return_download_link:
                from ..services.temp_manager import TempFileWriter

                # Use download_format if specified, otherwise use response_format
                output_format = request.download_format or request.response_format
                temp_writer = TempFileWriter(output_format)
                await temp_writer.__aenter__()  # Initialize temp file

                # Get download path immediately after temp file creation
                download_path = temp_writer.download_path

                # Create response headers with download path
                headers = {
                    "Content-Disposition": f"attachment; filename=speech.{output_format}",
                    "X-Accel-Buffering": "no",
                    "Cache-Control": "no-cache",
                    "Transfer-Encoding": "chunked",
                    "X-Download-Path": download_path,
                }

                # Add header to indicate if temp file writing is available
                if temp_writer._write_error:
                    headers["X-Download-Status"] = "unavailable"

                # Create async generator for streaming
                async def dual_output():
                    try:
                        # Write chunks to temp file and stream
                        async for chunk_data in generator:
                            if chunk_data.output:  # Skip empty chunks
                                await temp_writer.write(chunk_data.output)
                                # if return_json:
                                #     yield chunk, chunk_data
                                # else:
                                yield chunk_data.output

                        # Finalize the temp file
                        await temp_writer.finalize()
                    except Exception as e:
                        logger.error(f"Error in dual output streaming: {e}")
                        await temp_writer.__aexit__(type(e), e, e.__traceback__)
                        raise
                    finally:
                        # Ensure temp writer is closed
                        if not temp_writer._finalized:
                            await temp_writer.__aexit__(None, None, None)
                        writer.close()

                # Stream with temp file writing
                return StreamingResponse(
                    dual_output(), media_type=content_type, headers=headers
                )

            async def single_output():
                try:
                    # Stream chunks
                    async for chunk_data in generator:
                        if chunk_data.output:  # Skip empty chunks
                            yield chunk_data.output
                except Exception as e:
                    logger.error(f"Error in single output streaming: {e}")
                    writer.close()
                    raise

            # Standard streaming without download link
            return StreamingResponse(
                single_output(),
                media_type=content_type,
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "X-Accel-Buffering": "no",
                    "Cache-Control": "no-cache",
                    "Transfer-Encoding": "chunked",
                },
            )
        else:
            headers = {
                "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                "Cache-Control": "no-cache",  # Prevent caching
            }

            # Generate complete audio using public interface
            audio_data = await tts_service.generate_audio(
                text=request.input,
                voice=voice_name,
                writer=writer,
                speed=request.speed,
                normalization_options=request.normalization_options,
                lang_code=request.lang_code,
            )

            audio_data = await AudioService.convert_audio(
                audio_data,
                request.response_format,
                writer,
                is_last_chunk=False,
                trim_audio=False,
            )

            # Convert to requested format with proper finalization
            final = await AudioService.convert_audio(
                AudioChunk(np.array([], dtype=np.int16)),
                request.response_format,
                writer,
                is_last_chunk=True,
            )
            output = audio_data.output + final.output

            if request.return_download_link:
                from ..services.temp_manager import TempFileWriter

                # Use download_format if specified, otherwise use response_format
                output_format = request.download_format or request.response_format
                temp_writer = TempFileWriter(output_format)
                await temp_writer.__aenter__()  # Initialize temp file

                # Get download path immediately after temp file creation
                download_path = temp_writer.download_path
                headers["X-Download-Path"] = download_path

                try:
                    # Write chunks to temp file
                    logger.info("Writing chunks to tempory file for download")
                    await temp_writer.write(output)
                    # Finalize the temp file
                    await temp_writer.finalize()

                except Exception as e:
                    logger.error(f"Error in dual output: {e}")
                    await temp_writer.__aexit__(type(e), e, e.__traceback__)
                    raise
                finally:
                    # Ensure temp writer is closed
                    if not temp_writer._finalized:
                        await temp_writer.__aexit__(None, None, None)
                    writer.close()

            return Response(
                content=output,
                media_type=content_type,
                headers=headers,
            )

    except ValueError as e:
        # Handle validation errors
        logger.warning(f"Invalid request: {str(e)}")

        try:
            writer.close()
        except:
            pass

        raise HTTPException(
            status_code=400,
            detail={
                "error": "validation_error",
                "message": str(e),
                "type": "invalid_request_error",
            },
        )
    except RuntimeError as e:
        # Handle runtime/processing errors
        logger.error(f"Processing error: {str(e)}")

        try:
            writer.close()
        except:
            pass

        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )
    except Exception as e:
        # Handle unexpected errors
        logger.error(f"Unexpected error in speech generation: {str(e)}")

        try:
            writer.close()
        except:
            pass

        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )


@router.get("/download/{filename}")
async def download_audio_file(filename: str):
    """Download a generated audio file from temp storage"""
    try:
        from ..core.paths import _find_file, get_content_type

        # Search for file in temp directory
        file_path = await _find_file(
            filename=filename, search_paths=[settings.temp_file_dir]
        )

        # Get content type from path helper
        content_type = await get_content_type(file_path)

        return FileResponse(
            file_path,
            media_type=content_type,
            filename=filename,
            headers={
                "Cache-Control": "no-cache",
                "Content-Disposition": f"attachment; filename={filename}",
            },
        )

    except Exception as e:
        logger.error(f"Error serving download file {filename}: {e}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "server_error",
                "message": "Failed to serve audio file",
                "type": "server_error",
            },
        )


@router.get("/models")
async def list_models():
    """List all available models"""
    try:
        # Create standard model list
        models = [
            {
                "id": "tts-1",
                "object": "model",
                "created": 1686935002,
                "owned_by": "kokoro",
            },
            {
                "id": "tts-1-hd",
                "object": "model",
                "created": 1686935002,
                "owned_by": "kokoro",
            },
            {
                "id": "kokoro",
                "object": "model",
                "created": 1686935002,
                "owned_by": "kokoro",
            },
        ]

        return {"object": "list", "data": models}
    except Exception as e:
        logger.error(f"Error listing models: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "server_error",
                "message": "Failed to retrieve model list",
                "type": "server_error",
            },
        )


@router.get("/models/{model}")
async def retrieve_model(model: str):
    """Retrieve a specific model"""
    try:
        # Define available models
        models = {
            "tts-1": {
                "id": "tts-1",
                "object": "model",
                "created": 1686935002,
                "owned_by": "kokoro",
            },
            "tts-1-hd": {
                "id": "tts-1-hd",
                "object": "model",
                "created": 1686935002,
                "owned_by": "kokoro",
            },
            "kokoro": {
                "id": "kokoro",
                "object": "model",
                "created": 1686935002,
                "owned_by": "kokoro",
            },
        }

        # Check if requested model exists
        if model not in models:
            raise HTTPException(
                status_code=404,
                detail={
                    "error": "model_not_found",
                    "message": f"Model '{model}' not found",
                    "type": "invalid_request_error",
                },
            )

        # Return the specific model
        return models[model]
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error retrieving model {model}: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "server_error",
                "message": "Failed to retrieve model information",
                "type": "server_error",
            },
        )


@router.get("/audio/voices")
async def list_voices():
    """List all available voices for text-to-speech"""
    try:
        tts_service = await get_tts_service()
        voices = await tts_service.list_voices()
        return {"voices": voices}
    except Exception as e:
        logger.error(f"Error listing voices: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "server_error",
                "message": "Failed to retrieve voice list",
                "type": "server_error",
            },
        )


@router.post("/audio/voices/combine")
async def combine_voices(request: Union[str, List[str]]):
    """Combine multiple voices into a new voice and return the .pt file.

    Args:
        request: Either a string with voices separated by + (e.g. "voice1+voice2")
                or a list of voice names to combine

    Returns:
        FileResponse with the combined voice .pt file

    Raises:
        HTTPException:
            - 400: Invalid request (wrong number of voices, voice not found)
            - 500: Server error (file system issues, combination failed)
    """
    # Check if local voice saving is allowed
    if not settings.allow_local_voice_saving:
        raise HTTPException(
            status_code=403,
            detail={
                "error": "permission_denied",
                "message": "Local voice saving is disabled",
                "type": "permission_error",
            },
        )

    try:
        # Convert input to list of voices
        if isinstance(request, str):
            # Check if it's an OpenAI voice name
            mapped_voice = _openai_mappings["voices"].get(request)
            if mapped_voice:
                request = mapped_voice
            voices = [v.strip() for v in request.split("+") if v.strip()]
        else:
            # For list input, map each voice if it's an OpenAI voice name
            voices = [_openai_mappings["voices"].get(v, v) for v in request]
            voices = [v.strip() for v in voices if v.strip()]

        if not voices:
            raise ValueError("No voices provided")

        # For multiple voices, validate base voices exist
|
603 |
+
tts_service = await get_tts_service()
|
604 |
+
available_voices = await tts_service.list_voices()
|
605 |
+
for voice in voices:
|
606 |
+
if voice not in available_voices:
|
607 |
+
raise ValueError(
|
608 |
+
f"Base voice '{voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
|
609 |
+
)
|
610 |
+
|
611 |
+
# Combine voices
|
612 |
+
combined_tensor = await tts_service.combine_voices(voices=voices)
|
613 |
+
combined_name = "+".join(voices)
|
614 |
+
|
615 |
+
# Save to temp file
|
616 |
+
temp_dir = tempfile.gettempdir()
|
617 |
+
voice_path = os.path.join(temp_dir, f"{combined_name}.pt")
|
618 |
+
buffer = io.BytesIO()
|
619 |
+
torch.save(combined_tensor, buffer)
|
620 |
+
async with aiofiles.open(voice_path, "wb") as f:
|
621 |
+
await f.write(buffer.getvalue())
|
622 |
+
|
623 |
+
return FileResponse(
|
624 |
+
voice_path,
|
625 |
+
media_type="application/octet-stream",
|
626 |
+
filename=f"{combined_name}.pt",
|
627 |
+
headers={
|
628 |
+
"Content-Disposition": f"attachment; filename={combined_name}.pt",
|
629 |
+
"Cache-Control": "no-cache",
|
630 |
+
},
|
631 |
+
)
|
632 |
+
|
633 |
+
except ValueError as e:
|
634 |
+
logger.warning(f"Invalid voice combination request: {str(e)}")
|
635 |
+
raise HTTPException(
|
636 |
+
status_code=400,
|
637 |
+
detail={
|
638 |
+
"error": "validation_error",
|
639 |
+
"message": str(e),
|
640 |
+
"type": "invalid_request_error",
|
641 |
+
},
|
642 |
+
)
|
643 |
+
except RuntimeError as e:
|
644 |
+
logger.error(f"Voice combination processing error: {str(e)}")
|
645 |
+
raise HTTPException(
|
646 |
+
status_code=500,
|
647 |
+
detail={
|
648 |
+
"error": "processing_error",
|
649 |
+
"message": "Failed to process voice combination request",
|
650 |
+
"type": "server_error",
|
651 |
+
},
|
652 |
+
)
|
653 |
+
except Exception as e:
|
654 |
+
logger.error(f"Unexpected error in voice combination: {str(e)}")
|
655 |
+
raise HTTPException(
|
656 |
+
status_code=500,
|
657 |
+
detail={
|
658 |
+
"error": "server_error",
|
659 |
+
"message": "An unexpected error occurred",
|
660 |
+
"type": "server_error",
|
661 |
+
},
|
662 |
+
)
|
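For reference, a minimal client-side sketch of exercising the voice-combination endpoint above once the server is running. The base URL, router prefix, and voice names are assumptions for illustration, not part of the commit; the server must also have local voice saving enabled.

import requests

# Hypothetical deployment URL and voice names; adjust to your setup.
BASE_URL = "http://localhost:8880/v1"

# The endpoint accepts either a "voice1+voice2" string or a list of names
# and responds with the combined voice tensor as a .pt attachment.
resp = requests.post(f"{BASE_URL}/audio/voices/combine", json="af_bella+af_sky")
resp.raise_for_status()
with open("af_bella+af_sky.pt", "wb") as f:
    f.write(resp.content)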
api/src/routers/web_player.py
ADDED
@@ -0,0 +1,49 @@
"""Web player router with async file serving."""

from fastapi import APIRouter, HTTPException
from fastapi.responses import Response
from loguru import logger

from ..core.config import settings
from ..core.paths import get_content_type, get_web_file_path, read_bytes

router = APIRouter(
    tags=["Web Player"],
    responses={404: {"description": "Not found"}},
)


@router.get("/{filename:path}")
async def serve_web_file(filename: str):
    """Serve web player static files asynchronously."""
    if not settings.enable_web_player:
        raise HTTPException(status_code=404, detail="Web player is disabled")

    try:
        # Default to index.html for root path
        if filename == "" or filename == "/":
            filename = "index.html"

        # Get file path
        file_path = await get_web_file_path(filename)

        # Read file content
        content = await read_bytes(file_path)

        # Get content type
        content_type = await get_content_type(file_path)

        return Response(
            content=content,
            media_type=content_type,
            headers={
                "Cache-Control": "no-cache",  # Prevent caching during development
            },
        )

    except RuntimeError as e:
        logger.warning(f"Web file not found: {filename}")
        raise HTTPException(status_code=404, detail=str(e))
    except Exception as e:
        logger.error(f"Error serving web file {filename}: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
api/src/services/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .tts_service import TTSService

__all__ = ["TTSService"]
api/src/services/audio.py
ADDED
@@ -0,0 +1,248 @@
"""Audio conversion service"""

import math
import struct
import time
from io import BytesIO
from typing import Tuple

import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
from loguru import logger
from pydub import AudioSegment
from torch import norm

from ..core.config import settings
from ..inference.base import AudioChunk
from .streaming_audio_writer import StreamingAudioWriter


class AudioNormalizer:
    """Handles audio normalization state for a single stream"""

    def __init__(self):
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = 24000  # Sample rate of the audio
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
        self.samples_to_pad_start = int(50 * self.sample_rate / 1000)

    def find_first_last_non_silent(
        self,
        audio_data: np.ndarray,
        chunk_text: str,
        speed: float,
        silence_threshold_db: int = -45,
        is_last_chunk: bool = False,
    ) -> tuple[int, int]:
        """Finds the indices of the first and last non-silent samples in audio data.

        Args:
            audio_data: Input audio data as numpy array
            chunk_text: The text sent to the model to generate the resulting speech
            speed: The speaking speed of the voice
            silence_threshold_db: How quiet audio has to be to be considered silent
            is_last_chunk: Whether this is the last chunk

        Returns:
            A tuple with the start of the non silent portion and with the end of the non silent portion
        """

        pad_multiplier = 1
        split_character = chunk_text.strip()
        if len(split_character) > 0:
            split_character = split_character[-1]
            if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
                pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[
                    split_character
                ]

        if not is_last_chunk:
            samples_to_pad_end = max(
                int(
                    (
                        settings.dynamic_gap_trim_padding_ms
                        * self.sample_rate
                        * pad_multiplier
                    )
                    / 1000
                )
                - self.samples_to_pad_start,
                0,
            )
        else:
            samples_to_pad_end = self.samples_to_pad_start
        # Convert dBFS threshold to amplitude
        amplitude_threshold = np.iinfo(audio_data.dtype).max * (
            10 ** (silence_threshold_db / 20)
        )
        # Find the first samples above the silence threshold at the start and end of the audio
        non_silent_index_start, non_silent_index_end = None, None

        for X in range(0, len(audio_data)):
            if audio_data[X] > amplitude_threshold:
                non_silent_index_start = X
                break

        for X in range(len(audio_data) - 1, -1, -1):
            if audio_data[X] > amplitude_threshold:
                non_silent_index_end = X
                break

        # Handle the case where the entire audio is silent
        if non_silent_index_start == None or non_silent_index_end == None:
            return 0, len(audio_data)

        return max(non_silent_index_start - self.samples_to_pad_start, 0), min(
            non_silent_index_end + math.ceil(samples_to_pad_end / speed),
            len(audio_data),
        )

    def normalize(self, audio_data: np.ndarray) -> np.ndarray:
        """Convert audio data to int16 range

        Args:
            audio_data: Input audio data as numpy array
        Returns:
            Normalized audio data
        """
        if audio_data.dtype != np.int16:
            # Scale directly to int16 range with clipping
            return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
        return audio_data


class AudioService:
    """Service for audio format conversions with streaming support"""

    # Supported formats
    SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm"}

    # Default audio format settings balanced for speed and compression
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,  # Balanced compression
        },
        "opus": {
            "compression_level": 0.0,  # Good balance for speech
        },
        "flac": {
            "compression_level": 0.0,  # Light compression, still fast
        },
        "aac": {
            "bitrate": "192k",  # Default AAC bitrate
        },
    }

    @staticmethod
    async def convert_audio(
        audio_chunk: AudioChunk,
        output_format: str,
        writer: StreamingAudioWriter,
        speed: float = 1,
        chunk_text: str = "",
        is_last_chunk: bool = False,
        trim_audio: bool = True,
        normalizer: AudioNormalizer = None,
    ) -> AudioChunk:
        """Convert audio data to specified format with streaming support

        Args:
            audio_data: Numpy array of audio samples
            output_format: Target format (wav, mp3, ogg, pcm)
            writer: The StreamingAudioWriter to use
            speed: The speaking speed of the voice
            chunk_text: The text sent to the model to generate the resulting speech
            is_last_chunk: Whether this is the last chunk
            trim_audio: Whether audio should be trimmed
            normalizer: Optional AudioNormalizer instance for consistent normalization

        Returns:
            Bytes of the converted audio chunk
        """

        try:
            # Validate format
            if output_format not in AudioService.SUPPORTED_FORMATS:
                raise ValueError(f"Format {output_format} not supported")

            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()

            audio_chunk.audio = normalizer.normalize(audio_chunk.audio)

            if trim_audio == True:
                audio_chunk = AudioService.trim_audio(
                    audio_chunk, chunk_text, speed, is_last_chunk, normalizer
                )

            # Write audio data first
            if len(audio_chunk.audio) > 0:
                chunk_data = writer.write_chunk(audio_chunk.audio)

            # Then finalize if this is the last chunk
            if is_last_chunk:
                final_data = writer.write_chunk(finalize=True)

                if final_data:
                    audio_chunk.output = final_data
                return audio_chunk

            if chunk_data:
                audio_chunk.output = chunk_data
            return audio_chunk

        except Exception as e:
            logger.error(f"Error converting audio stream to {output_format}: {str(e)}")
            raise ValueError(
                f"Failed to convert audio stream to {output_format}: {str(e)}"
            )

    @staticmethod
    def trim_audio(
        audio_chunk: AudioChunk,
        chunk_text: str = "",
        speed: float = 1,
        is_last_chunk: bool = False,
        normalizer: AudioNormalizer = None,
    ) -> AudioChunk:
        """Trim silence from start and end

        Args:
            audio_data: Input audio data as numpy array
            chunk_text: The text sent to the model to generate the resulting speech
            speed: The speaking speed of the voice
            is_last_chunk: Whether this is the last chunk
            normalizer: Optional AudioNormalizer instance for consistent normalization

        Returns:
            Trimmed audio data
        """
        if normalizer is None:
            normalizer = AudioNormalizer()

        audio_chunk.audio = normalizer.normalize(audio_chunk.audio)

        trimed_samples = 0
        # Trim start and end if enough samples
        if len(audio_chunk.audio) > (2 * normalizer.samples_to_trim):
            audio_chunk.audio = audio_chunk.audio[
                normalizer.samples_to_trim : -normalizer.samples_to_trim
            ]
            trimed_samples += normalizer.samples_to_trim

        # Find non silent portion and trim
        start_index, end_index = normalizer.find_first_last_non_silent(
            audio_chunk.audio, chunk_text, speed, is_last_chunk=is_last_chunk
        )

        audio_chunk.audio = audio_chunk.audio[start_index:end_index]
        trimed_samples += start_index

        if audio_chunk.word_timestamps is not None:
            for timestamp in audio_chunk.word_timestamps:
                timestamp.start_time -= trimed_samples / 24000
                timestamp.end_time -= trimed_samples / 24000
        return audio_chunk
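As a side note on the silence-trimming logic above: the dBFS threshold is turned into a linear int16 amplitude via max_int16 * 10^(dB/20). A quick standalone check of that conversion (not part of the commit, plain numpy only):

import numpy as np

# Mirrors the conversion in AudioNormalizer.find_first_last_non_silent:
# -45 dBFS relative to int16 full scale (32767).
silence_threshold_db = -45
amplitude_threshold = np.iinfo(np.int16).max * (10 ** (silence_threshold_db / 20))
print(round(amplitude_threshold))  # ~184 out of 32767; samples below this count as silence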
api/src/services/streaming_audio_writer.py
ADDED
@@ -0,0 +1,100 @@
"""Audio conversion service with proper streaming support"""

import struct
from io import BytesIO
from typing import Optional

import av
import numpy as np
import soundfile as sf
from loguru import logger
from pydub import AudioSegment


class StreamingAudioWriter:
    """Handles streaming audio format conversions"""

    def __init__(self, format: str, sample_rate: int, channels: int = 1):
        self.format = format.lower()
        self.sample_rate = sample_rate
        self.channels = channels
        self.bytes_written = 0
        self.pts = 0

        codec_map = {
            "wav": "pcm_s16le",
            "mp3": "mp3",
            "opus": "libopus",
            "flac": "flac",
            "aac": "aac",
        }
        # Format-specific setup
        if self.format in ["wav", "flac", "mp3", "pcm", "aac", "opus"]:
            if self.format != "pcm":
                self.output_buffer = BytesIO()
                self.container = av.open(
                    self.output_buffer,
                    mode="w",
                    format=self.format if self.format != "aac" else "adts",
                )
                self.stream = self.container.add_stream(
                    codec_map[self.format],
                    sample_rate=self.sample_rate,
                    layout="mono" if self.channels == 1 else "stereo",
                )
                self.stream.bit_rate = 128000
        else:
            raise ValueError(f"Unsupported format: {format}")

    def close(self):
        if hasattr(self, "container"):
            self.container.close()

        if hasattr(self, "output_buffer"):
            self.output_buffer.close()

    def write_chunk(
        self, audio_data: Optional[np.ndarray] = None, finalize: bool = False
    ) -> bytes:
        """Write a chunk of audio data and return bytes in the target format.

        Args:
            audio_data: Audio data to write, or None if finalizing
            finalize: Whether this is the final write to close the stream
        """

        if finalize:
            if self.format != "pcm":
                packets = self.stream.encode(None)
                for packet in packets:
                    self.container.mux(packet)

                data = self.output_buffer.getvalue()
                self.close()
                return data

        if audio_data is None or len(audio_data) == 0:
            return b""

        if self.format == "pcm":
            # Write raw bytes
            return audio_data.tobytes()
        else:
            frame = av.AudioFrame.from_ndarray(
                audio_data.reshape(1, -1),
                format="s16",
                layout="mono" if self.channels == 1 else "stereo",
            )
            frame.sample_rate = self.sample_rate

            frame.pts = self.pts
            self.pts += frame.samples

            packets = self.stream.encode(frame)
            for packet in packets:
                self.container.mux(packet)

            data = self.output_buffer.getvalue()
            self.output_buffer.seek(0)
            self.output_buffer.truncate(0)
            return data
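A minimal sketch of how this writer is driven, assuming PyAV is installed and the input is 24 kHz mono int16 (as produced by AudioNormalizer above): feed chunks, then finalize once to flush the encoder and close the container.

import numpy as np

# Hypothetical driver: one second of silence, encoded in four chunks.
writer = StreamingAudioWriter(format="mp3", sample_rate=24000, channels=1)
silence = np.zeros(24000, dtype=np.int16)

encoded = bytearray()
for chunk in np.array_split(silence, 4):
    encoded.extend(writer.write_chunk(chunk))       # incremental encoded bytes
encoded.extend(writer.write_chunk(finalize=True))   # flush encoder, close container

with open("out.mp3", "wb") as f:
    f.write(bytes(encoded))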
api/src/services/temp_manager.py
ADDED
@@ -0,0 +1,170 @@
"""Temporary file writer for audio downloads"""

import os
import tempfile
from typing import List, Optional

import aiofiles
from fastapi import HTTPException
from loguru import logger

from ..core.config import settings


async def cleanup_temp_files() -> None:
    """Clean up old temp files"""
    try:
        if not await aiofiles.os.path.exists(settings.temp_file_dir):
            await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)
            return

        # Get all temp files with stats
        files = []
        total_size = 0

        # Use os.scandir for sync iteration, but aiofiles.os.stat for async stats
        for entry in os.scandir(settings.temp_file_dir):
            if entry.is_file():
                stat = await aiofiles.os.stat(entry.path)
                files.append((entry.path, stat.st_mtime, stat.st_size))
                total_size += stat.st_size

        # Sort by modification time (oldest first)
        files.sort(key=lambda x: x[1])

        # Remove files if:
        # 1. They're too old
        # 2. We have too many files
        # 3. Directory is too large
        current_time = (await aiofiles.os.stat(settings.temp_file_dir)).st_mtime
        max_age = settings.max_temp_dir_age_hours * 3600

        for path, mtime, size in files:
            should_delete = False

            # Check age
            if current_time - mtime > max_age:
                should_delete = True
                logger.info(f"Deleting old temp file: {path}")

            # Check count limit
            elif len(files) > settings.max_temp_dir_count:
                should_delete = True
                logger.info(f"Deleting excess temp file: {path}")

            # Check size limit
            elif total_size > settings.max_temp_dir_size_mb * 1024 * 1024:
                should_delete = True
                logger.info(f"Deleting to reduce directory size: {path}")

            if should_delete:
                try:
                    await aiofiles.os.remove(path)
                    total_size -= size
                    logger.info(f"Deleted temp file: {path}")
                except Exception as e:
                    logger.warning(f"Failed to delete temp file {path}: {e}")

    except Exception as e:
        logger.warning(f"Error during temp file cleanup: {e}")


class TempFileWriter:
    """Handles writing audio chunks to a temp file"""

    def __init__(self, format: str):
        """Initialize temp file writer

        Args:
            format: Audio format extension (mp3, wav, etc)
        """
        self.format = format
        self.temp_file = None
        self._finalized = False
        self._write_error = False  # Flag to track if we've had a write error

    async def __aenter__(self):
        """Async context manager entry"""
        try:
            # Clean up old files first
            await cleanup_temp_files()

            # Create temp file with proper extension
            await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)
            temp = tempfile.NamedTemporaryFile(
                dir=settings.temp_file_dir,
                delete=False,
                suffix=f".{self.format}",
                mode="wb",
            )
            self.temp_file = await aiofiles.open(temp.name, mode="wb")
            self.temp_path = temp.name
            temp.close()  # Close sync file, we'll use async version

            # Generate download path immediately
            self.download_path = f"/download/{os.path.basename(self.temp_path)}"
        except Exception as e:
            # Handle permission issues or other errors gracefully
            logger.error(f"Failed to create temp file: {e}")
            self._write_error = True
            # Set a placeholder path so the API can still function
            self.temp_path = f"unavailable_{self.format}"
            self.download_path = f"/download/{self.temp_path}"

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit"""
        try:
            if self.temp_file and not self._finalized:
                await self.temp_file.close()
                self._finalized = True
        except Exception as e:
            logger.error(f"Error closing temp file: {e}")
            self._write_error = True

    async def write(self, chunk: bytes) -> None:
        """Write a chunk of audio data

        Args:
            chunk: Audio data bytes to write
        """
        if self._finalized:
            raise RuntimeError("Cannot write to finalized temp file")

        # Skip writing if we've already encountered an error
        if self._write_error or not self.temp_file:
            return

        try:
            await self.temp_file.write(chunk)
            await self.temp_file.flush()
        except Exception as e:
            # Handle permission issues or other errors gracefully
            logger.error(f"Failed to write to temp file: {e}")
            self._write_error = True

    async def finalize(self) -> str:
        """Close temp file and return download path

        Returns:
            Path to use for downloading the temp file
        """
        if self._finalized:
            raise RuntimeError("Temp file already finalized")

        # Skip finalizing if we've already encountered an error
        if self._write_error or not self.temp_file:
            self._finalized = True
            return self.download_path

        try:
            await self.temp_file.close()
            self._finalized = True
        except Exception as e:
            # Handle permission issues or other errors gracefully
            logger.error(f"Failed to finalize temp file: {e}")
            self._write_error = True
            self._finalized = True

        return self.download_path
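A small sketch of the intended async-context-manager usage of TempFileWriter: write encoded bytes, then return the /download/... path that the openai_compatible router serves. The caller function and the byte payload here are placeholders, and the app settings (temp_file_dir, limits) must be importable for this to run.

import asyncio

async def save_chunks(chunks):
    # Hypothetical caller: stream encoded audio bytes into a temp file,
    # then hand the /download/... path back to the client.
    async with TempFileWriter("mp3") as writer:
        for chunk in chunks:
            await writer.write(chunk)
        return await writer.finalize()

download_path = asyncio.run(save_chunks([b"\x00" * 1024]))
print(download_path)  # e.g. /download/tmpXXXX.mp3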
api/src/services/text_processing/__init__.py
ADDED
@@ -0,0 +1,21 @@
"""Text processing pipeline."""

from .normalizer import normalize_text
from .phonemizer import phonemize
from .text_processor import process_text_chunk, smart_split
from .vocabulary import tokenize


def process_text(text: str) -> list[int]:
    """Process text into token IDs (for backward compatibility)."""
    return process_text_chunk(text)


__all__ = [
    "normalize_text",
    "phonemize",
    "tokenize",
    "process_text",
    "process_text_chunk",
    "smart_split",
]
api/src/services/text_processing/normalizer.py
ADDED
@@ -0,0 +1,415 @@
"""
Text normalization module for TTS processing.
Handles various text formats including URLs, emails, numbers, money, and special characters.
Converts them into a format suitable for text-to-speech processing.
"""

import re
from functools import lru_cache

import inflect
from numpy import number
from text_to_num import text2num
from torch import mul

from ...structures.schemas import NormalizationOptions

# Constants
VALID_TLDS = [
    "com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name",
    "pro", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat",
    "xxx", "aero", "arpa", "bg", "br", "ca", "cn", "de", "es", "eu", "fr",
    "in", "it", "jp", "mx", "nl", "ru", "uk", "us", "io", "co",
]

VALID_UNITS = {
    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer",
    "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",  # Length
    "g": "gram", "kg": "kilogram", "mg": "milligram",  # Mass
    "s": "second", "ms": "millisecond", "min": "minutes", "h": "hour",  # Time
    "l": "liter", "ml": "mililiter", "cl": "centiliter", "dl": "deciliter",  # Volume
    "kph": "kilometer per hour", "mph": "mile per hour", "mi/h": "mile per hour",
    "m/s": "meter per second", "km/h": "kilometer per hour", "mm/s": "milimeter per second",
    "cm/s": "centimeter per second", "ft/s": "feet per second", "cm/h": "centimeter per day",  # Speed
    "°c": "degree celsius", "c": "degree celsius",
    "°f": "degree fahrenheit", "f": "degree fahrenheit", "k": "kelvin",  # Temperature
    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal", "atm": "atmosphere",  # Pressure
    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",  # Frequency
    "v": "volt", "kv": "kilovolt", "mv": "mergavolt",  # Voltage
    "a": "amp", "ma": "megaamp", "ka": "kiloamp",  # Current
    "w": "watt", "kw": "kilowatt", "mw": "megawatt",  # Power
    "j": "joule", "kj": "kilojoule", "mj": "megajoule",  # Energy
    "Ω": "ohm", "kΩ": "kiloohm", "mΩ": "megaohm",  # Resistance (Ohm)
    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",  # Capacitance
    "b": "bit", "kb": "kilobit", "mb": "megabit", "gb": "gigabit",
    "tb": "terabit", "pb": "petabit",  # Data size
    "kbps": "kilobit per second", "mbps": "megabit per second",
    "gbps": "gigabit per second", "tbps": "terabit per second",
    "px": "pixel",  # CSS units
}


# Pre-compiled regex patterns for performance
EMAIL_PATTERN = re.compile(
    r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
)
URL_PATTERN = re.compile(
    r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:"
    + "|".join(VALID_TLDS)
    + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",
    re.IGNORECASE,
)

UNIT_PATTERN = re.compile(
    r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*("
    + "|".join(sorted(list(VALID_UNITS.keys()), reverse=True))
    + r"""){1}(?=[^\w\d]{1}|\b)""",
    re.IGNORECASE,
)

TIME_PATTERN = re.compile(
    r"([0-9]{2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE
)

INFLECT_ENGINE = inflect.engine()


def split_num(num: re.Match[str]) -> str:
    """Handle number splitting for various formats"""
    num = num.group()
    if "." in num:
        return num
    elif ":" in num:
        h, m = [int(n) for n in num.split(":")]
        if m == 0:
            return f"{h} o'clock"
        elif m < 10:
            return f"{h} oh {m}"
        return f"{h} {m}"
    year = int(num[:4])
    if year < 1100 or year % 1000 < 10:
        return num
    left, right = num[:2], int(num[2:4])
    s = "s" if num.endswith("s") else ""
    if 100 <= year % 1000 <= 999:
        if right == 0:
            return f"{left} hundred{s}"
        elif right < 10:
            return f"{left} oh {right}{s}"
    return f"{left} {right}{s}"


def handle_units(u: re.Match[str]) -> str:
    """Converts units to their full form"""
    unit_string = u.group(6).strip()
    unit = unit_string

    if unit_string.lower() in VALID_UNITS:
        unit = VALID_UNITS[unit_string.lower()].split(" ")

        # Handles the B vs b case
        if unit[0].endswith("bit"):
            b_case = unit_string[min(1, len(unit_string) - 1)]
            if b_case == "B":
                unit[0] = unit[0][:-3] + "byte"

        number = u.group(1).strip()
        unit[0] = INFLECT_ENGINE.no(unit[0], number)
    return " ".join(unit)


def conditional_int(number: float, threshold: float = 0.00001):
    if abs(round(number) - number) < threshold:
        return int(round(number))
    return number


def handle_money(m: re.Match[str]) -> str:
    """Convert money expressions to spoken form"""

    bill = "dollar" if m.group(2) == "$" else "pound"
    coin = "cent" if m.group(2) == "$" else "pence"
    number = m.group(3)

    multiplier = m.group(4)
    try:
        number = float(number)
    except:
        return m.group()

    if m.group(1) == "-":
        number *= -1

    if number % 1 == 0 or multiplier != "":
        text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}"
    else:
        sub_number = int(str(number).split(".")[-1].ljust(2, "0"))

        text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"

    return text_number


def handle_decimal(num: re.Match[str]) -> str:
    """Convert decimal numbers to spoken form"""
    a, b = num.group().split(".")
    return " point ".join([a, " ".join(b)])


def handle_email(m: re.Match[str]) -> str:
    """Convert email addresses into speakable format"""
    email = m.group(0)
    parts = email.split("@")
    if len(parts) == 2:
        user, domain = parts
        domain = domain.replace(".", " dot ")
        return f"{user} at {domain}"
    return email


def handle_url(u: re.Match[str]) -> str:
    """Make URLs speakable by converting special characters to spoken words"""
    if not u:
        return ""

    url = u.group(0).strip()

    # Handle protocol first
    url = re.sub(
        r"^https?://",
        lambda a: "https " if "https" in a.group() else "http ",
        url,
        flags=re.IGNORECASE,
    )
    url = re.sub(r"^www\.", "www ", url, flags=re.IGNORECASE)

    # Handle port numbers before other replacements
    url = re.sub(r":(\d+)(?=/|$)", lambda m: f" colon {m.group(1)}", url)

    # Split into domain and path
    parts = url.split("/", 1)
    domain = parts[0]
    path = parts[1] if len(parts) > 1 else ""

    # Handle dots in domain
    domain = domain.replace(".", " dot ")

    # Reconstruct URL
    if path:
        url = f"{domain} slash {path}"
    else:
        url = domain

    # Replace remaining symbols with words
    url = url.replace("-", " dash ")
    url = url.replace("_", " underscore ")
    url = url.replace("?", " question-mark ")
    url = url.replace("=", " equals ")
    url = url.replace("&", " ampersand ")
    url = url.replace("%", " percent ")
    url = url.replace(":", " colon ")  # Handle any remaining colons
    url = url.replace("/", " slash ")  # Handle any remaining slashes

    # Clean up extra spaces
    return re.sub(r"\s+", " ", url).strip()


def handle_phone_number(p: re.Match[str]) -> str:
    p = list(p.groups())

    country_code = ""
    if p[0] is not None:
        p[0] = p[0].replace("+", "")
        country_code += INFLECT_ENGINE.number_to_words(p[0])

    area_code = INFLECT_ENGINE.number_to_words(
        p[2].replace("(", "").replace(")", ""), group=1, comma=""
    )

    telephone_prefix = INFLECT_ENGINE.number_to_words(p[3], group=1, comma="")

    line_number = INFLECT_ENGINE.number_to_words(p[4], group=1, comma="")

    return ",".join([country_code, area_code, telephone_prefix, line_number])


def handle_time(t: re.Match[str]) -> str:
    t = t.groups()

    numbers = " ".join(
        [INFLECT_ENGINE.number_to_words(X.strip()) for X in t[0].split(":")]
    )

    half = ""
    if t[2] is not None:
        half = t[2].strip()

    return numbers + half


def normalize_text(text: str, normalization_options: NormalizationOptions) -> str:
    """Normalize text for TTS processing"""
    # Handle email addresses first if enabled
    if normalization_options.email_normalization:
        text = EMAIL_PATTERN.sub(handle_email, text)

    # Handle URLs if enabled
    if normalization_options.url_normalization:
        text = URL_PATTERN.sub(handle_url, text)

    # Pre-process numbers with units if enabled
    if normalization_options.unit_normalization:
        text = UNIT_PATTERN.sub(handle_units, text)

    # Replace optional pluralization
    if normalization_options.optional_pluralization_normalization:
        text = re.sub(r"\(s\)", "s", text)

    # Replace phone numbers:
    if normalization_options.phone_normalization:
        text = re.sub(
            r"(\+?\d{1,2})?([ .-]?)(\(?\d{3}\)?)[\s.-](\d{3})[\s.-](\d{4})",
            handle_phone_number,
            text,
        )

    # Replace quotes and brackets
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
    text = text.replace("«", chr(8220)).replace("»", chr(8221))
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')

    # Handle CJK punctuation and some non standard chars
    for a, b in zip("、。!,:;?–", ",.!,:;?-"):
        text = text.replace(a, b + " ")

    # Handle simple time in the format of HH:MM:SS
    text = TIME_PATTERN.sub(
        handle_time,
        text,
    )

    # Clean up whitespace
    text = re.sub(r"[^\S \n]", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"(?<=\n) +(?=\n)", "", text)

    # Handle titles and abbreviations
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)

    # Handle common words
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)

    # Handle numbers and money
    text = re.sub(r"(?<=\d),(?=\d)", "", text)

    text = re.sub(
        r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b",
        handle_money,
        text,
    )

    text = re.sub(
        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
    )

    text = re.sub(r"\d*\.\d+", handle_decimal, text)

    # Handle various formatting
    text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
    text = re.sub(r"(?<=\d)S", " S", text)
    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
    text = re.sub(r"(?<=X')S\b", "s", text)
    text = re.sub(
        r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
    )
    text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)

    return text.strip()
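To illustrate the normalization pipeline above, a small hedged example of calling normalize_text directly. The import path assumes the repository root is on PYTHONPATH, and the spoken output shown in the comment is indicative only (it depends on the installed inflect version).

from api.src.services.text_processing.normalizer import normalize_text
from api.src.structures.schemas import NormalizationOptions

sample = "Email support@example.com or visit https://example.com/docs. It costs $5.50."
print(normalize_text(sample, NormalizationOptions()))
# Roughly: "Email support at example dot com or visit https example dot com slash docs.
#           It costs five dollars and fifty cents."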
api/src/services/text_processing/phonemizer.py
ADDED
@@ -0,0 +1,102 @@
import re
from abc import ABC, abstractmethod

import phonemizer

from .normalizer import normalize_text

phonemizers = {}


class PhonemizerBackend(ABC):
    """Abstract base class for phonemization backends"""

    @abstractmethod
    def phonemize(self, text: str) -> str:
        """Convert text to phonemes

        Args:
            text: Text to convert to phonemes

        Returns:
            Phonemized text
        """
        pass


class EspeakBackend(PhonemizerBackend):
    """Espeak-based phonemizer implementation"""

    def __init__(self, language: str):
        """Initialize espeak backend

        Args:
            language: Language code ('en-us' or 'en-gb')
        """
        self.backend = phonemizer.backend.EspeakBackend(
            language=language, preserve_punctuation=True, with_stress=True
        )

        self.language = language

    def phonemize(self, text: str) -> str:
        """Convert text to phonemes using espeak

        Args:
            text: Text to convert to phonemes

        Returns:
            Phonemized text
        """
        # Phonemize text
        ps = self.backend.phonemize([text])
        ps = ps[0] if ps else ""

        # Handle special cases
        ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")
        ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l")
        ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps)
        ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»"" ]|$)', "z", ps)

        # Language-specific rules
        if self.language == "en-us":
            ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps)

        return ps.strip()


def create_phonemizer(language: str = "a") -> PhonemizerBackend:
    """Factory function to create phonemizer backend

    Args:
        language: Language code ('a' for US English, 'b' for British English)

    Returns:
        Phonemizer backend instance
    """
    # Map language codes to espeak language codes
    lang_map = {"a": "en-us", "b": "en-gb"}

    if language not in lang_map:
        raise ValueError(f"Unsupported language code: {language}")

    return EspeakBackend(lang_map[language])


def phonemize(text: str, language: str = "a", normalize: bool = True) -> str:
    """Convert text to phonemes

    Args:
        text: Text to convert to phonemes
        language: Language code ('a' for US English, 'b' for British English)
        normalize: Whether to normalize text before phonemization

    Returns:
        Phonemized text
    """
    global phonemizers
    if normalize:
        text = normalize_text(text)
    if language not in phonemizers:
        phonemizers[language] = create_phonemizer(language)
    return phonemizers[language].phonemize(text)
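A quick usage sketch of the module-level phonemize helper above. It assumes espeak-ng is installed (as in the Dockerfile) and the repository root is importable; the exact IPA output varies with the espeak-ng version.

from api.src.services.text_processing.phonemizer import phonemize

# 'a' selects the en-us espeak backend. normalize=False is used here because the
# normalize=True path calls normalize_text without NormalizationOptions.
ipa = phonemize("Hello world", language="a", normalize=False)
print(ipa)  # e.g. something like "həlˈoʊ wˈɜːld"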
api/src/services/text_processing/text_processor.py
ADDED
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Unified text processing for TTS with smart chunking."""
|
2 |
+
|
3 |
+
import re
|
4 |
+
import time
|
5 |
+
from typing import AsyncGenerator, Dict, List, Tuple
|
6 |
+
|
7 |
+
from loguru import logger
|
8 |
+
|
9 |
+
from ...core.config import settings
|
10 |
+
from ...structures.schemas import NormalizationOptions
|
11 |
+
from .normalizer import normalize_text
|
12 |
+
from .phonemizer import phonemize
|
13 |
+
from .vocabulary import tokenize
|
14 |
+
|
15 |
+
# Pre-compiled regex patterns for performance
|
16 |
+
CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
|
17 |
+
|
18 |
+
|
19 |
+
def process_text_chunk(
|
20 |
+
text: str, language: str = "a", skip_phonemize: bool = False
|
21 |
+
) -> List[int]:
|
22 |
+
"""Process a chunk of text through normalization, phonemization, and tokenization.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
text: Text chunk to process
|
26 |
+
language: Language code for phonemization
|
27 |
+
skip_phonemize: If True, treat input as phonemes and skip normalization/phonemization
|
28 |
+
|
29 |
+
Returns:
|
30 |
+
List of token IDs
|
31 |
+
"""
|
32 |
+
start_time = time.time()
|
33 |
+
|
34 |
+
if skip_phonemize:
|
35 |
+
# Input is already phonemes, just tokenize
|
36 |
+
t0 = time.time()
|
37 |
+
tokens = tokenize(text)
|
38 |
+
t1 = time.time()
|
39 |
+
else:
|
40 |
+
# Normal text processing pipeline
|
41 |
+
t0 = time.time()
|
42 |
+
t1 = time.time()
|
43 |
+
|
44 |
+
t0 = time.time()
|
45 |
+
phonemes = phonemize(text, language, normalize=False) # Already normalized
|
46 |
+
t1 = time.time()
|
47 |
+
|
48 |
+
t0 = time.time()
|
49 |
+
tokens = tokenize(phonemes)
|
50 |
+
t1 = time.time()
|
51 |
+
|
52 |
+
total_time = time.time() - start_time
|
53 |
+
logger.debug(
|
54 |
+
f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
|
55 |
+
)
|
56 |
+
|
57 |
+
return tokens
|
58 |
+
|
59 |
+
|
60 |
+
async def yield_chunk(
|
61 |
+
text: str, tokens: List[int], chunk_count: int
|
62 |
+
) -> Tuple[str, List[int]]:
|
63 |
+
"""Yield a chunk with consistent logging."""
|
64 |
+
logger.debug(
|
65 |
+
f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
|
66 |
+
)
|
67 |
+
return text, tokens
|
68 |
+
|
69 |
+
|
70 |
+
def process_text(text: str, language: str = "a") -> List[int]:
|
71 |
+
"""Process text into token IDs.
|
72 |
+
|
73 |
+
Args:
|
74 |
+
text: Text to process
|
75 |
+
language: Language code for phonemization
|
76 |
+
|
77 |
+
Returns:
|
78 |
+
List of token IDs
|
79 |
+
"""
|
80 |
+
if not isinstance(text, str):
|
81 |
+
text = str(text) if text is not None else ""
|
82 |
+
|
83 |
+
text = text.strip()
|
84 |
+
if not text:
|
85 |
+
return []
|
86 |
+
|
87 |
+
return process_text_chunk(text, language)
|
88 |
+
|
89 |
+
|
90 |
+
def get_sentence_info(
|
91 |
+
text: str, custom_phenomes_list: Dict[str, str]
|
92 |
+
) -> List[Tuple[str, List[int], int]]:
|
93 |
+
"""Process all sentences and return info."""
|
94 |
+
sentences = re.split(r"([.!?;:])(?=\s|$)", text)
|
95 |
+
phoneme_length, min_value = len(custom_phenomes_list), 0
|
96 |
+
|
97 |
+
results = []
|
98 |
+
for i in range(0, len(sentences), 2):
|
99 |
+
sentence = sentences[i].strip()
|
100 |
+
for replaced in range(min_value, phoneme_length):
|
101 |
+
current_id = f"</|custom_phonemes_{replaced}|/>"
|
102 |
+
if current_id in sentence:
|
103 |
+
sentence = sentence.replace(
|
104 |
+
current_id, custom_phenomes_list.pop(current_id)
|
105 |
+
)
|
106 |
+
min_value += 1
|
107 |
+
|
108 |
+
punct = sentences[i + 1] if i + 1 < len(sentences) else ""
|
109 |
+
|
110 |
+
if not sentence:
|
111 |
+
continue
|
112 |
+
|
113 |
+
full = sentence + punct
|
114 |
+
tokens = process_text_chunk(full)
|
115 |
        results.append((full, tokens, len(tokens)))

    return results


def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str, str]) -> str:
    latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>"
    phenomes_list[latest_id] = s.group(0).strip()
    return latest_id


async def smart_split(
    text: str,
    max_tokens: int = settings.absolute_max_tokens,
    lang_code: str = "a",
    normalization_options: NormalizationOptions = NormalizationOptions(),
) -> AsyncGenerator[Tuple[str, List[int]], None]:
    """Build optimal chunks targeting 300-400 tokens, never exceeding max_tokens."""
    start_time = time.time()
    chunk_count = 0
    logger.info(f"Starting smart split for {len(text)} chars")

    custom_phoneme_list = {}

    # Normalize text
    if settings.advanced_text_normalization and normalization_options.normalize:
        logger.debug(f"lang_code: {lang_code}")
        if lang_code in ["a", "b", "en-us", "en-gb"]:
            text = CUSTOM_PHONEMES.sub(
                lambda s: handle_custom_phonemes(s, custom_phoneme_list), text
            )
            text = normalize_text(text, normalization_options)
        else:
            logger.info(
                "Skipping text normalization as it is only supported for English"
            )

    # Process all sentences
    sentences = get_sentence_info(text, custom_phoneme_list)

    current_chunk = []
    current_tokens = []
    current_count = 0

    for sentence, tokens, count in sentences:
        # Handle sentences that exceed max tokens
        if count > max_tokens:
            # Yield current chunk if any
            if current_chunk:
                chunk_text = " ".join(current_chunk)
                chunk_count += 1
                logger.debug(
                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                )
                yield chunk_text, current_tokens
                current_chunk = []
                current_tokens = []
                current_count = 0

            # Split long sentence on commas
            clauses = re.split(r"([,])", sentence)
            clause_chunk = []
            clause_tokens = []
            clause_count = 0

            for j in range(0, len(clauses), 2):
                clause = clauses[j].strip()
                comma = clauses[j + 1] if j + 1 < len(clauses) else ""

                if not clause:
                    continue

                full_clause = clause + comma

                tokens = process_text_chunk(full_clause)
                count = len(tokens)

                # If adding clause keeps us under max and not optimal yet
                if (
                    clause_count + count <= max_tokens
                    and clause_count + count <= settings.target_max_tokens
                ):
                    clause_chunk.append(full_clause)
                    clause_tokens.extend(tokens)
                    clause_count += count
                else:
                    # Yield clause chunk if we have one
                    if clause_chunk:
                        chunk_text = " ".join(clause_chunk)
                        chunk_count += 1
                        logger.debug(
                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
                        )
                        yield chunk_text, clause_tokens
                    clause_chunk = [full_clause]
                    clause_tokens = tokens
                    clause_count = count

            # Don't forget last clause chunk
            if clause_chunk:
                chunk_text = " ".join(clause_chunk)
                chunk_count += 1
                logger.debug(
                    f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
                )
                yield chunk_text, clause_tokens

        # Regular sentence handling
        elif (
            current_count >= settings.target_min_tokens
            and current_count + count > settings.target_max_tokens
        ):
            # If we have a good sized chunk and adding next sentence exceeds target,
            # yield current chunk and start new one
            chunk_text = " ".join(current_chunk)
            chunk_count += 1
            logger.info(
                f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
            )
            yield chunk_text, current_tokens
            current_chunk = [sentence]
            current_tokens = tokens
            current_count = count
        elif current_count + count <= settings.target_max_tokens:
            # Keep building chunk while under target max
            current_chunk.append(sentence)
            current_tokens.extend(tokens)
            current_count += count
        elif (
            current_count + count <= max_tokens
            and current_count < settings.target_min_tokens
        ):
            # Only exceed target max if we haven't reached minimum size yet
            current_chunk.append(sentence)
            current_tokens.extend(tokens)
            current_count += count
        else:
            # Yield current chunk and start new one
            if current_chunk:
                chunk_text = " ".join(current_chunk)
                chunk_count += 1
                logger.info(
                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                )
                yield chunk_text, current_tokens
            current_chunk = [sentence]
            current_tokens = tokens
            current_count = count

    # Don't forget the last chunk
    if current_chunk:
        chunk_text = " ".join(current_chunk)
        chunk_count += 1
        logger.info(
            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
        )
        yield chunk_text, current_tokens

    total_time = time.time() - start_time
    logger.info(
        f"Split completed in {total_time * 1000:.2f}ms, produced {chunk_count} chunks"
    )
api/src/services/text_processing/vocabulary.py
ADDED
@@ -0,0 +1,40 @@
def get_vocab():
    """Get the vocabulary dictionary mapping characters to token IDs"""
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»"" '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # Create vocabulary dictionary
    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
    return {symbol: i for i, symbol in enumerate(symbols)}


# Initialize vocabulary
VOCAB = get_vocab()


def tokenize(phonemes: str) -> list[int]:
    """Convert phonemes string to token IDs

    Args:
        phonemes: String of phonemes to tokenize

    Returns:
        List of token IDs
    """
    return [i for i in map(VOCAB.get, phonemes) if i is not None]


def decode_tokens(tokens: list[int]) -> str:
    """Convert token IDs back to phonemes string

    Args:
        tokens: List of token IDs

    Returns:
        String of phonemes
    """
    # Create reverse mapping
    id_to_symbol = {i: s for s, i in VOCAB.items()}
    return "".join(id_to_symbol[t] for t in tokens)
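A quick round-trip sketch of the vocabulary helpers above (illustrative, not part of the commit). It assumes every symbol in the phoneme string exists in VOCAB, since tokenize silently drops unknown symbols.

    ids = tokenize("tˈɛst")      # phonemes -> token IDs
    text = decode_tokens(ids)    # token IDs -> phonemes
    assert text == "tˈɛst"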
api/src/services/tts_service.py
ADDED
@@ -0,0 +1,459 @@
"""TTS service using model and voice managers."""

import asyncio
import os
import re
import tempfile
import time
from typing import AsyncGenerator, List, Optional, Tuple, Union

import numpy as np
import torch
from kokoro import KPipeline
from loguru import logger

from ..core.config import settings
from ..inference.base import AudioChunk
from ..inference.kokoro_v1 import KokoroV1
from ..inference.model_manager import get_manager as get_model_manager
from ..inference.voice_manager import get_manager as get_voice_manager
from ..structures.schemas import NormalizationOptions
from .audio import AudioNormalizer, AudioService
from .streaming_audio_writer import StreamingAudioWriter
from .text_processing import tokenize
from .text_processing.text_processor import process_text_chunk, smart_split


class TTSService:
    """Text-to-speech service."""

    # Limit concurrent chunk processing
    _chunk_semaphore = asyncio.Semaphore(4)

    def __init__(self, output_dir: str = None):
        """Initialize service."""
        self.output_dir = output_dir
        self.model_manager = None
        self._voice_manager = None

    @classmethod
    async def create(cls, output_dir: str = None) -> "TTSService":
        """Create and initialize TTSService instance."""
        service = cls(output_dir)
        service.model_manager = await get_model_manager()
        service._voice_manager = await get_voice_manager()
        return service

    async def _process_chunk(
        self,
        chunk_text: str,
        tokens: List[int],
        voice_name: str,
        voice_path: str,
        speed: float,
        writer: StreamingAudioWriter,
        output_format: Optional[str] = None,
        is_first: bool = False,
        is_last: bool = False,
        normalizer: Optional[AudioNormalizer] = None,
        lang_code: Optional[str] = None,
        return_timestamps: Optional[bool] = False,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Process tokens into audio."""
        async with self._chunk_semaphore:
            try:
                # Handle stream finalization
                if is_last:
                    # Skip format conversion for raw audio mode
                    if not output_format:
                        yield AudioChunk(np.array([], dtype=np.int16), output=b"")
                        return
                    chunk_data = await AudioService.convert_audio(
                        AudioChunk(
                            np.array([], dtype=np.float32)
                        ),  # Dummy data for type checking
                        output_format,
                        writer,
                        speed,
                        "",
                        normalizer=normalizer,
                        is_last_chunk=True,
                    )
                    yield chunk_data
                    return

                # Skip empty chunks
                if not tokens and not chunk_text:
                    return

                # Get backend
                backend = self.model_manager.get_backend()

                # Generate audio using pre-warmed model
                if isinstance(backend, KokoroV1):
                    chunk_index = 0
                    # For Kokoro V1, pass text and voice info with lang_code
                    async for chunk_data in self.model_manager.generate(
                        chunk_text,
                        (voice_name, voice_path),
                        speed=speed,
                        lang_code=lang_code,
                        return_timestamps=return_timestamps,
                    ):
                        # For streaming, convert to bytes
                        if output_format:
                            try:
                                chunk_data = await AudioService.convert_audio(
                                    chunk_data,
                                    output_format,
                                    writer,
                                    speed,
                                    chunk_text,
                                    is_last_chunk=is_last,
                                    normalizer=normalizer,
                                )
                                yield chunk_data
                            except Exception as e:
                                logger.error(f"Failed to convert audio: {str(e)}")
                        else:
                            chunk_data = AudioService.trim_audio(
                                chunk_data, chunk_text, speed, is_last, normalizer
                            )
                            yield chunk_data
                        chunk_index += 1
                else:
                    # For legacy backends, load voice tensor
                    voice_tensor = await self._voice_manager.load_voice(
                        voice_name, device=backend.device
                    )
                    chunk_data = await self.model_manager.generate(
                        tokens,
                        voice_tensor,
                        speed=speed,
                        return_timestamps=return_timestamps,
                    )

                    if chunk_data.audio is None:
                        logger.error("Model generated None for audio chunk")
                        return

                    if len(chunk_data.audio) == 0:
                        logger.error("Model generated empty audio chunk")
                        return

                    # For streaming, convert to bytes
                    if output_format:
                        try:
                            chunk_data = await AudioService.convert_audio(
                                chunk_data,
                                output_format,
                                writer,
                                speed,
                                chunk_text,
                                normalizer=normalizer,
                                is_last_chunk=is_last,
                            )
                            yield chunk_data
                        except Exception as e:
                            logger.error(f"Failed to convert audio: {str(e)}")
                    else:
                        trimmed = AudioService.trim_audio(
                            chunk_data, chunk_text, speed, is_last, normalizer
                        )
                        yield trimmed
            except Exception as e:
                logger.error(f"Failed to process tokens: {str(e)}")

    async def _load_voice_from_path(self, path: str, weight: float):
        # Raise a ValueError if the path is missing
        if not path:
            raise ValueError(f"Voice not found at path: {path}")

        logger.debug(f"Loading voice tensor from path: {path}")
        return torch.load(path, map_location="cpu") * weight

    async def _get_voices_path(self, voice: str) -> Tuple[str, str]:
        """Get voice path, handling combined voices.

        Args:
            voice: Voice name or combined voice names (e.g., 'af_jadzia+af_jessica')

        Returns:
            Tuple of (voice name to use, voice path to use)

        Raises:
            RuntimeError: If voice not found
        """
        try:
            # Split the voice on + and - and keep the operators in the list, e.g. hi+bob -> ["hi", "+", "bob"]
            split_voice = re.split(r"([-+])", voice)

            # If it is only one voice, there is no point in loading it up, doing nothing with it, then saving it
            if len(split_voice) == 1:
                # Since it's a single voice, the weight only matters if voice_weight_normalization is off
                if (
                    "(" not in voice and ")" not in voice
                ) or settings.voice_weight_normalization == True:
                    path = await self._voice_manager.get_voice_path(voice)
                    if not path:
                        raise RuntimeError(f"Voice not found: {voice}")
                    logger.debug(f"Using single voice path: {path}")
                    return voice, path

            total_weight = 0

            for voice_index in range(0, len(split_voice), 2):
                voice_object = split_voice[voice_index]

                if "(" in voice_object and ")" in voice_object:
                    voice_name = voice_object.split("(")[0].strip()
                    voice_weight = float(voice_object.split("(")[1].split(")")[0])
                else:
                    voice_name = voice_object
                    voice_weight = 1

                total_weight += voice_weight
                split_voice[voice_index] = (voice_name, voice_weight)

            # If voice_weight_normalization is false, prevent normalizing the weights by setting total_weight to 1 so each weight is divided by 1
            if settings.voice_weight_normalization == False:
                total_weight = 1

            # Load the first voice as the starting point for voices to be combined onto
            path = await self._voice_manager.get_voice_path(split_voice[0][0])
            combined_tensor = await self._load_voice_from_path(
                path, split_voice[0][1] / total_weight
            )

            # Loop through each + or - in split_voice so they can be applied to the combined voice
            for operation_index in range(1, len(split_voice) - 1, 2):
                # Get the voice path of the voice 1 index ahead of the operator
                path = await self._voice_manager.get_voice_path(
                    split_voice[operation_index + 1][0]
                )
                voice_tensor = await self._load_voice_from_path(
                    path, split_voice[operation_index + 1][1] / total_weight
                )

                # Either add or subtract the voice from the current combined voice
                if split_voice[operation_index] == "+":
                    combined_tensor += voice_tensor
                else:
                    combined_tensor -= voice_tensor

            # Save the new combined voice so it can be loaded later
            temp_dir = tempfile.gettempdir()
            combined_path = os.path.join(temp_dir, f"{voice}.pt")
            logger.debug(f"Saving combined voice to: {combined_path}")
            torch.save(combined_tensor, combined_path)
            return voice, combined_path
        except Exception as e:
            logger.error(f"Failed to get voice path: {e}")
            raise

    async def generate_audio_stream(
        self,
        text: str,
        voice: str,
        writer: StreamingAudioWriter,
        speed: float = 1.0,
        output_format: str = "wav",
        lang_code: Optional[str] = None,
        normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
        return_timestamps: Optional[bool] = False,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Generate and stream audio chunks."""
        stream_normalizer = AudioNormalizer()
        chunk_index = 0
        current_offset = 0.0
        try:
            # Get backend
            backend = self.model_manager.get_backend()

            # Get voice path, handling combined voices
            voice_name, voice_path = await self._get_voices_path(voice)
            logger.debug(f"Using voice path: {voice_path}")

            # Use provided lang_code or determine from voice name
            pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
            logger.info(
                f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
            )

            # Process text in chunks with smart splitting
            async for chunk_text, tokens in smart_split(
                text,
                lang_code=pipeline_lang_code,
                normalization_options=normalization_options,
            ):
                try:
                    # Process audio for chunk
                    async for chunk_data in self._process_chunk(
                        chunk_text,  # Pass text for Kokoro V1
                        tokens,  # Pass tokens for legacy backends
                        voice_name,  # Pass voice name
                        voice_path,  # Pass voice path
                        speed,
                        writer,
                        output_format,
                        is_first=(chunk_index == 0),
                        is_last=False,  # We'll update the last chunk later
                        normalizer=stream_normalizer,
                        lang_code=pipeline_lang_code,  # Pass lang_code
                        return_timestamps=return_timestamps,
                    ):
                        if chunk_data.word_timestamps is not None:
                            for timestamp in chunk_data.word_timestamps:
                                timestamp.start_time += current_offset
                                timestamp.end_time += current_offset

                        current_offset += len(chunk_data.audio) / 24000

                        if chunk_data.output is not None:
                            yield chunk_data
                        else:
                            logger.warning(
                                f"No audio generated for chunk: '{chunk_text[:100]}...'"
                            )
                        chunk_index += 1
                except Exception as e:
                    logger.error(
                        f"Failed to process audio for chunk: '{chunk_text[:100]}...'. Error: {str(e)}"
                    )
                    continue

            # Only finalize if we successfully processed at least one chunk
            if chunk_index > 0:
                try:
                    # Empty tokens list to finalize audio
                    async for chunk_data in self._process_chunk(
                        "",  # Empty text
                        [],  # Empty tokens
                        voice_name,
                        voice_path,
                        speed,
                        writer,
                        output_format,
                        is_first=False,
                        is_last=True,  # Signal this is the last chunk
                        normalizer=stream_normalizer,
                        lang_code=pipeline_lang_code,  # Pass lang_code
                    ):
                        if chunk_data.output is not None:
                            yield chunk_data
                except Exception as e:
                    logger.error(f"Failed to finalize audio stream: {str(e)}")

        except Exception as e:
            logger.error(f"Error in phoneme audio generation: {str(e)}")
            raise e

    async def generate_audio(
        self,
        text: str,
        voice: str,
        writer: StreamingAudioWriter,
        speed: float = 1.0,
        return_timestamps: bool = False,
        normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
        lang_code: Optional[str] = None,
    ) -> AudioChunk:
        """Generate complete audio for text using streaming internally."""
        audio_data_chunks = []

        try:
            async for audio_stream_data in self.generate_audio_stream(
                text,
                voice,
                writer,
                speed=speed,
                normalization_options=normalization_options,
                return_timestamps=return_timestamps,
                lang_code=lang_code,
                output_format=None,
            ):
                if len(audio_stream_data.audio) > 0:
                    audio_data_chunks.append(audio_stream_data)

            combined_audio_data = AudioChunk.combine(audio_data_chunks)
            return combined_audio_data
        except Exception as e:
            logger.error(f"Error in audio generation: {str(e)}")
            raise

    async def combine_voices(self, voices: List[str]) -> torch.Tensor:
        """Combine multiple voices.

        Returns:
            Combined voice tensor
        """

        return await self._voice_manager.combine_voices(voices)

    async def list_voices(self) -> List[str]:
        """List available voices."""
        return await self._voice_manager.list_voices()

    async def generate_from_phonemes(
        self,
        phonemes: str,
        voice: str,
        speed: float = 1.0,
        lang_code: Optional[str] = None,
    ) -> Tuple[np.ndarray, float]:
        """Generate audio directly from phonemes.

        Args:
            phonemes: Phonemes in Kokoro format
            voice: Voice name
            speed: Speed multiplier
            lang_code: Optional language code override

        Returns:
            Tuple of (audio array, processing time)
        """
        start_time = time.time()
        try:
            # Get backend and voice path
            backend = self.model_manager.get_backend()
            voice_name, voice_path = await self._get_voices_path(voice)

            if isinstance(backend, KokoroV1):
                # For Kokoro V1, use generate_from_tokens with raw phonemes
                result = None
                # Use provided lang_code or determine from voice name
                pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
                logger.info(
                    f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline"
                )

                try:
                    # Use backend's pipeline management
                    for r in backend._get_pipeline(
                        pipeline_lang_code
                    ).generate_from_tokens(
                        tokens=phonemes,  # Pass raw phonemes string
                        voice=voice_path,
                        speed=speed,
                    ):
                        if r.audio is not None:
                            result = r
                            break
                except Exception as e:
                    logger.error(f"Failed to generate from phonemes: {e}")
                    raise RuntimeError(f"Phoneme generation failed: {e}")

                if result is None or result.audio is None:
                    raise ValueError("No audio generated")

                processing_time = time.time() - start_time
                return result.audio.numpy(), processing_time
            else:
                raise ValueError(
                    "Phoneme generation only supported with Kokoro V1 backend"
                )

        except Exception as e:
            logger.error(f"Error in phoneme audio generation: {str(e)}")
            raise
api/src/structures/__init__.py
ADDED
@@ -0,0 +1,17 @@
from .schemas import (
    CaptionedSpeechRequest,
    CaptionedSpeechResponse,
    OpenAISpeechRequest,
    TTSStatus,
    VoiceCombineRequest,
    WordTimestamp,
)

__all__ = [
    "OpenAISpeechRequest",
    "CaptionedSpeechRequest",
    "CaptionedSpeechResponse",
    "WordTimestamp",
    "TTSStatus",
    "VoiceCombineRequest",
]
api/src/structures/custom_responses.py
ADDED
@@ -0,0 +1,50 @@
import json
import typing
from collections.abc import AsyncIterable, Iterable

from pydantic import BaseModel
from starlette.background import BackgroundTask
from starlette.concurrency import iterate_in_threadpool
from starlette.responses import JSONResponse, StreamingResponse


class JSONStreamingResponse(StreamingResponse, JSONResponse):
    """StreamingResponse that also renders each item as JSON."""

    def __init__(
        self,
        content: Iterable | AsyncIterable,
        status_code: int = 200,
        headers: dict[str, str] | None = None,
        media_type: str | None = None,
        background: BackgroundTask | None = None,
    ) -> None:
        if isinstance(content, AsyncIterable):
            self._content_iterable: AsyncIterable = content
        else:
            self._content_iterable = iterate_in_threadpool(content)

        async def body_iterator() -> AsyncIterable[bytes]:
            async for content_ in self._content_iterable:
                if isinstance(content_, BaseModel):
                    content_ = content_.model_dump()
                yield self.render(content_)

        self.body_iterator = body_iterator()
        self.status_code = status_code
        if media_type is not None:
            self.media_type = media_type
        self.background = background
        self.init_headers(headers)

    def render(self, content: typing.Any) -> bytes:
        return (
            json.dumps(
                content,
                ensure_ascii=False,
                allow_nan=False,
                indent=None,
                separators=(",", ":"),
            )
            + "\n"
        ).encode("utf-8")
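A hedged usage sketch (not part of the commit) of JSONStreamingResponse: returning newline-delimited JSON from a route that yields Pydantic models. The app and Item model here are assumptions for illustration only.

    from fastapi import FastAPI
    from pydantic import BaseModel

    app = FastAPI()

    class Item(BaseModel):
        n: int

    async def items():
        for n in range(3):
            yield Item(n=n)  # each model is rendered as one JSON line

    @app.get("/items")
    async def stream_items():
        return JSONStreamingResponse(items(), media_type="application/json")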
api/src/structures/model_schemas.py
ADDED
@@ -0,0 +1,16 @@
"""Voice configuration schemas."""

from pydantic import BaseModel, Field


class VoiceConfig(BaseModel):
    """Voice configuration."""

    use_cache: bool = Field(True, description="Whether to cache loaded voices")
    cache_size: int = Field(3, description="Number of voices to cache")
    validate_on_load: bool = Field(
        True, description="Whether to validate voices when loading"
    )

    class Config:
        frozen = True  # Make config immutable
api/src/structures/schemas.py
ADDED
@@ -0,0 +1,158 @@
from enum import Enum
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field


class VoiceCombineRequest(BaseModel):
    """Request schema for voice combination endpoint that accepts either a string with + or a list"""

    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )


class TTSStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    DELETED = "deleted"  # For files removed by cleanup


# OpenAI-compatible schemas
class WordTimestamp(BaseModel):
    """Word-level timestamp information"""

    word: str = Field(..., description="The word or token")
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")


class CaptionedSpeechResponse(BaseModel):
    """Response schema for captioned speech endpoint"""

    audio: str = Field(..., description="The generated audio data encoded in base64")
    audio_format: str = Field(..., description="The format of the output audio")
    timestamps: Optional[List[WordTimestamp]] = Field(
        ..., description="Word-level timestamps"
    )


class NormalizationOptions(BaseModel):
    """Options for the normalization system"""

    normalize: bool = Field(
        default=True,
        description="Normalizes input text to make it easier for the model to say",
    )
    unit_normalization: bool = Field(
        default=False, description="Transforms units like 10KB to 10 kilobytes"
    )
    url_normalization: bool = Field(
        default=True,
        description="Changes URLs so they can be properly pronounced by Kokoro",
    )
    email_normalization: bool = Field(
        default=True,
        description="Changes emails so they can be properly pronounced by Kokoro",
    )
    optional_pluralization_normalization: bool = Field(
        default=True,
        description="Replaces (s) with s so some words get pronounced correctly",
    )
    phone_normalization: bool = Field(
        default=True,
        description="Changes phone numbers so they can be properly pronounced by Kokoro",
    )


class OpenAISpeechRequest(BaseModel):
    """Request schema for OpenAI-compatible speech endpoint"""

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = (
        Field(
            default=None,
            description="Optional different format for the final download. If not provided, uses response_format.",
        )
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )


class CaptionedSpeechRequest(BaseModel):
    """Request schema for captioned speech endpoint"""

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )
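An illustrative sketch (not from the repository) of how OpenAISpeechRequest fills in its defaults; only input is required.

    req = OpenAISpeechRequest(input="Hello world", stream=False)
    print(req.voice, req.response_format, req.speed)  # af_heart mp3 1.0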
api/src/structures/text_schemas.py
ADDED
@@ -0,0 +1,41 @@
from typing import List, Optional, Union

from pydantic import BaseModel, Field, field_validator


class PhonemeRequest(BaseModel):
    text: str
    language: str = "a"  # Default to American English


class PhonemeResponse(BaseModel):
    phonemes: str
    tokens: list[int]


class StitchOptions(BaseModel):
    """Options for stitching audio chunks together"""

    gap_method: str = Field(
        default="static_trim",
        description="Method to handle gaps between chunks. Currently only 'static_trim' supported.",
    )
    trim_ms: int = Field(
        default=0,
        ge=0,
        description="Milliseconds to trim from chunk boundaries when using static_trim",
    )

    @field_validator("gap_method")
    @classmethod
    def validate_gap_method(cls, v: str) -> str:
        if v != "static_trim":
            raise ValueError("Currently only 'static_trim' gap method is supported")
        return v


class GenerateFromPhonemesRequest(BaseModel):
    """Simple request for phoneme-to-speech generation"""

    phonemes: str = Field(..., description="Phoneme string to synthesize")
    voice: str = Field(..., description="Voice ID to use for generation")
api/tests/__init__.py
ADDED
@@ -0,0 +1 @@
# Make tests directory a Python package
api/tests/conftest.py
ADDED
@@ -0,0 +1,71 @@
import os
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import numpy as np
import pytest
import pytest_asyncio
import torch

from api.src.inference.model_manager import ModelManager
from api.src.inference.voice_manager import VoiceManager
from api.src.services.tts_service import TTSService
from api.src.structures.model_schemas import VoiceConfig


@pytest.fixture
def mock_voice_tensor():
    """Load a real voice tensor for testing."""
    voice_path = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "src/voices/af_bella.pt"
    )
    return torch.load(voice_path, map_location="cpu", weights_only=False)


@pytest.fixture
def mock_audio_output():
    """Load pre-generated test audio for consistent testing."""
    test_audio_path = os.path.join(
        os.path.dirname(__file__), "test_data/test_audio.npy"
    )
    return np.load(test_audio_path)  # Return as numpy array instead of bytes


@pytest_asyncio.fixture
async def mock_model_manager(mock_audio_output):
    """Mock model manager for testing."""
    manager = AsyncMock(spec=ModelManager)
    manager.get_backend = MagicMock()

    async def mock_generate(*args, **kwargs):
        # Simulate successful audio generation
        return np.random.rand(24000).astype(np.float32)  # 1 second of random audio data

    manager.generate = AsyncMock(side_effect=mock_generate)
    return manager


@pytest_asyncio.fixture
async def mock_voice_manager(mock_voice_tensor):
    """Mock voice manager for testing."""
    manager = AsyncMock(spec=VoiceManager)
    manager.get_voice_path = MagicMock(return_value="/mock/path/voice.pt")
    manager.load_voice = AsyncMock(return_value=mock_voice_tensor)
    manager.list_voices = AsyncMock(return_value=["voice1", "voice2"])
    manager.combine_voices = AsyncMock(return_value="voice1_voice2")
    return manager


@pytest_asyncio.fixture
async def tts_service(mock_model_manager, mock_voice_manager):
    """Get mocked TTS service instance."""
    service = TTSService()
    service.model_manager = mock_model_manager
    service._voice_manager = mock_voice_manager
    return service


@pytest.fixture
def test_voice():
    """Return a test voice name."""
    return "voice1"
api/tests/test_audio_service.py
ADDED
@@ -0,0 +1,256 @@
"""Tests for AudioService"""

from unittest.mock import patch

import numpy as np
import pytest

from api.src.inference.base import AudioChunk
from api.src.services.audio import AudioNormalizer, AudioService
from api.src.services.streaming_audio_writer import StreamingAudioWriter


@pytest.fixture(autouse=True)
def mock_settings():
    """Mock settings for all tests"""
    with patch("api.src.services.audio.settings") as mock_settings:
        mock_settings.gap_trim_ms = 250
        yield mock_settings


@pytest.fixture
def sample_audio():
    """Generate a simple sine wave for testing"""
    sample_rate = 24000
    duration = 0.1  # 100ms
    t = np.linspace(0, duration, int(sample_rate * duration))
    frequency = 440  # A4 note
    return np.sin(2 * np.pi * frequency * t).astype(np.float32), sample_rate


@pytest.mark.asyncio
async def test_convert_to_wav(sample_audio):
    """Test converting to WAV format"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("wav", sample_rate=24000)
    # Write and finalize in one step for WAV
    audio_chunk = await AudioService.convert_audio(
        AudioChunk(audio_data), "wav", writer, is_last_chunk=False
    )

    writer.close()

    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0
    # Check WAV header
    assert audio_chunk.output.startswith(b"RIFF")
    assert b"WAVE" in audio_chunk.output[:12]


@pytest.mark.asyncio
async def test_convert_to_mp3(sample_audio):
    """Test converting to MP3 format"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("mp3", sample_rate=24000)

    audio_chunk = await AudioService.convert_audio(
        AudioChunk(audio_data), "mp3", writer
    )

    writer.close()

    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0
    # Check MP3 header (ID3 or MPEG frame sync)
    assert audio_chunk.output.startswith(b"ID3") or audio_chunk.output.startswith(
        b"\xff\xfb"
    )


@pytest.mark.asyncio
async def test_convert_to_opus(sample_audio):
    """Test converting to Opus format"""

    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("opus", sample_rate=24000)

    audio_chunk = await AudioService.convert_audio(
        AudioChunk(audio_data), "opus", writer
    )

    writer.close()

    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0
    # Check OGG header
    assert audio_chunk.output.startswith(b"OggS")


@pytest.mark.asyncio
async def test_convert_to_flac(sample_audio):
    """Test converting to FLAC format"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("flac", sample_rate=24000)

    audio_chunk = await AudioService.convert_audio(
        AudioChunk(audio_data), "flac", writer
    )

    writer.close()

    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0
    # Check FLAC header
    assert audio_chunk.output.startswith(b"fLaC")


@pytest.mark.asyncio
async def test_convert_to_aac(sample_audio):
    """Test converting to AAC format"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("aac", sample_rate=24000)

    audio_chunk = await AudioService.convert_audio(
        AudioChunk(audio_data), "aac", writer
    )

    writer.close()

    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0
    # Check ADTS header (AAC)
    assert audio_chunk.output.startswith(b"\xff\xf0") or audio_chunk.output.startswith(
        b"\xff\xf1"
    )


@pytest.mark.asyncio
async def test_convert_to_pcm(sample_audio):
    """Test converting to PCM format"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("pcm", sample_rate=24000)

    audio_chunk = await AudioService.convert_audio(
        AudioChunk(audio_data), "pcm", writer
    )

    writer.close()

    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0
    # PCM is raw bytes, so no header to check


@pytest.mark.asyncio
async def test_convert_to_invalid_format_raises_error(sample_audio):
    """Test that converting to an invalid format raises an error"""
    # audio_data, sample_rate = sample_audio
    with pytest.raises(ValueError, match="Unsupported format: invalid"):
        writer = StreamingAudioWriter("invalid", sample_rate=24000)


@pytest.mark.asyncio
async def test_normalization_wav(sample_audio):
    """Test that WAV output is properly normalized to int16 range"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("wav", sample_rate=24000)

    # Create audio data outside int16 range
    large_audio = audio_data * 1e5
    # Write and finalize in one step for WAV
    audio_chunk = await AudioService.convert_audio(
        AudioChunk(large_audio), "wav", writer
    )

    writer.close()

    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0


@pytest.mark.asyncio
async def test_normalization_pcm(sample_audio):
    """Test that PCM output is properly normalized to int16 range"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("pcm", sample_rate=24000)

    # Create audio data outside int16 range
    large_audio = audio_data * 1e5
    audio_chunk = await AudioService.convert_audio(
        AudioChunk(large_audio), "pcm", writer
    )
    assert isinstance(audio_chunk.output, bytes)
    assert isinstance(audio_chunk, AudioChunk)
    assert len(audio_chunk.output) > 0


@pytest.mark.asyncio
async def test_invalid_audio_data():
    """Test handling of invalid audio data"""
    invalid_audio = np.array([])  # Empty array
    sample_rate = 24000

    writer = StreamingAudioWriter("wav", sample_rate=24000)

    with pytest.raises(ValueError):
        await AudioService.convert_audio(invalid_audio, sample_rate, "wav", writer)


@pytest.mark.asyncio
async def test_different_sample_rates(sample_audio):
    """Test converting audio with different sample rates"""
    audio_data, _ = sample_audio
    sample_rates = [8000, 16000, 44100, 48000]

    for rate in sample_rates:
        writer = StreamingAudioWriter("wav", sample_rate=rate)

        audio_chunk = await AudioService.convert_audio(
            AudioChunk(audio_data), "wav", writer
        )

        writer.close()

        assert isinstance(audio_chunk.output, bytes)
        assert isinstance(audio_chunk, AudioChunk)
        assert len(audio_chunk.output) > 0


@pytest.mark.asyncio
async def test_buffer_position_after_conversion(sample_audio):
    """Test that buffer position is reset after writing"""
    audio_data, sample_rate = sample_audio

    writer = StreamingAudioWriter("wav", sample_rate=24000)

    # Write and finalize in one step for first conversion
    audio_chunk1 = await AudioService.convert_audio(
        AudioChunk(audio_data), "wav", writer, is_last_chunk=True
    )
    assert isinstance(audio_chunk1.output, bytes)
    assert isinstance(audio_chunk1, AudioChunk)
    # Convert again to ensure buffer was properly reset

    writer = StreamingAudioWriter("wav", sample_rate=24000)

    audio_chunk2 = await AudioService.convert_audio(
        AudioChunk(audio_data), "wav", writer, is_last_chunk=True
    )
    assert isinstance(audio_chunk2.output, bytes)
    assert isinstance(audio_chunk2, AudioChunk)
    assert len(audio_chunk1.output) == len(audio_chunk2.output)
api/tests/test_data/generate_test_data.py
ADDED
@@ -0,0 +1,23 @@
import os

import numpy as np


def generate_test_audio():
    """Generate test audio data - 1 second of a 440 Hz tone"""
    # Create 1 second of silence at 24kHz
    audio = np.zeros(24000, dtype=np.float32)

    # Add a simple sine wave to make it non-zero
    t = np.linspace(0, 1, 24000)
    audio += 0.5 * np.sin(2 * np.pi * 440 * t)  # 440 Hz tone at half amplitude

    # Create test_data directory if it doesn't exist
    os.makedirs("api/tests/test_data", exist_ok=True)

    # Save the test audio
    np.save("api/tests/test_data/test_audio.npy", audio)


if __name__ == "__main__":
    generate_test_audio()
api/tests/test_data/test_audio.npy
ADDED
Binary file (96.1 kB).
api/tests/test_development.py
ADDED
@@ -0,0 +1,34 @@
import base64
import json
from unittest.mock import MagicMock, patch

import pytest
import requests


def test_generate_captioned_speech():
    """Test the generate_captioned_speech function with mocked responses"""
    # Mock the API responses
    mock_audio_response = MagicMock()
    mock_audio_response.status_code = 200

    mock_timestamps_response = MagicMock()
    mock_timestamps_response.status_code = 200
    mock_timestamps_response.content = json.dumps(
        {
            "audio": base64.b64encode(b"mock audio data").decode("utf-8"),
            "timestamps": [{"word": "test", "start_time": 0.0, "end_time": 1.0}],
        }
    )

    # Patch the HTTP requests
    with patch("requests.post", return_value=mock_timestamps_response):
        # Import here to avoid module-level import issues
        from examples.captioned_speech_example import generate_captioned_speech

        # Test the function
        audio, timestamps = generate_captioned_speech("test text")

        # Verify we got both audio and timestamps
        assert audio == b"mock audio data"
        assert timestamps == [{"word": "test", "start_time": 0.0, "end_time": 1.0}]
api/tests/test_kokoro_v1.py
ADDED
@@ -0,0 +1,165 @@
from unittest.mock import ANY, MagicMock, patch

import numpy as np
import pytest
import torch

from api.src.inference.kokoro_v1 import KokoroV1


@pytest.fixture
def kokoro_backend():
    """Create a KokoroV1 instance for testing."""
    return KokoroV1()


def test_initial_state(kokoro_backend):
    """Test initial state of KokoroV1."""
    assert not kokoro_backend.is_loaded
    assert kokoro_backend._model is None
    assert kokoro_backend._pipelines == {}  # Now using dict of pipelines
    # Device should be set based on settings
    assert kokoro_backend.device in ["cuda", "cpu"]


@patch("torch.cuda.is_available", return_value=True)
@patch("torch.cuda.memory_allocated", return_value=5e9)
def test_memory_management(mock_memory, mock_cuda, kokoro_backend):
    """Test GPU memory management functions."""
    # Patch backend so it thinks we have cuda
    with patch.object(kokoro_backend, "_device", "cuda"):
        # Test memory check
        with patch("api.src.inference.kokoro_v1.model_config") as mock_config:
            mock_config.pytorch_gpu.memory_threshold = 4
            assert kokoro_backend._check_memory() == True

            mock_config.pytorch_gpu.memory_threshold = 6
            assert kokoro_backend._check_memory() == False


@patch("torch.cuda.empty_cache")
@patch("torch.cuda.synchronize")
def test_clear_memory(mock_sync, mock_clear, kokoro_backend):
    """Test memory clearing."""
    with patch.object(kokoro_backend, "_device", "cuda"):
        kokoro_backend._clear_memory()
        mock_clear.assert_called_once()
        mock_sync.assert_called_once()


@pytest.mark.asyncio
async def test_load_model_validation(kokoro_backend):
    """Test model loading validation."""
    with pytest.raises(RuntimeError, match="Failed to load Kokoro model"):
        await kokoro_backend.load_model("nonexistent_model.pth")


def test_unload_with_pipelines(kokoro_backend):
    """Test model unloading with multiple pipelines."""
    # Mock loaded state with multiple pipelines
    kokoro_backend._model = MagicMock()
    pipeline_a = MagicMock()
    pipeline_e = MagicMock()
    kokoro_backend._pipelines = {"a": pipeline_a, "e": pipeline_e}
    assert kokoro_backend.is_loaded

    # Test unload
    kokoro_backend.unload()
    assert not kokoro_backend.is_loaded
    assert kokoro_backend._model is None
    assert kokoro_backend._pipelines == {}  # All pipelines should be cleared


@pytest.mark.asyncio
async def test_generate_validation(kokoro_backend):
    """Test generation validation."""
    with pytest.raises(RuntimeError, match="Model not loaded"):
        async for _ in kokoro_backend.generate("test", "voice"):
            pass


@pytest.mark.asyncio
async def test_generate_from_tokens_validation(kokoro_backend):
    """Test token generation validation."""
    with pytest.raises(RuntimeError, match="Model not loaded"):
        async for _ in kokoro_backend.generate_from_tokens("test tokens", "voice"):
            pass


def test_get_pipeline_creates_new(kokoro_backend):
    """Test that _get_pipeline creates new pipeline for new language code."""
    # Mock loaded state
    kokoro_backend._model = MagicMock()

    # Mock KPipeline
    mock_pipeline = MagicMock()
    with patch(
        "api.src.inference.kokoro_v1.KPipeline", return_value=mock_pipeline
    ) as mock_kpipeline:
        # Get pipeline for Spanish
        pipeline_e = kokoro_backend._get_pipeline("e")

        # Should create new pipeline with correct params
        mock_kpipeline.assert_called_once_with(
            lang_code="e", model=kokoro_backend._model, device=kokoro_backend._device
        )
        assert pipeline_e == mock_pipeline
        assert kokoro_backend._pipelines["e"] == mock_pipeline


def test_get_pipeline_reuses_existing(kokoro_backend):
    """Test that _get_pipeline reuses existing pipeline for same language code."""
    # Mock loaded state
    kokoro_backend._model = MagicMock()

    # Mock KPipeline
    mock_pipeline = MagicMock()
    with patch(
        "api.src.inference.kokoro_v1.KPipeline", return_value=mock_pipeline
    ) as mock_kpipeline:
        # Get pipeline twice for same language
        pipeline1 = kokoro_backend._get_pipeline("e")
        pipeline2 = kokoro_backend._get_pipeline("e")

        # Should only create pipeline once
        mock_kpipeline.assert_called_once()
        assert pipeline1 == pipeline2
        assert kokoro_backend._pipelines["e"] == mock_pipeline


@pytest.mark.asyncio
async def test_generate_uses_correct_pipeline(kokoro_backend):
    """Test that generate uses correct pipeline for language code."""
    # Mock loaded state
    kokoro_backend._model = MagicMock()

    # Mock voice path handling
    with (
        patch("api.src.core.paths.load_voice_tensor") as mock_load_voice,
        patch("api.src.core.paths.save_voice_tensor"),
        patch("tempfile.gettempdir") as mock_tempdir,
    ):
        mock_load_voice.return_value = torch.ones(1)
        mock_tempdir.return_value = "/tmp"

        # Mock KPipeline
        mock_pipeline = MagicMock()
        mock_pipeline.return_value = iter([])  # Empty generator for testing
        with patch("api.src.inference.kokoro_v1.KPipeline", return_value=mock_pipeline):
            # Generate with Spanish voice and explicit lang_code
            async for _ in kokoro_backend.generate("test", "ef_voice", lang_code="e"):
                pass

            # Should create pipeline with Spanish lang_code
            assert "e" in kokoro_backend._pipelines
            # Use ANY to match the temp file path since it's dynamic
            mock_pipeline.assert_called_with(
                "test",
                voice=ANY,  # Don't check exact path since it's dynamic
                speed=1.0,
                model=kokoro_backend._model,
            )
            # Verify the voice path is a temp file path
            call_args = mock_pipeline.call_args
            assert isinstance(call_args[1]["voice"], str)
            assert call_args[1]["voice"].startswith("/tmp/temp_voice_")
api/tests/test_normalizer.py
ADDED
@@ -0,0 +1,179 @@
"""Tests for text normalization service"""

import pytest

from api.src.services.text_processing.normalizer import normalize_text
from api.src.structures.schemas import NormalizationOptions


def test_url_protocols():
    """Test URL protocol handling"""
    assert (
        normalize_text(
            "Check out https://example.com",
            normalization_options=NormalizationOptions(),
        )
        == "Check out https example dot com"
    )
    assert (
        normalize_text(
            "Visit http://site.com", normalization_options=NormalizationOptions()
        )
        == "Visit http site dot com"
    )
    assert (
        normalize_text(
            "Go to https://test.org/path", normalization_options=NormalizationOptions()
        )
        == "Go to https test dot org slash path"
    )


def test_url_www():
    """Test www prefix handling"""
    assert (
        normalize_text(
            "Go to www.example.com", normalization_options=NormalizationOptions()
        )
        == "Go to www example dot com"
    )
    assert (
        normalize_text(
            "Visit www.test.org/docs", normalization_options=NormalizationOptions()
        )
        == "Visit www test dot org slash docs"
    )
    assert (
        normalize_text(
            "Check www.site.com?q=test", normalization_options=NormalizationOptions()
        )
        == "Check www site dot com question-mark q equals test"
    )


def test_url_localhost():
    """Test localhost URL handling"""
    assert (
        normalize_text(
            "Running on localhost:7860", normalization_options=NormalizationOptions()
        )
        == "Running on localhost colon 78 60"
    )
    assert (
        normalize_text(
            "Server at localhost:8080/api", normalization_options=NormalizationOptions()
        )
        == "Server at localhost colon 80 80 slash api"
    )
    assert (
        normalize_text(
            "Test localhost:3000/test?v=1", normalization_options=NormalizationOptions()
        )
        == "Test localhost colon 3000 slash test question-mark v equals 1"
    )


def test_url_ip_addresses():
    """Test IP address URL handling"""
    assert (
        normalize_text(
            "Access 0.0.0.0:9090/test", normalization_options=NormalizationOptions()
        )
        == "Access 0 dot 0 dot 0 dot 0 colon 90 90 slash test"
    )
    assert (
        normalize_text(
            "API at 192.168.1.1:8000", normalization_options=NormalizationOptions()
        )
        == "API at 192 dot 168 dot 1 dot 1 colon 8000"
    )
    assert (
        normalize_text("Server 127.0.0.1", normalization_options=NormalizationOptions())
        == "Server 127 dot 0 dot 0 dot 1"
    )


def test_url_raw_domains():
    """Test raw domain handling"""
    assert (
        normalize_text(
            "Visit google.com/search", normalization_options=NormalizationOptions()
        )
        == "Visit google dot com slash search"
    )
    assert (
        normalize_text(
            "Go to example.com/path?q=test",
            normalization_options=NormalizationOptions(),
        )
        == "Go to example dot com slash path question-mark q equals test"
    )
    assert (
        normalize_text(
            "Check docs.test.com", normalization_options=NormalizationOptions()
        )
        == "Check docs dot test dot com"
    )


def test_url_email_addresses():
    """Test email address handling"""
    assert (
        normalize_text(
            "Email me at user@example.com", normalization_options=NormalizationOptions()
        )
        == "Email me at user at example dot com"
    )
    assert (
        normalize_text(
            "Contact admin@test.org", normalization_options=NormalizationOptions()
        )
        == "Contact admin at test dot org"
    )
    assert (
        normalize_text(
            "Send to test.user@site.com", normalization_options=NormalizationOptions()
        )
        == "Send to test dot user at site dot com"
    )


def test_money():
    """Test that money text is normalized correctly"""
    assert (
        normalize_text(
            "He lost $5.3 thousand.", normalization_options=NormalizationOptions()
        )
        == "He lost five point three thousand dollars."
    )
    assert (
        normalize_text(
            "To put it weirdly -$6.9 million",
            normalization_options=NormalizationOptions(),
        )
        == "To put it weirdly minus six point nine million dollars"
    )
    assert (
        normalize_text("It costs $50.3.", normalization_options=NormalizationOptions())
        == "It costs fifty dollars and thirty cents."
    )


def test_non_url_text():
    """Test that non-URL text is unaffected"""
    assert (
        normalize_text(
            "This is not.a.url text", normalization_options=NormalizationOptions()
        )
        == "This is not-a-url text"
    )
    assert (
        normalize_text(
            "Hello, how are you today?", normalization_options=NormalizationOptions()
        )
        == "Hello, how are you today?"
    )
    assert (
        normalize_text("It costs $50.", normalization_options=NormalizationOptions())
        == "It costs fifty dollars."
    )
api/tests/test_openai_endpoints.py
ADDED
@@ -0,0 +1,499 @@
import asyncio
import json
import os
from typing import AsyncGenerator, Tuple
from unittest.mock import AsyncMock, MagicMock, patch

import numpy as np
import pytest
from fastapi.testclient import TestClient

from api.src.core.config import settings
from api.src.inference.base import AudioChunk
from api.src.main import app
from api.src.routers.openai_compatible import (
    get_tts_service,
    load_openai_mappings,
    stream_audio_chunks,
)
from api.src.services.streaming_audio_writer import StreamingAudioWriter
from api.src.services.tts_service import TTSService
from api.src.structures.schemas import OpenAISpeechRequest

client = TestClient(app)


@pytest.fixture
def test_voice():
    """Fixture providing a test voice name."""
    return "test_voice"


@pytest.fixture
def mock_openai_mappings():
    """Mock OpenAI mappings for testing."""
    with patch(
        "api.src.routers.openai_compatible._openai_mappings",
        {
            "models": {"tts-1": "kokoro-v1_0", "tts-1-hd": "kokoro-v1_0"},
            "voices": {"alloy": "am_adam", "nova": "bf_isabella"},
        },
    ):
        yield


@pytest.fixture
def mock_json_file(tmp_path):
    """Create a temporary mock JSON file."""
    content = {
        "models": {"test-model": "test-kokoro"},
        "voices": {"test-voice": "test-internal"},
    }
    json_file = tmp_path / "test_mappings.json"
    json_file.write_text(json.dumps(content))
    return json_file


def test_load_openai_mappings(mock_json_file):
    """Test loading OpenAI mappings from JSON file"""
    with patch("os.path.join", return_value=str(mock_json_file)):
        mappings = load_openai_mappings()
        assert "models" in mappings
        assert "voices" in mappings
        assert mappings["models"]["test-model"] == "test-kokoro"
        assert mappings["voices"]["test-voice"] == "test-internal"


def test_load_openai_mappings_file_not_found():
    """Test handling of missing mappings file"""
    with patch("os.path.join", return_value="/nonexistent/path"):
        mappings = load_openai_mappings()
        assert mappings == {"models": {}, "voices": {}}


def test_list_models(mock_openai_mappings):
    """Test listing available models endpoint"""
    response = client.get("/v1/models")
    assert response.status_code == 200
    data = response.json()
    assert data["object"] == "list"
    assert isinstance(data["data"], list)
    assert len(data["data"]) == 3  # tts-1, tts-1-hd, and kokoro

    # Verify all expected models are present
    model_ids = [model["id"] for model in data["data"]]
    assert "tts-1" in model_ids
    assert "tts-1-hd" in model_ids
    assert "kokoro" in model_ids

    # Verify model format
    for model in data["data"]:
        assert model["object"] == "model"
        assert "created" in model
        assert model["owned_by"] == "kokoro"


def test_retrieve_model(mock_openai_mappings):
    """Test retrieving a specific model endpoint"""
    # Test successful model retrieval
    response = client.get("/v1/models/tts-1")
    assert response.status_code == 200
    data = response.json()
    assert data["id"] == "tts-1"
    assert data["object"] == "model"
    assert data["owned_by"] == "kokoro"
    assert "created" in data

    # Test non-existent model
    response = client.get("/v1/models/nonexistent-model")
    assert response.status_code == 404
    error = response.json()
    assert error["detail"]["error"] == "model_not_found"
    assert "not found" in error["detail"]["message"]
    assert error["detail"]["type"] == "invalid_request_error"


@pytest.mark.asyncio
async def test_get_tts_service_initialization():
    """Test TTSService initialization"""
    with patch("api.src.routers.openai_compatible._tts_service", None):
        with patch("api.src.routers.openai_compatible._init_lock", None):
            with patch("api.src.services.tts_service.TTSService.create") as mock_create:
                mock_service = AsyncMock()
                mock_create.return_value = mock_service

                # Test concurrent access
                async def get_service():
                    return await get_tts_service()

                # Create multiple concurrent requests
                tasks = [get_service() for _ in range(5)]
                results = await asyncio.gather(*tasks)

                # Verify service was created only once
                mock_create.assert_called_once()
                assert all(r == mock_service for r in results)


@pytest.mark.asyncio
async def test_stream_audio_chunks_client_disconnect():
    """Test handling of client disconnect during streaming"""
    mock_request = MagicMock()
    mock_request.is_disconnected = AsyncMock(return_value=True)

    mock_service = AsyncMock()

    async def mock_stream(*args, **kwargs):
        for i in range(5):
            yield AudioChunk(np.ndarray([], np.int16), output=b"chunk")

    mock_service.generate_audio_stream = mock_stream
    mock_service.list_voices.return_value = ["test_voice"]

    request = OpenAISpeechRequest(
        model="kokoro",
        input="Test text",
        voice="test_voice",
        response_format="mp3",
        stream=True,
        speed=1.0,
    )

    writer = StreamingAudioWriter("mp3", 24000)

    chunks = []
    async for chunk in stream_audio_chunks(mock_service, request, mock_request, writer):
        chunks.append(chunk)

    writer.close()

    assert len(chunks) == 0  # Should stop immediately due to disconnect


def test_openai_voice_mapping(mock_tts_service, mock_openai_mappings):
    """Test OpenAI voice name mapping"""
    mock_tts_service.list_voices.return_value = ["am_adam", "bf_isabella"]

    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "tts-1",
            "input": "Hello world",
            "voice": "alloy",  # OpenAI voice name
            "response_format": "mp3",
            "stream": False,
        },
    )
    assert response.status_code == 200
    mock_tts_service.generate_audio.assert_called_once()
    assert mock_tts_service.generate_audio.call_args[1]["voice"] == "am_adam"


def test_openai_voice_mapping_streaming(
    mock_tts_service, mock_openai_mappings, mock_audio_bytes
):
    """Test OpenAI voice mapping in streaming mode"""
    mock_tts_service.list_voices.return_value = ["am_adam", "bf_isabella"]

    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "tts-1-hd",
            "input": "Hello world",
            "voice": "nova",  # OpenAI voice name
            "response_format": "mp3",
            "stream": True,
        },
    )
    assert response.status_code == 200
    content = b""
    for chunk in response.iter_bytes():
        content += chunk
    assert content == mock_audio_bytes


def test_invalid_openai_model(mock_tts_service, mock_openai_mappings):
    """Test error handling for invalid OpenAI model"""
    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "invalid-model",
            "input": "Hello world",
            "voice": "alloy",
            "response_format": "mp3",
            "stream": False,
        },
    )
    assert response.status_code == 400
    error_response = response.json()
    assert error_response["detail"]["error"] == "invalid_model"
    assert "Unsupported model" in error_response["detail"]["message"]


@pytest.fixture
def mock_audio_bytes():
    """Mock audio bytes for testing."""
    return b"mock audio data"


@pytest.fixture
def mock_tts_service(mock_audio_bytes):
    """Mock TTS service for testing."""
    with patch("api.src.routers.openai_compatible.get_tts_service") as mock_get:
        service = AsyncMock(spec=TTSService)
        service.generate_audio.return_value = AudioChunk(np.zeros(1000, np.int16))

        async def mock_stream(*args, **kwargs) -> AsyncGenerator[AudioChunk, None]:
            yield AudioChunk(np.ndarray([], np.int16), output=mock_audio_bytes)

        service.generate_audio_stream = mock_stream
        service.list_voices.return_value = ["test_voice", "voice1", "voice2"]
        service.combine_voices.return_value = "voice1_voice2"

        mock_get.return_value = service
        mock_get.side_effect = None
        yield service


@patch("api.src.services.audio.AudioService.convert_audio")
def test_openai_speech_endpoint(
    mock_convert, mock_tts_service, test_voice, mock_audio_bytes
):
    """Test the OpenAI-compatible speech endpoint with basic MP3 generation"""
    # Configure mocks
    mock_tts_service.generate_audio.return_value = AudioChunk(np.zeros(1000, np.int16))
    mock_convert.return_value = AudioChunk(
        np.zeros(1000, np.int16), output=mock_audio_bytes
    )

    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello world",
            "voice": test_voice,
            "response_format": "mp3",
            "stream": False,
        },
    )
    assert response.status_code == 200
    assert response.headers["content-type"] == "audio/mpeg"
    assert len(response.content) > 0
    assert response.content == mock_audio_bytes + mock_audio_bytes

    mock_tts_service.generate_audio.assert_called_once()
    assert mock_convert.call_count == 2


def test_openai_speech_streaming(mock_tts_service, test_voice, mock_audio_bytes):
    """Test the OpenAI-compatible speech endpoint with streaming"""
    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello world",
            "voice": test_voice,
            "response_format": "mp3",
            "stream": True,
        },
    )
    assert response.status_code == 200
    assert response.headers["content-type"] == "audio/mpeg"
    assert "Transfer-Encoding" in response.headers
    assert response.headers["Transfer-Encoding"] == "chunked"

    content = b""
    for chunk in response.iter_bytes():
        content += chunk
    assert content == mock_audio_bytes


def test_openai_speech_pcm_streaming(mock_tts_service, test_voice, mock_audio_bytes):
    """Test PCM streaming format"""
    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello world",
            "voice": test_voice,
            "response_format": "pcm",
            "stream": True,
        },
    )
    assert response.status_code == 200
    assert response.headers["content-type"] == "audio/pcm"

    content = b""
    for chunk in response.iter_bytes():
        content += chunk
    assert content == mock_audio_bytes


def test_openai_speech_invalid_voice(mock_tts_service):
    """Test error handling for invalid voice"""
    mock_tts_service.generate_audio.side_effect = ValueError(
        "Voice 'invalid_voice' not found"
    )

    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello world",
            "voice": "invalid_voice",
            "response_format": "mp3",
            "stream": False,
        },
    )
    assert response.status_code == 400
    error_response = response.json()
    assert error_response["detail"]["error"] == "validation_error"
    assert "Voice 'invalid_voice' not found" in error_response["detail"]["message"]
    assert error_response["detail"]["type"] == "invalid_request_error"


def test_openai_speech_empty_text(mock_tts_service, test_voice):
    """Test error handling for empty text"""

    async def mock_error_stream(*args, **kwargs):
        raise ValueError("Text is empty after preprocessing")

    mock_tts_service.generate_audio = mock_error_stream
    mock_tts_service.list_voices.return_value = ["test_voice"]

    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "",
            "voice": test_voice,
            "response_format": "mp3",
            "stream": False,
        },
    )
    assert response.status_code == 400
    error_response = response.json()
    assert error_response["detail"]["error"] == "validation_error"
    assert "Text is empty after preprocessing" in error_response["detail"]["message"]
    assert error_response["detail"]["type"] == "invalid_request_error"


def test_openai_speech_invalid_format(mock_tts_service, test_voice):
    """Test error handling for invalid format"""
    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello world",
            "voice": test_voice,
            "response_format": "invalid_format",
            "stream": False,
        },
    )
    assert response.status_code == 422  # Validation error from Pydantic


def test_list_voices(mock_tts_service):
    """Test listing available voices"""
    # Override the mock for this specific test
    mock_tts_service.list_voices.return_value = ["voice1", "voice2"]

    response = client.get("/v1/audio/voices")
    assert response.status_code == 200
    data = response.json()
    assert "voices" in data
    assert len(data["voices"]) == 2
    assert "voice1" in data["voices"]
    assert "voice2" in data["voices"]


@patch("api.src.routers.openai_compatible.settings")
def test_combine_voices(mock_settings, mock_tts_service):
    """Test combining voices endpoint"""
    # Enable local voice saving for this test
    mock_settings.allow_local_voice_saving = True

    response = client.post("/v1/audio/voices/combine", json="voice1+voice2")
    assert response.status_code == 200
    assert response.headers["content-type"] == "application/octet-stream"
    assert "voice1+voice2.pt" in response.headers["content-disposition"]


def test_server_error(mock_tts_service, test_voice):
    """Test handling of server errors"""

    async def mock_error_stream(*args, **kwargs):
        raise RuntimeError("Internal server error")

    mock_tts_service.generate_audio = mock_error_stream
    mock_tts_service.list_voices.return_value = ["test_voice"]

    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello world",
            "voice": test_voice,
            "response_format": "mp3",
            "stream": False,
        },
    )
    assert response.status_code == 500
    error_response = response.json()
    assert error_response["detail"]["error"] == "processing_error"
    assert error_response["detail"]["type"] == "server_error"


def test_streaming_error(mock_tts_service, test_voice):
    """Test handling streaming errors"""
    # Mock process_voices to raise the error
    mock_tts_service.list_voices.side_effect = RuntimeError("Streaming failed")

    response = client.post(
        "/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello world",
            "voice": test_voice,
            "response_format": "mp3",
            "stream": True,
        },
    )

    assert response.status_code == 500
    error_data = response.json()
    assert error_data["detail"]["error"] == "processing_error"
    assert error_data["detail"]["type"] == "server_error"
    assert "Streaming failed" in error_data["detail"]["message"]


@pytest.mark.asyncio
async def test_streaming_initialization_error():
    """Test handling of streaming initialization errors"""
    mock_service = AsyncMock()

    async def mock_error_stream(*args, **kwargs):
        if False:  # This makes it a proper generator
            yield b""
        raise RuntimeError("Failed to initialize stream")

    mock_service.generate_audio_stream = mock_error_stream
    mock_service.list_voices.return_value = ["test_voice"]

    request = OpenAISpeechRequest(
        model="kokoro",
        input="Test text",
        voice="test_voice",
        response_format="mp3",
        stream=True,
        speed=1.0,
    )

    writer = StreamingAudioWriter("mp3", 24000)

    with pytest.raises(RuntimeError) as exc:
        async for _ in stream_audio_chunks(mock_service, request, MagicMock(), writer):
            pass

    writer.close()
    assert "Failed to initialize stream" in str(exc.value)
api/tests/test_paths.py
ADDED
@@ -0,0 +1,138 @@
import os
from unittest.mock import patch

import pytest

from api.src.core.paths import (
    _find_file,
    _scan_directories,
    get_content_type,
    get_temp_dir_size,
    get_temp_file_path,
    list_temp_files,
)


@pytest.mark.asyncio
async def test_find_file_exists():
    """Test finding existing file."""
    with patch("aiofiles.os.path.exists") as mock_exists:
        mock_exists.return_value = True
        path = await _find_file("test.txt", ["/test/path"])
        assert path == "/test/path/test.txt"


@pytest.mark.asyncio
async def test_find_file_not_exists():
    """Test finding non-existent file."""
    with patch("aiofiles.os.path.exists") as mock_exists:
        mock_exists.return_value = False
        with pytest.raises(FileNotFoundError, match="File not found"):
            await _find_file("test.txt", ["/test/path"])


@pytest.mark.asyncio
async def test_find_file_with_filter():
    """Test finding file with filter function."""
    with patch("aiofiles.os.path.exists") as mock_exists:
        mock_exists.return_value = True
        filter_fn = lambda p: p.endswith(".txt")
        path = await _find_file("test.txt", ["/test/path"], filter_fn)
        assert path == "/test/path/test.txt"


@pytest.mark.asyncio
async def test_scan_directories():
    """Test scanning directories."""
    mock_entry = type("MockEntry", (), {"name": "test.txt"})()

    with (
        patch("aiofiles.os.path.exists") as mock_exists,
        patch("aiofiles.os.scandir") as mock_scandir,
    ):
        mock_exists.return_value = True
        mock_scandir.return_value = [mock_entry]

        files = await _scan_directories(["/test/path"])
        assert "test.txt" in files


@pytest.mark.asyncio
async def test_get_content_type():
    """Test content type detection."""
    test_cases = [
        ("test.html", "text/html"),
        ("test.js", "application/javascript"),
        ("test.css", "text/css"),
        ("test.png", "image/png"),
        ("test.unknown", "application/octet-stream"),
    ]

    for filename, expected in test_cases:
        content_type = await get_content_type(filename)
        assert content_type == expected


@pytest.mark.asyncio
async def test_get_temp_file_path():
    """Test temp file path generation."""
    with (
        patch("aiofiles.os.path.exists") as mock_exists,
        patch("aiofiles.os.makedirs") as mock_makedirs,
    ):
        mock_exists.return_value = False

        path = await get_temp_file_path("test.wav")
        assert "test.wav" in path
        mock_makedirs.assert_called_once()


@pytest.mark.asyncio
async def test_list_temp_files():
    """Test listing temp files."""

    class MockEntry:
        def __init__(self, name):
            self.name = name

        def is_file(self):
            return True

    mock_entry = MockEntry("test.wav")

    with (
        patch("aiofiles.os.path.exists") as mock_exists,
        patch("aiofiles.os.scandir") as mock_scandir,
    ):
        mock_exists.return_value = True
        mock_scandir.return_value = [mock_entry]

        files = await list_temp_files()
        assert "test.wav" in files


@pytest.mark.asyncio
async def test_get_temp_dir_size():
    """Test getting temp directory size."""

    class MockEntry:
        def __init__(self, path):
            self.path = path

        def is_file(self):
            return True

    mock_entry = MockEntry("/tmp/test.wav")
    mock_stat = type("MockStat", (), {"st_size": 1024})()

    with (
        patch("aiofiles.os.path.exists") as mock_exists,
        patch("aiofiles.os.scandir") as mock_scandir,
        patch("aiofiles.os.stat") as mock_stat_fn,
    ):
        mock_exists.return_value = True
        mock_scandir.return_value = [mock_entry]
        mock_stat_fn.return_value = mock_stat

        size = await get_temp_dir_size()
        assert size == 1024
api/tests/test_text_processor.py
ADDED
@@ -0,0 +1,105 @@
import pytest

from api.src.services.text_processing.text_processor import (
    get_sentence_info,
    process_text_chunk,
    smart_split,
)


def test_process_text_chunk_basic():
    """Test basic text chunk processing."""
    text = "Hello world"
    tokens = process_text_chunk(text)
    assert isinstance(tokens, list)
    assert len(tokens) > 0


def test_process_text_chunk_empty():
    """Test processing empty text."""
    text = ""
    tokens = process_text_chunk(text)
    assert isinstance(tokens, list)
    assert len(tokens) == 0


def test_process_text_chunk_phonemes():
    """Test processing with skip_phonemize."""
    phonemes = "h @ l @U"  # Example phoneme sequence
    tokens = process_text_chunk(phonemes, skip_phonemize=True)
    assert isinstance(tokens, list)
    assert len(tokens) > 0


def test_get_sentence_info():
    """Test sentence splitting and info extraction."""
    text = "This is sentence one. This is sentence two! What about three?"
    results = get_sentence_info(text, {})

    assert len(results) == 3
    for sentence, tokens, count in results:
        assert isinstance(sentence, str)
        assert isinstance(tokens, list)
        assert isinstance(count, int)
        assert count == len(tokens)
        assert count > 0


def test_get_sentence_info_phenomoes():
    """Test sentence splitting and info extraction."""
    text = (
        "This is sentence one. This is </|custom_phonemes_0|/> two! What about three?"
    )
    results = get_sentence_info(text, {"</|custom_phonemes_0|/>": r"sˈɛntᵊns"})

    assert len(results) == 3
    assert "sˈɛntᵊns" in results[1][0]
    for sentence, tokens, count in results:
        assert isinstance(sentence, str)
        assert isinstance(tokens, list)
        assert isinstance(count, int)
        assert count == len(tokens)
        assert count > 0


@pytest.mark.asyncio
async def test_smart_split_short_text():
    """Test smart splitting with text under max tokens."""
    text = "This is a short test sentence."
    chunks = []
    async for chunk_text, chunk_tokens in smart_split(text):
        chunks.append((chunk_text, chunk_tokens))

    assert len(chunks) == 1
    assert isinstance(chunks[0][0], str)
    assert isinstance(chunks[0][1], list)


@pytest.mark.asyncio
async def test_smart_split_long_text():
    """Test smart splitting with longer text."""
    # Create text that should split into multiple chunks
    text = ". ".join(["This is test sentence number " + str(i) for i in range(20)])

    chunks = []
    async for chunk_text, chunk_tokens in smart_split(text):
        chunks.append((chunk_text, chunk_tokens))

    assert len(chunks) > 1
    for chunk_text, chunk_tokens in chunks:
        assert isinstance(chunk_text, str)
        assert isinstance(chunk_tokens, list)
        assert len(chunk_tokens) > 0


@pytest.mark.asyncio
async def test_smart_split_with_punctuation():
    """Test smart splitting handles punctuation correctly."""
    text = "First sentence! Second sentence? Third sentence; Fourth sentence: Fifth sentence."

    chunks = []
    async for chunk_text, chunk_tokens in smart_split(text):
        chunks.append(chunk_text)

    # Verify punctuation is preserved
    assert all(any(p in chunk for p in "!?;:.") for chunk in chunks)
api/tests/test_tts_service.py
ADDED
@@ -0,0 +1,126 @@
from unittest.mock import AsyncMock, MagicMock, patch

import numpy as np
import pytest
import torch

from api.src.services.tts_service import TTSService


@pytest.fixture
def mock_managers():
    """Mock model and voice managers."""

    async def _mock_managers():
        model_manager = AsyncMock()
        model_manager.get_backend.return_value = MagicMock()

        voice_manager = AsyncMock()
        voice_manager.get_voice_path.return_value = "/path/to/voice.pt"
        voice_manager.list_voices.return_value = ["voice1", "voice2"]

        with (
            patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
            patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
        ):
            mock_get_model.return_value = model_manager
            mock_get_voice.return_value = voice_manager
            return model_manager, voice_manager

    return _mock_managers()


@pytest.fixture
def tts_service(mock_managers):
    """Create TTSService instance with mocked dependencies."""

    async def _create_service():
        return await TTSService.create("test_output")

    return _create_service()


@pytest.mark.asyncio
async def test_service_creation():
    """Test service creation and initialization."""
    model_manager = AsyncMock()
    voice_manager = AsyncMock()

    with (
        patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
        patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
    ):
        mock_get_model.return_value = model_manager
        mock_get_voice.return_value = voice_manager

        service = await TTSService.create("test_output")
        assert service.output_dir == "test_output"
        assert service.model_manager is model_manager
        assert service._voice_manager is voice_manager


@pytest.mark.asyncio
async def test_get_voice_path_single():
    """Test getting path for single voice."""
    model_manager = AsyncMock()
    voice_manager = AsyncMock()
    voice_manager.get_voice_path.return_value = "/path/to/voice1.pt"

    with (
        patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
        patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
    ):
        mock_get_model.return_value = model_manager
        mock_get_voice.return_value = voice_manager

        service = await TTSService.create("test_output")
        name, path = await service._get_voices_path("voice1")
        assert name == "voice1"
        assert path == "/path/to/voice1.pt"
        voice_manager.get_voice_path.assert_called_once_with("voice1")


@pytest.mark.asyncio
async def test_get_voice_path_combined():
    """Test getting path for combined voices."""
    model_manager = AsyncMock()
    voice_manager = AsyncMock()
    voice_manager.get_voice_path.return_value = "/path/to/voice.pt"

    with (
        patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
        patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
        patch("torch.load") as mock_load,
        patch("torch.save") as mock_save,
        patch("tempfile.gettempdir") as mock_temp,
    ):
        mock_get_model.return_value = model_manager
        mock_get_voice.return_value = voice_manager
        mock_temp.return_value = "/tmp"
        mock_load.return_value = torch.ones(10)

        service = await TTSService.create("test_output")
        name, path = await service._get_voices_path("voice1+voice2")
        assert name == "voice1+voice2"
        assert path.endswith("voice1+voice2.pt")
        mock_save.assert_called_once()


@pytest.mark.asyncio
async def test_list_voices():
    """Test listing available voices."""
    model_manager = AsyncMock()
    voice_manager = AsyncMock()
    voice_manager.list_voices.return_value = ["voice1", "voice2"]

    with (
        patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
        patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
    ):
        mock_get_model.return_value = model_manager
        mock_get_voice.return_value = voice_manager

        service = await TTSService.create("test_output")
        voices = await service.list_voices()
        assert voices == ["voice1", "voice2"]
        voice_manager.list_voices.assert_called_once()
charts/kokoro-fastapi/.helmignore
ADDED
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
charts/kokoro-fastapi/Chart.yaml
ADDED
@@ -0,0 +1,12 @@
apiVersion: v2
name: kokoro-fastapi
description: A Helm chart for deploying the Kokoro FastAPI TTS service to Kubernetes
type: application
version: 0.3.0
appVersion: "0.3.0"

keywords:
  - tts
  - fastapi
  - gpu
  - kokoro