diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f52830790a51ab3aeff66613f58a478777b2d08e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,66 @@
+FROM python:3.10-slim
+
+# Install system dependencies and link espeak-ng data to the expected path
+RUN apt-get update && apt-get install -y \
+ espeak-ng \
+ espeak-ng-data \
+ git \
+ libsndfile1 \
+ curl \
+ ffmpeg \
+ g++ \
+&& apt-get clean \
+&& rm -rf /var/lib/apt/lists/* \
+&& mkdir -p /usr/share/espeak-ng-data \
+&& ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
+
+# Install UV using the installer script
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
+ mv /root/.local/bin/uv /usr/local/bin/ && \
+ mv /root/.local/bin/uvx /usr/local/bin/
+
+# Create non-root user and set up directories and permissions
+RUN useradd -m -u 1000 appuser && \
+ mkdir -p /app/api/src/models/v1_0 && \
+ chown -R appuser:appuser /app
+
+USER appuser
+WORKDIR /app
+
+# Copy dependency files
+COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
+
+# Install Rust (required to build sudachipy and pyopenjtalk-plus)
+RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+ENV PATH="/home/appuser/.cargo/bin:$PATH"
+
+# Install dependencies
+RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1000,gid=1000 \
+ uv venv --python 3.10 && \
+ uv sync --extra cpu
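+# Note: "--extra cpu" installs the CPU-only optional dependency group defined
+# in pyproject.toml (assumed to pin a CPU build of torch)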
+
+# Copy project files including models
+COPY --chown=appuser:appuser api ./api
+COPY --chown=appuser:appuser web ./web
+COPY --chown=appuser:appuser docker/scripts/ ./
+RUN chmod +x ./entrypoint.sh
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+ PYTHONPATH=/app:/app/api \
+ PATH="/app/.venv/bin:$PATH" \
+ UV_LINK_MODE=copy \
+ USE_GPU=false \
+ PHONEMIZER_ESPEAK_PATH=/usr/bin \
+ PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+ ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
+
+ENV DOWNLOAD_MODEL=true
+# Download model if enabled
+RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
+ python download_model.py --output api/src/models/v1_0; \
+ fi
+
+ENV DEVICE="cpu"
+# Run FastAPI server through entrypoint.sh
+CMD ["./entrypoint.sh"]
diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4d39849baa9210560611a1bc8280ddc2ae0b910
--- /dev/null
+++ b/api/__init__.py
@@ -0,0 +1 @@
+# Make api directory a Python package
diff --git a/api/src/builds/v1_0/config.json b/api/src/builds/v1_0/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..25f35b9cebee10cc937b3a407e1347276dbed7fa
--- /dev/null
+++ b/api/src/builds/v1_0/config.json
@@ -0,0 +1,172 @@
+{
+ "istftnet": {
+ "upsample_kernel_sizes": [
+ 20,
+ 12
+ ],
+ "upsample_rates": [
+ 10,
+ 6
+ ],
+ "gen_istft_hop_size": 5,
+ "gen_istft_n_fft": 20,
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "upsample_initial_channel": 512
+ },
+ "dim_in": 64,
+ "dropout": 0.2,
+ "hidden_dim": 512,
+ "max_conv_dim": 512,
+ "max_dur": 50,
+ "multispeaker": true,
+ "n_layer": 3,
+ "n_mels": 80,
+ "n_token": 178,
+ "style_dim": 128,
+ "text_encoder_kernel_size": 5,
+ "plbert": {
+ "hidden_size": 768,
+ "num_attention_heads": 12,
+ "intermediate_size": 2048,
+ "max_position_embeddings": 512,
+ "num_hidden_layers": 12,
+ "dropout": 0.1
+ },
+ "vocab": {
+ ";": 1,
+ ":": 2,
+ ",": 3,
+ ".": 4,
+ "!": 5,
+ "?": 6,
+ "—": 9,
+ "…": 10,
+ "\"": 11,
+ "(": 12,
+ ")": 13,
+ "“": 14,
+ "”": 15,
+ " ": 16,
+ "̃": 17,
+ "ʣ": 18,
+ "ʥ": 19,
+ "ʦ": 20,
+ "ʨ": 21,
+ "ᵝ": 22,
+ "ꭧ": 23,
+ "A": 24,
+ "I": 25,
+ "O": 31,
+ "Q": 33,
+ "S": 35,
+ "T": 36,
+ "W": 39,
+ "Y": 41,
+ "ᵊ": 42,
+ "a": 43,
+ "b": 44,
+ "c": 45,
+ "d": 46,
+ "e": 47,
+ "f": 48,
+ "h": 50,
+ "i": 51,
+ "j": 52,
+ "k": 53,
+ "l": 54,
+ "m": 55,
+ "n": 56,
+ "o": 57,
+ "p": 58,
+ "q": 59,
+ "r": 60,
+ "s": 61,
+ "t": 62,
+ "u": 63,
+ "v": 64,
+ "w": 65,
+ "x": 66,
+ "y": 67,
+ "z": 68,
+ "ɑ": 69,
+ "ɐ": 70,
+ "ɒ": 71,
+ "æ": 72,
+ "β": 75,
+ "ɔ": 76,
+ "ɕ": 77,
+ "ç": 78,
+ "ɖ": 80,
+ "ð": 81,
+ "ʤ": 82,
+ "ə": 83,
+ "ɚ": 85,
+ "ɛ": 86,
+ "ɜ": 87,
+ "ɟ": 90,
+ "ɡ": 92,
+ "ɥ": 99,
+ "ɨ": 101,
+ "ɪ": 102,
+ "ʝ": 103,
+ "ɯ": 110,
+ "ɰ": 111,
+ "ŋ": 112,
+ "ɳ": 113,
+ "ɲ": 114,
+ "ɴ": 115,
+ "ø": 116,
+ "ɸ": 118,
+ "θ": 119,
+ "œ": 120,
+ "ɹ": 123,
+ "ɾ": 125,
+ "ɻ": 126,
+ "ʁ": 128,
+ "ɽ": 129,
+ "ʂ": 130,
+ "ʃ": 131,
+ "ʈ": 132,
+ "ʧ": 133,
+ "ʊ": 135,
+ "ʋ": 136,
+ "ʌ": 138,
+ "ɣ": 139,
+ "ɤ": 140,
+ "χ": 142,
+ "ʎ": 143,
+ "ʒ": 147,
+ "ʔ": 148,
+ "ˈ": 156,
+ "ˌ": 157,
+ "ː": 158,
+ "ʰ": 162,
+ "ʲ": 164,
+ "↓": 169,
+ "→": 171,
+ "↗": 172,
+ "↘": 173,
+ "ᵻ": 177
+ }
+}
\ No newline at end of file
diff --git a/api/src/core/__init__.py b/api/src/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9554f9b18a874f8122b12607fcca955a53912ab
--- /dev/null
+++ b/api/src/core/__init__.py
@@ -0,0 +1,3 @@
+from .config import settings
+
+__all__ = ["settings"]
diff --git a/api/src/core/config.py b/api/src/core/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4657c349ddaa20704b89903530942281872d11
--- /dev/null
+++ b/api/src/core/config.py
@@ -0,0 +1,85 @@
+import torch
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+ # API Settings
+ api_title: str = "Kokoro TTS API"
+ api_description: str = "API for text-to-speech generation using Kokoro"
+ api_version: str = "1.0.0"
+ host: str = "0.0.0.0"
+ port: int = 8880
+
+ # Application Settings
+ output_dir: str = "output"
+ output_dir_size_limit_mb: float = 500.0 # Maximum size of output directory in MB
+ default_voice: str = "af_heart"
+ default_voice_code: str | None = (
+        None  # If set, overrides the language inferred from the first letter of the voice name; an explicit lang_code in the API call still takes precedence
+ )
+ use_gpu: bool = True # Whether to use GPU acceleration if available
+ device_type: str | None = (
+ None # Will be auto-detected if None, can be "cuda", "mps", or "cpu"
+ )
+ allow_local_voice_saving: bool = (
+ False # Whether to allow saving combined voices locally
+ )
+
+ # Container absolute paths
+ model_dir: str = "/app/api/src/models" # Absolute path in container
+ voices_dir: str = "/app/api/src/voices/v1_0" # Absolute path in container
+
+ # Audio Settings
+ sample_rate: int = 24000
+ # Text Processing Settings
+ target_min_tokens: int = 175 # Target minimum tokens per chunk
+ target_max_tokens: int = 250 # Target maximum tokens per chunk
+ absolute_max_tokens: int = 450 # Absolute maximum tokens per chunk
+    advanced_text_normalization: bool = True  # Preprocesses the text before it is passed to misaki
+ voice_weight_normalization: bool = (
+ True # Normalize the voice weights so they add up to 1
+ )
+
+ gap_trim_ms: int = (
+ 1 # Base amount to trim from streaming chunk ends in milliseconds
+ )
+ dynamic_gap_trim_padding_ms: int = 410 # Padding to add to dynamic gap trim
+ dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {
+ ".": 1,
+ "!": 0.9,
+ "?": 1,
+ ",": 0.8,
+ }
+
+ # Web Player Settings
+ enable_web_player: bool = True # Whether to serve the web player UI
+ web_player_path: str = "web" # Path to web player static files
+ cors_origins: list[str] = ["*"] # CORS origins for web player
+ cors_enabled: bool = True # Whether to enable CORS
+
+    # Temp file settings for the web UI
+ temp_file_dir: str = "api/temp_files" # Directory for temporary audio files (relative to project root)
+ max_temp_dir_size_mb: int = 2048 # Maximum size of temp directory (2GB)
+ max_temp_dir_age_hours: int = 1 # Remove temp files older than 1 hour
+ max_temp_dir_count: int = 3 # Maximum number of temp files to keep
+
+ class Config:
+ env_file = ".env"
+
+ def get_device(self) -> str:
+ """Get the appropriate device based on settings and availability"""
+ if not self.use_gpu:
+ return "cpu"
+
+ if self.device_type:
+ return self.device_type
+
+ # Auto-detect device
+ if torch.backends.mps.is_available():
+ return "mps"
+ elif torch.cuda.is_available():
+ return "cuda"
+ return "cpu"
+
+
+settings = Settings()
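+
+# Any field above can be overridden via environment variables or the .env file,
+# e.g. USE_GPU=false DEFAULT_VOICE=af_heart (pydantic-settings matches variable
+# names to fields case-insensitively)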
diff --git a/api/src/core/don_quixote.txt b/api/src/core/don_quixote.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a32a3a6b6cded97faba36b6d2201e86794b4ca5b
--- /dev/null
+++ b/api/src/core/don_quixote.txt
@@ -0,0 +1,9 @@
+In a village of La Mancha, the name of which I have no desire to call
+to mind, there lived not long since one of those gentlemen that keep a
+lance in the lance-rack, an old buckler, a lean hack, and a greyhound
+for coursing. An olla of rather more beef than mutton, a salad on most
+nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
+extra on Sundays, made away with three-quarters of his income. The rest
+of it went in a doublet of fine cloth and velvet breeches and shoes to
+match for holidays, while on week-days he made a brave figure in his
+best homespun.
\ No newline at end of file
diff --git a/api/src/core/model_config.py b/api/src/core/model_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..47544c12f522e05d3cfc927590c6d23e27103c0e
--- /dev/null
+++ b/api/src/core/model_config.py
@@ -0,0 +1,50 @@
+"""Model configuration for Kokoro V1.
+
+This module provides model-specific configuration settings that complement the application-level
+settings in config.py. While config.py handles general application settings (API, paths, etc.),
+this module focuses on memory management and model file paths.
+"""
+
+from pydantic import BaseModel, Field
+
+
+class KokoroV1Config(BaseModel):
+ """Kokoro V1 configuration."""
+
+ languages: list[str] = ["en"]
+
+ class Config:
+ frozen = True
+
+
+class PyTorchConfig(BaseModel):
+ """PyTorch backend configuration."""
+
+    memory_threshold: float = Field(0.8, description="Allocated GPU memory (GB) that triggers cleanup")
+ retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors")
+
+ class Config:
+ frozen = True
+
+
+class ModelConfig(BaseModel):
+ """Kokoro V1 model configuration."""
+
+ # General settings
+ cache_voices: bool = Field(True, description="Whether to cache voice tensors")
+ voice_cache_size: int = Field(2, description="Maximum number of cached voices")
+
+ # Model filename
+ pytorch_kokoro_v1_file: str = Field(
+ "v1_0/kokoro-v1_0.pth", description="PyTorch Kokoro V1 model filename"
+ )
+
+ # Backend config
+ pytorch_gpu: PyTorchConfig = Field(default_factory=PyTorchConfig)
+
+ class Config:
+ frozen = True
+
+
+# Global instance
+model_config = ModelConfig()
diff --git a/api/src/core/openai_mappings.json b/api/src/core/openai_mappings.json
new file mode 100644
index 0000000000000000000000000000000000000000..2821bd62528b56ba1fcc46b6a61a51c07c869773
--- /dev/null
+++ b/api/src/core/openai_mappings.json
@@ -0,0 +1,18 @@
+{
+ "models": {
+ "tts-1": "kokoro-v1_0",
+ "tts-1-hd": "kokoro-v1_0",
+ "kokoro": "kokoro-v1_0"
+ },
+ "voices": {
+ "alloy": "am_v0adam",
+ "ash": "af_v0nicole",
+ "coral": "bf_v0emma",
+ "echo": "af_v0bella",
+ "fable": "af_sarah",
+ "onyx": "bm_george",
+ "nova": "bf_isabella",
+ "sage": "am_michael",
+ "shimmer": "af_sky"
+ }
+}
\ No newline at end of file
diff --git a/api/src/core/paths.py b/api/src/core/paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e6052843454432fb96f9cfebb76ebc0004f9ed4
--- /dev/null
+++ b/api/src/core/paths.py
@@ -0,0 +1,413 @@
+"""Async file and path operations."""
+
+import io
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Set
+
+import aiofiles
+import aiofiles.os
+import torch
+from loguru import logger
+
+from .config import settings
+
+
+async def _find_file(
+ filename: str,
+ search_paths: List[str],
+ filter_fn: Optional[Callable[[str], bool]] = None,
+) -> str:
+ """Find file in search paths.
+
+ Args:
+ filename: Name of file to find
+ search_paths: List of paths to search in
+ filter_fn: Optional function to filter files
+
+ Returns:
+ Absolute path to file
+
+ Raises:
+        FileNotFoundError: If file not found
+ """
+ if os.path.isabs(filename) and await aiofiles.os.path.exists(filename):
+ return filename
+
+ for path in search_paths:
+ full_path = os.path.join(path, filename)
+ if await aiofiles.os.path.exists(full_path):
+ if filter_fn is None or filter_fn(full_path):
+ return full_path
+
+ raise FileNotFoundError(f"File not found: {filename} in paths: {search_paths}")
+
+
+async def _scan_directories(
+ search_paths: List[str], filter_fn: Optional[Callable[[str], bool]] = None
+) -> Set[str]:
+ """Scan directories for files.
+
+ Args:
+ search_paths: List of paths to scan
+ filter_fn: Optional function to filter files
+
+ Returns:
+ Set of matching filenames
+ """
+ results = set()
+
+ for path in search_paths:
+ if not await aiofiles.os.path.exists(path):
+ continue
+
+ try:
+ # Get directory entries first
+ entries = await aiofiles.os.scandir(path)
+ # Then process entries after await completes
+ for entry in entries:
+ if filter_fn is None or filter_fn(entry.name):
+ results.add(entry.name)
+ except Exception as e:
+ logger.warning(f"Error scanning {path}: {e}")
+
+ return results
+
+
+async def get_model_path(model_name: str) -> str:
+ """Get path to model file.
+
+ Args:
+ model_name: Name of model file
+
+ Returns:
+ Absolute path to model file
+
+ Raises:
+        FileNotFoundError: If model not found
+ """
+ # Get api directory path (two levels up from core)
+ api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+
+ # Construct model directory path relative to api directory
+ model_dir = os.path.join(api_dir, settings.model_dir)
+
+ # Ensure model directory exists
+ os.makedirs(model_dir, exist_ok=True)
+
+ # Search in model directory
+ search_paths = [model_dir]
+ logger.debug(f"Searching for model in path: {model_dir}")
+
+ return await _find_file(model_name, search_paths)
+
+
+async def get_voice_path(voice_name: str) -> str:
+ """Get path to voice file.
+
+ Args:
+ voice_name: Name of voice file (without .pt extension)
+
+ Returns:
+ Absolute path to voice file
+
+ Raises:
+        FileNotFoundError: If voice not found
+ """
+ # Get api directory path
+ api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+
+ # Construct voice directory path relative to api directory
+ voice_dir = os.path.join(api_dir, settings.voices_dir)
+
+ # Ensure voice directory exists
+ os.makedirs(voice_dir, exist_ok=True)
+
+ voice_file = f"{voice_name}.pt"
+
+    # Search in voice directory
+ search_paths = [voice_dir]
+ logger.debug(f"Searching for voice in path: {voice_dir}")
+
+ return await _find_file(voice_file, search_paths)
+
+
+async def list_voices() -> List[str]:
+ """List available voice files.
+
+ Returns:
+ List of voice names (without .pt extension)
+ """
+ # Get api directory path
+ api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+
+ # Construct voice directory path relative to api directory
+ voice_dir = os.path.join(api_dir, settings.voices_dir)
+
+ # Ensure voice directory exists
+ os.makedirs(voice_dir, exist_ok=True)
+
+ # Search in voice directory
+ search_paths = [voice_dir]
+ logger.debug(f"Scanning for voices in path: {voice_dir}")
+
+ def filter_voice_files(name: str) -> bool:
+ return name.endswith(".pt")
+
+ voices = await _scan_directories(search_paths, filter_voice_files)
+ return sorted([name[:-3] for name in voices]) # Remove .pt extension
+
+
+async def load_voice_tensor(
+    voice_path: str, device: str = "cpu", weights_only: bool = False
+) -> torch.Tensor:
+ """Load voice tensor from file.
+
+ Args:
+ voice_path: Path to voice file
+        device: Device to load tensor to
+        weights_only: If True, restrict torch.load to tensor data (safer for
+            untrusted files)
+
+ Returns:
+ Voice tensor
+
+ Raises:
+ RuntimeError: If file cannot be read
+ """
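+    # Read the bytes off the event loop via aiofiles, then deserialize from an
+    # in-memory buffer; torch.load itself is synchronous but touches no disk here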
+ try:
+ async with aiofiles.open(voice_path, "rb") as f:
+ data = await f.read()
+ return torch.load(
+ io.BytesIO(data), map_location=device, weights_only=weights_only
+ )
+ except Exception as e:
+ raise RuntimeError(f"Failed to load voice tensor from {voice_path}: {e}")
+
+
+async def save_voice_tensor(tensor: torch.Tensor, voice_path: str) -> None:
+ """Save voice tensor to file.
+
+ Args:
+ tensor: Voice tensor to save
+ voice_path: Path to save voice file
+
+ Raises:
+ RuntimeError: If file cannot be written
+ """
+ try:
+ buffer = io.BytesIO()
+ torch.save(tensor, buffer)
+ async with aiofiles.open(voice_path, "wb") as f:
+ await f.write(buffer.getvalue())
+ except Exception as e:
+ raise RuntimeError(f"Failed to save voice tensor to {voice_path}: {e}")
+
+
+async def load_json(path: str) -> dict:
+ """Load JSON file asynchronously.
+
+ Args:
+ path: Path to JSON file
+
+ Returns:
+ Parsed JSON data
+
+ Raises:
+ RuntimeError: If file cannot be read or parsed
+ """
+ try:
+ async with aiofiles.open(path, "r", encoding="utf-8") as f:
+ content = await f.read()
+ return json.loads(content)
+ except Exception as e:
+ raise RuntimeError(f"Failed to load JSON file {path}: {e}")
+
+
+async def load_model_weights(path: str, device: str = "cpu") -> dict:
+ """Load model weights asynchronously.
+
+ Args:
+ path: Path to model file (.pth or .onnx)
+ device: Device to load model to
+
+ Returns:
+ Model weights
+
+ Raises:
+ RuntimeError: If file cannot be read
+ """
+ try:
+ async with aiofiles.open(path, "rb") as f:
+ data = await f.read()
+ return torch.load(io.BytesIO(data), map_location=device, weights_only=True)
+ except Exception as e:
+ raise RuntimeError(f"Failed to load model weights from {path}: {e}")
+
+
+async def read_file(path: str) -> str:
+ """Read text file asynchronously.
+
+ Args:
+ path: Path to file
+
+ Returns:
+ File contents as string
+
+ Raises:
+ RuntimeError: If file cannot be read
+ """
+ try:
+ async with aiofiles.open(path, "r", encoding="utf-8") as f:
+ return await f.read()
+ except Exception as e:
+ raise RuntimeError(f"Failed to read file {path}: {e}")
+
+
+async def read_bytes(path: str) -> bytes:
+ """Read file as bytes asynchronously.
+
+ Args:
+ path: Path to file
+
+ Returns:
+ File contents as bytes
+
+ Raises:
+ RuntimeError: If file cannot be read
+ """
+ try:
+ async with aiofiles.open(path, "rb") as f:
+ return await f.read()
+ except Exception as e:
+ raise RuntimeError(f"Failed to read file {path}: {e}")
+
+
+async def get_web_file_path(filename: str) -> str:
+ """Get path to web static file.
+
+ Args:
+ filename: Name of file in web directory
+
+ Returns:
+ Absolute path to file
+
+ Raises:
+        FileNotFoundError: If file not found
+ """
+    # Web assets are copied to /app/web in the container (see Dockerfile)
+    web_dir = os.path.join("/app", settings.web_player_path)
+
+ # Search in web directory
+ search_paths = [web_dir]
+ logger.debug(f"Searching for web file in path: {web_dir}")
+
+ return await _find_file(filename, search_paths)
+
+
+async def get_content_type(path: str) -> str:
+ """Get content type for file.
+
+ Args:
+ path: Path to file
+
+ Returns:
+ Content type string
+ """
+ ext = os.path.splitext(path)[1].lower()
+ return {
+ ".html": "text/html",
+ ".js": "application/javascript",
+ ".css": "text/css",
+ ".png": "image/png",
+ ".jpg": "image/jpeg",
+ ".jpeg": "image/jpeg",
+ ".gif": "image/gif",
+ ".svg": "image/svg+xml",
+ ".ico": "image/x-icon",
+ }.get(ext, "application/octet-stream")
+
+
+async def verify_model_path(model_path: str) -> bool:
+ """Verify model file exists at path."""
+ return await aiofiles.os.path.exists(model_path)
+
+
+async def cleanup_temp_files() -> None:
+ """Clean up old temp files on startup"""
+ try:
+ if not await aiofiles.os.path.exists(settings.temp_file_dir):
+ await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)
+ return
+
+ entries = await aiofiles.os.scandir(settings.temp_file_dir)
+ for entry in entries:
+ if entry.is_file():
+ stat = await aiofiles.os.stat(entry.path)
+                max_age_seconds = settings.max_temp_dir_age_hours * 3600
+                if (time.time() - stat.st_mtime) > max_age_seconds:
+ try:
+ await aiofiles.os.remove(entry.path)
+ logger.info(f"Cleaned up old temp file: {entry.name}")
+ except Exception as e:
+ logger.warning(
+ f"Failed to delete old temp file {entry.name}: {e}"
+ )
+ except Exception as e:
+ logger.warning(f"Error cleaning temp files: {e}")
+
+
+async def get_temp_file_path(filename: str) -> str:
+ """Get path to temporary audio file.
+
+ Args:
+ filename: Name of temp file
+
+ Returns:
+ Absolute path to temp file
+ """
+ temp_path = os.path.join(settings.temp_file_dir, filename)
+
+ # Ensure temp directory exists
+ if not await aiofiles.os.path.exists(settings.temp_file_dir):
+ await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)
+
+ return temp_path
+
+
+async def list_temp_files() -> List[str]:
+ """List temporary audio files.
+
+ Returns:
+ List of temp file names
+ """
+ if not await aiofiles.os.path.exists(settings.temp_file_dir):
+ return []
+
+ entries = await aiofiles.os.scandir(settings.temp_file_dir)
+ return [entry.name for entry in entries if entry.is_file()]
+
+
+async def get_temp_dir_size() -> int:
+ """Get total size of temp directory in bytes.
+
+ Returns:
+ Size in bytes
+ """
+ if not await aiofiles.os.path.exists(settings.temp_file_dir):
+ return 0
+
+ total = 0
+ entries = await aiofiles.os.scandir(settings.temp_file_dir)
+ for entry in entries:
+ if entry.is_file():
+ stat = await aiofiles.os.stat(entry.path)
+ total += stat.st_size
+ return total
diff --git a/api/src/inference/__init__.py b/api/src/inference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..68c7ce384352d6eee057e27d0a3f6d2580eaf4ae
--- /dev/null
+++ b/api/src/inference/__init__.py
@@ -0,0 +1,12 @@
+"""Model inference package."""
+
+from .base import BaseModelBackend
+from .kokoro_v1 import KokoroV1
+from .model_manager import ModelManager, get_manager
+
+__all__ = [
+ "BaseModelBackend",
+ "ModelManager",
+ "get_manager",
+ "KokoroV1",
+]
diff --git a/api/src/inference/base.py b/api/src/inference/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e25c2b514427cc61a85dc7384aa997a2b36e755d
--- /dev/null
+++ b/api/src/inference/base.py
@@ -0,0 +1,127 @@
+"""Base interface for Kokoro inference."""
+
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+
+class AudioChunk:
+ """Class for audio chunks returned by model backends"""
+
+ def __init__(
+ self,
+ audio: np.ndarray,
+        word_timestamps: Optional[List] = None,
+ output: Optional[Union[bytes, np.ndarray]] = b"",
+ ):
+ self.audio = audio
+        # Avoid a shared mutable default: combine() extends this list in place
+        self.word_timestamps = word_timestamps if word_timestamps is not None else []
+ self.output = output
+
+ @staticmethod
+ def combine(audio_chunk_list: List):
+ output = AudioChunk(
+ audio_chunk_list[0].audio, audio_chunk_list[0].word_timestamps
+ )
+
+ for audio_chunk in audio_chunk_list[1:]:
+ output.audio = np.concatenate(
+ (output.audio, audio_chunk.audio), dtype=np.int16
+ )
+ if output.word_timestamps is not None:
+ output.word_timestamps += audio_chunk.word_timestamps
+
+ return output
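+
+# Illustrative usage: stitch streamed chunks into one contiguous int16 buffer
+#   full = AudioChunk.combine(collected_chunks)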
+
+
+class ModelBackend(ABC):
+ """Abstract base class for model inference backend."""
+
+ @abstractmethod
+ async def load_model(self, path: str) -> None:
+ """Load model from path.
+
+ Args:
+ path: Path to model file
+
+ Raises:
+ RuntimeError: If model loading fails
+ """
+ pass
+
+ @abstractmethod
+ async def generate(
+ self,
+ text: str,
+ voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
+ speed: float = 1.0,
+ ) -> AsyncGenerator[AudioChunk, None]:
+ """Generate audio from text.
+
+ Args:
+ text: Input text to synthesize
+ voice: Either a voice path or tuple of (name, tensor/path)
+ speed: Speed multiplier
+
+ Yields:
+ Generated audio chunks
+
+ Raises:
+ RuntimeError: If generation fails
+ """
+ pass
+
+ @abstractmethod
+ def unload(self) -> None:
+ """Unload model and free resources."""
+ pass
+
+ @property
+ @abstractmethod
+ def is_loaded(self) -> bool:
+ """Check if model is loaded.
+
+ Returns:
+ True if model is loaded, False otherwise
+ """
+ pass
+
+ @property
+ @abstractmethod
+ def device(self) -> str:
+ """Get device model is running on.
+
+ Returns:
+ Device string ('cpu' or 'cuda')
+ """
+ pass
+
+
+class BaseModelBackend(ModelBackend):
+ """Base implementation of model backend."""
+
+ def __init__(self):
+ """Initialize base backend."""
+ self._model: Optional[torch.nn.Module] = None
+ self._device: str = "cpu"
+
+ @property
+ def is_loaded(self) -> bool:
+ """Check if model is loaded."""
+ return self._model is not None
+
+ @property
+ def device(self) -> str:
+ """Get device model is running on."""
+ return self._device
+
+ def unload(self) -> None:
+ """Unload model and free resources."""
+ if self._model is not None:
+ del self._model
+ self._model = None
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..a627dbb35ad371949c5e07e8ae3de55c0eb82e40
--- /dev/null
+++ b/api/src/inference/kokoro_v1.py
@@ -0,0 +1,370 @@
+"""Clean Kokoro implementation with controlled resource management."""
+
+import os
+from typing import AsyncGenerator, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from kokoro import KModel, KPipeline
+from loguru import logger
+
+from ..core import paths
+from ..core.config import settings
+from ..core.model_config import model_config
+from ..structures.schemas import WordTimestamp
+from .base import AudioChunk, BaseModelBackend
+
+
+class KokoroV1(BaseModelBackend):
+ """Kokoro backend with controlled resource management."""
+
+ def __init__(self):
+ """Initialize backend with environment-based configuration."""
+ super().__init__()
+ # Strictly respect settings.use_gpu
+ self._device = settings.get_device()
+ self._model: Optional[KModel] = None
+ self._pipelines: Dict[str, KPipeline] = {} # Store pipelines by lang_code
+
+ async def load_model(self, path: str) -> None:
+ """Load pre-baked model.
+
+ Args:
+ path: Path to model file
+
+ Raises:
+ RuntimeError: If model loading fails
+ """
+ try:
+ # Get verified model path
+ model_path = await paths.get_model_path(path)
+ config_path = os.path.join(os.path.dirname(model_path), "config.json")
+
+ if not os.path.exists(config_path):
+ raise RuntimeError(f"Config file not found: {config_path}")
+
+ logger.info(f"Loading Kokoro model on {self._device}")
+ logger.info(f"Config path: {config_path}")
+ logger.info(f"Model path: {model_path}")
+
+ # Load model and let KModel handle device mapping
+ self._model = KModel(config=config_path, model=model_path).eval()
+            # Move the whole model to the MPS device
+ if self._device == "mps":
+ logger.info(
+ "Moving model to MPS device with CPU fallback for unsupported operations"
+ )
+ self._model = self._model.to(torch.device("mps"))
+ elif self._device == "cuda":
+ self._model = self._model.cuda()
+ else:
+ self._model = self._model.cpu()
+
+ except FileNotFoundError as e:
+ raise e
+ except Exception as e:
+ raise RuntimeError(f"Failed to load Kokoro model: {e}")
+
+ def _get_pipeline(self, lang_code: str) -> KPipeline:
+ """Get or create pipeline for language code.
+
+ Args:
+ lang_code: Language code to use
+
+ Returns:
+ KPipeline instance for the language
+ """
+ if not self._model:
+ raise RuntimeError("Model not loaded")
+
+ if lang_code not in self._pipelines:
+ logger.info(f"Creating new pipeline for language code: {lang_code}")
+ self._pipelines[lang_code] = KPipeline(
+ lang_code=lang_code, model=self._model, device=self._device
+ )
+ return self._pipelines[lang_code]
+
+ async def generate_from_tokens(
+ self,
+ tokens: str,
+ voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
+ speed: float = 1.0,
+ lang_code: Optional[str] = None,
+ ) -> AsyncGenerator[np.ndarray, None]:
+ """Generate audio from phoneme tokens.
+
+ Args:
+ tokens: Input phoneme tokens to synthesize
+ voice: Either a voice path string or a tuple of (voice_name, voice_tensor/path)
+ speed: Speed multiplier
+ lang_code: Optional language code override
+
+ Yields:
+ Generated audio chunks
+
+ Raises:
+ RuntimeError: If generation fails
+ """
+ if not self.is_loaded:
+ raise RuntimeError("Model not loaded")
+
+ try:
+ # Memory management for GPU
+ if self._device == "cuda":
+ if self._check_memory():
+ self._clear_memory()
+
+ # Handle voice input
+ voice_path: str
+ voice_name: str
+ if isinstance(voice, tuple):
+ voice_name, voice_data = voice
+ if isinstance(voice_data, str):
+ voice_path = voice_data
+ else:
+ # Save tensor to temporary file
+ import tempfile
+
+ temp_dir = tempfile.gettempdir()
+ voice_path = os.path.join(temp_dir, f"{voice_name}.pt")
+ # Save tensor with CPU mapping for portability
+ torch.save(voice_data.cpu(), voice_path)
+ else:
+ voice_path = voice
+ voice_name = os.path.splitext(os.path.basename(voice_path))[0]
+
+ # Load voice tensor with proper device mapping
+ voice_tensor = await paths.load_voice_tensor(
+ voice_path, device=self._device
+ )
+ # Save back to a temporary file with proper device mapping
+ import tempfile
+
+ temp_dir = tempfile.gettempdir()
+ temp_path = os.path.join(
+ temp_dir, f"temp_voice_{os.path.basename(voice_path)}"
+ )
+ await paths.save_voice_tensor(voice_tensor, temp_path)
+ voice_path = temp_path
+
+ # Use provided lang_code, settings voice code override, or first letter of voice name
+ if lang_code: # api is given priority
+ pipeline_lang_code = lang_code
+ elif settings.default_voice_code: # settings is next priority
+ pipeline_lang_code = settings.default_voice_code
+ else: # voice name is default/fallback
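+                # By convention Kokoro voice names encode language in their
+                # first letter (e.g. "af_heart" -> "a" for American English)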
+ pipeline_lang_code = voice_name[0].lower()
+
+ pipeline = self._get_pipeline(pipeline_lang_code)
+
+ logger.debug(
+ f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
+ )
+ for result in pipeline.generate_from_tokens(
+ tokens=tokens, voice=voice_path, speed=speed, model=self._model
+ ):
+ if result.audio is not None:
+ logger.debug(f"Got audio chunk with shape: {result.audio.shape}")
+ yield result.audio.numpy()
+ else:
+ logger.warning("No audio in chunk")
+
+ except Exception as e:
+ logger.error(f"Generation failed: {e}")
+ if (
+ self._device == "cuda"
+ and model_config.pytorch_gpu.retry_on_oom
+ and "out of memory" in str(e).lower()
+ ):
+                self._clear_memory()
+                async for chunk in self.generate_from_tokens(
+                    tokens, voice, speed, lang_code
+                ):
+                    yield chunk
+                return  # retry succeeded; don't re-raise the original OOM
+            raise
+
+ async def generate(
+ self,
+ text: str,
+ voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
+ speed: float = 1.0,
+ lang_code: Optional[str] = None,
+ return_timestamps: Optional[bool] = False,
+ ) -> AsyncGenerator[AudioChunk, None]:
+ """Generate audio using model.
+
+ Args:
+ text: Input text to synthesize
+ voice: Either a voice path string or a tuple of (voice_name, voice_tensor/path)
+ speed: Speed multiplier
+ lang_code: Optional language code override
+
+ Yields:
+ Generated audio chunks
+
+ Raises:
+ RuntimeError: If generation fails
+ """
+ if not self.is_loaded:
+ raise RuntimeError("Model not loaded")
+ try:
+ # Memory management for GPU
+ if self._device == "cuda":
+ if self._check_memory():
+ self._clear_memory()
+
+ # Handle voice input
+ voice_path: str
+ voice_name: str
+ if isinstance(voice, tuple):
+ voice_name, voice_data = voice
+ if isinstance(voice_data, str):
+ voice_path = voice_data
+ else:
+ # Save tensor to temporary file
+ import tempfile
+
+ temp_dir = tempfile.gettempdir()
+ voice_path = os.path.join(temp_dir, f"{voice_name}.pt")
+ # Save tensor with CPU mapping for portability
+ torch.save(voice_data.cpu(), voice_path)
+ else:
+ voice_path = voice
+ voice_name = os.path.splitext(os.path.basename(voice_path))[0]
+
+ # Load voice tensor with proper device mapping
+ voice_tensor = await paths.load_voice_tensor(
+ voice_path, device=self._device
+ )
+ # Save back to a temporary file with proper device mapping
+ import tempfile
+
+ temp_dir = tempfile.gettempdir()
+ temp_path = os.path.join(
+ temp_dir, f"temp_voice_{os.path.basename(voice_path)}"
+ )
+ await paths.save_voice_tensor(voice_tensor, temp_path)
+ voice_path = temp_path
+
+ # Use provided lang_code, settings voice code override, or first letter of voice name
+ pipeline_lang_code = (
+ lang_code
+ if lang_code
+ else (
+ settings.default_voice_code
+ if settings.default_voice_code
+ else voice_name[0].lower()
+ )
+ )
+ pipeline = self._get_pipeline(pipeline_lang_code)
+
+ logger.debug(
+ f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
+ )
+ for result in pipeline(
+ text, voice=voice_path, speed=speed, model=self._model
+ ):
+ if result.audio is not None:
+ logger.debug(f"Got audio chunk with shape: {result.audio.shape}")
+ word_timestamps = None
+ if (
+ return_timestamps
+ and hasattr(result, "tokens")
+ and result.tokens
+ ):
+ word_timestamps = []
+ current_offset = 0.0
+ logger.debug(
+ f"Processing chunk timestamps with {len(result.tokens)} tokens"
+ )
+ if result.pred_dur is not None:
+ try:
+ # Add timestamps with offset
+ for token in result.tokens:
+ if not all(
+ hasattr(token, attr)
+ for attr in [
+ "text",
+ "start_ts",
+ "end_ts",
+ ]
+ ):
+ continue
+ if not token.text or not token.text.strip():
+ continue
+
+ start_time = float(token.start_ts) + current_offset
+ end_time = float(token.end_ts) + current_offset
+ word_timestamps.append(
+ WordTimestamp(
+ word=str(token.text).strip(),
+ start_time=start_time,
+ end_time=end_time,
+ )
+ )
+ logger.debug(
+ f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
+ )
+
+ except Exception as e:
+ logger.error(
+ f"Failed to process timestamps for chunk: {e}"
+ )
+
+ yield AudioChunk(
+ result.audio.numpy(), word_timestamps=word_timestamps
+ )
+ else:
+ logger.warning("No audio in chunk")
+
+ except Exception as e:
+ logger.error(f"Generation failed: {e}")
+ if (
+ self._device == "cuda"
+ and model_config.pytorch_gpu.retry_on_oom
+ and "out of memory" in str(e).lower()
+ ):
+                self._clear_memory()
+                async for chunk in self.generate(
+                    text, voice, speed, lang_code, return_timestamps
+                ):
+                    yield chunk
+                return  # retry succeeded; don't re-raise the original OOM
+            raise
+
+ def _check_memory(self) -> bool:
+ """Check if memory usage is above threshold."""
+ if self._device == "cuda":
+ memory_gb = torch.cuda.memory_allocated() / 1e9
+ return memory_gb > model_config.pytorch_gpu.memory_threshold
+ # MPS doesn't provide memory management APIs
+ return False
+
+ def _clear_memory(self) -> None:
+ """Clear device memory."""
+ if self._device == "cuda":
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+ elif self._device == "mps":
+ # Empty cache if available (future-proofing)
+ if hasattr(torch.mps, "empty_cache"):
+ torch.mps.empty_cache()
+
+ def unload(self) -> None:
+ """Unload model and free resources."""
+ if self._model is not None:
+ del self._model
+ self._model = None
+ for pipeline in self._pipelines.values():
+ del pipeline
+ self._pipelines.clear()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+
+ @property
+ def is_loaded(self) -> bool:
+ """Check if model is loaded."""
+ return self._model is not None
+
+ @property
+ def device(self) -> str:
+ """Get device model is running on."""
+ return self._device
diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cef95ff3cb6234eb50fbcfc4ed4ce53f2877d63
--- /dev/null
+++ b/api/src/inference/model_manager.py
@@ -0,0 +1,171 @@
+"""Kokoro V1 model management."""
+
+from typing import Optional
+
+from loguru import logger
+
+from ..core import paths
+from ..core.config import settings
+from ..core.model_config import ModelConfig, model_config
+from .base import BaseModelBackend
+from .kokoro_v1 import KokoroV1
+
+
+class ModelManager:
+ """Manages Kokoro V1 model loading and inference."""
+
+ # Singleton instance
+ _instance = None
+
+ def __init__(self, config: Optional[ModelConfig] = None):
+ """Initialize manager.
+
+ Args:
+ config: Optional model configuration override
+ """
+ self._config = config or model_config
+ self._backend: Optional[KokoroV1] = None # Explicitly type as KokoroV1
+ self._device: Optional[str] = None
+
+    def _determine_device(self) -> str:
+        """Determine device based on settings (matches the backend's own choice)."""
+        return settings.get_device()
+
+ async def initialize(self) -> None:
+ """Initialize Kokoro V1 backend."""
+ try:
+ self._device = self._determine_device()
+ logger.info(f"Initializing Kokoro V1 on {self._device}")
+ self._backend = KokoroV1()
+
+ except Exception as e:
+ raise RuntimeError(f"Failed to initialize Kokoro V1: {e}")
+
+ async def initialize_with_warmup(self, voice_manager) -> tuple[str, str, int]:
+ """Initialize and warm up model.
+
+ Args:
+ voice_manager: Voice manager instance for warmup
+
+ Returns:
+ Tuple of (device, backend type, voice count)
+
+ Raises:
+ RuntimeError: If initialization fails
+ """
+ import time
+
+ start = time.perf_counter()
+
+ try:
+ # Initialize backend
+ await self.initialize()
+
+ # Load model
+ model_path = self._config.pytorch_kokoro_v1_file
+ await self.load_model(model_path)
+
+ # Use paths module to get voice path
+ try:
+ voices = await paths.list_voices()
+ voice_path = await paths.get_voice_path(settings.default_voice)
+
+ # Warm up with short text
+ warmup_text = "Warmup text for initialization."
+ # Use default voice name for warmup
+ voice_name = settings.default_voice
+ logger.debug(f"Using default voice '{voice_name}' for warmup")
+ async for _ in self.generate(warmup_text, (voice_name, voice_path)):
+ pass
+ except Exception as e:
+ raise RuntimeError(f"Failed to get default voice: {e}")
+
+ ms = int((time.perf_counter() - start) * 1000)
+ logger.info(f"Warmup completed in {ms}ms")
+
+ return self._device, "kokoro_v1", len(voices)
+ except FileNotFoundError as e:
+ logger.error("""
+Model files not found! You need to download the Kokoro V1 model:
+
+1. Download model using the script:
+ python docker/scripts/download_model.py --output api/src/models/v1_0
+
+2. Or set environment variable in docker-compose:
+ DOWNLOAD_MODEL=true
+""")
+            raise SystemExit(1)
+ except Exception as e:
+ raise RuntimeError(f"Warmup failed: {e}")
+
+ def get_backend(self) -> BaseModelBackend:
+ """Get initialized backend.
+
+ Returns:
+ Initialized backend instance
+
+ Raises:
+ RuntimeError: If backend not initialized
+ """
+ if not self._backend:
+ raise RuntimeError("Backend not initialized")
+ return self._backend
+
+ async def load_model(self, path: str) -> None:
+ """Load model using initialized backend.
+
+ Args:
+ path: Path to model file
+
+ Raises:
+ RuntimeError: If loading fails
+ """
+ if not self._backend:
+ raise RuntimeError("Backend not initialized")
+
+ try:
+ await self._backend.load_model(path)
+ except FileNotFoundError as e:
+ raise e
+ except Exception as e:
+ raise RuntimeError(f"Failed to load model: {e}")
+
+ async def generate(self, *args, **kwargs):
+ """Generate audio using initialized backend.
+
+ Raises:
+ RuntimeError: If generation fails
+ """
+ if not self._backend:
+ raise RuntimeError("Backend not initialized")
+
+ try:
+ async for chunk in self._backend.generate(*args, **kwargs):
+ yield chunk
+ except Exception as e:
+ raise RuntimeError(f"Generation failed: {e}")
+
+ def unload_all(self) -> None:
+ """Unload model and free resources."""
+ if self._backend:
+ self._backend.unload()
+ self._backend = None
+
+ @property
+ def current_backend(self) -> str:
+ """Get current backend type."""
+ return "kokoro_v1"
+
+
+async def get_manager(config: Optional[ModelConfig] = None) -> ModelManager:
+ """Get model manager instance.
+
+ Args:
+ config: Optional configuration override
+
+ Returns:
+ ModelManager instance
+ """
+ if ModelManager._instance is None:
+ ModelManager._instance = ModelManager(config)
+ return ModelManager._instance
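+
+
+# Typical startup flow (mirrors main.py's lifespan hook):
+#   manager = await get_manager()
+#   device, backend, n_voices = await manager.initialize_with_warmup(voice_manager)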
diff --git a/api/src/inference/voice_manager.py b/api/src/inference/voice_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d82c4f744c01a35c99569cc821ebde6035fb194
--- /dev/null
+++ b/api/src/inference/voice_manager.py
@@ -0,0 +1,115 @@
+"""Voice management with controlled resource handling."""
+
+from typing import Dict, List, Optional
+
+import aiofiles
+import torch
+from loguru import logger
+
+from ..core import paths
+from ..core.config import settings
+
+
+class VoiceManager:
+ """Manages voice loading and caching with controlled resource usage."""
+
+ # Singleton instance
+ _instance = None
+
+ def __init__(self):
+ """Initialize voice manager."""
+ # Strictly respect settings.use_gpu
+ self._device = settings.get_device()
+ self._voices: Dict[str, torch.Tensor] = {}
+
+ async def get_voice_path(self, voice_name: str) -> str:
+ """Get path to voice file.
+
+ Args:
+ voice_name: Name of voice
+
+ Returns:
+ Path to voice file
+
+ Raises:
+ RuntimeError: If voice not found
+ """
+ return await paths.get_voice_path(voice_name)
+
+ async def load_voice(
+ self, voice_name: str, device: Optional[str] = None
+ ) -> torch.Tensor:
+ """Load voice tensor.
+
+ Args:
+ voice_name: Name of voice to load
+ device: Optional override for target device
+
+ Returns:
+ Voice tensor
+
+ Raises:
+ RuntimeError: If voice not found
+ """
+ try:
+ voice_path = await self.get_voice_path(voice_name)
+ target_device = device or self._device
+ voice = await paths.load_voice_tensor(voice_path, target_device)
+ self._voices[voice_name] = voice
+ return voice
+ except Exception as e:
+ raise RuntimeError(f"Failed to load voice {voice_name}: {e}")
+
+ async def combine_voices(
+ self, voices: List[str], device: Optional[str] = None
+ ) -> torch.Tensor:
+ """Combine multiple voices.
+
+ Args:
+ voices: List of voice names to combine
+ device: Optional override for target device
+
+ Returns:
+ Combined voice tensor
+
+ Raises:
+ RuntimeError: If any voice not found
+ """
+ if len(voices) < 2:
+ raise ValueError("Need at least 2 voices to combine")
+
+ target_device = device or self._device
+ voice_tensors = []
+ for name in voices:
+ voice = await self.load_voice(name, target_device)
+ voice_tensors.append(voice)
+
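+        # Equal-weight average of the stacked style tensors; every voice must
+        # share the same shape for torch.stack to succeed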
+ combined = torch.mean(torch.stack(voice_tensors), dim=0)
+ return combined
+
+ async def list_voices(self) -> List[str]:
+ """List available voice names.
+
+ Returns:
+ List of voice names
+ """
+ return await paths.list_voices()
+
+ def cache_info(self) -> Dict[str, int]:
+ """Get cache statistics.
+
+ Returns:
+ Dict with cache statistics
+ """
+ return {"loaded_voices": len(self._voices), "device": self._device}
+
+
+async def get_manager() -> VoiceManager:
+ """Get voice manager instance.
+
+ Returns:
+ VoiceManager instance
+ """
+ if VoiceManager._instance is None:
+ VoiceManager._instance = VoiceManager()
+ return VoiceManager._instance
diff --git a/api/src/main.py b/api/src/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..23299cf89071e6cabbe77fa0e3b8146734268773
--- /dev/null
+++ b/api/src/main.py
@@ -0,0 +1,152 @@
+"""
+FastAPI OpenAI Compatible API
+"""
+
+import os
+import sys
+from contextlib import asynccontextmanager
+from pathlib import Path
+
+import torch
+import uvicorn
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from loguru import logger
+
+from .core.config import settings
+from .routers.debug import router as debug_router
+from .routers.development import router as dev_router
+from .routers.openai_compatible import router as openai_router
+from .routers.web_player import router as web_router
+
+
+def setup_logger():
+ """Configure loguru logger with custom formatting"""
+ config = {
+ "handlers": [
+ {
+ "sink": sys.stdout,
+ "format": "{time:hh:mm:ss A} | "
+ "{level: <8} | "
+ "{module}:{line} | "
+ "{message}",
+ "colorize": True,
+ "level": "DEBUG",
+ },
+ ],
+ }
+ logger.remove()
+ logger.configure(**config)
+ logger.level("ERROR", color="")
+
+
+# Configure logger
+setup_logger()
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ """Lifespan context manager for model initialization"""
+ from .inference.model_manager import get_manager
+ from .inference.voice_manager import get_manager as get_voice_manager
+ from .services.temp_manager import cleanup_temp_files
+
+ # Clean old temp files on startup
+ await cleanup_temp_files()
+
+ logger.info("Loading TTS model and voice packs...")
+
+ try:
+ # Initialize managers
+ model_manager = await get_manager()
+ voice_manager = await get_voice_manager()
+
+ # Initialize model with warmup and get status
+ device, model, voicepack_count = await model_manager.initialize_with_warmup(
+ voice_manager
+ )
+
+ except Exception as e:
+ logger.error(f"Failed to initialize model: {e}")
+ raise
+
+ boundary = "░" * 2 * 12
+ startup_msg = f"""
+
+{boundary}
+
+ ╔═╗┌─┐┌─┐┌┬┐
+ ╠╣ ├─┤└─┐ │
+ ╚ ┴ ┴└─┘ ┴
+ ╦╔═┌─┐┬┌─┌─┐
+ ╠╩╗│ │├┴┐│ │
+ ╩ ╩└─┘┴ ┴└─┘
+
+{boundary}
+ """
+ startup_msg += f"\nModel warmed up on {device}: {model}"
+ if device == "mps":
+ startup_msg += "\nUsing Apple Metal Performance Shaders (MPS)"
+ elif device == "cuda":
+ startup_msg += f"\nCUDA: {torch.cuda.is_available()}"
+ else:
+ startup_msg += "\nRunning on CPU"
+ startup_msg += f"\n{voicepack_count} voice packs loaded"
+
+ # Add web player info if enabled
+ if settings.enable_web_player:
+ startup_msg += (
+ f"\n\nBeta Web Player: http://{settings.host}:{settings.port}/web/"
+ )
+ startup_msg += f"\nor http://localhost:{settings.port}/web/"
+ else:
+ startup_msg += "\n\nWeb Player: disabled"
+
+ startup_msg += f"\n{boundary}\n"
+ logger.info(startup_msg)
+
+ yield
+
+
+# Initialize FastAPI app
+app = FastAPI(
+ title=settings.api_title,
+ description=settings.api_description,
+ version=settings.api_version,
+ lifespan=lifespan,
+ openapi_url="/openapi.json", # Explicitly enable OpenAPI schema
+)
+
+# Add CORS middleware if enabled
+if settings.cors_enabled:
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins=settings.cors_origins,
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+
+# Include routers
+app.include_router(openai_router, prefix="/v1")
+app.include_router(dev_router) # Development endpoints
+app.include_router(debug_router) # Debug endpoints
+if settings.enable_web_player:
+ app.include_router(web_router, prefix="/web") # Web player static files
+
+
+# Health check endpoint
+@app.get("/health")
+async def health_check():
+ """Health check endpoint"""
+ return {"status": "healthy"}
+
+
+@app.get("/v1/test")
+async def test_endpoint():
+ """Test endpoint to verify routing"""
+ return {"status": "ok"}
+
+
+if __name__ == "__main__":
+ uvicorn.run("api.src.main:app", host=settings.host, port=settings.port, reload=True)
diff --git a/api/src/models/v1_0/config.json b/api/src/models/v1_0/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..14a726edd3718279eac426630879ff743955b16a
--- /dev/null
+++ b/api/src/models/v1_0/config.json
@@ -0,0 +1,150 @@
+{
+ "istftnet": {
+ "upsample_kernel_sizes": [20, 12],
+ "upsample_rates": [10, 6],
+ "gen_istft_hop_size": 5,
+ "gen_istft_n_fft": 20,
+ "resblock_dilation_sizes": [
+ [1, 3, 5],
+ [1, 3, 5],
+ [1, 3, 5]
+ ],
+ "resblock_kernel_sizes": [3, 7, 11],
+ "upsample_initial_channel": 512
+ },
+ "dim_in": 64,
+ "dropout": 0.2,
+ "hidden_dim": 512,
+ "max_conv_dim": 512,
+ "max_dur": 50,
+ "multispeaker": true,
+ "n_layer": 3,
+ "n_mels": 80,
+ "n_token": 178,
+ "style_dim": 128,
+ "text_encoder_kernel_size": 5,
+ "plbert": {
+ "hidden_size": 768,
+ "num_attention_heads": 12,
+ "intermediate_size": 2048,
+ "max_position_embeddings": 512,
+ "num_hidden_layers": 12,
+ "dropout": 0.1
+ },
+ "vocab": {
+ ";": 1,
+ ":": 2,
+ ",": 3,
+ ".": 4,
+ "!": 5,
+ "?": 6,
+ "—": 9,
+ "…": 10,
+ "\"": 11,
+ "(": 12,
+ ")": 13,
+ "“": 14,
+ "”": 15,
+ " ": 16,
+ "\u0303": 17,
+ "ʣ": 18,
+ "ʥ": 19,
+ "ʦ": 20,
+ "ʨ": 21,
+ "ᵝ": 22,
+ "\uAB67": 23,
+ "A": 24,
+ "I": 25,
+ "O": 31,
+ "Q": 33,
+ "S": 35,
+ "T": 36,
+ "W": 39,
+ "Y": 41,
+ "ᵊ": 42,
+ "a": 43,
+ "b": 44,
+ "c": 45,
+ "d": 46,
+ "e": 47,
+ "f": 48,
+ "h": 50,
+ "i": 51,
+ "j": 52,
+ "k": 53,
+ "l": 54,
+ "m": 55,
+ "n": 56,
+ "o": 57,
+ "p": 58,
+ "q": 59,
+ "r": 60,
+ "s": 61,
+ "t": 62,
+ "u": 63,
+ "v": 64,
+ "w": 65,
+ "x": 66,
+ "y": 67,
+ "z": 68,
+ "ɑ": 69,
+ "ɐ": 70,
+ "ɒ": 71,
+ "æ": 72,
+ "β": 75,
+ "ɔ": 76,
+ "ɕ": 77,
+ "ç": 78,
+ "ɖ": 80,
+ "ð": 81,
+ "ʤ": 82,
+ "ə": 83,
+ "ɚ": 85,
+ "ɛ": 86,
+ "ɜ": 87,
+ "ɟ": 90,
+ "ɡ": 92,
+ "ɥ": 99,
+ "ɨ": 101,
+ "ɪ": 102,
+ "ʝ": 103,
+ "ɯ": 110,
+ "ɰ": 111,
+ "ŋ": 112,
+ "ɳ": 113,
+ "ɲ": 114,
+ "ɴ": 115,
+ "ø": 116,
+ "ɸ": 118,
+ "θ": 119,
+ "œ": 120,
+ "ɹ": 123,
+ "ɾ": 125,
+ "ɻ": 126,
+ "ʁ": 128,
+ "ɽ": 129,
+ "ʂ": 130,
+ "ʃ": 131,
+ "ʈ": 132,
+ "ʧ": 133,
+ "ʊ": 135,
+ "ʋ": 136,
+ "ʌ": 138,
+ "ɣ": 139,
+ "ɤ": 140,
+ "χ": 142,
+ "ʎ": 143,
+ "ʒ": 147,
+ "ʔ": 148,
+ "ˈ": 156,
+ "ˌ": 157,
+ "ː": 158,
+ "ʰ": 162,
+ "ʲ": 164,
+ "↓": 169,
+ "→": 171,
+ "↗": 172,
+ "↘": 173,
+ "ᵻ": 177
+ }
+}
\ No newline at end of file
diff --git a/api/src/routers/__init__.py b/api/src/routers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..792d6005489ebee62cde02066f19c5521e620451
--- /dev/null
+++ b/api/src/routers/__init__.py
@@ -0,0 +1 @@
+#
diff --git a/api/src/routers/debug.py b/api/src/routers/debug.py
new file mode 100644
index 0000000000000000000000000000000000000000..8acb9fd7664be566f060ccc57424019244464aeb
--- /dev/null
+++ b/api/src/routers/debug.py
@@ -0,0 +1,209 @@
+import threading
+import time
+from datetime import datetime
+
+import psutil
+import torch
+from fastapi import APIRouter
+
+try:
+ import GPUtil
+
+ GPU_AVAILABLE = True
+except ImportError:
+ GPU_AVAILABLE = False
+
+router = APIRouter(tags=["debug"])
+
+
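+# Returns per-thread details plus RSS, e.g. curl http://localhost:8880/debug/threads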
+@router.get("/debug/threads")
+async def get_thread_info():
+ process = psutil.Process()
+ current_threads = threading.enumerate()
+
+ # Get per-thread CPU times
+ thread_details = []
+ for thread in current_threads:
+ thread_info = {
+ "name": thread.name,
+ "id": thread.ident,
+ "alive": thread.is_alive(),
+ "daemon": thread.daemon,
+ }
+ thread_details.append(thread_info)
+
+ return {
+ "total_threads": process.num_threads(),
+ "active_threads": len(current_threads),
+ "thread_names": [t.name for t in current_threads],
+ "thread_details": thread_details,
+ "memory_mb": process.memory_info().rss / 1024 / 1024,
+ }
+
+
+@router.get("/debug/storage")
+async def get_storage_info():
+ # Get disk partitions
+ partitions = psutil.disk_partitions()
+ storage_info = []
+
+ for partition in partitions:
+ try:
+ usage = psutil.disk_usage(partition.mountpoint)
+ storage_info.append(
+ {
+ "device": partition.device,
+ "mountpoint": partition.mountpoint,
+ "fstype": partition.fstype,
+ "total_gb": usage.total / (1024**3),
+ "used_gb": usage.used / (1024**3),
+ "free_gb": usage.free / (1024**3),
+ "percent_used": usage.percent,
+ }
+ )
+ except PermissionError:
+ continue
+
+ return {"storage_info": storage_info}
+
+
+@router.get("/debug/system")
+async def get_system_info():
+ process = psutil.Process()
+
+ # CPU Info
+ cpu_info = {
+ "cpu_count": psutil.cpu_count(),
+ "cpu_percent": psutil.cpu_percent(interval=1),
+ "per_cpu_percent": psutil.cpu_percent(interval=1, percpu=True),
+ "load_avg": psutil.getloadavg(),
+ }
+
+ # Memory Info
+ virtual_memory = psutil.virtual_memory()
+ swap_memory = psutil.swap_memory()
+ memory_info = {
+ "virtual": {
+ "total_gb": virtual_memory.total / (1024**3),
+ "available_gb": virtual_memory.available / (1024**3),
+ "used_gb": virtual_memory.used / (1024**3),
+ "percent": virtual_memory.percent,
+ },
+ "swap": {
+ "total_gb": swap_memory.total / (1024**3),
+ "used_gb": swap_memory.used / (1024**3),
+ "free_gb": swap_memory.free / (1024**3),
+ "percent": swap_memory.percent,
+ },
+ }
+
+ # Process Info
+ process_info = {
+ "pid": process.pid,
+ "status": process.status(),
+ "create_time": datetime.fromtimestamp(process.create_time()).isoformat(),
+ "cpu_percent": process.cpu_percent(),
+ "memory_percent": process.memory_percent(),
+ }
+
+ # Network Info
+ network_info = {
+ "connections": len(process.net_connections()),
+ "network_io": psutil.net_io_counters()._asdict(),
+ }
+
+ # GPU Info if available
+ gpu_info = None
+ if torch.backends.mps.is_available():
+ gpu_info = {
+ "type": "MPS",
+ "available": True,
+ "device": "Apple Silicon",
+ "backend": "Metal",
+ }
+ elif GPU_AVAILABLE:
+ try:
+ gpus = GPUtil.getGPUs()
+ gpu_info = [
+ {
+ "id": gpu.id,
+ "name": gpu.name,
+ "load": gpu.load,
+ "memory": {
+ "total": gpu.memoryTotal,
+ "used": gpu.memoryUsed,
+ "free": gpu.memoryFree,
+ "percent": (gpu.memoryUsed / gpu.memoryTotal) * 100,
+ },
+ "temperature": gpu.temperature,
+ }
+ for gpu in gpus
+ ]
+ except Exception:
+ gpu_info = "GPU information unavailable"
+
+ return {
+ "cpu": cpu_info,
+ "memory": memory_info,
+ "process": process_info,
+ "network": network_info,
+ "gpu": gpu_info,
+ }
+
+
+@router.get("/debug/session_pools")
+async def get_session_pool_info():
+ """Get information about ONNX session pools."""
+ from ..inference.model_manager import get_manager
+
+ manager = await get_manager()
+    # The Kokoro V1 manager keeps no ONNX session pools; fall back to an empty
+    # mapping so this endpoint degrades gracefully instead of raising
+    pools = getattr(manager, "_session_pools", {})
+ current_time = time.time()
+
+ pool_info = {}
+
+ # Get CPU pool info
+ if "onnx_cpu" in pools:
+ cpu_pool = pools["onnx_cpu"]
+ pool_info["cpu"] = {
+ "active_sessions": len(cpu_pool._sessions),
+ "max_sessions": cpu_pool._max_size,
+ "sessions": [
+ {"model": path, "age_seconds": current_time - info.last_used}
+ for path, info in cpu_pool._sessions.items()
+ ],
+ }
+
+ # Get GPU pool info
+ if "onnx_gpu" in pools:
+ gpu_pool = pools["onnx_gpu"]
+ pool_info["gpu"] = {
+ "active_sessions": len(gpu_pool._sessions),
+ "max_streams": gpu_pool._max_size,
+ "available_streams": len(gpu_pool._available_streams),
+ "sessions": [
+ {
+ "model": path,
+ "age_seconds": current_time - info.last_used,
+ "stream_id": info.stream_id,
+ }
+ for path, info in gpu_pool._sessions.items()
+ ],
+ }
+
+ # Add GPU memory info if available
+ if GPU_AVAILABLE:
+ try:
+ gpus = GPUtil.getGPUs()
+ if gpus:
+ gpu = gpus[0] # Assume first GPU
+ pool_info["gpu"]["memory"] = {
+ "total_mb": gpu.memoryTotal,
+ "used_mb": gpu.memoryUsed,
+ "free_mb": gpu.memoryFree,
+ "percent_used": (gpu.memoryUsed / gpu.memoryTotal) * 100,
+ }
+ except Exception:
+ pass
+
+ return pool_info
diff --git a/api/src/routers/development.py b/api/src/routers/development.py
new file mode 100644
index 0000000000000000000000000000000000000000..e74911967c3e204e31813fe0feba78a4e7f889ff
--- /dev/null
+++ b/api/src/routers/development.py
@@ -0,0 +1,408 @@
+import base64
+import json
+import os
+import re
+from pathlib import Path
+from typing import AsyncGenerator, List, Tuple, Union
+
+import numpy as np
+import torch
+from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
+from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
+from kokoro import KPipeline
+from loguru import logger
+
+from ..core.config import settings
+from ..inference.base import AudioChunk
+from ..services.audio import AudioNormalizer, AudioService
+from ..services.streaming_audio_writer import StreamingAudioWriter
+from ..services.temp_manager import TempFileWriter
+from ..services.text_processing import smart_split
+from ..services.tts_service import TTSService
+from ..structures import CaptionedSpeechRequest, CaptionedSpeechResponse, WordTimestamp
+from ..structures.custom_responses import JSONStreamingResponse
+from ..structures.text_schemas import (
+ GenerateFromPhonemesRequest,
+ PhonemeRequest,
+ PhonemeResponse,
+)
+from .openai_compatible import process_and_validate_voices, stream_audio_chunks
+
+router = APIRouter(tags=["text processing"])
+
+
+async def get_tts_service() -> TTSService:
+ """Dependency to get TTSService instance"""
+ return (
+ await TTSService.create()
+ ) # Create service with properly initialized managers
+
+
+@router.post("/dev/phonemize", response_model=PhonemeResponse)
+async def phonemize_text(request: PhonemeRequest) -> PhonemeResponse:
+ """Convert text to phonemes using Kokoro's quiet mode.
+
+ Args:
+ request: Request containing text and language
+
+ Returns:
+ Phonemes and token IDs
+ """
+ try:
+ if not request.text:
+ raise ValueError("Text cannot be empty")
+
+ # Initialize Kokoro pipeline in quiet mode (no model)
+ pipeline = KPipeline(lang_code=request.language, model=False)
+
+ # Get first result from pipeline (we only need one since we're not chunking)
+ for result in pipeline(request.text):
+ # result.graphemes = original text
+ # result.phonemes = phonemized text
+ # result.tokens = token objects (if available)
+ return PhonemeResponse(phonemes=result.phonemes, tokens=[])
+
+ raise ValueError("Failed to generate phonemes")
+ except ValueError as e:
+ logger.warning(f"Invalid request for phoneme generation: {str(e)}")
+ raise HTTPException(
+ status_code=400, detail={"error": "validation_error", "message": str(e)}
+ )
+ except Exception as e:
+ logger.error(f"Error in phoneme generation: {str(e)}")
+ raise HTTPException(
+ status_code=500, detail={"error": "Server error", "message": str(e)}
+ )
+
+
+@router.post("/dev/generate_from_phonemes")
+async def generate_from_phonemes(
+ request: GenerateFromPhonemesRequest,
+ client_request: Request,
+ tts_service: TTSService = Depends(get_tts_service),
+) -> StreamingResponse:
+ """Generate audio directly from phonemes using Kokoro's phoneme format"""
+ try:
+ # Basic validation
+ if not isinstance(request.phonemes, str):
+ raise ValueError("Phonemes must be a string")
+ if not request.phonemes:
+ raise ValueError("Phonemes cannot be empty")
+
+ # Create streaming audio writer and normalizer
+ writer = StreamingAudioWriter(format="wav", sample_rate=24000, channels=1)
+ normalizer = AudioNormalizer()
+
+ async def generate_chunks():
+ try:
+ # Generate audio from phonemes
+ chunk_audio, _ = await tts_service.generate_from_phonemes(
+ phonemes=request.phonemes, # Pass complete phoneme string
+ voice=request.voice,
+ speed=1.0,
+ )
+
+ if chunk_audio is not None:
+ # Normalize audio before writing
+ normalized_audio = normalizer.normalize(chunk_audio)
+ # Write chunk and yield bytes
+ chunk_bytes = writer.write_chunk(normalized_audio)
+ if chunk_bytes:
+ yield chunk_bytes
+
+ # Finalize and yield remaining bytes
+ final_bytes = writer.write_chunk(finalize=True)
+ if final_bytes:
+ yield final_bytes
+ else:
+ raise ValueError("Failed to generate audio data")
+
+ except Exception as e:
+ logger.error(f"Error in audio generation: {str(e)}")
+ # Clean up writer on error
+ writer.close()
+ # Re-raise the original exception
+ raise
+
+ return StreamingResponse(
+ generate_chunks(),
+ media_type="audio/wav",
+ headers={
+ "Content-Disposition": "attachment; filename=speech.wav",
+ "X-Accel-Buffering": "no",
+ "Cache-Control": "no-cache",
+ "Transfer-Encoding": "chunked",
+ },
+ )
+
+ except ValueError as e:
+ logger.error(f"Error generating audio: {str(e)}")
+ raise HTTPException(
+ status_code=400,
+ detail={
+ "error": "validation_error",
+ "message": str(e),
+ "type": "invalid_request_error",
+ },
+ )
+ except Exception as e:
+ logger.error(f"Error generating audio: {str(e)}")
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "processing_error",
+ "message": str(e),
+ "type": "server_error",
+ },
+ )
+
+
+@router.post("/dev/captioned_speech")
+async def create_captioned_speech(
+ request: CaptionedSpeechRequest,
+ client_request: Request,
+ x_raw_response: str = Header(None, alias="x-raw-response"),
+ tts_service: TTSService = Depends(get_tts_service),
+):
+ """Generate audio with word-level timestamps using streaming approach"""
+
+ try:
+ voice_name = await process_and_validate_voices(request.voice, tts_service)
+
+ # Set content type based on format
+ content_type = {
+ "mp3": "audio/mpeg",
+ "opus": "audio/opus",
+ "m4a": "audio/mp4",
+ "flac": "audio/flac",
+ "wav": "audio/wav",
+ "pcm": "audio/pcm",
+ }.get(request.response_format, f"audio/{request.response_format}")
+
+ writer = StreamingAudioWriter(request.response_format, sample_rate=24000)
+ # Check if streaming is requested (default for OpenAI client)
+ if request.stream:
+ # Create generator but don't start it yet
+ generator = stream_audio_chunks(
+ tts_service, request, client_request, writer
+ )
+
+ # If download link requested, wrap generator with temp file writer
+ if request.return_download_link:
+ from ..services.temp_manager import TempFileWriter
+
+ temp_writer = TempFileWriter(request.response_format)
+ await temp_writer.__aenter__() # Initialize temp file
+
+ # Get download path immediately after temp file creation
+ download_path = temp_writer.download_path
+
+ # Create response headers with download path
+ headers = {
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "X-Accel-Buffering": "no",
+ "Cache-Control": "no-cache",
+ "Transfer-Encoding": "chunked",
+ "X-Download-Path": download_path,
+ }
+
+ # Create async generator for streaming
+ async def dual_output():
+ try:
+ # The timestamp accumulator holds word timestamps from chunks
+ # that carry no audio so they can be attached to the next
+ # chunk that does produce audio.
+ timestamp_accumulator = []
+
+ # Write chunks to temp file and stream
+ async for chunk_data in generator:
+ if chunk_data.output: # Skip empty chunks
+ await temp_writer.write(chunk_data.output)
+ base64_chunk = base64.b64encode(
+ chunk_data.output
+ ).decode("utf-8")
+
+ # Prepend any timestamps waiting in the accumulator
+ if chunk_data.word_timestamps is not None:
+ chunk_data.word_timestamps = (
+ timestamp_accumulator + chunk_data.word_timestamps
+ )
+ else:
+ chunk_data.word_timestamps = timestamp_accumulator
+ timestamp_accumulator = []
+
+ yield CaptionedSpeechResponse(
+ audio=base64_chunk,
+ audio_format=content_type,
+ timestamps=chunk_data.word_timestamps,
+ )
+ else:
+ if (
+ chunk_data.word_timestamps is not None
+ and len(chunk_data.word_timestamps) > 0
+ ):
+ timestamp_accumulator += chunk_data.word_timestamps
+
+ # Finalize the temp file
+ await temp_writer.finalize()
+ except Exception as e:
+ logger.error(f"Error in dual output streaming: {e}")
+ await temp_writer.__aexit__(type(e), e, e.__traceback__)
+ raise
+ finally:
+ # Ensure temp writer is closed
+ if not temp_writer._finalized:
+ await temp_writer.__aexit__(None, None, None)
+ writer.close()
+
+ # Stream with temp file writing
+ return JSONStreamingResponse(
+ dual_output(), media_type="application/json", headers=headers
+ )
+
+ async def single_output():
+ try:
+ # The timestamp accumulator holds word timestamps from chunks
+ # that carry no audio so they can be attached to the next
+ # chunk that does produce audio.
+ timestamp_accumulator = []
+
+ # Stream chunks
+ async for chunk_data in generator:
+ if chunk_data.output: # Skip empty chunks
+ # Encode the chunk bytes into base64
+ base64_chunk = base64.b64encode(chunk_data.output).decode(
+ "utf-8"
+ )
+
+ # Prepend any timestamps waiting in the accumulator
+ if chunk_data.word_timestamps is not None:
+ chunk_data.word_timestamps = (
+ timestamp_accumulator + chunk_data.word_timestamps
+ )
+ else:
+ chunk_data.word_timestamps = timestamp_accumulator
+ timestamp_accumulator = []
+
+ yield CaptionedSpeechResponse(
+ audio=base64_chunk,
+ audio_format=content_type,
+ timestamps=chunk_data.word_timestamps,
+ )
+ else:
+ if (
+ chunk_data.word_timestamps is not None
+ and len(chunk_data.word_timestamps) > 0
+ ):
+ timestamp_accumulator += chunk_data.word_timestamps
+
+ except Exception as e:
+ logger.error(f"Error in single output streaming: {e}")
+ writer.close()
+ raise
+
+ # Standard streaming without download link
+ return JSONStreamingResponse(
+ single_output(),
+ media_type="application/json",
+ headers={
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "X-Accel-Buffering": "no",
+ "Cache-Control": "no-cache",
+ "Transfer-Encoding": "chunked",
+ },
+ )
+ else:
+ # Generate complete audio using public interface
+ audio_data = await tts_service.generate_audio(
+ text=request.input,
+ voice=voice_name,
+ writer=writer,
+ speed=request.speed,
+ return_timestamps=request.return_timestamps,
+ normalization_options=request.normalization_options,
+ lang_code=request.lang_code,
+ )
+
+ audio_data = await AudioService.convert_audio(
+ audio_data,
+ request.response_format,
+ writer,
+ is_last_chunk=False,
+ trim_audio=False,
+ )
+
+ # Convert to requested format with proper finalization
+ final = await AudioService.convert_audio(
+ AudioChunk(np.array([], dtype=np.int16)),
+ request.response_format,
+ writer,
+ is_last_chunk=True,
+ )
+ output = audio_data.output + final.output
+
+ base64_output = base64.b64encode(output).decode("utf-8")
+
+ content = CaptionedSpeechResponse(
+ audio=base64_output,
+ audio_format=content_type,
+ timestamps=audio_data.word_timestamps,
+ ).model_dump()
+
+ writer.close()
+
+ return JSONResponse(
+ content=content,
+ media_type="application/json",
+ headers={
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "Cache-Control": "no-cache", # Prevent caching
+ },
+ )
+
+ except ValueError as e:
+ # Handle validation errors
+ logger.warning(f"Invalid request: {str(e)}")
+
+ try:
+ writer.close()
+ except Exception:
+ pass
+
+ raise HTTPException(
+ status_code=400,
+ detail={
+ "error": "validation_error",
+ "message": str(e),
+ "type": "invalid_request_error",
+ },
+ )
+ except RuntimeError as e:
+ # Handle runtime/processing errors
+ logger.error(f"Processing error: {str(e)}")
+
+ try:
+ writer.close()
+ except Exception:
+ pass
+
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "processing_error",
+ "message": str(e),
+ "type": "server_error",
+ },
+ )
+ except Exception as e:
+ # Handle unexpected errors
+ logger.error(f"Unexpected error in captioned speech generation: {str(e)}")
+
+ try:
+ writer.close()
+ except Exception:
+ pass
+
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "processing_error",
+ "message": str(e),
+ "type": "server_error",
+ },
+ )
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
new file mode 100644
index 0000000000000000000000000000000000000000..4819bc5c7085b6815f2c30205d8c6c1efe693fc9
--- /dev/null
+++ b/api/src/routers/openai_compatible.py
@@ -0,0 +1,662 @@
+"""OpenAI-compatible router for text-to-speech"""
+
+import io
+import json
+import os
+import re
+import tempfile
+from typing import AsyncGenerator, Dict, List, Tuple, Union
+
+import aiofiles
+import numpy as np
+import torch
+from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
+from fastapi.responses import FileResponse, StreamingResponse
+from loguru import logger
+
+from ..core.config import settings
+from ..inference.base import AudioChunk
+from ..services.audio import AudioService
+from ..services.streaming_audio_writer import StreamingAudioWriter
+from ..services.tts_service import TTSService
+from ..structures import OpenAISpeechRequest
+from ..structures.schemas import CaptionedSpeechRequest
+
+
+# Load OpenAI mappings
+def load_openai_mappings() -> Dict:
+ """Load OpenAI voice and model mappings from JSON"""
+ api_dir = os.path.dirname(os.path.dirname(__file__))
+ mapping_path = os.path.join(api_dir, "core", "openai_mappings.json")
+ try:
+ with open(mapping_path, "r") as f:
+ return json.load(f)
+ except Exception as e:
+ logger.error(f"Failed to load OpenAI mappings: {e}")
+ return {"models": {}, "voices": {}}
+
+
+# Global mappings
+_openai_mappings = load_openai_mappings()
+
+
+router = APIRouter(
+ tags=["OpenAI Compatible TTS"],
+ responses={404: {"description": "Not found"}},
+)
+
+# Global TTSService instance with lock
+_tts_service = None
+_init_lock = None
+
+
+async def get_tts_service() -> TTSService:
+ """Get global TTSService instance"""
+ global _tts_service, _init_lock
+
+ # Create lock if needed
+ if _init_lock is None:
+ import asyncio
+
+ _init_lock = asyncio.Lock()
+
+ # Initialize service if needed
+ if _tts_service is None:
+ async with _init_lock:
+ # Double check pattern
+ if _tts_service is None:
+ _tts_service = await TTSService.create()
+ logger.info("Created global TTSService instance")
+
+ return _tts_service
+
+
+def get_model_name(model: str) -> str:
+ """Get internal model name from OpenAI model name"""
+ base_name = _openai_mappings["models"].get(model)
+ if not base_name:
+ raise ValueError(f"Unsupported model: {model}")
+ return base_name + ".pth"
+
+
+async def process_and_validate_voices(
+ voice_input: Union[str, List[str]], tts_service: TTSService
+) -> str:
+ """Process voice input, handling both string and list formats
+
+ Returns:
+ Voice name to use (with weights if specified)
+ """
+ voices = []
+ # Convert input to list of voices
+ if isinstance(voice_input, str):
+ voice_input = voice_input.replace(" ", "").strip()
+
+ if not voice_input or voice_input[-1] in "+-" or voice_input[0] in "+-":
+ raise ValueError("Voice combination contains empty combine items")
+
+ if re.search(r"[+-]{2,}", voice_input) is not None:
+ raise ValueError("Voice combination contains empty combine items")
+ voices = re.split(r"([-+])", voice_input)
+ else:
+ # Interleave list entries with "+" so the list form matches the
+ # ["voice", "+", "voice", ...] structure produced by re.split above
+ voices = [part for item in voice_input for part in (item, "+")][:-1]
+
+ available_voices = await tts_service.list_voices()
+
+ for voice_index in range(0, len(voices), 2):
+ mapped_voice = voices[voice_index].split("(")
+ mapped_voice = list(map(str.strip, mapped_voice))
+
+ if len(mapped_voice) > 2:
+ raise ValueError(
+ f"Voice '{voices[voice_index]}' contains too many weight items"
+ )
+
+ if voices[voice_index].count(")") > 1:
+ raise ValueError(
+ f"Voice '{voices[voice_index]}' contains too many weight items"
+ )
+
+ mapped_voice[0] = _openai_mappings["voices"].get(
+ mapped_voice[0], mapped_voice[0]
+ )
+
+ if mapped_voice[0] not in available_voices:
+ raise ValueError(
+ f"Voice '{mapped_voice[0]}' not found. Available voices: {', '.join(sorted(available_voices))}"
+ )
+
+ voices[voice_index] = "(".join(mapped_voice)
+
+ return "".join(voices)
+
+
+async def stream_audio_chunks(
+ tts_service: TTSService,
+ request: Union[OpenAISpeechRequest, CaptionedSpeechRequest],
+ client_request: Request,
+ writer: StreamingAudioWriter,
+) -> AsyncGenerator[AudioChunk, None]:
+ """Stream audio chunks as they're generated with client disconnect handling"""
+ voice_name = await process_and_validate_voices(request.voice, tts_service)
+ unique_properties = {"return_timestamps": False}
+ if hasattr(request, "return_timestamps"):
+ unique_properties["return_timestamps"] = request.return_timestamps
+
+ try:
+ async for chunk_data in tts_service.generate_audio_stream(
+ text=request.input,
+ voice=voice_name,
+ writer=writer,
+ speed=request.speed,
+ output_format=request.response_format,
+ lang_code=request.lang_code,
+ normalization_options=request.normalization_options,
+ return_timestamps=unique_properties["return_timestamps"],
+ ):
+ # Check if client is still connected
+ is_disconnected = client_request.is_disconnected
+ if callable(is_disconnected):
+ is_disconnected = await is_disconnected()
+ if is_disconnected:
+ logger.info("Client disconnected, stopping audio generation")
+ break
+
+ yield chunk_data
+ except Exception as e:
+ logger.error(f"Error in audio streaming: {str(e)}")
+ # Let the exception propagate to trigger cleanup
+ raise
+
+
+@router.post("/audio/speech")
+async def create_speech(
+ request: OpenAISpeechRequest,
+ client_request: Request,
+ x_raw_response: str = Header(None, alias="x-raw-response"),
+):
+ """OpenAI-compatible endpoint for text-to-speech"""
+ # Validate model before processing request
+ if request.model not in _openai_mappings["models"]:
+ raise HTTPException(
+ status_code=400,
+ detail={
+ "error": "invalid_model",
+ "message": f"Unsupported model: {request.model}",
+ "type": "invalid_request_error",
+ },
+ )
+
+ try:
+ # model_name = get_model_name(request.model)
+ tts_service = await get_tts_service()
+ voice_name = await process_and_validate_voices(request.voice, tts_service)
+
+ # Set content type based on format
+ content_type = {
+ "mp3": "audio/mpeg",
+ "opus": "audio/opus",
+ "aac": "audio/aac",
+ "flac": "audio/flac",
+ "wav": "audio/wav",
+ "pcm": "audio/pcm",
+ }.get(request.response_format, f"audio/{request.response_format}")
+
+ writer = StreamingAudioWriter(request.response_format, sample_rate=24000)
+
+ # Check if streaming is requested (default for OpenAI client)
+ if request.stream:
+ # Create generator but don't start it yet
+ generator = stream_audio_chunks(
+ tts_service, request, client_request, writer
+ )
+
+ # If download link requested, wrap generator with temp file writer
+ if request.return_download_link:
+ from ..services.temp_manager import TempFileWriter
+
+ # Use download_format if specified, otherwise use response_format
+ output_format = request.download_format or request.response_format
+ temp_writer = TempFileWriter(output_format)
+ await temp_writer.__aenter__() # Initialize temp file
+
+ # Get download path immediately after temp file creation
+ download_path = temp_writer.download_path
+
+ # Create response headers with download path
+ headers = {
+ "Content-Disposition": f"attachment; filename=speech.{output_format}",
+ "X-Accel-Buffering": "no",
+ "Cache-Control": "no-cache",
+ "Transfer-Encoding": "chunked",
+ "X-Download-Path": download_path,
+ }
+
+ # Add header to indicate if temp file writing is available
+ if temp_writer._write_error:
+ headers["X-Download-Status"] = "unavailable"
+
+ # Create async generator for streaming
+ async def dual_output():
+ try:
+ # Write chunks to temp file and stream
+ async for chunk_data in generator:
+ if chunk_data.output: # Skip empty chunks
+ await temp_writer.write(chunk_data.output)
+ yield chunk_data.output
+
+ # Finalize the temp file
+ await temp_writer.finalize()
+ except Exception as e:
+ logger.error(f"Error in dual output streaming: {e}")
+ await temp_writer.__aexit__(type(e), e, e.__traceback__)
+ raise
+ finally:
+ # Ensure temp writer is closed
+ if not temp_writer._finalized:
+ await temp_writer.__aexit__(None, None, None)
+ writer.close()
+
+ # Stream with temp file writing
+ return StreamingResponse(
+ dual_output(), media_type=content_type, headers=headers
+ )
+
+ async def single_output():
+ try:
+ # Stream chunks
+ async for chunk_data in generator:
+ if chunk_data.output: # Skip empty chunks
+ yield chunk_data.output
+ except Exception as e:
+ logger.error(f"Error in single output streaming: {e}")
+ writer.close()
+ raise
+
+ # Standard streaming without download link
+ return StreamingResponse(
+ single_output(),
+ media_type=content_type,
+ headers={
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "X-Accel-Buffering": "no",
+ "Cache-Control": "no-cache",
+ "Transfer-Encoding": "chunked",
+ },
+ )
+ else:
+ headers = {
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "Cache-Control": "no-cache", # Prevent caching
+ }
+
+ # Generate complete audio using public interface
+ audio_data = await tts_service.generate_audio(
+ text=request.input,
+ voice=voice_name,
+ writer=writer,
+ speed=request.speed,
+ normalization_options=request.normalization_options,
+ lang_code=request.lang_code,
+ )
+
+ audio_data = await AudioService.convert_audio(
+ audio_data,
+ request.response_format,
+ writer,
+ is_last_chunk=False,
+ trim_audio=False,
+ )
+
+ # Convert to requested format with proper finalization
+ final = await AudioService.convert_audio(
+ AudioChunk(np.array([], dtype=np.int16)),
+ request.response_format,
+ writer,
+ is_last_chunk=True,
+ )
+ output = audio_data.output + final.output
+
+ if request.return_download_link:
+ from ..services.temp_manager import TempFileWriter
+
+ # Use download_format if specified, otherwise use response_format
+ output_format = request.download_format or request.response_format
+ temp_writer = TempFileWriter(output_format)
+ await temp_writer.__aenter__() # Initialize temp file
+
+ # Get download path immediately after temp file creation
+ download_path = temp_writer.download_path
+ headers["X-Download-Path"] = download_path
+
+ try:
+ # Write chunks to temp file
+ logger.info("Writing chunks to tempory file for download")
+ await temp_writer.write(output)
+ # Finalize the temp file
+ await temp_writer.finalize()
+
+ except Exception as e:
+ logger.error(f"Error in dual output: {e}")
+ await temp_writer.__aexit__(type(e), e, e.__traceback__)
+ raise
+ finally:
+ # Ensure temp writer is closed
+ if not temp_writer._finalized:
+ await temp_writer.__aexit__(None, None, None)
+ writer.close()
+
+ return Response(
+ content=output,
+ media_type=content_type,
+ headers=headers,
+ )
+
+ except ValueError as e:
+ # Handle validation errors
+ logger.warning(f"Invalid request: {str(e)}")
+
+ try:
+ writer.close()
+ except Exception:
+ pass
+
+ raise HTTPException(
+ status_code=400,
+ detail={
+ "error": "validation_error",
+ "message": str(e),
+ "type": "invalid_request_error",
+ },
+ )
+ except RuntimeError as e:
+ # Handle runtime/processing errors
+ logger.error(f"Processing error: {str(e)}")
+
+ try:
+ writer.close()
+ except Exception:
+ pass
+
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "processing_error",
+ "message": str(e),
+ "type": "server_error",
+ },
+ )
+ except Exception as e:
+ # Handle unexpected errors
+ logger.error(f"Unexpected error in speech generation: {str(e)}")
+
+ try:
+ writer.close()
+ except Exception:
+ pass
+
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "processing_error",
+ "message": str(e),
+ "type": "server_error",
+ },
+ )
+
+
+@router.get("/download/{filename}")
+async def download_audio_file(filename: str):
+ """Download a generated audio file from temp storage"""
+ try:
+ from ..core.paths import _find_file, get_content_type
+
+ # Search for file in temp directory
+ file_path = await _find_file(
+ filename=filename, search_paths=[settings.temp_file_dir]
+ )
+
+ # Get content type from path helper
+ content_type = await get_content_type(file_path)
+
+ return FileResponse(
+ file_path,
+ media_type=content_type,
+ filename=filename,
+ headers={
+ "Cache-Control": "no-cache",
+ "Content-Disposition": f"attachment; filename={filename}",
+ },
+ )
+
+ except Exception as e:
+ logger.error(f"Error serving download file {filename}: {e}")
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "server_error",
+ "message": "Failed to serve audio file",
+ "type": "server_error",
+ },
+ )
+
+
+@router.get("/models")
+async def list_models():
+ """List all available models"""
+ try:
+ # Create standard model list
+ models = [
+ {
+ "id": "tts-1",
+ "object": "model",
+ "created": 1686935002,
+ "owned_by": "kokoro",
+ },
+ {
+ "id": "tts-1-hd",
+ "object": "model",
+ "created": 1686935002,
+ "owned_by": "kokoro",
+ },
+ {
+ "id": "kokoro",
+ "object": "model",
+ "created": 1686935002,
+ "owned_by": "kokoro",
+ },
+ ]
+
+ return {"object": "list", "data": models}
+ except Exception as e:
+ logger.error(f"Error listing models: {str(e)}")
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "server_error",
+ "message": "Failed to retrieve model list",
+ "type": "server_error",
+ },
+ )
+
+
+@router.get("/models/{model}")
+async def retrieve_model(model: str):
+ """Retrieve a specific model"""
+ try:
+ # Define available models
+ models = {
+ "tts-1": {
+ "id": "tts-1",
+ "object": "model",
+ "created": 1686935002,
+ "owned_by": "kokoro",
+ },
+ "tts-1-hd": {
+ "id": "tts-1-hd",
+ "object": "model",
+ "created": 1686935002,
+ "owned_by": "kokoro",
+ },
+ "kokoro": {
+ "id": "kokoro",
+ "object": "model",
+ "created": 1686935002,
+ "owned_by": "kokoro",
+ },
+ }
+
+ # Check if requested model exists
+ if model not in models:
+ raise HTTPException(
+ status_code=404,
+ detail={
+ "error": "model_not_found",
+ "message": f"Model '{model}' not found",
+ "type": "invalid_request_error",
+ },
+ )
+
+ # Return the specific model
+ return models[model]
+ except HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"Error retrieving model {model}: {str(e)}")
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "server_error",
+ "message": "Failed to retrieve model information",
+ "type": "server_error",
+ },
+ )
+
+
+@router.get("/audio/voices")
+async def list_voices():
+ """List all available voices for text-to-speech"""
+ try:
+ tts_service = await get_tts_service()
+ voices = await tts_service.list_voices()
+ return {"voices": voices}
+ except Exception as e:
+ logger.error(f"Error listing voices: {str(e)}")
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "server_error",
+ "message": "Failed to retrieve voice list",
+ "type": "server_error",
+ },
+ )
+
+
+@router.post("/audio/voices/combine")
+async def combine_voices(request: Union[str, List[str]]):
+ """Combine multiple voices into a new voice and return the .pt file.
+
+ Args:
+ request: Either a string with voices separated by + (e.g. "voice1+voice2")
+ or a list of voice names to combine
+
+ Returns:
+ FileResponse with the combined voice .pt file
+
+ Raises:
+ HTTPException:
+ - 400: Invalid request (wrong number of voices, voice not found)
+ - 500: Server error (file system issues, combination failed)
+ """
+ # Check if local voice saving is allowed
+ if not settings.allow_local_voice_saving:
+ raise HTTPException(
+ status_code=403,
+ detail={
+ "error": "permission_denied",
+ "message": "Local voice saving is disabled",
+ "type": "permission_error",
+ },
+ )
+
+ try:
+ # Convert input to list of voices
+ if isinstance(request, str):
+ # Check if it's an OpenAI voice name
+ mapped_voice = _openai_mappings["voices"].get(request)
+ if mapped_voice:
+ request = mapped_voice
+ voices = [v.strip() for v in request.split("+") if v.strip()]
+ else:
+ # For list input, map each voice if it's an OpenAI voice name
+ voices = [_openai_mappings["voices"].get(v, v) for v in request]
+ voices = [v.strip() for v in voices if v.strip()]
+
+ if not voices:
+ raise ValueError("No voices provided")
+
+ # For multiple voices, validate base voices exist
+ tts_service = await get_tts_service()
+ available_voices = await tts_service.list_voices()
+ for voice in voices:
+ if voice not in available_voices:
+ raise ValueError(
+ f"Base voice '{voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
+ )
+
+ # Combine voices
+ combined_tensor = await tts_service.combine_voices(voices=voices)
+ combined_name = "+".join(voices)
+
+ # Save to temp file
+ temp_dir = tempfile.gettempdir()
+ voice_path = os.path.join(temp_dir, f"{combined_name}.pt")
+ buffer = io.BytesIO()
+ torch.save(combined_tensor, buffer)
+ async with aiofiles.open(voice_path, "wb") as f:
+ await f.write(buffer.getvalue())
+
+ return FileResponse(
+ voice_path,
+ media_type="application/octet-stream",
+ filename=f"{combined_name}.pt",
+ headers={
+ "Content-Disposition": f"attachment; filename={combined_name}.pt",
+ "Cache-Control": "no-cache",
+ },
+ )
+
+ except ValueError as e:
+ logger.warning(f"Invalid voice combination request: {str(e)}")
+ raise HTTPException(
+ status_code=400,
+ detail={
+ "error": "validation_error",
+ "message": str(e),
+ "type": "invalid_request_error",
+ },
+ )
+ except RuntimeError as e:
+ logger.error(f"Voice combination processing error: {str(e)}")
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "processing_error",
+ "message": "Failed to process voice combination request",
+ "type": "server_error",
+ },
+ )
+ except Exception as e:
+ logger.error(f"Unexpected error in voice combination: {str(e)}")
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "server_error",
+ "message": "An unexpected error occurred",
+ "type": "server_error",
+ },
+ )
diff --git a/api/src/routers/web_player.py b/api/src/routers/web_player.py
new file mode 100644
index 0000000000000000000000000000000000000000..65e6208b8147c066c25c854c44ba5aa68d0b5ec1
--- /dev/null
+++ b/api/src/routers/web_player.py
@@ -0,0 +1,49 @@
+"""Web player router with async file serving."""
+
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import Response
+from loguru import logger
+
+from ..core.config import settings
+from ..core.paths import get_content_type, get_web_file_path, read_bytes
+
+router = APIRouter(
+ tags=["Web Player"],
+ responses={404: {"description": "Not found"}},
+)
+
+
+@router.get("/{filename:path}")
+async def serve_web_file(filename: str):
+ """Serve web player static files asynchronously."""
+ if not settings.enable_web_player:
+ raise HTTPException(status_code=404, detail="Web player is disabled")
+
+ try:
+ # Default to index.html for root path
+ if filename == "" or filename == "/":
+ filename = "index.html"
+
+ # Get file path
+ file_path = await get_web_file_path(filename)
+
+ # Read file content
+ content = await read_bytes(file_path)
+
+ # Get content type
+ content_type = await get_content_type(file_path)
+
+ return Response(
+ content=content,
+ media_type=content_type,
+ headers={
+ "Cache-Control": "no-cache", # Prevent caching during development
+ },
+ )
+
+ except RuntimeError as e:
+ logger.warning(f"Web file not found: {filename}")
+ raise HTTPException(status_code=404, detail=str(e))
+ except Exception as e:
+ logger.error(f"Error serving web file {filename}: {e}")
+ raise HTTPException(status_code=500, detail="Internal server error")
diff --git a/api/src/services/__init__.py b/api/src/services/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..82cf76eeae00425e08856bca227a9b60cabc7083
--- /dev/null
+++ b/api/src/services/__init__.py
@@ -0,0 +1,3 @@
+from .tts_service import TTSService
+
+__all__ = ["TTSService"]
diff --git a/api/src/services/audio.py b/api/src/services/audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d1d3ff6457c55493dd74b57098d19e8ffe0e5e7
--- /dev/null
+++ b/api/src/services/audio.py
@@ -0,0 +1,248 @@
+"""Audio conversion service"""
+
+import math
+
+import numpy as np
+from loguru import logger
+
+from ..core.config import settings
+from ..inference.base import AudioChunk
+from .streaming_audio_writer import StreamingAudioWriter
+
+
+class AudioNormalizer:
+ """Handles audio normalization state for a single stream"""
+
+ def __init__(self):
+ self.chunk_trim_ms = settings.gap_trim_ms
+ self.sample_rate = 24000 # Sample rate of the audio
+ self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
+ self.samples_to_pad_start = int(50 * self.sample_rate / 1000)
+
+ def find_first_last_non_silent(
+ self,
+ audio_data: np.ndarray,
+ chunk_text: str,
+ speed: float,
+ silence_threshold_db: int = -45,
+ is_last_chunk: bool = False,
+ ) -> tuple[int, int]:
+ """Finds the indices of the first and last non-silent samples in audio data.
+
+ Args:
+ audio_data: Input audio data as numpy array
+ chunk_text: The text sent to the model to generate the resulting speech
+ speed: The speaking speed of the voice
+ silence_threshold_db: How quiet audio has to be to be considered silent
+ is_last_chunk: Whether this is the last chunk
+
+ Returns:
+ A tuple with the start of the non silent portion and with the end of the non silent portion
+ """
+
+ pad_multiplier = 1
+ split_character = chunk_text.strip()
+ if len(split_character) > 0:
+ split_character = split_character[-1]
+ if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
+ pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[
+ split_character
+ ]
+
+ if not is_last_chunk:
+ samples_to_pad_end = max(
+ int(
+ (
+ settings.dynamic_gap_trim_padding_ms
+ * self.sample_rate
+ * pad_multiplier
+ )
+ / 1000
+ )
+ - self.samples_to_pad_start,
+ 0,
+ )
+ else:
+ samples_to_pad_end = self.samples_to_pad_start
+ # Convert dBFS threshold to amplitude
+ amplitude_threshold = np.iinfo(audio_data.dtype).max * (
+ 10 ** (silence_threshold_db / 20)
+ )
+ # Find the first samples above the silence threshold at the start and end of the audio
+ non_silent_index_start, non_silent_index_end = None, None
+
+ for i in range(len(audio_data)):
+ if audio_data[i] > amplitude_threshold:
+ non_silent_index_start = i
+ break
+
+ for i in range(len(audio_data) - 1, -1, -1):
+ if audio_data[i] > amplitude_threshold:
+ non_silent_index_end = i
+ break
+
+ # Handle the case where the entire audio is silent
+ if non_silent_index_start is None or non_silent_index_end is None:
+ return 0, len(audio_data)
+
+ return max(non_silent_index_start - self.samples_to_pad_start, 0), min(
+ non_silent_index_end + math.ceil(samples_to_pad_end / speed),
+ len(audio_data),
+ )
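+
+ # Worked example for the threshold above: for int16 audio and the default
+ # -45 dBFS, amplitude_threshold = 32767 * 10 ** (-45 / 20) ~= 184, so any
+ # sample above roughly 184 counts as non-silent.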
+
+ def normalize(self, audio_data: np.ndarray) -> np.ndarray:
+ """Convert audio data to int16 range
+
+ Args:
+ audio_data: Input audio data as numpy array
+ Returns:
+ Normalized audio data
+ """
+ if audio_data.dtype != np.int16:
+ # Scale directly to int16 range with clipping
+ return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
+ return audio_data
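+
+ # Example: a float32 sample of 0.5 becomes int(0.5 * 32767) = 16383, and
+ # values outside [-1.0, 1.0] are clipped to the int16 range.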
+
+
+class AudioService:
+ """Service for audio format conversions with streaming support"""
+
+ # Supported formats
+ SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm"}
+
+ # Default audio format settings balanced for speed and compression
+ DEFAULT_SETTINGS = {
+ "mp3": {
+ "bitrate_mode": "CONSTANT", # Faster than variable bitrate
+ "compression_level": 0.0, # Balanced compression
+ },
+ "opus": {
+ "compression_level": 0.0, # Good balance for speech
+ },
+ "flac": {
+ "compression_level": 0.0, # Light compression, still fast
+ },
+ "aac": {
+ "bitrate": "192k", # Default AAC bitrate
+ },
+ }
+
+ @staticmethod
+ async def convert_audio(
+ audio_chunk: AudioChunk,
+ output_format: str,
+ writer: StreamingAudioWriter,
+ speed: float = 1,
+ chunk_text: str = "",
+ is_last_chunk: bool = False,
+ trim_audio: bool = True,
+ normalizer: AudioNormalizer = None,
+ ) -> AudioChunk:
+ """Convert audio data to specified format with streaming support
+
+ Args:
+ audio_chunk: The audio chunk to convert
+ output_format: Target format (wav, mp3, opus, flac, aac, pcm)
+ writer: The StreamingAudioWriter to use
+ speed: The speaking speed of the voice
+ chunk_text: The text sent to the model to generate the resulting speech
+ is_last_chunk: Whether this is the last chunk
+ trim_audio: Whether audio should be trimmed
+ normalizer: Optional AudioNormalizer instance for consistent normalization
+
+ Returns:
+ The AudioChunk with its converted output bytes attached
+ """
+
+ try:
+ # Validate format
+ if output_format not in AudioService.SUPPORTED_FORMATS:
+ raise ValueError(f"Format {output_format} not supported")
+
+ # Always normalize audio to ensure proper amplitude scaling
+ if normalizer is None:
+ normalizer = AudioNormalizer()
+
+ audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
+
+ if trim_audio:
+ audio_chunk = AudioService.trim_audio(
+ audio_chunk, chunk_text, speed, is_last_chunk, normalizer
+ )
+
+ # Write audio data first
+ chunk_data = b""
+ if len(audio_chunk.audio) > 0:
+ chunk_data = writer.write_chunk(audio_chunk.audio)
+
+ # Then finalize if this is the last chunk
+ if is_last_chunk:
+ final_data = writer.write_chunk(finalize=True)
+
+ if final_data:
+ audio_chunk.output = final_data
+ return audio_chunk
+
+ if chunk_data:
+ audio_chunk.output = chunk_data
+ return audio_chunk
+
+ except Exception as e:
+ logger.error(f"Error converting audio stream to {output_format}: {str(e)}")
+ raise ValueError(
+ f"Failed to convert audio stream to {output_format}: {str(e)}"
+ )
+
+ @staticmethod
+ def trim_audio(
+ audio_chunk: AudioChunk,
+ chunk_text: str = "",
+ speed: float = 1,
+ is_last_chunk: bool = False,
+ normalizer: AudioNormalizer = None,
+ ) -> AudioChunk:
+ """Trim silence from start and end
+
+ Args:
+ audio_chunk: The audio chunk to trim
+ chunk_text: The text sent to the model to generate the resulting speech
+ speed: The speaking speed of the voice
+ is_last_chunk: Whether this is the last chunk
+ normalizer: Optional AudioNormalizer instance for consistent normalization
+
+ Returns:
+ Trimmed audio data
+ """
+ if normalizer is None:
+ normalizer = AudioNormalizer()
+
+ audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
+
+ trimmed_samples = 0
+ # Trim start and end if enough samples
+ if len(audio_chunk.audio) > (2 * normalizer.samples_to_trim):
+ audio_chunk.audio = audio_chunk.audio[
+ normalizer.samples_to_trim : -normalizer.samples_to_trim
+ ]
+ trimmed_samples += normalizer.samples_to_trim
+
+ # Find the non-silent portion and trim to it
+ start_index, end_index = normalizer.find_first_last_non_silent(
+ audio_chunk.audio, chunk_text, speed, is_last_chunk=is_last_chunk
+ )
+
+ audio_chunk.audio = audio_chunk.audio[start_index:end_index]
+ trimmed_samples += start_index
+
+ # Shift word timestamps to account for the samples trimmed from the start
+ if audio_chunk.word_timestamps is not None:
+ for timestamp in audio_chunk.word_timestamps:
+ timestamp.start_time -= trimmed_samples / 24000
+ timestamp.end_time -= trimmed_samples / 24000
+ return audio_chunk
diff --git a/api/src/services/streaming_audio_writer.py b/api/src/services/streaming_audio_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6ec2d6fbd49c7f78721f352fdaf712d84a6195c
--- /dev/null
+++ b/api/src/services/streaming_audio_writer.py
@@ -0,0 +1,100 @@
+"""Audio conversion service with proper streaming support"""
+
+from io import BytesIO
+from typing import Optional
+
+import av
+import numpy as np
+
+
+class StreamingAudioWriter:
+ """Handles streaming audio format conversions"""
+
+ def __init__(self, format: str, sample_rate: int, channels: int = 1):
+ self.format = format.lower()
+ self.sample_rate = sample_rate
+ self.channels = channels
+ self.bytes_written = 0
+ self.pts = 0
+
+ codec_map = {
+ "wav": "pcm_s16le",
+ "mp3": "mp3",
+ "opus": "libopus",
+ "flac": "flac",
+ "aac": "aac",
+ }
+ # Format-specific setup
+ if self.format in ["wav", "flac", "mp3", "pcm", "aac", "opus"]:
+ if self.format != "pcm":
+ self.output_buffer = BytesIO()
+ self.container = av.open(
+ self.output_buffer,
+ mode="w",
+ format=self.format if self.format != "aac" else "adts",
+ )
+ self.stream = self.container.add_stream(
+ codec_map[self.format],
+ sample_rate=self.sample_rate,
+ layout="mono" if self.channels == 1 else "stereo",
+ )
+ self.stream.bit_rate = 128000
+ else:
+ raise ValueError(f"Unsupported format: {format}")
+
+ def close(self):
+ if hasattr(self, "container"):
+ self.container.close()
+
+ if hasattr(self, "output_buffer"):
+ self.output_buffer.close()
+
+ def write_chunk(
+ self, audio_data: Optional[np.ndarray] = None, finalize: bool = False
+ ) -> bytes:
+ """Write a chunk of audio data and return bytes in the target format.
+
+ Args:
+ audio_data: Audio data to write, or None if finalizing
+ finalize: Whether this is the final write to close the stream
+ """
+
+ if finalize:
+ if self.format != "pcm":
+ packets = self.stream.encode(None)
+ for packet in packets:
+ self.container.mux(packet)
+
+ data = self.output_buffer.getvalue()
+ self.close()
+ return data
+
+ if audio_data is None or len(audio_data) == 0:
+ return b""
+
+ if self.format == "pcm":
+ # Write raw bytes
+ return audio_data.tobytes()
+ else:
+ frame = av.AudioFrame.from_ndarray(
+ audio_data.reshape(1, -1),
+ format="s16",
+ layout="mono" if self.channels == 1 else "stereo",
+ )
+ frame.sample_rate = self.sample_rate
+
+ frame.pts = self.pts
+ self.pts += frame.samples
+
+ packets = self.stream.encode(frame)
+ for packet in packets:
+ self.container.mux(packet)
+
+ data = self.output_buffer.getvalue()
+ self.output_buffer.seek(0)
+ self.output_buffer.truncate(0)
+ return data
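+
+# Minimal usage sketch (illustrative; the chunk values are made up):
+#
+# writer = StreamingAudioWriter(format="mp3", sample_rate=24000, channels=1)
+# encoded = writer.write_chunk(np.zeros(24000, dtype=np.int16)) # 1s of silence
+# encoded += writer.write_chunk(finalize=True) # flush the encoder and close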
diff --git a/api/src/services/temp_manager.py b/api/src/services/temp_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d92a9e1be687f884a7ea23520ee8df12a483581
--- /dev/null
+++ b/api/src/services/temp_manager.py
@@ -0,0 +1,170 @@
+"""Temporary file writer for audio downloads"""
+
+import os
+import tempfile
+import time
+
+import aiofiles
+import aiofiles.os
+from loguru import logger
+
+from ..core.config import settings
+
+
+async def cleanup_temp_files() -> None:
+ """Clean up old temp files"""
+ try:
+ if not await aiofiles.os.path.exists(settings.temp_file_dir):
+ await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)
+ return
+
+ # Get all temp files with stats
+ files = []
+ total_size = 0
+
+ # Use os.scandir for sync iteration, but aiofiles.os.stat for async stats
+ for entry in os.scandir(settings.temp_file_dir):
+ if entry.is_file():
+ stat = await aiofiles.os.stat(entry.path)
+ files.append((entry.path, stat.st_mtime, stat.st_size))
+ total_size += stat.st_size
+
+ # Sort by modification time (oldest first)
+ files.sort(key=lambda x: x[1])
+
+ # Remove files if:
+ # 1. They're too old
+ # 2. We have too many files
+ # 3. Directory is too large
+ current_time = time.time()
+ max_age = settings.max_temp_dir_age_hours * 3600
+
+ remaining_count = len(files)
+ for path, mtime, size in files:
+ should_delete = False
+
+ # Check age
+ if current_time - mtime > max_age:
+ should_delete = True
+ logger.info(f"Deleting old temp file: {path}")
+
+ # Check count limit (only delete while we are still over the limit)
+ elif remaining_count > settings.max_temp_dir_count:
+ should_delete = True
+ logger.info(f"Deleting excess temp file: {path}")
+
+ # Check size limit
+ elif total_size > settings.max_temp_dir_size_mb * 1024 * 1024:
+ should_delete = True
+ logger.info(f"Deleting to reduce directory size: {path}")
+
+ if should_delete:
+ try:
+ await aiofiles.os.remove(path)
+ total_size -= size
+ remaining_count -= 1
+ logger.info(f"Deleted temp file: {path}")
+ except Exception as e:
+ logger.warning(f"Failed to delete temp file {path}: {e}")
+
+ except Exception as e:
+ logger.warning(f"Error during temp file cleanup: {e}")
+
+
+class TempFileWriter:
+ """Handles writing audio chunks to a temp file"""
+
+ def __init__(self, format: str):
+ """Initialize temp file writer
+
+ Args:
+ format: Audio format extension (mp3, wav, etc)
+ """
+ self.format = format
+ self.temp_file = None
+ self._finalized = False
+ self._write_error = False # Flag to track if we've had a write error
+
+ async def __aenter__(self):
+ """Async context manager entry"""
+ try:
+ # Clean up old files first
+ await cleanup_temp_files()
+
+ # Create temp file with proper extension
+ await aiofiles.os.makedirs(settings.temp_file_dir, exist_ok=True)
+ temp = tempfile.NamedTemporaryFile(
+ dir=settings.temp_file_dir,
+ delete=False,
+ suffix=f".{self.format}",
+ mode="wb",
+ )
+ self.temp_file = await aiofiles.open(temp.name, mode="wb")
+ self.temp_path = temp.name
+ temp.close() # Close sync file, we'll use async version
+
+ # Generate download path immediately
+ self.download_path = f"/download/{os.path.basename(self.temp_path)}"
+ except Exception as e:
+ # Handle permission issues or other errors gracefully
+ logger.error(f"Failed to create temp file: {e}")
+ self._write_error = True
+ # Set a placeholder path so the API can still function
+ self.temp_path = f"unavailable_{self.format}"
+ self.download_path = f"/download/{self.temp_path}"
+
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Async context manager exit"""
+ try:
+ if self.temp_file and not self._finalized:
+ await self.temp_file.close()
+ self._finalized = True
+ except Exception as e:
+ logger.error(f"Error closing temp file: {e}")
+ self._write_error = True
+
+ async def write(self, chunk: bytes) -> None:
+ """Write a chunk of audio data
+
+ Args:
+ chunk: Audio data bytes to write
+ """
+ if self._finalized:
+ raise RuntimeError("Cannot write to finalized temp file")
+
+ # Skip writing if we've already encountered an error
+ if self._write_error or not self.temp_file:
+ return
+
+ try:
+ await self.temp_file.write(chunk)
+ await self.temp_file.flush()
+ except Exception as e:
+ # Handle permission issues or other errors gracefully
+ logger.error(f"Failed to write to temp file: {e}")
+ self._write_error = True
+
+ async def finalize(self) -> str:
+ """Close temp file and return download path
+
+ Returns:
+ Path to use for downloading the temp file
+ """
+ if self._finalized:
+ raise RuntimeError("Temp file already finalized")
+
+ # Skip finalizing if we've already encountered an error
+ if self._write_error or not self.temp_file:
+ self._finalized = True
+ return self.download_path
+
+ try:
+ await self.temp_file.close()
+ self._finalized = True
+ except Exception as e:
+ # Handle permission issues or other errors gracefully
+ logger.error(f"Failed to finalize temp file: {e}")
+ self._write_error = True
+ self._finalized = True
+
+ return self.download_path
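+
+# Minimal usage sketch (illustrative):
+#
+# async with TempFileWriter("mp3") as temp_writer:
+# await temp_writer.write(audio_bytes)
+# download_path = await temp_writer.finalize()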
diff --git a/api/src/services/text_processing/__init__.py b/api/src/services/text_processing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc7c0cfe646cce613d1c92287b14a18096ee14d8
--- /dev/null
+++ b/api/src/services/text_processing/__init__.py
@@ -0,0 +1,21 @@
+"""Text processing pipeline."""
+
+from .normalizer import normalize_text
+from .phonemizer import phonemize
+from .text_processor import process_text_chunk, smart_split
+from .vocabulary import tokenize
+
+
+def process_text(text: str) -> list[int]:
+ """Process text into token IDs (for backward compatibility)."""
+ return process_text_chunk(text)
+
+
+__all__ = [
+ "normalize_text",
+ "phonemize",
+ "tokenize",
+ "process_text",
+ "process_text_chunk",
+ "smart_split",
+]
diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9f73c044af066e03be4801f88421d2e7f826cd6
--- /dev/null
+++ b/api/src/services/text_processing/normalizer.py
@@ -0,0 +1,415 @@
+"""
+Text normalization module for TTS processing.
+Handles various text formats including URLs, emails, numbers, money, and special characters.
+Converts them into a format suitable for text-to-speech processing.
+"""
+
+import re
+from functools import lru_cache
+
+import inflect
+from text_to_num import text2num
+
+from ...structures.schemas import NormalizationOptions
+
+# Constants
+VALID_TLDS = [
+ "com",
+ "org",
+ "net",
+ "edu",
+ "gov",
+ "mil",
+ "int",
+ "biz",
+ "info",
+ "name",
+ "pro",
+ "coop",
+ "museum",
+ "travel",
+ "jobs",
+ "mobi",
+ "tel",
+ "asia",
+ "cat",
+ "xxx",
+ "aero",
+ "arpa",
+ "bg",
+ "br",
+ "ca",
+ "cn",
+ "de",
+ "es",
+ "eu",
+ "fr",
+ "in",
+ "it",
+ "jp",
+ "mx",
+ "nl",
+ "ru",
+ "uk",
+ "us",
+ "io",
+ "co",
+]
+
+VALID_UNITS = {
+ "m": "meter",
+ "cm": "centimeter",
+ "mm": "millimeter",
+ "km": "kilometer",
+ "in": "inch",
+ "ft": "foot",
+ "yd": "yard",
+ "mi": "mile", # Length
+ "g": "gram",
+ "kg": "kilogram",
+ "mg": "milligram", # Mass
+ "s": "second",
+ "ms": "millisecond",
+ "min": "minutes",
+ "h": "hour", # Time
+ "l": "liter",
+ "ml": "mililiter",
+ "cl": "centiliter",
+ "dl": "deciliter", # Volume
+ "kph": "kilometer per hour",
+ "mph": "mile per hour",
+ "mi/h": "mile per hour",
+ "m/s": "meter per second",
+ "km/h": "kilometer per hour",
+ "mm/s": "milimeter per second",
+ "cm/s": "centimeter per second",
+ "ft/s": "feet per second",
+ "cm/h": "centimeter per day", # Speed
+ "°c": "degree celsius",
+ "c": "degree celsius",
+ "°f": "degree fahrenheit",
+ "f": "degree fahrenheit",
+ "k": "kelvin", # Temperature
+ "pa": "pascal",
+ "kpa": "kilopascal",
+ "mpa": "megapascal",
+ "atm": "atmosphere", # Pressure
+ "hz": "hertz",
+ "khz": "kilohertz",
+ "mhz": "megahertz",
+ "ghz": "gigahertz", # Frequency
+ "v": "volt",
+ "kv": "kilovolt",
+ "mv": "mergavolt", # Voltage
+ "a": "amp",
+ "ma": "megaamp",
+ "ka": "kiloamp", # Current
+ "w": "watt",
+ "kw": "kilowatt",
+ "mw": "megawatt", # Power
+ "j": "joule",
+ "kj": "kilojoule",
+ "mj": "megajoule", # Energy
+ "Ω": "ohm",
+ "kΩ": "kiloohm",
+ "mΩ": "megaohm", # Resistance (Ohm)
+ "f": "farad",
+ "µf": "microfarad",
+ "nf": "nanofarad",
+ "pf": "picofarad", # Capacitance
+ "b": "bit",
+ "kb": "kilobit",
+ "mb": "megabit",
+ "gb": "gigabit",
+ "tb": "terabit",
+ "pb": "petabit", # Data size
+ "kbps": "kilobit per second",
+ "mbps": "megabit per second",
+ "gbps": "gigabit per second",
+ "tbps": "terabit per second",
+ "px": "pixel", # CSS units
+}
+
+
+# Pre-compiled regex patterns for performance
+EMAIL_PATTERN = re.compile(
+ r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
+)
+URL_PATTERN = re.compile(
+ r"(https?://)?(www\.)?(localhost|[a-zA-Z0-9.-]+(\.(?:"
+ + "|".join(VALID_TLDS)
+ + r"))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",
+ re.IGNORECASE,
+)
+
+UNIT_PATTERN = re.compile(
+ r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*("
+ + "|".join(sorted(VALID_UNITS.keys(), key=len, reverse=True))
+ + r")(?=[^\w\d]|\b)",
+ re.IGNORECASE,
+)
+
+TIME_PATTERN = re.compile(
+ r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE
+)
+
+INFLECT_ENGINE = inflect.engine()
+
+
+def split_num(num: re.Match[str]) -> str:
+ """Handle number splitting for various formats"""
+ num = num.group()
+ if "." in num:
+ return num
+ elif ":" in num:
+ h, m = [int(n) for n in num.split(":")]
+ if m == 0:
+ return f"{h} o'clock"
+ elif m < 10:
+ return f"{h} oh {m}"
+ return f"{h} {m}"
+ year = int(num[:4])
+ if year < 1100 or year % 1000 < 10:
+ return num
+ left, right = num[:2], int(num[2:4])
+ s = "s" if num.endswith("s") else ""
+ if 100 <= year % 1000 <= 999:
+ if right == 0:
+ return f"{left} hundred{s}"
+ elif right < 10:
+ return f"{left} oh {right}{s}"
+ return f"{left} {right}{s}"
+
+
+def handle_units(u: re.Match[str]) -> str:
+ """Converts units to their full form"""
+ unit_string = u.group(6).strip()
+ unit = [unit_string]
+
+ if unit_string.lower() in VALID_UNITS:
+ unit = VALID_UNITS[unit_string.lower()].split(" ")
+
+ # Handles the B vs b case
+ if unit[0].endswith("bit"):
+ b_case = unit_string[min(1, len(unit_string) - 1)]
+ if b_case == "B":
+ unit[0] = unit[0][:-3] + "byte"
+
+ number = u.group(1).strip()
+ unit[0] = INFLECT_ENGINE.no(unit[0], number)
+ return " ".join(unit)
+
+
+def conditional_int(number: float, threshold: float = 0.00001):
+ if abs(round(number) - number) < threshold:
+ return int(round(number))
+ return number
+
+
+def handle_money(m: re.Match[str]) -> str:
+ """Convert money expressions to spoken form"""
+
+ bill = "dollar" if m.group(2) == "$" else "pound"
+ coin = "cent" if m.group(2) == "$" else "pence"
+ number = m.group(3)
+
+ multiplier = m.group(4)
+ try:
+ number = float(number)
+ except ValueError:
+ return m.group()
+
+ if m.group(1) == "-":
+ number *= -1
+
+ if number % 1 == 0 or multiplier != "":
+ text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}"
+ else:
+ sub_number = int(str(number).split(".")[-1].ljust(2, "0"))
+
+ text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"
+
+ return text_number
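+
+
+# Examples of what handle_money produces (via the money regex further below):
+# "$50" -> "fifty dollars"
+# "$5.99" -> "five dollars and ninety-nine cents"
+# "£1.5 million" -> "one point five million pounds"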
+
+
+def handle_decimal(num: re.Match[str]) -> str:
+ """Convert decimal numbers to spoken form"""
+ a, b = num.group().split(".")
+ return " point ".join([a, " ".join(b)])
+
+
+def handle_email(m: re.Match[str]) -> str:
+ """Convert email addresses into speakable format"""
+ email = m.group(0)
+ parts = email.split("@")
+ if len(parts) == 2:
+ user, domain = parts
+ domain = domain.replace(".", " dot ")
+ return f"{user} at {domain}"
+ return email
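+
+
+# Example: "support@example.com" -> "support at example dot com"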
+
+
+def handle_url(u: re.Match[str]) -> str:
+ """Make URLs speakable by converting special characters to spoken words"""
+ if not u:
+ return ""
+
+ url = u.group(0).strip()
+
+ # Handle protocol first
+ url = re.sub(
+ r"^https?://",
+ lambda a: "https " if "https" in a.group() else "http ",
+ url,
+ flags=re.IGNORECASE,
+ )
+ url = re.sub(r"^www\.", "www ", url, flags=re.IGNORECASE)
+
+ # Handle port numbers before other replacements
+ url = re.sub(r":(\d+)(?=/|$)", lambda m: f" colon {m.group(1)}", url)
+
+ # Split into domain and path
+ parts = url.split("/", 1)
+ domain = parts[0]
+ path = parts[1] if len(parts) > 1 else ""
+
+ # Handle dots in domain
+ domain = domain.replace(".", " dot ")
+
+ # Reconstruct URL
+ if path:
+ url = f"{domain} slash {path}"
+ else:
+ url = domain
+
+ # Replace remaining symbols with words
+ url = url.replace("-", " dash ")
+ url = url.replace("_", " underscore ")
+ url = url.replace("?", " question-mark ")
+ url = url.replace("=", " equals ")
+ url = url.replace("&", " ampersand ")
+ url = url.replace("%", " percent ")
+ url = url.replace(":", " colon ") # Handle any remaining colons
+ url = url.replace("/", " slash ") # Handle any remaining slashes
+
+ # Clean up extra spaces
+ return re.sub(r"\s+", " ", url).strip()
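+
+
+# Examples of what handle_url produces:
+# "https://example.com/docs" -> "https example dot com slash docs"
+# "www.example.com:8080" -> "www example dot com colon 8080"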
+
+
+def handle_phone_number(p: re.Match[str]) -> str:
+ p = list(p.groups())
+
+ country_code = ""
+ if p[0] is not None:
+ p[0] = p[0].replace("+", "")
+ country_code += INFLECT_ENGINE.number_to_words(p[0])
+
+ area_code = INFLECT_ENGINE.number_to_words(
+ p[2].replace("(", "").replace(")", ""), group=1, comma=""
+ )
+
+ telephone_prefix = INFLECT_ENGINE.number_to_words(p[3], group=1, comma="")
+
+ line_number = INFLECT_ENGINE.number_to_words(p[4], group=1, comma="")
+
+ return ",".join([country_code, area_code, telephone_prefix, line_number])
+
+
+def handle_time(t: re.Match[str]) -> str:
+ t = t.groups()
+
+ numbers = " ".join(
+ [INFLECT_ENGINE.number_to_words(X.strip()) for X in t[0].split(":")]
+ )
+
+ half = ""
+ if t[2] is not None:
+ half = " " + t[2].strip()
+
+ return numbers + half
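+
+
+# Example: with TIME_PATTERN above, the time in "It's 3:30 pm" is rewritten
+# to "three thirty pm" ("thirty" comes from number_to_words("30")).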
+
+
+def normalize_text(text: str, normalization_options: NormalizationOptions) -> str:
+ """Normalize text for TTS processing"""
+ # Handle email addresses first if enabled
+ if normalization_options.email_normalization:
+ text = EMAIL_PATTERN.sub(handle_email, text)
+
+ # Handle URLs if enabled
+ if normalization_options.url_normalization:
+ text = URL_PATTERN.sub(handle_url, text)
+
+ # Pre-process numbers with units if enabled
+ if normalization_options.unit_normalization:
+ text = UNIT_PATTERN.sub(handle_units, text)
+
+ # Replace optional pluralization
+ if normalization_options.optional_pluralization_normalization:
+ text = re.sub(r"\(s\)", "s", text)
+
+ # Replace phone numbers:
+ if normalization_options.phone_normalization:
+ text = re.sub(
+ r"(\+?\d{1,2})?([ .-]?)(\(?\d{3}\)?)[\s.-](\d{3})[\s.-](\d{4})",
+ handle_phone_number,
+ text,
+ )
+
+ # Replace quotes and brackets
+ text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+ text = text.replace("«", chr(8220)).replace("»", chr(8221))
+ text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+
+ # Handle CJK punctuation and some non standard chars
+ for a, b in zip("、。!,:;?–", ",.!,:;?-"):
+ text = text.replace(a, b + " ")
+
+ # Handle simple time in the format of HH:MM:SS
+ text = TIME_PATTERN.sub(
+ handle_time,
+ text,
+ )
+
+ # Clean up whitespace
+ text = re.sub(r"[^\S \n]", " ", text)
+ text = re.sub(r" +", " ", text)
+ text = re.sub(r"(?<=\n) +(?=\n)", "", text)
+
+ # Handle titles and abbreviations
+ text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
+ text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
+ text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
+ text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
+ text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
+
+ # Handle common words
+ text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
+
+ # Handle numbers and money
+ text = re.sub(r"(?<=\d),(?=\d)", "", text)
+
+ text = re.sub(
+ r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b",
+ handle_money,
+ text,
+ )
+
+ text = re.sub(
+ r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
+ split_num,
+ text,
+ )
+
+ text = re.sub(r"\d*\.\d+", handle_decimal, text)
+
+ return text.strip()
diff --git a/api/src/services/text_processing/phonemizer.py b/api/src/services/text_processing/phonemizer.py
new file mode 100644
--- /dev/null
+++ b/api/src/services/text_processing/phonemizer.py
@@ -0,0 +1,105 @@
+"""Phonemization backends for converting text to phonemes."""
+
+import re
+from abc import ABC, abstractmethod
+
+import phonemizer
+
+from .normalizer import normalize_text
+from ...structures.schemas import NormalizationOptions
+
+phonemizers = {}
+
+
+class PhonemizerBackend(ABC):
+ """Abstract base class for phonemization backends"""
+
+ @abstractmethod
+ def phonemize(self, text: str) -> str:
+ """Convert text to phonemes
+
+ Args:
+ text: Text to convert to phonemes
+
+ Returns:
+ Phonemized text
+ """
+ pass
+
+
+class EspeakBackend(PhonemizerBackend):
+ """Espeak-based phonemizer implementation"""
+
+ def __init__(self, language: str):
+ """Initialize espeak backend
+
+ Args:
+ language: Language code ('en-us' or 'en-gb')
+ """
+ self.backend = phonemizer.backend.EspeakBackend(
+ language=language, preserve_punctuation=True, with_stress=True
+ )
+
+ self.language = language
+
+ def phonemize(self, text: str) -> str:
+ """Convert text to phonemes using espeak
+
+ Args:
+ text: Text to convert to phonemes
+
+ Returns:
+ Phonemized text
+ """
+ # Phonemize text
+ ps = self.backend.phonemize([text])
+ ps = ps[0] if ps else ""
+
+ # Handle special cases
+ ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")
+ ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l")
+ ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps)
+ ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»"" ]|$)', "z", ps)
+
+ # Language-specific rules
+ if self.language == "en-us":
+ ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps)
+
+ return ps.strip()
+
+
+def create_phonemizer(language: str = "a") -> PhonemizerBackend:
+ """Factory function to create phonemizer backend
+
+ Args:
+ language: Language code ('a' for US English, 'b' for British English)
+
+ Returns:
+ Phonemizer backend instance
+ """
+ # Map language codes to espeak language codes
+ lang_map = {"a": "en-us", "b": "en-gb"}
+
+ if language not in lang_map:
+ raise ValueError(f"Unsupported language code: {language}")
+
+ return EspeakBackend(lang_map[language])
+
+
+def phonemize(text: str, language: str = "a", normalize: bool = True) -> str:
+ """Convert text to phonemes
+
+ Args:
+ text: Text to convert to phonemes
+ language: Language code ('a' for US English, 'b' for British English)
+ normalize: Whether to normalize text before phonemization
+
+ Returns:
+ Phonemized text
+ """
+ global phonemizers
+ if normalize:
+ text = normalize_text(text)
+ if language not in phonemizers:
+ phonemizers[language] = create_phonemizer(language)
+ return phonemizers[language].phonemize(text)
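+
+
+# Usage sketch (illustrative; assumes espeak-ng is installed, as in the Dockerfile):
+#
+#     ps = phonemize("Hello world!", language="a")  # US English backend
+#     ps = phonemize("Hello world!", language="b")  # British English backend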
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..584affe1b0d0c977df54b63f72bd1f320fab9806
--- /dev/null
+++ b/api/src/services/text_processing/text_processor.py
@@ -0,0 +1,276 @@
+"""Unified text processing for TTS with smart chunking."""
+
+import re
+import time
+from typing import AsyncGenerator, Dict, List, Tuple
+
+from loguru import logger
+
+from ...core.config import settings
+from ...structures.schemas import NormalizationOptions
+from .normalizer import normalize_text
+from .phonemizer import phonemize
+from .vocabulary import tokenize
+
+# Pre-compiled regex patterns for performance
+CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
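+# CUSTOM_PHONEMES matches inline phoneme markup of the form "[text](/phonemes/)"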
+
+
+def process_text_chunk(
+ text: str, language: str = "a", skip_phonemize: bool = False
+) -> List[int]:
+ """Process a chunk of text through normalization, phonemization, and tokenization.
+
+ Args:
+ text: Text chunk to process
+ language: Language code for phonemization
+ skip_phonemize: If True, treat input as phonemes and skip normalization/phonemization
+
+ Returns:
+ List of token IDs
+ """
+ start_time = time.time()
+
+    if skip_phonemize:
+        # Input is already phonemes, just tokenize
+        tokens = tokenize(text)
+    else:
+        # Normal text processing pipeline: phonemize, then tokenize
+        phonemes = phonemize(text, language, normalize=False)  # Already normalized upstream
+        tokens = tokenize(phonemes)
+
+ total_time = time.time() - start_time
+ logger.debug(
+ f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
+ )
+
+ return tokens
+
+
+async def yield_chunk(
+ text: str, tokens: List[int], chunk_count: int
+) -> Tuple[str, List[int]]:
+ """Yield a chunk with consistent logging."""
+ logger.debug(
+ f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
+ )
+ return text, tokens
+
+
+def process_text(text: str, language: str = "a") -> List[int]:
+ """Process text into token IDs.
+
+ Args:
+ text: Text to process
+ language: Language code for phonemization
+
+ Returns:
+ List of token IDs
+ """
+ if not isinstance(text, str):
+ text = str(text) if text is not None else ""
+
+ text = text.strip()
+ if not text:
+ return []
+
+ return process_text_chunk(text, language)
+
+
+def get_sentence_info(
+    text: str, custom_phonemes_list: Dict[str, str]
+) -> List[Tuple[str, List[int], int]]:
+    """Process all sentences and return (text, tokens, token count) for each."""
+    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
+    phoneme_length, min_value = len(custom_phonemes_list), 0
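+    # Note: re.split keeps the punctuation, e.g. "Hi there! How are you?"
+    # -> ["Hi there", "!", " How are you", "?", ""]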
+
+ results = []
+ for i in range(0, len(sentences), 2):
+ sentence = sentences[i].strip()
+ for replaced in range(min_value, phoneme_length):
+ current_id = f"|custom_phonemes_{replaced}|/>"
+ if current_id in sentence:
+ sentence = sentence.replace(
+                    current_id, custom_phonemes_list.pop(current_id)
+ )
+ min_value += 1
+
+ punct = sentences[i + 1] if i + 1 < len(sentences) else ""
+
+ if not sentence:
+ continue
+
+ full = sentence + punct
+ tokens = process_text_chunk(full)
+ results.append((full, tokens, len(tokens)))
+
+ return results
+
+
+def handle_custom_phonemes(s: re.Match[str], phonemes_list: Dict[str, str]) -> str:
+    """Stash custom phoneme markup under a placeholder ID for later restoration."""
+    latest_id = f"|custom_phonemes_{len(phonemes_list)}|/>"
+    phonemes_list[latest_id] = s.group(0).strip()
+ return latest_id
+
+
+async def smart_split(
+ text: str,
+ max_tokens: int = settings.absolute_max_tokens,
+ lang_code: str = "a",
+ normalization_options: NormalizationOptions = NormalizationOptions(),
+) -> AsyncGenerator[Tuple[str, List[int]], None]:
+ """Build optimal chunks targeting 300-400 tokens, never exceeding max_tokens."""
+ start_time = time.time()
+ chunk_count = 0
+ logger.info(f"Starting smart split for {len(text)} chars")
+
+ custom_phoneme_list = {}
+
+ # Normalize text
+ if settings.advanced_text_normalization and normalization_options.normalize:
+        logger.debug(f"Normalizing text for lang_code: {lang_code}")
+ if lang_code in ["a", "b", "en-us", "en-gb"]:
+ text = CUSTOM_PHONEMES.sub(
+ lambda s: handle_custom_phonemes(s, custom_phoneme_list), text
+ )
+ text = normalize_text(text, normalization_options)
+ else:
+ logger.info(
+ "Skipping text normalization as it is only supported for english"
+ )
+
+ # Process all sentences
+ sentences = get_sentence_info(text, custom_phoneme_list)
+
+ current_chunk = []
+ current_tokens = []
+ current_count = 0
+
+ for sentence, tokens, count in sentences:
+ # Handle sentences that exceed max tokens
+ if count > max_tokens:
+ # Yield current chunk if any
+ if current_chunk:
+ chunk_text = " ".join(current_chunk)
+ chunk_count += 1
+ logger.debug(
+ f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
+ )
+ yield chunk_text, current_tokens
+ current_chunk = []
+ current_tokens = []
+ current_count = 0
+
+ # Split long sentence on commas
+ clauses = re.split(r"([,])", sentence)
+ clause_chunk = []
+ clause_tokens = []
+ clause_count = 0
+
+ for j in range(0, len(clauses), 2):
+ clause = clauses[j].strip()
+ comma = clauses[j + 1] if j + 1 < len(clauses) else ""
+
+ if not clause:
+ continue
+
+ full_clause = clause + comma
+
+ tokens = process_text_chunk(full_clause)
+ count = len(tokens)
+
+ # If adding clause keeps us under max and not optimal yet
+ if (
+ clause_count + count <= max_tokens
+ and clause_count + count <= settings.target_max_tokens
+ ):
+ clause_chunk.append(full_clause)
+ clause_tokens.extend(tokens)
+ clause_count += count
+ else:
+ # Yield clause chunk if we have one
+ if clause_chunk:
+ chunk_text = " ".join(clause_chunk)
+ chunk_count += 1
+ logger.debug(
+ f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
+ )
+ yield chunk_text, clause_tokens
+ clause_chunk = [full_clause]
+ clause_tokens = tokens
+ clause_count = count
+
+ # Don't forget last clause chunk
+ if clause_chunk:
+ chunk_text = " ".join(clause_chunk)
+ chunk_count += 1
+ logger.debug(
+ f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
+ )
+ yield chunk_text, clause_tokens
+
+ # Regular sentence handling
+ elif (
+ current_count >= settings.target_min_tokens
+ and current_count + count > settings.target_max_tokens
+ ):
+ # If we have a good sized chunk and adding next sentence exceeds target,
+ # yield current chunk and start new one
+ chunk_text = " ".join(current_chunk)
+ chunk_count += 1
+ logger.info(
+ f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
+ )
+ yield chunk_text, current_tokens
+ current_chunk = [sentence]
+ current_tokens = tokens
+ current_count = count
+ elif current_count + count <= settings.target_max_tokens:
+ # Keep building chunk while under target max
+ current_chunk.append(sentence)
+ current_tokens.extend(tokens)
+ current_count += count
+ elif (
+ current_count + count <= max_tokens
+ and current_count < settings.target_min_tokens
+ ):
+ # Only exceed target max if we haven't reached minimum size yet
+ current_chunk.append(sentence)
+ current_tokens.extend(tokens)
+ current_count += count
+ else:
+ # Yield current chunk and start new one
+ if current_chunk:
+ chunk_text = " ".join(current_chunk)
+ chunk_count += 1
+ logger.info(
+ f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
+ )
+ yield chunk_text, current_tokens
+ current_chunk = [sentence]
+ current_tokens = tokens
+ current_count = count
+
+ # Don't forget the last chunk
+ if current_chunk:
+ chunk_text = " ".join(current_chunk)
+ chunk_count += 1
+ logger.info(
+ f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
+ )
+ yield chunk_text, current_tokens
+
+ total_time = time.time() - start_time
+ logger.info(
+ f"Split completed in {total_time * 1000:.2f}ms, produced {chunk_count} chunks"
+ )
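+
+
+# Usage sketch (illustrative; smart_split is an async generator):
+#
+#     async for chunk_text, tokens in smart_split(long_text, lang_code="a"):
+#         ...  # synthesize audio for each chunk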
diff --git a/api/src/services/text_processing/vocabulary.py b/api/src/services/text_processing/vocabulary.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a128924cfbb1b952268b3cace85c4a729e815e1
--- /dev/null
+++ b/api/src/services/text_processing/vocabulary.py
@@ -0,0 +1,40 @@
+def get_vocab():
+ """Get the vocabulary dictionary mapping characters to token IDs"""
+ _pad = "$"
+ _punctuation = ';:,.!?¡¿—…"«»"" '
+ _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+ _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+
+ # Create vocabulary dictionary
+ symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+ return {symbol: i for i, symbol in enumerate(symbols)}
+
+
+# Initialize vocabulary
+VOCAB = get_vocab()
+
+
+def tokenize(phonemes: str) -> list[int]:
+ """Convert phonemes string to token IDs
+
+ Args:
+ phonemes: String of phonemes to tokenize
+
+ Returns:
+ List of token IDs
+ """
+ return [i for i in map(VOCAB.get, phonemes) if i is not None]
+
+
+def decode_tokens(tokens: list[int]) -> str:
+ """Convert token IDs back to phonemes string
+
+ Args:
+ tokens: List of token IDs
+
+ Returns:
+ String of phonemes
+ """
+ # Create reverse mapping
+ id_to_symbol = {i: s for s, i in VOCAB.items()}
+ return "".join(id_to_symbol[t] for t in tokens)
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a69b85a2096f697b0ada2c95bbe545d15417660
--- /dev/null
+++ b/api/src/services/tts_service.py
@@ -0,0 +1,459 @@
+"""TTS service using model and voice managers."""
+
+import asyncio
+import os
+import re
+import tempfile
+import time
+from typing import AsyncGenerator, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from kokoro import KPipeline
+from loguru import logger
+
+from ..core.config import settings
+from ..inference.base import AudioChunk
+from ..inference.kokoro_v1 import KokoroV1
+from ..inference.model_manager import get_manager as get_model_manager
+from ..inference.voice_manager import get_manager as get_voice_manager
+from ..structures.schemas import NormalizationOptions
+from .audio import AudioNormalizer, AudioService
+from .streaming_audio_writer import StreamingAudioWriter
+from .text_processing import tokenize
+from .text_processing.text_processor import process_text_chunk, smart_split
+
+
+class TTSService:
+ """Text-to-speech service."""
+
+ # Limit concurrent chunk processing
+ _chunk_semaphore = asyncio.Semaphore(4)
+
+ def __init__(self, output_dir: str = None):
+ """Initialize service."""
+ self.output_dir = output_dir
+ self.model_manager = None
+ self._voice_manager = None
+
+ @classmethod
+ async def create(cls, output_dir: str = None) -> "TTSService":
+ """Create and initialize TTSService instance."""
+ service = cls(output_dir)
+ service.model_manager = await get_model_manager()
+ service._voice_manager = await get_voice_manager()
+ return service
+
+ async def _process_chunk(
+ self,
+ chunk_text: str,
+ tokens: List[int],
+ voice_name: str,
+ voice_path: str,
+ speed: float,
+ writer: StreamingAudioWriter,
+ output_format: Optional[str] = None,
+ is_first: bool = False,
+ is_last: bool = False,
+ normalizer: Optional[AudioNormalizer] = None,
+ lang_code: Optional[str] = None,
+ return_timestamps: Optional[bool] = False,
+ ) -> AsyncGenerator[AudioChunk, None]:
+ """Process tokens into audio."""
+ async with self._chunk_semaphore:
+ try:
+ # Handle stream finalization
+ if is_last:
+ # Skip format conversion for raw audio mode
+ if not output_format:
+ yield AudioChunk(np.array([], dtype=np.int16), output=b"")
+ return
+ chunk_data = await AudioService.convert_audio(
+ AudioChunk(
+ np.array([], dtype=np.float32)
+ ), # Dummy data for type checking
+ output_format,
+ writer,
+ speed,
+ "",
+ normalizer=normalizer,
+ is_last_chunk=True,
+ )
+ yield chunk_data
+ return
+
+ # Skip empty chunks
+ if not tokens and not chunk_text:
+ return
+
+ # Get backend
+ backend = self.model_manager.get_backend()
+
+ # Generate audio using pre-warmed model
+ if isinstance(backend, KokoroV1):
+ chunk_index = 0
+ # For Kokoro V1, pass text and voice info with lang_code
+ async for chunk_data in self.model_manager.generate(
+ chunk_text,
+ (voice_name, voice_path),
+ speed=speed,
+ lang_code=lang_code,
+ return_timestamps=return_timestamps,
+ ):
+ # For streaming, convert to bytes
+ if output_format:
+ try:
+ chunk_data = await AudioService.convert_audio(
+ chunk_data,
+ output_format,
+ writer,
+ speed,
+ chunk_text,
+ is_last_chunk=is_last,
+ normalizer=normalizer,
+ )
+ yield chunk_data
+ except Exception as e:
+ logger.error(f"Failed to convert audio: {str(e)}")
+ else:
+ chunk_data = AudioService.trim_audio(
+ chunk_data, chunk_text, speed, is_last, normalizer
+ )
+ yield chunk_data
+ chunk_index += 1
+ else:
+ # For legacy backends, load voice tensor
+ voice_tensor = await self._voice_manager.load_voice(
+ voice_name, device=backend.device
+ )
+ chunk_data = await self.model_manager.generate(
+ tokens,
+ voice_tensor,
+ speed=speed,
+ return_timestamps=return_timestamps,
+ )
+
+ if chunk_data.audio is None:
+ logger.error("Model generated None for audio chunk")
+ return
+
+ if len(chunk_data.audio) == 0:
+ logger.error("Model generated empty audio chunk")
+ return
+
+ # For streaming, convert to bytes
+ if output_format:
+ try:
+ chunk_data = await AudioService.convert_audio(
+ chunk_data,
+ output_format,
+ writer,
+ speed,
+ chunk_text,
+ normalizer=normalizer,
+ is_last_chunk=is_last,
+ )
+ yield chunk_data
+ except Exception as e:
+ logger.error(f"Failed to convert audio: {str(e)}")
+ else:
+ trimmed = AudioService.trim_audio(
+ chunk_data, chunk_text, speed, is_last, normalizer
+ )
+ yield trimmed
+ except Exception as e:
+ logger.error(f"Failed to process tokens: {str(e)}")
+
+ async def _load_voice_from_path(self, path: str, weight: float):
+        # Raise a ValueError if the path is empty or None
+ if not path:
+ raise ValueError(f"Voice not found at path: {path}")
+
+ logger.debug(f"Loading voice tensor from path: {path}")
+ return torch.load(path, map_location="cpu") * weight
+
+ async def _get_voices_path(self, voice: str) -> Tuple[str, str]:
+ """Get voice path, handling combined voices.
+
+ Args:
+ voice: Voice name or combined voice names (e.g., 'af_jadzia+af_jessica')
+
+ Returns:
+ Tuple of (voice name to use, voice path to use)
+
+ Raises:
+ RuntimeError: If voice not found
+ """
+ try:
+            # Split the voice on + and -, keeping the operators, e.g. "hi+bob" -> ["hi", "+", "bob"]
+ split_voice = re.split(r"([-+])", voice)
+
+            # If it is only one voice, there is no point in loading it, doing nothing with it, then saving it again
+            if len(split_voice) == 1:
+                # Since it's a single voice, the weight only matters if voice_weight_normalization is off
+                if (
+                    "(" not in voice and ")" not in voice
+                ) or settings.voice_weight_normalization:
+ path = await self._voice_manager.get_voice_path(voice)
+ if not path:
+ raise RuntimeError(f"Voice not found: {voice}")
+ logger.debug(f"Using single voice path: {path}")
+ return voice, path
+
+ total_weight = 0
+
+ for voice_index in range(0, len(split_voice), 2):
+ voice_object = split_voice[voice_index]
+
+ if "(" in voice_object and ")" in voice_object:
+ voice_name = voice_object.split("(")[0].strip()
+ voice_weight = float(voice_object.split("(")[1].split(")")[0])
+ else:
+ voice_name = voice_object
+ voice_weight = 1
+
+ total_weight += voice_weight
+ split_voice[voice_index] = (voice_name, voice_weight)
+
+            # If voice_weight_normalization is off, set total_weight to 1 so each weight is divided by 1 (i.e. left as-is)
+            if not settings.voice_weight_normalization:
+                total_weight = 1
+
+ # Load the first voice as the starting point for voices to be combined onto
+ path = await self._voice_manager.get_voice_path(split_voice[0][0])
+ combined_tensor = await self._load_voice_from_path(
+ path, split_voice[0][1] / total_weight
+ )
+
+            # Loop through each + or - in split_voice so they can be applied to the combined voice
+ for operation_index in range(1, len(split_voice) - 1, 2):
+ # Get the voice path of the voice 1 index ahead of the operator
+ path = await self._voice_manager.get_voice_path(
+ split_voice[operation_index + 1][0]
+ )
+ voice_tensor = await self._load_voice_from_path(
+ path, split_voice[operation_index + 1][1] / total_weight
+ )
+
+ # Either add or subtract the voice from the current combined voice
+ if split_voice[operation_index] == "+":
+ combined_tensor += voice_tensor
+ else:
+ combined_tensor -= voice_tensor
+
+            # Save the new combined voice so it can be loaded later
+ temp_dir = tempfile.gettempdir()
+ combined_path = os.path.join(temp_dir, f"{voice}.pt")
+ logger.debug(f"Saving combined voice to: {combined_path}")
+ torch.save(combined_tensor, combined_path)
+ return voice, combined_path
+ except Exception as e:
+ logger.error(f"Failed to get voice path: {e}")
+ raise
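+
+    # Voice string grammar handled above (illustrative examples):
+    #   "af_jadzia"                   -> single voice
+    #   "af_jadzia+af_jessica"        -> equal-weight combination
+    #   "af_jadzia(2)+af_jessica(1)"  -> weighted mix (2/3 and 1/3 after normalization)
+    #   "af_jadzia-af_jessica"        -> subtract the second voice tensor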
+
+ async def generate_audio_stream(
+ self,
+ text: str,
+ voice: str,
+ writer: StreamingAudioWriter,
+ speed: float = 1.0,
+ output_format: str = "wav",
+ lang_code: Optional[str] = None,
+ normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
+ return_timestamps: Optional[bool] = False,
+ ) -> AsyncGenerator[AudioChunk, None]:
+ """Generate and stream audio chunks."""
+ stream_normalizer = AudioNormalizer()
+ chunk_index = 0
+ current_offset = 0.0
+ try:
+ # Get backend
+ backend = self.model_manager.get_backend()
+
+ # Get voice path, handling combined voices
+ voice_name, voice_path = await self._get_voices_path(voice)
+ logger.debug(f"Using voice path: {voice_path}")
+
+ # Use provided lang_code or determine from voice name
+ pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
+ logger.info(
+ f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
+ )
+
+ # Process text in chunks with smart splitting
+ async for chunk_text, tokens in smart_split(
+ text,
+ lang_code=pipeline_lang_code,
+ normalization_options=normalization_options,
+ ):
+ try:
+ # Process audio for chunk
+ async for chunk_data in self._process_chunk(
+ chunk_text, # Pass text for Kokoro V1
+ tokens, # Pass tokens for legacy backends
+ voice_name, # Pass voice name
+ voice_path, # Pass voice path
+ speed,
+ writer,
+ output_format,
+ is_first=(chunk_index == 0),
+ is_last=False, # We'll update the last chunk later
+ normalizer=stream_normalizer,
+ lang_code=pipeline_lang_code, # Pass lang_code
+ return_timestamps=return_timestamps,
+ ):
+ if chunk_data.word_timestamps is not None:
+ for timestamp in chunk_data.word_timestamps:
+ timestamp.start_time += current_offset
+ timestamp.end_time += current_offset
+
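+                        # Advance the running offset by this chunk's duration (24 kHz output)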
+ current_offset += len(chunk_data.audio) / 24000
+
+ if chunk_data.output is not None:
+ yield chunk_data
+
+ else:
+ logger.warning(
+ f"No audio generated for chunk: '{chunk_text[:100]}...'"
+ )
+ chunk_index += 1
+ except Exception as e:
+ logger.error(
+ f"Failed to process audio for chunk: '{chunk_text[:100]}...'. Error: {str(e)}"
+ )
+ continue
+
+ # Only finalize if we successfully processed at least one chunk
+ if chunk_index > 0:
+ try:
+ # Empty tokens list to finalize audio
+ async for chunk_data in self._process_chunk(
+ "", # Empty text
+ [], # Empty tokens
+ voice_name,
+ voice_path,
+ speed,
+ writer,
+ output_format,
+ is_first=False,
+ is_last=True, # Signal this is the last chunk
+ normalizer=stream_normalizer,
+ lang_code=pipeline_lang_code, # Pass lang_code
+ ):
+ if chunk_data.output is not None:
+ yield chunk_data
+ except Exception as e:
+ logger.error(f"Failed to finalize audio stream: {str(e)}")
+
+        except Exception as e:
+            logger.error(f"Error in audio stream generation: {str(e)}")
+            raise
+
+ async def generate_audio(
+ self,
+ text: str,
+ voice: str,
+ writer: StreamingAudioWriter,
+ speed: float = 1.0,
+ return_timestamps: bool = False,
+ normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
+ lang_code: Optional[str] = None,
+ ) -> AudioChunk:
+ """Generate complete audio for text using streaming internally."""
+ audio_data_chunks = []
+
+ try:
+ async for audio_stream_data in self.generate_audio_stream(
+ text,
+ voice,
+ writer,
+ speed=speed,
+ normalization_options=normalization_options,
+ return_timestamps=return_timestamps,
+ lang_code=lang_code,
+ output_format=None,
+ ):
+ if len(audio_stream_data.audio) > 0:
+ audio_data_chunks.append(audio_stream_data)
+
+ combined_audio_data = AudioChunk.combine(audio_data_chunks)
+ return combined_audio_data
+ except Exception as e:
+ logger.error(f"Error in audio generation: {str(e)}")
+ raise
+
+ async def combine_voices(self, voices: List[str]) -> torch.Tensor:
+ """Combine multiple voices.
+
+ Returns:
+ Combined voice tensor
+ """
+
+ return await self._voice_manager.combine_voices(voices)
+
+ async def list_voices(self) -> List[str]:
+ """List available voices."""
+ return await self._voice_manager.list_voices()
+
+ async def generate_from_phonemes(
+ self,
+ phonemes: str,
+ voice: str,
+ speed: float = 1.0,
+ lang_code: Optional[str] = None,
+ ) -> Tuple[np.ndarray, float]:
+ """Generate audio directly from phonemes.
+
+ Args:
+ phonemes: Phonemes in Kokoro format
+ voice: Voice name
+ speed: Speed multiplier
+ lang_code: Optional language code override
+
+ Returns:
+ Tuple of (audio array, processing time)
+ """
+ start_time = time.time()
+ try:
+ # Get backend and voice path
+ backend = self.model_manager.get_backend()
+ voice_name, voice_path = await self._get_voices_path(voice)
+
+ if isinstance(backend, KokoroV1):
+ # For Kokoro V1, use generate_from_tokens with raw phonemes
+ result = None
+ # Use provided lang_code or determine from voice name
+ pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
+ logger.info(
+ f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline"
+ )
+
+ try:
+ # Use backend's pipeline management
+ for r in backend._get_pipeline(
+ pipeline_lang_code
+ ).generate_from_tokens(
+ tokens=phonemes, # Pass raw phonemes string
+ voice=voice_path,
+ speed=speed,
+ ):
+ if r.audio is not None:
+ result = r
+ break
+ except Exception as e:
+ logger.error(f"Failed to generate from phonemes: {e}")
+ raise RuntimeError(f"Phoneme generation failed: {e}")
+
+ if result is None or result.audio is None:
+ raise ValueError("No audio generated")
+
+ processing_time = time.time() - start_time
+ return result.audio.numpy(), processing_time
+ else:
+ raise ValueError(
+ "Phoneme generation only supported with Kokoro V1 backend"
+ )
+
+ except Exception as e:
+ logger.error(f"Error in phoneme audio generation: {str(e)}")
+ raise
diff --git a/api/src/structures/__init__.py b/api/src/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c582ea34132f9b568f89949ec8724386933a3752
--- /dev/null
+++ b/api/src/structures/__init__.py
@@ -0,0 +1,17 @@
+from .schemas import (
+ CaptionedSpeechRequest,
+ CaptionedSpeechResponse,
+ OpenAISpeechRequest,
+ TTSStatus,
+ VoiceCombineRequest,
+ WordTimestamp,
+)
+
+__all__ = [
+ "OpenAISpeechRequest",
+ "CaptionedSpeechRequest",
+ "CaptionedSpeechResponse",
+ "WordTimestamp",
+ "TTSStatus",
+ "VoiceCombineRequest",
+]
diff --git a/api/src/structures/custom_responses.py b/api/src/structures/custom_responses.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f83882945dea5939caf1f4fa40b028a3f1367f8
--- /dev/null
+++ b/api/src/structures/custom_responses.py
@@ -0,0 +1,50 @@
+import json
+import typing
+from collections.abc import AsyncIterable, Iterable
+
+from pydantic import BaseModel
+from starlette.background import BackgroundTask
+from starlette.concurrency import iterate_in_threadpool
+from starlette.responses import JSONResponse, StreamingResponse
+
+
+class JSONStreamingResponse(StreamingResponse, JSONResponse):
+ """StreamingResponse that also render with JSON."""
+
+ def __init__(
+ self,
+ content: Iterable | AsyncIterable,
+ status_code: int = 200,
+ headers: dict[str, str] | None = None,
+ media_type: str | None = None,
+ background: BackgroundTask | None = None,
+ ) -> None:
+ if isinstance(content, AsyncIterable):
+ self._content_iterable: AsyncIterable = content
+ else:
+ self._content_iterable = iterate_in_threadpool(content)
+
+ async def body_iterator() -> AsyncIterable[bytes]:
+ async for content_ in self._content_iterable:
+ if isinstance(content_, BaseModel):
+ content_ = content_.model_dump()
+ yield self.render(content_)
+
+ self.body_iterator = body_iterator()
+ self.status_code = status_code
+ if media_type is not None:
+ self.media_type = media_type
+ self.background = background
+ self.init_headers(headers)
+
+ def render(self, content: typing.Any) -> bytes:
+ return (
+ json.dumps(
+ content,
+ ensure_ascii=False,
+ allow_nan=False,
+ indent=None,
+ separators=(",", ":"),
+ )
+ + "\n"
+ ).encode("utf-8")
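+
+
+# Usage sketch (illustrative): stream pydantic models as newline-delimited JSON.
+#
+#     async def chunks():
+#         for ts in word_timestamps:  # hypothetical iterable of BaseModel items
+#             yield ts
+#
+#     return JSONStreamingResponse(chunks(), media_type="application/json")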
diff --git a/api/src/structures/model_schemas.py b/api/src/structures/model_schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4897ec8d8b4b64434daa9781c2ff0212f690e25
--- /dev/null
+++ b/api/src/structures/model_schemas.py
@@ -0,0 +1,16 @@
+"""Voice configuration schemas."""
+
+from pydantic import BaseModel, Field
+
+
+class VoiceConfig(BaseModel):
+ """Voice configuration."""
+
+ use_cache: bool = Field(True, description="Whether to cache loaded voices")
+ cache_size: int = Field(3, description="Number of voices to cache")
+ validate_on_load: bool = Field(
+ True, description="Whether to validate voices when loading"
+ )
+
+ class Config:
+ frozen = True # Make config immutable
diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..260224c8113e9ad15f49d4f202d9ed18121a80c8
--- /dev/null
+++ b/api/src/structures/schemas.py
@@ -0,0 +1,158 @@
+from enum import Enum
+from typing import List, Literal, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class VoiceCombineRequest(BaseModel):
+ """Request schema for voice combination endpoint that accepts either a string with + or a list"""
+
+ voices: Union[str, List[str]] = Field(
+ ...,
+ description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
+ )
+
+
+class TTSStatus(str, Enum):
+ PENDING = "pending"
+ PROCESSING = "processing"
+ COMPLETED = "completed"
+ FAILED = "failed"
+ DELETED = "deleted" # For files removed by cleanup
+
+
+# OpenAI-compatible schemas
+class WordTimestamp(BaseModel):
+ """Word-level timestamp information"""
+
+ word: str = Field(..., description="The word or token")
+ start_time: float = Field(..., description="Start time in seconds")
+ end_time: float = Field(..., description="End time in seconds")
+
+
+class CaptionedSpeechResponse(BaseModel):
+ """Response schema for captioned speech endpoint"""
+
+    audio: str = Field(..., description="The generated audio data encoded in base64")
+ audio_format: str = Field(..., description="The format of the output audio")
+ timestamps: Optional[List[WordTimestamp]] = Field(
+ ..., description="Word-level timestamps"
+ )
+
+
+class NormalizationOptions(BaseModel):
+ """Options for the normalization system"""
+
+ normalize: bool = Field(
+ default=True,
+ description="Normalizes input text to make it easier for the model to say",
+ )
+ unit_normalization: bool = Field(
+ default=False, description="Transforms units like 10KB to 10 kilobytes"
+ )
+ url_normalization: bool = Field(
+ default=True,
+ description="Changes urls so they can be properly pronounced by kokoro",
+ )
+ email_normalization: bool = Field(
+ default=True,
+ description="Changes emails so they can be properly pronouced by kokoro",
+ )
+ optional_pluralization_normalization: bool = Field(
+ default=True,
+ description="Replaces (s) with s so some words get pronounced correctly",
+ )
+ phone_normalization: bool = Field(
+ default=True,
+ description="Changes phone numbers so they can be properly pronouced by kokoro",
+ )
+
+
+class OpenAISpeechRequest(BaseModel):
+ """Request schema for OpenAI-compatible speech endpoint"""
+
+ model: str = Field(
+ default="kokoro",
+ description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
+ )
+ input: str = Field(..., description="The text to generate audio for")
+ voice: str = Field(
+ default="af_heart",
+ description="The voice to use for generation. Can be a base voice or a combined voice name.",
+ )
+ response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
+ default="mp3",
+ description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
+ )
+ download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = (
+ Field(
+ default=None,
+ description="Optional different format for the final download. If not provided, uses response_format.",
+ )
+ )
+ speed: float = Field(
+ default=1.0,
+ ge=0.25,
+ le=4.0,
+ description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
+ )
+ stream: bool = Field(
+ default=True, # Default to streaming for OpenAI compatibility
+ description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
+ )
+ return_download_link: bool = Field(
+ default=False,
+ description="If true, returns a download link in X-Download-Path header after streaming completes",
+ )
+ lang_code: Optional[str] = Field(
+ default=None,
+ description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
+ )
+ normalization_options: Optional[NormalizationOptions] = Field(
+ default=NormalizationOptions(),
+ description="Options for the normalization system",
+ )
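+
+# Example OpenAISpeechRequest body (illustrative):
+#   {"model": "kokoro", "input": "Hello world", "voice": "af_heart",
+#    "response_format": "mp3", "speed": 1.0, "stream": true}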
+
+
+class CaptionedSpeechRequest(BaseModel):
+ """Request schema for captioned speech endpoint"""
+
+ model: str = Field(
+ default="kokoro",
+ description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
+ )
+ input: str = Field(..., description="The text to generate audio for")
+ voice: str = Field(
+ default="af_heart",
+ description="The voice to use for generation. Can be a base voice or a combined voice name.",
+ )
+ response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
+ default="mp3",
+ description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
+ )
+ speed: float = Field(
+ default=1.0,
+ ge=0.25,
+ le=4.0,
+ description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
+ )
+ stream: bool = Field(
+ default=True, # Default to streaming for OpenAI compatibility
+ description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
+ )
+ return_timestamps: bool = Field(
+ default=True,
+ description="If true (default), returns word-level timestamps in the response",
+ )
+ return_download_link: bool = Field(
+ default=False,
+ description="If true, returns a download link in X-Download-Path header after streaming completes",
+ )
+ lang_code: Optional[str] = Field(
+ default=None,
+ description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
+ )
+ normalization_options: Optional[NormalizationOptions] = Field(
+ default=NormalizationOptions(),
+ description="Options for the normalization system",
+ )
diff --git a/api/src/structures/text_schemas.py b/api/src/structures/text_schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..e14e361022739adfaa11aa827bedb1b5b0b1c78a
--- /dev/null
+++ b/api/src/structures/text_schemas.py
@@ -0,0 +1,41 @@
+from typing import List, Optional, Union
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class PhonemeRequest(BaseModel):
+ text: str
+ language: str = "a" # Default to American English
+
+
+class PhonemeResponse(BaseModel):
+ phonemes: str
+ tokens: list[int]
+
+
+class StitchOptions(BaseModel):
+ """Options for stitching audio chunks together"""
+
+ gap_method: str = Field(
+ default="static_trim",
+ description="Method to handle gaps between chunks. Currently only 'static_trim' supported.",
+ )
+ trim_ms: int = Field(
+ default=0,
+ ge=0,
+ description="Milliseconds to trim from chunk boundaries when using static_trim",
+ )
+
+ @field_validator("gap_method")
+ @classmethod
+ def validate_gap_method(cls, v: str) -> str:
+ if v != "static_trim":
+ raise ValueError("Currently only 'static_trim' gap method is supported")
+ return v
+
+
+class GenerateFromPhonemesRequest(BaseModel):
+ """Simple request for phoneme-to-speech generation"""
+
+ phonemes: str = Field(..., description="Phoneme string to synthesize")
+ voice: str = Field(..., description="Voice ID to use for generation")
diff --git a/api/tests/__init__.py b/api/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9911d8f331d96ea4dc50886f4c1550ea2cc56e3
--- /dev/null
+++ b/api/tests/__init__.py
@@ -0,0 +1 @@
+# Make tests directory a Python package
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e3bba8a36e16d79c2f53da72a49823d0cc147d4
--- /dev/null
+++ b/api/tests/conftest.py
@@ -0,0 +1,71 @@
+import os
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import numpy as np
+import pytest
+import pytest_asyncio
+import torch
+
+from api.src.inference.model_manager import ModelManager
+from api.src.inference.voice_manager import VoiceManager
+from api.src.services.tts_service import TTSService
+from api.src.structures.model_schemas import VoiceConfig
+
+
+@pytest.fixture
+def mock_voice_tensor():
+ """Load a real voice tensor for testing."""
+ voice_path = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)), "src/voices/af_bella.pt"
+ )
+ return torch.load(voice_path, map_location="cpu", weights_only=False)
+
+
+@pytest.fixture
+def mock_audio_output():
+ """Load pre-generated test audio for consistent testing."""
+ test_audio_path = os.path.join(
+ os.path.dirname(__file__), "test_data/test_audio.npy"
+ )
+ return np.load(test_audio_path) # Return as numpy array instead of bytes
+
+
+@pytest_asyncio.fixture
+async def mock_model_manager(mock_audio_output):
+ """Mock model manager for testing."""
+ manager = AsyncMock(spec=ModelManager)
+ manager.get_backend = MagicMock()
+
+ async def mock_generate(*args, **kwargs):
+ # Simulate successful audio generation
+ return np.random.rand(24000).astype(np.float32) # 1 second of random audio data
+
+ manager.generate = AsyncMock(side_effect=mock_generate)
+ return manager
+
+
+@pytest_asyncio.fixture
+async def mock_voice_manager(mock_voice_tensor):
+ """Mock voice manager for testing."""
+ manager = AsyncMock(spec=VoiceManager)
+ manager.get_voice_path = MagicMock(return_value="/mock/path/voice.pt")
+ manager.load_voice = AsyncMock(return_value=mock_voice_tensor)
+ manager.list_voices = AsyncMock(return_value=["voice1", "voice2"])
+ manager.combine_voices = AsyncMock(return_value="voice1_voice2")
+ return manager
+
+
+@pytest_asyncio.fixture
+async def tts_service(mock_model_manager, mock_voice_manager):
+ """Get mocked TTS service instance."""
+ service = TTSService()
+ service.model_manager = mock_model_manager
+ service._voice_manager = mock_voice_manager
+ return service
+
+
+@pytest.fixture
+def test_voice():
+ """Return a test voice name."""
+ return "voice1"
diff --git a/api/tests/test_audio_service.py b/api/tests/test_audio_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ba539282ef6cb7f9118fa11e26fa20ca1ae0abc
--- /dev/null
+++ b/api/tests/test_audio_service.py
@@ -0,0 +1,256 @@
+"""Tests for AudioService"""
+
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+
+from api.src.inference.base import AudioChunk
+from api.src.services.audio import AudioNormalizer, AudioService
+from api.src.services.streaming_audio_writer import StreamingAudioWriter
+
+
+@pytest.fixture(autouse=True)
+def mock_settings():
+ """Mock settings for all tests"""
+ with patch("api.src.services.audio.settings") as mock_settings:
+ mock_settings.gap_trim_ms = 250
+ yield mock_settings
+
+
+@pytest.fixture
+def sample_audio():
+ """Generate a simple sine wave for testing"""
+ sample_rate = 24000
+ duration = 0.1 # 100ms
+ t = np.linspace(0, duration, int(sample_rate * duration))
+ frequency = 440 # A4 note
+ return np.sin(2 * np.pi * frequency * t).astype(np.float32), sample_rate
+
+
+@pytest.mark.asyncio
+async def test_convert_to_wav(sample_audio):
+ """Test converting to WAV format"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("wav", sample_rate=24000)
+ # Write and finalize in one step for WAV
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), "wav", writer, is_last_chunk=False
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+ # Check WAV header
+ assert audio_chunk.output.startswith(b"RIFF")
+ assert b"WAVE" in audio_chunk.output[:12]
+
+
+@pytest.mark.asyncio
+async def test_convert_to_mp3(sample_audio):
+ """Test converting to MP3 format"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("mp3", sample_rate=24000)
+
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), "mp3", writer
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+ # Check MP3 header (ID3 or MPEG frame sync)
+ assert audio_chunk.output.startswith(b"ID3") or audio_chunk.output.startswith(
+ b"\xff\xfb"
+ )
+
+
+@pytest.mark.asyncio
+async def test_convert_to_opus(sample_audio):
+ """Test converting to Opus format"""
+
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("opus", sample_rate=24000)
+
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), "opus", writer
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+ # Check OGG header
+ assert audio_chunk.output.startswith(b"OggS")
+
+
+@pytest.mark.asyncio
+async def test_convert_to_flac(sample_audio):
+ """Test converting to FLAC format"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("flac", sample_rate=24000)
+
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), "flac", writer
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+ # Check FLAC header
+ assert audio_chunk.output.startswith(b"fLaC")
+
+
+@pytest.mark.asyncio
+async def test_convert_to_aac(sample_audio):
+ """Test converting to M4A format"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("aac", sample_rate=24000)
+
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), "aac", writer
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+ # Check ADTS header (AAC)
+ assert audio_chunk.output.startswith(b"\xff\xf0") or audio_chunk.output.startswith(
+ b"\xff\xf1"
+ )
+
+
+@pytest.mark.asyncio
+async def test_convert_to_pcm(sample_audio):
+ """Test converting to PCM format"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("pcm", sample_rate=24000)
+
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), "pcm", writer
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+ # PCM is raw bytes, so no header to check
+
+
+@pytest.mark.asyncio
+async def test_convert_to_invalid_format_raises_error(sample_audio):
+ """Test that converting to an invalid format raises an error"""
+ # audio_data, sample_rate = sample_audio
+ with pytest.raises(ValueError, match="Unsupported format: invalid"):
+ writer = StreamingAudioWriter("invalid", sample_rate=24000)
+
+
+@pytest.mark.asyncio
+async def test_normalization_wav(sample_audio):
+ """Test that WAV output is properly normalized to int16 range"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("wav", sample_rate=24000)
+
+ # Create audio data outside int16 range
+ large_audio = audio_data * 1e5
+ # Write and finalize in one step for WAV
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(large_audio), "wav", writer
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+
+
+@pytest.mark.asyncio
+async def test_normalization_pcm(sample_audio):
+ """Test that PCM output is properly normalized to int16 range"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("pcm", sample_rate=24000)
+
+ # Create audio data outside int16 range
+ large_audio = audio_data * 1e5
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(large_audio), "pcm", writer
+ )
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+
+
+@pytest.mark.asyncio
+async def test_invalid_audio_data():
+ """Test handling of invalid audio data"""
+ invalid_audio = np.array([]) # Empty array
+ sample_rate = 24000
+
+ writer = StreamingAudioWriter("wav", sample_rate=24000)
+
+ with pytest.raises(ValueError):
+        await AudioService.convert_audio(AudioChunk(invalid_audio), "wav", writer)
+
+
+@pytest.mark.asyncio
+async def test_different_sample_rates(sample_audio):
+ """Test converting audio with different sample rates"""
+ audio_data, _ = sample_audio
+ sample_rates = [8000, 16000, 44100, 48000]
+
+ for rate in sample_rates:
+ writer = StreamingAudioWriter("wav", sample_rate=rate)
+
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), "wav", writer
+ )
+
+ writer.close()
+
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
+
+
+@pytest.mark.asyncio
+async def test_buffer_position_after_conversion(sample_audio):
+ """Test that buffer position is reset after writing"""
+ audio_data, sample_rate = sample_audio
+
+ writer = StreamingAudioWriter("wav", sample_rate=24000)
+
+ # Write and finalize in one step for first conversion
+ audio_chunk1 = await AudioService.convert_audio(
+ AudioChunk(audio_data), "wav", writer, is_last_chunk=True
+ )
+ assert isinstance(audio_chunk1.output, bytes)
+ assert isinstance(audio_chunk1, AudioChunk)
+ # Convert again to ensure buffer was properly reset
+
+ writer = StreamingAudioWriter("wav", sample_rate=24000)
+
+ audio_chunk2 = await AudioService.convert_audio(
+ AudioChunk(audio_data), "wav", writer, is_last_chunk=True
+ )
+ assert isinstance(audio_chunk2.output, bytes)
+ assert isinstance(audio_chunk2, AudioChunk)
+ assert len(audio_chunk1.output) == len(audio_chunk2.output)
diff --git a/api/tests/test_data/generate_test_data.py b/api/tests/test_data/generate_test_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f6b7cf96a12b296f961210f6ca39ee8770d7d73
--- /dev/null
+++ b/api/tests/test_data/generate_test_data.py
@@ -0,0 +1,23 @@
+import os
+
+import numpy as np
+
+
+def generate_test_audio():
+ """Generate test audio data - 1 second of 440Hz tone"""
+ # Create 1 second of silence at 24kHz
+ audio = np.zeros(24000, dtype=np.float32)
+
+ # Add a simple sine wave to make it non-zero
+ t = np.linspace(0, 1, 24000)
+ audio += 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz tone at half amplitude
+
+ # Create test_data directory if it doesn't exist
+ os.makedirs("api/tests/test_data", exist_ok=True)
+
+ # Save the test audio
+ np.save("api/tests/test_data/test_audio.npy", audio)
+
+
+if __name__ == "__main__":
+ generate_test_audio()
diff --git a/api/tests/test_data/test_audio.npy b/api/tests/test_data/test_audio.npy
new file mode 100644
index 0000000000000000000000000000000000000000..2e06aa9fb23da8358abc9ce914785de23c9b1358
Binary files /dev/null and b/api/tests/test_data/test_audio.npy differ
diff --git a/api/tests/test_development.py b/api/tests/test_development.py
new file mode 100644
index 0000000000000000000000000000000000000000..a03b3baa0a8a6f5c299e460b62142e958e36d52c
--- /dev/null
+++ b/api/tests/test_development.py
@@ -0,0 +1,34 @@
+import base64
+import json
+from unittest.mock import MagicMock, patch
+
+import pytest
+import requests
+
+
+def test_generate_captioned_speech():
+ """Test the generate_captioned_speech function with mocked responses"""
+ # Mock the API responses
+ mock_audio_response = MagicMock()
+ mock_audio_response.status_code = 200
+
+ mock_timestamps_response = MagicMock()
+ mock_timestamps_response.status_code = 200
+ mock_timestamps_response.content = json.dumps(
+ {
+ "audio": base64.b64encode(b"mock audio data").decode("utf-8"),
+ "timestamps": [{"word": "test", "start_time": 0.0, "end_time": 1.0}],
+ }
+ )
+
+ # Patch the HTTP requests
+ with patch("requests.post", return_value=mock_timestamps_response):
+ # Import here to avoid module-level import issues
+ from examples.captioned_speech_example import generate_captioned_speech
+
+ # Test the function
+ audio, timestamps = generate_captioned_speech("test text")
+
+ # Verify we got both audio and timestamps
+ assert audio == b"mock audio data"
+ assert timestamps == [{"word": "test", "start_time": 0.0, "end_time": 1.0}]
diff --git a/api/tests/test_kokoro_v1.py b/api/tests/test_kokoro_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..29d83c5a56984b9ea32e796c383f40568a94204e
--- /dev/null
+++ b/api/tests/test_kokoro_v1.py
@@ -0,0 +1,165 @@
+from unittest.mock import ANY, MagicMock, patch
+
+import numpy as np
+import pytest
+import torch
+
+from api.src.inference.kokoro_v1 import KokoroV1
+
+
+@pytest.fixture
+def kokoro_backend():
+ """Create a KokoroV1 instance for testing."""
+ return KokoroV1()
+
+
+def test_initial_state(kokoro_backend):
+ """Test initial state of KokoroV1."""
+ assert not kokoro_backend.is_loaded
+ assert kokoro_backend._model is None
+ assert kokoro_backend._pipelines == {} # Now using dict of pipelines
+ # Device should be set based on settings
+ assert kokoro_backend.device in ["cuda", "cpu"]
+
+
+@patch("torch.cuda.is_available", return_value=True)
+@patch("torch.cuda.memory_allocated", return_value=5e9)
+def test_memory_management(mock_memory, mock_cuda, kokoro_backend):
+ """Test GPU memory management functions."""
+ # Patch backend so it thinks we have cuda
+ with patch.object(kokoro_backend, "_device", "cuda"):
+ # Test memory check
+ with patch("api.src.inference.kokoro_v1.model_config") as mock_config:
+ mock_config.pytorch_gpu.memory_threshold = 4
+ assert kokoro_backend._check_memory() == True
+
+ mock_config.pytorch_gpu.memory_threshold = 6
+ assert kokoro_backend._check_memory() == False
+
+
+@patch("torch.cuda.empty_cache")
+@patch("torch.cuda.synchronize")
+def test_clear_memory(mock_sync, mock_clear, kokoro_backend):
+ """Test memory clearing."""
+ with patch.object(kokoro_backend, "_device", "cuda"):
+ kokoro_backend._clear_memory()
+ mock_clear.assert_called_once()
+ mock_sync.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_load_model_validation(kokoro_backend):
+ """Test model loading validation."""
+ with pytest.raises(RuntimeError, match="Failed to load Kokoro model"):
+ await kokoro_backend.load_model("nonexistent_model.pth")
+
+
+def test_unload_with_pipelines(kokoro_backend):
+ """Test model unloading with multiple pipelines."""
+ # Mock loaded state with multiple pipelines
+ kokoro_backend._model = MagicMock()
+ pipeline_a = MagicMock()
+ pipeline_e = MagicMock()
+ kokoro_backend._pipelines = {"a": pipeline_a, "e": pipeline_e}
+ assert kokoro_backend.is_loaded
+
+ # Test unload
+ kokoro_backend.unload()
+ assert not kokoro_backend.is_loaded
+ assert kokoro_backend._model is None
+ assert kokoro_backend._pipelines == {} # All pipelines should be cleared
+
+
+@pytest.mark.asyncio
+async def test_generate_validation(kokoro_backend):
+ """Test generation validation."""
+ with pytest.raises(RuntimeError, match="Model not loaded"):
+ async for _ in kokoro_backend.generate("test", "voice"):
+ pass
+
+
+@pytest.mark.asyncio
+async def test_generate_from_tokens_validation(kokoro_backend):
+ """Test token generation validation."""
+ with pytest.raises(RuntimeError, match="Model not loaded"):
+ async for _ in kokoro_backend.generate_from_tokens("test tokens", "voice"):
+ pass
+
+
+def test_get_pipeline_creates_new(kokoro_backend):
+ """Test that _get_pipeline creates new pipeline for new language code."""
+ # Mock loaded state
+ kokoro_backend._model = MagicMock()
+
+ # Mock KPipeline
+ mock_pipeline = MagicMock()
+ with patch(
+ "api.src.inference.kokoro_v1.KPipeline", return_value=mock_pipeline
+ ) as mock_kpipeline:
+ # Get pipeline for Spanish
+ pipeline_e = kokoro_backend._get_pipeline("e")
+
+ # Should create new pipeline with correct params
+ mock_kpipeline.assert_called_once_with(
+ lang_code="e", model=kokoro_backend._model, device=kokoro_backend._device
+ )
+ assert pipeline_e == mock_pipeline
+ assert kokoro_backend._pipelines["e"] == mock_pipeline
+
+
+def test_get_pipeline_reuses_existing(kokoro_backend):
+ """Test that _get_pipeline reuses existing pipeline for same language code."""
+ # Mock loaded state
+ kokoro_backend._model = MagicMock()
+
+ # Mock KPipeline
+ mock_pipeline = MagicMock()
+ with patch(
+ "api.src.inference.kokoro_v1.KPipeline", return_value=mock_pipeline
+ ) as mock_kpipeline:
+ # Get pipeline twice for same language
+ pipeline1 = kokoro_backend._get_pipeline("e")
+ pipeline2 = kokoro_backend._get_pipeline("e")
+
+ # Should only create pipeline once
+ mock_kpipeline.assert_called_once()
+ assert pipeline1 == pipeline2
+ assert kokoro_backend._pipelines["e"] == mock_pipeline
+
+
+@pytest.mark.asyncio
+async def test_generate_uses_correct_pipeline(kokoro_backend):
+ """Test that generate uses correct pipeline for language code."""
+ # Mock loaded state
+ kokoro_backend._model = MagicMock()
+
+ # Mock voice path handling
+ with (
+ patch("api.src.core.paths.load_voice_tensor") as mock_load_voice,
+ patch("api.src.core.paths.save_voice_tensor"),
+ patch("tempfile.gettempdir") as mock_tempdir,
+ ):
+ mock_load_voice.return_value = torch.ones(1)
+ mock_tempdir.return_value = "/tmp"
+
+ # Mock KPipeline
+ mock_pipeline = MagicMock()
+ mock_pipeline.return_value = iter([]) # Empty generator for testing
+ with patch("api.src.inference.kokoro_v1.KPipeline", return_value=mock_pipeline):
+ # Generate with Spanish voice and explicit lang_code
+ async for _ in kokoro_backend.generate("test", "ef_voice", lang_code="e"):
+ pass
+
+ # Should create pipeline with Spanish lang_code
+ assert "e" in kokoro_backend._pipelines
+ # Use ANY to match the temp file path since it's dynamic
+ mock_pipeline.assert_called_with(
+ "test",
+ voice=ANY, # Don't check exact path since it's dynamic
+ speed=1.0,
+ model=kokoro_backend._model,
+ )
+ # Verify the voice path is a temp file path
+ call_args = mock_pipeline.call_args
+ assert isinstance(call_args[1]["voice"], str)
+ assert call_args[1]["voice"].startswith("/tmp/temp_voice_")
diff --git a/api/tests/test_normalizer.py b/api/tests/test_normalizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b48e94e311eed4c7dd5a14a7205e4c1e27d28d3
--- /dev/null
+++ b/api/tests/test_normalizer.py
@@ -0,0 +1,179 @@
+"""Tests for text normalization service"""
+
+import pytest
+
+from api.src.services.text_processing.normalizer import normalize_text
+from api.src.structures.schemas import NormalizationOptions
+
+
+def test_url_protocols():
+ """Test URL protocol handling"""
+ assert (
+ normalize_text(
+ "Check out https://example.com",
+ normalization_options=NormalizationOptions(),
+ )
+ == "Check out https example dot com"
+ )
+ assert (
+ normalize_text(
+ "Visit http://site.com", normalization_options=NormalizationOptions()
+ )
+ == "Visit http site dot com"
+ )
+ assert (
+ normalize_text(
+ "Go to https://test.org/path", normalization_options=NormalizationOptions()
+ )
+ == "Go to https test dot org slash path"
+ )
+
+
+def test_url_www():
+ """Test www prefix handling"""
+ assert (
+ normalize_text(
+ "Go to www.example.com", normalization_options=NormalizationOptions()
+ )
+ == "Go to www example dot com"
+ )
+ assert (
+ normalize_text(
+ "Visit www.test.org/docs", normalization_options=NormalizationOptions()
+ )
+ == "Visit www test dot org slash docs"
+ )
+ assert (
+ normalize_text(
+ "Check www.site.com?q=test", normalization_options=NormalizationOptions()
+ )
+ == "Check www site dot com question-mark q equals test"
+ )
+
+
+def test_url_localhost():
+ """Test localhost URL handling"""
+ assert (
+ normalize_text(
+ "Running on localhost:7860", normalization_options=NormalizationOptions()
+ )
+ == "Running on localhost colon 78 60"
+ )
+ assert (
+ normalize_text(
+ "Server at localhost:8080/api", normalization_options=NormalizationOptions()
+ )
+ == "Server at localhost colon 80 80 slash api"
+ )
+ assert (
+ normalize_text(
+ "Test localhost:3000/test?v=1", normalization_options=NormalizationOptions()
+ )
+ == "Test localhost colon 3000 slash test question-mark v equals 1"
+ )
+
+
+def test_url_ip_addresses():
+ """Test IP address URL handling"""
+ assert (
+ normalize_text(
+ "Access 0.0.0.0:9090/test", normalization_options=NormalizationOptions()
+ )
+ == "Access 0 dot 0 dot 0 dot 0 colon 90 90 slash test"
+ )
+ assert (
+ normalize_text(
+ "API at 192.168.1.1:8000", normalization_options=NormalizationOptions()
+ )
+ == "API at 192 dot 168 dot 1 dot 1 colon 8000"
+ )
+ assert (
+ normalize_text("Server 127.0.0.1", normalization_options=NormalizationOptions())
+ == "Server 127 dot 0 dot 0 dot 1"
+ )
+
+
+def test_url_raw_domains():
+ """Test raw domain handling"""
+ assert (
+ normalize_text(
+ "Visit google.com/search", normalization_options=NormalizationOptions()
+ )
+ == "Visit google dot com slash search"
+ )
+ assert (
+ normalize_text(
+ "Go to example.com/path?q=test",
+ normalization_options=NormalizationOptions(),
+ )
+ == "Go to example dot com slash path question-mark q equals test"
+ )
+ assert (
+ normalize_text(
+ "Check docs.test.com", normalization_options=NormalizationOptions()
+ )
+ == "Check docs dot test dot com"
+ )
+
+
+def test_url_email_addresses():
+ """Test email address handling"""
+ assert (
+ normalize_text(
+ "Email me at user@example.com", normalization_options=NormalizationOptions()
+ )
+ == "Email me at user at example dot com"
+ )
+ assert (
+ normalize_text(
+ "Contact admin@test.org", normalization_options=NormalizationOptions()
+ )
+ == "Contact admin at test dot org"
+ )
+ assert (
+ normalize_text(
+ "Send to test.user@site.com", normalization_options=NormalizationOptions()
+ )
+ == "Send to test dot user at site dot com"
+ )
+
+
+def test_money():
+ """Test that money text is normalized correctly"""
+ assert (
+ normalize_text(
+ "He lost $5.3 thousand.", normalization_options=NormalizationOptions()
+ )
+ == "He lost five point three thousand dollars."
+ )
+ assert (
+ normalize_text(
+ "To put it weirdly -$6.9 million",
+ normalization_options=NormalizationOptions(),
+ )
+ == "To put it weirdly minus six point nine million dollars"
+ )
+ assert (
+ normalize_text("It costs $50.3.", normalization_options=NormalizationOptions())
+ == "It costs fifty dollars and thirty cents."
+ )
+
+
+def test_non_url_text():
+ """Test that non-URL text is unaffected"""
+ assert (
+ normalize_text(
+ "This is not.a.url text", normalization_options=NormalizationOptions()
+ )
+ == "This is not-a-url text"
+ )
+ assert (
+ normalize_text(
+ "Hello, how are you today?", normalization_options=NormalizationOptions()
+ )
+ == "Hello, how are you today?"
+ )
+ assert (
+ normalize_text("It costs $50.", normalization_options=NormalizationOptions())
+ == "It costs fifty dollars."
+ )
diff --git a/api/tests/test_openai_endpoints.py b/api/tests/test_openai_endpoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5c7efcb8c22de5d31875e77f21282bd82280e29
--- /dev/null
+++ b/api/tests/test_openai_endpoints.py
@@ -0,0 +1,499 @@
+import asyncio
+import json
+from typing import AsyncGenerator
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import numpy as np
+import pytest
+from fastapi.testclient import TestClient
+
+from api.src.inference.base import AudioChunk
+from api.src.main import app
+from api.src.routers.openai_compatible import (
+ get_tts_service,
+ load_openai_mappings,
+ stream_audio_chunks,
+)
+from api.src.services.streaming_audio_writer import StreamingAudioWriter
+from api.src.services.tts_service import TTSService
+from api.src.structures.schemas import OpenAISpeechRequest
+
+client = TestClient(app)
+
+
+@pytest.fixture
+def test_voice():
+ """Fixture providing a test voice name."""
+ return "test_voice"
+
+
+@pytest.fixture
+def mock_openai_mappings():
+ """Mock OpenAI mappings for testing."""
+ with patch(
+ "api.src.routers.openai_compatible._openai_mappings",
+ {
+ "models": {"tts-1": "kokoro-v1_0", "tts-1-hd": "kokoro-v1_0"},
+ "voices": {"alloy": "am_adam", "nova": "bf_isabella"},
+ },
+ ):
+ yield
+
+
+@pytest.fixture
+def mock_json_file(tmp_path):
+ """Create a temporary mock JSON file."""
+ content = {
+ "models": {"test-model": "test-kokoro"},
+ "voices": {"test-voice": "test-internal"},
+ }
+ json_file = tmp_path / "test_mappings.json"
+ json_file.write_text(json.dumps(content))
+ return json_file
+
+
+def test_load_openai_mappings(mock_json_file):
+ """Test loading OpenAI mappings from JSON file"""
+ with patch("os.path.join", return_value=str(mock_json_file)):
+ mappings = load_openai_mappings()
+ assert "models" in mappings
+ assert "voices" in mappings
+ assert mappings["models"]["test-model"] == "test-kokoro"
+ assert mappings["voices"]["test-voice"] == "test-internal"
+
+
+def test_load_openai_mappings_file_not_found():
+ """Test handling of missing mappings file"""
+ with patch("os.path.join", return_value="/nonexistent/path"):
+ mappings = load_openai_mappings()
+ assert mappings == {"models": {}, "voices": {}}
+
+
+def test_list_models(mock_openai_mappings):
+ """Test listing available models endpoint"""
+ response = client.get("/v1/models")
+ assert response.status_code == 200
+ data = response.json()
+ assert data["object"] == "list"
+ assert isinstance(data["data"], list)
+ assert len(data["data"]) == 3 # tts-1, tts-1-hd, and kokoro
+
+ # Verify all expected models are present
+ model_ids = [model["id"] for model in data["data"]]
+ assert "tts-1" in model_ids
+ assert "tts-1-hd" in model_ids
+ assert "kokoro" in model_ids
+
+ # Verify model format
+ for model in data["data"]:
+ assert model["object"] == "model"
+ assert "created" in model
+ assert model["owned_by"] == "kokoro"
+
+
+def test_retrieve_model(mock_openai_mappings):
+ """Test retrieving a specific model endpoint"""
+ # Test successful model retrieval
+ response = client.get("/v1/models/tts-1")
+ assert response.status_code == 200
+ data = response.json()
+ assert data["id"] == "tts-1"
+ assert data["object"] == "model"
+ assert data["owned_by"] == "kokoro"
+ assert "created" in data
+
+ # Test non-existent model
+ response = client.get("/v1/models/nonexistent-model")
+ assert response.status_code == 404
+ error = response.json()
+ assert error["detail"]["error"] == "model_not_found"
+ assert "not found" in error["detail"]["message"]
+ assert error["detail"]["type"] == "invalid_request_error"
+
+
+@pytest.mark.asyncio
+async def test_get_tts_service_initialization():
+ """Test TTSService initialization"""
+ with patch("api.src.routers.openai_compatible._tts_service", None):
+ with patch("api.src.routers.openai_compatible._init_lock", None):
+ with patch("api.src.services.tts_service.TTSService.create") as mock_create:
+ mock_service = AsyncMock()
+ mock_create.return_value = mock_service
+
+ # Test concurrent access
+ async def get_service():
+ return await get_tts_service()
+
+ # Create multiple concurrent requests
+ tasks = [get_service() for _ in range(5)]
+ results = await asyncio.gather(*tasks)
+
+ # Verify service was created only once
+ mock_create.assert_called_once()
+ assert all(r == mock_service for r in results)
+
+
+@pytest.mark.asyncio
+async def test_stream_audio_chunks_client_disconnect():
+ """Test handling of client disconnect during streaming"""
+ mock_request = MagicMock()
+ mock_request.is_disconnected = AsyncMock(return_value=True)
+
+ mock_service = AsyncMock()
+
+ async def mock_stream(*args, **kwargs):
+ for i in range(5):
+ yield AudioChunk(np.ndarray([], np.int16), output=b"chunk")
+
+ mock_service.generate_audio_stream = mock_stream
+ mock_service.list_voices.return_value = ["test_voice"]
+
+ request = OpenAISpeechRequest(
+ model="kokoro",
+ input="Test text",
+ voice="test_voice",
+ response_format="mp3",
+ stream=True,
+ speed=1.0,
+ )
+
+ writer = StreamingAudioWriter("mp3", 24000)
+
+ chunks = []
+ async for chunk in stream_audio_chunks(mock_service, request, mock_request, writer):
+ chunks.append(chunk)
+
+ writer.close()
+
+ assert len(chunks) == 0 # Should stop immediately due to disconnect
+
+
+def test_openai_voice_mapping(mock_tts_service, mock_openai_mappings):
+ """Test OpenAI voice name mapping"""
+ mock_tts_service.list_voices.return_value = ["am_adam", "bf_isabella"]
+
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "tts-1",
+ "input": "Hello world",
+ "voice": "alloy", # OpenAI voice name
+ "response_format": "mp3",
+ "stream": False,
+ },
+ )
+ assert response.status_code == 200
+ mock_tts_service.generate_audio.assert_called_once()
+ assert mock_tts_service.generate_audio.call_args[1]["voice"] == "am_adam"
+
+
+def test_openai_voice_mapping_streaming(
+ mock_tts_service, mock_openai_mappings, mock_audio_bytes
+):
+ """Test OpenAI voice mapping in streaming mode"""
+ mock_tts_service.list_voices.return_value = ["am_adam", "bf_isabella"]
+
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "tts-1-hd",
+ "input": "Hello world",
+ "voice": "nova", # OpenAI voice name
+ "response_format": "mp3",
+ "stream": True,
+ },
+ )
+ assert response.status_code == 200
+ content = b""
+ for chunk in response.iter_bytes():
+ content += chunk
+ assert content == mock_audio_bytes
+
+
+def test_invalid_openai_model(mock_tts_service, mock_openai_mappings):
+ """Test error handling for invalid OpenAI model"""
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "invalid-model",
+ "input": "Hello world",
+ "voice": "alloy",
+ "response_format": "mp3",
+ "stream": False,
+ },
+ )
+ assert response.status_code == 400
+ error_response = response.json()
+ assert error_response["detail"]["error"] == "invalid_model"
+ assert "Unsupported model" in error_response["detail"]["message"]
+
+
+@pytest.fixture
+def mock_audio_bytes():
+ """Mock audio bytes for testing."""
+ return b"mock audio data"
+
+
+@pytest.fixture
+def mock_tts_service(mock_audio_bytes):
+ """Mock TTS service for testing."""
+ with patch("api.src.routers.openai_compatible.get_tts_service") as mock_get:
+ service = AsyncMock(spec=TTSService)
+ service.generate_audio.return_value = AudioChunk(np.zeros(1000, np.int16))
+
+ async def mock_stream(*args, **kwargs) -> AsyncGenerator[AudioChunk, None]:
+ yield AudioChunk(np.ndarray([], np.int16), output=mock_audio_bytes)
+
+ service.generate_audio_stream = mock_stream
+ service.list_voices.return_value = ["test_voice", "voice1", "voice2"]
+ service.combine_voices.return_value = "voice1_voice2"
+
+ mock_get.return_value = service
+ mock_get.side_effect = None
+ yield service
+
+
+@patch("api.src.services.audio.AudioService.convert_audio")
+def test_openai_speech_endpoint(
+ mock_convert, mock_tts_service, test_voice, mock_audio_bytes
+):
+ """Test the OpenAI-compatible speech endpoint with basic MP3 generation"""
+ # Configure mocks
+ mock_tts_service.generate_audio.return_value = AudioChunk(np.zeros(1000, np.int16))
+ mock_convert.return_value = AudioChunk(
+ np.zeros(1000, np.int16), output=mock_audio_bytes
+ )
+
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world",
+ "voice": test_voice,
+ "response_format": "mp3",
+ "stream": False,
+ },
+ )
+ assert response.status_code == 200
+ assert response.headers["content-type"] == "audio/mpeg"
+ assert len(response.content) > 0
+ assert response.content == mock_audio_bytes + mock_audio_bytes
+
+ mock_tts_service.generate_audio.assert_called_once()
+ assert mock_convert.call_count == 2
+
+
+def test_openai_speech_streaming(mock_tts_service, test_voice, mock_audio_bytes):
+ """Test the OpenAI-compatible speech endpoint with streaming"""
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world",
+ "voice": test_voice,
+ "response_format": "mp3",
+ "stream": True,
+ },
+ )
+ assert response.status_code == 200
+ assert response.headers["content-type"] == "audio/mpeg"
+ assert "Transfer-Encoding" in response.headers
+ assert response.headers["Transfer-Encoding"] == "chunked"
+
+ content = b""
+ for chunk in response.iter_bytes():
+ content += chunk
+ assert content == mock_audio_bytes
+
+
+def test_openai_speech_pcm_streaming(mock_tts_service, test_voice, mock_audio_bytes):
+ """Test PCM streaming format"""
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world",
+ "voice": test_voice,
+ "response_format": "pcm",
+ "stream": True,
+ },
+ )
+ assert response.status_code == 200
+ assert response.headers["content-type"] == "audio/pcm"
+
+ content = b""
+ for chunk in response.iter_bytes():
+ content += chunk
+ assert content == mock_audio_bytes
+
+
+def test_openai_speech_invalid_voice(mock_tts_service):
+ """Test error handling for invalid voice"""
+ mock_tts_service.generate_audio.side_effect = ValueError(
+ "Voice 'invalid_voice' not found"
+ )
+
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world",
+ "voice": "invalid_voice",
+ "response_format": "mp3",
+ "stream": False,
+ },
+ )
+ assert response.status_code == 400
+ error_response = response.json()
+ assert error_response["detail"]["error"] == "validation_error"
+ assert "Voice 'invalid_voice' not found" in error_response["detail"]["message"]
+ assert error_response["detail"]["type"] == "invalid_request_error"
+
+
+def test_openai_speech_empty_text(mock_tts_service, test_voice):
+ """Test error handling for empty text"""
+
+ async def mock_error_stream(*args, **kwargs):
+ raise ValueError("Text is empty after preprocessing")
+
+ mock_tts_service.generate_audio = mock_error_stream
+ mock_tts_service.list_voices.return_value = ["test_voice"]
+
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "",
+ "voice": test_voice,
+ "response_format": "mp3",
+ "stream": False,
+ },
+ )
+ assert response.status_code == 400
+ error_response = response.json()
+ assert error_response["detail"]["error"] == "validation_error"
+ assert "Text is empty after preprocessing" in error_response["detail"]["message"]
+ assert error_response["detail"]["type"] == "invalid_request_error"
+
+
+def test_openai_speech_invalid_format(mock_tts_service, test_voice):
+ """Test error handling for invalid format"""
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world",
+ "voice": test_voice,
+ "response_format": "invalid_format",
+ "stream": False,
+ },
+ )
+ assert response.status_code == 422 # Validation error from Pydantic
+
+
+def test_list_voices(mock_tts_service):
+ """Test listing available voices"""
+ # Override the mock for this specific test
+ mock_tts_service.list_voices.return_value = ["voice1", "voice2"]
+
+ response = client.get("/v1/audio/voices")
+ assert response.status_code == 200
+ data = response.json()
+ assert "voices" in data
+ assert len(data["voices"]) == 2
+ assert "voice1" in data["voices"]
+ assert "voice2" in data["voices"]
+
+
+@patch("api.src.routers.openai_compatible.settings")
+def test_combine_voices(mock_settings, mock_tts_service):
+ """Test combining voices endpoint"""
+ # Enable local voice saving for this test
+ mock_settings.allow_local_voice_saving = True
+
+ response = client.post("/v1/audio/voices/combine", json="voice1+voice2")
+ assert response.status_code == 200
+ assert response.headers["content-type"] == "application/octet-stream"
+ assert "voice1+voice2.pt" in response.headers["content-disposition"]
+
+
+def test_server_error(mock_tts_service, test_voice):
+ """Test handling of server errors"""
+
+ async def mock_error_stream(*args, **kwargs):
+ raise RuntimeError("Internal server error")
+
+ mock_tts_service.generate_audio = mock_error_stream
+ mock_tts_service.list_voices.return_value = ["test_voice"]
+
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world",
+ "voice": test_voice,
+ "response_format": "mp3",
+ "stream": False,
+ },
+ )
+ assert response.status_code == 500
+ error_response = response.json()
+ assert error_response["detail"]["error"] == "processing_error"
+ assert error_response["detail"]["type"] == "server_error"
+
+
+def test_streaming_error(mock_tts_service, test_voice):
+ """Test handling streaming errors"""
+ # Mock process_voices to raise the error
+ mock_tts_service.list_voices.side_effect = RuntimeError("Streaming failed")
+
+ response = client.post(
+ "/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world",
+ "voice": test_voice,
+ "response_format": "mp3",
+ "stream": True,
+ },
+ )
+
+ assert response.status_code == 500
+ error_data = response.json()
+ assert error_data["detail"]["error"] == "processing_error"
+ assert error_data["detail"]["type"] == "server_error"
+ assert "Streaming failed" in error_data["detail"]["message"]
+
+
+@pytest.mark.asyncio
+async def test_streaming_initialization_error():
+ """Test handling of streaming initialization errors"""
+ mock_service = AsyncMock()
+
+ async def mock_error_stream(*args, **kwargs):
+ if False: # This makes it a proper generator
+ yield b""
+ raise RuntimeError("Failed to initialize stream")
+
+ mock_service.generate_audio_stream = mock_error_stream
+ mock_service.list_voices.return_value = ["test_voice"]
+
+ request = OpenAISpeechRequest(
+ model="kokoro",
+ input="Test text",
+ voice="test_voice",
+ response_format="mp3",
+ stream=True,
+ speed=1.0,
+ )
+
+ writer = StreamingAudioWriter("mp3", 24000)
+
+ with pytest.raises(RuntimeError) as exc:
+ async for _ in stream_audio_chunks(mock_service, request, MagicMock(), writer):
+ pass
+
+ writer.close()
+ assert "Failed to initialize stream" in str(exc.value)
diff --git a/api/tests/test_paths.py b/api/tests/test_paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..715934e7bd03599d6204733d333c683b99c9691c
--- /dev/null
+++ b/api/tests/test_paths.py
@@ -0,0 +1,138 @@
+from unittest.mock import patch
+
+import pytest
+
+from api.src.core.paths import (
+ _find_file,
+ _scan_directories,
+ get_content_type,
+ get_temp_dir_size,
+ get_temp_file_path,
+ list_temp_files,
+)
+
+
+@pytest.mark.asyncio
+async def test_find_file_exists():
+ """Test finding existing file."""
+ with patch("aiofiles.os.path.exists") as mock_exists:
+ mock_exists.return_value = True
+ path = await _find_file("test.txt", ["/test/path"])
+ assert path == "/test/path/test.txt"
+
+
+@pytest.mark.asyncio
+async def test_find_file_not_exists():
+ """Test finding non-existent file."""
+ with patch("aiofiles.os.path.exists") as mock_exists:
+ mock_exists.return_value = False
+ with pytest.raises(FileNotFoundError, match="File not found"):
+ await _find_file("test.txt", ["/test/path"])
+
+
+@pytest.mark.asyncio
+async def test_find_file_with_filter():
+ """Test finding file with filter function."""
+ with patch("aiofiles.os.path.exists") as mock_exists:
+ mock_exists.return_value = True
+ filter_fn = lambda p: p.endswith(".txt")
+ path = await _find_file("test.txt", ["/test/path"], filter_fn)
+ assert path == "/test/path/test.txt"
+
+
+@pytest.mark.asyncio
+async def test_scan_directories():
+ """Test scanning directories."""
+ mock_entry = type("MockEntry", (), {"name": "test.txt"})()
+
+ with (
+ patch("aiofiles.os.path.exists") as mock_exists,
+ patch("aiofiles.os.scandir") as mock_scandir,
+ ):
+ mock_exists.return_value = True
+ mock_scandir.return_value = [mock_entry]
+
+ files = await _scan_directories(["/test/path"])
+ assert "test.txt" in files
+
+
+@pytest.mark.asyncio
+async def test_get_content_type():
+ """Test content type detection."""
+ test_cases = [
+ ("test.html", "text/html"),
+ ("test.js", "application/javascript"),
+ ("test.css", "text/css"),
+ ("test.png", "image/png"),
+ ("test.unknown", "application/octet-stream"),
+ ]
+
+ for filename, expected in test_cases:
+ content_type = await get_content_type(filename)
+ assert content_type == expected
+
+
+@pytest.mark.asyncio
+async def test_get_temp_file_path():
+ """Test temp file path generation."""
+ with (
+ patch("aiofiles.os.path.exists") as mock_exists,
+ patch("aiofiles.os.makedirs") as mock_makedirs,
+ ):
+ mock_exists.return_value = False
+
+ path = await get_temp_file_path("test.wav")
+ assert "test.wav" in path
+ mock_makedirs.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_list_temp_files():
+ """Test listing temp files."""
+
+ class MockEntry:
+ def __init__(self, name):
+ self.name = name
+
+ def is_file(self):
+ return True
+
+ mock_entry = MockEntry("test.wav")
+
+ with (
+ patch("aiofiles.os.path.exists") as mock_exists,
+ patch("aiofiles.os.scandir") as mock_scandir,
+ ):
+ mock_exists.return_value = True
+ mock_scandir.return_value = [mock_entry]
+
+ files = await list_temp_files()
+ assert "test.wav" in files
+
+
+@pytest.mark.asyncio
+async def test_get_temp_dir_size():
+ """Test getting temp directory size."""
+
+ class MockEntry:
+ def __init__(self, path):
+ self.path = path
+
+ def is_file(self):
+ return True
+
+ mock_entry = MockEntry("/tmp/test.wav")
+ mock_stat = type("MockStat", (), {"st_size": 1024})()
+
+ with (
+ patch("aiofiles.os.path.exists") as mock_exists,
+ patch("aiofiles.os.scandir") as mock_scandir,
+ patch("aiofiles.os.stat") as mock_stat_fn,
+ ):
+ mock_exists.return_value = True
+ mock_scandir.return_value = [mock_entry]
+ mock_stat_fn.return_value = mock_stat
+
+ size = await get_temp_dir_size()
+ assert size == 1024
diff --git a/api/tests/test_text_processor.py b/api/tests/test_text_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfcbcfe40addb1fdc84d298b5888bfbbf245c34b
--- /dev/null
+++ b/api/tests/test_text_processor.py
@@ -0,0 +1,105 @@
+import pytest
+
+from api.src.services.text_processing.text_processor import (
+ get_sentence_info,
+ process_text_chunk,
+ smart_split,
+)
+
+
+def test_process_text_chunk_basic():
+ """Test basic text chunk processing."""
+ text = "Hello world"
+ tokens = process_text_chunk(text)
+ assert isinstance(tokens, list)
+ assert len(tokens) > 0
+
+
+def test_process_text_chunk_empty():
+ """Test processing empty text."""
+ text = ""
+ tokens = process_text_chunk(text)
+ assert isinstance(tokens, list)
+ assert len(tokens) == 0
+
+
+def test_process_text_chunk_phonemes():
+ """Test processing with skip_phonemize."""
+ phonemes = "h @ l @U" # Example phoneme sequence
+ tokens = process_text_chunk(phonemes, skip_phonemize=True)
+ assert isinstance(tokens, list)
+ assert len(tokens) > 0
+
+
+def test_get_sentence_info():
+ """Test sentence splitting and info extraction."""
+ text = "This is sentence one. This is sentence two! What about three?"
+ results = get_sentence_info(text, {})
+
+ assert len(results) == 3
+ for sentence, tokens, count in results:
+ assert isinstance(sentence, str)
+ assert isinstance(tokens, list)
+ assert isinstance(count, int)
+ assert count == len(tokens)
+ assert count > 0
+
+
+def test_get_sentence_info_phonemes():
+    """Test sentence splitting and info extraction with custom phoneme markers."""
+ text = (
+ "This is sentence one. This is |custom_phonemes_0|/> two! What about three?"
+ )
+ results = get_sentence_info(text, {"|custom_phonemes_0|/>": r"sˈɛntᵊns"})
+
+ assert len(results) == 3
+ assert "sˈɛntᵊns" in results[1][0]
+ for sentence, tokens, count in results:
+ assert isinstance(sentence, str)
+ assert isinstance(tokens, list)
+ assert isinstance(count, int)
+ assert count == len(tokens)
+ assert count > 0
+
+
+@pytest.mark.asyncio
+async def test_smart_split_short_text():
+ """Test smart splitting with text under max tokens."""
+ text = "This is a short test sentence."
+ chunks = []
+ async for chunk_text, chunk_tokens in smart_split(text):
+ chunks.append((chunk_text, chunk_tokens))
+
+ assert len(chunks) == 1
+ assert isinstance(chunks[0][0], str)
+ assert isinstance(chunks[0][1], list)
+
+
+@pytest.mark.asyncio
+async def test_smart_split_long_text():
+ """Test smart splitting with longer text."""
+ # Create text that should split into multiple chunks
+ text = ". ".join(["This is test sentence number " + str(i) for i in range(20)])
+
+ chunks = []
+ async for chunk_text, chunk_tokens in smart_split(text):
+ chunks.append((chunk_text, chunk_tokens))
+
+ assert len(chunks) > 1
+ for chunk_text, chunk_tokens in chunks:
+ assert isinstance(chunk_text, str)
+ assert isinstance(chunk_tokens, list)
+ assert len(chunk_tokens) > 0
+
+
+@pytest.mark.asyncio
+async def test_smart_split_with_punctuation():
+ """Test smart splitting handles punctuation correctly."""
+ text = "First sentence! Second sentence? Third sentence; Fourth sentence: Fifth sentence."
+
+ chunks = []
+ async for chunk_text, chunk_tokens in smart_split(text):
+ chunks.append(chunk_text)
+
+ # Verify punctuation is preserved
+ assert all(any(p in chunk for p in "!?;:.") for chunk in chunks)
diff --git a/api/tests/test_tts_service.py b/api/tests/test_tts_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8447ab19425531355ae97f85a38e55f8956e6a
--- /dev/null
+++ b/api/tests/test_tts_service.py
@@ -0,0 +1,126 @@
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+import torch
+
+from api.src.services.tts_service import TTSService
+
+
+@pytest.fixture
+def mock_managers():
+ """Mock model and voice managers."""
+
+ async def _mock_managers():
+ model_manager = AsyncMock()
+ model_manager.get_backend.return_value = MagicMock()
+
+ voice_manager = AsyncMock()
+ voice_manager.get_voice_path.return_value = "/path/to/voice.pt"
+ voice_manager.list_voices.return_value = ["voice1", "voice2"]
+
+ with (
+ patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
+ patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
+ ):
+ mock_get_model.return_value = model_manager
+ mock_get_voice.return_value = voice_manager
+ return model_manager, voice_manager
+
+ return _mock_managers()
+
+
+@pytest.fixture
+def tts_service(mock_managers):
+ """Create TTSService instance with mocked dependencies."""
+
+ async def _create_service():
+ return await TTSService.create("test_output")
+
+ return _create_service()
+
+
+@pytest.mark.asyncio
+async def test_service_creation():
+ """Test service creation and initialization."""
+ model_manager = AsyncMock()
+ voice_manager = AsyncMock()
+
+ with (
+ patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
+ patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
+ ):
+ mock_get_model.return_value = model_manager
+ mock_get_voice.return_value = voice_manager
+
+ service = await TTSService.create("test_output")
+ assert service.output_dir == "test_output"
+ assert service.model_manager is model_manager
+ assert service._voice_manager is voice_manager
+
+
+@pytest.mark.asyncio
+async def test_get_voice_path_single():
+ """Test getting path for single voice."""
+ model_manager = AsyncMock()
+ voice_manager = AsyncMock()
+ voice_manager.get_voice_path.return_value = "/path/to/voice1.pt"
+
+ with (
+ patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
+ patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
+ ):
+ mock_get_model.return_value = model_manager
+ mock_get_voice.return_value = voice_manager
+
+ service = await TTSService.create("test_output")
+ name, path = await service._get_voices_path("voice1")
+ assert name == "voice1"
+ assert path == "/path/to/voice1.pt"
+ voice_manager.get_voice_path.assert_called_once_with("voice1")
+
+
+@pytest.mark.asyncio
+async def test_get_voice_path_combined():
+ """Test getting path for combined voices."""
+ model_manager = AsyncMock()
+ voice_manager = AsyncMock()
+ voice_manager.get_voice_path.return_value = "/path/to/voice.pt"
+
+ with (
+ patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
+ patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
+ patch("torch.load") as mock_load,
+ patch("torch.save") as mock_save,
+ patch("tempfile.gettempdir") as mock_temp,
+ ):
+ mock_get_model.return_value = model_manager
+ mock_get_voice.return_value = voice_manager
+ mock_temp.return_value = "/tmp"
+ mock_load.return_value = torch.ones(10)
+
+ service = await TTSService.create("test_output")
+ name, path = await service._get_voices_path("voice1+voice2")
+ assert name == "voice1+voice2"
+ assert path.endswith("voice1+voice2.pt")
+ mock_save.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_list_voices():
+ """Test listing available voices."""
+ model_manager = AsyncMock()
+ voice_manager = AsyncMock()
+ voice_manager.list_voices.return_value = ["voice1", "voice2"]
+
+ with (
+ patch("api.src.services.tts_service.get_model_manager") as mock_get_model,
+ patch("api.src.services.tts_service.get_voice_manager") as mock_get_voice,
+ ):
+ mock_get_model.return_value = model_manager
+ mock_get_voice.return_value = voice_manager
+
+ service = await TTSService.create("test_output")
+ voices = await service.list_voices()
+ assert voices == ["voice1", "voice2"]
+ voice_manager.list_voices.assert_called_once()
diff --git a/charts/kokoro-fastapi/.helmignore b/charts/kokoro-fastapi/.helmignore
new file mode 100644
index 0000000000000000000000000000000000000000..0e8a0eb36f4ca2c939201c0d54b5d82a1ea34778
--- /dev/null
+++ b/charts/kokoro-fastapi/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/charts/kokoro-fastapi/Chart.yaml b/charts/kokoro-fastapi/Chart.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f6753230cb9449a5ed7b6cefdd39496d560d9
--- /dev/null
+++ b/charts/kokoro-fastapi/Chart.yaml
@@ -0,0 +1,12 @@
+apiVersion: v2
+name: kokoro-fastapi
+description: A Helm chart for deploying the Kokoro FastAPI TTS service to Kubernetes
+type: application
+version: 0.3.0
+appVersion: "0.3.0"
+
+keywords:
+ - tts
+ - fastapi
+ - gpu
+ - kokoro
diff --git a/charts/kokoro-fastapi/examples/aks-tls-values.yaml b/charts/kokoro-fastapi/examples/aks-tls-values.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..236af0a809c19e9c56147ab9f4a061151ecdf88b
--- /dev/null
+++ b/charts/kokoro-fastapi/examples/aks-tls-values.yaml
@@ -0,0 +1,54 @@
+# Tested on
+# - Azure AKS with a GPU node pool and the NVIDIA GPU operator
+# - This setup uses one ingress and load-balances across the replicas, enabling simultaneous requests
+#
+# Azure CLI command to create a GPU node pool:
+# az aks nodepool add \
+# --resource-group $AZ_RESOURCE_GROUP \
+# --cluster-name $CLUSTER_NAME \
+# --name t4gpus \
+# --node-vm-size Standard_NC4as_T4_v3 \
+# --node-count 2 \
+# --enable-cluster-autoscaler \
+# --min-count 1 \
+# --max-count 2 \
+# --priority Spot \
+# --eviction-policy Delete \
+# --spot-max-price -1 \
+# --node-taints "sku=gpu:NoSchedule,kubernetes.azure.com/scalesetpriority=spot:NoSchedule" \
+# --skip-gpu-driver-install
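+#
+# Install sketch (assuming this file is saved locally as aks-tls-values.yaml):
+#   helm install kokoro ./charts/kokoro-fastapi -f aks-tls-values.yaml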
+
+kokoroTTS:
+ replicaCount: 8
+ port: 8880
+ tag: v0.2.0
+ pullPolicy: IfNotPresent
+
+# Azure specific settings for spot t4 GPU nodes with Nvidia GPU operator
+tolerations:
+ - key: "kubernetes.azure.com/scalesetpriority"
+ operator: Equal
+ value: "spot"
+ effect: NoSchedule
+ - key: "sku"
+ operator: Equal
+ value: "gpu"
+ effect: NoSchedule
+
+ingress:
+ enabled: true
+ className: "nginx"
+ annotations:
+ # Requires cert-manager and external-dns to be in the cluster for TLS and DNS
+ cert-manager.io/cluster-issuer: letsencrypt-prod
+ external-dns.alpha.kubernetes.io/hostname: your-external-dns-enabled-hostname
+ external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
+ hosts:
+ - host: your-external-dns-enabled-hostname
+ paths:
+ - path: /
+ pathType: Prefix
+ tls:
+ - secretName: kokoro-fastapi-tls
+ hosts:
+ - your-external-dns-enabled-hostname
\ No newline at end of file
diff --git a/charts/kokoro-fastapi/examples/gpu-operator-values.yaml b/charts/kokoro-fastapi/examples/gpu-operator-values.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b74667f982ba2467cb3d5dab68aba0da21431e0f
--- /dev/null
+++ b/charts/kokoro-fastapi/examples/gpu-operator-values.yaml
@@ -0,0 +1,56 @@
+# Follow the official NVIDIA GPU Operator documentation
+# to install the GPU operator with these settings:
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html
+#
+# This example is for an NVIDIA T4 16 GB GPU node pool on Azure AKS, with one GPU per node.
+# It uses time-slicing to share each GPU, reporting to the cluster that 1 GPU is 4 GPUs,
+# so each pod gets a smaller slice of the GPU with roughly 4 GB of memory.
+#
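+# Once the operator applies this config, each node should advertise the sliced
+# capacity (hypothetical output, assuming one physical T4 per node):
+#   kubectl get node <node-name> -o jsonpath='{.status.allocatable.nvidia\.com/gpu}'
+#   4
+#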
+devicePlugin: # Remove this if you don't want to use time-slicing
+ config:
+ create: true
+ name: "time-slicing-config"
+ default: "any"
+ data:
+ any: |-
+ version: v1
+ flags:
+ migStrategy: none
+ sharing:
+ timeSlicing:
+ resources:
+ - name: nvidia.com/gpu
+ replicas: 4
+
+daemonsets:
+ tolerations:
+ - key: "sku"
+ operator: Equal
+ value: "gpu"
+ effect: NoSchedule
+ - key: "kubernetes.azure.com/scalesetpriority"
+ operator: Equal
+ value: "spot"
+ effect: NoSchedule
+
+node-feature-discovery:
+ master:
+ tolerations:
+ - key: "sku"
+ operator: Equal
+ value: "gpu"
+ effect: NoSchedule
+ - key: "kubernetes.azure.com/scalesetpriority"
+ operator: Equal
+ value: "spot"
+ effect: NoSchedule
+ worker:
+ tolerations:
+ - key: "sku"
+ operator: Equal
+ value: "gpu"
+ effect: NoSchedule
+ - key: "kubernetes.azure.com/scalesetpriority"
+ operator: Equal
+ value: "spot"
+ effect: NoSchedule
\ No newline at end of file
diff --git a/charts/kokoro-fastapi/templates/NOTES.txt b/charts/kokoro-fastapi/templates/NOTES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bc009b803366e51c07819242b110929ea8b748c1
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/NOTES.txt
@@ -0,0 +1,22 @@
+1. Get the application URL by running these commands:
+{{- if .Values.ingress.enabled }}
+{{- range $host := .Values.ingress.hosts }}
+ {{- range .paths }}
+ http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
+ {{- end }}
+{{- end }}
+{{- else if contains "NodePort" .Values.service.type }}
+  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service)
+ export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
+ echo http://$NODE_IP:$NODE_PORT
+{{- else if contains "LoadBalancer" .Values.service.type }}
+ NOTE: It may take a few minutes for the LoadBalancer IP to be available.
+  You can watch the status of it by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service'
+  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
+ echo http://$SERVICE_IP:{{ .Values.kokoroTTS.port }}
+{{- else if contains "ClusterIP" .Values.service.type }}
+ export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kokoro-fastapi.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
+ export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
+ echo "Visit http://127.0.0.1:8880 to use your application"
+ kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8880:$CONTAINER_PORT
+{{- end }}
diff --git a/charts/kokoro-fastapi/templates/_helpers.tpl b/charts/kokoro-fastapi/templates/_helpers.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..849bcd751a8731b3235596a9830bf3f216ab7183
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/_helpers.tpl
@@ -0,0 +1,62 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "kokoro-fastapi.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "kokoro-fastapi.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "kokoro-fastapi.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "kokoro-fastapi.labels" -}}
+helm.sh/chart: {{ include "kokoro-fastapi.chart" . }}
+{{ include "kokoro-fastapi.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels
+*/}}
+{{- define "kokoro-fastapi.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "kokoro-fastapi.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "kokoro-fastapi.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "kokoro-fastapi.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
diff --git a/charts/kokoro-fastapi/templates/hpa.yaml b/charts/kokoro-fastapi/templates/hpa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..45ba60bd948bf2f39aa285586f3243c9d861f838
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/hpa.yaml
@@ -0,0 +1,28 @@
+{{- if .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+ name: {{ include "kokoro-fastapi.fullname" . }}
+ labels:
+ {{- include "kokoro-fastapi.labels" . | nindent 4 }}
+spec:
+ scaleTargetRef:
+ apiVersion: apps/v1
+ kind: Deployment
+    name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts
+ minReplicas: {{ .Values.autoscaling.minReplicas }}
+ maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+ metrics:
+    {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
+    {{- end }}
+    {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    {{- end }}
+{{- end }}
diff --git a/charts/kokoro-fastapi/templates/ingress.yaml b/charts/kokoro-fastapi/templates/ingress.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a9c9f4e8d5cbd6626ade55c5be48d7dd577b5804
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/ingress.yaml
@@ -0,0 +1,43 @@
+{{- if .Values.ingress.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: {{ include "kokoro-fastapi.fullname" . }}
+ labels:
+ {{- include "kokoro-fastapi.labels" . | nindent 4 }}
+ {{- with .Values.ingress.annotations }}
+ annotations:
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+spec:
+ {{- with .Values.ingress.className }}
+ ingressClassName: {{ . }}
+ {{- end }}
+ {{- if .Values.ingress.tls }}
+ tls:
+ {{- range .Values.ingress.tls }}
+ - hosts:
+ {{- range .hosts }}
+ - {{ . | quote }}
+ {{- end }}
+ secretName: {{ .secretName }}
+ {{- end }}
+ {{- end }}
+ rules:
+ {{- range .Values.ingress.hosts }}
+ - host: {{ .host | quote }}
+ http:
+ paths:
+ {{- range .paths }}
+ - path: {{ .path }}
+ {{- with .pathType }}
+ pathType: {{ . }}
+ {{- end }}
+ backend:
+ service:
+ name: {{ include "kokoro-fastapi.fullname" $ }}-kokoro-tts-service
+ port:
+ number: {{ $.Values.kokoroTTS.port }}
+ {{- end }}
+ {{- end }}
+{{- end }}
diff --git a/charts/kokoro-fastapi/templates/kokoro-tts-deployment.yaml b/charts/kokoro-fastapi/templates/kokoro-tts-deployment.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2178a08a64f3b2fdde728532f69b20e56750f93a
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/kokoro-tts-deployment.yaml
@@ -0,0 +1,77 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts
+ labels:
+ {{- include "kokoro-fastapi.labels" . | nindent 4 }}
+spec:
+ {{- if not .Values.autoscaling.enabled }}
+ replicas: {{ .Values.kokoroTTS.replicaCount }}
+ {{- end }}
+ selector:
+ matchLabels:
+ {{- include "kokoro-fastapi.selectorLabels" . | nindent 6 }}
+ template:
+ metadata:
+ {{- with .Values.podAnnotations }}
+ annotations:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ labels:
+ {{- include "kokoro-fastapi.selectorLabels" . | nindent 8 }}
+ spec:
+ {{- with .Values.kokoroTTS.imagePullSecrets }}
+ imagePullSecrets:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ serviceAccountName: {{ include "kokoro-fastapi.serviceAccountName" . }}
+ securityContext:
+ {{- toYaml .Values.podSecurityContext | nindent 8 }}
+ initContainers: []
+ containers:
+ - name: kokoro-tts
+ securityContext:
+ {{- toYaml .Values.securityContext | nindent 12 }}
+ image: "{{ .Values.kokoroTTS.repository }}:{{ .Values.kokoroTTS.tag | default .Chart.AppVersion }}"
+ imagePullPolicy: {{ .Values.kokoroTTS.pullPolicy }}
+ env:
+ - name: PYTHONPATH
+ value: "/app:/app/api"
+ - name: USE_GPU
+ value: "true"
+ - name: PYTHONUNBUFFERED
+ value: "1"
+ ports:
+ - name: kokoro-tts-http
+ containerPort: {{ .Values.kokoroTTS.port | default 8880 }}
+ protocol: TCP
+ livenessProbe:
+ httpGet:
+ path: /health
+ port: kokoro-tts-http
+ initialDelaySeconds: 30
+ periodSeconds: 30
+ timeoutSeconds: 5
+ readinessProbe:
+ httpGet:
+ path: /health
+ port: kokoro-tts-http
+ initialDelaySeconds: 30
+ periodSeconds: 30
+ timeoutSeconds: 5
+ resources:
+ {{- toYaml .Values.kokoroTTS.resources | nindent 12 }}
+ volumeMounts: []
+ volumes: []
+ {{- with .Values.nodeSelector }}
+ nodeSelector:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.affinity }}
+ affinity:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.tolerations }}
+ tolerations:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
diff --git a/charts/kokoro-fastapi/templates/kokoro-tts-service.yaml b/charts/kokoro-fastapi/templates/kokoro-tts-service.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..79c789039681da8a4d662065769a098be478aef0
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/kokoro-tts-service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service
+ labels:
+ {{- include "kokoro-fastapi.labels" . | nindent 4 }}
+spec:
+ type: {{ .Values.service.type }}
+ ports:
+ - port: {{ .Values.kokoroTTS.port }}
+ targetPort: kokoro-tts-http
+ protocol: TCP
+ name: kokoro-tts-http
+ selector:
+ {{- include "kokoro-fastapi.selectorLabels" . | nindent 4 }}
diff --git a/charts/kokoro-fastapi/templates/serviceaccount.yaml b/charts/kokoro-fastapi/templates/serviceaccount.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f062f72d65bbc9035617c401b44cb4a43b63b3a8
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/serviceaccount.yaml
@@ -0,0 +1,12 @@
+{{- if .Values.serviceAccount.create -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: {{ include "kokoro-fastapi.serviceAccountName" . }}
+ labels:
+ {{- include "kokoro-fastapi.labels" . | nindent 4 }}
+ {{- with .Values.serviceAccount.annotations }}
+ annotations:
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+{{- end }}
diff --git a/charts/kokoro-fastapi/templates/tests/test-connection.yaml b/charts/kokoro-fastapi/templates/tests/test-connection.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8b912c6a42f56f8e1d4225b0c7a64d3b910565d7
--- /dev/null
+++ b/charts/kokoro-fastapi/templates/tests/test-connection.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Pod
+metadata:
+ name: "{{ include "kokoro-fastapi.fullname" . }}-test-connection"
+ labels:
+ {{- include "kokoro-fastapi.labels" . | nindent 4 }}
+ annotations:
+ "helm.sh/hook": test
+spec:
+ containers:
+ - name: wget
+ image: busybox
+ command: ['wget']
+      args: ['{{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service:{{ .Values.kokoroTTS.port }}']
+ restartPolicy: Never
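+# After installing the chart, run this hook with: helm test <release-name>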
diff --git a/charts/kokoro-fastapi/values.yaml b/charts/kokoro-fastapi/values.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e2e37e44c0597e209366db8cb7b8a04d2a6a268f
--- /dev/null
+++ b/charts/kokoro-fastapi/values.yaml
@@ -0,0 +1,75 @@
+# Default values for kokoro-fastapi.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+kokoroTTS:
+ replicaCount: 1
+ # The name of the deployment repository
+ repository: "ghcr.io/remsky/kokoro-fastapi-gpu"
+ imagePullSecrets: [] # Set if using a private image or getting rate limited
+ tag: "latest"
+ pullPolicy: Always
+ port: 8880
+ resources:
+ limits:
+ nvidia.com/gpu: 1
+ requests:
+ nvidia.com/gpu: 1
+
+nameOverride: ""
+fullnameOverride: ""
+
+serviceAccount:
+ # Specifies whether a service account should be created
+ create: true
+ # Annotations to add to the service account
+ annotations: {}
+ # The name of the service account to use.
+ # If not set and create is true, a name is generated using the fullname template
+ name: ""
+
+podAnnotations: {}
+
+podSecurityContext: {}
+ # fsGroup: 2000
+
+securityContext: {}
+ # capabilities:
+ # drop:
+ # - ALL
+ # readOnlyRootFilesystem: true
+ # runAsNonRoot: true
+ # runAsUser: 1000
+
+service:
+ type: ClusterIP
+
+ingress:
+ enabled: false
+ className: "nginx"
+ annotations: {}
+ # cert-manager.io/cluster-issuer: letsencrypt-prod
+ # external-dns.alpha.kubernetes.io/hostname: kokoro.example.com
+ # external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
+ hosts:
+ - host: kokoro.example.com
+ paths:
+ - path: /
+ pathType: Prefix
+
+ tls: []
+ # - secretName: kokoro-fastapi-tls
+ # hosts:
+ # - kokoro.example.com
+
+autoscaling:
+ enabled: false
+ minReplicas: 1
+ maxReplicas: 100
+ targetCPUUtilizationPercentage: 80
+ # targetMemoryUtilizationPercentage: 80
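+  # Example enabling autoscaling from the CLI (sketch):
+  #   helm upgrade --install kokoro ./charts/kokoro-fastapi \
+  #     --set autoscaling.enabled=true --set autoscaling.maxReplicas=4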
+
+nodeSelector: {}
+
+tolerations: []
+
+affinity: {}
diff --git a/docker-bake.hcl b/docker-bake.hcl
new file mode 100644
index 0000000000000000000000000000000000000000..e29599a0a9d48f7c095ae1194137679ef178e9a1
--- /dev/null
+++ b/docker-bake.hcl
@@ -0,0 +1,83 @@
+# Variables for reuse
+variable "VERSION" {
+ default = "latest"
+}
+
+variable "REGISTRY" {
+ default = "ghcr.io"
+}
+
+variable "OWNER" {
+ default = "remsky"
+}
+
+variable "REPO" {
+ default = "kokoro-fastapi"
+}
+
+variable "DOWNLOAD_MODEL" {
+ default = "true"
+}
+
+# Common settings shared between targets
+target "_common" {
+ context = "."
+ args = {
+ DEBIAN_FRONTEND = "noninteractive"
+ DOWNLOAD_MODEL = "${DOWNLOAD_MODEL}"
+ }
+}
+
+# Base settings for CPU builds
+target "_cpu_base" {
+ inherits = ["_common"]
+ dockerfile = "docker/cpu/Dockerfile"
+}
+
+# Base settings for GPU builds
+target "_gpu_base" {
+ inherits = ["_common"]
+ dockerfile = "docker/gpu/Dockerfile"
+}
+
+# CPU target with multi-platform support
+target "cpu" {
+ inherits = ["_cpu_base"]
+ platforms = ["linux/amd64", "linux/arm64"]
+ tags = [
+ "${REGISTRY}/${OWNER}/${REPO}-cpu:${VERSION}",
+ "${REGISTRY}/${OWNER}/${REPO}-cpu:latest"
+ ]
+}
+
+# GPU target with multi-platform support
+target "gpu" {
+ inherits = ["_gpu_base"]
+ platforms = ["linux/amd64", "linux/arm64"]
+ tags = [
+ "${REGISTRY}/${OWNER}/${REPO}-gpu:${VERSION}",
+ "${REGISTRY}/${OWNER}/${REPO}-gpu:latest"
+ ]
+}
+
+# Default group to build both CPU and GPU versions
+group "default" {
+ targets = ["cpu", "gpu"]
+}
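+
+# Example invocations (sketch):
+#   docker buildx bake                        # build the default group (cpu + gpu)
+#   VERSION=v0.3.0 docker buildx bake gpu     # build a single target with a version tag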
+
+# Development targets for faster local builds
+target "cpu-dev" {
+ inherits = ["_cpu_base"]
+ # No multi-platform for dev builds
+ tags = ["${REGISTRY}/${OWNER}/${REPO}-cpu:dev"]
+}
+
+target "gpu-dev" {
+ inherits = ["_gpu_base"]
+ # No multi-platform for dev builds
+ tags = ["${REGISTRY}/${OWNER}/${REPO}-gpu:dev"]
+}
+
+group "dev" {
+ targets = ["cpu-dev", "gpu-dev"]
+}
\ No newline at end of file
diff --git a/docker/build.sh b/docker/build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c00212718fc0d1ecc23545dde40701cf9c8da5ba
--- /dev/null
+++ b/docker/build.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+# Get version from argument or use default
+VERSION=${1:-"latest"}
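+# Example: ./docker/build.sh v0.3.0 (falls back to "latest" when omitted)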
+
+# Build both CPU and GPU images using docker buildx bake
+echo "Building CPU and GPU images..."
+VERSION=$VERSION docker buildx bake --push
+
+echo "Build complete!"
+echo "Created images with version: $VERSION"
diff --git a/docker/cpu/.dockerignore b/docker/cpu/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..df5f9db103950ad2818d74464bc90fc1dd4ff1c6
--- /dev/null
+++ b/docker/cpu/.dockerignore
@@ -0,0 +1,40 @@
+# Version control
+.git
+
+# Python
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.py[cod]
+*$py.class
+.pytest_cache
+.coverage
+.coveragerc
+
+# Environment
+# .env
+.venv
+env/
+venv/
+ENV/
+
+# IDE
+.idea
+.vscode
+*.swp
+*.swo
+
+# Project specific
+examples/
+Kokoro-82M/
+ui/
+tests/
+*.md
+*.txt
+!requirements.txt
+
+# Docker
+Dockerfile*
+docker-compose*
diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f52830790a51ab3aeff66613f58a478777b2d08e
--- /dev/null
+++ b/docker/cpu/Dockerfile
@@ -0,0 +1,66 @@
+FROM python:3.10-slim
+
+# Install dependencies and check espeak location
+RUN apt-get update && apt-get install -y \
+ espeak-ng \
+ espeak-ng-data \
+ git \
+ libsndfile1 \
+ curl \
+ ffmpeg \
+ g++ \
+&& apt-get clean \
+&& rm -rf /var/lib/apt/lists/* \
+&& mkdir -p /usr/share/espeak-ng-data \
+&& ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
+
+# Install UV using the installer script
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
+ mv /root/.local/bin/uv /usr/local/bin/ && \
+ mv /root/.local/bin/uvx /usr/local/bin/
+
+# Create non-root user and set up directories and permissions
+RUN useradd -m -u 1000 appuser && \
+ mkdir -p /app/api/src/models/v1_0 && \
+ chown -R appuser:appuser /app
+
+USER appuser
+WORKDIR /app
+
+# Copy dependency files
+COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
+
+# Install Rust (required to build sudachipy and pyopenjtalk-plus)
+RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+ENV PATH="/home/appuser/.cargo/bin:$PATH"
+
+# Install dependencies
+RUN --mount=type=cache,target=/root/.cache/uv \
+ uv venv --python 3.10 && \
+ uv sync --extra cpu
+
+# Copy project files including models
+COPY --chown=appuser:appuser api ./api
+COPY --chown=appuser:appuser web ./web
+COPY --chown=appuser:appuser docker/scripts/ ./
+RUN chmod +x ./entrypoint.sh
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+ PYTHONPATH=/app:/app/api \
+ PATH="/app/.venv/bin:$PATH" \
+ UV_LINK_MODE=copy \
+ USE_GPU=false \
+ PHONEMIZER_ESPEAK_PATH=/usr/bin \
+ PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+ ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
+
+ARG DOWNLOAD_MODEL=true
+ENV DOWNLOAD_MODEL=${DOWNLOAD_MODEL}
+# Download model if enabled
+RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
+ python download_model.py --output api/src/models/v1_0; \
+ fi
+
+ENV DEVICE="cpu"
+# Run FastAPI server through entrypoint.sh
+CMD ["./entrypoint.sh"]
diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8ca8821b6e87b6b31f148d455cce43c11dae51bd
--- /dev/null
+++ b/docker/cpu/docker-compose.yml
@@ -0,0 +1,37 @@
+name: kokoro-fastapi-cpu
+services:
+ kokoro-tts:
+ build:
+ context: ../..
+ dockerfile: docker/cpu/Dockerfile
+ volumes:
+ - ../../api:/app/api
+ ports:
+ - "8880:8880"
+ environment:
+ - PYTHONPATH=/app:/app/api
+ # ONNX Optimization Settings for vectorized operations
+ - ONNX_NUM_THREADS=8 # Maximize core usage for vectorized ops
+ - ONNX_INTER_OP_THREADS=4 # Higher inter-op for parallel matrix operations
+ - ONNX_EXECUTION_MODE=parallel
+ - ONNX_OPTIMIZATION_LEVEL=all
+ - ONNX_MEMORY_PATTERN=true
+ - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
+
+ # # Gradio UI service [Comment out everything below if you don't need it]
+ # gradio-ui:
+ # image: ghcr.io/remsky/kokoro-fastapi-ui:v${VERSION}
+ # # Uncomment below (and comment out above) to build from source instead of using the released image
+ # build:
+ # context: ../../ui
+ # ports:
+ # - "7860:7860"
+ # volumes:
+ # - ../../ui/data:/app/ui/data
+ # - ../../ui/app.py:/app/app.py # Mount app.py for hot reload
+ # environment:
+ # - GRADIO_WATCH=True # Enable hot reloading
+ # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+ # - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view
+ # - API_HOST=kokoro-tts # Set TTS service URL
+ # - API_PORT=8880 # Set TTS service PORT
diff --git a/docker/gpu/.dockerignore b/docker/gpu/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..8592ec7577ce05b1b71219254fa8001806d3c0b2
--- /dev/null
+++ b/docker/gpu/.dockerignore
@@ -0,0 +1,40 @@
+# Version control
+.git
+
+# Python
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.py[cod]
+*$py.class
+.pytest_cache
+.coverage
+.coveragerc
+
+# Environment
+# .env
+.venv*
+env/
+venv/
+ENV/
+
+# IDE
+.idea
+.vscode
+*.swp
+*.swo
+
+# Project specific
+examples/
+Kokoro-82M/
+ui/
+tests/
+*.md
+*.txt
+!requirements.txt
+
+# Docker
+Dockerfile*
+docker-compose*
diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..44c1ba7999a7973748e6c2d29b27f093e237d858
--- /dev/null
+++ b/docker/gpu/Dockerfile
@@ -0,0 +1,67 @@
+FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
+# Set non-interactive frontend
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN apt-get update && apt-get install -y \
+    python3 \
+ python3-venv \
+ espeak-ng \
+ espeak-ng-data \
+ git \
+ libsndfile1 \
+ curl \
+ ffmpeg \
+ g++ \
+ && apt-get clean && rm -rf /var/lib/apt/lists/* \
+ && mkdir -p /usr/share/espeak-ng-data \
+ && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
+
+# Install UV using the installer script
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
+ mv /root/.local/bin/uv /usr/local/bin/ && \
+ mv /root/.local/bin/uvx /usr/local/bin/
+
+# Create non-root user and set up directories and permissions
+RUN useradd -m -u 1001 appuser && \
+ mkdir -p /app/api/src/models/v1_0 && \
+ chown -R appuser:appuser /app
+
+USER appuser
+WORKDIR /app
+
+# Copy dependency files
+COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
+
+ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
+ PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+ ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
+
+# Install dependencies with GPU extras (using cache mounts)
+RUN --mount=type=cache,target=/root/.cache/uv \
+ uv venv --python 3.10 && \
+ uv sync --extra gpu
+
+# Copy project files including models
+COPY --chown=appuser:appuser api ./api
+COPY --chown=appuser:appuser web ./web
+COPY --chown=appuser:appuser docker/scripts/ ./
+RUN chmod +x ./entrypoint.sh
+
+
+# Set all environment variables in one go
+ENV PYTHONUNBUFFERED=1 \
+ PYTHONPATH=/app:/app/api \
+ PATH="/app/.venv/bin:$PATH" \
+ UV_LINK_MODE=copy \
+ USE_GPU=true
+
+ARG DOWNLOAD_MODEL=true
+ENV DOWNLOAD_MODEL=${DOWNLOAD_MODEL}
+# Download model if enabled
+RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
+ python download_model.py --output api/src/models/v1_0; \
+ fi
+
+ENV DEVICE="gpu"
+# Run FastAPI server through entrypoint.sh
+CMD ["./entrypoint.sh"]
diff --git a/docker/gpu/docker-compose.yml b/docker/gpu/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9faddd89889f569cd8e4927b600528e2057d0ed2
--- /dev/null
+++ b/docker/gpu/docker-compose.yml
@@ -0,0 +1,41 @@
+name: kokoro-tts-gpu
+services:
+ kokoro-tts:
+ # image: ghcr.io/remsky/kokoro-fastapi-gpu:v${VERSION}
+ build:
+ context: ../..
+ dockerfile: docker/gpu/Dockerfile
+ volumes:
+ - ../../api:/app/api
+ user: "1001:1001" # Ensure container runs as UID 1001 (appuser)
+ ports:
+ - "8880:8880"
+ environment:
+ - PYTHONPATH=/app:/app/api
+ - USE_GPU=true
+ - PYTHONUNBUFFERED=1
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [gpu]
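+      # Assumes the NVIDIA Container Toolkit is installed on the host; start with:
+      #   docker compose -f docker/gpu/docker-compose.yml up --build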
+
+ # # Gradio UI service
+ # gradio-ui:
+ # image: ghcr.io/remsky/kokoro-fastapi-ui:v${VERSION}
+ # # Uncomment below to build from source instead of using the released image
+ # # build:
+ # # context: ../../ui
+ # ports:
+ # - "7860:7860"
+ # volumes:
+ # - ../../ui/data:/app/ui/data
+ # - ../../ui/app.py:/app/app.py # Mount app.py for hot reload
+ # environment:
+ # - GRADIO_WATCH=1 # Enable hot reloading
+ # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+ # - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view
+ # - API_HOST=kokoro-tts # Set TTS service URL
+ # - API_PORT=8880 # Set TTS service PORT
diff --git a/docker/scripts/download_model.py b/docker/scripts/download_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..67a9409cf6a877cde2544b7954703bf6ce11dd71
--- /dev/null
+++ b/docker/scripts/download_model.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Download and prepare Kokoro v1.0 model."""
+
+import json
+import os
+from pathlib import Path
+from urllib.request import urlretrieve
+
+from loguru import logger
+
+
+def verify_files(model_path: str, config_path: str) -> bool:
+ """Verify that model files exist and are valid.
+
+ Args:
+ model_path: Path to model file
+ config_path: Path to config file
+
+ Returns:
+ True if files exist and are valid
+ """
+ try:
+ # Check files exist
+ if not os.path.exists(model_path):
+ return False
+ if not os.path.exists(config_path):
+ return False
+
+        # Verify config file is valid JSON (raises on parse failure)
+        with open(config_path) as f:
+            json.load(f)
+
+ # Check model file size (should be non-zero)
+ if os.path.getsize(model_path) == 0:
+ return False
+
+ return True
+ except Exception:
+ return False
+
+
+def download_model(output_dir: str) -> None:
+ """Download model files from GitHub release.
+
+ Args:
+ output_dir: Directory to save model files
+ """
+ try:
+ # Create output directory
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Define file paths
+ model_file = "kokoro-v1_0.pth"
+ config_file = "config.json"
+ model_path = os.path.join(output_dir, model_file)
+ config_path = os.path.join(output_dir, config_file)
+
+ # Check if files already exist and are valid
+ if verify_files(model_path, config_path):
+ logger.info("Model files already exist and are valid")
+ return
+
+ logger.info("Downloading Kokoro v1.0 model files")
+
+ # GitHub release URLs (to be updated with v0.2.0 release)
+ base_url = "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.4"
+ model_url = f"{base_url}/{model_file}"
+ config_url = f"{base_url}/{config_file}"
+
+ # Download files
+ logger.info("Downloading model file...")
+ urlretrieve(model_url, model_path)
+
+ logger.info("Downloading config file...")
+ urlretrieve(config_url, config_path)
+
+ # Verify downloaded files
+ if not verify_files(model_path, config_path):
+ raise RuntimeError("Failed to verify downloaded files")
+
+ logger.info(f"✓ Model files prepared in {output_dir}")
+
+ except Exception as e:
+ logger.error(f"Failed to download model: {e}")
+ raise
+
+
+def main():
+ """Main entry point."""
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Download Kokoro v1.0 model")
+ parser.add_argument(
+ "--output", required=True, help="Output directory for model files"
+ )
+
+ args = parser.parse_args()
+ download_model(args.output)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docker/scripts/download_model.sh b/docker/scripts/download_model.sh
new file mode 100644
index 0000000000000000000000000000000000000000..22166e375e4164b71a83ffc6b133b5a2fdbd1840
--- /dev/null
+++ b/docker/scripts/download_model.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
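+# Fetches the Kokoro v1.0 model files into api/src/models/v1_0, locating the
+# project root by walking up from the current directory. Requires curl; jq is
+# used to validate config.json when available.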
+
+# Find project root by looking for api directory
+find_project_root() {
+ local current_dir="$PWD"
+ local max_steps=5
+ local steps=0
+
+ while [ $steps -lt $max_steps ]; do
+ if [ -d "$current_dir/api" ]; then
+ echo "$current_dir"
+ return 0
+ fi
+ current_dir="$(dirname "$current_dir")"
+ ((steps++))
+ done
+
+ echo "Error: Could not find project root (no api directory found)" >&2
+ exit 1
+}
+
+# Function to verify files exist and are valid
+verify_files() {
+ local model_path="$1"
+ local config_path="$2"
+
+ # Check files exist
+ if [ ! -f "$model_path" ] || [ ! -f "$config_path" ]; then
+ return 1
+ fi
+
+ # Check files are not empty
+ if [ ! -s "$model_path" ] || [ ! -s "$config_path" ]; then
+ return 1
+ fi
+
+ # Validate config.json with jq when it is available
+ if command -v jq >/dev/null 2>&1 && ! jq . "$config_path" >/dev/null 2>&1; then
+ return 1
+ fi
+
+ return 0
+}
+
+# Function to download a file
+download_file() {
+ local url="$1"
+ local output_path="$2"
+ local filename=$(basename "$output_path")
+
+ echo "Downloading $filename..."
+ mkdir -p "$(dirname "$output_path")"
+ if curl -fL "$url" -o "$output_path"; then
+ echo "Successfully downloaded $filename"
+ return 0
+ else
+ echo "Error downloading $filename" >&2
+ return 1
+ fi
+}
+
+# Find project root and ensure models directory exists
+PROJECT_ROOT=$(find_project_root)
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+MODEL_DIR="$PROJECT_ROOT/api/src/models/v1_0"
+echo "Model directory: $MODEL_DIR"
+mkdir -p "$MODEL_DIR"
+
+# Define file paths
+MODEL_FILE="kokoro-v1_0.pth"
+CONFIG_FILE="config.json"
+MODEL_PATH="$MODEL_DIR/$MODEL_FILE"
+CONFIG_PATH="$MODEL_DIR/$CONFIG_FILE"
+
+# Check if files already exist and are valid
+if verify_files "$MODEL_PATH" "$CONFIG_PATH"; then
+ echo "Model files already exist and are valid"
+ exit 0
+fi
+
+# Define URLs
+BASE_URL="https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.4"
+MODEL_URL="$BASE_URL/$MODEL_FILE"
+CONFIG_URL="$BASE_URL/$CONFIG_FILE"
+
+# Download files
+success=true
+
+if ! download_file "$MODEL_URL" "$MODEL_PATH"; then
+ success=false
+fi
+
+if ! download_file "$CONFIG_URL" "$CONFIG_PATH"; then
+ success=false
+fi
+
+# Verify downloaded files
+if [ "$success" = true ] && verify_files "$MODEL_PATH" "$CONFIG_PATH"; then
+ echo "✓ Model files prepared in $MODEL_DIR"
+ exit 0
+else
+ echo "Failed to download or verify model files" >&2
+ exit 1
+fi
\ No newline at end of file
diff --git a/docker/scripts/entrypoint.sh b/docker/scripts/entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a5784951bf7ffa785d7e7dce6fc4b437f0494a6b
--- /dev/null
+++ b/docker/scripts/entrypoint.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+if [ "$DOWNLOAD_MODEL" = "true" ]; then
+ python download_model.py --output api/src/models/v1_0
+fi
+
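+# Start the API with the torch extra matching DEVICE ("cpu" or "gpu")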
+exec uv run --extra "$DEVICE" --no-sync python -m uvicorn api.src.main:app --host 0.0.0.0 --port 8880 --log-level debug
\ No newline at end of file
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/examples/assorted_checks/__init__.py b/examples/assorted_checks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/examples/assorted_checks/benchmarks/__init__.py b/examples/assorted_checks/benchmarks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/examples/assorted_checks/benchmarks/benchmark_first_token.py b/examples/assorted_checks/benchmarks/benchmark_first_token.py
new file mode 100644
index 0000000000000000000000000000000000000000..16f94c032cbe5c3c485d3019e1643b6078c1a965
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/benchmark_first_token.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+import os
+import time
+
+import pandas as pd
+import requests
+from lib.shared_utils import save_json_results
+from lib.shared_plotting import plot_timeline, plot_correlation
+from lib.shared_benchmark_utils import enc, get_text_for_tokens
+
+
+def measure_first_token(
+ text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
+ """Measure time to audio via API calls and save the audio output"""
+ results = {
+ "text_length": len(text),
+ "token_count": len(enc.encode(text)),
+ "total_time": None,
+ "time_to_first_chunk": None,
+ "error": None,
+ "audio_path": None,
+ "audio_length": None, # Length of output audio in seconds
+ }
+
+ try:
+ start_time = time.time()
+
+ # Make request without streaming
+ response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart",
+ "response_format": "wav",
+ "stream": False,
+ },
+ timeout=1800,
+ )
+ response.raise_for_status()
+
+ # Save complete audio
+ audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
+ audio_path = os.path.join(output_dir, audio_filename)
+ results["audio_path"] = audio_path
+
+ content = response.content
+ with open(audio_path, "wb") as f:
+ f.write(content)
+
+ # Calculate audio length using scipy
+ import scipy.io.wavfile as wavfile
+
+ sample_rate, audio_data = wavfile.read(audio_path)
+ results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
+ results["time_to_first_chunk"] = time.time() - start_time
+
+ results["total_time"] = time.time() - start_time
+ return results
+
+ except Exception as e:
+ results["error"] = str(e)
+ return results
+
+
+def main():
+ # Set up paths
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ output_dir = os.path.join(script_dir, "output_audio")
+ output_data_dir = os.path.join(script_dir, "output_data")
+
+ # Create output directories
+ os.makedirs(output_dir, exist_ok=True)
+ os.makedirs(output_data_dir, exist_ok=True)
+
+ # Load sample text
+ with open(
+ os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+ ) as f:
+ text = f.read()
+
+ # Test specific token counts
+ token_sizes = [10, 25, 50, 100, 200, 500]
+ all_results = []
+
+ for tokens in token_sizes:
+ print(f"\nTesting {tokens} tokens")
+ test_text = get_text_for_tokens(text, tokens)
+ actual_tokens = len(enc.encode(test_text))
+ print(f"Text preview: {test_text[:50]}...")
+
+ # Run the test 5 times for each size to get an average
+ for i in range(5):
+ print(f"Run {i+1}/5...")
+ result = measure_first_token(test_text, output_dir, tokens, i + 1)
+ result["target_tokens"] = tokens
+ result["actual_tokens"] = actual_tokens
+ result["run_number"] = i + 1
+
+ print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
+ print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")
+
+ if result["error"]:
+ print(f"Error: {result['error']}")
+
+ all_results.append(result)
+
+ # Calculate averages per token size
+ summary = {}
+ for tokens in token_sizes:
+ matching_results = [
+ r for r in all_results if r["target_tokens"] == tokens and not r["error"]
+ ]
+ if matching_results:
+ avg_first_chunk = sum(
+ r["time_to_first_chunk"] for r in matching_results
+ ) / len(matching_results)
+ avg_total = sum(r["total_time"] for r in matching_results) / len(
+ matching_results
+ )
+ avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
+ matching_results
+ )
+ summary[tokens] = {
+ "avg_time_to_first_chunk": round(avg_first_chunk, 3),
+ "avg_total_time": round(avg_total, 3),
+ "avg_audio_length": round(avg_audio_length, 3),
+ "num_successful_runs": len(matching_results),
+ }
+
+ # Save results
+ results_data = {
+ "individual_runs": all_results,
+ "summary": summary,
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ save_json_results(
+ results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
+ )
+
+ # Create plot directory if it doesn't exist
+ output_plots_dir = os.path.join(script_dir, "output_plots")
+ os.makedirs(output_plots_dir, exist_ok=True)
+
+ # Create DataFrame for plotting
+ df = pd.DataFrame(all_results)
+
+ # Create both plots
+ plot_correlation(
+ df,
+ "target_tokens",
+ "time_to_first_chunk",
+ "Time to Audio vs Input Size",
+ "Number of Input Tokens",
+ "Time to Audio (seconds)",
+ os.path.join(output_plots_dir, "first_token_latency.png"),
+ )
+
+ plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))
+
+ print("\nResults and plots saved to:")
+ print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
+ print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
+ print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/assorted_checks/benchmarks/benchmark_first_token_stream_unified.py b/examples/assorted_checks/benchmarks/benchmark_first_token_stream_unified.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3b6d9228b08717a115e59b45f669e955d74ba07
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/benchmark_first_token_stream_unified.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+import os
+import time
+
+import requests
+from openai import OpenAI
+from lib.stream_utils import run_benchmark
+
+OPENAI_CLIENT = OpenAI(
+ base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
+)
+
+
+def measure_first_token_requests(
+ text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
+ """Measure time to audio via direct API calls and save the audio output"""
+ results = {
+ "text_length": len(text),
+ "token_count": None, # Will be set by run_benchmark
+ "total_time": None,
+ "time_to_first_chunk": None,
+ "error": None,
+ "audio_path": None,
+ "audio_length": None,
+ }
+
+ try:
+ start_time = time.time()
+
+ # Make request with streaming enabled
+ response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart",
+ "response_format": "pcm",
+ "stream": True,
+ },
+ stream=True,
+ timeout=1800,
+ )
+ response.raise_for_status()
+
+ # Save complete audio
+ audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
+ audio_path = os.path.join(output_dir, audio_filename)
+ results["audio_path"] = audio_path
+
+ first_chunk_time = None
+ chunks = []
+ for chunk in response.iter_content(chunk_size=1024):
+ if chunk:
+ if first_chunk_time is None:
+ first_chunk_time = time.time()
+ results["time_to_first_chunk"] = first_chunk_time - start_time
+ chunks.append(chunk)
+
+ # Concatenate all PCM chunks
+ if not chunks:
+ raise ValueError("No audio chunks received")
+
+ all_audio_data = b"".join(chunks)
+
+ # Write as WAV file
+ import wave
+
+ with wave.open(audio_path, "wb") as wav_file:
+ wav_file.setnchannels(1) # Mono
+ wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
+ wav_file.setframerate(24000) # Known sample rate for Kokoro
+ wav_file.writeframes(all_audio_data)
+
+ # Calculate audio length using scipy
+ import scipy.io.wavfile as wavfile
+
+ sample_rate, audio_data = wavfile.read(audio_path)
+ results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
+
+ results["total_time"] = time.time() - start_time
+
+ # Print debug info
+ print(f"Complete audio size: {len(all_audio_data)} bytes")
+ print(f"Number of chunks received: {len(chunks)}")
+ print(f"Audio length: {results['audio_length']:.3f}s")
+
+ return results
+
+ except Exception as e:
+ results["error"] = str(e)
+ return results
+
+
+def measure_first_token_openai(
+ text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
+ """Measure time to audio via OpenAI API calls and save the audio output"""
+ results = {
+ "text_length": len(text),
+ "token_count": None, # Will be set by run_benchmark
+ "total_time": None,
+ "time_to_first_chunk": None,
+ "error": None,
+ "audio_path": None,
+ "audio_length": None,
+ }
+
+ try:
+ start_time = time.time()
+
+ # Save complete audio
+ audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
+ audio_path = os.path.join(output_dir, audio_filename)
+ results["audio_path"] = audio_path
+
+ first_chunk_time = None
+ all_audio_data = bytearray()
+ chunk_count = 0
+
+ # Make streaming request using OpenAI client
+ with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_heart",
+ response_format="pcm",
+ input=text,
+ ) as response:
+ for chunk in response.iter_bytes(chunk_size=1024):
+ if chunk:
+ chunk_count += 1
+ if first_chunk_time is None:
+ first_chunk_time = time.time()
+ results["time_to_first_chunk"] = first_chunk_time - start_time
+ all_audio_data.extend(chunk)
+
+ # Write as WAV file
+ import wave
+
+ with wave.open(audio_path, "wb") as wav_file:
+ wav_file.setnchannels(1) # Mono
+ wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
+ wav_file.setframerate(24000) # Known sample rate for Kokoro
+ wav_file.writeframes(all_audio_data)
+
+ # Calculate audio length using scipy
+ import scipy.io.wavfile as wavfile
+
+ sample_rate, audio_data = wavfile.read(audio_path)
+ results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
+
+ results["total_time"] = time.time() - start_time
+
+ # Print debug info
+ print(f"Complete audio size: {len(all_audio_data)} bytes")
+ print(f"Number of chunks received: {chunk_count}")
+ print(f"Audio length: {results['audio_length']:.3f}s")
+
+ return results
+
+ except Exception as e:
+ results["error"] = str(e)
+ return results
+
+
+def main():
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ prefix = "cpu"
+ # Run requests benchmark
+ print("\n=== Running Direct Requests Benchmark ===")
+ run_benchmark(
+ measure_first_token_requests,
+ output_dir=os.path.join(script_dir, "output_audio_stream"),
+ output_data_dir=os.path.join(script_dir, "output_data"),
+ output_plots_dir=os.path.join(script_dir, "output_plots"),
+ suffix="_stream",
+ plot_title_suffix="(Streaming)",
+ prefix=prefix,
+ )
+ # Run OpenAI benchmark
+ print("\n=== Running OpenAI Library Benchmark ===")
+ run_benchmark(
+ measure_first_token_openai,
+ output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
+ output_data_dir=os.path.join(script_dir, "output_data"),
+ output_plots_dir=os.path.join(script_dir, "output_plots"),
+ suffix="_stream_openai",
+ plot_title_suffix="(OpenAI Streaming)",
+ prefix=prefix,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/assorted_checks/benchmarks/benchmark_tts_rtf.py b/examples/assorted_checks/benchmarks/benchmark_tts_rtf.py
new file mode 100644
index 0000000000000000000000000000000000000000..165da42ace876100653617bab27b8c26ba6f9dbc
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/benchmark_tts_rtf.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+import os
+import time
+import queue
+import threading
+
+import pandas as pd
+from lib.shared_utils import (
+ real_time_factor,
+ save_json_results,
+ get_system_metrics,
+ write_benchmark_stats,
+)
+from lib.shared_plotting import plot_correlation, plot_system_metrics
+from lib.shared_benchmark_utils import (
+ enc,
+ make_tts_request,
+ get_text_for_tokens,
+ generate_token_sizes,
+)
+
+
+class SystemMonitor:
+ def __init__(self, interval=1.0):
+ """Rough system tracker: Not always accurate"""
+ self.interval = interval
+ self.metrics_queue = queue.Queue()
+ self.stop_event = threading.Event()
+ self.metrics_timeline = []
+ self.start_time = None
+
+ def _monitor_loop(self):
+ """Background thread function to collect system metrics."""
+ while not self.stop_event.is_set():
+ metrics = get_system_metrics()
+ metrics["relative_time"] = time.time() - self.start_time
+ self.metrics_queue.put(metrics)
+ time.sleep(self.interval)
+
+ def start(self):
+ """Start the monitoring thread."""
+ self.start_time = time.time()
+ self.monitor_thread = threading.Thread(target=self._monitor_loop)
+ self.monitor_thread.daemon = True
+ self.monitor_thread.start()
+
+ def stop(self):
+ """Stop the monitoring thread and collect final metrics."""
+ self.stop_event.set()
+ if hasattr(self, "monitor_thread"):
+ self.monitor_thread.join(timeout=2)
+
+ # Collect all metrics from queue
+ while True:
+ try:
+ metrics = self.metrics_queue.get_nowait()
+ self.metrics_timeline.append(metrics)
+ except queue.Empty:
+ break
+
+ return self.metrics_timeline
+
+
+def main():
+ # Initialize system monitor
+ monitor = SystemMonitor(interval=1.0) # 1 second interval
+ # Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
+ prefix = "gpu"
+ # Generate token sizes
+ if "gpu" in prefix:
+ token_sizes = generate_token_sizes(
+ max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
+ )
+ elif "cpu" in prefix:
+ token_sizes = generate_token_sizes(
+ max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
+ )
+ else:
+ token_sizes = generate_token_sizes(max_tokens=3000)
+
+ # Set up paths relative to this file
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ output_dir = os.path.join(script_dir, "output_audio")
+ output_data_dir = os.path.join(script_dir, "output_data")
+ output_plots_dir = os.path.join(script_dir, "output_plots")
+
+ # Create output directories
+ os.makedirs(output_dir, exist_ok=True)
+ os.makedirs(output_data_dir, exist_ok=True)
+ os.makedirs(output_plots_dir, exist_ok=True)
+
+ # Function to prefix filenames
+ def prefix_path(path: str, filename: str) -> str:
+ if prefix:
+ filename = f"{prefix}_{filename}"
+ return os.path.join(path, filename)
+
+ with open(
+ os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+ ) as f:
+ text = f.read()
+
+ total_tokens = len(enc.encode(text))
+ print(f"Total tokens in file: {total_tokens}")
+
+ print(f"Testing sizes: {token_sizes}")
+
+ results = []
+ test_start_time = time.time()
+
+ # Start system monitoring
+ monitor.start()
+
+ for num_tokens in token_sizes:
+ chunk = get_text_for_tokens(text, num_tokens)
+ actual_tokens = len(enc.encode(chunk))
+
+ print(f"\nProcessing chunk with {actual_tokens} tokens:")
+ print(f"Text preview: {chunk[:100]}...")
+
+ processing_time, audio_length = make_tts_request(
+ chunk,
+ output_dir=output_dir,
+ prefix=prefix,
+ stream=False, # Use non-streaming mode for RTF benchmarking
+ )
+ if processing_time is None or audio_length is None:
+ print("Breaking loop due to error")
+ break
+
+ # Calculate RTF using the correct formula
+ rtf = real_time_factor(processing_time, audio_length)
+ print(f"Real-Time Factor: {rtf:.5f}")
+
+ results.append(
+ {
+ "tokens": actual_tokens,
+ "processing_time": processing_time,
+ "output_length": audio_length,
+ "rtf": rtf,
+ "elapsed_time": round(time.time() - test_start_time, 5),
+ }
+ )
+
+ df = pd.DataFrame(results)
+ if df.empty:
+ print("No data to plot")
+ return
+
+ df["tokens_per_second"] = df["tokens"] / df["processing_time"]
+
+ # Write benchmark stats
+ stats = [
+ {
+ "title": "Benchmark Statistics (with correct RTF)",
+ "stats": {
+ "Total tokens processed": df["tokens"].sum(),
+ "Total audio generated (s)": df["output_length"].sum(),
+ "Total test duration (s)": df["elapsed_time"].max(),
+ "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
+ "Average RTF": df["rtf"].mean(),
+ "Average Real Time Speed": 1 / df["rtf"].mean(),
+ },
+ },
+ {
+ "title": "Per-chunk Stats",
+ "stats": {
+ "Average chunk size (tokens)": df["tokens"].mean(),
+ "Min chunk size (tokens)": df["tokens"].min(),
+ "Max chunk size (tokens)": df["tokens"].max(),
+ "Average processing time (s)": df["processing_time"].mean(),
+ "Average output length (s)": df["output_length"].mean(),
+ },
+ },
+ {
+ "title": "Performance Ranges",
+ "stats": {
+ "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
+ "RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
+ "Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
+ },
+ },
+ ]
+ write_benchmark_stats(
+ stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
+ )
+
+ # Plot Processing Time vs Token Count
+ plot_correlation(
+ df,
+ "tokens",
+ "processing_time",
+ "Processing Time vs Input Size",
+ "Number of Input Tokens",
+ "Processing Time (seconds)",
+ prefix_path(output_plots_dir, "processing_time_rtf.png"),
+ )
+
+ # Plot RTF vs Token Count
+ plot_correlation(
+ df,
+ "tokens",
+ "rtf",
+ "Real-Time Factor vs Input Size",
+ "Number of Input Tokens",
+ "Real-Time Factor (processing time / audio length)",
+ prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
+ )
+
+ # Stop monitoring and get final metrics
+ final_metrics = monitor.stop()
+
+ # Convert metrics timeline to DataFrame for stats
+ metrics_df = pd.DataFrame(final_metrics)
+
+ # Add system usage stats
+ if not metrics_df.empty:
+ stats.append(
+ {
+ "title": "System Usage Statistics",
+ "stats": {
+ "Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
+ "Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
+ "Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
+ "Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
+ "Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
+ "Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
+ },
+ }
+ )
+ if "gpu_memory_used" in metrics_df:
+ stats[-1]["stats"].update(
+ {
+ "Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
+ "Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
+ }
+ )
+
+ # Plot system metrics
+ plot_system_metrics(
+ final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
+ )
+
+ # Save final results
+ save_json_results(
+ {
+ "results": results,
+ "system_metrics": final_metrics,
+ "test_duration": time.time() - test_start_time,
+ },
+ prefix_path(output_data_dir, "benchmark_results_rtf.json"),
+ )
+
+ print("\nResults saved to:")
+ print(f"- {prefix_path(output_data_dir, 'benchmark_results_rtf.json')}")
+ print(f"- {prefix_path(output_data_dir, 'benchmark_stats_rtf.txt')}")
+ print(f"- {prefix_path(output_plots_dir, 'processing_time_rtf.png')}")
+ print(f"- {prefix_path(output_plots_dir, 'realtime_factor_rtf.png')}")
+ print(f"- {prefix_path(output_plots_dir, 'system_usage_rtf.png')}")
+ print(f"\nAudio files saved in {output_dir} with prefix: {prefix or '(none)'}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/assorted_checks/benchmarks/depr_benchmark_tts.py b/examples/assorted_checks/benchmarks/depr_benchmark_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fd600e61a4cf6b2194e3322d3bd4d9c33bf2dfc
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/depr_benchmark_tts.py
@@ -0,0 +1,181 @@
+import os
+import json
+import time
+
+import pandas as pd
+
+from examples.assorted_checks.benchmarks.lib.shared_utils import (
+ save_json_results,
+ get_system_metrics,
+ write_benchmark_stats,
+)
+from examples.assorted_checks.benchmarks.lib.shared_plotting import (
+ plot_correlation,
+ plot_system_metrics,
+)
+from examples.assorted_checks.benchmarks.lib.shared_benchmark_utils import (
+ enc,
+ make_tts_request,
+ get_text_for_tokens,
+ generate_token_sizes,
+)
+
+
+def main():
+ # Get optional prefix from first command line argument
+ import sys
+
+ prefix = sys.argv[1] if len(sys.argv) > 1 else ""
+
+ # Set up paths relative to this file
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ output_dir = os.path.join(script_dir, "output_audio")
+ output_data_dir = os.path.join(script_dir, "output_data")
+ output_plots_dir = os.path.join(script_dir, "output_plots")
+
+ # Create output directories
+ os.makedirs(output_dir, exist_ok=True)
+ os.makedirs(output_data_dir, exist_ok=True)
+ os.makedirs(output_plots_dir, exist_ok=True)
+
+ # Function to prefix filenames
+ def prefix_path(path: str, filename: str) -> str:
+ if prefix:
+ filename = f"{prefix}_{filename}"
+ return os.path.join(path, filename)
+
+ # Read input text
+ with open(
+ os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+ ) as f:
+ text = f.read()
+
+ # Get total tokens in file
+ total_tokens = len(enc.encode(text))
+ print(f"Total tokens in file: {total_tokens}")
+
+ token_sizes = generate_token_sizes(total_tokens)
+
+ print(f"Testing sizes: {token_sizes}")
+
+ # Process chunks
+ results = []
+ system_metrics = []
+ test_start_time = time.time()
+
+ for num_tokens in token_sizes:
+ # Get text slice with exact token count
+ chunk = get_text_for_tokens(text, num_tokens)
+ actual_tokens = len(enc.encode(chunk))
+
+ print(f"\nProcessing chunk with {actual_tokens} tokens:")
+ print(f"Text preview: {chunk[:100]}...")
+
+ # Collect system metrics before processing
+ system_metrics.append(get_system_metrics())
+
+ processing_time, audio_length = make_tts_request(chunk)
+ if processing_time is None or audio_length is None:
+ print("Breaking loop due to error")
+ break
+
+ # Collect system metrics after processing
+ system_metrics.append(get_system_metrics())
+
+ results.append(
+ {
+ "tokens": actual_tokens,
+ "processing_time": processing_time,
+ "output_length": audio_length,
+ "realtime_factor": audio_length / processing_time,
+ "elapsed_time": time.time() - test_start_time,
+ }
+ )
+
+ # Save intermediate results
+ save_json_results(
+ {"results": results, "system_metrics": system_metrics},
+ prefix_path(output_data_dir, "benchmark_results.json"),
+ )
+
+ # Create DataFrame and calculate stats
+ df = pd.DataFrame(results)
+ if df.empty:
+ print("No data to plot")
+ return
+
+ # Calculate useful metrics
+ df["tokens_per_second"] = df["tokens"] / df["processing_time"]
+
+ # Write benchmark stats
+ stats = [
+ {
+ "title": "Benchmark Statistics",
+ "stats": {
+ "Total tokens processed": df["tokens"].sum(),
+ "Total audio generated (s)": df["output_length"].sum(),
+ "Total test duration (s)": df["elapsed_time"].max(),
+ "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
+ "Average realtime factor": df["realtime_factor"].mean(),
+ },
+ },
+ {
+ "title": "Per-chunk Stats",
+ "stats": {
+ "Average chunk size (tokens)": df["tokens"].mean(),
+ "Min chunk size (tokens)": df["tokens"].min(),
+ "Max chunk size (tokens)": df["tokens"].max(),
+ "Average processing time (s)": df["processing_time"].mean(),
+ "Average output length (s)": df["output_length"].mean(),
+ },
+ },
+ {
+ "title": "Performance Ranges",
+ "stats": {
+ "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
+ "Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
+ },
+ },
+ ]
+ write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))
+
+ # Plot Processing Time vs Token Count
+ plot_correlation(
+ df,
+ "tokens",
+ "processing_time",
+ "Processing Time vs Input Size",
+ "Number of Input Tokens",
+ "Processing Time (seconds)",
+ prefix_path(output_plots_dir, "processing_time.png"),
+ )
+
+ # Plot Realtime Factor vs Token Count
+ plot_correlation(
+ df,
+ "tokens",
+ "realtime_factor",
+ "Realtime Factor vs Input Size",
+ "Number of Input Tokens",
+ "Realtime Factor (output length / processing time)",
+ prefix_path(output_plots_dir, "realtime_factor.png"),
+ )
+
+ # Plot system metrics
+ plot_system_metrics(
+ system_metrics, prefix_path(output_plots_dir, "system_usage.png")
+ )
+
+ print("\nResults saved to:")
+ print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")
+ print(f"- {prefix_path(output_data_dir, 'benchmark_stats.txt')}")
+ print(f"- {prefix_path(output_plots_dir, 'processing_time.png')}")
+ print(f"- {prefix_path(output_plots_dir, 'realtime_factor.png')}")
+ print(f"- {prefix_path(output_plots_dir, 'system_usage.png')}")
+ if any("gpu_memory_used" in m for m in system_metrics):
+ print(f"- {prefix_path(output_plots_dir, 'gpu_usage.png')}")
+ print(f"\nAudio files saved in {output_dir} with prefix: {prefix or '(none)'}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/assorted_checks/benchmarks/lib/__init__.py b/examples/assorted_checks/benchmarks/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/examples/assorted_checks/benchmarks/lib/shared_benchmark_utils.py b/examples/assorted_checks/benchmarks/lib/shared_benchmark_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e9657e404612a19c12418297d088cb50cd002ed
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/lib/shared_benchmark_utils.py
@@ -0,0 +1,137 @@
+"""Shared utilities specific to TTS benchmarking."""
+
+import time
+from typing import List, Tuple, Optional
+
+import requests
+import tiktoken
+
+from .shared_utils import save_audio_file, get_audio_length
+
+# Global tokenizer instance
+enc = tiktoken.get_encoding("cl100k_base")
+
+
+def get_text_for_tokens(text: str, num_tokens: int) -> str:
+ """Get a slice of text that contains exactly num_tokens tokens.
+
+ Args:
+ text: Input text to slice
+ num_tokens: Desired number of tokens
+
+ Returns:
+ str: Text slice containing exactly num_tokens tokens
+ """
+ tokens = enc.encode(text)
+ if num_tokens > len(tokens):
+ return text
+ return enc.decode(tokens[:num_tokens])
+
+
+def make_tts_request(
+ text: str,
+ output_dir: str = None,
+ timeout: int = 1800,
+ prefix: str = "",
+ stream: bool = True,
+) -> Tuple[Optional[float], Optional[float]]:
+ """Make TTS request using OpenAI-compatible endpoint.
+
+ Args:
+ text: Input text to convert to speech
+ output_dir: Directory to save audio files. If None, audio won't be saved.
+ timeout: Request timeout in seconds
+ prefix: Optional prefix for output filenames
+ stream: If True, use the streaming endpoint and collect chunks
+
+ Returns:
+ tuple: (processing_time, audio_length) in seconds, or (None, None) on error
+ """
+ try:
+ start_time = time.time()
+ if stream:
+ # For streaming, we need to collect all chunks
+ audio_chunks = []
+ response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart",
+ "response_format": "wav",
+ "stream": True,
+ },
+ timeout=timeout,
+ stream=True,
+ )
+ response.raise_for_status()
+
+ for chunk in response.iter_content(chunk_size=8192):
+ if chunk:
+ audio_chunks.append(chunk)
+
+ # Combine all chunks
+ audio_data = b"".join(audio_chunks)
+ else:
+ response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart",
+ "response_format": "wav",
+ "stream": False,
+ },
+ timeout=timeout,
+ )
+ response.raise_for_status()
+ audio_data = response.content
+
+ processing_time = round(time.time() - start_time, 2)
+ # Calculate audio length from audio data
+ audio_length = get_audio_length(audio_data)
+
+ # Save the audio file if output_dir is provided
+ if output_dir:
+ token_count = len(enc.encode(text))
+ output_file = save_audio_file(
+ audio_data, f"chunk_{token_count}_tokens", output_dir
+ )
+ print(f"Saved audio to {output_file}")
+
+ return processing_time, audio_length
+
+ except requests.exceptions.RequestException as e:
+ print(f"Error making request for text: {text[:50]}... Error: {str(e)}")
+ return None, None
+ except Exception as e:
+ print(f"Error processing text: {text[:50]}... Error: {str(e)}")
+ return None, None
+
+
+def generate_token_sizes(
+ max_tokens: int,
+ dense_step: int = 100,
+ dense_max: int = 1000,
+ sparse_step: int = 1000,
+) -> List[int]:
+ """Generate token size ranges with dense sampling at start.
+
+ Args:
+ max_tokens: Maximum number of tokens to generate sizes up to
+ dense_step: Step size for dense sampling range
+ dense_max: Maximum value for dense sampling
+ sparse_step: Step size for sparse sampling range
+
+ Returns:
+ list: Sorted list of token sizes
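+
+ Example:
+ >>> generate_token_sizes(max_tokens=1500, dense_step=100,
+ ... dense_max=300, sparse_step=500)
+ [100, 200, 300, 800, 1300]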
+ """
+ # Dense sampling at start
+ dense_range = list(range(dense_step, dense_max + 1, dense_step))
+
+ if max_tokens <= dense_max or sparse_step < dense_max:
+ return sorted(dense_range)
+ # Sparse sampling for larger sizes
+ sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
+
+ # Combine and deduplicate
+ return sorted(list(set(dense_range + sparse_range)))
diff --git a/examples/assorted_checks/benchmarks/lib/shared_plotting.py b/examples/assorted_checks/benchmarks/lib/shared_plotting.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ca48721f1feb05d4b1774df32874e70a3ef4a14
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/lib/shared_plotting.py
@@ -0,0 +1,409 @@
+"""Shared plotting utilities for benchmarks and tests."""
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+
+# Common style configurations
+STYLE_CONFIG = {
+ "background_color": "#1a1a2e",
+ "primary_color": "#ff2a6d",
+ "secondary_color": "#05d9e8",
+ "grid_color": "#ffffff",
+ "text_color": "#ffffff",
+ "font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
+}
+
+
+def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
+ """Configure plot styling with consistent theme.
+
+ Args:
+ fig: matplotlib figure object
+ ax: matplotlib axis object
+ title: str, plot title
+ xlabel: str, optional x-axis label
+ ylabel: str, optional y-axis label
+
+ Returns:
+ tuple: (fig, ax) with applied styling
+ """
+ # Grid styling
+ ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])
+
+ # Title and labels
+ ax.set_title(
+ title,
+ pad=20,
+ fontsize=STYLE_CONFIG["font_sizes"]["title"],
+ fontweight="bold",
+ color=STYLE_CONFIG["text_color"],
+ )
+
+ if xlabel:
+ ax.set_xlabel(
+ xlabel,
+ fontsize=STYLE_CONFIG["font_sizes"]["label"],
+ fontweight="medium",
+ color=STYLE_CONFIG["text_color"],
+ )
+ if ylabel:
+ ax.set_ylabel(
+ ylabel,
+ fontsize=STYLE_CONFIG["font_sizes"]["label"],
+ fontweight="medium",
+ color=STYLE_CONFIG["text_color"],
+ )
+
+ # Tick styling
+ ax.tick_params(
+ labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
+ )
+
+ # Spine styling
+ for spine in ax.spines.values():
+ spine.set_color(STYLE_CONFIG["text_color"])
+ spine.set_alpha(0.3)
+ spine.set_linewidth(0.5)
+
+ # Background colors
+ ax.set_facecolor(STYLE_CONFIG["background_color"])
+ fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
+
+ return fig, ax
+
+
+def plot_system_metrics(metrics_data, output_path):
+ """Create plots for system metrics over time.
+
+ Args:
+ metrics_data: list of dicts containing system metrics
+ output_path: str, path to save the output plot
+ """
+ df = pd.DataFrame(metrics_data)
+ df["timestamp"] = pd.to_datetime(df["timestamp"])
+ elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
+
+ # Get baseline values
+ baseline_cpu = df["cpu_percent"].iloc[0]
+ baseline_ram = df["ram_used_gb"].iloc[0]
+ baseline_gpu = (
+ df["gpu_memory_used"].iloc[0] / 1024
+ if "gpu_memory_used" in df.columns
+ else None
+ )
+
+ # Convert GPU memory to GB if present
+ if "gpu_memory_used" in df.columns:
+ df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
+
+ plt.style.use("dark_background")
+
+ # Create subplots based on available metrics
+ has_gpu = "gpu_memory_used" in df.columns
+ num_plots = 3 if has_gpu else 2
+ fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
+ fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
+
+ # Smoothing window (at least 1 sample)
+ window = max(1, min(5, len(df) // 2))
+
+ # Plot CPU Usage
+ smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
+ sns.lineplot(
+ x=elapsed_time,
+ y=smoothed_cpu,
+ ax=axes[0],
+ color=STYLE_CONFIG["primary_color"],
+ linewidth=2,
+ )
+ axes[0].axhline(
+ y=baseline_cpu,
+ color=STYLE_CONFIG["secondary_color"],
+ linestyle="--",
+ alpha=0.5,
+ label="Baseline",
+ )
+ setup_plot(
+ fig,
+ axes[0],
+ "CPU Usage Over Time",
+ xlabel="Time (seconds)",
+ ylabel="CPU Usage (%)",
+ )
+ axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
+ axes[0].legend()
+
+ # Plot RAM Usage
+ smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
+ sns.lineplot(
+ x=elapsed_time,
+ y=smoothed_ram,
+ ax=axes[1],
+ color=STYLE_CONFIG["secondary_color"],
+ linewidth=2,
+ )
+ axes[1].axhline(
+ y=baseline_ram,
+ color=STYLE_CONFIG["primary_color"],
+ linestyle="--",
+ alpha=0.5,
+ label="Baseline",
+ )
+ setup_plot(
+ fig,
+ axes[1],
+ "RAM Usage Over Time",
+ xlabel="Time (seconds)",
+ ylabel="RAM Usage (GB)",
+ )
+ axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
+ axes[1].legend()
+
+ # Plot GPU Memory if available
+ if has_gpu:
+ smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
+ sns.lineplot(
+ x=elapsed_time,
+ y=smoothed_gpu,
+ ax=axes[2],
+ color=STYLE_CONFIG["primary_color"],
+ linewidth=2,
+ )
+ axes[2].axhline(
+ y=baseline_gpu,
+ color=STYLE_CONFIG["secondary_color"],
+ linestyle="--",
+ alpha=0.5,
+ label="Baseline",
+ )
+ setup_plot(
+ fig,
+ axes[2],
+ "GPU Memory Usage Over Time",
+ xlabel="Time (seconds)",
+ ylabel="GPU Memory (GB)",
+ )
+ axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
+ axes[2].legend()
+
+ plt.tight_layout()
+ plt.savefig(output_path, dpi=300, bbox_inches="tight")
+ plt.close()
+
+
+def plot_timeline(df, output_path, suffix="", prefix=""):
+ """Create timeline plot showing latency for each run.
+
+ Args:
+ df: pandas DataFrame containing run data with columns:
+ - target_tokens: number of tokens
+ - run_number: run iteration
+ - time_to_first_chunk: latency to first token
+ output_path: str, path to save the output plot
+ """
+ plt.style.use("dark_background")
+
+ # Sort by tokens and run number
+ df = df.sort_values(["target_tokens", "run_number"])
+
+ # Create figure and axis
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ # Calculate y positions for each run with tighter grouping
+ unique_tokens = sorted(df["target_tokens"].unique())
+ y_positions = {}
+ current_y = 0
+ group_spacing = 0.8 # Space between groups
+ run_spacing = 0.2 # Space between runs in a group
+
+ for tokens in unique_tokens:
+ runs = df[df["target_tokens"] == tokens]
+ base_y = current_y
+ for i, (_, run) in enumerate(runs.iterrows()):
+ y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
+ current_y = base_y + (len(runs) * run_spacing) + group_spacing
+
+ # Plot bars and points with more transparency
+ bar_height = 0.15
+ for _, row in df.iterrows():
+ y = y_positions[(row["target_tokens"], row["run_number"])]
+ latency = row["time_to_first_chunk"]
+
+ # Latency bar
+ ax.add_patch(
+ patches.Rectangle(
+ (0, y - bar_height / 2),
+ latency,
+ bar_height,
+ facecolor=STYLE_CONFIG["primary_color"],
+ alpha=0.3,
+ )
+ )
+
+ # End point
+ ax.plot(
+ latency,
+ y,
+ "o",
+ color=STYLE_CONFIG["secondary_color"],
+ markersize=4,
+ alpha=0.5,
+ )
+
+ # Add mean lines and values for each token group
+ for tokens in unique_tokens:
+ token_runs = df[df["target_tokens"] == tokens]
+ mean_latency = token_runs["time_to_first_chunk"].mean()
+ y_positions_for_token = [
+ y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
+ ]
+ min_y = min(y_positions_for_token)
+ max_y = max(y_positions_for_token)
+ group_center = (min_y + max_y) / 2
+
+ # Plot mean line with gradient alpha
+ gradient = np.linspace(0.2, 0.8, 100)
+ for i in range(len(gradient) - 1):
+ y1 = (
+ min_y
+ - bar_height
+ + (max_y - min_y + 2 * bar_height) * (i / len(gradient))
+ )
+ y2 = (
+ min_y
+ - bar_height
+ + (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
+ )
+ ax.plot(
+ [mean_latency, mean_latency],
+ [y1, y2],
+ "-",
+ color=STYLE_CONFIG["secondary_color"],
+ linewidth=3,
+ alpha=gradient[i],
+ )
+
+ # Add mean value label with background
+ label_text = f"Mean: {mean_latency:.3f}s"
+ bbox_props = dict(
+ facecolor=STYLE_CONFIG["background_color"],
+ edgecolor=STYLE_CONFIG["secondary_color"],
+ alpha=0.8,
+ pad=3,
+ linewidth=1,
+ )
+ ax.text(
+ mean_latency + 0.02,
+ group_center,
+ label_text,
+ color=STYLE_CONFIG["secondary_color"],
+ va="center",
+ fontsize=10,
+ fontweight="bold",
+ bbox=bbox_props,
+ )
+
+ # Customize plot
+ ax.set_ylim(-1, current_y)
+ ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3) # Extra space for labels
+
+ # Add labels for token groups with tighter spacing
+ group_positions = {}
+ for tokens in unique_tokens:
+ runs = df[df["target_tokens"] == tokens]
+ y_positions_for_token = [
+ y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
+ ]
+ group_positions[tokens] = sum(y_positions_for_token) / len(
+ y_positions_for_token
+ )
+ plt.axhline(
+ y=min(y_positions_for_token) - bar_height,
+ color="white",
+ alpha=0.1,
+ linestyle="-",
+ )
+
+ # Calculate mean audio length for each token group
+ audio_lengths = {}
+ for tokens in unique_tokens:
+ token_runs = df[df["target_tokens"] == tokens]
+ audio_lengths[tokens] = token_runs["audio_length"].mean()
+
+ # Set y-ticks at group centers with token counts and audio lengths
+ plt.yticks(
+ list(group_positions.values()),
+ [
+ f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
+ for tokens in group_positions.keys()
+ ],
+ fontsize=10,
+ )
+
+ # Customize appearance
+ setup_plot(
+ fig,
+ ax,
+ prefix.upper() + " Time-To-Audio Latency " + suffix,
+ xlabel="Time (seconds)",
+ ylabel="Input Size",
+ )
+
+ plt.tight_layout()
+ plt.savefig(output_path, dpi=300, bbox_inches="tight")
+ plt.close()
+
+
+def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
+ """Create correlation plot with regression line and correlation coefficient.
+
+ Args:
+ df: pandas DataFrame containing the data
+ x: str, column name for x-axis
+ y: str, column name for y-axis
+ title: str, plot title
+ xlabel: str, x-axis label
+ ylabel: str, y-axis label
+ output_path: str, path to save the output plot
+ """
+ plt.style.use("dark_background")
+
+ fig, ax = plt.subplots(figsize=(12, 8))
+
+ # Scatter plot
+ sns.scatterplot(
+ data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
+ )
+
+ # Regression line
+ sns.regplot(
+ data=df,
+ x=x,
+ y=y,
+ scatter=False,
+ color=STYLE_CONFIG["secondary_color"],
+ line_kws={"linewidth": 2},
+ )
+
+ # Add correlation coefficient
+ corr = df[x].corr(df[y])
+ plt.text(
+ 0.05,
+ 0.95,
+ f"Correlation: {corr:.2f}",
+ transform=ax.transAxes,
+ fontsize=STYLE_CONFIG["font_sizes"]["text"],
+ color=STYLE_CONFIG["text_color"],
+ bbox=dict(
+ facecolor=STYLE_CONFIG["background_color"],
+ edgecolor=STYLE_CONFIG["text_color"],
+ alpha=0.7,
+ ),
+ )
+
+ setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
+ plt.savefig(output_path, dpi=300, bbox_inches="tight")
+ plt.close()
diff --git a/examples/assorted_checks/benchmarks/lib/shared_utils.py b/examples/assorted_checks/benchmarks/lib/shared_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e3fbdb1641d178c036356439f8e8a2dd357cdb4
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/lib/shared_utils.py
@@ -0,0 +1,199 @@
+"""Shared utilities for benchmarks and tests."""
+
+import os
+import json
+import subprocess
+from typing import Any, Dict, List, Union, Optional
+from datetime import datetime
+
+import psutil
+import scipy.io.wavfile as wavfile
+
+# Check for torch availability once at module level
+TORCH_AVAILABLE = False
+try:
+ import torch
+
+ TORCH_AVAILABLE = torch.cuda.is_available()
+except ImportError:
+ pass
+
+
+def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
+ """Check if an audio file is silent by comparing peak amplitude to a threshold.
+
+ Args:
+ audio_path: Path to the audio file
+ threshold: Peak amplitude threshold for silence
+
+ Returns:
+ bool: True if audio is silent, False otherwise
+ """
+ rate, data = wavfile.read(audio_path)
+ peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0 # 16-bit audio
+
+ return peak_amplitude < threshold
+
+
+def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
+ """Get audio length in seconds from bytes data.
+
+ Args:
+ audio_data: Raw audio bytes
+ temp_dir: Directory for temporary file. If None, uses system temp directory.
+
+ Returns:
+ float: Audio length in seconds
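+
+ Note: audio_data must be a complete WAV file (header included);
+ raw PCM bytes will not parse.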
+ """
+ if temp_dir is None:
+ import tempfile
+
+ temp_dir = tempfile.gettempdir()
+
+ temp_path = os.path.join(temp_dir, "temp.wav")
+ os.makedirs(temp_dir, exist_ok=True)
+
+ with open(temp_path, "wb") as f:
+ f.write(audio_data)
+
+ try:
+ rate, data = wavfile.read(temp_path)
+ return len(data) / rate
+ finally:
+ if os.path.exists(temp_path):
+ os.remove(temp_path)
+
+
+def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
+ """Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.
+
+ Args:
+ average: If True and multiple GPUs present, returns average memory usage.
+ If False, returns list of memory usage per GPU.
+
+ Returns:
+ float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
+ If average=False and multiple GPUs present, returns list of values.
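+
+ Note: the PyTorch path reports only this process's tensor allocations,
+ while nvidia-smi reports total memory in use on each GPU, so the two
+ paths are not directly comparable.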
+ """
+ if TORCH_AVAILABLE:
+ n_gpus = torch.cuda.device_count()
+ memory_used = []
+ for i in range(n_gpus):
+ memory_used.append(
+ torch.cuda.memory_allocated(i) / 1024**2
+ ) # Convert to MB
+
+ if average and len(memory_used) > 0:
+ return sum(memory_used) / len(memory_used)
+ return memory_used if len(memory_used) > 1 else memory_used[0]
+
+ # Fall back to nvidia-smi
+ try:
+ result = subprocess.check_output(
+ ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
+ )
+ memory_values = [
+ float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
+ ]
+
+ if average and len(memory_values) > 0:
+ return sum(memory_values) / len(memory_values)
+ return memory_values if len(memory_values) > 1 else memory_values[0]
+ except (subprocess.CalledProcessError, FileNotFoundError):
+ return None
+
+
+def get_system_metrics() -> Dict[str, Union[str, float]]:
+ """Get current system metrics including CPU, RAM, and GPU if available.
+
+ Returns:
+ dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
+ """
+ # Get per-CPU percentages and calculate average
+ cpu_percentages = psutil.cpu_percent(percpu=True)
+ avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
+
+ metrics = {
+ "timestamp": datetime.now().isoformat(),
+ "cpu_percent": round(avg_cpu, 2),
+ "ram_percent": psutil.virtual_memory().percent,
+ "ram_used_gb": psutil.virtual_memory().used / (1024**3),
+ }
+
+ gpu_mem = get_gpu_memory(average=True) # Use average for system metrics
+ if gpu_mem is not None:
+ metrics["gpu_memory_used"] = round(gpu_mem, 2)
+
+ return metrics
+
+
+def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
+ """Save audio data to a file with proper naming and directory creation.
+
+ Args:
+ audio_data: Raw audio bytes
+ identifier: String to identify this audio file (e.g. token count, test name)
+ output_dir: Directory to save the file
+
+ Returns:
+ str: Path to the saved audio file
+ """
+ os.makedirs(output_dir, exist_ok=True)
+ output_file = os.path.join(output_dir, f"{identifier}.wav")
+
+ with open(output_file, "wb") as f:
+ f.write(audio_data)
+
+ return output_file
+
+
+def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
+ """Write benchmark statistics to a file in a clean, organized format.
+
+ Args:
+ stats: List of dictionaries containing stat name/value pairs
+ output_file: Path to output file
+ """
+ os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
+
+ with open(output_file, "w") as f:
+ for section in stats:
+ # Write section header
+ f.write(f"=== {section['title']} ===\n\n")
+
+ # Write stats
+ for label, value in section["stats"].items():
+ if isinstance(value, float):
+ f.write(f"{label}: {value:.2f}\n")
+ else:
+ f.write(f"{label}: {value}\n")
+ f.write("\n")
+
+
+def save_json_results(results: Dict[str, Any], output_file: str) -> None:
+ """Save benchmark results to a JSON file with proper formatting.
+
+ Args:
+ results: Dictionary of results to save
+ output_file: Path to output file
+ """
+ os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
+ with open(output_file, "w") as f:
+ json.dump(results, f, indent=2)
+
+
+def real_time_factor(
+ processing_time: float, audio_length: float, decimals: int = 2
+) -> float:
+ """Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.
+
+ Args:
+ processing_time: Time taken to process/generate audio
+ audio_length: Length of the generated audio
+ decimals: Number of decimal places to round to
+
+ Returns:
+ float: RTF value
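+
+ Example:
+ >>> real_time_factor(2.0, 10.0) # 2s of processing for 10s of audio
+ 0.2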
+ """
+ rtf = processing_time / audio_length
+ return round(rtf, decimals)
diff --git a/examples/assorted_checks/benchmarks/lib/stream_utils.py b/examples/assorted_checks/benchmarks/lib/stream_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2dececa728230687d4283561873d374ffeb51b0
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/lib/stream_utils.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+import os
+import time
+import wave
+from typing import Any, Dict, List, Callable, Optional
+
+import pandas as pd
+import scipy.io.wavfile as wavfile
+
+from .shared_utils import save_json_results
+from .shared_plotting import plot_timeline, plot_correlation
+from .shared_benchmark_utils import enc, get_text_for_tokens
+
+
+def check_audio_silence(audio_path: str) -> bool:
+ """Check if audio file contains only silence"""
+ sample_rate, audio_data = wavfile.read(audio_path)
+ # Convert to float for RMS calculation
+ audio_float = audio_data.astype(float)
+ # Calculate RMS value
+ rms = (audio_float**2).mean() ** 0.5
+ # Define silence threshold (adjust if needed)
+ SILENCE_THRESHOLD = 50.0
+ return rms < SILENCE_THRESHOLD
+
+
+def process_benchmark_results(
+ all_results: List[Dict[str, Any]], token_sizes: List[int]
+) -> Dict[str, Any]:
+ """Process benchmark results and generate summary"""
+ summary = {}
+ for tokens in token_sizes:
+ matching_results = [
+ r for r in all_results if r["target_tokens"] == tokens and not r["error"]
+ ]
+ if matching_results:
+ avg_first_chunk = sum(
+ r["time_to_first_chunk"] for r in matching_results
+ ) / len(matching_results)
+ avg_total = sum(r["total_time"] for r in matching_results) / len(
+ matching_results
+ )
+ avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
+ matching_results
+ )
+ summary[tokens] = {
+ "avg_time_to_first_chunk": round(avg_first_chunk, 3),
+ "avg_total_time": round(avg_total, 3),
+ "avg_audio_length": round(avg_audio_length, 3),
+ "num_successful_runs": len(matching_results),
+ }
+ return summary
+
+
+def save_benchmark_results(
+ all_results: List[Dict[str, Any]],
+ summary: Dict[str, Any],
+ output_data_dir: str,
+ output_plots_dir: str,
+ suffix: str,
+ plot_title_suffix: str,
+ prefix: str = "",
+):
+ """Save benchmark results and generate plots"""
+ # Save results
+ results_data = {
+ "individual_runs": all_results,
+ "summary": summary,
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ save_json_results(
+ results_data,
+ os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
+ )
+
+ # Create DataFrame for plotting
+ df = pd.DataFrame(all_results)
+
+ # Create plots
+ plot_correlation(
+ df,
+ "target_tokens",
+ "time_to_first_chunk",
+ f"Time to First Audio vs Input Size {plot_title_suffix}",
+ "Number of Input Tokens",
+ "Time to First Audio (seconds)",
+ os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
+ )
+
+ plot_correlation(
+ df,
+ "target_tokens",
+ "total_time",
+ f"Total Time vs Input Size {plot_title_suffix}",
+ "Number of Input Tokens",
+ "Total Time (seconds)",
+ os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
+ )
+
+ plot_timeline(
+ df,
+ os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
+ suffix=plot_title_suffix,
+ )
+
+
+def run_benchmark(
+ measure_func: Callable,
+ output_dir: str,
+ output_data_dir: str,
+ output_plots_dir: str,
+ suffix: str = "",
+ plot_title_suffix: str = "",
+ num_runs: int = 5,
+ client=None,
+ prefix="",
+):
+ """Run benchmark with the given measurement function"""
+ # Create output directories
+ os.makedirs(output_dir, exist_ok=True)
+ os.makedirs(output_data_dir, exist_ok=True)
+ os.makedirs(output_plots_dir, exist_ok=True)
+
+ # Load sample text
+ script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ with open(
+ os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+ ) as f:
+ text = f.read()
+
+ # Test specific token counts
+ token_sizes = [10, 50, 100, 250, 500]
+ all_results = []
+ silent_files = []
+
+ for tokens in token_sizes:
+ print(
+ f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
+ )
+ test_text = get_text_for_tokens(text, tokens)
+ actual_tokens = len(enc.encode(test_text))
+ print(f"Text preview: {test_text[:50]}...")
+
+ for i in range(num_runs):
+ print(f"Run {i+1}/{num_runs}...")
+ result = measure_func(test_text, output_dir, tokens, i + 1)
+ result["target_tokens"] = tokens
+ result["actual_tokens"] = actual_tokens
+ result["run_number"] = i + 1
+
+ # Handle time to first audio
+ first_chunk = result.get("time_to_first_chunk")
+ print(
+ f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
+ )
+
+ # Handle total time
+ total_time = result.get("total_time")
+ print(
+ f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
+ )
+
+ # Handle audio length
+ audio_length = result.get("audio_length")
+ print(
+ f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
+ )
+ # Calculate streaming overhead only if both values exist
+ if total_time is not None and first_chunk is not None:
+ print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
+ else:
+ print("Streaming overhead: N/A")
+
+ if result["error"]:
+ print(f"Error: {result['error']}")
+ elif result["audio_path"] and check_audio_silence(result["audio_path"]):
+ silent_files.append(result["audio_path"])
+
+ all_results.append(result)
+
+ # Process and save results
+ summary = process_benchmark_results(all_results, token_sizes)
+ save_benchmark_results(
+ all_results,
+ summary,
+ output_data_dir,
+ output_plots_dir,
+ suffix,
+ plot_title_suffix,
+ prefix=prefix,
+ )
+
+ # Print paths
+ print("\nResults and plots saved to:")
+ print(
+ f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}"
+ )
+ print(
+ f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}"
+ )
+ print(
+ f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}"
+ )
+ print(
+ f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}"
+ )
+
+ # Print silence check summary
+ if silent_files:
+ print("\nWARNING: The following files contain only silence:")
+ for file in silent_files:
+ print(f"- {file}")
+ else:
+ print("\nAll generated audio files contain valid audio content.")
diff --git a/examples/assorted_checks/benchmarks/output_data/cpu_benchmark_results_rtf.json b/examples/assorted_checks/benchmarks/output_data/cpu_benchmark_results_rtf.json
new file mode 100644
index 0000000000000000000000000000000000000000..f9d41c9582c9f90ced28b89ce8a62c852772ca4c
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/output_data/cpu_benchmark_results_rtf.json
@@ -0,0 +1,138 @@
+{
+ "results": [
+ {
+ "tokens": 100,
+ "processing_time": 0.98,
+ "output_length": 28.975,
+ "rtf": 0.03,
+ "elapsed_time": 1.02255
+ },
+ {
+ "tokens": 200,
+ "processing_time": 1.79,
+ "output_length": 58.45,
+ "rtf": 0.03,
+ "elapsed_time": 2.84766
+ },
+ {
+ "tokens": 300,
+ "processing_time": 2.1,
+ "output_length": 86.75,
+ "rtf": 0.02,
+ "elapsed_time": 4.98201
+ },
+ {
+ "tokens": 400,
+ "processing_time": 2.66,
+ "output_length": 113.5,
+ "rtf": 0.02,
+ "elapsed_time": 7.67743
+ },
+ {
+ "tokens": 500,
+ "processing_time": 3.13,
+ "output_length": 140.225,
+ "rtf": 0.02,
+ "elapsed_time": 10.84279
+ }
+ ],
+ "system_metrics": [
+ {
+ "timestamp": "2025-01-30T05:03:26.422469",
+ "cpu_percent": 0.0,
+ "ram_percent": 18.5,
+ "ram_used_gb": 5.2551727294921875,
+ "gpu_memory_used": 1988.0,
+ "relative_time": 0.14498639106750488
+ },
+ {
+ "timestamp": "2025-01-30T05:03:27.568319",
+ "cpu_percent": 13.42,
+ "ram_percent": 18.6,
+ "ram_used_gb": 5.267307281494141,
+ "gpu_memory_used": 2025.0,
+ "relative_time": 1.1970372200012207
+ },
+ {
+ "timestamp": "2025-01-30T05:03:28.620098",
+ "cpu_percent": 12.89,
+ "ram_percent": 18.6,
+ "ram_used_gb": 5.267337799072266,
+ "gpu_memory_used": 3071.0,
+ "relative_time": 2.254074811935425
+ },
+ {
+ "timestamp": "2025-01-30T05:03:29.677030",
+ "cpu_percent": 12.43,
+ "ram_percent": 18.6,
+ "ram_used_gb": 5.29168701171875,
+ "gpu_memory_used": 2555.0,
+ "relative_time": 3.306957244873047
+ },
+ {
+ "timestamp": "2025-01-30T05:03:30.729971",
+ "cpu_percent": 12.47,
+ "ram_percent": 18.6,
+ "ram_used_gb": 5.292213439941406,
+ "gpu_memory_used": 3345.0,
+ "relative_time": 4.3373119831085205
+ },
+ {
+ "timestamp": "2025-01-30T05:03:31.760463",
+ "cpu_percent": 13.71,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.30987548828125,
+ "gpu_memory_used": 2549.0,
+ "relative_time": 5.368744850158691
+ },
+ {
+ "timestamp": "2025-01-30T05:03:32.791904",
+ "cpu_percent": 12.16,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.308803558349609,
+ "gpu_memory_used": 3358.0,
+ "relative_time": 6.418949842453003
+ },
+ {
+ "timestamp": "2025-01-30T05:03:33.842039",
+ "cpu_percent": 11.5,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.309070587158203,
+ "gpu_memory_used": 3349.0,
+ "relative_time": 7.4437031745910645
+ },
+ {
+ "timestamp": "2025-01-30T05:03:34.866692",
+ "cpu_percent": 15.38,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.2960205078125,
+ "gpu_memory_used": 3034.0,
+ "relative_time": 8.472418069839478
+ },
+ {
+ "timestamp": "2025-01-30T05:03:35.895656",
+ "cpu_percent": 13.44,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.294971466064453,
+ "gpu_memory_used": 3315.0,
+ "relative_time": 9.498533248901367
+ },
+ {
+ "timestamp": "2025-01-30T05:03:36.921589",
+ "cpu_percent": 12.64,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.297389984130859,
+ "gpu_memory_used": 3314.0,
+ "relative_time": 10.565555095672607
+ },
+ {
+ "timestamp": "2025-01-30T05:03:37.994149",
+ "cpu_percent": 8.32,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.305477142333984,
+ "gpu_memory_used": 1958.0,
+ "relative_time": 11.616873502731323
+ }
+ ],
+ "test_duration": 14.051392793655396
+}
\ No newline at end of file
diff --git a/examples/assorted_checks/benchmarks/output_data/cpu_benchmark_stats_rtf.txt b/examples/assorted_checks/benchmarks/output_data/cpu_benchmark_stats_rtf.txt
new file mode 100644
index 0000000000000000000000000000000000000000..37f1cfb5adc531976f25d7e4a375d0f4f4afca98
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/output_data/cpu_benchmark_stats_rtf.txt
@@ -0,0 +1,23 @@
+=== Benchmark Statistics (with correct RTF) ===
+
+Total tokens processed: 1500
+Total audio generated (s): 427.90
+Total test duration (s): 10.84
+Average processing rate (tokens/s): 133.35
+Average RTF: 0.02
+Average Real Time Speed: 41.67
+
+=== Per-chunk Stats ===
+
+Average chunk size (tokens): 300.00
+Min chunk size (tokens): 100
+Max chunk size (tokens): 500
+Average processing time (s): 2.13
+Average output length (s): 85.58
+
+=== Performance Ranges ===
+
+Processing rate range (tokens/s): 102.04 - 159.74
+RTF range: 0.02x - 0.03x
+Real Time Speed range: 33.33x - 50.00x
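+Note: RTF = processing time / audio length; Real Time Speed is its reciprocal (RTF 0.02 -> 50.00x).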
+
diff --git a/examples/assorted_checks/benchmarks/output_data/first_token_benchmark_stream.json b/examples/assorted_checks/benchmarks/output_data/first_token_benchmark_stream.json
new file mode 100644
index 0000000000000000000000000000000000000000..c78b5ab30f03036da00803f504e8cab7d0950401
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/output_data/first_token_benchmark_stream.json
@@ -0,0 +1,337 @@
+{
+ "individual_runs": [
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.818483829498291,
+ "time_to_first_chunk": 1.8067498207092285,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 1
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.6271553039550781,
+ "time_to_first_chunk": 1.610968828201294,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 2
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.5759549140930176,
+ "time_to_first_chunk": 1.561316967010498,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 3
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.615680456161499,
+ "time_to_first_chunk": 1.6035709381103516,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 4
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.6515357494354248,
+ "time_to_first_chunk": 1.6268820762634277,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 5
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 7.368175268173218,
+ "time_to_first_chunk": 3.4540352821350098,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 1
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 6.931752443313599,
+ "time_to_first_chunk": 3.1553661823272705,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 2
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 6.867500066757202,
+ "time_to_first_chunk": 3.127124309539795,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 3
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 6.933881521224976,
+ "time_to_first_chunk": 3.1872360706329346,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 4
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 7.605916738510132,
+ "time_to_first_chunk": 3.6397976875305176,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 5
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 14.777218580245972,
+ "time_to_first_chunk": 3.625889778137207,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 1
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 13.911701202392578,
+ "time_to_first_chunk": 3.298157215118408,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 2
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 14.451806783676147,
+ "time_to_first_chunk": 3.8353848457336426,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 3
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 13.941124200820923,
+ "time_to_first_chunk": 3.3754897117614746,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 4
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 15.717307329177856,
+ "time_to_first_chunk": 3.6421003341674805,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 5
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 41.16162133216858,
+ "time_to_first_chunk": 3.7044918537139893,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 1
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 35.43009877204895,
+ "time_to_first_chunk": 3.1040024757385254,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 2
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 35.285505294799805,
+ "time_to_first_chunk": 3.657808780670166,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 3
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 34.47842836380005,
+ "time_to_first_chunk": 3.2033851146698,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 4
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 36.50936222076416,
+ "time_to_first_chunk": 3.1159815788269043,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 5
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 86.84899735450745,
+ "time_to_first_chunk": 5.405678987503052,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 1
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 74.72578477859497,
+ "time_to_first_chunk": 3.966891050338745,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 2
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 68.1974081993103,
+ "time_to_first_chunk": 3.27712082862854,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 3
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 72.68819260597229,
+ "time_to_first_chunk": 3.153608560562134,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 4
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 67.94887590408325,
+ "time_to_first_chunk": 3.954728841781616,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 5
+ }
+ ],
+ "summary": {
+ "10": {
+ "avg_time_to_first_chunk": 1.642,
+ "avg_total_time": 1.658,
+ "avg_audio_length": 3.45,
+ "num_successful_runs": 5
+ },
+ "50": {
+ "avg_time_to_first_chunk": 3.313,
+ "avg_total_time": 7.141,
+ "avg_audio_length": 15.825,
+ "num_successful_runs": 5
+ },
+ "100": {
+ "avg_time_to_first_chunk": 3.555,
+ "avg_total_time": 14.56,
+ "avg_audio_length": 30.35,
+ "num_successful_runs": 5
+ },
+ "250": {
+ "avg_time_to_first_chunk": 3.357,
+ "avg_total_time": 36.573,
+ "avg_audio_length": 78.175,
+ "num_successful_runs": 5
+ },
+ "500": {
+ "avg_time_to_first_chunk": 3.952,
+ "avg_total_time": 74.082,
+ "avg_audio_length": 155.125,
+ "num_successful_runs": 5
+ }
+ },
+ "timestamp": "2025-01-06 03:31:37"
+}
\ No newline at end of file
diff --git a/examples/assorted_checks/benchmarks/output_data/first_token_benchmark_stream_openai.json b/examples/assorted_checks/benchmarks/output_data/first_token_benchmark_stream_openai.json
new file mode 100644
index 0000000000000000000000000000000000000000..968fffbb2f959910627bae1e0ae8f428b6f61368
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/output_data/first_token_benchmark_stream_openai.json
@@ -0,0 +1,337 @@
+{
+ "individual_runs": [
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.638200044631958,
+ "time_to_first_chunk": 1.6232295036315918,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 1
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.4960439205169678,
+ "time_to_first_chunk": 1.4854960441589355,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 2
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.5055279731750488,
+ "time_to_first_chunk": 1.4948456287384033,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 3
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.496837854385376,
+ "time_to_first_chunk": 1.4835176467895508,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 4
+ },
+ {
+ "text_length": 37,
+ "token_count": null,
+ "total_time": 1.7330272197723389,
+ "time_to_first_chunk": 1.7219843864440918,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
+ "audio_length": 3.45,
+ "target_tokens": 10,
+ "actual_tokens": 10,
+ "run_number": 5
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 6.865253925323486,
+ "time_to_first_chunk": 3.1809072494506836,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 1
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 7.975425720214844,
+ "time_to_first_chunk": 3.2910428047180176,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 2
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 6.793715715408325,
+ "time_to_first_chunk": 3.210068464279175,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 3
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 6.639606237411499,
+ "time_to_first_chunk": 3.0641400814056396,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 4
+ },
+ {
+ "text_length": 212,
+ "token_count": null,
+ "total_time": 8.100529193878174,
+ "time_to_first_chunk": 3.3910109996795654,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
+ "audio_length": 15.825,
+ "target_tokens": 50,
+ "actual_tokens": 50,
+ "run_number": 5
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 15.246968984603882,
+ "time_to_first_chunk": 3.1980819702148438,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 1
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 15.934760332107544,
+ "time_to_first_chunk": 4.23082709312439,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 2
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 13.799078226089478,
+ "time_to_first_chunk": 3.42996883392334,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 3
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 13.400063037872314,
+ "time_to_first_chunk": 3.2097883224487305,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 4
+ },
+ {
+ "text_length": 448,
+ "token_count": null,
+ "total_time": 14.833694219589233,
+ "time_to_first_chunk": 3.1589744091033936,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
+ "audio_length": 30.35,
+ "target_tokens": 100,
+ "actual_tokens": 100,
+ "run_number": 5
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 35.49378156661987,
+ "time_to_first_chunk": 3.852027177810669,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 1
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 33.59433174133301,
+ "time_to_first_chunk": 3.2059006690979004,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 2
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 34.23120045661926,
+ "time_to_first_chunk": 3.1464977264404297,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 3
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 36.18487215042114,
+ "time_to_first_chunk": 3.188844919204712,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 4
+ },
+ {
+ "text_length": 1140,
+ "token_count": null,
+ "total_time": 38.142744302749634,
+ "time_to_first_chunk": 3.6997063159942627,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
+ "audio_length": 78.175,
+ "target_tokens": 250,
+ "actual_tokens": 250,
+ "run_number": 5
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 71.48920440673828,
+ "time_to_first_chunk": 3.148237943649292,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 1
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 73.53017520904541,
+ "time_to_first_chunk": 3.464594841003418,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 2
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 75.52278685569763,
+ "time_to_first_chunk": 3.5506417751312256,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 3
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 69.45922994613647,
+ "time_to_first_chunk": 3.495962619781494,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 4
+ },
+ {
+ "text_length": 2232,
+ "token_count": null,
+ "total_time": 66.66928672790527,
+ "time_to_first_chunk": 3.301323175430298,
+ "error": null,
+ "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
+ "audio_length": 155.125,
+ "target_tokens": 500,
+ "actual_tokens": 500,
+ "run_number": 5
+ }
+ ],
+ "summary": {
+ "10": {
+ "avg_time_to_first_chunk": 1.562,
+ "avg_total_time": 1.574,
+ "avg_audio_length": 3.45,
+ "num_successful_runs": 5
+ },
+ "50": {
+ "avg_time_to_first_chunk": 3.227,
+ "avg_total_time": 7.275,
+ "avg_audio_length": 15.825,
+ "num_successful_runs": 5
+ },
+ "100": {
+ "avg_time_to_first_chunk": 3.446,
+ "avg_total_time": 14.643,
+ "avg_audio_length": 30.35,
+ "num_successful_runs": 5
+ },
+ "250": {
+ "avg_time_to_first_chunk": 3.419,
+ "avg_total_time": 35.529,
+ "avg_audio_length": 78.175,
+ "num_successful_runs": 5
+ },
+ "500": {
+ "avg_time_to_first_chunk": 3.392,
+ "avg_total_time": 71.334,
+ "avg_audio_length": 155.125,
+ "num_successful_runs": 5
+ }
+ },
+ "timestamp": "2025-01-06 03:42:32"
+}
\ No newline at end of file
diff --git a/examples/assorted_checks/benchmarks/output_data/gpu_benchmark_results_rtf.json b/examples/assorted_checks/benchmarks/output_data/gpu_benchmark_results_rtf.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b3e68a3b5dc84a90ea1fb05ed6eafbea4483fa7
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/output_data/gpu_benchmark_results_rtf.json
@@ -0,0 +1,241 @@
+{
+ "results": [
+ {
+ "tokens": 150,
+ "processing_time": 1.18,
+ "output_length": 43.7,
+ "rtf": 0.03,
+ "elapsed_time": 1.20302
+ },
+ {
+ "tokens": 300,
+ "processing_time": 2.27,
+ "output_length": 86.75,
+ "rtf": 0.03,
+ "elapsed_time": 3.49958
+ },
+ {
+ "tokens": 450,
+ "processing_time": 3.49,
+ "output_length": 125.9,
+ "rtf": 0.03,
+ "elapsed_time": 7.03862
+ },
+ {
+ "tokens": 600,
+ "processing_time": 4.64,
+ "output_length": 169.325,
+ "rtf": 0.03,
+ "elapsed_time": 11.71062
+ },
+ {
+ "tokens": 750,
+ "processing_time": 5.07,
+ "output_length": 212.3,
+ "rtf": 0.02,
+ "elapsed_time": 16.83186
+ },
+ {
+ "tokens": 900,
+ "processing_time": 6.66,
+ "output_length": 258.0,
+ "rtf": 0.03,
+ "elapsed_time": 23.54135
+ }
+ ],
+ "system_metrics": [
+ {
+ "timestamp": "2025-01-30T05:06:38.733338",
+ "cpu_percent": 0.0,
+ "ram_percent": 18.6,
+ "ram_used_gb": 5.284908294677734,
+ "gpu_memory_used": 1925.0,
+ "relative_time": 0.039948463439941406
+ },
+ {
+ "timestamp": "2025-01-30T05:06:39.774003",
+ "cpu_percent": 13.37,
+ "ram_percent": 18.6,
+ "ram_used_gb": 5.2852630615234375,
+ "gpu_memory_used": 3047.0,
+ "relative_time": 1.0883615016937256
+ },
+ {
+ "timestamp": "2025-01-30T05:06:40.822449",
+ "cpu_percent": 13.68,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.303462982177734,
+ "gpu_memory_used": 3040.0,
+ "relative_time": 2.12058687210083
+ },
+ {
+ "timestamp": "2025-01-30T05:06:41.854375",
+ "cpu_percent": 15.39,
+ "ram_percent": 18.7,
+ "ram_used_gb": 5.306262969970703,
+ "gpu_memory_used": 3326.0,
+ "relative_time": 3.166278600692749
+ },
+ {
+ "timestamp": "2025-01-30T05:06:42.900882",
+ "cpu_percent": 14.19,
+ "ram_percent": 18.8,
+ "ram_used_gb": 5.337162017822266,
+ "gpu_memory_used": 2530.0,
+ "relative_time": 4.256956577301025
+ },
+ {
+ "timestamp": "2025-01-30T05:06:43.990792",
+ "cpu_percent": 12.63,
+ "ram_percent": 18.8,
+ "ram_used_gb": 5.333805084228516,
+ "gpu_memory_used": 3331.0,
+ "relative_time": 5.2854602336883545
+ },
+ {
+ "timestamp": "2025-01-30T05:06:45.019134",
+ "cpu_percent": 14.14,
+ "ram_percent": 18.8,
+ "ram_used_gb": 5.334297180175781,
+ "gpu_memory_used": 3332.0,
+ "relative_time": 6.351738929748535
+ },
+ {
+ "timestamp": "2025-01-30T05:06:46.085997",
+ "cpu_percent": 12.78,
+ "ram_percent": 18.8,
+ "ram_used_gb": 5.351467132568359,
+ "gpu_memory_used": 2596.0,
+ "relative_time": 7.392607688903809
+ },
+ {
+ "timestamp": "2025-01-30T05:06:47.127113",
+ "cpu_percent": 14.7,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.367542266845703,
+ "gpu_memory_used": 3341.0,
+ "relative_time": 8.441826343536377
+ },
+ {
+ "timestamp": "2025-01-30T05:06:48.176033",
+ "cpu_percent": 13.47,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.361263275146484,
+ "gpu_memory_used": 3339.0,
+ "relative_time": 9.500520706176758
+ },
+ {
+ "timestamp": "2025-01-30T05:06:49.234332",
+ "cpu_percent": 15.84,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.3612213134765625,
+ "gpu_memory_used": 3339.0,
+ "relative_time": 10.53744649887085
+ },
+ {
+ "timestamp": "2025-01-30T05:06:50.271159",
+ "cpu_percent": 14.89,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.379688262939453,
+ "gpu_memory_used": 3646.0,
+ "relative_time": 11.570110321044922
+ },
+ {
+ "timestamp": "2025-01-30T05:06:51.303841",
+ "cpu_percent": 15.71,
+ "ram_percent": 19.0,
+ "ram_used_gb": 5.390773773193359,
+ "gpu_memory_used": 3037.0,
+ "relative_time": 12.60651707649231
+ },
+ {
+ "timestamp": "2025-01-30T05:06:52.340383",
+ "cpu_percent": 15.46,
+ "ram_percent": 19.0,
+ "ram_used_gb": 5.389518737792969,
+ "gpu_memory_used": 3319.0,
+ "relative_time": 13.636165380477905
+ },
+ {
+ "timestamp": "2025-01-30T05:06:53.370342",
+ "cpu_percent": 13.12,
+ "ram_percent": 19.0,
+ "ram_used_gb": 5.391136169433594,
+ "gpu_memory_used": 3320.0,
+ "relative_time": 14.67578935623169
+ },
+ {
+ "timestamp": "2025-01-30T05:06:54.376175",
+ "cpu_percent": 14.98,
+ "ram_percent": 19.0,
+ "ram_used_gb": 5.390045166015625,
+ "gpu_memory_used": 3627.0,
+ "relative_time": 15.70747685432434
+ },
+ {
+ "timestamp": "2025-01-30T05:06:55.441172",
+ "cpu_percent": 13.45,
+ "ram_percent": 19.0,
+ "ram_used_gb": 5.394947052001953,
+ "gpu_memory_used": 1937.0,
+ "relative_time": 16.758784770965576
+ },
+ {
+ "timestamp": "2025-01-30T05:06:56.492442",
+ "cpu_percent": 17.03,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.361682891845703,
+ "gpu_memory_used": 3041.0,
+ "relative_time": 17.789713144302368
+ },
+ {
+ "timestamp": "2025-01-30T05:06:57.523536",
+ "cpu_percent": 13.76,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.360996246337891,
+ "gpu_memory_used": 3321.0,
+ "relative_time": 18.838542222976685
+ },
+ {
+ "timestamp": "2025-01-30T05:06:58.572158",
+ "cpu_percent": 15.94,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.3652801513671875,
+ "gpu_memory_used": 3323.0,
+ "relative_time": 19.86689043045044
+ },
+ {
+ "timestamp": "2025-01-30T05:06:59.600551",
+ "cpu_percent": 15.67,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.363399505615234,
+ "gpu_memory_used": 3630.0,
+ "relative_time": 20.89712619781494
+ },
+ {
+ "timestamp": "2025-01-30T05:07:00.631315",
+ "cpu_percent": 15.37,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.3663482666015625,
+ "gpu_memory_used": 3629.0,
+ "relative_time": 22.01374316215515
+ },
+ {
+ "timestamp": "2025-01-30T05:07:01.747500",
+ "cpu_percent": 13.79,
+ "ram_percent": 18.9,
+ "ram_used_gb": 5.367362976074219,
+ "gpu_memory_used": 3620.0,
+ "relative_time": 23.05113124847412
+ },
+ {
+ "timestamp": "2025-01-30T05:07:02.784828",
+ "cpu_percent": 10.16,
+ "ram_percent": 19.1,
+ "ram_used_gb": 5.443946838378906,
+ "gpu_memory_used": 1916.0,
+ "relative_time": 24.08937978744507
+ }
+ ],
+ "test_duration": 26.596059799194336
+}
\ No newline at end of file
diff --git a/examples/assorted_checks/benchmarks/output_data/gpu_benchmark_stats_rtf.txt b/examples/assorted_checks/benchmarks/output_data/gpu_benchmark_stats_rtf.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a62888f565c316b2629c28448ecbf6520680ed7d
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/output_data/gpu_benchmark_stats_rtf.txt
@@ -0,0 +1,23 @@
+=== Benchmark Statistics (with correct RTF) ===
+
+Total tokens processed: 3150
+Total audio generated (s): 895.98
+Total test duration (s): 23.54
+Average processing rate (tokens/s): 133.43
+Average RTF: 0.03
+Average Real Time Speed: 35.29
+
+=== Per-chunk Stats ===
+
+Average chunk size (tokens): 525.00
+Min chunk size (tokens): 150
+Max chunk size (tokens): 900
+Average processing time (s): 3.88
+Average output length (s): 149.33
+
+=== Performance Ranges ===
+
+Processing rate range (tokens/s): 127.12 - 147.93
+RTF range: 0.02x - 0.03x
+Real Time Speed range: 33.33x - 50.00x
+
diff --git a/examples/assorted_checks/benchmarks/the_time_machine_hg_wells.txt b/examples/assorted_checks/benchmarks/the_time_machine_hg_wells.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3381d7a7aa6627753c06a6706aa5a3d52148da32
--- /dev/null
+++ b/examples/assorted_checks/benchmarks/the_time_machine_hg_wells.txt
@@ -0,0 +1,643 @@
+The Time Traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us. His pale grey eyes shone and twinkled, and his usually pale face was flushed and animated. The fire burnt brightly, and the soft radiance of the incandescent lights in the lilies of silver caught the bubbles that flashed and passed in our glasses. Our chairs, being his patents, embraced and caressed us rather than submitted to be sat upon, and there was that luxurious after-dinner atmosphere, when thought runs gracefully free of the trammels of precision. And he put it to us in this way—marking the points with a lean forefinger—as we sat and lazily admired his earnestness over this new paradox (as we thought it) and his fecundity.
+
+“You must follow me carefully. I shall have to controvert one or two ideas that are almost universally accepted. The geometry, for instance, they taught you at school is founded on a misconception.”
+
+“Is not that rather a large thing to expect us to begin upon?” said Filby, an argumentative person with red hair.
+
+“I do not mean to ask you to accept anything without reasonable ground for it. You will soon admit as much as I need from you. You know of course that a mathematical line, a line of thickness nil, has no real existence. They taught you that? Neither has a mathematical plane. These things are mere abstractions.”
+
+“That is all right,” said the Psychologist.
+
+“Nor, having only length, breadth, and thickness, can a cube have a real existence.”
+
+“There I object,” said Filby. “Of course a solid body may exist. All real things—”
+
+“So most people think. But wait a moment. Can an instantaneous cube exist?”
+
+“Don’t follow you,” said Filby.
+
+“Can a cube that does not last for any time at all, have a real existence?”
+
+Filby became pensive. “Clearly,” the Time Traveller proceeded, “any real body must have extension in four directions: it must have Length, Breadth, Thickness, and—Duration. But through a natural infirmity of the flesh, which I will explain to you in a moment, we incline to overlook this fact. There are really four dimensions, three which we call the three planes of Space, and a fourth, Time. There is, however, a tendency to draw an unreal distinction between the former three dimensions and the latter, because it happens that our consciousness moves intermittently in one direction along the latter from the beginning to the end of our lives.”
+
+“That,” said a very young man, making spasmodic efforts to relight his cigar over the lamp; “that . . . very clear indeed.”
+
+“Now, it is very remarkable that this is so extensively overlooked,” continued the Time Traveller, with a slight accession of cheerfulness. “Really this is what is meant by the Fourth Dimension, though some people who talk about the Fourth Dimension do not know they mean it. It is only another way of looking at Time. There is no difference between Time and any of the three dimensions of Space except that our consciousness moves along it. But some foolish people have got hold of the wrong side of that idea. You have all heard what they have to say about this Fourth Dimension?”
+
+“I have not,” said the Provincial Mayor.
+
+“It is simply this. That Space, as our mathematicians have it, is spoken of as having three dimensions, which one may call Length, Breadth, and Thickness, and is always definable by reference to three planes, each at right angles to the others. But some philosophical people have been asking why three dimensions particularly—why not another direction at right angles to the other three?—and have even tried to construct a Four-Dimensional geometry. Professor Simon Newcomb was expounding this to the New York Mathematical Society only a month or so ago. You know how on a flat surface, which has only two dimensions, we can represent a figure of a three-dimensional solid, and similarly they think that by models of three dimensions they could represent one of four—if they could master the perspective of the thing. See?”
+
+“I think so,” murmured the Provincial Mayor; and, knitting his brows, he lapsed into an introspective state, his lips moving as one who repeats mystic words. “Yes, I think I see it now,” he said after some time, brightening in a quite transitory manner.
+
+“Well, I do not mind telling you I have been at work upon this geometry of Four Dimensions for some time. Some of my results are curious. For instance, here is a portrait of a man at eight years old, another at fifteen, another at seventeen, another at twenty-three, and so on. All these are evidently sections, as it were, Three-Dimensional representations of his Four-Dimensioned being, which is a fixed and unalterable thing.
+
+“Scientific people,” proceeded the Time Traveller, after the pause required for the proper assimilation of this, “know very well that Time is only a kind of Space. Here is a popular scientific diagram, a weather record. This line I trace with my finger shows the movement of the barometer. Yesterday it was so high, yesterday night it fell, then this morning it rose again, and so gently upward to here. Surely the mercury did not trace this line in any of the dimensions of Space generally recognised? But certainly it traced such a line, and that line, therefore, we must conclude, was along the Time-Dimension.”
+
+“But,” said the Medical Man, staring hard at a coal in the fire, “if Time is really only a fourth dimension of Space, why is it, and why has it always been, regarded as something different? And why cannot we move in Time as we move about in the other dimensions of Space?”
+
+The Time Traveller smiled. “Are you so sure we can move freely in Space? Right and left we can go, backward and forward freely enough, and men always have done so. I admit we move freely in two dimensions. But how about up and down? Gravitation limits us there.”
+
+“Not exactly,” said the Medical Man. “There are balloons.”
+
+“But before the balloons, save for spasmodic jumping and the inequalities of the surface, man had no freedom of vertical movement.”
+
+“Still they could move a little up and down,” said the Medical Man.
+
+“Easier, far easier down than up.”
+
+“And you cannot move at all in Time, you cannot get away from the present moment.”
+
+“My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension with a uniform velocity from the cradle to the grave. Just as we should travel down if we began our existence fifty miles above the earth’s surface.”
+
+“But the great difficulty is this,” interrupted the Psychologist. “You can move about in all directions of Space, but you cannot move about in Time.”
+
+“That is the germ of my great discovery. But you are wrong to say that we cannot move about in Time. For instance, if I am recalling an incident very vividly I go back to the instant of its occurrence: I become absent-minded, as you say. I jump back for a moment. Of course we have no means of staying back for any length of Time, any more than a savage or an animal has of staying six feet above the ground. But a civilised man is better off than the savage in this respect. He can go up against gravitation in a balloon, and why should he not hope that ultimately he may be able to stop or accelerate his drift along the Time-Dimension, or even turn about and travel the other way?”
+
+“Oh, this,” began Filby, “is all—”
+
+“Why not?” said the Time Traveller.
+
+“It’s against reason,” said Filby.
+
+“What reason?” said the Time Traveller.
+
+“You can show black is white by argument,” said Filby, “but you will never convince me.”
+
+“Possibly not,” said the Time Traveller. “But now you begin to see the object of my investigations into the geometry of Four Dimensions. Long ago I had a vague inkling of a machine—”
+
+“To travel through Time!” exclaimed the Very Young Man.
+
+“That shall travel indifferently in any direction of Space and Time, as the driver determines.”
+
+Filby contented himself with laughter.
+
+“But I have experimental verification,” said the Time Traveller.
+
+“It would be remarkably convenient for the historian,” the Psychologist suggested. “One might travel back and verify the accepted account of the Battle of Hastings, for instance!”
+
+“Don’t you think you would attract attention?” said the Medical Man. “Our ancestors had no great tolerance for anachronisms.”
+
+“One might get one’s Greek from the very lips of Homer and Plato,” the Very Young Man thought.
+
+“In which case they would certainly plough you for the Little-go. The German scholars have improved Greek so much.”
+
+“Then there is the future,” said the Very Young Man. “Just think! One might invest all one’s money, leave it to accumulate at interest, and hurry on ahead!”
+
+“To discover a society,” said I, “erected on a strictly communistic basis.”
+
+“Of all the wild extravagant theories!” began the Psychologist.
+
+“Yes, so it seemed to me, and so I never talked of it until—”
+
+“Experimental verification!” cried I. “You are going to verify that?”
+
+“The experiment!” cried Filby, who was getting brain-weary.
+
+“Let’s see your experiment anyhow,” said the Psychologist, “though it’s all humbug, you know.”
+
+The Time Traveller smiled round at us. Then, still smiling faintly, and with his hands deep in his trousers pockets, he walked slowly out of the room, and we heard his slippers shuffling down the long passage to his laboratory.
+
+The Psychologist looked at us. “I wonder what he’s got?”
+
+“Some sleight-of-hand trick or other,” said the Medical Man, and Filby tried to tell us about a conjuror he had seen at Burslem, but before he had finished his preface the Time Traveller came back, and Filby’s anecdote collapsed.
+
+II.
+The Machine
+The thing the Time Traveller held in his hand was a glittering metallic framework, scarcely larger than a small clock, and very delicately made. There was ivory in it, and some transparent crystalline substance. And now I must be explicit, for this that follows—unless his explanation is to be accepted—is an absolutely unaccountable thing. He took one of the small octagonal tables that were scattered about the room, and set it in front of the fire, with two legs on the hearthrug. On this table he placed the mechanism. Then he drew up a chair, and sat down. The only other object on the table was a small shaded lamp, the bright light of which fell upon the model. There were also perhaps a dozen candles about, two in brass candlesticks upon the mantel and several in sconces, so that the room was brilliantly illuminated. I sat in a low arm-chair nearest the fire, and I drew this forward so as to be almost between the Time Traveller and the fireplace. Filby sat behind him, looking over his shoulder. The Medical Man and the Provincial Mayor watched him in profile from the right, the Psychologist from the left. The Very Young Man stood behind the Psychologist. We were all on the alert. It appears incredible to me that any kind of trick, however subtly conceived and however adroitly done, could have been played upon us under these conditions.
+
+The Time Traveller looked at us, and then at the mechanism. “Well?” said the Psychologist.
+
+“This little affair,” said the Time Traveller, resting his elbows upon the table and pressing his hands together above the apparatus, “is only a model. It is my plan for a machine to travel through time. You will notice that it looks singularly askew, and that there is an odd twinkling appearance about this bar, as though it was in some way unreal.” He pointed to the part with his finger. “Also, here is one little white lever, and here is another.”
+
+The Medical Man got up out of his chair and peered into the thing. “It’s beautifully made,” he said.
+
+“It took two years to make,” retorted the Time Traveller. Then, when we had all imitated the action of the Medical Man, he said: “Now I want you clearly to understand that this lever, being pressed over, sends the machine gliding into the future, and this other reverses the motion. This saddle represents the seat of a time traveller. Presently I am going to press the lever, and off the machine will go. It will vanish, pass into future Time, and disappear. Have a good look at the thing. Look at the table too, and satisfy yourselves there is no trickery. I don’t want to waste this model, and then be told I’m a quack.”
+
+There was a minute’s pause perhaps. The Psychologist seemed about to speak to me, but changed his mind. Then the Time Traveller put forth his finger towards the lever. “No,” he said suddenly. “Lend me your hand.” And turning to the Psychologist, he took that individual’s hand in his own and told him to put out his forefinger. So that it was the Psychologist himself who sent forth the model Time Machine on its interminable voyage. We all saw the lever turn. I am absolutely certain there was no trickery. There was a breath of wind, and the lamp flame jumped. One of the candles on the mantel was blown out, and the little machine suddenly swung round, became indistinct, was seen as a ghost for a second perhaps, as an eddy of faintly glittering brass and ivory; and it was gone—vanished! Save for the lamp the table was bare.
+
+Everyone was silent for a minute. Then Filby said he was damned.
+
+The Psychologist recovered from his stupor, and suddenly looked under the table. At that the Time Traveller laughed cheerfully. “Well?” he said, with a reminiscence of the Psychologist. Then, getting up, he went to the tobacco jar on the mantel, and with his back to us began to fill his pipe.
+
+We stared at each other. “Look here,” said the Medical Man, “are you in earnest about this? Do you seriously believe that that machine has travelled into time?”
+
+“Certainly,” said the Time Traveller, stooping to light a spill at the fire. Then he turned, lighting his pipe, to look at the Psychologist’s face. (The Psychologist, to show that he was not unhinged, helped himself to a cigar and tried to light it uncut.) “What is more, I have a big machine nearly finished in there”—he indicated the laboratory—“and when that is put together I mean to have a journey on my own account.”
+
+“You mean to say that that machine has travelled into the future?” said Filby.
+
+“Into the future or the past—I don’t, for certain, know which.”
+
+After an interval the Psychologist had an inspiration. “It must have gone into the past if it has gone anywhere,” he said.
+
+“Why?” said the Time Traveller.
+
+“Because I presume that it has not moved in space, and if it travelled into the future it would still be here all this time, since it must have travelled through this time.”
+
+“But,” said I, “If it travelled into the past it would have been visible when we came first into this room; and last Thursday when we were here; and the Thursday before that; and so forth!”
+
+“Serious objections,” remarked the Provincial Mayor, with an air of impartiality, turning towards the Time Traveller.
+
+“Not a bit,” said the Time Traveller, and, to the Psychologist: “You think. You can explain that. It’s presentation below the threshold, you know, diluted presentation.”
+
+“Of course,” said the Psychologist, and reassured us. “That’s a simple point of psychology. I should have thought of it. It’s plain enough, and helps the paradox delightfully. We cannot see it, nor can we appreciate this machine, any more than we can the spoke of a wheel spinning, or a bullet flying through the air. If it is travelling through time fifty times or a hundred times faster than we are, if it gets through a minute while we get through a second, the impression it creates will of course be only one-fiftieth or one-hundredth of what it would make if it were not travelling in time. That’s plain enough.” He passed his hand through the space in which the machine had been. “You see?” he said, laughing.
+
+We sat and stared at the vacant table for a minute or so. Then the Time Traveller asked us what we thought of it all.
+
+“It sounds plausible enough tonight,” said the Medical Man; “but wait until tomorrow. Wait for the common sense of the morning.”
+
+“Would you like to see the Time Machine itself?” asked the Time Traveller. And therewith, taking the lamp in his hand, he led the way down the long, draughty corridor to his laboratory. I remember vividly the flickering light, his queer, broad head in silhouette, the dance of the shadows, how we all followed him, puzzled but incredulous, and how there in the laboratory we beheld a larger edition of the little mechanism which we had seen vanish from before our eyes. Parts were of nickel, parts of ivory, parts had certainly been filed or sawn out of rock crystal. The thing was generally complete, but the twisted crystalline bars lay unfinished upon the bench beside some sheets of drawings, and I took one up for a better look at it. Quartz it seemed to be.
+
+“Look here,” said the Medical Man, “are you perfectly serious? Or is this a trick—like that ghost you showed us last Christmas?”
+
+“Upon that machine,” said the Time Traveller, holding the lamp aloft, “I intend to explore time. Is that plain? I was never more serious in my life.”
+
+None of us quite knew how to take it.
+
+I caught Filby’s eye over the shoulder of the Medical Man, and he winked at me solemnly.
+
+III.
+The Time Traveller Returns
+I think that at that time none of us quite believed in the Time Machine. The fact is, the Time Traveller was one of those men who are too clever to be believed: you never felt that you saw all round him; you always suspected some subtle reserve, some ingenuity in ambush, behind his lucid frankness. Had Filby shown the model and explained the matter in the Time Traveller’s words, we should have shown him far less scepticism. For we should have perceived his motives: a pork-butcher could understand Filby. But the Time Traveller had more than a touch of whim among his elements, and we distrusted him. Things that would have made the fame of a less clever man seemed tricks in his hands. It is a mistake to do things too easily. The serious people who took him seriously never felt quite sure of his deportment; they were somehow aware that trusting their reputations for judgment with him was like furnishing a nursery with eggshell china. So I don’t think any of us said very much about time travelling in the interval between that Thursday and the next, though its odd potentialities ran, no doubt, in most of our minds: its plausibility, that is, its practical incredibleness, the curious possibilities of anachronism and of utter confusion it suggested. For my own part, I was particularly preoccupied with the trick of the model. That I remember discussing with the Medical Man, whom I met on Friday at the Linnæan. He said he had seen a similar thing at Tübingen, and laid considerable stress on the blowing-out of the candle. But how the trick was done he could not explain.
+
+The next Thursday I went again to Richmond—I suppose I was one of the Time Traveller’s most constant guests—and, arriving late, found four or five men already assembled in his drawing-room. The Medical Man was standing before the fire with a sheet of paper in one hand and his watch in the other. I looked round for the Time Traveller, and—“It’s half-past seven now,” said the Medical Man. “I suppose we’d better have dinner?”
+
+“Where’s——?” said I, naming our host.
+
+“You’ve just come? It’s rather odd. He’s unavoidably detained. He asks me in this note to lead off with dinner at seven if he’s not back. Says he’ll explain when he comes.”
+
+“It seems a pity to let the dinner spoil,” said the Editor of a well-known daily paper; and thereupon the Doctor rang the bell.
+
+The Psychologist was the only person besides the Doctor and myself who had attended the previous dinner. The other men were Blank, the Editor aforementioned, a certain journalist, and another—a quiet, shy man with a beard—whom I didn’t know, and who, as far as my observation went, never opened his mouth all the evening. There was some speculation at the dinner-table about the Time Traveller’s absence, and I suggested time travelling, in a half-jocular spirit. The Editor wanted that explained to him, and the Psychologist volunteered a wooden account of the “ingenious paradox and trick” we had witnessed that day week. He was in the midst of his exposition when the door from the corridor opened slowly and without noise. I was facing the door, and saw it first. “Hallo!” I said. “At last!” And the door opened wider, and the Time Traveller stood before us. I gave a cry of surprise. “Good heavens! man, what’s the matter?” cried the Medical Man, who saw him next. And the whole tableful turned towards the door.
+
+He was in an amazing plight. His coat was dusty and dirty, and smeared with green down the sleeves; his hair disordered, and as it seemed to me greyer—either with dust and dirt or because its colour had actually faded. His face was ghastly pale; his chin had a brown cut on it—a cut half-healed; his expression was haggard and drawn, as by intense suffering. For a moment he hesitated in the doorway, as if he had been dazzled by the light. Then he came into the room. He walked with just such a limp as I have seen in footsore tramps. We stared at him in silence, expecting him to speak.
+
+He said not a word, but came painfully to the table, and made a motion towards the wine. The Editor filled a glass of champagne, and pushed it towards him. He drained it, and it seemed to do him good: for he looked round the table, and the ghost of his old smile flickered across his face. “What on earth have you been up to, man?” said the Doctor. The Time Traveller did not seem to hear. “Don’t let me disturb you,” he said, with a certain faltering articulation. “I’m all right.” He stopped, held out his glass for more, and took it off at a draught. “That’s good,” he said. His eyes grew brighter, and a faint colour came into his cheeks. His glance flickered over our faces with a certain dull approval, and then went round the warm and comfortable room. Then he spoke again, still as it were feeling his way among his words. “I’m going to wash and dress, and then I’ll come down and explain things.... Save me some of that mutton. I’m starving for a bit of meat.”
+
+He looked across at the Editor, who was a rare visitor, and hoped he was all right. The Editor began a question. “Tell you presently,” said the Time Traveller. “I’m—funny! Be all right in a minute.”
+
+He put down his glass, and walked towards the staircase door. Again I remarked his lameness and the soft padding sound of his footfall, and standing up in my place, I saw his feet as he went out. He had nothing on them but a pair of tattered, blood-stained socks. Then the door closed upon him. I had half a mind to follow, till I remembered how he detested any fuss about himself. For a minute, perhaps, my mind was wool-gathering. Then, “Remarkable Behaviour of an Eminent Scientist,” I heard the Editor say, thinking (after his wont) in headlines. And this brought my attention back to the bright dinner-table.
+
+“What’s the game?” said the Journalist. “Has he been doing the Amateur Cadger? I don’t follow.” I met the eye of the Psychologist, and read my own interpretation in his face. I thought of the Time Traveller limping painfully upstairs. I don’t think anyone else had noticed his lameness.
+
+The first to recover completely from this surprise was the Medical Man, who rang the bell—the Time Traveller hated to have servants waiting at dinner—for a hot plate. At that the Editor turned to his knife and fork with a grunt, and the Silent Man followed suit. The dinner was resumed. Conversation was exclamatory for a little while with gaps of wonderment; and then the Editor got fervent in his curiosity. “Does our friend eke out his modest income with a crossing? or has he his Nebuchadnezzar phases?” he inquired. “I feel assured it’s this business of the Time Machine,” I said, and took up the Psychologist’s account of our previous meeting. The new guests were frankly incredulous. The Editor raised objections. “What was this time travelling? A man couldn’t cover himself with dust by rolling in a paradox, could he?” And then, as the idea came home to him, he resorted to caricature. Hadn’t they any clothes-brushes in the Future? The Journalist too, would not believe at any price, and joined the Editor in the easy work of heaping ridicule on the whole thing. They were both the new kind of journalist—very joyous, irreverent young men. “Our Special Correspondent in the Day after Tomorrow reports,” the Journalist was saying—or rather shouting—when the Time Traveller came back. He was dressed in ordinary evening clothes, and nothing save his haggard look remained of the change that had startled me.
+
+“I say,” said the Editor hilariously, “these chaps here say you have been travelling into the middle of next week! Tell us all about little Rosebery, will you? What will you take for the lot?”
+
+The Time Traveller came to the place reserved for him without a word. He smiled quietly, in his old way. “Where’s my mutton?” he said. “What a treat it is to stick a fork into meat again!”
+
+“Story!” cried the Editor.
+
+“Story be damned!” said the Time Traveller. “I want something to eat. I won’t say a word until I get some peptone into my arteries. Thanks. And the salt.”
+
+“One word,” said I. “Have you been time travelling?”
+
+“Yes,” said the Time Traveller, with his mouth full, nodding his head.
+
+“I’d give a shilling a line for a verbatim note,” said the Editor. The Time Traveller pushed his glass towards the Silent Man and rang it with his fingernail; at which the Silent Man, who had been staring at his face, started convulsively, and poured him wine. The rest of the dinner was uncomfortable. For my own part, sudden questions kept on rising to my lips, and I dare say it was the same with the others. The Journalist tried to relieve the tension by telling anecdotes of Hettie Potter. The Time Traveller devoted his attention to his dinner, and displayed the appetite of a tramp. The Medical Man smoked a cigarette, and watched the Time Traveller through his eyelashes. The Silent Man seemed even more clumsy than usual, and drank champagne with regularity and determination out of sheer nervousness. At last the Time Traveller pushed his plate away, and looked round us. “I suppose I must apologise,” he said. “I was simply starving. I’ve had a most amazing time.” He reached out his hand for a cigar, and cut the end. “But come into the smoking-room. It’s too long a story to tell over greasy plates.” And ringing the bell in passing, he led the way into the adjoining room.
+
+“You have told Blank, and Dash, and Chose about the machine?” he said to me, leaning back in his easy-chair and naming the three new guests.
+
+“But the thing’s a mere paradox,” said the Editor.
+
+“I can’t argue tonight. I don’t mind telling you the story, but I can’t argue. I will,” he went on, “tell you the story of what has happened to me, if you like, but you must refrain from interruptions. I want to tell it. Badly. Most of it will sound like lying. So be it! It’s true—every word of it, all the same. I was in my laboratory at four o’clock, and since then … I’ve lived eight days … such days as no human being ever lived before! I’m nearly worn out, but I shan’t sleep till I’ve told this thing over to you. Then I shall go to bed. But no interruptions! Is it agreed?”
+
+“Agreed,” said the Editor, and the rest of us echoed “Agreed.” And with that the Time Traveller began his story as I have set it forth. He sat back in his chair at first, and spoke like a weary man. Afterwards he got more animated. In writing it down I feel with only too much keenness the inadequacy of pen and ink—and, above all, my own inadequacy—to express its quality. You read, I will suppose, attentively enough; but you cannot see the speaker’s white, sincere face in the bright circle of the little lamp, nor hear the intonation of his voice. You cannot know how his expression followed the turns of his story! Most of us hearers were in shadow, for the candles in the smoking-room had not been lighted, and only the face of the Journalist and the legs of the Silent Man from the knees downward were illuminated. At first we glanced now and again at each other. After a time we ceased to do that, and looked only at the Time Traveller’s face.
+
+IV.
+Time Travelling
+“I told some of you last Thursday of the principles of the Time Machine, and showed you the actual thing itself, incomplete in the workshop. There it is now, a little travel-worn, truly; and one of the ivory bars is cracked, and a brass rail bent; but the rest of it’s sound enough. I expected to finish it on Friday; but on Friday, when the putting together was nearly done, I found that one of the nickel bars was exactly one inch too short, and this I had to get remade; so that the thing was not complete until this morning. It was at ten o’clock today that the first of all Time Machines began its career. I gave it a last tap, tried all the screws again, put one more drop of oil on the quartz rod, and sat myself in the saddle. I suppose a suicide who holds a pistol to his skull feels much the same wonder at what will come next as I felt then. I took the starting lever in one hand and the stopping one in the other, pressed the first, and almost immediately the second. I seemed to reel; I felt a nightmare sensation of falling; and, looking round, I saw the laboratory exactly as before. Had anything happened? For a moment I suspected that my intellect had tricked me. Then I noted the clock. A moment before, as it seemed, it had stood at a minute or so past ten; now it was nearly half-past three!
+
+“I drew a breath, set my teeth, gripped the starting lever with both hands, and went off with a thud. The laboratory got hazy and went dark. Mrs. Watchett came in and walked, apparently without seeing me, towards the garden door. I suppose it took her a minute or so to traverse the place, but to me she seemed to shoot across the room like a rocket. I pressed the lever over to its extreme position. The night came like the turning out of a lamp, and in another moment came tomorrow. The laboratory grew faint and hazy, then fainter and ever fainter. Tomorrow night came black, then day again, night again, day again, faster and faster still. An eddying murmur filled my ears, and a strange, dumb confusedness descended on my mind.
+
+“I am afraid I cannot convey the peculiar sensations of time travelling. They are excessively unpleasant. There is a feeling exactly like that one has upon a switchback—of a helpless headlong motion! I felt the same horrible anticipation, too, of an imminent smash. As I put on pace, night followed day like the flapping of a black wing. The dim suggestion of the laboratory seemed presently to fall away from me, and I saw the sun hopping swiftly across the sky, leaping it every minute, and every minute marking a day. I supposed the laboratory had been destroyed and I had come into the open air. I had a dim impression of scaffolding, but I was already going too fast to be conscious of any moving things. The slowest snail that ever crawled dashed by too fast for me. The twinkling succession of darkness and light was excessively painful to the eye. Then, in the intermittent darknesses, I saw the moon spinning swiftly through her quarters from new to full, and had a faint glimpse of the circling stars. Presently, as I went on, still gaining velocity, the palpitation of night and day merged into one continuous greyness; the sky took on a wonderful deepness of blue, a splendid luminous colour like that of early twilight; the jerking sun became a streak of fire, a brilliant arch, in space; the moon a fainter fluctuating band; and I could see nothing of the stars, save now and then a brighter circle flickering in the blue.
+
+“The landscape was misty and vague. I was still on the hillside upon which this house now stands, and the shoulder rose above me grey and dim. I saw trees growing and changing like puffs of vapour, now brown, now green; they grew, spread, shivered, and passed away. I saw huge buildings rise up faint and fair, and pass like dreams. The whole surface of the earth seemed changed—melting and flowing under my eyes. The little hands upon the dials that registered my speed raced round faster and faster. Presently I noted that the sun belt swayed up and down, from solstice to solstice, in a minute or less, and that consequently my pace was over a year a minute; and minute by minute the white snow flashed across the world, and vanished, and was followed by the bright, brief green of spring.
+
+“The unpleasant sensations of the start were less poignant now. They merged at last into a kind of hysterical exhilaration. I remarked, indeed, a clumsy swaying of the machine, for which I was unable to account. But my mind was too confused to attend to it, so with a kind of madness growing upon me, I flung myself into futurity. At first I scarce thought of stopping, scarce thought of anything but these new sensations. But presently a fresh series of impressions grew up in my mind—a certain curiosity and therewith a certain dread—until at last they took complete possession of me. What strange developments of humanity, what wonderful advances upon our rudimentary civilisation, I thought, might not appear when I came to look nearly into the dim elusive world that raced and fluctuated before my eyes! I saw great and splendid architecture rising about me, more massive than any buildings of our own time, and yet, as it seemed, built of glimmer and mist. I saw a richer green flow up the hillside, and remain there, without any wintry intermission. Even through the veil of my confusion the earth seemed very fair. And so my mind came round to the business of stopping.
+
+“The peculiar risk lay in the possibility of my finding some substance in the space which I, or the machine, occupied. So long as I travelled at a high velocity through time, this scarcely mattered: I was, so to speak, attenuated—was slipping like a vapour through the interstices of intervening substances! But to come to a stop involved the jamming of myself, molecule by molecule, into whatever lay in my way; meant bringing my atoms into such intimate contact with those of the obstacle that a profound chemical reaction—possibly a far-reaching explosion—would result, and blow myself and my apparatus out of all possible dimensions—into the Unknown. This possibility had occurred to me again and again while I was making the machine; but then I had cheerfully accepted it as an unavoidable risk—one of the risks a man has got to take! Now the risk was inevitable, I no longer saw it in the same cheerful light. The fact is that, insensibly, the absolute strangeness of everything, the sickly jarring and swaying of the machine, above all, the feeling of prolonged falling, had absolutely upset my nerves. I told myself that I could never stop, and with a gust of petulance I resolved to stop forthwith. Like an impatient fool, I lugged over the lever, and incontinently the thing went reeling over, and I was flung headlong through the air.
+
+“There was the sound of a clap of thunder in my ears. I may have been stunned for a moment. A pitiless hail was hissing round me, and I was sitting on soft turf in front of the overset machine. Everything still seemed grey, but presently I remarked that the confusion in my ears was gone. I looked round me. I was on what seemed to be a little lawn in a garden, surrounded by rhododendron bushes, and I noticed that their mauve and purple blossoms were dropping in a shower under the beating of the hailstones. The rebounding, dancing hail hung in a little cloud over the machine, and drove along the ground like smoke. In a moment I was wet to the skin. ‘Fine hospitality,’ said I, ‘to a man who has travelled innumerable years to see you.’
+
+“Presently I thought what a fool I was to get wet. I stood up and looked round me. A colossal figure, carved apparently in some white stone, loomed indistinctly beyond the rhododendrons through the hazy downpour. But all else of the world was invisible.
+
+“My sensations would be hard to describe. As the columns of hail grew thinner, I saw the white figure more distinctly. It was very large, for a silver birch-tree touched its shoulder. It was of white marble, in shape something like a winged sphinx, but the wings, instead of being carried vertically at the sides, were spread so that it seemed to hover. The pedestal, it appeared to me, was of bronze, and was thick with verdigris. It chanced that the face was towards me; the sightless eyes seemed to watch me; there was the faint shadow of a smile on the lips. It was greatly weather-worn, and that imparted an unpleasant suggestion of disease. I stood looking at it for a little space—half a minute, perhaps, or half an hour. It seemed to advance and to recede as the hail drove before it denser or thinner. At last I tore my eyes from it for a moment, and saw that the hail curtain had worn threadbare, and that the sky was lightening with the promise of the sun.
+
+“I looked up again at the crouching white shape, and the full temerity of my voyage came suddenly upon me. What might appear when that hazy curtain was altogether withdrawn? What might not have happened to men? What if cruelty had grown into a common passion? What if in this interval the race had lost its manliness, and had developed into something inhuman, unsympathetic, and overwhelmingly powerful? I might seem some old-world savage animal, only the more dreadful and disgusting for our common likeness—a foul creature to be incontinently slain.
+
+“Already I saw other vast shapes—huge buildings with intricate parapets and tall columns, with a wooded hillside dimly creeping in upon me through the lessening storm. I was seized with a panic fear. I turned frantically to the Time Machine, and strove hard to readjust it. As I did so the shafts of the sun smote through the thunderstorm. The grey downpour was swept aside and vanished like the trailing garments of a ghost. Above me, in the intense blue of the summer sky, some faint brown shreds of cloud whirled into nothingness. The great buildings about me stood out clear and distinct, shining with the wet of the thunderstorm, and picked out in white by the unmelted hailstones piled along their courses. I felt naked in a strange world. I felt as perhaps a bird may feel in the clear air, knowing the hawk wings above and will swoop. My fear grew to frenzy. I took a breathing space, set my teeth, and again grappled fiercely, wrist and knee, with the machine. It gave under my desperate onset and turned over. It struck my chin violently. One hand on the saddle, the other on the lever, I stood panting heavily in attitude to mount again.
+
+“But with this recovery of a prompt retreat my courage recovered. I looked more curiously and less fearfully at this world of the remote future. In a circular opening, high up in the wall of the nearer house, I saw a group of figures clad in rich soft robes. They had seen me, and their faces were directed towards me.
+
+“Then I heard voices approaching me. Coming through the bushes by the White Sphinx were the heads and shoulders of men running. One of these emerged in a pathway leading straight to the little lawn upon which I stood with my machine. He was a slight creature—perhaps four feet high—clad in a purple tunic, girdled at the waist with a leather belt. Sandals or buskins—I could not clearly distinguish which—were on his feet; his legs were bare to the knees, and his head was bare. Noticing that, I noticed for the first time how warm the air was.
+
+“He struck me as being a very beautiful and graceful creature, but indescribably frail. His flushed face reminded me of the more beautiful kind of consumptive—that hectic beauty of which we used to hear so much. At the sight of him I suddenly regained confidence. I took my hands from the machine.
+
+V.
+In the Golden Age
+“In another moment we were standing face to face, I and this fragile thing out of futurity. He came straight up to me and laughed into my eyes. The absence from his bearing of any sign of fear struck me at once. Then he turned to the two others who were following him and spoke to them in a strange and very sweet and liquid tongue.
+
+“There were others coming, and presently a little group of perhaps eight or ten of these exquisite creatures were about me. One of them addressed me. It came into my head, oddly enough, that my voice was too harsh and deep for them. So I shook my head, and, pointing to my ears, shook it again. He came a step forward, hesitated, and then touched my hand. Then I felt other soft little tentacles upon my back and shoulders. They wanted to make sure I was real. There was nothing in this at all alarming. Indeed, there was something in these pretty little people that inspired confidence—a graceful gentleness, a certain childlike ease. And besides, they looked so frail that I could fancy myself flinging the whole dozen of them about like ninepins. But I made a sudden motion to warn them when I saw their little pink hands feeling at the Time Machine. Happily then, when it was not too late, I thought of a danger I had hitherto forgotten, and reaching over the bars of the machine I unscrewed the little levers that would set it in motion, and put these in my pocket. Then I turned again to see what I could do in the way of communication.
+
+“And then, looking more nearly into their features, I saw some further peculiarities in their Dresden china type of prettiness. Their hair, which was uniformly curly, came to a sharp end at the neck and cheek; there was not the faintest suggestion of it on the face, and their ears were singularly minute. The mouths were small, with bright red, rather thin lips, and the little chins ran to a point. The eyes were large and mild; and—this may seem egotism on my part—I fancied even that there was a certain lack of the interest I might have expected in them.
+
+“As they made no effort to communicate with me, but simply stood round me smiling and speaking in soft cooing notes to each other, I began the conversation. I pointed to the Time Machine and to myself. Then, hesitating for a moment how to express Time, I pointed to the sun. At once a quaintly pretty little figure in chequered purple and white followed my gesture, and then astonished me by imitating the sound of thunder.
+
+“For a moment I was staggered, though the import of his gesture was plain enough. The question had come into my mind abruptly: were these creatures fools? You may hardly understand how it took me. You see, I had always anticipated that the people of the year Eight Hundred and Two Thousand odd would be incredibly in front of us in knowledge, art, everything. Then one of them suddenly asked me a question that showed him to be on the intellectual level of one of our five-year-old children—asked me, in fact, if I had come from the sun in a thunderstorm! It let loose the judgment I had suspended upon their clothes, their frail light limbs, and fragile features. A flow of disappointment rushed across my mind. For a moment I felt that I had built the Time Machine in vain.
+
+“I nodded, pointed to the sun, and gave them such a vivid rendering of a thunderclap as startled them. They all withdrew a pace or so and bowed. Then came one laughing towards me, carrying a chain of beautiful flowers altogether new to me, and put it about my neck. The idea was received with melodious applause; and presently they were all running to and fro for flowers, and laughingly flinging them upon me until I was almost smothered with blossom. You who have never seen the like can scarcely imagine what delicate and wonderful flowers countless years of culture had created. Then someone suggested that their plaything should be exhibited in the nearest building, and so I was led past the sphinx of white marble, which had seemed to watch me all the while with a smile at my astonishment, towards a vast grey edifice of fretted stone. As I went with them the memory of my confident anticipations of a profoundly grave and intellectual posterity came, with irresistible merriment, to my mind.
+
+“The building had a huge entry, and was altogether of colossal dimensions. I was naturally most occupied with the growing crowd of little people, and with the big open portals that yawned before me shadowy and mysterious. My general impression of the world I saw over their heads was a tangled waste of beautiful bushes and flowers, a long neglected and yet weedless garden. I saw a number of tall spikes of strange white flowers, measuring a foot perhaps across the spread of the waxen petals. They grew scattered, as if wild, among the variegated shrubs, but, as I say, I did not examine them closely at this time. The Time Machine was left deserted on the turf among the rhododendrons.
+
+“The arch of the doorway was richly carved, but naturally I did not observe the carving very narrowly, though I fancied I saw suggestions of old Phœnician decorations as I passed through, and it struck me that they were very badly broken and weather-worn. Several more brightly clad people met me in the doorway, and so we entered, I, dressed in dingy nineteenth-century garments, looking grotesque enough, garlanded with flowers, and surrounded by an eddying mass of bright, soft-coloured robes and shining white limbs, in a melodious whirl of laughter and laughing speech.
+
+“The big doorway opened into a proportionately great hall hung with brown. The roof was in shadow, and the windows, partially glazed with coloured glass and partially unglazed, admitted a tempered light. The floor was made up of huge blocks of some very hard white metal, not plates nor slabs—blocks, and it was so much worn, as I judged by the going to and fro of past generations, as to be deeply channelled along the more frequented ways. Transverse to the length were innumerable tables made of slabs of polished stone, raised, perhaps, a foot from the floor, and upon these were heaps of fruits. Some I recognised as a kind of hypertrophied raspberry and orange, but for the most part they were strange.
+
+“Between the tables was scattered a great number of cushions. Upon these my conductors seated themselves, signing for me to do likewise. With a pretty absence of ceremony they began to eat the fruit with their hands, flinging peel and stalks, and so forth, into the round openings in the sides of the tables. I was not loath to follow their example, for I felt thirsty and hungry. As I did so I surveyed the hall at my leisure.
+
+“And perhaps the thing that struck me most was its dilapidated look. The stained-glass windows, which displayed only a geometrical pattern, were broken in many places, and the curtains that hung across the lower end were thick with dust. And it caught my eye that the corner of the marble table near me was fractured. Nevertheless, the general effect was extremely rich and picturesque. There were, perhaps, a couple of hundred people dining in the hall, and most of them, seated as near to me as they could come, were watching me with interest, their little eyes shining over the fruit they were eating. All were clad in the same soft, and yet strong, silky material.
+
+“Fruit, by the bye, was all their diet. These people of the remote future were strict vegetarians, and while I was with them, in spite of some carnal cravings, I had to be frugivorous also. Indeed, I found afterwards that horses, cattle, sheep, dogs, had followed the Ichthyosaurus into extinction. But the fruits were very delightful; one, in particular, that seemed to be in season all the time I was there—a floury thing in a three-sided husk—was especially good, and I made it my staple. At first I was puzzled by all these strange fruits, and by the strange flowers I saw, but later I began to perceive their import.
+
+“However, I am telling you of my fruit dinner in the distant future now. So soon as my appetite was a little checked, I determined to make a resolute attempt to learn the speech of these new men of mine. Clearly that was the next thing to do. The fruits seemed a convenient thing to begin upon, and holding one of these up I began a series of interrogative sounds and gestures. I had some considerable difficulty in conveying my meaning. At first my efforts met with a stare of surprise or inextinguishable laughter, but presently a fair-haired little creature seemed to grasp my intention and repeated a name. They had to chatter and explain the business at great length to each other, and my first attempts to make the exquisite little sounds of their language caused an immense amount of genuine, if uncivil, amusement. However, I felt like a schoolmaster amidst children, and persisted, and presently I had a score of noun substantives at least at my command; and then I got to demonstrative pronouns, and even the verb ‘to eat.’ But it was slow work, and the little people soon tired and wanted to get away from my interrogations, so I determined, rather of necessity, to let them give their lessons in little doses when they felt inclined. And very little doses I found they were before long, for I never met people more indolent or more easily fatigued.
+
+VI.
+The Sunset of Mankind
+“A queer thing I soon discovered about my little hosts, and that was their lack of interest. They would come to me with eager cries of astonishment, like children, but, like children they would soon stop examining me, and wander away after some other toy. The dinner and my conversational beginnings ended, I noted for the first time that almost all those who had surrounded me at first were gone. It is odd, too, how speedily I came to disregard these little people. I went out through the portal into the sunlit world again as soon as my hunger was satisfied. I was continually meeting more of these men of the future, who would follow me a little distance, chatter and laugh about me, and, having smiled and gesticulated in a friendly way, leave me again to my own devices.
+
+“The calm of evening was upon the world as I emerged from the great hall, and the scene was lit by the warm glow of the setting sun. At first things were very confusing. Everything was so entirely different from the world I had known—even the flowers. The big building I had left was situated on the slope of a broad river valley, but the Thames had shifted, perhaps, a mile from its present position. I resolved to mount to the summit of a crest, perhaps a mile and a half away, from which I could get a wider view of this our planet in the year Eight Hundred and Two Thousand Seven Hundred and One, A.D. For that, I should explain, was the date the little dials of my machine recorded.
+
+“As I walked I was watching for every impression that could possibly help to explain the condition of ruinous splendour in which I found the world—for ruinous it was. A little way up the hill, for instance, was a great heap of granite, bound together by masses of aluminium, a vast labyrinth of precipitous walls and crumpled heaps, amidst which were thick heaps of very beautiful pagoda-like plants—nettles possibly—but wonderfully tinted with brown about the leaves, and incapable of stinging. It was evidently the derelict remains of some vast structure, to what end built I could not determine. It was here that I was destined, at a later date, to have a very strange experience—the first intimation of a still stranger discovery—but of that I will speak in its proper place.
+
+“Looking round, with a sudden thought, from a terrace on which I rested for a while, I realised that there were no small houses to be seen. Apparently the single house, and possibly even the household, had vanished. Here and there among the greenery were palace-like buildings, but the house and the cottage, which form such characteristic features of our own English landscape, had disappeared.
+
+“‘Communism,’ said I to myself.
+
+“And on the heels of that came another thought. I looked at the half-dozen little figures that were following me. Then, in a flash, I perceived that all had the same form of costume, the same soft hairless visage, and the same girlish rotundity of limb. It may seem strange, perhaps, that I had not noticed this before. But everything was so strange. Now, I saw the fact plainly enough. In costume, and in all the differences of texture and bearing that now mark off the sexes from each other, these people of the future were alike. And the children seemed to my eyes to be but the miniatures of their parents. I judged then that the children of that time were extremely precocious, physically at least, and I found afterwards abundant verification of my opinion.
+
+“Seeing the ease and security in which these people were living, I felt that this close resemblance of the sexes was after all what one would expect; for the strength of a man and the softness of a woman, the institution of the family, and the differentiation of occupations are mere militant necessities of an age of physical force. Where population is balanced and abundant, much childbearing becomes an evil rather than a blessing to the State; where violence comes but rarely and offspring are secure, there is less necessity—indeed there is no necessity—for an efficient family, and the specialisation of the sexes with reference to their children’s needs disappears. We see some beginnings of this even in our own time, and in this future age it was complete. This, I must remind you, was my speculation at the time. Later, I was to appreciate how far it fell short of the reality.
+
+“While I was musing upon these things, my attention was attracted by a pretty little structure, like a well under a cupola. I thought in a transitory way of the oddness of wells still existing, and then resumed the thread of my speculations. There were no large buildings towards the top of the hill, and as my walking powers were evidently miraculous, I was presently left alone for the first time. With a strange sense of freedom and adventure I pushed on up to the crest.
+
+“There I found a seat of some yellow metal that I did not recognise, corroded in places with a kind of pinkish rust and half smothered in soft moss, the arm-rests cast and filed into the resemblance of griffins’ heads. I sat down on it, and I surveyed the broad view of our old world under the sunset of that long day. It was as sweet and fair a view as I have ever seen. The sun had already gone below the horizon and the west was flaming gold, touched with some horizontal bars of purple and crimson. Below was the valley of the Thames, in which the river lay like a band of burnished steel. I have already spoken of the great palaces dotted about among the variegated greenery, some in ruins and some still occupied. Here and there rose a white or silvery figure in the waste garden of the earth, here and there came the sharp vertical line of some cupola or obelisk. There were no hedges, no signs of proprietary rights, no evidences of agriculture; the whole earth had become a garden.
+
+“So watching, I began to put my interpretation upon the things I had seen, and as it shaped itself to me that evening, my interpretation was something in this way. (Afterwards I found I had got only a half truth—or only a glimpse of one facet of the truth.)
+
+“It seemed to me that I had happened upon humanity upon the wane. The ruddy sunset set me thinking of the sunset of mankind. For the first time I began to realise an odd consequence of the social effort in which we are at present engaged. And yet, come to think, it is a logical consequence enough. Strength is the outcome of need; security sets a premium on feebleness. The work of ameliorating the conditions of life—the true civilising process that makes life more and more secure—had gone steadily on to a climax. One triumph of a united humanity over Nature had followed another. Things that are now mere dreams had become projects deliberately put in hand and carried forward. And the harvest was what I saw!
+
+“After all, the sanitation and the agriculture of today are still in the rudimentary stage. The science of our time has attacked but a little department of the field of human disease, but, even so, it spreads its operations very steadily and persistently. Our agriculture and horticulture destroy a weed just here and there and cultivate perhaps a score or so of wholesome plants, leaving the greater number to fight out a balance as they can. We improve our favourite plants and animals—and how few they are—gradually by selective breeding; now a new and better peach, now a seedless grape, now a sweeter and larger flower, now a more convenient breed of cattle. We improve them gradually, because our ideals are vague and tentative, and our knowledge is very limited; because Nature, too, is shy and slow in our clumsy hands. Some day all this will be better organised, and still better. That is the drift of the current in spite of the eddies. The whole world will be intelligent, educated, and co-operating; things will move faster and faster towards the subjugation of Nature. In the end, wisely and carefully we shall readjust the balance of animal and vegetable life to suit our human needs.
+
+“This adjustment, I say, must have been done, and done well; done indeed for all Time, in the space of Time across which my machine had leapt. The air was free from gnats, the earth from weeds or fungi; everywhere were fruits and sweet and delightful flowers; brilliant butterflies flew hither and thither. The ideal of preventive medicine was attained. Diseases had been stamped out. I saw no evidence of any contagious diseases during all my stay. And I shall have to tell you later that even the processes of putrefaction and decay had been profoundly affected by these changes.
+
+“Social triumphs, too, had been effected. I saw mankind housed in splendid shelters, gloriously clothed, and as yet I had found them engaged in no toil. There were no signs of struggle, neither social nor economical struggle. The shop, the advertisement, traffic, all that commerce which constitutes the body of our world, was gone. It was natural on that golden evening that I should jump at the idea of a social paradise. The difficulty of increasing population had been met, I guessed, and population had ceased to increase.
+
+“But with this change in condition comes inevitably adaptations to the change. What, unless biological science is a mass of errors, is the cause of human intelligence and vigour? Hardship and freedom: conditions under which the active, strong, and subtle survive and the weaker go to the wall; conditions that put a premium upon the loyal alliance of capable men, upon self-restraint, patience, and decision. And the institution of the family, and the emotions that arise therein, the fierce jealousy, the tenderness for offspring, parental self-devotion, all found their justification and support in the imminent dangers of the young. Now, where are these imminent dangers? There is a sentiment arising, and it will grow, against connubial jealousy, against fierce maternity, against passion of all sorts; unnecessary things now, and things that make us uncomfortable, savage survivals, discords in a refined and pleasant life.
+
+“I thought of the physical slightness of the people, their lack of intelligence, and those big abundant ruins, and it strengthened my belief in a perfect conquest of Nature. For after the battle comes Quiet. Humanity had been strong, energetic, and intelligent, and had used all its abundant vitality to alter the conditions under which it lived. And now came the reaction of the altered conditions.
+
+“Under the new conditions of perfect comfort and security, that restless energy, that with us is strength, would become weakness. Even in our own time certain tendencies and desires, once necessary to survival, are a constant source of failure. Physical courage and the love of battle, for instance, are no great help—may even be hindrances—to a civilised man. And in a state of physical balance and security, power, intellectual as well as physical, would be out of place. For countless years I judged there had been no danger of war or solitary violence, no danger from wild beasts, no wasting disease to require strength of constitution, no need of toil. For such a life, what we should call the weak are as well equipped as the strong, are indeed no longer weak. Better equipped indeed they are, for the strong would be fretted by an energy for which there was no outlet. No doubt the exquisite beauty of the buildings I saw was the outcome of the last surgings of the now purposeless energy of mankind before it settled down into perfect harmony with the conditions under which it lived—the flourish of that triumph which began the last great peace. This has ever been the fate of energy in security; it takes to art and to eroticism, and then come languor and decay.
+
+“Even this artistic impetus would at last die away—had almost died in the Time I saw. To adorn themselves with flowers, to dance, to sing in the sunlight: so much was left of the artistic spirit, and no more. Even that would fade in the end into a contented inactivity. We are kept keen on the grindstone of pain and necessity, and it seemed to me that here was that hateful grindstone broken at last!
+
+“As I stood there in the gathering dark I thought that in this simple explanation I had mastered the problem of the world—mastered the whole secret of these delicious people. Possibly the checks they had devised for the increase of population had succeeded too well, and their numbers had rather diminished than kept stationary. That would account for the abandoned ruins. Very simple was my explanation, and plausible enough—as most wrong theories are!
+
+VII.
+A Sudden Shock
+“As I stood there musing over this too perfect triumph of man, the full moon, yellow and gibbous, came up out of an overflow of silver light in the north-east. The bright little figures ceased to move about below, a noiseless owl flitted by, and I shivered with the chill of the night. I determined to descend and find where I could sleep.
+
+“I looked for the building I knew. Then my eye travelled along to the figure of the White Sphinx upon the pedestal of bronze, growing distinct as the light of the rising moon grew brighter. I could see the silver birch against it. There was the tangle of rhododendron bushes, black in the pale light, and there was the little lawn. I looked at the lawn again. A queer doubt chilled my complacency. ‘No,’ said I stoutly to myself, ‘that was not the lawn.’
+
+“But it was the lawn. For the white leprous face of the sphinx was towards it. Can you imagine what I felt as this conviction came home to me? But you cannot. The Time Machine was gone!
+
+“At once, like a lash across the face, came the possibility of losing my own age, of being left helpless in this strange new world. The bare thought of it was an actual physical sensation. I could feel it grip me at the throat and stop my breathing. In another moment I was in a passion of fear and running with great leaping strides down the slope. Once I fell headlong and cut my face; I lost no time in stanching the blood, but jumped up and ran on, with a warm trickle down my cheek and chin. All the time I ran I was saying to myself: ‘They have moved it a little, pushed it under the bushes out of the way.’ Nevertheless, I ran with all my might. All the time, with the certainty that sometimes comes with excessive dread, I knew that such assurance was folly, knew instinctively that the machine was removed out of my reach. My breath came with pain. I suppose I covered the whole distance from the hill crest to the little lawn, two miles perhaps, in ten minutes. And I am not a young man. I cursed aloud, as I ran, at my confident folly in leaving the machine, wasting good breath thereby. I cried aloud, and none answered. Not a creature seemed to be stirring in that moonlit world.
+
+“When I reached the lawn my worst fears were realised. Not a trace of the thing was to be seen. I felt faint and cold when I faced the empty space among the black tangle of bushes. I ran round it furiously, as if the thing might be hidden in a corner, and then stopped abruptly, with my hands clutching my hair. Above me towered the sphinx, upon the bronze pedestal, white, shining, leprous, in the light of the rising moon. It seemed to smile in mockery of my dismay.
+
+“I might have consoled myself by imagining the little people had put the mechanism in some shelter for me, had I not felt assured of their physical and intellectual inadequacy. That is what dismayed me: the sense of some hitherto unsuspected power, through whose intervention my invention had vanished. Yet, for one thing I felt assured: unless some other age had produced its exact duplicate, the machine could not have moved in time. The attachment of the levers—I will show you the method later—prevented anyone from tampering with it in that way when they were removed. It had moved, and was hid, only in space. But then, where could it be?
+
+“I think I must have had a kind of frenzy. I remember running violently in and out among the moonlit bushes all round the sphinx, and startling some white animal that, in the dim light, I took for a small deer. I remember, too, late that night, beating the bushes with my clenched fist until my knuckles were gashed and bleeding from the broken twigs. Then, sobbing and raving in my anguish of mind, I went down to the great building of stone. The big hall was dark, silent, and deserted. I slipped on the uneven floor, and fell over one of the malachite tables, almost breaking my shin. I lit a match and went on past the dusty curtains, of which I have told you.
+
+“There I found a second great hall covered with cushions, upon which, perhaps, a score or so of the little people were sleeping. I have no doubt they found my second appearance strange enough, coming suddenly out of the quiet darkness with inarticulate noises and the splutter and flare of a match. For they had forgotten about matches. ‘Where is my Time Machine?’ I began, bawling like an angry child, laying hands upon them and shaking them up together. It must have been very queer to them. Some laughed, most of them looked sorely frightened. When I saw them standing round me, it came into my head that I was doing as foolish a thing as it was possible for me to do under the circumstances, in trying to revive the sensation of fear. For, reasoning from their daylight behaviour, I thought that fear must be forgotten.
+
+“Abruptly, I dashed down the match, and knocking one of the people over in my course, went blundering across the big dining-hall again, out under the moonlight. I heard cries of terror and their little feet running and stumbling this way and that. I do not remember all I did as the moon crept up the sky. I suppose it was the unexpected nature of my loss that maddened me. I felt hopelessly cut off from my own kind—a strange animal in an unknown world. I must have raved to and fro, screaming and crying upon God and Fate. I have a memory of horrible fatigue, as the long night of despair wore away; of looking in this impossible place and that; of groping among moonlit ruins and touching strange creatures in the black shadows; at last, of lying on the ground near the sphinx and weeping with absolute wretchedness, even anger at the folly of leaving the machine having leaked away with my strength. I had nothing left but misery. Then I slept, and when I woke again it was full day, and a couple of sparrows were hopping round me on the turf within reach of my arm.
+
+“I sat up in the freshness of the morning, trying to remember how I had got there, and why I had such a profound sense of desertion and despair. Then things came clear in my mind. With the plain, reasonable daylight, I could look my circumstances fairly in the face. I saw the wild folly of my frenzy overnight, and I could reason with myself. ‘Suppose the worst?’ I said. ‘Suppose the machine altogether lost—perhaps destroyed? It behoves me to be calm and patient, to learn the way of the people, to get a clear idea of the method of my loss, and the means of getting materials and tools; so that in the end, perhaps, I may make another.’ That would be my only hope, a poor hope, perhaps, but better than despair. And, after all, it was a beautiful and curious world.
+
+“But probably the machine had only been taken away. Still, I must be calm and patient, find its hiding-place, and recover it by force or cunning. And with that I scrambled to my feet and looked about me, wondering where I could bathe. I felt weary, stiff, and travel-soiled. The freshness of the morning made me desire an equal freshness. I had exhausted my emotion. Indeed, as I went about my business, I found myself wondering at my intense excitement overnight. I made a careful examination of the ground about the little lawn. I wasted some time in futile questionings, conveyed, as well as I was able, to such of the little people as came by. They all failed to understand my gestures; some were simply stolid, some thought it was a jest and laughed at me. I had the hardest task in the world to keep my hands off their pretty laughing faces. It was a foolish impulse, but the devil begotten of fear and blind anger was ill curbed and still eager to take advantage of my perplexity. The turf gave better counsel. I found a groove ripped in it, about midway between the pedestal of the sphinx and the marks of my feet where, on arrival, I had struggled with the overturned machine. There were other signs of removal about, with queer narrow footprints like those I could imagine made by a sloth. This directed my closer attention to the pedestal. It was, as I think I have said, of bronze. It was not a mere block, but highly decorated with deep framed panels on either side. I went and rapped at these. The pedestal was hollow. Examining the panels with care I found them discontinuous with the frames. There were no handles or keyholes, but possibly the panels, if they were doors, as I supposed, opened from within. One thing was clear enough to my mind. It took no very great mental effort to infer that my Time Machine was inside that pedestal. But how it got there was a different problem.
+
+“I saw the heads of two orange-clad people coming through the bushes and under some blossom-covered apple-trees towards me. I turned smiling to them, and beckoned them to me. They came, and then, pointing to the bronze pedestal, I tried to intimate my wish to open it. But at my first gesture towards this they behaved very oddly. I don’t know how to convey their expression to you. Suppose you were to use a grossly improper gesture to a delicate-minded woman—it is how she would look. They went off as if they had received the last possible insult. I tried a sweet-looking little chap in white next, with exactly the same result. Somehow, his manner made me feel ashamed of myself. But, as you know, I wanted the Time Machine, and I tried him once more. As he turned off, like the others, my temper got the better of me. In three strides I was after him, had him by the loose part of his robe round the neck, and began dragging him towards the sphinx. Then I saw the horror and repugnance of his face, and all of a sudden I let him go.
+
+“But I was not beaten yet. I banged with my fist at the bronze panels. I thought I heard something stir inside—to be explicit, I thought I heard a sound like a chuckle—but I must have been mistaken. Then I got a big pebble from the river, and came and hammered till I had flattened a coil in the decorations, and the verdigris came off in powdery flakes. The delicate little people must have heard me hammering in gusty outbreaks a mile away on either hand, but nothing came of it. I saw a crowd of them upon the slopes, looking furtively at me. At last, hot and tired, I sat down to watch the place. But I was too restless to watch long; I am too Occidental for a long vigil. I could work at a problem for years, but to wait inactive for twenty-four hours—that is another matter.
+
+“I got up after a time, and began walking aimlessly through the bushes towards the hill again. ‘Patience,’ said I to myself. ‘If you want your machine again you must leave that sphinx alone. If they mean to take your machine away, it’s little good your wrecking their bronze panels, and if they don’t, you will get it back as soon as you can ask for it. To sit among all those unknown things before a puzzle like that is hopeless. That way lies monomania. Face this world. Learn its ways, watch it, be careful of too hasty guesses at its meaning. In the end you will find clues to it all.’ Then suddenly the humour of the situation came into my mind: the thought of the years I had spent in study and toil to get into the future age, and now my passion of anxiety to get out of it. I had made myself the most complicated and the most hopeless trap that ever a man devised. Although it was at my own expense, I could not help myself. I laughed aloud.
+
+“Going through the big palace, it seemed to me that the little people avoided me. It may have been my fancy, or it may have had something to do with my hammering at the gates of bronze. Yet I felt tolerably sure of the avoidance. I was careful, however, to show no concern and to abstain from any pursuit of them, and in the course of a day or two things got back to the old footing. I made what progress I could in the language, and in addition I pushed my explorations here and there. Either I missed some subtle point or their language was excessively simple—almost exclusively composed of concrete substantives and verbs. There seemed to be few, if any, abstract terms, or little use of figurative language. Their sentences were usually simple and of two words, and I failed to convey or understand any but the simplest propositions. I determined to put the thought of my Time Machine and the mystery of the bronze doors under the sphinx, as much as possible in a corner of memory, until my growing knowledge would lead me back to them in a natural way. Yet a certain feeling, you may understand, tethered me in a circle of a few miles round the point of my arrival.
+
+VIII.
+Explanation
+“So far as I could see, all the world displayed the same exuberant richness as the Thames valley. From every hill I climbed I saw the same abundance of splendid buildings, endlessly varied in material and style, the same clustering thickets of evergreens, the same blossom-laden trees and tree ferns. Here and there water shone like silver, and beyond, the land rose into blue undulating hills, and so faded into the serenity of the sky. A peculiar feature, which presently attracted my attention, was the presence of certain circular wells, several, as it seemed to me, of a very great depth. One lay by the path up the hill which I had followed during my first walk. Like the others, it was rimmed with bronze, curiously wrought, and protected by a little cupola from the rain. Sitting by the side of these wells, and peering down into the shafted darkness, I could see no gleam of water, nor could I start any reflection with a lighted match. But in all of them I heard a certain sound: a thud—thud—thud, like the beating of some big engine; and I discovered, from the flaring of my matches, that a steady current of air set down the shafts. Further, I threw a scrap of paper into the throat of one, and, instead of fluttering slowly down, it was at once sucked swiftly out of sight.
+
+“After a time, too, I came to connect these wells with tall towers standing here and there upon the slopes; for above them there was often just such a flicker in the air as one sees on a hot day above a sun-scorched beach. Putting things together, I reached a strong suggestion of an extensive system of subterranean ventilation, whose true import it was difficult to imagine. I was at first inclined to associate it with the sanitary apparatus of these people. It was an obvious conclusion, but it was absolutely wrong.
+
+“And here I must admit that I learnt very little of drains and bells and modes of conveyance, and the like conveniences, during my time in this real future. In some of these visions of Utopias and coming times which I have read, there is a vast amount of detail about building, and social arrangements, and so forth. But while such details are easy enough to obtain when the whole world is contained in one’s imagination, they are altogether inaccessible to a real traveller amid such realities as I found here. Conceive the tale of London which a negro, fresh from Central Africa, would take back to his tribe! What would he know of railway companies, of social movements, of telephone and telegraph wires, of the Parcels Delivery Company, and postal orders and the like? Yet we, at least, should be willing enough to explain these things to him! And even of what he knew, how much could he make his untravelled friend either apprehend or believe? Then, think how narrow the gap between a negro and a white man of our own times, and how wide the interval between myself and these of the Golden Age! I was sensible of much which was unseen, and which contributed to my comfort; but save for a general impression of automatic organisation, I fear I can convey very little of the difference to your mind.
+
+“In the matter of sepulture, for instance, I could see no signs of crematoria nor anything suggestive of tombs. But it occurred to me that, possibly, there might be cemeteries (or crematoria) somewhere beyond the range of my explorings. This, again, was a question I deliberately put to myself, and my curiosity was at first entirely defeated upon the point. The thing puzzled me, and I was led to make a further remark, which puzzled me still more: that aged and infirm among this people there were none.
+
+“I must confess that my satisfaction with my first theories of an automatic civilisation and a decadent humanity did not long endure. Yet I could think of no other. Let me put my difficulties. The several big palaces I had explored were mere living places, great dining-halls and sleeping apartments. I could find no machinery, no appliances of any kind. Yet these people were clothed in pleasant fabrics that must at times need renewal, and their sandals, though undecorated, were fairly complex specimens of metalwork. Somehow such things must be made. And the little people displayed no vestige of a creative tendency. There were no shops, no workshops, no sign of importations among them. They spent all their time in playing gently, in bathing in the river, in making love in a half-playful fashion, in eating fruit and sleeping. I could not see how things were kept going.
+
+“Then, again, about the Time Machine: something, I knew not what, had taken it into the hollow pedestal of the White Sphinx. Why? For the life of me I could not imagine. Those waterless wells, too, those flickering pillars. I felt I lacked a clue. I felt—how shall I put it? Suppose you found an inscription, with sentences here and there in excellent plain English, and interpolated therewith, others made up of words, of letters even, absolutely unknown to you? Well, on the third day of my visit, that was how the world of Eight Hundred and Two Thousand Seven Hundred and One presented itself to me!
+
+“That day, too, I made a friend—of a sort. It happened that, as I was watching some of the little people bathing in a shallow, one of them was seized with cramp and began drifting downstream. The main current ran rather swiftly, but not too strongly for even a moderate swimmer. It will give you an idea, therefore, of the strange deficiency in these creatures, when I tell you that none made the slightest attempt to rescue the weakly crying little thing which was drowning before their eyes. When I realised this, I hurriedly slipped off my clothes, and, wading in at a point lower down, I caught the poor mite and drew her safe to land. A little rubbing of the limbs soon brought her round, and I had the satisfaction of seeing she was all right before I left her. I had got to such a low estimate of her kind that I did not expect any gratitude from her. In that, however, I was wrong.
+
+“This happened in the morning. In the afternoon I met my little woman, as I believe it was, as I was returning towards my centre from an exploration, and she received me with cries of delight and presented me with a big garland of flowers—evidently made for me and me alone. The thing took my imagination. Very possibly I had been feeling desolate. At any rate I did my best to display my appreciation of the gift. We were soon seated together in a little stone arbour, engaged in conversation, chiefly of smiles. The creature’s friendliness affected me exactly as a child’s might have done. We passed each other flowers, and she kissed my hands. I did the same to hers. Then I tried talk, and found that her name was Weena, which, though I don’t know what it meant, somehow seemed appropriate enough. That was the beginning of a queer friendship which lasted a week, and ended—as I will tell you!
+
+“She was exactly like a child. She wanted to be with me always. She tried to follow me everywhere, and on my next journey out and about it went to my heart to tire her down, and leave her at last, exhausted and calling after me rather plaintively. But the problems of the world had to be mastered. I had not, I said to myself, come into the future to carry on a miniature flirtation. Yet her distress when I left her was very great, her expostulations at the parting were sometimes frantic, and I think, altogether, I had as much trouble as comfort from her devotion. Nevertheless she was, somehow, a very great comfort. I thought it was mere childish affection that made her cling to me. Until it was too late, I did not clearly know what I had inflicted upon her when I left her. Nor until it was too late did I clearly understand what she was to me. For, by merely seeming fond of me, and showing in her weak, futile way that she cared for me, the little doll of a creature presently gave my return to the neighbourhood of the White Sphinx almost the feeling of coming home; and I would watch for her tiny figure of white and gold so soon as I came over the hill.
+
+“It was from her, too, that I learnt that fear had not yet left the world. She was fearless enough in the daylight, and she had the oddest confidence in me; for once, in a foolish moment, I made threatening grimaces at her, and she simply laughed at them. But she dreaded the dark, dreaded shadows, dreaded black things. Darkness to her was the one thing dreadful. It was a singularly passionate emotion, and it set me thinking and observing. I discovered then, among other things, that these little people gathered into the great houses after dark, and slept in droves. To enter upon them without a light was to put them into a tumult of apprehension. I never found one out of doors, or one sleeping alone within doors, after dark. Yet I was still such a blockhead that I missed the lesson of that fear, and in spite of Weena’s distress, I insisted upon sleeping away from these slumbering multitudes.
+
+“It troubled her greatly, but in the end her odd affection for me triumphed, and for five of the nights of our acquaintance, including the last night of all, she slept with her head pillowed on my arm. But my story slips away from me as I speak of her. It must have been the night before her rescue that I was awakened about dawn. I had been restless, dreaming most disagreeably that I was drowned, and that sea anemones were feeling over my face with their soft palps. I woke with a start, and with an odd fancy that some greyish animal had just rushed out of the chamber. I tried to get to sleep again, but I felt restless and uncomfortable. It was that dim grey hour when things are just creeping out of darkness, when everything is colourless and clear cut, and yet unreal. I got up, and went down into the great hall, and so out upon the flagstones in front of the palace. I thought I would make a virtue of necessity, and see the sunrise.
+
+“The moon was setting, and the dying moonlight and the first pallor of dawn were mingled in a ghastly half-light. The bushes were inky black, the ground a sombre grey, the sky colourless and cheerless. And up the hill I thought I could see ghosts. Three several times, as I scanned the slope, I saw white figures. Twice I fancied I saw a solitary white, ape-like creature running rather quickly up the hill, and once near the ruins I saw a leash of them carrying some dark body. They moved hastily. I did not see what became of them. It seemed that they vanished among the bushes. The dawn was still indistinct, you must understand. I was feeling that chill, uncertain, early-morning feeling you may have known. I doubted my eyes.
+
+“As the eastern sky grew brighter, and the light of the day came on and its vivid colouring returned upon the world once more, I scanned the view keenly. But I saw no vestige of my white figures. They were mere creatures of the half-light. ‘They must have been ghosts,’ I said; ‘I wonder whence they dated.’ For a queer notion of Grant Allen’s came into my head, and amused me. If each generation die and leave ghosts, he argued, the world at last will get overcrowded with them. On that theory they would have grown innumerable some Eight Hundred Thousand Years hence, and it was no great wonder to see four at once. But the jest was unsatisfying, and I was thinking of these figures all the morning, until Weena’s rescue drove them out of my head. I associated them in some indefinite way with the white animal I had startled in my first passionate search for the Time Machine. But Weena was a pleasant substitute. Yet all the same, they were soon destined to take far deadlier possession of my mind.
+
+“I think I have said how much hotter than our own was the weather of this Golden Age. I cannot account for it. It may be that the sun was hotter, or the earth nearer the sun. It is usual to assume that the sun will go on cooling steadily in the future. But people, unfamiliar with such speculations as those of the younger Darwin, forget that the planets must ultimately fall back one by one into the parent body. As these catastrophes occur, the sun will blaze with renewed energy; and it may be that some inner planet had suffered this fate. Whatever the reason, the fact remains that the sun was very much hotter than we know it.
+
+“Well, one very hot morning—my fourth, I think—as I was seeking shelter from the heat and glare in a colossal ruin near the great house where I slept and fed, there happened this strange thing. Clambering among these heaps of masonry, I found a narrow gallery, whose end and side windows were blocked by fallen masses of stone. By contrast with the brilliancy outside, it seemed at first impenetrably dark to me. I entered it groping, for the change from light to blackness made spots of colour swim before me. Suddenly I halted spellbound. A pair of eyes, luminous by reflection against the daylight without, was watching me out of the darkness.
+
+“The old instinctive dread of wild beasts came upon me. I clenched my hands and steadfastly looked into the glaring eyeballs. I was afraid to turn. Then the thought of the absolute security in which humanity appeared to be living came to my mind. And then I remembered that strange terror of the dark. Overcoming my fear to some extent, I advanced a step and spoke. I will admit that my voice was harsh and ill-controlled. I put out my hand and touched something soft. At once the eyes darted sideways, and something white ran past me. I turned with my heart in my mouth, and saw a queer little ape-like figure, its head held down in a peculiar manner, running across the sunlit space behind me. It blundered against a block of granite, staggered aside, and in a moment was hidden in a black shadow beneath another pile of ruined masonry.
+
+“My impression of it is, of course, imperfect; but I know it was a dull white, and had strange large greyish-red eyes; also that there was flaxen hair on its head and down its back. But, as I say, it went too fast for me to see distinctly. I cannot even say whether it ran on all fours, or only with its forearms held very low. After an instant’s pause I followed it into the second heap of ruins. I could not find it at first; but, after a time in the profound obscurity, I came upon one of those round well-like openings of which I have told you, half closed by a fallen pillar. A sudden thought came to me. Could this Thing have vanished down the shaft? I lit a match, and, looking down, I saw a small, white, moving creature, with large bright eyes which regarded me steadfastly as it retreated. It made me shudder. It was so like a human spider! It was clambering down the wall, and now I saw for the first time a number of metal foot and hand rests forming a kind of ladder down the shaft. Then the light burned my fingers and fell out of my hand, going out as it dropped, and when I had lit another the little monster had disappeared.
+
+“I do not know how long I sat peering down that well. It was not for some time that I could succeed in persuading myself that the thing I had seen was human. But, gradually, the truth dawned on me: that Man had not remained one species, but had differentiated into two distinct animals: that my graceful children of the Upper World were not the sole descendants of our generation, but that this bleached, obscene, nocturnal Thing, which had flashed before me, was also heir to all the ages.
+
+“I thought of the flickering pillars and of my theory of an underground ventilation. I began to suspect their true import. And what, I wondered, was this Lemur doing in my scheme of a perfectly balanced organisation? How was it related to the indolent serenity of the beautiful Overworlders? And what was hidden down there, at the foot of that shaft? I sat upon the edge of the well telling myself that, at any rate, there was nothing to fear, and that there I must descend for the solution of my difficulties. And withal I was absolutely afraid to go! As I hesitated, two of the beautiful upperworld people came running in their amorous sport across the daylight in the shadow. The male pursued the female, flinging flowers at her as he ran.
+
+“They seemed distressed to find me, my arm against the overturned pillar, peering down the well. Apparently it was considered bad form to remark these apertures; for when I pointed to this one, and tried to frame a question about it in their tongue, they were still more visibly distressed and turned away. But they were interested by my matches, and I struck some to amuse them. I tried them again about the well, and again I failed. So presently I left them, meaning to go back to Weena, and see what I could get from her. But my mind was already in revolution; my guesses and impressions were slipping and sliding to a new adjustment. I had now a clue to the import of these wells, to the ventilating towers, to the mystery of the ghosts; to say nothing of a hint at the meaning of the bronze gates and the fate of the Time Machine! And very vaguely there came a suggestion towards the solution of the economic problem that had puzzled me.
+
+“Here was the new view. Plainly, this second species of Man was subterranean. There were three circumstances in particular which made me think that its rare emergence above ground was the outcome of a long-continued underground habit. In the first place, there was the bleached look common in most animals that live largely in the dark—the white fish of the Kentucky caves, for instance. Then, those large eyes, with that capacity for reflecting light, are common features of nocturnal things—witness the owl and the cat. And last of all, that evident confusion in the sunshine, that hasty yet fumbling awkward flight towards dark shadow, and that peculiar carriage of the head while in the light—all reinforced the theory of an extreme sensitiveness of the retina.
+
+“Beneath my feet, then, the earth must be tunnelled enormously, and these tunnellings were the habitat of the New Race. The presence of ventilating shafts and wells along the hill slopes—everywhere, in fact, except along the river valley—showed how universal were its ramifications. What so natural, then, as to assume that it was in this artificial Underworld that such work as was necessary to the comfort of the daylight race was done? The notion was so plausible that I at once accepted it, and went on to assume the how of this splitting of the human species. I dare say you will anticipate the shape of my theory; though, for myself, I very soon felt that it fell far short of the truth.
+
+“At first, proceeding from the problems of our own age, it seemed clear as daylight to me that the gradual widening of the present merely temporary and social difference between the Capitalist and the Labourer was the key to the whole position. No doubt it will seem grotesque enough to you—and wildly incredible!—and yet even now there are existing circumstances to point that way. There is a tendency to utilise underground space for the less ornamental purposes of civilisation; there is the Metropolitan Railway in London, for instance, there are new electric railways, there are subways, there are underground workrooms and restaurants, and they increase and multiply. Evidently, I thought, this tendency had increased till Industry had gradually lost its birthright in the sky. I mean that it had gone deeper and deeper into larger and ever larger underground factories, spending a still-increasing amount of its time therein, till, in the end—! Even now, does not an East-end worker live in such artificial conditions as practically to be cut off from the natural surface of the earth?
+
+“Again, the exclusive tendency of richer people—due, no doubt, to the increasing refinement of their education, and the widening gulf between them and the rude violence of the poor—is already leading to the closing, in their interest, of considerable portions of the surface of the land. About London, for instance, perhaps half the prettier country is shut in against intrusion. And this same widening gulf—which is due to the length and expense of the higher educational process and the increased facilities for and temptations towards refined habits on the part of the rich—will make that exchange between class and class, that promotion by intermarriage which at present retards the splitting of our species along lines of social stratification, less and less frequent. So, in the end, above ground you must have the Haves, pursuing pleasure and comfort and beauty, and below ground the Have-nots, the Workers getting continually adapted to the conditions of their labour. Once they were there, they would no doubt have to pay rent, and not a little of it, for the ventilation of their caverns; and if they refused, they would starve or be suffocated for arrears. Such of them as were so constituted as to be miserable and rebellious would die; and, in the end, the balance being permanent, the survivors would become as well adapted to the conditions of underground life, and as happy in their way, as the Overworld people were to theirs. As it seemed to me, the refined beauty and the etiolated pallor followed naturally enough.
+
+“The great triumph of Humanity I had dreamed of took a different shape in my mind. It had been no such triumph of moral education and general co-operation as I had imagined. Instead, I saw a real aristocracy, armed with a perfected science and working to a logical conclusion the industrial system of today. Its triumph had not been simply a triumph over Nature, but a triumph over Nature and the fellow-man. This, I must warn you, was my theory at the time. I had no convenient cicerone in the pattern of the Utopian books. My explanation may be absolutely wrong. I still think it is the most plausible one. But even on this supposition the balanced civilisation that was at last attained must have long since passed its zenith, and was now far fallen into decay. The too-perfect security of the Overworlders had led them to a slow movement of degeneration, to a general dwindling in size, strength, and intelligence. That I could see clearly enough already. What had happened to the Undergrounders I did not yet suspect; but, from what I had seen of the Morlocks—that, by the bye, was the name by which these creatures were called—I could imagine that the modification of the human type was even far more profound than among the ‘Eloi,’ the beautiful race that I already knew.
+
+“Then came troublesome doubts. Why had the Morlocks taken my Time Machine? For I felt sure it was they who had taken it. Why, too, if the Eloi were masters, could they not restore the machine to me? And why were they so terribly afraid of the dark? I proceeded, as I have said, to question Weena about this Underworld, but here again I was disappointed. At first she would not understand my questions, and presently she refused to answer them. She shivered as though the topic was unendurable. And when I pressed her, perhaps a little harshly, she burst into tears. They were the only tears, except my own, I ever saw in that Golden Age. When I saw them I ceased abruptly to trouble about the Morlocks, and was only concerned in banishing these signs of her human inheritance from Weena’s eyes. And very soon she was smiling and clapping her hands, while I solemnly burnt a match.
+
+IX.
+The Morlocks
+“It may seem odd to you, but it was two days before I could follow up the new-found clue in what was manifestly the proper way. I felt a peculiar shrinking from those pallid bodies. They were just the half-bleached colour of the worms and things one sees preserved in spirit in a zoological museum. And they were filthily cold to the touch. Probably my shrinking was largely due to the sympathetic influence of the Eloi, whose disgust of the Morlocks I now began to appreciate.
+
+“The next night I did not sleep well. Probably my health was a little disordered. I was oppressed with perplexity and doubt. Once or twice I had a feeling of intense fear for which I could perceive no definite reason. I remember creeping noiselessly into the great hall where the little people were sleeping in the moonlight—that night Weena was among them—and feeling reassured by their presence. It occurred to me even then, that in the course of a few days the moon must pass through its last quarter, and the nights grow dark, when the appearances of these unpleasant creatures from below, these whitened Lemurs, this new vermin that had replaced the old, might be more abundant. And on both these days I had the restless feeling of one who shirks an inevitable duty. I felt assured that the Time Machine was only to be recovered by boldly penetrating these mysteries of underground. Yet I could not face the mystery. If only I had had a companion it would have been different. But I was so horribly alone, and even to clamber down into the darkness of the well appalled me. I don’t know if you will understand my feeling, but I never felt quite safe at my back.
+
+“It was this restlessness, this insecurity, perhaps, that drove me farther and farther afield in my exploring expeditions. Going to the south-westward towards the rising country that is now called Combe Wood, I observed far-off, in the direction of nineteenth-century Banstead, a vast green structure, different in character from any I had hitherto seen. It was larger than the largest of the palaces or ruins I knew, and the façade had an Oriental look: the face of it having the lustre, as well as the pale-green tint, a kind of bluish-green, of a certain type of Chinese porcelain. This difference in aspect suggested a difference in use, and I was minded to push on and explore. But the day was growing late, and I had come upon the sight of the place after a long and tiring circuit; so I resolved to hold over the adventure for the following day, and I returned to the welcome and the caresses of little Weena. But next morning I perceived clearly enough that my curiosity regarding the Palace of Green Porcelain was a piece of self-deception, to enable me to shirk, by another day, an experience I dreaded. I resolved I would make the descent without further waste of time, and started out in the early morning towards a well near the ruins of granite and aluminium.
+
+“Little Weena ran with me. She danced beside me to the well, but when she saw me lean over the mouth and look downward, she seemed strangely disconcerted. ‘Good-bye, little Weena,’ I said, kissing her; and then putting her down, I began to feel over the parapet for the climbing hooks. Rather hastily, I may as well confess, for I feared my courage might leak away! At first she watched me in amazement. Then she gave a most piteous cry, and running to me, she began to pull at me with her little hands. I think her opposition nerved me rather to proceed. I shook her off, perhaps a little roughly, and in another moment I was in the throat of the well. I saw her agonised face over the parapet, and smiled to reassure her. Then I had to look down at the unstable hooks to which I clung.
+
+“I had to clamber down a shaft of perhaps two hundred yards. The descent was effected by means of metallic bars projecting from the sides of the well, and these being adapted to the needs of a creature much smaller and lighter than myself, I was speedily cramped and fatigued by the descent. And not simply fatigued! One of the bars bent suddenly under my weight, and almost swung me off into the blackness beneath. For a moment I hung by one hand, and after that experience I did not dare to rest again. Though my arms and back were presently acutely painful, I went on clambering down the sheer descent with as quick a motion as possible. Glancing upward, I saw the aperture, a small blue disc, in which a star was visible, while little Weena’s head showed as a round black projection. The thudding sound of a machine below grew louder and more oppressive. Everything save that little disc above was profoundly dark, and when I looked up again Weena had disappeared.
+
+“I was in an agony of discomfort. I had some thought of trying to go up the shaft again, and leave the Underworld alone. But even while I turned this over in my mind I continued to descend. At last, with intense relief, I saw dimly coming up, a foot to the right of me, a slender loophole in the wall. Swinging myself in, I found it was the aperture of a narrow horizontal tunnel in which I could lie down and rest. It was not too soon. My arms ached, my back was cramped, and I was trembling with the prolonged terror of a fall. Besides this, the unbroken darkness had had a distressing effect upon my eyes. The air was full of the throb and hum of machinery pumping air down the shaft.
+
+“I do not know how long I lay. I was aroused by a soft hand touching my face. Starting up in the darkness I snatched at my matches and, hastily striking one, I saw three stooping white creatures similar to the one I had seen above ground in the ruin, hastily retreating before the light. Living, as they did, in what appeared to me impenetrable darkness, their eyes were abnormally large and sensitive, just as are the pupils of the abysmal fishes, and they reflected the light in the same way. I have no doubt they could see me in that rayless obscurity, and they did not seem to have any fear of me apart from the light. But, so soon as I struck a match in order to see them, they fled incontinently, vanishing into dark gutters and tunnels, from which their eyes glared at me in the strangest fashion.
+
+“I tried to call to them, but the language they had was apparently different from that of the Overworld people; so that I was needs left to my own unaided efforts, and the thought of flight before exploration was even then in my mind. But I said to myself, ‘You are in for it now,’ and, feeling my way along the tunnel, I found the noise of machinery grow louder. Presently the walls fell away from me, and I came to a large open space, and striking another match, saw that I had entered a vast arched cavern, which stretched into utter darkness beyond the range of my light. The view I had of it was as much as one could see in the burning of a match.
+
+“Necessarily my memory is vague. Great shapes like big machines rose out of the dimness, and cast grotesque black shadows, in which dim spectral Morlocks sheltered from the glare. The place, by the bye, was very stuffy and oppressive, and the faint halitus of freshly-shed blood was in the air. Some way down the central vista was a little table of white metal, laid with what seemed a meal. The Morlocks at any rate were carnivorous! Even at the time, I remember wondering what large animal could have survived to furnish the red joint I saw. It was all very indistinct: the heavy smell, the big unmeaning shapes, the obscene figures lurking in the shadows, and only waiting for the darkness to come at me again! Then the match burnt down, and stung my fingers, and fell, a wriggling red spot in the blackness.
+
+“I have thought since how particularly ill-equipped I was for such an experience. When I had started with the Time Machine, I had started with the absurd assumption that the men of the Future would certainly be infinitely ahead of ourselves in all their appliances. I had come without arms, without medicine, without anything to smoke—at times I missed tobacco frightfully!—even without enough matches. If only I had thought of a Kodak! I could have flashed that glimpse of the Underworld in a second, and examined it at leisure. But, as it was, I stood there with only the weapons and the powers that Nature had endowed me with—hands, feet, and teeth; these, and four safety-matches that still remained to me.
+
+“I was afraid to push my way in among all this machinery in the dark, and it was only with my last glimpse of light I discovered that my store of matches had run low. It had never occurred to me until that moment that there was any need to economise them, and I had wasted almost half the box in astonishing the Overworlders, to whom fire was a novelty. Now, as I say, I had four left, and while I stood in the dark, a hand touched mine, lank fingers came feeling over my face, and I was sensible of a peculiar unpleasant odour. I fancied I heard the breathing of a crowd of those dreadful little beings about me. I felt the box of matches in my hand being gently disengaged, and other hands behind me plucking at my clothing. The sense of these unseen creatures examining me was indescribably unpleasant. The sudden realisation of my ignorance of their ways of thinking and doing came home to me very vividly in the darkness. I shouted at them as loudly as I could. They started away, and then I could feel them approaching me again. They clutched at me more boldly, whispering odd sounds to each other. I shivered violently, and shouted again—rather discordantly. This time they were not so seriously alarmed, and they made a queer laughing noise as they came back at me. I will confess I was horribly frightened. I determined to strike another match and escape under the protection of its glare. I did so, and eking out the flicker with a scrap of paper from my pocket, I made good my retreat to the narrow tunnel. But I had scarce entered this when my light was blown out and in the blackness I could hear the Morlocks rustling like wind among leaves, and pattering like the rain, as they hurried after me.
+
+“In a moment I was clutched by several hands, and there was no mistaking that they were trying to haul me back. I struck another light, and waved it in their dazzled faces. You can scarce imagine how nauseatingly inhuman they looked—those pale, chinless faces and great, lidless, pinkish-grey eyes!—as they stared in their blindness and bewilderment. But I did not stay to look, I promise you: I retreated again, and when my second match had ended, I struck my third. It had almost burnt through when I reached the opening into the shaft. I lay down on the edge, for the throb of the great pump below made me giddy. Then I felt sideways for the projecting hooks, and, as I did so, my feet were grasped from behind, and I was violently tugged backward. I lit my last match … and it incontinently went out. But I had my hand on the climbing bars now, and, kicking violently, I disengaged myself from the clutches of the Morlocks, and was speedily clambering up the shaft, while they stayed peering and blinking up at me: all but one little wretch who followed me for some way, and well-nigh secured my boot as a trophy.
+
+“That climb seemed interminable to me. With the last twenty or thirty feet of it a deadly nausea came upon me. I had the greatest difficulty in keeping my hold. The last few yards was a frightful struggle against this faintness. Several times my head swam, and I felt all the sensations of falling. At last, however, I got over the well-mouth somehow, and staggered out of the ruin into the blinding sunlight. I fell upon my face. Even the soil smelt sweet and clean. Then I remember Weena kissing my hands and ears, and the voices of others among the Eloi. Then, for a time, I was insensible.
+
+X.
+When Night Came
+“Now, indeed, I seemed in a worse case than before. Hitherto, except during my night’s anguish at the loss of the Time Machine, I had felt a sustaining hope of ultimate escape, but that hope was staggered by these new discoveries. Hitherto I had merely thought myself impeded by the childish simplicity of the little people, and by some unknown forces which I had only to understand to overcome; but there was an altogether new element in the sickening quality of the Morlocks—a something inhuman and malign. Instinctively I loathed them. Before, I had felt as a man might feel who had fallen into a pit: my concern was with the pit and how to get out of it. Now I felt like a beast in a trap, whose enemy would come upon him soon.
+
+“The enemy I dreaded may surprise you. It was the darkness of the new moon. Weena had put this into my head by some at first incomprehensible remarks about the Dark Nights. It was not now such a very difficult problem to guess what the coming Dark Nights might mean. The moon was on the wane: each night there was a longer interval of darkness. And I now understood to some slight degree at least the reason of the fear of the little Upperworld people for the dark. I wondered vaguely what foul villainy it might be that the Morlocks did under the new moon. I felt pretty sure now that my second hypothesis was all wrong. The Upperworld people might once have been the favoured aristocracy, and the Morlocks their mechanical servants: but that had long since passed away. The two species that had resulted from the evolution of man were sliding down towards, or had already arrived at, an altogether new relationship. The Eloi, like the Carlovingian kings, had decayed to a mere beautiful futility. They still possessed the earth on sufferance: since the Morlocks, subterranean for innumerable generations, had come at last to find the daylit surface intolerable. And the Morlocks made their garments, I inferred, and maintained them in their habitual needs, perhaps through the survival of an old habit of service. They did it as a standing horse paws with his foot, or as a man enjoys killing animals in sport: because ancient and departed necessities had impressed it on the organism. But, clearly, the old order was already in part reversed. The Nemesis of the delicate ones was creeping on apace. Ages ago, thousands of generations ago, man had thrust his brother man out of the ease and the sunshine. And now that brother was coming back—changed! Already the Eloi had begun to learn one old lesson anew. They were becoming reacquainted with Fear. And suddenly there came into my head the memory of the meat I had seen in the Underworld. It seemed odd how it floated into my mind: not stirred up as it were by the current of my meditations, but coming in almost like a question from outside. I tried to recall the form of it. I had a vague sense of something familiar, but I could not tell what it was at the time.
+
+“Still, however helpless the little people in the presence of their mysterious Fear, I was differently constituted. I came out of this age of ours, this ripe prime of the human race, when Fear does not paralyse and mystery has lost its terrors. I at least would defend myself. Without further delay I determined to make myself arms and a fastness where I might sleep. With that refuge as a base, I could face this strange world with some of that confidence I had lost in realising to what creatures night by night I lay exposed. I felt I could never sleep again until my bed was secure from them. I shuddered with horror to think how they must already have examined me.
+
+“I wandered during the afternoon along the valley of the Thames, but found nothing that commended itself to my mind as inaccessible. All the buildings and trees seemed easily practicable to such dexterous climbers as the Morlocks, to judge by their wells, must be. Then the tall pinnacles of the Palace of Green Porcelain and the polished gleam of its walls came back to my memory; and in the evening, taking Weena like a child upon my shoulder, I went up the hills towards the south-west. The distance, I had reckoned, was seven or eight miles, but it must have been nearer eighteen. I had first seen the place on a moist afternoon when distances are deceptively diminished. In addition, the heel of one of my shoes was loose, and a nail was working through the sole—they were comfortable old shoes I wore about indoors—so that I was lame. And it was already long past sunset when I came in sight of the palace, silhouetted black against the pale yellow of the sky.
+
+“Weena had been hugely delighted when I began to carry her, but after a while she desired me to let her down, and ran along by the side of me, occasionally darting off on either hand to pick flowers to stick in my pockets. My pockets had always puzzled Weena, but at the last she had concluded that they were an eccentric kind of vases for floral decoration. At least she utilised them for that purpose. And that reminds me! In changing my jacket I found…”
+
+The Time Traveller paused, put his hand into his pocket, and silently placed two withered flowers, not unlike very large white mallows, upon the little table. Then he resumed his narrative.
+
+“As the hush of evening crept over the world and we proceeded over the hill crest towards Wimbledon, Weena grew tired and wanted to return to the house of grey stone. But I pointed out the distant pinnacles of the Palace of Green Porcelain to her, and contrived to make her understand that we were seeking a refuge there from her Fear. You know that great pause that comes upon things before the dusk? Even the breeze stops in the trees. To me there is always an air of expectation about that evening stillness. The sky was clear, remote, and empty save for a few horizontal bars far down in the sunset. Well, that night the expectation took the colour of my fears. In that darkling calm my senses seemed preternaturally sharpened. I fancied I could even feel the hollowness of the ground beneath my feet: could, indeed, almost see through it the Morlocks on their ant-hill going hither and thither and waiting for the dark. In my excitement I fancied that they would receive my invasion of their burrows as a declaration of war. And why had they taken my Time Machine?
+
+“So we went on in the quiet, and the twilight deepened into night. The clear blue of the distance faded, and one star after another came out. The ground grew dim and the trees black. Weena’s fears and her fatigue grew upon her. I took her in my arms and talked to her and caressed her. Then, as the darkness grew deeper, she put her arms round my neck, and, closing her eyes, tightly pressed her face against my shoulder. So we went down a long slope into a valley, and there in the dimness I almost walked into a little river. This I waded, and went up the opposite side of the valley, past a number of sleeping houses, and by a statue—a Faun, or some such figure, minus the head. Here too were acacias. So far I had seen nothing of the Morlocks, but it was yet early in the night, and the darker hours before the old moon rose were still to come.
+
+“From the brow of the next hill I saw a thick wood spreading wide and black before me. I hesitated at this. I could see no end to it, either to the right or the left. Feeling tired—my feet, in particular, were very sore—I carefully lowered Weena from my shoulder as I halted, and sat down upon the turf. I could no longer see the Palace of Green Porcelain, and I was in doubt of my direction. I looked into the thickness of the wood and thought of what it might hide. Under that dense tangle of branches one would be out of sight of the stars. Even were there no other lurking danger—a danger I did not care to let my imagination loose upon—there would still be all the roots to stumble over and the tree-boles to strike against. I was very tired, too, after the excitements of the day; so I decided that I would not face it, but would pass the night upon the open hill.
+
+“Weena, I was glad to find, was fast asleep. I carefully wrapped her in my jacket, and sat down beside her to wait for the moonrise. The hillside was quiet and deserted, but from the black of the wood there came now and then a stir of living things. Above me shone the stars, for the night was very clear. I felt a certain sense of friendly comfort in their twinkling. All the old constellations had gone from the sky, however: that slow movement which is imperceptible in a hundred human lifetimes, had long since rearranged them in unfamiliar groupings. But the Milky Way, it seemed to me, was still the same tattered streamer of star-dust as of yore. Southward (as I judged it) was a very bright red star that was new to me; it was even more splendid than our own green Sirius. And amid all these scintillating points of light one bright planet shone kindly and steadily like the face of an old friend.
+
+“Looking at these stars suddenly dwarfed my own troubles and all the gravities of terrestrial life. I thought of their unfathomable distance, and the slow inevitable drift of their movements out of the unknown past into the unknown future. I thought of the great precessional cycle that the pole of the earth describes. Only forty times had that silent revolution occurred during all the years that I had traversed. And during these few revolutions all the activity, all the traditions, the complex organisations, the nations, languages, literatures, aspirations, even the mere memory of Man as I knew him, had been swept out of existence. Instead were these frail creatures who had forgotten their high ancestry, and the white Things of which I went in terror. Then I thought of the Great Fear that was between the two species, and for the first time, with a sudden shiver, came the clear knowledge of what the meat I had seen might be. Yet it was too horrible! I looked at little Weena sleeping beside me, her face white and starlike under the stars, and forthwith dismissed the thought.
+
+“Through that long night I held my mind off the Morlocks as well as I could, and whiled away the time by trying to fancy I could find signs of the old constellations in the new confusion. The sky kept very clear, except for a hazy cloud or so. No doubt I dozed at times. Then, as my vigil wore on, came a faintness in the eastward sky, like the reflection of some colourless fire, and the old moon rose, thin and peaked and white. And close behind, and overtaking it, and overflowing it, the dawn came, pale at first, and then growing pink and warm. No Morlocks had approached us. Indeed, I had seen none upon the hill that night. And in the confidence of renewed day it almost seemed to me that my fear had been unreasonable. I stood up and found my foot with the loose heel swollen at the ankle and painful under the heel; so I sat down again, took off my shoes, and flung them away.
+
+“I awakened Weena, and we went down into the wood, now green and pleasant instead of black and forbidding. We found some fruit wherewith to break our fast. We soon met others of the dainty ones, laughing and dancing in the sunlight as though there was no such thing in nature as the night. And then I thought once more of the meat that I had seen. I felt assured now of what it was, and from the bottom of my heart I pitied this last feeble rill from the great flood of humanity. Clearly, at some time in the Long-Ago of human decay the Morlocks’ food had run short. Possibly they had lived on rats and such-like vermin. Even now man is far less discriminating and exclusive in his food than he was—far less than any monkey. His prejudice against human flesh is no deep-seated instinct. And so these inhuman sons of men——! I tried to look at the thing in a scientific spirit. After all, they were less human and more remote than our cannibal ancestors of three or four thousand years ago. And the intelligence that would have made this state of things a torment had gone. Why should I trouble myself? These Eloi were mere fatted cattle, which the ant-like Morlocks preserved and preyed upon—probably saw to the breeding of. And there was Weena dancing at my side!
+
+“Then I tried to preserve myself from the horror that was coming upon me, by regarding it as a rigorous punishment of human selfishness. Man had been content to live in ease and delight upon the labours of his fellow-man, had taken Necessity as his watchword and excuse, and in the fullness of time Necessity had come home to him. I even tried a Carlyle-like scorn of this wretched aristocracy in decay. But this attitude of mind was impossible. However great their intellectual degradation, the Eloi had kept too much of the human form not to claim my sympathy, and to make me perforce a sharer in their degradation and their Fear.
+
+“I had at that time very vague ideas as to the course I should pursue. My first was to secure some safe place of refuge, and to make myself such arms of metal or stone as I could contrive. That necessity was immediate. In the next place, I hoped to procure some means of fire, so that I should have the weapon of a torch at hand, for nothing, I knew, would be more efficient against these Morlocks. Then I wanted to arrange some contrivance to break open the doors of bronze under the White Sphinx. I had in mind a battering ram. I had a persuasion that if I could enter those doors and carry a blaze of light before me I should discover the Time Machine and escape. I could not imagine the Morlocks were strong enough to move it far away. Weena I had resolved to bring with me to our own time. And turning such schemes over in my mind I pursued our way towards the building which my fancy had chosen as our dwelling.
+
+XI.
+The Palace of Green Porcelain
+“I found the Palace of Green Porcelain, when we approached it about noon, deserted and falling into ruin. Only ragged vestiges of glass remained in its windows, and great sheets of the green facing had fallen away from the corroded metallic framework. It lay very high upon a turfy down, and looking north-eastward before I entered it, I was surprised to see a large estuary, or even creek, where I judged Wandsworth and Battersea must once have been. I thought then—though I never followed up the thought—of what might have happened, or might be happening, to the living things in the sea.
+
+“The material of the Palace proved on examination to be indeed porcelain, and along the face of it I saw an inscription in some unknown character. I thought, rather foolishly, that Weena might help me to interpret this, but I only learnt that the bare idea of writing had never entered her head. She always seemed to me, I fancy, more human than she was, perhaps because her affection was so human.
+
+“Within the big valves of the door—which were open and broken—we found, instead of the customary hall, a long gallery lit by many side windows. At the first glance I was reminded of a museum. The tiled floor was thick with dust, and a remarkable array of miscellaneous objects was shrouded in the same grey covering. Then I perceived, standing strange and gaunt in the centre of the hall, what was clearly the lower part of a huge skeleton. I recognised by the oblique feet that it was some extinct creature after the fashion of the Megatherium. The skull and the upper bones lay beside it in the thick dust, and in one place, where rain-water had dropped through a leak in the roof, the thing itself had been worn away. Further in the gallery was the huge skeleton barrel of a Brontosaurus. My museum hypothesis was confirmed. Going towards the side I found what appeared to be sloping shelves, and clearing away the thick dust, I found the old familiar glass cases of our own time. But they must have been air-tight to judge from the fair preservation of some of their contents.
+
+“Clearly we stood among the ruins of some latter-day South Kensington! Here, apparently, was the Palæontological Section, and a very splendid array of fossils it must have been, though the inevitable process of decay that had been staved off for a time, and had, through the extinction of bacteria and fungi, lost ninety-nine hundredths of its force, was, nevertheless, with extreme sureness if with extreme slowness, at work again upon all its treasures. Here and there I found traces of the little people in the shape of rare fossils broken to pieces or threaded in strings upon reeds. And the cases had in some instances been bodily removed—by the Morlocks, as I judged. The place was very silent. The thick dust deadened our footsteps. Weena, who had been rolling a sea urchin down the sloping glass of a case, presently came, as I stared about me, and very quietly took my hand and stood beside me.
+
+“And at first I was so much surprised by this ancient monument of an intellectual age that I gave no thought to the possibilities it presented. Even my preoccupation about the Time Machine receded a little from my mind.
+
+“To judge from the size of the place, this Palace of Green Porcelain had a great deal more in it than a Gallery of Palæontology; possibly historical galleries; it might be, even a library! To me, at least in my present circumstances, these would be vastly more interesting than this spectacle of old-time geology in decay. Exploring, I found another short gallery running transversely to the first. This appeared to be devoted to minerals, and the sight of a block of sulphur set my mind running on gunpowder. But I could find no saltpetre; indeed, no nitrates of any kind. Doubtless they had deliquesced ages ago. Yet the sulphur hung in my mind, and set up a train of thinking. As for the rest of the contents of that gallery, though on the whole they were the best preserved of all I saw, I had little interest. I am no specialist in mineralogy, and I went on down a very ruinous aisle running parallel to the first hall I had entered. Apparently this section had been devoted to natural history, but everything had long since passed out of recognition. A few shrivelled and blackened vestiges of what had once been stuffed animals, desiccated mummies in jars that had once held spirit, a brown dust of departed plants: that was all! I was sorry for that, because I should have been glad to trace the patient readjustments by which the conquest of animated nature had been attained. Then we came to a gallery of simply colossal proportions, but singularly ill-lit, the floor of it running downward at a slight angle from the end at which I entered. At intervals white globes hung from the ceiling—many of them cracked and smashed—which suggested that originally the place had been artificially lit. Here I was more in my element, for rising on either side of me were the huge bulks of big machines, all greatly corroded and many broken down, but some still fairly complete. You know I have a certain weakness for mechanism, and I was inclined to linger among these; the more so as for the most part they had the interest of puzzles, and I could make only the vaguest guesses at what they were for. I fancied that if I could solve their puzzles I should find myself in possession of powers that might be of use against the Morlocks.
+
+“Suddenly Weena came very close to my side. So suddenly that she startled me. Had it not been for her I do not think I should have noticed that the floor of the gallery sloped at all. [Footnote: It may be, of course, that the floor did not slope, but that the museum was built into the side of a hill.—ED.] The end I had come in at was quite above ground, and was lit by rare slit-like windows. As you went down the length, the ground came up against these windows, until at last there was a pit like the ‘area’ of a London house before each, and only a narrow line of daylight at the top. I went slowly along, puzzling about the machines, and had been too intent upon them to notice the gradual diminution of the light, until Weena’s increasing apprehensions drew my attention. Then I saw that the gallery ran down at last into a thick darkness. I hesitated, and then, as I looked round me, I saw that the dust was less abundant and its surface less even. Further away towards the dimness, it appeared to be broken by a number of small narrow footprints. My sense of the immediate presence of the Morlocks revived at that. I felt that I was wasting my time in the academic examination of machinery. I called to mind that it was already far advanced in the afternoon, and that I had still no weapon, no refuge, and no means of making a fire. And then down in the remote blackness of the gallery I heard a peculiar pattering, and the same odd noises I had heard down the well.
+
+“I took Weena’s hand. Then, struck with a sudden idea, I left her and turned to a machine from which projected a lever not unlike those in a signal-box. Clambering upon the stand, and grasping this lever in my hands, I put all my weight upon it sideways. Suddenly Weena, deserted in the central aisle, began to whimper. I had judged the strength of the lever pretty correctly, for it snapped after a minute’s strain, and I rejoined her with a mace in my hand more than sufficient, I judged, for any Morlock skull I might encounter. And I longed very much to kill a Morlock or so. Very inhuman, you may think, to want to go killing one’s own descendants! But it was impossible, somehow, to feel any humanity in the things. Only my disinclination to leave Weena, and a persuasion that if I began to slake my thirst for murder my Time Machine might suffer, restrained me from going straight down the gallery and killing the brutes I heard.
+
+“Well, mace in one hand and Weena in the other, I went out of that gallery and into another and still larger one, which at the first glance reminded me of a military chapel hung with tattered flags. The brown and charred rags that hung from the sides of it, I presently recognised as the decaying vestiges of books. They had long since dropped to pieces, and every semblance of print had left them. But here and there were warped boards and cracked metallic clasps that told the tale well enough. Had I been a literary man I might, perhaps, have moralised upon the futility of all ambition. But as it was, the thing that struck me with keenest force was the enormous waste of labour to which this sombre wilderness of rotting paper testified. At the time I will confess that I thought chiefly of the Philosophical Transactions and my own seventeen papers upon physical optics.
+
+“Then, going up a broad staircase, we came to what may once have been a gallery of technical chemistry. And here I had not a little hope of useful discoveries. Except at one end where the roof had collapsed, this gallery was well preserved. I went eagerly to every unbroken case. And at last, in one of the really air-tight cases, I found a box of matches. Very eagerly I tried them. They were perfectly good. They were not even damp. I turned to Weena. ‘Dance,’ I cried to her in her own tongue. For now I had a weapon indeed against the horrible creatures we feared. And so, in that derelict museum, upon the thick soft carpeting of dust, to Weena’s huge delight, I solemnly performed a kind of composite dance, whistling The Land of the Leal as cheerfully as I could. In part it was a modest cancan, in part a step dance, in part a skirt dance (so far as my tail-coat permitted), and in part original. For I am naturally inventive, as you know.
+
+“Now, I still think that for this box of matches to have escaped the wear of time for immemorial years was a most strange, as for me it was a most fortunate, thing. Yet, oddly enough, I found a far unlikelier substance, and that was camphor. I found it in a sealed jar, that by chance, I suppose, had been really hermetically sealed. I fancied at first that it was paraffin wax, and smashed the glass accordingly. But the odour of camphor was unmistakable. In the universal decay this volatile substance had chanced to survive, perhaps through many thousands of centuries. It reminded me of a sepia painting I had once seen done from the ink of a fossil Belemnite that must have perished and become fossilised millions of years ago. I was about to throw it away, but I remembered that it was inflammable and burnt with a good bright flame—was, in fact, an excellent candle—and I put it in my pocket. I found no explosives, however, nor any means of breaking down the bronze doors. As yet my iron crowbar was the most helpful thing I had chanced upon. Nevertheless I left that gallery greatly elated.
+
+“I cannot tell you all the story of that long afternoon. It would require a great effort of memory to recall my explorations in at all the proper order. I remember a long gallery of rusting stands of arms, and how I hesitated between my crowbar and a hatchet or a sword. I could not carry both, however, and my bar of iron promised best against the bronze gates. There were numbers of guns, pistols, and rifles. The most were masses of rust, but many were of some new metal, and still fairly sound. But any cartridges or powder there may once have been had rotted into dust. One corner I saw was charred and shattered; perhaps, I thought, by an explosion among the specimens. In another place was a vast array of idols—Polynesian, Mexican, Grecian, Phœnician, every country on earth, I should think. And here, yielding to an irresistible impulse, I wrote my name upon the nose of a steatite monster from South America that particularly took my fancy.
+
+“As the evening drew on, my interest waned. I went through gallery after gallery, dusty, silent, often ruinous, the exhibits sometimes mere heaps of rust and lignite, sometimes fresher. In one place I suddenly found myself near the model of a tin mine, and then by the merest accident I discovered, in an air-tight case, two dynamite cartridges! I shouted ‘Eureka!’ and smashed the case with joy. Then came a doubt. I hesitated. Then, selecting a little side gallery, I made my essay. I never felt such a disappointment as I did in waiting five, ten, fifteen minutes for an explosion that never came. Of course the things were dummies, as I might have guessed from their presence. I really believe that had they not been so, I should have rushed off incontinently and blown Sphinx, bronze doors, and (as it proved) my chances of finding the Time Machine, all together into non-existence.
+
+“It was after that, I think, that we came to a little open court within the palace. It was turfed, and had three fruit-trees. So we rested and refreshed ourselves. Towards sunset I began to consider our position. Night was creeping upon us, and my inaccessible hiding-place had still to be found. But that troubled me very little now. I had in my possession a thing that was, perhaps, the best of all defences against the Morlocks—I had matches! I had the camphor in my pocket, too, if a blaze were needed. It seemed to me that the best thing we could do would be to pass the night in the open, protected by a fire. In the morning there was the getting of the Time Machine. Towards that, as yet, I had only my iron mace. But now, with my growing knowledge, I felt very differently towards those bronze doors. Up to this, I had refrained from forcing them, largely because of the mystery on the other side. They had never impressed me as being very strong, and I hoped to find my bar of iron not altogether inadequate for the work.
+
+XII.
+In the Darkness
+“We emerged from the Palace while the sun was still in part above the horizon. I was determined to reach the White Sphinx early the next morning, and ere the dusk I purposed pushing through the woods that had stopped me on the previous journey. My plan was to go as far as possible that night, and then, building a fire, to sleep in the protection of its glare. Accordingly, as we went along I gathered any sticks or dried grass I saw, and presently had my arms full of such litter. Thus loaded, our progress was slower than I had anticipated, and besides Weena was tired. And I began to suffer from sleepiness too; so that it was full night before we reached the wood. Upon the shrubby hill of its edge Weena would have stopped, fearing the darkness before us; but a singular sense of impending calamity, that should indeed have served me as a warning, drove me onward. I had been without sleep for a night and two days, and I was feverish and irritable. I felt sleep coming upon me, and the Morlocks with it.
+
+“While we hesitated, among the black bushes behind us, and dim against their blackness, I saw three crouching figures. There was scrub and long grass all about us, and I did not feel safe from their insidious approach. The forest, I calculated, was rather less than a mile across. If we could get through it to the bare hillside, there, as it seemed to me, was an altogether safer resting-place; I thought that with my matches and my camphor I could contrive to keep my path illuminated through the woods. Yet it was evident that if I was to flourish matches with my hands I should have to abandon my firewood; so, rather reluctantly, I put it down. And then it came into my head that I would amaze our friends behind by lighting it. I was to discover the atrocious folly of this proceeding, but it came to my mind as an ingenious move for covering our retreat.
+
+“I don’t know if you have ever thought what a rare thing flame must be in the absence of man and in a temperate climate. The sun’s heat is rarely strong enough to burn, even when it is focused by dewdrops, as is sometimes the case in more tropical districts. Lightning may blast and blacken, but it rarely gives rise to widespread fire. Decaying vegetation may occasionally smoulder with the heat of its fermentation, but this rarely results in flame. In this decadence, too, the art of fire-making had been forgotten on the earth. The red tongues that went licking up my heap of wood were an altogether new and strange thing to Weena.
+
+“She wanted to run to it and play with it. I believe she would have cast herself into it had I not restrained her. But I caught her up, and in spite of her struggles, plunged boldly before me into the wood. For a little way the glare of my fire lit the path. Looking back presently, I could see, through the crowded stems, that from my heap of sticks the blaze had spread to some bushes adjacent, and a curved line of fire was creeping up the grass of the hill. I laughed at that, and turned again to the dark trees before me. It was very black, and Weena clung to me convulsively, but there was still, as my eyes grew accustomed to the darkness, sufficient light for me to avoid the stems. Overhead it was simply black, except where a gap of remote blue sky shone down upon us here and there. I lit none of my matches because I had no hand free. Upon my left arm I carried my little one, in my right hand I had my iron bar.
+
+“For some way I heard nothing but the crackling twigs under my feet, the faint rustle of the breeze above, and my own breathing and the throb of the blood-vessels in my ears. Then I seemed to know of a pattering behind me. I pushed on grimly. The pattering grew more distinct, and then I caught the same queer sound and voices I had heard in the Underworld. There were evidently several of the Morlocks, and they were closing in upon me. Indeed, in another minute I felt a tug at my coat, then something at my arm. And Weena shivered violently, and became quite still.
+
+“It was time for a match. But to get one I must put her down. I did so, and, as I fumbled with my pocket, a struggle began in the darkness about my knees, perfectly silent on her part and with the same peculiar cooing sounds from the Morlocks. Soft little hands, too, were creeping over my coat and back, touching even my neck. Then the match scratched and fizzed. I held it flaring, and saw the white backs of the Morlocks in flight amid the trees. I hastily took a lump of camphor from my pocket, and prepared to light it as soon as the match should wane. Then I looked at Weena. She was lying clutching my feet and quite motionless, with her face to the ground. With a sudden fright I stooped to her. She seemed scarcely to breathe. I lit the block of camphor and flung it to the ground, and as it split and flared up and drove back the Morlocks and the shadows, I knelt down and lifted her. The wood behind seemed full of the stir and murmur of a great company!
+
+“She seemed to have fainted. I put her carefully upon my shoulder and rose to push on, and then there came a horrible realisation. In manœuvring with my matches and Weena, I had turned myself about several times, and now I had not the faintest idea in what direction lay my path. For all I knew, I might be facing back towards the Palace of Green Porcelain. I found myself in a cold sweat. I had to think rapidly what to do. I determined to build a fire and encamp where we were. I put Weena, still motionless, down upon a turfy bole, and very hastily, as my first lump of camphor waned, I began collecting sticks and leaves. Here and there out of the darkness round me the Morlocks’ eyes shone like carbuncles.
+
+“The camphor flickered and went out. I lit a match, and as I did so, two white forms that had been approaching Weena dashed hastily away. One was so blinded by the light that he came straight for me, and I felt his bones grind under the blow of my fist. He gave a whoop of dismay, staggered a little way, and fell down. I lit another piece of camphor, and went on gathering my bonfire. Presently I noticed how dry was some of the foliage above me, for since my arrival on the Time Machine, a matter of a week, no rain had fallen. So, instead of casting about among the trees for fallen twigs, I began leaping up and dragging down branches. Very soon I had a choking smoky fire of green wood and dry sticks, and could economise my camphor. Then I turned to where Weena lay beside my iron mace. I tried what I could to revive her, but she lay like one dead. I could not even satisfy myself whether or not she breathed.
+
+“Now, the smoke of the fire beat over towards me, and it must have made me heavy of a sudden. Moreover, the vapour of camphor was in the air. My fire would not need replenishing for an hour or so. I felt very weary after my exertion, and sat down. The wood, too, was full of a slumbrous murmur that I did not understand. I seemed just to nod and open my eyes. But all was dark, and the Morlocks had their hands upon me. Flinging off their clinging fingers I hastily felt in my pocket for the match-box, and—it had gone! Then they gripped and closed with me again. In a moment I knew what had happened. I had slept, and my fire had gone out, and the bitterness of death came over my soul. The forest seemed full of the smell of burning wood. I was caught by the neck, by the hair, by the arms, and pulled down. It was indescribably horrible in the darkness to feel all these soft creatures heaped upon me. I felt as if I was in a monstrous spider’s web. I was overpowered, and went down. I felt little teeth nipping at my neck. I rolled over, and as I did so my hand came against my iron lever. It gave me strength. I struggled up, shaking the human rats from me, and, holding the bar short, I thrust where I judged their faces might be. I could feel the succulent giving of flesh and bone under my blows, and for a moment I was free.
+
+“The strange exultation that so often seems to accompany hard fighting came upon me. I knew that both I and Weena were lost, but I determined to make the Morlocks pay for their meat. I stood with my back to a tree, swinging the iron bar before me. The whole wood was full of the stir and cries of them. A minute passed. Their voices seemed to rise to a higher pitch of excitement, and their movements grew faster. Yet none came within reach. I stood glaring at the blackness. Then suddenly came hope. What if the Morlocks were afraid? And close on the heels of that came a strange thing. The darkness seemed to grow luminous. Very dimly I began to see the Morlocks about me—three battered at my feet—and then I recognised, with incredulous surprise, that the others were running, in an incessant stream, as it seemed, from behind me, and away through the wood in front. And their backs seemed no longer white, but reddish. As I stood agape, I saw a little red spark go drifting across a gap of starlight between the branches, and vanish. And at that I understood the smell of burning wood, the slumbrous murmur that was growing now into a gusty roar, the red glow, and the Morlocks’ flight.
+
+“Stepping out from behind my tree and looking back, I saw, through the black pillars of the nearer trees, the flames of the burning forest. It was my first fire coming after me. With that I looked for Weena, but she was gone. The hissing and crackling behind me, the explosive thud as each fresh tree burst into flame, left little time for reflection. My iron bar still gripped, I followed in the Morlocks’ path. It was a close race. Once the flames crept forward so swiftly on my right as I ran that I was outflanked and had to strike off to the left. But at last I emerged upon a small open space, and as I did so, a Morlock came blundering towards me, and past me, and went on straight into the fire!
+
+“And now I was to see the most weird and horrible thing, I think, of all that I beheld in that future age. This whole space was as bright as day with the reflection of the fire. In the centre was a hillock or tumulus, surmounted by a scorched hawthorn. Beyond this was another arm of the burning forest, with yellow tongues already writhing from it, completely encircling the space with a fence of fire. Upon the hillside were some thirty or forty Morlocks, dazzled by the light and heat, and blundering hither and thither against each other in their bewilderment. At first I did not realise their blindness, and struck furiously at them with my bar, in a frenzy of fear, as they approached me, killing one and crippling several more. But when I had watched the gestures of one of them groping under the hawthorn against the red sky, and heard their moans, I was assured of their absolute helplessness and misery in the glare, and I struck no more of them.
+
+“Yet every now and then one would come straight towards me, setting loose a quivering horror that made me quick to elude him. At one time the flames died down somewhat, and I feared the foul creatures would presently be able to see me. I was thinking of beginning the fight by killing some of them before this should happen; but the fire burst out again brightly, and I stayed my hand. I walked about the hill among them and avoided them, looking for some trace of Weena. But Weena was gone.
+
+“At last I sat down on the summit of the hillock, and watched this strange incredible company of blind things groping to and fro, and making uncanny noises to each other, as the glare of the fire beat on them. The coiling uprush of smoke streamed across the sky, and through the rare tatters of that red canopy, remote as though they belonged to another universe, shone the little stars. Two or three Morlocks came blundering into me, and I drove them off with blows of my fists, trembling as I did so.
+
+“For the most part of that night I was persuaded it was a nightmare. I bit myself and screamed in a passionate desire to awake. I beat the ground with my hands, and got up and sat down again, and wandered here and there, and again sat down. Then I would fall to rubbing my eyes and calling upon God to let me awake. Thrice I saw Morlocks put their heads down in a kind of agony and rush into the flames. But, at last, above the subsiding red of the fire, above the streaming masses of black smoke and the whitening and blackening tree stumps, and the diminishing numbers of these dim creatures, came the white light of the day.
+
+“I searched again for traces of Weena, but there were none. It was plain that they had left her poor little body in the forest. I cannot describe how it relieved me to think that it had escaped the awful fate to which it seemed destined. As I thought of that, I was almost moved to begin a massacre of the helpless abominations about me, but I contained myself. The hillock, as I have said, was a kind of island in the forest. From its summit I could now make out through a haze of smoke the Palace of Green Porcelain, and from that I could get my bearings for the White Sphinx. And so, leaving the remnant of these damned souls still going hither and thither and moaning, as the day grew clearer, I tied some grass about my feet and limped on across smoking ashes and among black stems that still pulsated internally with fire, towards the hiding-place of the Time Machine. I walked slowly, for I was almost exhausted, as well as lame, and I felt the intensest wretchedness for the horrible death of little Weena. It seemed an overwhelming calamity. Now, in this old familiar room, it is more like the sorrow of a dream than an actual loss. But that morning it left me absolutely lonely again—terribly alone. I began to think of this house of mine, of this fireside, of some of you, and with such thoughts came a longing that was pain.
+
+“But, as I walked over the smoking ashes under the bright morning sky, I made a discovery. In my trouser pocket were still some loose matches. The box must have leaked before it was lost.
+
+XIII.
+The Trap of the White Sphinx
+“About eight or nine in the morning I came to the same seat of yellow metal from which I had viewed the world upon the evening of my arrival. I thought of my hasty conclusions upon that evening and could not refrain from laughing bitterly at my confidence. Here was the same beautiful scene, the same abundant foliage, the same splendid palaces and magnificent ruins, the same silver river running between its fertile banks. The gay robes of the beautiful people moved hither and thither among the trees. Some were bathing in exactly the place where I had saved Weena, and that suddenly gave me a keen stab of pain. And like blots upon the landscape rose the cupolas above the ways to the Underworld. I understood now what all the beauty of the Overworld people covered. Very pleasant was their day, as pleasant as the day of the cattle in the field. Like the cattle, they knew of no enemies and provided against no needs. And their end was the same.
+
+“I grieved to think how brief the dream of the human intellect had been. It had committed suicide. It had set itself steadfastly towards comfort and ease, a balanced society with security and permanency as its watchword, it had attained its hopes—to come to this at last. Once, life and property must have reached almost absolute safety. The rich had been assured of his wealth and comfort, the toiler assured of his life and work. No doubt in that perfect world there had been no unemployed problem, no social question left unsolved. And a great quiet had followed.
+
+“It is a law of nature we overlook, that intellectual versatility is the compensation for change, danger, and trouble. An animal perfectly in harmony with its environment is a perfect mechanism. Nature never appeals to intelligence until habit and instinct are useless. There is no intelligence where there is no change and no need of change. Only those animals partake of intelligence that have to meet a huge variety of needs and dangers.
+
+“So, as I see it, the Upperworld man had drifted towards his feeble prettiness, and the Underworld to mere mechanical industry. But that perfect state had lacked one thing even for mechanical perfection—absolute permanency. Apparently as time went on, the feeding of an Underworld, however it was effected, had become disjointed. Mother Necessity, who had been staved off for a few thousand years, came back again, and she began below. The Underworld being in contact with machinery, which, however perfect, still needs some little thought outside habit, had probably retained perforce rather more initiative, if less of every other human character, than the Upper. And when other meat failed them, they turned to what old habit had hitherto forbidden. So I say I saw it in my last view of the world of Eight Hundred and Two Thousand Seven Hundred and One. It may be as wrong an explanation as mortal wit could invent. It is how the thing shaped itself to me, and as that I give it to you.
+
+“After the fatigues, excitements, and terrors of the past days, and in spite of my grief, this seat and the tranquil view and the warm sunlight were very pleasant. I was very tired and sleepy, and soon my theorising passed into dozing. Catching myself at that, I took my own hint, and spreading myself out upon the turf I had a long and refreshing sleep.
+
+“I awoke a little before sunsetting. I now felt safe against being caught napping by the Morlocks, and, stretching myself, I came on down the hill towards the White Sphinx. I had my crowbar in one hand, and the other hand played with the matches in my pocket.
+
+“And now came a most unexpected thing. As I approached the pedestal of the sphinx I found the bronze valves were open. They had slid down into grooves.
+
+“At that I stopped short before them, hesitating to enter.
+
+“Within was a small apartment, and on a raised place in the corner of this was the Time Machine. I had the small levers in my pocket. So here, after all my elaborate preparations for the siege of the White Sphinx, was a meek surrender. I threw my iron bar away, almost sorry not to use it.
+
+“A sudden thought came into my head as I stooped towards the portal. For once, at least, I grasped the mental operations of the Morlocks. Suppressing a strong inclination to laugh, I stepped through the bronze frame and up to the Time Machine. I was surprised to find it had been carefully oiled and cleaned. I have suspected since that the Morlocks had even partially taken it to pieces while trying in their dim way to grasp its purpose.
+
+“Now as I stood and examined it, finding a pleasure in the mere touch of the contrivance, the thing I had expected happened. The bronze panels suddenly slid up and struck the frame with a clang. I was in the dark—trapped. So the Morlocks thought. At that I chuckled gleefully.
+
+“I could already hear their murmuring laughter as they came towards me. Very calmly I tried to strike the match. I had only to fix on the levers and depart then like a ghost. But I had overlooked one little thing. The matches were of that abominable kind that light only on the box.
+
+“You may imagine how all my calm vanished. The little brutes were close upon me. One touched me. I made a sweeping blow in the dark at them with the levers, and began to scramble into the saddle of the machine. Then came one hand upon me and then another. Then I had simply to fight against their persistent fingers for my levers, and at the same time feel for the studs over which these fitted. One, indeed, they almost got away from me. As it slipped from my hand, I had to butt in the dark with my head—I could hear the Morlock’s skull ring—to recover it. It was a nearer thing than the fight in the forest, I think, this last scramble.
+
+“But at last the lever was fixed and pulled over. The clinging hands slipped from me. The darkness presently fell from my eyes. I found myself in the same grey light and tumult I have already described.
+
+XIV.
+The Further Vision
+“I have already told you of the sickness and confusion that comes with time travelling. And this time I was not seated properly in the saddle, but sideways and in an unstable fashion. For an indefinite time I clung to the machine as it swayed and vibrated, quite unheeding how I went, and when I brought myself to look at the dials again I was amazed to find where I had arrived. One dial records days, and another thousands of days, another millions of days, and another thousands of millions. Now, instead of reversing the levers, I had pulled them over so as to go forward with them, and when I came to look at these indicators I found that the thousands hand was sweeping round as fast as the seconds hand of a watch—into futurity.
+
+“As I drove on, a peculiar change crept over the appearance of things. The palpitating greyness grew darker; then—though I was still travelling with prodigious velocity—the blinking succession of day and night, which was usually indicative of a slower pace, returned, and grew more and more marked. This puzzled me very much at first. The alternations of night and day grew slower and slower, and so did the passage of the sun across the sky, until they seemed to stretch through centuries. At last a steady twilight brooded over the earth, a twilight only broken now and then when a comet glared across the darkling sky. The band of light that had indicated the sun had long since disappeared; for the sun had ceased to set—it simply rose and fell in the west, and grew ever broader and more red. All trace of the moon had vanished. The circling of the stars, growing slower and slower, had given place to creeping points of light. At last, some time before I stopped, the sun, red and very large, halted motionless upon the horizon, a vast dome glowing with a dull heat, and now and then suffering a momentary extinction. At one time it had for a little while glowed more brilliantly again, but it speedily reverted to its sullen red heat. I perceived by this slowing down of its rising and setting that the work of the tidal drag was done. The earth had come to rest with one face to the sun, even as in our own time the moon faces the earth. Very cautiously, for I remembered my former headlong fall, I began to reverse my motion. Slower and slower went the circling hands until the thousands one seemed motionless and the daily one was no longer a mere mist upon its scale. Still slower, until the dim outlines of a desolate beach grew visible.
+
+“I stopped very gently and sat upon the Time Machine, looking round. The sky was no longer blue. North-eastward it was inky black, and out of the blackness shone brightly and steadily the pale white stars. Overhead it was a deep Indian red and starless, and south-eastward it grew brighter to a glowing scarlet where, cut by the horizon, lay the huge hull of the sun, red and motionless. The rocks about me were of a harsh reddish colour, and all the trace of life that I could see at first was the intensely green vegetation that covered every projecting point on their south-eastern face. It was the same rich green that one sees on forest moss or on the lichen in caves: plants which like these grow in a perpetual twilight.
+
+“The machine was standing on a sloping beach. The sea stretched away to the south-west, to rise into a sharp bright horizon against the wan sky. There were no breakers and no waves, for not a breath of wind was stirring. Only a slight oily swell rose and fell like a gentle breathing, and showed that the eternal sea was still moving and living. And along the margin where the water sometimes broke was a thick incrustation of salt—pink under the lurid sky. There was a sense of oppression in my head, and I noticed that I was breathing very fast. The sensation reminded me of my only experience of mountaineering, and from that I judged the air to be more rarefied than it is now.
+
+“Far away up the desolate slope I heard a harsh scream, and saw a thing like a huge white butterfly go slanting and fluttering up into the sky and, circling, disappear over some low hillocks beyond. The sound of its voice was so dismal that I shivered and seated myself more firmly upon the machine. Looking round me again, I saw that, quite near, what I had taken to be a reddish mass of rock was moving slowly towards me. Then I saw the thing was really a monstrous crab-like creature. Can you imagine a crab as large as yonder table, with its many legs moving slowly and uncertainly, its big claws swaying, its long antennæ, like carters’ whips, waving and feeling, and its stalked eyes gleaming at you on either side of its metallic front? Its back was corrugated and ornamented with ungainly bosses, and a greenish incrustation blotched it here and there. I could see the many palps of its complicated mouth flickering and feeling as it moved.
+
+“As I stared at this sinister apparition crawling towards me, I felt a tickling on my cheek as though a fly had lighted there. I tried to brush it away with my hand, but in a moment it returned, and almost immediately came another by my ear. I struck at this, and caught something threadlike. It was drawn swiftly out of my hand. With a frightful qualm, I turned, and I saw that I had grasped the antenna of another monster crab that stood just behind me. Its evil eyes were wriggling on their stalks, its mouth was all alive with appetite, and its vast ungainly claws, smeared with an algal slime, were descending upon me. In a moment my hand was on the lever, and I had placed a month between myself and these monsters. But I was still on the same beach, and I saw them distinctly now as soon as I stopped. Dozens of them seemed to be crawling here and there, in the sombre light, among the foliated sheets of intense green.
+
+“I cannot convey the sense of abominable desolation that hung over the world. The red eastern sky, the northward blackness, the salt Dead Sea, the stony beach crawling with these foul, slow-stirring monsters, the uniform poisonous-looking green of the lichenous plants, the thin air that hurts one’s lungs: all contributed to an appalling effect. I moved on a hundred years, and there was the same red sun—a little larger, a little duller—the same dying sea, the same chill air, and the same crowd of earthy crustacea creeping in and out among the green weed and the red rocks. And in the westward sky, I saw a curved pale line like a vast new moon.
+
+“So I travelled, stopping ever and again, in great strides of a thousand years or more, drawn on by the mystery of the earth’s fate, watching with a strange fascination the sun grow larger and duller in the westward sky, and the life of the old earth ebb away. At last, more than thirty million years hence, the huge red-hot dome of the sun had come to obscure nearly a tenth part of the darkling heavens. Then I stopped once more, for the crawling multitude of crabs had disappeared, and the red beach, save for its livid green liverworts and lichens, seemed lifeless. And now it was flecked with white. A bitter cold assailed me. Rare white flakes ever and again came eddying down. To the north-eastward, the glare of snow lay under the starlight of the sable sky, and I could see an undulating crest of hillocks pinkish white. There were fringes of ice along the sea margin, with drifting masses farther out; but the main expanse of that salt ocean, all bloody under the eternal sunset, was still unfrozen.
+
+“I looked about me to see if any traces of animal life remained. A certain indefinable apprehension still kept me in the saddle of the machine. But I saw nothing moving, in earth or sky or sea. The green slime on the rocks alone testified that life was not extinct. A shallow sandbank had appeared in the sea and the water had receded from the beach. I fancied I saw some black object flopping about upon this bank, but it became motionless as I looked at it, and I judged that my eye had been deceived, and that the black object was merely a rock. The stars in the sky were intensely bright and seemed to me to twinkle very little.
+
+“Suddenly I noticed that the circular westward outline of the sun had changed; that a concavity, a bay, had appeared in the curve. I saw this grow larger. For a minute perhaps I stared aghast at this blackness that was creeping over the day, and then I realised that an eclipse was beginning. Either the moon or the planet Mercury was passing across the sun’s disk. Naturally, at first I took it to be the moon, but there is much to incline me to believe that what I really saw was the transit of an inner planet passing very near to the earth.
+
+“The darkness grew apace; a cold wind began to blow in freshening gusts from the east, and the showering white flakes in the air increased in number. From the edge of the sea came a ripple and whisper. Beyond these lifeless sounds the world was silent. Silent? It would be hard to convey the stillness of it. All the sounds of man, the bleating of sheep, the cries of birds, the hum of insects, the stir that makes the background of our lives—all that was over. As the darkness thickened, the eddying flakes grew more abundant, dancing before my eyes; and the cold of the air more intense. At last, one by one, swiftly, one after the other, the white peaks of the distant hills vanished into blackness. The breeze rose to a moaning wind. I saw the black central shadow of the eclipse sweeping towards me. In another moment the pale stars alone were visible. All else was rayless obscurity. The sky was absolutely black.
+
+“A horror of this great darkness came on me. The cold, that smote to my marrow, and the pain I felt in breathing, overcame me. I shivered, and a deadly nausea seized me. Then like a red-hot bow in the sky appeared the edge of the sun. I got off the machine to recover myself. I felt giddy and incapable of facing the return journey. As I stood sick and confused I saw again the moving thing upon the shoal—there was no mistake now that it was a moving thing—against the red water of the sea. It was a round thing, the size of a football perhaps, or, it may be, bigger, and tentacles trailed down from it; it seemed black against the weltering blood-red water, and it was hopping fitfully about. Then I felt I was fainting. But a terrible dread of lying helpless in that remote and awful twilight sustained me while I clambered upon the saddle.
+
+XV.
+The Time Traveller’s Return
+“So I came back. For a long time I must have been insensible upon the machine. The blinking succession of the days and nights was resumed, the sun got golden again, the sky blue. I breathed with greater freedom. The fluctuating contours of the land ebbed and flowed. The hands spun backward upon the dials. At last I saw again the dim shadows of houses, the evidences of decadent humanity. These, too, changed and passed, and others came. Presently, when the million dial was at zero, I slackened speed. I began to recognise our own pretty and familiar architecture, the thousands hand ran back to the starting-point, the night and day flapped slower and slower. Then the old walls of the laboratory came round me. Very gently, now, I slowed the mechanism down.
+
+“I saw one little thing that seemed odd to me. I think I have told you that when I set out, before my velocity became very high, Mrs. Watchett had walked across the room, travelling, as it seemed to me, like a rocket. As I returned, I passed again across that minute when she traversed the laboratory. But now her every motion appeared to be the exact inversion of her previous ones. The door at the lower end opened, and she glided quietly up the laboratory, back foremost, and disappeared behind the door by which she had previously entered. Just before that I seemed to see Hillyer for a moment; but he passed like a flash.
+
+“Then I stopped the machine, and saw about me again the old familiar laboratory, my tools, my appliances just as I had left them. I got off the thing very shakily, and sat down upon my bench. For several minutes I trembled violently. Then I became calmer. Around me was my old workshop again, exactly as it had been. I might have slept there, and the whole thing have been a dream.
+
+“And yet, not exactly! The thing had started from the south-east corner of the laboratory. It had come to rest again in the north-west, against the wall where you saw it. That gives you the exact distance from my little lawn to the pedestal of the White Sphinx, into which the Morlocks had carried my machine.
+
+“For a time my brain went stagnant. Presently I got up and came through the passage here, limping, because my heel was still painful, and feeling sorely begrimed. I saw the Pall Mall Gazette on the table by the door. I found the date was indeed today, and looking at the timepiece, saw the hour was almost eight o’clock. I heard your voices and the clatter of plates. I hesitated—I felt so sick and weak. Then I sniffed good wholesome meat, and opened the door on you. You know the rest. I washed, and dined, and now I am telling you the story.
+
+XVI.
+After the Story
+“I know,” he said, after a pause, “that all this will be absolutely incredible to you, but to me the one incredible thing is that I am here tonight in this old familiar room looking into your friendly faces and telling you these strange adventures.” He looked at the Medical Man. “No. I cannot expect you to believe it. Take it as a lie—or a prophecy. Say I dreamed it in the workshop. Consider I have been speculating upon the destinies of our race, until I have hatched this fiction. Treat my assertion of its truth as a mere stroke of art to enhance its interest. And taking it as a story, what do you think of it?”
+
+He took up his pipe, and began, in his old accustomed manner, to tap with it nervously upon the bars of the grate. There was a momentary stillness. Then chairs began to creak and shoes to scrape upon the carpet. I took my eyes off the Time Traveller’s face, and looked round at his audience. They were in the dark, and little spots of colour swam before them. The Medical Man seemed absorbed in the contemplation of our host. The Editor was looking hard at the end of his cigar—the sixth. The Journalist fumbled for his watch. The others, as far as I remember, were motionless.
+
+The Editor stood up with a sigh. “What a pity it is you’re not a writer of stories!” he said, putting his hand on the Time Traveller’s shoulder.
+
+“You don’t believe it?”
+
+“Well——”
+
+“I thought not.”
+
+The Time Traveller turned to us. “Where are the matches?” he said. He lit one and spoke over his pipe, puffing. “To tell you the truth... I hardly believe it myself..... And yet...”
+
+His eye fell with a mute inquiry upon the withered white flowers upon the little table. Then he turned over the hand holding his pipe, and I saw he was looking at some half-healed scars on his knuckles.
+
+The Medical Man rose, came to the lamp, and examined the flowers. “The gynæceum’s odd,” he said. The Psychologist leant forward to see, holding out his hand for a specimen.
+
+“I’m hanged if it isn’t a quarter to one,” said the Journalist. “How shall we get home?”
+
+“Plenty of cabs at the station,” said the Psychologist.
+
+“It’s a curious thing,” said the Medical Man; “but I certainly don’t know the natural order of these flowers. May I have them?”
+
+The Time Traveller hesitated. Then suddenly: “Certainly not.”
+
+“Where did you really get them?” said the Medical Man.
+
+The Time Traveller put his hand to his head. He spoke like one who was trying to keep hold of an idea that eluded him. “They were put into my pocket by Weena, when I travelled into Time.” He stared round the room. “I’m damned if it isn’t all going. This room and you and the atmosphere of every day is too much for my memory. Did I ever make a Time Machine, or a model of a Time Machine? Or is it all only a dream? They say life is a dream, a precious poor dream at times—but I can’t stand another that won’t fit. It’s madness. And where did the dream come from? … I must look at that machine. If there is one!”
+
+He caught up the lamp swiftly, and carried it, flaring red, through the door into the corridor. We followed him. There in the flickering light of the lamp was the machine sure enough, squat, ugly, and askew, a thing of brass, ebony, ivory, and translucent glimmering quartz. Solid to the touch—for I put out my hand and felt the rail of it—and with brown spots and smears upon the ivory, and bits of grass and moss upon the lower parts, and one rail bent awry.
+
+The Time Traveller put the lamp down on the bench, and ran his hand along the damaged rail. “It’s all right now,” he said. “The story I told you was true. I’m sorry to have brought you out here in the cold.” He took up the lamp, and, in an absolute silence, we returned to the smoking-room.
+
+He came into the hall with us and helped the Editor on with his coat. The Medical Man looked into his face and, with a certain hesitation, told him he was suffering from overwork, at which he laughed hugely. I remember him standing in the open doorway, bawling good-night.
+
+I shared a cab with the Editor. He thought the tale a “gaudy lie.” For my own part I was unable to come to a conclusion. The story was so fantastic and incredible, the telling so credible and sober. I lay awake most of the night thinking about it. I determined to go next day and see the Time Traveller again. I was told he was in the laboratory, and being on easy terms in the house, I went up to him. The laboratory, however, was empty. I stared for a minute at the Time Machine and put out my hand and touched the lever. At that the squat substantial-looking mass swayed like a bough shaken by the wind. Its instability startled me extremely, and I had a queer reminiscence of the childish days when I used to be forbidden to meddle. I came back through the corridor. The Time Traveller met me in the smoking-room. He was coming from the house. He had a small camera under one arm and a knapsack under the other. He laughed when he saw me, and gave me an elbow to shake. “I’m frightfully busy,” said he, “with that thing in there.”
+
+“But is it not some hoax?” I said. “Do you really travel through time?”
+
+“Really and truly I do.” And he looked frankly into my eyes. He hesitated. His eye wandered about the room. “I only want half an hour,” he said. “I know why you came, and it’s awfully good of you. There’s some magazines here. If you’ll stop to lunch I’ll prove you this time travelling up to the hilt, specimens and all. If you’ll forgive my leaving you now?”
+
+I consented, hardly comprehending then the full import of his words, and he nodded and went on down the corridor. I heard the door of the laboratory slam, seated myself in a chair, and took up a daily paper. What was he going to do before lunch-time? Then suddenly I was reminded by an advertisement that I had promised to meet Richardson, the publisher, at two. I looked at my watch, and saw that I could barely save that engagement. I got up and went down the passage to tell the Time Traveller.
+
+As I took hold of the handle of the door I heard an exclamation, oddly truncated at the end, and a click and a thud. A gust of air whirled round me as I opened the door, and from within came the sound of broken glass falling on the floor. The Time Traveller was not there. I seemed to see a ghostly, indistinct figure sitting in a whirling mass of black and brass for a moment—a figure so transparent that the bench behind with its sheets of drawings was absolutely distinct; but this phantasm vanished as I rubbed my eyes. The Time Machine had gone. Save for a subsiding stir of dust, the further end of the laboratory was empty. A pane of the skylight had, apparently, just been blown in.
+
+I felt an unreasonable amazement. I knew that something strange had happened, and for the moment could not distinguish what the strange thing might be. As I stood staring, the door into the garden opened, and the man-servant appeared.
+
+We looked at each other. Then ideas began to come. “Has Mr. —— gone out that way?” said I.
+
+“No, sir. No one has come out this way. I was expecting to find him here.”
+
+At that I understood. At the risk of disappointing Richardson I stayed on, waiting for the Time Traveller; waiting for the second, perhaps still stranger story, and the specimens and photographs he would bring with him. But I am beginning now to fear that I must wait a lifetime. The Time Traveller vanished three years ago. And, as everybody knows now, he has never returned.
+
+Epilogue
+One cannot choose but wonder. Will he ever return? It may be that he swept back into the past, and fell among the blood-drinking, hairy savages of the Age of Unpolished Stone; into the abysses of the Cretaceous Sea; or among the grotesque saurians, the huge reptilian brutes of the Jurassic times. He may even now—if I may use the phrase—be wandering on some plesiosaurus-haunted Oolitic coral reef, or beside the lonely saline seas of the Triassic Age. Or did he go forward, into one of the nearer ages, in which men are still men, but with the riddles of our own time answered and its wearisome problems solved? Into the manhood of the race: for I, for my own part, cannot think that these latter days of weak experiment, fragmentary theory, and mutual discord are indeed man’s culminating time! I say, for my own part. He, I know—for the question had been discussed among us long before the Time Machine was made—thought but cheerlessly of the Advancement of Mankind, and saw in the growing pile of civilisation only a foolish heaping that must inevitably fall back upon and destroy its makers in the end. If that is so, it remains for us to live as though it were not so. But to me the future is still black and blank—is a vast ignorance, lit at a few casual places by the memory of his story. And I have by me, for my comfort, two strange white flowers—shrivelled now, and brown and flat and brittle—to witness that even when mind and strength had gone, gratitude and a mutual tenderness still lived on in the heart of man.
\ No newline at end of file
diff --git a/examples/assorted_checks/generate_readme_plots.py b/examples/assorted_checks/generate_readme_plots.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6e5da2f57b4e4a9070bfca0f1f27e8a3cec455b
--- /dev/null
+++ b/examples/assorted_checks/generate_readme_plots.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""Script to generate all plots needed for the README."""
+
+import os
+import sys
+import time
+import shutil
+from pathlib import Path
+
+from validate_wav import validate_tts
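+# validate_wav.py is expected to sit next to this script; when the script is run
+# directly, its own directory is already on sys.path, so this import resolves.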
+
+# Get absolute paths
+script_dir = Path(__file__).parent.resolve()
+project_root = script_dir.parent.parent
+
+# Add directories to Python path for imports
+sys.path.append(str(script_dir))
+sys.path.append(str(script_dir / "benchmarks"))
+
+# Import test scripts
+from benchmark_tts_rtf import main as benchmark_rtf
+from test_formats.test_audio_formats import main as test_formats
+from benchmark_first_token_stream_unified import main as benchmark_stream
+from test_combinations.test_analyze_combined_voices import main as test_voice_analysis
+
+# Remove directories from path after imports
+sys.path.remove(str(script_dir))
+sys.path.remove(str(script_dir / "benchmarks"))
+
+
+def ensure_assets_dir():
+ """Create assets directory if it doesn't exist."""
+ assets_dir = project_root / "assets"
+ assets_dir.mkdir(exist_ok=True)
+ return assets_dir
+
+
+def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
+ """Copy a plot to the assets directory with a new name."""
+ if os.path.exists(src_path):
+ shutil.copy2(src_path, assets_dir / dest_name)
+ print(f"Copied {src_path} to {assets_dir / dest_name}")
+ else:
+ print(f"Warning: Source plot not found at {src_path}")
+
+
+def validate_and_print(wav_path: str, category: str):
+ """Validate a WAV file and print results."""
+ if not os.path.exists(wav_path):
+ print(f"Warning: WAV file not found at {wav_path}")
+ return
+
+ print(f"\n=== Validating {category} Audio ===")
+ result = validate_tts(wav_path)
+
+ if "error" in result:
+ print(f"Error: {result['error']}")
+ else:
+ print(f"Duration: {result['duration']}")
+ print(f"Sample Rate: {result['sample_rate']} Hz")
+ print(f"Peak Amplitude: {result['peak_amplitude']}")
+ print(f"RMS Level: {result['rms_level']}")
+
+ if result["issues"]:
+ print("\nIssues Found:")
+ for issue in result["issues"]:
+ print(f"- {issue}")
+ else:
+ print("\nNo issues found")
+
+
+def main():
+ """Generate all plots needed for the README."""
+    # Prefix for the benchmark plot filenames (this run labels its results "gpu")
+    prefix = "gpu"
+    # Ensure assets directory exists
+    assets_dir = ensure_assets_dir()
+
+ print("\n=== Generating Format Comparison Plot ===")
+ test_formats()
+ copy_plot(
+ str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
+ "format_comparison.png",
+ assets_dir,
+ )
+ # Validate WAV output from format test
+ validate_and_print(
+ str(script_dir / "test_formats/output/test_formats/speech.wav"),
+ "Format Test WAV",
+ )
+
+ print("\n=== Generating Voice Analysis Plot ===")
+ test_voice_analysis()
+ copy_plot(
+ str(script_dir / "test_combinations/output/analysis_comparison.png"),
+ "voice_analysis.png",
+ assets_dir,
+ )
+ # Validate combined voice output
+ validate_and_print(
+ str(
+ script_dir
+ / "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
+ ),
+ "Combined Voice",
+ )
+
+ print("\n=== Generating Performance Benchmark Plots ===")
+ benchmark_rtf()
+ copy_plot(
+ str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
+ f"{prefix}_processing_time.png",
+ assets_dir,
+ )
+ copy_plot(
+ str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
+ f"{prefix}_realtime_factor.png",
+ assets_dir,
+ )
+    # Validate RTF benchmark output (450-token chunk)
+ validate_and_print(
+ str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
+ "RTF Benchmark",
+ )
+
+ print("\n=== Generating Streaming Benchmark Plots ===")
+ benchmark_stream()
+
+ # Copy direct streaming plots
+ copy_plot(
+ str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
+ f"{prefix}_first_token_latency_direct.png",
+ assets_dir,
+ )
+ copy_plot(
+ str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
+ f"{prefix}_first_token_timeline_direct.png",
+ assets_dir,
+ )
+ copy_plot(
+ str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
+ f"{prefix}_total_time_latency_direct.png",
+ assets_dir,
+ )
+
+ # Copy OpenAI streaming plots
+ copy_plot(
+ str(
+ script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
+ ),
+ f"{prefix}_first_token_latency_openai.png",
+ assets_dir,
+ )
+ copy_plot(
+ str(
+ script_dir
+ / "benchmarks/output_plots/first_token_timeline_stream_openai.png"
+ ),
+ f"{prefix}_first_token_timeline_openai.png",
+ assets_dir,
+ )
+ copy_plot(
+ str(
+ script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
+ ),
+ f"{prefix}_total_time_latency_openai.png",
+ assets_dir,
+ )
+
+    # Wait a moment for the benchmark output files to be flushed to disk
+    time.sleep(2)
+
+ # Validate streaming outputs (~500 tokens)
+ validate_and_print(
+ str(
+ script_dir
+ / "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
+ ),
+ "Direct Streaming",
+ )
+ validate_and_print(
+ str(
+ script_dir
+ / "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
+ ),
+ "OpenAI Streaming",
+ )
+
+    validate_and_print(
+        str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
+        "Format Test WAV (test_audio)",
+    )
+
+ print("\nAll plots have been generated and copied to the assets directory")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/assorted_checks/test_combinations/test_analyze_combined_voices.py b/examples/assorted_checks/test_combinations/test_analyze_combined_voices.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c314286a92212a7644c5db2b18caa727f8a936
--- /dev/null
+++ b/examples/assorted_checks/test_combinations/test_analyze_combined_voices.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+import time
+import wave
+from pathlib import Path
+
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import mlab
+from openai import OpenAI
+
+# Create output directory
+output_dir = Path(__file__).parent / "output"
+output_dir.mkdir(exist_ok=True)
+
+# Initialize OpenAI client
+client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed")
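+# (the local server does not check the API key, but the client library requires one)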
+
+# Test text that showcases voice characteristics
+text = """The quick brown fox jumps over the lazy dog.
+ How vexingly quick daft zebras jump!
+ The five boxing wizards jump quickly."""
+
+def generate_and_save_audio(voice: str, output_path: str):
+ """Generate audio using specified voice and save to WAV file."""
+ print(f"\nGenerating audio for voice: {voice}")
+ start_time = time.time()
+
+ # Generate audio using streaming response
+ with client.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice=voice,
+ response_format="wav",
+ input=text,
+ ) as response:
+ # Save the audio stream to file
+ with open(output_path, "wb") as f:
+ for chunk in response.iter_bytes():
+ f.write(chunk)
+
+ duration = time.time() - start_time
+ print(f"Generated in {duration:.2f}s")
+ print(f"Saved to {output_path}")
+ return output_path
+
+def analyze_audio(filepath: str):
+ """Analyze audio file and return key characteristics."""
+ print(f"\nAnalyzing {filepath}")
+ try:
+ print(f"\nTrying to read {filepath}")
+ with wave.open(filepath, 'rb') as wf:
+ sample_rate = wf.getframerate()
+ samples = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
+ print(f"Successfully read file:")
+ print(f"Sample rate: {sample_rate}")
+ print(f"Samples shape: {samples.shape}")
+ print(f"Samples dtype: {samples.dtype}")
+ print(f"First few samples: {samples[:10]}")
+ except Exception as e:
+ print(f"Error reading file: {str(e)}")
+ raise
+
+ # Convert to float64 for calculations
+ samples = samples.astype(np.float64) / 32768.0 # Normalize 16-bit audio
+
+ # Convert to mono if stereo
+ if len(samples.shape) > 1:
+ samples = np.mean(samples, axis=1)
+
+ # Calculate basic stats
+ duration = len(samples) / sample_rate
+ max_amp = np.max(np.abs(samples))
+ rms = np.sqrt(np.mean(samples**2))
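+    # RMS level is used below as a rough loudness/"texture" measure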
+
+ # Calculate frequency characteristics
+ # Compute FFT
+ N = len(samples)
+ yf = np.fft.fft(samples)
+ xf = np.fft.fftfreq(N, 1 / sample_rate)[:N//2]
+ magnitude = 2.0/N * np.abs(yf[0:N//2])
+ # Calculate spectral centroid
+ spectral_centroid = np.sum(xf * magnitude) / np.sum(magnitude)
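+    # (magnitude-weighted mean frequency; a higher centroid reads as a "brighter" voice)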
+ # Determine dominant frequencies
+ dominant_freqs = xf[magnitude.argsort()[-5:]][::-1].tolist()
+
+ return {
+ 'samples': samples,
+ 'sample_rate': sample_rate,
+ 'duration': duration,
+ 'max_amplitude': max_amp,
+ 'rms': rms,
+ 'spectral_centroid': spectral_centroid,
+ 'dominant_frequencies': dominant_freqs
+ }
+
+def plot_comparison(analyses, output_dir):
+ """Create detailed comparison plots of the audio analyses."""
+ plt.style.use('dark_background')
+
+ # Plot waveforms
+ fig_wave = plt.figure(figsize=(15, 10))
+ fig_wave.patch.set_facecolor('#1a1a2e')
+
+ for i, (name, data) in enumerate(analyses.items()):
+ ax = plt.subplot(len(analyses), 1, i+1)
+ samples = data['samples']
+        t = np.arange(len(samples)) / data['sample_rate']  # time axis in seconds
+        plt.plot(t, samples / data['max_amplitude'], linewidth=0.5, color='#ff2a6d')
+ plt.title(f"Waveform: {name}", color='white', pad=20)
+ plt.xlabel("Time (seconds)", color='white')
+ plt.ylabel("Normalized Amplitude", color='white')
+ plt.grid(True, alpha=0.3)
+ ax.set_facecolor('#1a1a2e')
+ plt.ylim(-1.1, 1.1)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'waveforms.png', dpi=300, bbox_inches='tight')
+ plt.close()
+
+ # Plot spectral characteristics
+ fig_spec = plt.figure(figsize=(15, 10))
+ fig_spec.patch.set_facecolor('#1a1a2e')
+
+ for i, (name, data) in enumerate(analyses.items()):
+        # Calculate spectrogram; mlab.specgram returns (power spectrum, freqs, times)
+        samples = data['samples']
+        sample_rate = data['sample_rate']
+        nperseg = 2048
+        Sxx, f, t = mlab.specgram(samples, NFFT=nperseg, Fs=sample_rate,
+                                  noverlap=nperseg//2)
+        Sxx = 10 * np.log10(Sxx + 1e-10)  # convert power to dB (epsilon avoids log(0))
+
+ ax = plt.subplot(len(analyses), 1, i+1)
+ plt.pcolormesh(t, f, Sxx, shading='gouraud', cmap='magma')
+ plt.title(f"Spectrogram: {name}", color='white', pad=20)
+ plt.ylabel('Frequency [Hz]', color='white')
+ plt.xlabel('Time [sec]', color='white')
+ plt.colorbar(label='Intensity [dB]')
+ ax.set_facecolor('#1a1a2e')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'spectrograms.png', dpi=300, bbox_inches='tight')
+ plt.close()
+
+ # Plot voice characteristics comparison
+ fig_chars = plt.figure(figsize=(15, 8))
+ fig_chars.patch.set_facecolor('#1a1a2e')
+
+ # Extract characteristics
+ names = list(analyses.keys())
+ rms_values = [data['rms'] for data in analyses.values()]
+ centroids = [data['spectral_centroid'] for data in analyses.values()]
+ max_amps = [data['max_amplitude'] for data in analyses.values()]
+
+ # Plot characteristics
+ x = np.arange(len(names))
+ width = 0.25
+
+ ax = plt.subplot(111)
+ ax.bar(x - width, rms_values, width, label='RMS (Texture)', color='#ff2a6d')
+ ax.bar(x, [c/1000 for c in centroids], width, label='Spectral Centroid/1000 (Brightness)', color='#05d9e8')
+ ax.bar(x + width, max_amps, width, label='Max Amplitude', color='#ff65bd')
+
+ ax.set_xticks(x)
+ ax.set_xticklabels(names, rotation=45, ha='right')
+ ax.legend()
+ ax.set_title('Voice Characteristics Comparison', color='white', pad=20)
+ ax.set_facecolor('#1a1a2e')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'characteristics.png', dpi=300, bbox_inches='tight')
+ plt.close()
+
+ print(f"\nSaved comparison plots to {output_dir}")
+
+def main():
+ # Test different voice combinations with weights
+ voices = {
+ 'af_bella': output_dir / 'af_bella.wav',
+ 'af_kore': output_dir / 'af_kore.wav',
+ 'af_bella(0.2)+af_kore(0.8)': output_dir / 'af_bella_20_af_kore_80.wav',
+ 'af_bella(0.8)+af_kore(0.2)': output_dir / 'af_bella_80_af_kore_20.wav',
+ 'af_bella(0.5)+af_kore(0.5)': output_dir / 'af_bella_50_af_kore_50.wav'
+ }
+
+ # Generate audio for each voice/combination
+ for voice, path in voices.items():
+ try:
+ generate_and_save_audio(voice, str(path))
+ except Exception as e:
+ print(f"Error generating audio for {voice}: {e}")
+ continue
+
+ # Analyze each audio file
+ analyses = {}
+ for name, path in voices.items():
+ try:
+ analyses[name] = analyze_audio(str(path))
+ except Exception as e:
+ print(f"Error analyzing {name}: {e}")
+ continue
+
+ # Create comparison plots
+ if analyses:
+ plot_comparison(analyses, output_dir)
+ else:
+ print("No analyses to plot")
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/assorted_checks/test_combinations/test_download_voice.py b/examples/assorted_checks/test_combinations/test_download_voice.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2b847cfd93eaa085af7e83735a843a6ed0b17ab
--- /dev/null
+++ b/examples/assorted_checks/test_combinations/test_download_voice.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+from pathlib import Path
+
+import requests
+
+# Create output directory
+output_dir = Path(__file__).parent / "output"
+output_dir.mkdir(exist_ok=True)
+
+def download_combined_voice(voice1: str, voice2: str, weights: tuple[float, float] | None = None) -> str:
+ """Download a combined voice file.
+
+ Args:
+ voice1: First voice name
+ voice2: Second voice name
+ weights: Optional tuple of weights (w1, w2). If not provided, uses equal weights.
+
+ Returns:
+ Path to downloaded .pt file
+ """
+ print(f"\nDownloading combined voice: {voice1} + {voice2}")
+
+ # Construct voice string with optional weights
+ if weights:
+ voice_str = f"{voice1}({weights[0]})+{voice2}({weights[1]})"
+ else:
+ voice_str = f"{voice1}+{voice2}"
+
+ # Make the request to combine voices
+ response = requests.post(
+ "http://localhost:8880/v1/audio/voices/combine",
+ json=voice_str
+ )
+
+ if response.status_code != 200:
+ raise Exception(f"Failed to combine voices: {response.text}")
+
+ # Save the .pt file
+ output_path = output_dir / f"{voice_str}.pt"
+ with open(output_path, "wb") as f:
+ f.write(response.content)
+
+ print(f"Saved combined voice to {output_path}")
+ return str(output_path)
+
+def main():
+ # Test downloading various voice combinations
+ combinations = [
+ # Equal weights (default)
+ ("af_bella", "af_kore"),
+
+ # Different weight combinations
+ ("af_bella", "af_kore", (0.2, 0.8)),
+ ("af_bella", "af_kore", (0.8, 0.2)),
+ ("af_bella", "af_kore", (0.5, 0.5)),
+
+ # Test with different voices
+ ("af_bella", "af_jadzia"),
+ ("af_bella", "af_jadzia", (0.3, 0.7))
+ ]
+
+ for combo in combinations:
+ try:
+ if len(combo) == 3:
+ voice1, voice2, weights = combo
+ download_combined_voice(voice1, voice2, weights)
+ else:
+ voice1, voice2 = combo
+ download_combined_voice(voice1, voice2)
+ except Exception as e:
+ print(f"Error downloading combination {combo}: {e}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/examples/assorted_checks/test_formats/test_audio_formats.py b/examples/assorted_checks/test_formats/test_audio_formats.py
new file mode 100644
index 0000000000000000000000000000000000000000..efd527acf4909eb60d6312e92e432cb1958d35c3
--- /dev/null
+++ b/examples/assorted_checks/test_formats/test_audio_formats.py
@@ -0,0 +1,322 @@
+"""Test script to generate and analyze different audio formats"""
+
+import os
+import time
+from pathlib import Path
+
+import numpy as np
+import openai
+import requests
+import soundfile as sf
+import matplotlib.pyplot as plt
+from scipy.io import wavfile
+
+SAMPLE_TEXT = """
+That is the germ of my great discovery. But you are wrong to say that we cannot move about in Time.
+"""
+
+# Configure OpenAI client
+client = openai.OpenAI(
+ timeout=60,
+ api_key="notneeded", # API key not required for our endpoint
+ base_url="http://localhost:8880/v1", # Point to our local server with v1 prefix
+)
+
+
+def setup_plot(fig, ax, title):
+ """Configure plot styling"""
+ # Improve grid
+ ax.grid(True, linestyle="--", alpha=0.3, color="#ffffff")
+
+ # Set title and labels with better fonts and more padding
+ ax.set_title(title, pad=40, fontsize=16, fontweight="bold", color="#ffffff")
+ ax.set_xlabel(ax.get_xlabel(), fontsize=14, fontweight="medium", color="#ffffff")
+ ax.set_ylabel(ax.get_ylabel(), fontsize=14, fontweight="medium", color="#ffffff")
+
+ # Improve tick labels
+ ax.tick_params(labelsize=12, colors="#ffffff")
+
+ # Style spines
+ for spine in ax.spines.values():
+ spine.set_color("#ffffff")
+ spine.set_alpha(0.3)
+ spine.set_linewidth(0.5)
+
+ # Set background colors
+ ax.set_facecolor("#1a1a2e")
+ fig.patch.set_facecolor("#1a1a2e")
+
+ return fig, ax
+
+
+def plot_format_comparison(stats: list, output_dir: str):
+ """Plot audio format comparison"""
+ plt.style.use("dark_background")
+
+ # Create figure with subplots
+ fig = plt.figure(figsize=(18, 16)) # Taller figure to accommodate bottom legend
+ fig.patch.set_facecolor("#1a1a2e")
+
+ # Create subplot grid with balanced spacing for waveforms
+ gs_waves = plt.GridSpec(
+ len(stats), 1, left=0.15, right=0.85, top=0.9, bottom=0.35, hspace=0.4
+ )
+
+ # Plot waveforms for each format
+ for i, stat in enumerate(stats):
+ format_name = stat["format"].upper()
+ try:
+ file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")
+
+ if stat["format"] == "wav":
+ # Use scipy.io.wavfile for WAV files
+ sr, data = wavfile.read(file_path)
+ data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
+ elif stat["format"] == "pcm":
+ # Read raw 16-bit signed little-endian PCM data at 24kHz
+            data = np.frombuffer(
+                open(file_path, "rb").read(), dtype="<i2"
+            )
+            ...
+        except Exception as e:
+            print(f"Error plotting {format_name}: {e}")
+
+
+def get_audio_stats(file_path: str) -> dict:
+ """Get audio file statistics"""
+ file_size = os.path.getsize(file_path)
+ file_size_kb = file_size / 1024 # Convert to KB
+ format_name = Path(file_path).suffix[1:]
+
+ if format_name == "wav":
+ # Use scipy.io.wavfile for WAV files
+ sample_rate, data = wavfile.read(file_path)
+ data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
+ duration = len(data) / sample_rate
+ channels = 1 if len(data.shape) == 1 else data.shape[1]
+ elif format_name == "pcm":
+ # For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
+        data = np.frombuffer(
+            open(file_path, "rb").read(), dtype="<i2"
+        )
+        ...
+
+import random
+import re
+import string
+import time
+from typing import Dict, List, Tuple
+
+
+def create_test_cases() -> List[str]:
+ """Create a variety of test cases with different characteristics"""
+
+ # Helper to create random text with specific patterns
+ def random_text(length: int) -> str:
+ return "".join(
+ random.choice(string.ascii_letters + string.digits + " .,!?")
+ for _ in range(length)
+ )
+
+ test_cases = []
+
+ # Base test cases that hit specific patterns
+ base_cases = [
+ "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
+ "Yeah, they met at 10:30 and reviewed A.B.C. documentation with Mrs. Brown etc.",
+ 'The temperature was 72.5 degrees (quite normal) for "this time" of year.',
+ "X's and Y's properties cost £50 million in the 1990s",
+ "こんにちは。今日は!",
+ ]
+
+ # Add base cases
+ test_cases.extend(base_cases)
+
+ # Add variations with random content
+ for length in [100, 1000, 10000]:
+ # Create 3 variations of each length
+ for _ in range(3):
+ text = random_text(length)
+ # Insert some patterns we're looking for
+ text = text.replace(text[10:20], "Dr. Smith")
+ text = text.replace(text[30:40], "$1,234.56")
+ text = text.replace(text[50:60], "A.B.C. xyz")
+ test_cases.append(text)
+
+ return test_cases
+
+
+class TextNormalizerInline:
+ """Text normalizer using inline patterns"""
+
+ def normalize(self, text: str) -> str:
+ # Replace quotes and brackets
+ text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+ text = text.replace("«", chr(8220)).replace("»", chr(8221))
+ text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+ text = text.replace("(", "«").replace(")", "»")
+
+ # Handle CJK punctuation
+ for a, b in zip("、。!,:;?", ",.!,:;?"):
+ text = text.replace(a, b + " ")
+
+ text = re.sub(r"[^\S \n]", " ", text)
+ text = re.sub(r" +", " ", text)
+ text = re.sub(r"(?<=\n) +(?=\n)", "", text)
+ text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
+ text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
+ text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
+ text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
+ text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
+ text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
+        text = re.sub(
+            r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
+            split_num,
+            text,
+        )
+        text = re.sub(r"(?<=\d),(?=\d)", "", text)
+        text = re.sub(
+            r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
+            handle_money,
+            text,
+        )
+        text = re.sub(r"\d*\.\d+", handle_decimal, text)
+        text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
+        text = re.sub(r"(?<=\d)S", " S", text)
+        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
+        text = re.sub(r"(?<=X')S\b", "s", text)
+        text = re.sub(
+            r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
+        )
+        text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
+
+        return text.strip()
+
+
+class TextNormalizerCompiled:
+    """Text normalizer using pre-compiled patterns"""
+
+    def __init__(self):
+        self.patterns = {
+            "whitespace": re.compile(r"[^\S \n]"),
+            "multi_space": re.compile(r" +"),
+            "newline_space": re.compile(r"(?<=\n) +(?=\n)"),
+            "doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
+            "mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
+            "miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
+            "mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
+            "etc": re.compile(r"\betc\.(?! [A-Z])"),
+            "yeah": re.compile(r"(?i)\b(y)eah?\b"),
+            "numbers": re.compile(
+                r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
+            ),
+            "comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
+            "money": re.compile(
+                r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
+            ),
+            "decimal": re.compile(r"\d*\.\d+"),
+            "range": re.compile(r"(?<=\d)-(?=\d)"),
+            "s_after_number": re.compile(r"(?<=\d)S"),
+            "possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
+            "x_possessive": re.compile(r"(?<=X')S\b"),
+            "initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
+            "single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
+        }
+
+    def normalize(self, text: str) -> str:
+ # Replace quotes and brackets
+ text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+ text = text.replace("«", chr(8220)).replace("»", chr(8221))
+ text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+ text = text.replace("(", "«").replace(")", "»")
+
+ # Handle CJK punctuation
+ for a, b in zip("、。!,:;?", ",.!,:;?"):
+ text = text.replace(a, b + " ")
+
+ # Use compiled patterns
+ text = self.patterns["whitespace"].sub(" ", text)
+ text = self.patterns["multi_space"].sub(" ", text)
+ text = self.patterns["newline_space"].sub("", text)
+ text = self.patterns["doctor"].sub("Doctor", text)
+ text = self.patterns["mister"].sub("Mister", text)
+ text = self.patterns["miss"].sub("Miss", text)
+ text = self.patterns["mrs"].sub("Mrs", text)
+ text = self.patterns["etc"].sub("etc", text)
+ text = self.patterns["yeah"].sub(r"\1e'a", text)
+ text = self.patterns["numbers"].sub(split_num, text)
+ text = self.patterns["comma_in_number"].sub("", text)
+ text = self.patterns["money"].sub(handle_money, text)
+ text = self.patterns["decimal"].sub(handle_decimal, text)
+ text = self.patterns["range"].sub(" to ", text)
+ text = self.patterns["s_after_number"].sub(" S", text)
+ text = self.patterns["possessive_s"].sub("'S", text)
+ text = self.patterns["x_possessive"].sub("s", text)
+ text = self.patterns["initials"].sub(
+ lambda m: m.group().replace(".", "-"), text
+ )
+ text = self.patterns["single_initial"].sub("-", text)
+
+ return text.strip()
+
+
+class TextNormalizerHybrid:
+ """Text normalizer using hybrid approach - compile only complex/frequent patterns"""
+
+ def __init__(self):
+ # Only compile patterns that are complex or frequently used
+ self.patterns = {
+ "whitespace": re.compile(r"[^\S \n]"),
+ "numbers": re.compile(
+ r"\d*\.\d+|\b\d{4}s?\b|(? str:
+ # Replace quotes and brackets
+ text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+ text = text.replace("«", chr(8220)).replace("»", chr(8221))
+ text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+ text = text.replace("(", "«").replace(")", "»")
+
+ # Handle CJK punctuation
+ for a, b in zip("、。!,:;?", ",.!,:;?"):
+ text = text.replace(a, b + " ")
+
+ # Use compiled patterns for complex operations
+ text = self.patterns["whitespace"].sub(" ", text)
+ text = self.patterns["numbers"].sub(split_num, text)
+ text = self.patterns["money"].sub(handle_money, text)
+ text = self.patterns["initials"].sub(
+ lambda m: m.group().replace(".", "-"), text
+ )
+
+ # Use inline patterns for simpler operations
+ text = re.sub(r" +", " ", text)
+ text = re.sub(r"(?<=\n) +(?=\n)", "", text)
+ text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
+ text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
+ text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
+ text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
+ text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
+ text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
+ text = re.sub(r"(?<=\d),(?=\d)", "", text)
+ text = re.sub(r"\d*\.\d+", handle_decimal, text)
+ text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
+ text = re.sub(r"(?<=\d)S", " S", text)
+ text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
+ text = re.sub(r"(?<=X')S\b", "s", text)
+ text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
+
+ return text.strip()
+
+
+def split_num(match: re.Match) -> str:
+ """Split numbers for TTS processing"""
+ num = match.group(0)
+ if ":" in num:
+ h, m = num.split(":")
+ return f"{h} {m}"
+ if num.endswith("s"):
+ return f"{num[:-1]} s"
+ return num
+
+
+def handle_money(match: re.Match) -> str:
+ """Format money strings for TTS"""
+ text = match.group(0)
+ return text.replace("$", " dollars ").replace("£", " pounds ")
+
+
+def handle_decimal(match: re.Match) -> str:
+ """Format decimal numbers for TTS"""
+ num = match.group(0)
+ return num.replace(".", " point ")
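+
+# Worked examples for the helpers above (illustrative re.sub calls with
+# simplified patterns; the real normalizers use the fuller patterns
+# defined earlier):
+#   re.sub(r"\d{1,2}:\d{2}", split_num, "at 10:30")    -> "at 10 30"
+#   re.sub(r"\b\d{4}s\b", split_num, "the 1990s")      -> "the 1990 s"
+#   re.sub(r"[$£]\d+", handle_money, "cost $50")       -> "cost  dollars 50"
+#   re.sub(r"\d+\.\d+", handle_decimal, "pi is 3.14")  -> "pi is 3 point 14"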
+
+
+def benchmark_normalizers(
+    test_cases: List[str], iterations: int = 100
+) -> Dict[str, float]:
+ """Benchmark all three implementations"""
+
+ normalizers = {
+ "inline": TextNormalizerInline(),
+ "compiled": TextNormalizerCompiled(),
+ "hybrid": TextNormalizerHybrid(),
+ }
+
+ results = {}
+
+ # Test each normalizer
+ for name, normalizer in normalizers.items():
+ start = time.perf_counter()
+
+ # Run normalizations
+ for _ in range(iterations):
+ for test in test_cases:
+ normalizer.normalize(test)
+
+ results[name] = time.perf_counter() - start
+
+ return results
+
+
+def verify_outputs(test_cases: List[str]) -> bool:
+ """Verify that all implementations produce identical output"""
+ normalizers = {
+ "inline": TextNormalizerInline(),
+ "compiled": TextNormalizerCompiled(),
+ "hybrid": TextNormalizerHybrid(),
+ }
+
+ for test in test_cases:
+ results = [norm.normalize(test) for norm in normalizers.values()]
+ if not all(r == results[0] for r in results):
+ return False
+ return True
+
+
+def main():
+ # Create test cases
+ print("Generating test cases...")
+ test_cases = create_test_cases()
+ total_chars = sum(len(t) for t in test_cases)
+ print(
+ f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
+ )
+
+ # Verify output consistency
+ print("\nVerifying output consistency...")
+ if verify_outputs(test_cases):
+ print("✓ All implementations produce identical output")
+ else:
+ print("✗ Warning: Implementations produce different outputs!")
+ return
+
+ # Run benchmarks
+ print("\nRunning benchmarks...")
+ iterations = 100
+ results = benchmark_normalizers(test_cases, iterations)
+
+ # Print results
+ print(f"\nResults for {iterations} iterations: ")
+ for name, time_taken in results.items():
+ print(f"{name.capitalize()}: {time_taken:.3f}s")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/assorted_checks/test_openai/test_openai_tts.py b/examples/assorted_checks/test_openai/test_openai_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..01124af386e41fba6e80ec30eab29f87007e81e5
--- /dev/null
+++ b/examples/assorted_checks/test_openai/test_openai_tts.py
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+import openai
+
+# Configure OpenAI client to use our local endpoint
+client = openai.OpenAI(
+ timeout=30,
+ api_key="notneeded", # API key not required for our endpoint
+ base_url="http://localhost:8880/v1", # Point to our local server with v1 prefix
+)
+
+# Create output directory if it doesn't exist
+output_dir = Path(__file__).parent / "output"
+output_dir.mkdir(exist_ok=True)
+
+
+def test_format(
+ format: str, text: str = "The quick brown fox jumped over the lazy dog."
+):
+ speech_file = output_dir / f"speech_{format}.{format}"
+ print(f"\nTesting {format} format...")
+ print(f"Making request to {client.base_url}/audio/speech...")
+
+ try:
+ response = client.audio.speech.create(
+ model="tts-1", voice="af_heart", input=text, response_format=format
+ )
+
+ print("Got response, saving to file...")
+ with open(speech_file, "wb") as f:
+ f.write(response.content)
+ print(f"Success! Saved to: {speech_file}")
+
+ except Exception as e:
+ print(f"Error: {str(e)}")
+
+
+def test_speed(speed: float):
+ speech_file = output_dir / f"speech_speed_{speed}.wav"
+ print(f"\nTesting speed {speed}x...")
+ print(f"Making request to {client.base_url}/audio/speech...")
+
+ try:
+ response = client.audio.speech.create(
+ model="tts-1",
+ voice="af_heart",
+ input="The quick brown fox jumped over the lazy dog.",
+ response_format="wav",
+ speed=speed,
+ )
+
+ print("Got response, saving to file...")
+ with open(speech_file, "wb") as f:
+ f.write(response.content)
+ print(f"Success! Saved to: {speech_file}")
+
+ except Exception as e:
+ print(f"Error: {str(e)}")
+
+
+# Test different formats
+for format in ["wav", "mp3", "opus", "aac", "flac", "pcm"]:
+ test_format(format) # aac and pcm should fail as they are not supported
+
+# Test different speeds
+for speed in [0.25, 1.0, 2.0, 4.0]:  # speeds above 4.0 (e.g. 5.0) are out of range and should fail
+ test_speed(speed)
+
+# Test long text
+test_format(
+ "wav",
+ """
+That is the germ of my great discovery. But you are wrong to say that we cannot move about in Time. For instance, if I am recalling an incident very vividly I go back to the instant of its occurrence: I become absent-minded, as you say. I jump back for a moment.
+""",
+)
diff --git a/examples/assorted_checks/test_voices/analyze_voice_dimensions.py b/examples/assorted_checks/test_voices/analyze_voice_dimensions.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c7a7f82e4fb13a142a45cbbed4e848001c376b1
--- /dev/null
+++ b/examples/assorted_checks/test_voices/analyze_voice_dimensions.py
@@ -0,0 +1,54 @@
+import os
+import torch
+from loguru import logger
+
+def analyze_voice_file(file_path):
+ """Analyze dimensions and statistics of a voice tensor."""
+ try:
+ tensor = torch.load(file_path, map_location="cpu")
+ logger.info(f"\nAnalyzing {os.path.basename(file_path)}:")
+ logger.info(f"Shape: {tensor.shape}")
+ logger.info(f"Mean: {tensor.mean().item():.4f}")
+ logger.info(f"Std: {tensor.std().item():.4f}")
+ logger.info(f"Min: {tensor.min().item():.4f}")
+ logger.info(f"Max: {tensor.max().item():.4f}")
+ return tensor.shape
+ except Exception as e:
+ logger.error(f"Error analyzing {file_path}: {e}")
+ return None
+
+def main():
+ """Analyze voice files in the voices directory."""
+ # Get the project root directory
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
+ voices_dir = os.path.join(project_root, "api", "src", "voices", "v1_0")
+
+ logger.info(f"Scanning voices in: {voices_dir}")
+
+ # Track shapes for comparison
+ shapes = {}
+
+ # Analyze each .pt file
+ for file in os.listdir(voices_dir):
+ if file.endswith('.pt'):
+ file_path = os.path.join(voices_dir, file)
+ shape = analyze_voice_file(file_path)
+ if shape:
+ shapes[file] = shape
+
+ # Report findings
+ logger.info("\nShape Analysis:")
+ shape_groups = {}
+ for file, shape in shapes.items():
+ if shape not in shape_groups:
+ shape_groups[shape] = []
+ shape_groups[shape].append(file)
+
+ for shape, files in shape_groups.items():
+ logger.info(f"\nShape {shape}:")
+ for file in files:
+ logger.info(f" - {file}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/examples/assorted_checks/test_voices/test_all_voices.py b/examples/assorted_checks/test_voices/test_all_voices.py
new file mode 100644
index 0000000000000000000000000000000000000000..a143e83b4087f4b6db44e57e0ada8542f5baf8fa
--- /dev/null
+++ b/examples/assorted_checks/test_voices/test_all_voices.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+import openai
+import requests
+
+SAMPLE_TEXT = """
+That is the germ of my great discovery. But you are wrong to say that we cannot move about in Time. For instance, if I am recalling an incident very vividly I go back to the instant of its occurrence: I become absent-minded, as you say. I jump back for a moment.
+"""
+
+# Configure OpenAI client to use our local endpoint
+client = openai.OpenAI(
+ timeout=60,
+ api_key="notneeded", # API key not required for our endpoint
+ base_url="http://localhost:8880/v1", # Point to our local server with v1 prefix
+)
+
+# Create output directory if it doesn't exist
+output_dir = Path(__file__).parent / "output"
+output_dir.mkdir(exist_ok=True)
+
+
+def test_voice(voice: str):
+ speech_file = output_dir / f"speech_{voice}.mp3"
+ print(f"\nTesting voice: {voice}")
+ print(f"Making request to {client.base_url}/audio/speech...")
+
+ try:
+ response = client.audio.speech.create(
+ model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format="mp3"
+ )
+
+ print("Got response, saving to file...")
+ with open(speech_file, "wb") as f:
+ f.write(response.content)
+ print(f"Success! Saved to: {speech_file}")
+
+ except Exception as e:
+ print(f"Error with voice {voice}: {str(e)}")
+
+
+# First, get list of available voices using requests
+print("Getting list of available voices...")
+try:
+ # Convert base_url to string and ensure no double slashes
+ base_url = str(client.base_url).rstrip("/")
+ response = requests.get(f"{base_url}/audio/voices")
+ if response.status_code != 200:
+ raise Exception(f"Failed to get voices: {response.text}")
+ data = response.json()
+ if "voices" not in data:
+ raise Exception(f"Unexpected response format: {data}")
+ voices = data["voices"]
+ print(f"Found {len(voices)} voices: {', '.join(voices)}")
+
+ # Test each voice
+ for voice in voices:
+ test_voice(voice)
+
+except Exception as e:
+ print(f"Error getting voices: {str(e)}")
diff --git a/examples/assorted_checks/test_voices/trim_voice_dimensions.py b/examples/assorted_checks/test_voices/trim_voice_dimensions.py
new file mode 100644
index 0000000000000000000000000000000000000000..90f7013f9f1717cea47881ffe5d8b91c9defa48b
--- /dev/null
+++ b/examples/assorted_checks/test_voices/trim_voice_dimensions.py
@@ -0,0 +1,85 @@
+import os
+import torch
+from loguru import logger
+
+def analyze_voice_content(tensor):
+ """Analyze the content distribution in the voice tensor."""
+ # Look at the variance along the first dimension to see where the information is concentrated
+    variance = torch.var(tensor, dim=(1, 2))  # Per-row variance across the feature dims
+    logger.info("Variance distribution:")
+ logger.info(f"First 5 rows variance: {variance[:5].mean().item():.6f}")
+ logger.info(f"Last 5 rows variance: {variance[-5:].mean().item():.6f}")
+ return variance
+
+def trim_voice_tensor(tensor):
+ """Trim a 511x1x256 tensor to 510x1x256 by removing the row with least impact."""
+ if tensor.shape[0] != 511:
+ raise ValueError(f"Expected tensor with first dimension 511, got {tensor.shape[0]}")
+
+ # Analyze variance contribution of each row
+ variance = analyze_voice_content(tensor)
+
+ # Determine which end has lower variance (less information)
+ start_var = variance[:5].mean().item()
+ end_var = variance[-5:].mean().item()
+
+ # Remove from the end with lower variance
+ if end_var < start_var:
+ logger.info("Trimming last row (lower variance at end)")
+ return tensor[:-1]
+ else:
+ logger.info("Trimming first row (lower variance at start)")
+ return tensor[1:]
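+
+# Sanity check (illustrative only; a random tensor stands in for a real voice):
+#   t = torch.randn(511, 1, 256)
+#   assert trim_voice_tensor(t).shape == (510, 1, 256)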
+
+def process_voice_file(file_path):
+ """Process a single voice file."""
+ try:
+ tensor = torch.load(file_path, map_location="cpu")
+ if tensor.shape[0] != 511:
+ logger.info(f"Skipping {os.path.basename(file_path)} - already correct shape {tensor.shape}")
+ return False
+
+ logger.info(f"\nProcessing {os.path.basename(file_path)}:")
+ logger.info(f"Original shape: {tensor.shape}")
+
+ # Create backup
+ backup_path = file_path + ".backup"
+ if not os.path.exists(backup_path):
+ torch.save(tensor, backup_path)
+ logger.info(f"Created backup at {backup_path}")
+
+ # Trim tensor
+ trimmed = trim_voice_tensor(tensor)
+ logger.info(f"New shape: {trimmed.shape}")
+
+ # Save trimmed tensor
+ torch.save(trimmed, file_path)
+ logger.info(f"Saved trimmed tensor to {file_path}")
+
+ return True
+ except Exception as e:
+ logger.error(f"Error processing {file_path}: {e}")
+ return False
+
+def main():
+ """Process voice files in the voices directory."""
+ # Get the project root directory
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
+ voices_dir = os.path.join(project_root, "api", "src", "voices", "v1_0")
+
+ logger.info(f"Processing voices in: {voices_dir}")
+
+ processed = 0
+ for file in os.listdir(voices_dir):
+ if file.endswith('.pt') and not file.endswith('.backup'):
+ file_path = os.path.join(voices_dir, file)
+ if process_voice_file(file_path):
+ processed += 1
+
+ logger.info(f"\nProcessed {processed} voice files")
+ logger.info("Backups created with .backup extension")
+ logger.info("To restore backups if needed, remove .backup extension to replace trimmed files")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/examples/assorted_checks/validate_wav.py b/examples/assorted_checks/validate_wav.py
new file mode 100644
index 0000000000000000000000000000000000000000..844655aeaef78d3046a18da29e854350b7f9c656
--- /dev/null
+++ b/examples/assorted_checks/validate_wav.py
@@ -0,0 +1,276 @@
+import argparse
+from typing import Any, Dict
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+from tqdm import tqdm
+
+
+def validate_tts(wav_path: str) -> dict:
+ """
+ Validation checks for TTS-generated audio files to detect common artifacts.
+ """
+ try:
+ # Load and process audio
+ audio, sr = sf.read(wav_path)
+ if len(audio.shape) > 1:
+ audio = np.mean(audio, axis=1)
+
+ duration = len(audio) / sr
+ issues = []
+
+ # Basic quality checks
+ abs_audio = np.abs(audio)
+ stats = {
+ "rms": float(np.sqrt(np.mean(audio**2))),
+ "peak": float(np.max(abs_audio)),
+ "dc_offset": float(np.mean(audio)),
+ }
+
+ clip_count = np.sum(abs_audio >= 0.99)
+ clip_percent = (clip_count / len(audio)) * 100
+
+ if duration < 0.1:
+ issues.append(
+ "WARNING: Audio is suspiciously short - possible failed generation"
+ )
+
+ if stats["peak"] >= 1.0:
+ if clip_percent > 1.0:
+ issues.append(
+ f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
+ )
+ elif clip_percent > 0.01:
+ issues.append(
+ f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
+ )
+
+ if stats["rms"] < 0.01:
+ issues.append("WARNING: Audio is very quiet - possible failed generation")
+
+ if abs(stats["dc_offset"]) > 0.1:
+ issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
+
+ # Check for long silence gaps
+ eps = np.finfo(float).eps
+ db = 20 * np.log10(abs_audio + eps)
+ silence_threshold = -45 # dB
+ min_silence = 2.0 # seconds
+ window_size = int(min_silence * sr)
+ silence_count = 0
+ last_silence = -1
+
+ start_idx = int(0.2 * sr) # Skip first 0.2s
+ for i in tqdm(
+ range(start_idx, len(db) - window_size, window_size),
+ desc="Checking for silence",
+ ):
+ window = db[i : i + window_size]
+ if np.mean(window) < silence_threshold:
+ silent_ratio = np.mean(window < silence_threshold)
+ if silent_ratio > 0.9:
+ if last_silence == -1 or (i / sr - last_silence) > 2.0:
+ silence_count += 1
+ last_silence = i / sr
+ issues.append(
+ f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
+ )
+
+ if silence_count > 2:
+ issues.append(
+ f"WARNING: Multiple long silences found ({silence_count} total)"
+ )
+
+ # Detect audio artifacts
+ diff = np.diff(audio)
+ abs_diff = np.abs(diff)
+ window_size = min(int(0.005 * sr), 256)
+ window = np.ones(window_size) / window_size
+ local_avg_diff = np.convolve(abs_diff, window, mode="same")
+
+ spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
+ artifact_indices = np.nonzero(spikes)[0]
+
+ artifacts = []
+ if len(artifact_indices) > 0:
+ gaps = np.diff(artifact_indices)
+ min_gap = int(0.005 * sr)
+ break_points = np.nonzero(gaps > min_gap)[0] + 1
+ groups = np.split(artifact_indices, break_points)
+
+ for group in groups:
+ if len(group) >= 5:
+ severity = np.max(abs_diff[group])
+ if severity > 0.2:
+ center_idx = group[len(group) // 2]
+ artifacts.append(
+ {
+ "time": float(
+ center_idx / sr
+ ), # Ensure float for consistent timing
+ "severity": float(severity),
+ }
+ )
+ issues.append(
+ f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
+ f"(severity: {severity:.3f})"
+ )
+
+ # Check for repeated speech segments
+ for chunk_duration in tqdm(
+ [0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
+ ):
+ chunk_size = int(chunk_duration * sr)
+ overlap = int(0.2 * chunk_size)
+
+ for i in range(0, len(audio) - 2 * chunk_size, overlap):
+ chunk1 = audio[i : i + chunk_size]
+ chunk2 = audio[i + chunk_size : i + 2 * chunk_size]
+
+ if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
+ continue
+
+ try:
+ correlation = np.corrcoef(chunk1, chunk2)[0, 1]
+ if not np.isnan(correlation) and correlation > 0.92:
+ issues.append(
+ f"WARNING: Possible repeated speech at {i/sr:.1f}s "
+ f"(~{int(chunk_duration*160/60):d} words, correlation: {correlation:.3f})"
+ )
+ break
+                except Exception:
+ continue
+
+ return {
+ "file": wav_path,
+ "duration": f"{duration:.2f}s",
+ "sample_rate": sr,
+ "peak_amplitude": f"{stats['peak']:.3f}",
+ "rms_level": f"{stats['rms']:.3f}",
+ "dc_offset": f"{stats['dc_offset']:.3f}",
+ "artifact_count": len(artifacts),
+ "artifact_locations": [a["time"] for a in artifacts],
+ "artifact_severities": [a["severity"] for a in artifacts],
+ "issues": issues,
+ "valid": len(issues) == 0,
+ }
+
+ except Exception as e:
+ return {"file": wav_path, "error": str(e), "valid": False}
+
+
+def generate_analysis_plots(
+ wav_path: str, output_dir: str, validation_result: Dict[str, Any]
+):
+ """
+ Generate analysis plots for audio file with time-aligned visualizations.
+ """
+ import matplotlib.pyplot as plt
+ from scipy.signal import spectrogram
+
+ # Load audio
+ audio, sr = sf.read(wav_path)
+ if len(audio.shape) > 1:
+ audio = np.mean(audio, axis=1)
+
+ # Create figure with shared x-axis
+ fig = plt.figure(figsize=(15, 8))
+ gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
+ ax1 = fig.add_subplot(gs[0])
+ ax2 = fig.add_subplot(gs[1], sharex=ax1)
+
+ # Calculate spectrogram
+ nperseg = 2048
+ noverlap = 1536
+ f, t, Sxx = spectrogram(
+ audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
+ )
+
+ # Plot spectrogram
+ im = ax1.pcolormesh(
+ t,
+ f,
+ 10 * np.log10(Sxx + 1e-10),
+ shading="gouraud",
+ cmap="viridis",
+ vmin=-100,
+ vmax=-20,
+ )
+ ax1.set_ylabel("Frequency [Hz]", fontsize=10)
+ cbar = plt.colorbar(im, ax=ax1, label="dB")
+ ax1.set_title("Spectrogram", pad=10, fontsize=12)
+
+ # Plot waveform with exact time alignment
+ times = np.arange(len(audio)) / sr
+ ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
+ ax2.set_ylabel("Amplitude", fontsize=10)
+ ax2.set_xlabel("Time [sec]", fontsize=10)
+ ax2.grid(True, alpha=0.2)
+
+ # Add artifact markers
+ if (
+ "artifact_locations" in validation_result
+ and validation_result["artifact_locations"]
+ ):
+ for loc in validation_result["artifact_locations"]:
+ ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
+ ax2.axvline(
+ x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
+ )
+
+ # Add legend to both plots
+ if len(validation_result["artifact_locations"]) > 0:
+ ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
+ ax1.legend(loc="upper right", fontsize=8)
+ # Only add unique labels to legend
+ handles, labels = ax2.get_legend_handles_labels()
+ unique_labels = dict(zip(labels, handles))
+ ax2.legend(
+ unique_labels.values(),
+ unique_labels.keys(),
+ loc="upper right",
+ fontsize=8,
+ )
+
+ # Set common x limits
+ xlim = (0, len(audio) / sr)
+ ax1.set_xlim(xlim)
+ ax2.set_xlim(xlim)
+    og_filename = Path(wav_path).stem
+ # Save plot
+ plt.savefig(
+ Path(output_dir) / f"{og_filename}_audio_analysis.png",
+ dpi=300,
+ bbox_inches="tight",
+ )
+ plt.close()
+
+
+if __name__ == "__main__":
+ wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
+ silent = False
+
+ print(f"\n\n Processing:\n\t{wav_file}")
+ result = validate_tts(wav_file)
+ if not silent:
+ wav_root_dir = Path(wav_file).parent
+ generate_analysis_plots(wav_file, wav_root_dir, result)
+
+ print(f"\nValidating: {result['file']}")
+ if "error" in result:
+ print(f"Error: {result['error']}")
+ else:
+ print(f"Duration: {result['duration']}")
+ print(f"Sample Rate: {result['sample_rate']} Hz")
+ print(f"Peak Amplitude: {result['peak_amplitude']}")
+ print(f"RMS Level: {result['rms_level']}")
+ print(f"DC Offset: {result['dc_offset']}")
+ print(f"Detected Artifacts: {result['artifact_count']}")
+
+ if result["issues"]:
+ print("\nIssues Found:")
+ for issue in result["issues"]:
+ print(f"- {issue}")
+ else:
+ print("\nNo issues found")
diff --git a/examples/assorted_checks/validate_wavs.py b/examples/assorted_checks/validate_wavs.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebf114fc762815ce15999da7923ea822bdc25541
--- /dev/null
+++ b/examples/assorted_checks/validate_wavs.py
@@ -0,0 +1,78 @@
+import argparse
+from pathlib import Path
+
+from validate_wav import validate_tts
+
+
+def print_validation_result(result: dict, rel_path: Path):
+ """Print full validation details for a single file."""
+ print(f"\nValidating: {rel_path}")
+ if "error" in result:
+ print(f"Error: {result['error']}")
+ else:
+ print(f"Duration: {result['duration']}")
+ print(f"Sample Rate: {result['sample_rate']} Hz")
+ print(f"Peak Amplitude: {result['peak_amplitude']}")
+ print(f"RMS Level: {result['rms_level']}")
+ print(f"DC Offset: {result['dc_offset']}")
+
+ if result["issues"]:
+ print("\nIssues Found:")
+ for issue in result["issues"]:
+ print(f"- {issue}")
+ else:
+ print("\nNo issues found")
+
+
+def validate_directory(directory: str):
+ """Validate all wav files in a directory with detailed output and summary."""
+ dir_path = Path(directory)
+
+ # Find all wav files (including nested directories)
+ wav_files = list(dir_path.rglob("*.wav"))
+ wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s
+ wav_files = sorted(wav_files)
+
+ if not wav_files:
+ print(f"No .wav or .mp3 files found in {directory}")
+ return
+
+ print(f"Found {len(wav_files)} files in {directory}")
+ print("=" * 80)
+
+ # Store results for summary
+ results = []
+
+ # Detailed validation output
+ for wav_file in wav_files:
+ result = validate_tts(str(wav_file))
+ rel_path = wav_file.relative_to(dir_path)
+ print_validation_result(result, rel_path)
+ results.append((rel_path, result))
+ print("=" * 80)
+
+ # Summary with detailed issues
+ print("\nSUMMARY:")
+ for rel_path, result in results:
+ if "error" in result:
+ print(f"{rel_path}: ERROR - {result['error']}")
+ elif result["issues"]:
+ # Show first issue in summary, indicate if there are more
+ issues = result["issues"]
+ first_issue = issues[0].replace("WARNING: ", "")
+ if len(issues) > 1:
+ print(
+ f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
+ )
+ else:
+ print(f"{rel_path}: FAIL - {first_issue}")
+ else:
+ print(f"{rel_path}: PASS")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
+ parser.add_argument("directory", help="Directory containing wav files to validate")
+ args = parser.parse_args()
+
+ validate_directory(args.directory)
diff --git a/examples/captioned_speech_example.py b/examples/captioned_speech_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a622b1394c10a2f2e73db61cbccbc09f8ee497b
--- /dev/null
+++ b/examples/captioned_speech_example.py
@@ -0,0 +1,103 @@
+import json
+from typing import Tuple, Optional, Dict, List
+from pathlib import Path
+
+import base64
+import requests
+
+# Get the directory this script is in
+SCRIPT_DIR = Path(__file__).absolute().parent
+
+def generate_captioned_speech(
+ text: str,
+ voice: str = "af_heart",
+ speed: float = 1.0,
+ response_format: str = "mp3"
+) -> Tuple[Optional[bytes], Optional[List[Dict]]]:
+ """Generate audio with word-level timestamps."""
+ response = requests.post(
+ "http://localhost:8880/dev/captioned_speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": voice,
+ "speed": speed,
+ "response_format": response_format,
+ "stream": False
+ }
+ )
+
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code != 200:
+ print(f"Error response: {response.text}")
+ return None, None
+
+ try:
+        audio_json = json.loads(response.content)
+
+        # Decode the base64-encoded audio stream to bytes
+        chunk_audio = base64.b64decode(audio_json["audio"].encode("utf-8"))
+
+        # Print word-level timestamps
+        print(audio_json["timestamps"])
+
+ if not chunk_audio:
+ print("Error: Empty audio content")
+ return None, None
+
+ return chunk_audio, audio_json["timestamps"]
+ except json.JSONDecodeError as e:
+ print(f"Error parsing timestamps: {e}")
+ return None, None
+ except requests.RequestException as e:
+ print(f"Error retrieving timestamps: {e}")
+ return None, None
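+
+def timestamps_to_srt(timestamps: List[Dict]) -> str:
+    """Convert word-level timestamps to SRT caption text.
+
+    A minimal sketch, assuming each entry carries the "word", "start_time"
+    and "end_time" keys printed by main() below; not an official helper of
+    the API.
+    """
+    def fmt(seconds: float) -> str:
+        # SRT uses HH:MM:SS,mmm
+        ms = int(round(seconds * 1000))
+        h, rem = divmod(ms, 3_600_000)
+        m, rem = divmod(rem, 60_000)
+        s, ms = divmod(rem, 1000)
+        return f"{h:02}:{m:02}:{s:02},{ms:03}"
+
+    lines = []
+    for i, ts in enumerate(timestamps, start=1):
+        lines.append(
+            f"{i}\n{fmt(ts['start_time'])} --> {fmt(ts['end_time'])}\n{ts['word']}\n"
+        )
+    return "\n".join(lines)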
+
+def main():
+ # Example texts to convert
+ examples = [
+ "Hello world! Welcome to the captioned speech system.",
+ "The quick brown fox jumps over the lazy dog.",
+ """Of course if you come to the place fresh from New York, you are deceived. Your standard of vision is all astray, You do think the place is quiet. You do imagine that Mr. Smith is asleep merely because he closes his eyes as he stands. But live in Mariposa for six months or a year and then you will begin to understand it better; the buildings get higher and higher; the Mariposa House grows more and more luxurious; McCarthy's block towers to the sky; the 'buses roar and hum to the station; the trains shriek; the traffic multiplies; the people move faster and faster; a dense crowd swirls to and fro in the post-office and the five and ten cent store—and amusements! well, now! lacrosse, baseball, excursions, dances, the Fireman's Ball every winter and the Catholic picnic every summer; and music—the town band in the park every Wednesday evening, and the Oddfellows' brass band on the street every other Friday; the Mariposa Quartette, the Salvation Army—why, after a few months' residence you begin to realize that the place is a mere mad round of gaiety."""
+ ]
+
+ print("Generating captioned speech for example texts...\n")
+
+ # Create output directory in same directory as script
+ output_dir = SCRIPT_DIR / "output"
+ output_dir.mkdir(exist_ok=True)
+
+ for i, text in enumerate(examples):
+ print(f"\nExample {i+1}:")
+ print(f"Input text: {text}")
+ try:
+ # Generate audio and get timestamps
+ audio_bytes, word_timestamps = generate_captioned_speech(text)
+
+ if not audio_bytes or not word_timestamps:
+ print("Error: No audio data or timestamps generated")
+ continue
+
+ # Save audio file
+ audio_path = output_dir / f"captioned_example_{i+1}.wav"
+ with audio_path.open("wb") as f:
+ f.write(audio_bytes)
+ print(f"Audio saved to: {audio_path}")
+
+ # Save timestamps to JSON
+ timestamps_path = output_dir / f"captioned_example_{i+1}_timestamps.json"
+ with timestamps_path.open("w") as f:
+ json.dump(word_timestamps, f, indent=2)
+ print(f"Timestamps saved to: {timestamps_path}")
+
+ # Print timestamps
+ print("\nWord-level timestamps:")
+ for ts in word_timestamps:
+ print(f"{ts['word']}: {ts['start_time']:.3f}s - {ts['end_time']:.3f}s")
+
+ except requests.RequestException as e:
+ print(f"Error: {e}\n")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/examples/openai_streaming_audio.py b/examples/openai_streaming_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec3e37160ca5175c527bb84bda7f3661ec54e007
--- /dev/null
+++ b/examples/openai_streaming_audio.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env rye run python
+import time
+from pathlib import Path
+
+from openai import OpenAI
+
+# The local server does not check credentials, so any placeholder API key works
+openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
+
+speech_file_path = Path(__file__).parent / "speech.mp3"
+
+
+def main() -> None:
+ stream_to_speakers()
+
+ # Create text-to-speech audio file
+ with openai.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ input="the quick brown fox jumped over the lazy dogs",
+ ) as response:
+ response.stream_to_file(speech_file_path)
+
+
+def stream_to_speakers() -> None:
+ import pyaudio
+
+ player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
+
+ start_time = time.time()
+
+ with openai.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella+af_irulan",
+ response_format="pcm", # similar to WAV, but without a header chunk at the start.
+ input="""I see skies of blue and clouds of white
+ The bright blessed days, the dark sacred nights
+ And I think to myself
+ What a wonderful world""",
+ ) as response:
+ print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
+ for chunk in response.iter_bytes(chunk_size=1024):
+ player_stream.write(chunk)
+
+ print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/phoneme_examples/examples/phoneme_examples/output/phoneme_test.wav b/examples/phoneme_examples/examples/phoneme_examples/output/phoneme_test.wav
new file mode 100644
index 0000000000000000000000000000000000000000..857ada9deae717742c849d69e823890f48326994
Binary files /dev/null and b/examples/phoneme_examples/examples/phoneme_examples/output/phoneme_test.wav differ
diff --git a/examples/phoneme_examples/generate_phonemes.py b/examples/phoneme_examples/generate_phonemes.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb33c5d807e1267131fc0beed25465add855cb27
--- /dev/null
+++ b/examples/phoneme_examples/generate_phonemes.py
@@ -0,0 +1,114 @@
+import json
+from typing import Tuple, Optional, Union, List
+from pathlib import Path
+
+import requests
+
+# Get the directory this script is in
+SCRIPT_DIR = Path(__file__).parent.absolute()
+
+
+def get_phonemes(text: str, language: str = "a") -> Tuple[str, list[int]]:
+ """Get phonemes and tokens for input text.
+
+ Args:
+ text: Input text to convert to phonemes
+ language: Language code (defaults to "a" for American English)
+
+ Returns:
+ Tuple of (phonemes string, token list)
+ """
+ # Create the request payload
+ payload = {"text": text, "language": language}
+
+ # Make POST request to the phonemize endpoint
+ response = requests.post("http://localhost:8880/dev/phonemize", json=payload)
+
+ # Raise exception for error status codes
+ response.raise_for_status()
+
+ # Parse the response
+ result = response.json()
+ return result["phonemes"], result["tokens"]
+
+
+def generate_audio_from_phonemes(phonemes: str, voice: str = "af_bella") -> Optional[bytes]:
+ """Generate audio from phonemes."""
+ response = requests.post(
+ "http://localhost:8880/dev/generate_from_phonemes",
+ json={"phonemes": phonemes, "voice": voice},
+ headers={"Accept": "audio/wav"}
+ )
+
+ print(f"Response status: {response.status_code}")
+ print(f"Response headers: {dict(response.headers)}")
+ print(f"Response content type: {response.headers.get('Content-Type')}")
+ print(f"Response length: {len(response.content)} bytes")
+
+ if response.status_code != 200:
+ print(f"Error response: {response.text}")
+ return None
+
+ if not response.content:
+ print("Error: Empty response content")
+ return None
+
+ return response.content
+
+
+def main():
+ # Example texts to convert
+ examples = [
+ "Hello world! Welcome to the phoneme generation system.",
+ "How are you today? I am doing reasonably well, thank you for asking",
+ """This is a test of the phoneme generation system. Do not be alarmed.
+ This is only a test. If this were a real phoneme emergency, '
+ you would be instructed to a phoneme shelter in your area. Repeat.
+ This is a test of the phoneme generation system. Do not be alarmed.
+ This is only a test. If this were a real phoneme emergency, '
+ you would be instructed to a phoneme shelter in your area. Repeat.
+ This is a test of the phoneme generation system. Do not be alarmed.
+ This is only a test. If this were a real phoneme emergency, '
+ you would be instructed to a phoneme shelter in your area""",
+ ]
+
+ print("Generating phonemes and audio for example texts...\n")
+
+ # Create output directory in same directory as script
+ output_dir = SCRIPT_DIR / "output"
+ output_dir.mkdir(exist_ok=True)
+
+ for i, text in enumerate(examples):
+ print(f"{len(text)}: Input text: {text}")
+ try:
+ # Get phonemes
+ phonemes, tokens = get_phonemes(text)
+ print(f"{len(phonemes)} Phonemes: {phonemes}")
+ print(f"{len(tokens)} Tokens: {tokens}")
+
+ # Generate audio from phonemes
+ print("Generating audio...")
+ audio_bytes = generate_audio_from_phonemes(phonemes)
+
+ if not audio_bytes:
+ print("Error: No audio data generated")
+ continue
+
+ # Log response size
+ print(f"Generated {len(audio_bytes)} bytes of audio data")
+
+ if audio_bytes:
+ # Save audio file
+ output_path = output_dir / f"example_{i+1}.wav"
+ with output_path.open("wb") as f:
+ f.write(audio_bytes)
+ print(f"Audio saved to: {output_path}")
+
+ print()
+
+ except requests.RequestException as e:
+ print(f"Error: {e}\n")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/phoneme_examples/test_phoneme_generation.py b/examples/phoneme_examples/test_phoneme_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..47009447205cfe3cae31e66585f7eae5d6dcf5f0
--- /dev/null
+++ b/examples/phoneme_examples/test_phoneme_generation.py
@@ -0,0 +1,48 @@
+import os
+
+import requests
+
+def main():
+ # Test phoneme string
+ phonemes = "hˈɛloʊ wˈɜrld" # "Hello world" in phonemes
+
+ try:
+ print("\nTesting phoneme generation via API...")
+
+ # Create request payload
+ payload = {
+ "phonemes": phonemes,
+ "voice": "af_bella" # Using bella voice
+ }
+
+ # Make request to the API endpoint
+ response = requests.post(
+ "http://localhost:8880/dev/generate_from_phonemes",
+ json=payload,
+ stream=True # Enable streaming for audio data
+ )
+
+ # Check if request was successful
+ if response.status_code == 200:
+ # Create output directory if it doesn't exist
+ os.makedirs("examples/phoneme_examples/output", exist_ok=True)
+
+ # Save the audio response
+ output_path = 'examples/phoneme_examples/output/phoneme_test.wav'
+ with open(output_path, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=8192):
+ if chunk:
+ f.write(chunk)
+
+ print(f"\nAudio saved to: {output_path}")
+ print("\nPhoneme test completed successfully!")
+ print(f"\nInput phonemes: {phonemes}")
+ else:
+ print(f"Error: API request failed with status code {response.status_code}")
+ print(f"Response: {response.text}")
+
+ except Exception as e:
+ print(f"An error occurred: {str(e)}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/examples/requirements.txt b/examples/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7914d4cb9d84c566d31cf2b50e32048197232de5
--- /dev/null
+++ b/examples/requirements.txt
@@ -0,0 +1,2 @@
+openai>=1.0.0
+pyaudio>=0.2.13
diff --git a/examples/simul_file_test.py b/examples/simul_file_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f14a0154159894312538f8e92be23135fb2d94
--- /dev/null
+++ b/examples/simul_file_test.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env rye run python
+import asyncio
+import time
+from pathlib import Path
+from openai import AsyncOpenAI
+
+# Initialize async client
+openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
+
+async def save_to_file(text: str, file_id: int) -> None:
+ """Save TTS output to file asynchronously"""
+ speech_file_path = Path(__file__).parent / f"speech_{file_id}.mp3"
+
+ start_time = time.time()
+ print(f"Starting file {file_id}")
+
+ try:
+ # Use streaming endpoint with mp3 format
+ async with openai.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ input=text,
+ response_format="mp3"
+ ) as response:
+ print(f"File {file_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms")
+
+ # Open file in binary write mode
+ with open(speech_file_path, 'wb') as f:
+ async for chunk in response.iter_bytes():
+ f.write(chunk)
+
+ print(f"File {file_id} completed in {int((time.time() - start_time) * 1000)}ms")
+ except Exception as e:
+ print(f"Error processing file {file_id}: {e}")
+
+async def main() -> None:
+ # Different text samples for variety
+ texts = [
+ "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white",
+ "I see skies of blue and clouds of white. I see skies of blue and clouds of white",
+ ]
+
+ # Create tasks for saving to files
+ file_tasks = [
+ save_to_file(text, i)
+ for i, text in enumerate(texts)
+ ]
+
+ # Run file tasks concurrently
+ await asyncio.gather(*file_tasks)
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/examples/simul_openai_streaming_audio.py b/examples/simul_openai_streaming_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..befae0192ad0d0d04bd15d49db8323bc409d4f6f
--- /dev/null
+++ b/examples/simul_openai_streaming_audio.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env rye run python
+import asyncio
+import time
+from pathlib import Path
+import pyaudio
+from openai import AsyncOpenAI
+
+# Initialize async client
+openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
+
+# Create a shared PyAudio instance
+p = pyaudio.PyAudio()
+
+async def stream_to_speakers(text: str, stream_id: int) -> None:
+ """Stream TTS audio to speakers asynchronously"""
+ player_stream = p.open(
+ format=pyaudio.paInt16,
+ channels=1,
+ rate=24000,
+ output=True
+ )
+
+ start_time = time.time()
+ print(f"Starting stream {stream_id}")
+
+ try:
+ async with openai.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ response_format="pcm",
+ input=text
+ ) as response:
+ print(f"Stream {stream_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms")
+
+ async for chunk in response.iter_bytes(chunk_size=1024):
+ player_stream.write(chunk)
+ # Small sleep to allow other coroutines to run
+ await asyncio.sleep(0.001)
+
+ print(f"Stream {stream_id} completed in {int((time.time() - start_time) * 1000)}ms")
+
+ finally:
+ player_stream.stop_stream()
+ player_stream.close()
+
+async def save_to_file(text: str, file_id: int) -> None:
+ """Save TTS output to file asynchronously"""
+ speech_file_path = Path(__file__).parent / f"speech_{file_id}.mp3"
+
+ async with openai.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ input=text
+ ) as response:
+ # Open file in binary write mode
+ with open(speech_file_path, 'wb') as f:
+ async for chunk in response.iter_bytes():
+ f.write(chunk)
+ print(f"File {file_id} saved to {speech_file_path}")
+
+async def main() -> None:
+ # Different text samples for variety
+ texts = [
+ "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white",
+ "I see skies of blue and clouds of white. I see skies of blue and clouds of white",
+ ]
+
+ # Create tasks for streaming to speakers
+ speaker_tasks = [
+ stream_to_speakers(text, i)
+ for i, text in enumerate(texts)
+ ]
+
+ # Create tasks for saving to files
+ file_tasks = [
+ save_to_file(text, i)
+ for i, text in enumerate(texts)
+ ]
+
+ # Combine all tasks
+ all_tasks = speaker_tasks + file_tasks
+
+ # Run all tasks concurrently
+ try:
+ await asyncio.gather(*all_tasks)
+ finally:
+ # Clean up PyAudio
+ p.terminate()
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/examples/simul_speaker_test.py b/examples/simul_speaker_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2e4354e2440f1ff848c148e293bb0bcb9d3c38c
--- /dev/null
+++ b/examples/simul_speaker_test.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env rye run python
+import asyncio
+import time
+import pyaudio
+from openai import AsyncOpenAI
+
+# Initialize async client
+openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
+
+# Create a shared PyAudio instance
+p = pyaudio.PyAudio()
+
+async def stream_to_speakers(text: str, stream_id: int) -> None:
+ """Stream TTS audio to speakers asynchronously"""
+ player_stream = p.open(
+ format=pyaudio.paInt16,
+ channels=1,
+ rate=24000,
+ output=True
+ )
+
+ start_time = time.time()
+ print(f"Starting stream {stream_id}")
+
+ try:
+ async with openai.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ response_format="pcm",
+ input=text
+ ) as response:
+ print(f"Stream {stream_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms")
+
+ async for chunk in response.iter_bytes(chunk_size=1024):
+ player_stream.write(chunk)
+ # Small sleep to allow other coroutines to run
+ await asyncio.sleep(0.001)
+
+ print(f"Stream {stream_id} completed in {int((time.time() - start_time) * 1000)}ms")
+
+ finally:
+ player_stream.stop_stream()
+ player_stream.close()
+
+async def main() -> None:
+ # Different text samples for variety
+ texts = [
+ "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white",
+ "I see skies of blue and clouds of white. I see skies of blue and clouds of white",
+ ]
+
+ # Create tasks for streaming to speakers
+ speaker_tasks = [
+ stream_to_speakers(text, i)
+ for i, text in enumerate(texts)
+ ]
+
+ # Run speaker tasks concurrently
+ try:
+ await asyncio.gather(*speaker_tasks)
+ finally:
+ # Clean up PyAudio
+ p.terminate()
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/examples/stream_tts_playback.py b/examples/stream_tts_playback.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e5ac5eac66a2e7d018c9753d6c3994938de70e2
--- /dev/null
+++ b/examples/stream_tts_playback.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+import os
+import time
+import wave
+
+import numpy as np
+import requests
+import sounddevice as sd
+
+
+def play_streaming_tts(text: str, output_file: str | None = None, voice: str = "af_sky"):
+ """Stream TTS audio and play it back in real-time"""
+
+ print("\nStarting TTS stream request...")
+ start_time = time.time()
+
+ # Initialize variables
+ sample_rate = 24000 # Known sample rate for Kokoro
+ audio_started = False
+ chunk_count = 0
+ total_bytes = 0
+ first_chunk_time = None
+ all_audio_data = bytearray() # Raw PCM audio data
+
+ # Start sounddevice stream with buffer
+ stream = sd.OutputStream(
+ samplerate=sample_rate,
+ channels=1,
+ dtype=np.int16,
+ blocksize=1024, # Buffer size in samples
+ latency="low", # Request low latency
+ )
+ stream.start()
+
+ # Make streaming request to API
+ try:
+ response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": voice,
+ "response_format": "pcm",
+ "stream": True,
+ },
+ stream=True,
+ timeout=1800,
+ )
+ response.raise_for_status()
+ print(f"Request started successfully after {time.time() - start_time:.2f}s")
+
+ # Process streaming response with smaller chunks for lower latency
+ for chunk in response.iter_content(
+ chunk_size=512
+ ): # 512 bytes = 256 samples at 16-bit
+ if chunk:
+ chunk_count += 1
+ total_bytes += len(chunk)
+
+ # Handle first chunk
+ if not audio_started:
+ first_chunk_time = time.time()
+ print(
+ f"\nReceived first chunk after {first_chunk_time - start_time:.2f}s"
+ )
+ print(f"First chunk size: {len(chunk)} bytes")
+ audio_started = True
+
+ # Convert bytes to numpy array and play
+ audio_chunk = np.frombuffer(chunk, dtype=np.int16)
+ stream.write(audio_chunk)
+
+ # Accumulate raw audio data
+ all_audio_data.extend(chunk)
+
+                    # Log progress every 100 chunks
+ if chunk_count % 100 == 0:
+ elapsed = time.time() - start_time
+ print(
+ f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed"
+ )
+
+ # Final stats
+ total_time = time.time() - start_time
+ print(f"\nStream complete:")
+ print(f"Total chunks: {chunk_count}")
+ print(f"Total data: {total_bytes/1024:.1f}KB")
+ print(f"Total time: {total_time:.2f}s")
+ print(f"Average speed: {(total_bytes/1024)/total_time:.1f}KB/s")
+
+ # Save as WAV file
+ if output_file:
+ print(f"\nWriting audio to {output_file}")
+ with wave.open(output_file, "wb") as wav_file:
+ wav_file.setnchannels(1) # Mono
+ wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
+ wav_file.setframerate(sample_rate)
+ wav_file.writeframes(all_audio_data)
+ print(f"Saved {len(all_audio_data)} bytes of audio data")
+
+ # Clean up
+ stream.stop()
+ stream.close()
+
+ except requests.exceptions.ConnectionError as e:
+ print(f"Connection error - Is the server running? Error: {str(e)}")
+ stream.stop()
+ stream.close()
+ except Exception as e:
+ print(f"Error during streaming: {str(e)}")
+ stream.stop()
+ stream.close()
+
+
+def main():
+ # Load sample text from HG Wells
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ wells_path = os.path.join(
+ script_dir, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt"
+ )
+ output_path = os.path.join(script_dir, "output.wav")
+
+ with open(wells_path, "r", encoding="utf-8") as f:
+ full_text = f.read()
+ # Take first few paragraphs
+ text = " ".join(full_text.split("\n\n")[1:3])
+
+ print("\nStarting TTS stream playback...")
+ print(f"Text length: {len(text)} characters")
+ print("\nFirst 100 characters:")
+ print(text[:100] + "...")
+
+ play_streaming_tts(text, output_file=output_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/streaming_refactor/benchmark_unified_streaming.py b/examples/streaming_refactor/benchmark_unified_streaming.py
new file mode 100644
index 0000000000000000000000000000000000000000..369fb6616e3e46767c014b131e6ac50e899d8cd7
--- /dev/null
+++ b/examples/streaming_refactor/benchmark_unified_streaming.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+"""Benchmark script for unified streaming implementation"""
+
+import asyncio
+import time
+from pathlib import Path
+from typing import List, Tuple
+
+from openai import OpenAI
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Initialize OpenAI client
+client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed")
+
+TEST_TEXTS = {
+ "short": "The quick brown fox jumps over the lazy dog.",
+ "medium": """In a bustling city, life moves at a rapid pace.
+ People hurry along the sidewalks, while cars navigate
+ through the busy streets. The air is filled with the
+ sounds of urban activity.""",
+ "long": """The technological revolution has transformed how we live and work.
+ From artificial intelligence to renewable energy, innovations continue
+ to shape our future. As we face global challenges, scientific advances
+ offer new solutions. The intersection of technology and human creativity
+ drives progress forward, opening new possibilities for tomorrow."""
+}
+
+async def benchmark_streaming(text_name: str, text: str) -> Tuple[float, float, int]:
+ """Benchmark streaming performance
+
+ Returns:
+ Tuple of (time to first byte, total time, total bytes)
+ """
+ start_time = time.time()
+ total_bytes = 0
+ first_byte_time = None
+
+ with client.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ response_format="pcm",
+ input=text,
+ ) as response:
+ for chunk in response.iter_bytes(chunk_size=1024):
+ if first_byte_time is None:
+ first_byte_time = time.time() - start_time
+ total_bytes += len(chunk)
+
+ total_time = time.time() - start_time
+ return first_byte_time, total_time, total_bytes
+
+async def benchmark_non_streaming(text_name: str, text: str) -> Tuple[float, int]:
+ """Benchmark non-streaming performance
+
+ Returns:
+ Tuple of (total time, total bytes)
+ """
+ start_time = time.time()
+ speech_file = Path(__file__).parent / f"non_stream_{text_name}.mp3"
+
+ with client.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ input=text,
+ ) as response:
+ response.stream_to_file(speech_file)
+
+ total_time = time.time() - start_time
+ total_bytes = speech_file.stat().st_size
+ return total_time, total_bytes
+
+def plot_results(results: dict):
+ """Plot benchmark results"""
+ plt.figure(figsize=(12, 6))
+
+ # Prepare data
+ text_lengths = [len(text) for text in TEST_TEXTS.values()]
+ streaming_times = [r["streaming"]["total_time"] for r in results.values()]
+ non_streaming_times = [r["non_streaming"]["total_time"] for r in results.values()]
+ first_byte_times = [r["streaming"]["first_byte_time"] for r in results.values()]
+
+ # Plot times
+ x = np.arange(len(TEST_TEXTS))
+ width = 0.25
+
+ plt.bar(x - width, streaming_times, width, label='Streaming Total Time')
+ plt.bar(x, non_streaming_times, width, label='Non-Streaming Total Time')
+ plt.bar(x + width, first_byte_times, width, label='Time to First Byte')
+
+ plt.xlabel('Text Length (characters)')
+ plt.ylabel('Time (seconds)')
+ plt.title('Unified Streaming Performance Comparison')
+ plt.xticks(x, text_lengths)
+ plt.legend()
+
+ # Save plot
+ plt.savefig(Path(__file__).parent / 'benchmark_results.png')
+ plt.close()
+
+async def main():
+ """Run benchmarks"""
+ print("Starting unified streaming benchmarks...")
+
+ results = {}
+
+ for name, text in TEST_TEXTS.items():
+ print(f"\nTesting {name} text ({len(text)} chars)...")
+
+ # Test streaming
+ print("Running streaming test...")
+ first_byte_time, stream_total_time, stream_bytes = await benchmark_streaming(name, text)
+
+ # Test non-streaming
+ print("Running non-streaming test...")
+ non_stream_total_time, non_stream_bytes = await benchmark_non_streaming(name, text)
+
+ results[name] = {
+ "text_length": len(text),
+ "streaming": {
+ "first_byte_time": first_byte_time,
+ "total_time": stream_total_time,
+ "total_bytes": stream_bytes,
+ "throughput": stream_bytes / stream_total_time / 1024 # KB/s
+ },
+ "non_streaming": {
+ "total_time": non_stream_total_time,
+ "total_bytes": non_stream_bytes,
+ "throughput": non_stream_bytes / non_stream_total_time / 1024 # KB/s
+ }
+ }
+
+ # Print results for this test
+ print(f"\nResults for {name} text:")
+ print(f"Streaming:")
+ print(f" Time to first byte: {first_byte_time:.3f}s")
+ print(f" Total time: {stream_total_time:.3f}s")
+ print(f" Throughput: {stream_bytes/stream_total_time/1024:.1f} KB/s")
+ print(f"Non-streaming:")
+ print(f" Total time: {non_stream_total_time:.3f}s")
+ print(f" Throughput: {non_stream_bytes/non_stream_total_time/1024:.1f} KB/s")
+
+ # Plot results
+ plot_results(results)
+ print("\nBenchmark results have been plotted to benchmark_results.png")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/examples/streaming_refactor/test_unified_streaming.py b/examples/streaming_refactor/test_unified_streaming.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9bc5e5ee583cc8172512cfe9bacfac732276194
--- /dev/null
+++ b/examples/streaming_refactor/test_unified_streaming.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Test script for unified streaming implementation"""
+
+import asyncio
+import time
+from pathlib import Path
+
+from openai import OpenAI
+
+# Initialize OpenAI client
+client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed")
+
+async def test_streaming_to_file():
+ """Test streaming to file"""
+ print("\nTesting streaming to file...")
+ speech_file = Path(__file__).parent / "stream_output.mp3"
+
+ start_time = time.time()
+ with client.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ input="Testing unified streaming implementation with a short phrase.",
+ ) as response:
+ response.stream_to_file(speech_file)
+
+ print(f"Streaming to file completed in {(time.time() - start_time):.2f}s")
+ print(f"Output saved to: {speech_file}")
+
+async def test_streaming_chunks():
+ """Test streaming chunks for real-time playback"""
+ print("\nTesting chunk streaming...")
+
+ start_time = time.time()
+ chunk_count = 0
+ total_bytes = 0
+
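+    # PCM is raw audio with no container framing, so chunks can be consumed as
+    # soon as they arrive -- which is what makes time-to-first-byte meaningful.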
+ with client.audio.speech.with_streaming_response.create(
+ model="kokoro",
+ voice="af_bella",
+ response_format="pcm",
+ input="""This is a longer text to test chunk streaming.
+ We want to verify that the unified streaming implementation
+ works efficiently for both small and large inputs.""",
+ ) as response:
+ print(f"Time to first byte: {(time.time() - start_time):.3f}s")
+
+ for chunk in response.iter_bytes(chunk_size=1024):
+ chunk_count += 1
+ total_bytes += len(chunk)
+ # In real usage, this would go to audio playback
+ # For testing, we just count chunks and bytes
+
+ total_time = time.time() - start_time
+ print(f"Received {chunk_count} chunks, {total_bytes} bytes")
+ print(f"Total streaming time: {total_time:.2f}s")
+ print(f"Average throughput: {total_bytes/total_time/1024:.1f} KB/s")
+
+async def main():
+ """Run all tests"""
+ print("Starting unified streaming tests...")
+
+ # Test both streaming modes
+ await test_streaming_to_file()
+ await test_streaming_chunks()
+
+ print("\nAll tests completed!")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..5d082f793e0ca8954f29b436a88608d536ba5ecb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,97 @@
+[project]
+name = "kokoro-fastapi"
+version = "0.3.0"
+description = "FastAPI TTS Service"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+ # Core dependencies
+ "fastapi==0.115.6",
+ "uvicorn==0.34.0",
+ "click>=8.0.0",
+ "pydantic==2.10.4",
+ "pydantic-settings==2.7.0",
+ "python-dotenv==1.0.1",
+ "sqlalchemy==2.0.27",
+ # ML/DL Base
+ "numpy>=1.26.0",
+ "scipy==1.14.1",
+ # Audio processing
+ "soundfile==0.13.0",
+ "regex==2024.11.6",
+ # Utilities
+ "aiofiles==23.2.1",
+ "tqdm==4.67.1",
+ "requests==2.32.3",
+ "munch==4.0.0",
+ "tiktoken==0.8.0",
+ "loguru==0.7.3",
+ "openai>=1.59.6",
+ "pydub>=0.25.1",
+ "matplotlib>=3.10.0",
+ "mutagen>=1.47.0",
+ "psutil>=6.1.1",
+ "espeakng-loader==0.2.4",
+ "kokoro==0.9.2",
+ "misaki[en,ja,ko,zh]==0.9.3",
+ "spacy==3.8.5",
+ "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
+ "inflect>=7.5.0",
+ "phonemizer-fork>=3.3.2",
+ "av>=14.2.0",
+ "text2num>=2.5.1",
+]
+
+[project.optional-dependencies]
+gpu = [
+ "torch==2.6.0+cu124",
+]
+cpu = [
+ "torch==2.6.0",
+]
+test = [
+ "pytest==8.3.5",
+ "pytest-cov==6.0.0",
+ "httpx==0.26.0",
+ "pytest-asyncio==0.25.3",
+ "tomli>=2.0.1",
+ "jinja2>=3.1.6"
+]
+
+[tool.uv]
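+# The cpu and gpu extras pin mutually exclusive torch builds, so uv must
+# treat them as conflicting rather than installable together.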
+conflicts = [
+ [
+ { extra = "cpu" },
+ { extra = "gpu" },
+ ],
+]
+
+[tool.uv.sources]
+torch = [
+ { index = "pytorch-cpu", extra = "cpu" },
+ { index = "pytorch-cuda", extra = "gpu" },
+]
+
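+# explicit = true restricts each index to the packages pinned to it above, so
+# all other dependencies still resolve from PyPI.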
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cuda"
+url = "https://download.pytorch.org/whl/cu124"
+explicit = true
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+package-dir = {"" = "api/src"}
+packages.find = {where = ["api/src"], namespaces = true}
+
+[tool.pytest.ini_options]
+testpaths = ["api/tests", "ui/tests"]
+python_files = ["test_*.py"]
+addopts = "--cov=api --cov=ui --cov-report=term-missing --cov-config=.coveragerc --full-trace"
+asyncio_mode = "auto"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..3bcd461bf66313bfb75164fd8074353ef6307cee
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
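+# NOTE: pytest gives pytest.ini precedence over [tool.pytest.ini_options] in
+# pyproject.toml, so the settings in this file are the ones actually applied.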
+testpaths = api/tests
+python_files = test_*.py
+addopts = -v --tb=short --cov=api --cov-report=term-missing --cov-config=.coveragerc
+pythonpath = .
diff --git a/scripts/fix_misaki.py b/scripts/fix_misaki.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7eb2a69cbdeb47b7e797245a8241f7b17433ef3
--- /dev/null
+++ b/scripts/fix_misaki.py
@@ -0,0 +1,46 @@
+"""
+Patch for misaki package to fix the EspeakWrapper.set_data_path issue.
+"""
+
+import os
+import sys
+
+# Find the misaki package
+try:
+ import misaki
+
+ misaki_path = os.path.dirname(misaki.__file__)
+ print(f"Found misaki package at: {misaki_path}")
+except ImportError:
+ print("Misaki package not found. Make sure it's installed.")
+ sys.exit(1)
+
+# Path to the espeak.py file
+espeak_file = os.path.join(misaki_path, "espeak.py")
+
+if not os.path.exists(espeak_file):
+ print(f"Could not find {espeak_file}")
+ sys.exit(1)
+
+# Read the current content
+with open(espeak_file, "r") as f:
+ content = f.read()
+
+# Check if the problematic line exists
+if "EspeakWrapper.set_data_path(espeakng_loader.get_data_path())" in content:
+ # Replace the problematic line
+ new_content = content.replace(
+ "EspeakWrapper.set_data_path(espeakng_loader.get_data_path())",
+ "# Fixed line to use data_path attribute instead of set_data_path method\n"
+ "EspeakWrapper.data_path = espeakng_loader.get_data_path()",
+ )
+
+ # Write the modified content back
+ with open(espeak_file, "w") as f:
+ f.write(new_content)
+
+ print(f"Successfully patched {espeak_file}")
+else:
+ print(f"The problematic line was not found in {espeak_file}")
+ print("The file may have already been patched or the issue is different.")
diff --git a/scripts/update_badges.py b/scripts/update_badges.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd41b43bd0e24097d764493c6e7600b00945abad
--- /dev/null
+++ b/scripts/update_badges.py
@@ -0,0 +1,139 @@
+import re
+import subprocess
+from pathlib import Path
+
+import tomli
+
+
+def extract_dependency_info():
+ """Extract version for kokoro and misaki from pyproject.toml"""
+ with open("pyproject.toml", "rb") as f:
+ pyproject = tomli.load(f)
+
+ deps = pyproject["project"]["dependencies"]
+ info = {}
+ kokoro_found = False
+ misaki_found = False
+
+ for dep in deps:
+ # Match kokoro==version
+ kokoro_match = re.match(r"^kokoro==(.+)$", dep)
+ if kokoro_match:
+ info["kokoro"] = {"version": kokoro_match.group(1)}
+ kokoro_found = True
+
+ # Match misaki[...] ==version or misaki==version
+ misaki_match = re.match(r"^misaki(?:\[.*?\])?==(.+)$", dep)
+ if misaki_match:
+ info["misaki"] = {"version": misaki_match.group(1)}
+ misaki_found = True
+
+ # Stop if both found
+ if kokoro_found and misaki_found:
+ break
+
+ if not kokoro_found:
+ raise ValueError("Kokoro version not found in pyproject.toml dependencies")
+ if not misaki_found:
+ raise ValueError("Misaki version not found in pyproject.toml dependencies")
+
+ return info
+
+
+def run_pytest_with_coverage():
+ """Run pytest with coverage and return the results"""
+ try:
+ # Run pytest with coverage
+ result = subprocess.run(
+ ["pytest", "--cov=api", "-v"], capture_output=True, text=True, check=True
+ )
+
+ # Extract test results
+ test_output = result.stdout
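+        # Counting "PASSED" occurrences relies on the -v flag above; without
+        # verbose output pytest does not print per-test PASSED lines.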
+ passed_tests = len(re.findall(r"PASSED", test_output))
+
+ # Extract coverage from .coverage file
+ coverage_output = subprocess.run(
+ ["coverage", "report"], capture_output=True, text=True, check=True
+ ).stdout
+
+ # Extract total coverage percentage
+ coverage_match = re.search(r"TOTAL\s+\d+\s+\d+\s+(\d+)%", coverage_output)
+ coverage_percentage = coverage_match.group(1) if coverage_match else "0"
+
+ return passed_tests, coverage_percentage
+ except subprocess.CalledProcessError as e:
+ print(f"Error running tests: {e}")
+ print(f"Output: {e.output}")
+ return 0, "0"
+
+
+def update_readme_badges(passed_tests, coverage_percentage, dep_info):
+ """Update the badges in the README file"""
+ readme_path = Path("README.md")
+ if not readme_path.exists():
+ print("README.md not found")
+ return False
+
+ content = readme_path.read_text()
+
+    # Update tests badge (the badge colour here is illustrative; the regex
+    # accepts any shields.io colour name)
+    content = re.sub(
+        r"!\[Tests\]\(https://img\.shields\.io/badge/tests-\d+%20passed-[a-zA-Z]+\)",
+        f"![Tests](https://img.shields.io/badge/tests-{passed_tests}%20passed-darkgreen)",
+        content,
+    )
+
+    # Update coverage badge (colour likewise illustrative)
+    content = re.sub(
+        r"!\[Coverage\]\(https://img\.shields\.io/badge/coverage-\d+%25-[a-zA-Z]+\)",
+        f"![Coverage](https://img.shields.io/badge/coverage-{coverage_percentage}%25-tan)",
+        content,
+    )
+
+ # Update kokoro badge
+ if "kokoro" in dep_info:
+ # Find badge like kokoro-v0.9.2::abcdefg-BB5420 or kokoro-v0.9.2-BB5420
+ kokoro_version = dep_info["kokoro"]["version"]
+ content = re.sub(
+ r"(!\[Kokoro\]\(https://img\.shields\.io/badge/kokoro-)[^)-]+(-BB5420\))",
+ lambda m: f"{m.group(1)}{kokoro_version}{m.group(2)}",
+ content,
+ )
+
+ # Update misaki badge
+ if "misaki" in dep_info:
+ # Find badge like misaki-v0.9.3::abcdefg-B8860B or misaki-v0.9.3-B8860B
+ misaki_version = dep_info["misaki"]["version"]
+ content = re.sub(
+ r"(!\[Misaki\]\(https://img\.shields\.io/badge/misaki-)[^)-]+(-B8860B\))",
+ lambda m: f"{m.group(1)}{misaki_version}{m.group(2)}",
+ content,
+ )
+
+ readme_path.write_text(content)
+ return True
+
+
+def main():
+ # Get dependency info
+ dep_info = extract_dependency_info()
+
+ # Run tests and get coverage
+ passed_tests, coverage_percentage = run_pytest_with_coverage()
+
+ # Update badges
+ if update_readme_badges(passed_tests, coverage_percentage, dep_info):
+ print(f"Updated badges:")
+ print(f"- Tests: {passed_tests} passed")
+ print(f"- Coverage: {coverage_percentage}%")
+ if "kokoro" in dep_info:
+ print(f"- Kokoro: {dep_info['kokoro']['version']}")
+ if "misaki" in dep_info:
+ print(f"- Misaki: {dep_info['misaki']['version']}")
+ else:
+ print("Failed to update badges")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/update_version.py b/scripts/update_version.py
new file mode 100755
index 0000000000000000000000000000000000000000..e204a56fb9ef9aba51dbf4ecf149d786a6e50150
--- /dev/null
+++ b/scripts/update_version.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+"""
+Version Update Script
+
+This script reads the version from the VERSION file and updates references
+in pyproject.toml, the Helm chart, and README.md.
+"""
+
+import re
+from pathlib import Path
+
+
+# Get the project root directory
+ROOT_DIR = Path(__file__).parent.parent
+
+# --- Configuration ---
+VERSION_FILE = ROOT_DIR / "VERSION"
+PYPROJECT_FILE = ROOT_DIR / "pyproject.toml"
+HELM_CHART_FILE = ROOT_DIR / "charts" / "kokoro-fastapi" / "Chart.yaml"
+README_FILE = ROOT_DIR / "README.md"
+# --- End Configuration ---
+
+
+def update_pyproject(version: str):
+ """Updates the version in pyproject.toml"""
+ if not PYPROJECT_FILE.exists():
+ print(f"Skipping: {PYPROJECT_FILE} not found.")
+ return
+
+ try:
+ content = PYPROJECT_FILE.read_text()
+ # Regex to find and capture current version = "X.Y.Z" under [project]
+ pattern = r'(^\[project\]\s*(?:.*\s)*?version\s*=\s*)"([^"]+)"'
+ match = re.search(pattern, content, flags=re.MULTILINE)
+
+ if not match:
+ print(f"Warning: Version pattern not found in {PYPROJECT_FILE}")
+ return
+
+ current_version = match.group(2)
+ if current_version == version:
+ print(f"Already up-to-date: {PYPROJECT_FILE} (version {version})")
+ else:
+ # Perform replacement
+ new_content = re.sub(
+ pattern, rf'\1"{version}"', content, count=1, flags=re.MULTILINE
+ )
+ PYPROJECT_FILE.write_text(new_content)
+ print(f"Updated {PYPROJECT_FILE} from {current_version} to {version}")
+
+ except Exception as e:
+ print(f"Error processing {PYPROJECT_FILE}: {e}")
+
+
+def update_helm_chart(version: str):
+ """Updates the version and appVersion in the Helm chart"""
+ if not HELM_CHART_FILE.exists():
+ print(f"Skipping: {HELM_CHART_FILE} not found.")
+ return
+
+ try:
+ content = HELM_CHART_FILE.read_text()
+ original_content = content
+ updated_count = 0
+
+ # Update 'version:' line (unquoted)
+ # Looks for 'version:' followed by optional whitespace and the version number
+ version_pattern = r"^(version:\s*)(\S+)"
+ current_version_match = re.search(version_pattern, content, flags=re.MULTILINE)
+ if current_version_match and current_version_match.group(2) != version:
+ content = re.sub(
+ version_pattern,
+ rf"\g<1>{version}",
+ content,
+ count=1,
+ flags=re.MULTILINE,
+ )
+ print(
+ f"Updating 'version' in {HELM_CHART_FILE} from {current_version_match.group(2)} to {version}"
+ )
+ updated_count += 1
+ elif current_version_match:
+ print(f"Already up-to-date: 'version' in {HELM_CHART_FILE} is {version}")
+ else:
+ print(f"Warning: 'version:' pattern not found in {HELM_CHART_FILE}")
+
+ # Update 'appVersion:' line (quoted or unquoted)
+ # Looks for 'appVersion:' followed by optional whitespace, optional quote, the version, optional quote
+ app_version_pattern = r"^(appVersion:\s*)(\"?)([^\"\s]+)(\"?)"
+ current_app_version_match = re.search(
+ app_version_pattern, content, flags=re.MULTILINE
+ )
+
+ if current_app_version_match:
+            leading_whitespace = current_app_version_match.group(1)  # e.g., "appVersion: "
+ opening_quote = current_app_version_match.group(2) # e.g., '"' or ''
+ current_app_ver = current_app_version_match.group(3) # e.g., '0.2.0'
+ closing_quote = current_app_version_match.group(4) # e.g., '"' or ''
+
+ # Check if quotes were consistent (both present or both absent)
+ if opening_quote != closing_quote:
+ print(
+ f"Warning: Inconsistent quotes found for appVersion in {HELM_CHART_FILE}. Skipping update for this line."
+ )
+ elif (
+ current_app_ver == version and opening_quote == '"'
+ ): # Check if already correct *and* quoted
+ print(
+ f"Already up-to-date: 'appVersion' in {HELM_CHART_FILE} is \"{version}\""
+ )
+ else:
+ # Always replace with the quoted version
+ replacement = f'{leading_whitespace}"{version}"' # Ensure quotes
+ original_display = f"{opening_quote}{current_app_ver}{closing_quote}" # How it looked before
+ target_display = f'"{version}"' # How it should look
+
+ # Only report update if the displayed value actually changes
+ if original_display != target_display:
+ content = re.sub(
+ app_version_pattern,
+ replacement,
+ content,
+ count=1,
+ flags=re.MULTILINE,
+ )
+ print(
+ f"Updating 'appVersion' in {HELM_CHART_FILE} from {original_display} to {target_display}"
+ )
+ updated_count += 1
+ else:
+                    # The displayed value already matches the target. Only
+                    # report it as up-to-date when the 'version' field was not
+                    # just updated, to avoid printing two messages in one run.
+                    if not (content != original_content and updated_count > 0):
+ print(
+ f"Already up-to-date: 'appVersion' in {HELM_CHART_FILE} is {target_display}"
+ )
+
+ else:
+ print(f"Warning: 'appVersion:' pattern not found in {HELM_CHART_FILE}")
+
+ # Write back only if changes were made
+ if content != original_content:
+ HELM_CHART_FILE.write_text(content)
+ # Confirmation message printed above during the specific update
+ elif updated_count == 0 and current_version_match and current_app_version_match:
+ # If no updates were made but patterns were found, confirm it's up-to-date overall
+ print(f"Already up-to-date: {HELM_CHART_FILE} (version {version})")
+
+ except Exception as e:
+ print(f"Error processing {HELM_CHART_FILE}: {e}")
+
+
+def update_readme(version_with_v: str):
+ """Updates Docker image tags in README.md"""
+ if not README_FILE.exists():
+ print(f"Skipping: {README_FILE} not found.")
+ return
+
+ try:
+ content = README_FILE.read_text()
+ # Regex to find and capture current ghcr.io/.../kokoro-fastapi-(cpu|gpu):vX.Y.Z
+ pattern = r"(ghcr\.io/remsky/kokoro-fastapi-(?:cpu|gpu)):(v\d+\.\d+\.\d+)"
+ matches = list(re.finditer(pattern, content)) # Find all occurrences
+
+ if not matches:
+ print(f"Warning: Docker image tag pattern not found in {README_FILE}")
+ else:
+            update_needed = False
+            for match in matches:
+                current_tag = match.group(2)
+                if current_tag != version_with_v:
+                    update_needed = True
+                    break  # One mismatch is enough to trigger an update
+
+            if update_needed:
+ # Perform replacement on all occurrences
+ new_content = re.sub(pattern, rf"\1:{version_with_v}", content)
+ README_FILE.write_text(new_content)
+ print(f"Updated Docker image tags in {README_FILE} to {version_with_v}")
+ else:
+ print(
+ f"Already up-to-date: Docker image tags in {README_FILE} (version {version_with_v})"
+ )
+
+            # Warn if any ':latest' tags are still present
+ if ":latest" in content:
+ print(
+ f"Warning: Found ':latest' tag in {README_FILE}. Consider updating manually if needed."
+ )
+
+ except Exception as e:
+ print(f"Error processing {README_FILE}: {e}")
+
+
+def main():
+ # Read the version from the VERSION file
+ if not VERSION_FILE.exists():
+ print(f"Error: {VERSION_FILE} not found.")
+ return
+
+ try:
+ version = VERSION_FILE.read_text().strip()
+ if not re.match(r"^\d+\.\d+\.\d+$", version):
+ print(
+ f"Error: Invalid version format '{version}' in {VERSION_FILE}. Expected X.Y.Z"
+ )
+ return
+ except Exception as e:
+ print(f"Error reading {VERSION_FILE}: {e}")
+ return
+
+ print(f"Read version: {version} from {VERSION_FILE}")
+ print("-" * 20)
+
+ # Prepare versions (with and without 'v')
+ version_plain = version
+ version_with_v = f"v{version}"
+
+ # Update files
+ update_pyproject(version_plain)
+ update_helm_chart(version_plain)
+ update_readme(version_with_v)
+
+ print("-" * 20)
+ print("Version update script finished.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/ui/Dockerfile b/ui/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e1726fbb25d4c632f036bf9323952b0ed5220643
--- /dev/null
+++ b/ui/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.10-slim
+
+WORKDIR /app/ui
+
+# Install dependencies
+RUN pip install gradio==5.9.1 requests==2.32.3
+
+# Create necessary directories
+RUN mkdir -p data/inputs data/outputs
+
+# Copy the application files
+COPY . .
+
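+# Host/port of the TTS API; API_HOST is assumed to match the name of the TTS
+# service on the shared Docker network (adjust for other deployments)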
+ENV API_HOST=kokoro-tts
+ENV API_PORT=8880
+
+# Run the Gradio app
+CMD ["python", "app.py"]
diff --git a/ui/app.py b/ui/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..96aae35ec00be26cb45f16a0b4a1d5069c60ba15
--- /dev/null
+++ b/ui/app.py
@@ -0,0 +1,5 @@
+from lib.interface import create_interface
+
+if __name__ == "__main__":
+ demo = create_interface()
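+    # Bind to all interfaces so the Gradio port is reachable from outside the container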
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
diff --git a/ui/data/inputs/test_timemachine.txt b/ui/data/inputs/test_timemachine.txt
new file mode 100644
index 0000000000000000000000000000000000000000..50583f33387d6b09716575243573c5f0caa5a8c8
--- /dev/null
+++ b/ui/data/inputs/test_timemachine.txt
@@ -0,0 +1,151 @@
+The Time Traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us. His pale grey eyes shone and twinkled, and his usually pale face was flushed and animated. The fire burnt brightly, and the soft radiance of the incandescent lights in the lilies of silver caught the bubbles that flashed and passed in our glasses. Our chairs, being his patents, embraced and caressed us rather than submitted to be sat upon, and there was that luxurious after-dinner atmosphere, when thought runs gracefully free of the trammels of precision. And he put it to us in this way—marking the points with a lean forefinger—as we sat and lazily admired his earnestness over this new paradox (as we thought it) and his fecundity.
+
+“You must follow me carefully. I shall have to controvert one or two ideas that are almost universally accepted. The geometry, for instance, they taught you at school is founded on a misconception.”
+
+“Is not that rather a large thing to expect us to begin upon?” said Filby, an argumentative person with red hair.
+
+“I do not mean to ask you to accept anything without reasonable ground for it. You will soon admit as much as I need from you. You know of course that a mathematical line, a line of thickness nil, has no real existence. They taught you that? Neither has a mathematical plane. These things are mere abstractions.”
+
+“That is all right,” said the Psychologist.
+
+“Nor, having only length, breadth, and thickness, can a cube have a real existence.”
+
+“There I object,” said Filby. “Of course a solid body may exist. All real things—”
+
+“So most people think. But wait a moment. Can an instantaneous cube exist?”
+
+“Don’t follow you,” said Filby.
+
+“Can a cube that does not last for any time at all, have a real existence?”
+
+Filby became pensive. “Clearly,” the Time Traveller proceeded, “any real body must have extension in four directions: it must have Length, Breadth, Thickness, and—Duration. But through a natural infirmity of the flesh, which I will explain to you in a moment, we incline to overlook this fact. There are really four dimensions, three which we call the three planes of Space, and a fourth, Time. There is, however, a tendency to draw an unreal distinction between the former three dimensions and the latter, because it happens that our consciousness moves intermittently in one direction along the latter from the beginning to the end of our lives.”
+
+“That,” said a very young man, making spasmodic efforts to relight his cigar over the lamp; “that . . . very clear indeed.”
+
+“Now, it is very remarkable that this is so extensively overlooked,” continued the Time Traveller, with a slight accession of cheerfulness. “Really this is what is meant by the Fourth Dimension, though some people who talk about the Fourth Dimension do not know they mean it. It is only another way of looking at Time. There is no difference between Time and any of the three dimensions of Space except that our consciousness moves along it. But some foolish people have got hold of the wrong side of that idea. You have all heard what they have to say about this Fourth Dimension?”
+
+“I have not,” said the Provincial Mayor.
+
+“It is simply this. That Space, as our mathematicians have it, is spoken of as having three dimensions, which one may call Length, Breadth, and Thickness, and is always definable by reference to three planes, each at right angles to the others. But some philosophical people have been asking why three dimensions particularly—why not another direction at right angles to the other three?—and have even tried to construct a Four-Dimensional geometry. Professor Simon Newcomb was expounding this to the New York Mathematical Society only a month or so ago. You know how on a flat surface, which has only two dimensions, we can represent a figure of a three-dimensional solid, and similarly they think that by models of three dimensions they could represent one of four—if they could master the perspective of the thing. See?”
+
+“I think so,” murmured the Provincial Mayor; and, knitting his brows, he lapsed into an introspective state, his lips moving as one who repeats mystic words. “Yes, I think I see it now,” he said after some time, brightening in a quite transitory manner.
+
+“Well, I do not mind telling you I have been at work upon this geometry of Four Dimensions for some time. Some of my results are curious. For instance, here is a portrait of a man at eight years old, another at fifteen, another at seventeen, another at twenty-three, and so on. All these are evidently sections, as it were, Three-Dimensional representations of his Four-Dimensioned being, which is a fixed and unalterable thing.
+
+“Scientific people,” proceeded the Time Traveller, after the pause required for the proper assimilation of this, “know very well that Time is only a kind of Space. Here is a popular scientific diagram, a weather record. This line I trace with my finger shows the movement of the barometer. Yesterday it was so high, yesterday night it fell, then this morning it rose again, and so gently upward to here. Surely the mercury did not trace this line in any of the dimensions of Space generally recognised? But certainly it traced such a line, and that line, therefore, we must conclude, was along the Time-Dimension.”
+
+“But,” said the Medical Man, staring hard at a coal in the fire, “if Time is really only a fourth dimension of Space, why is it, and why has it always been, regarded as something different? And why cannot we move in Time as we move about in the other dimensions of Space?”
+
+The Time Traveller smiled. “Are you so sure we can move freely in Space? Right and left we can go, backward and forward freely enough, and men always have done so. I admit we move freely in two dimensions. But how about up and down? Gravitation limits us there.”
+
+“Not exactly,” said the Medical Man. “There are balloons.”
+
+“But before the balloons, save for spasmodic jumping and the inequalities of the surface, man had no freedom of vertical movement.”
+
+“Still they could move a little up and down,” said the Medical Man.
+
+“Easier, far easier down than up.”
+
+“And you cannot move at all in Time, you cannot get away from the present moment.”
+
+“My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension with a uniform velocity from the cradle to the grave. Just as we should travel down if we began our existence fifty miles above the earth’s surface.”
+
+“But the great difficulty is this,” interrupted the Psychologist. “You can move about in all directions of Space, but you cannot move about in Time.”
+
+“That is the germ of my great discovery. But you are wrong to say that we cannot move about in Time. For instance, if I am recalling an incident very vividly I go back to the instant of its occurrence: I become absent-minded, as you say. I jump back for a moment. Of course we have no means of staying back for any length of Time, any more than a savage or an animal has of staying six feet above the ground. But a civilised man is better off than the savage in this respect. He can go up against gravitation in a balloon, and why should he not hope that ultimately he may be able to stop or accelerate his drift along the Time-Dimension, or even turn about and travel the other way?”
+
+“Oh, this,” began Filby, “is all—”
+
+“Why not?” said the Time Traveller.
+
+“It’s against reason,” said Filby.
+
+“What reason?” said the Time Traveller.
+
+“You can show black is white by argument,” said Filby, “but you will never convince me.”
+
+“Possibly not,” said the Time Traveller. “But now you begin to see the object of my investigations into the geometry of Four Dimensions. Long ago I had a vague inkling of a machine—”
+
+“To travel through Time!” exclaimed the Very Young Man.
+
+“That shall travel indifferently in any direction of Space and Time, as the driver determines.”
+
+Filby contented himself with laughter.
+
+“But I have experimental verification,” said the Time Traveller.
+
+“It would be remarkably convenient for the historian,” the Psychologist suggested. “One might travel back and verify the accepted account of the Battle of Hastings, for instance!”
+
+“Don’t you think you would attract attention?” said the Medical Man. “Our ancestors had no great tolerance for anachronisms.”
+
+“One might get one’s Greek from the very lips of Homer and Plato,” the Very Young Man thought.
+
+“In which case they would certainly plough you for the Little-go. The German scholars have improved Greek so much.”
+
+“Then there is the future,” said the Very Young Man. “Just think! One might invest all one’s money, leave it to accumulate at interest, and hurry on ahead!”
+
+“To discover a society,” said I, “erected on a strictly communistic basis.”
+
+“Of all the wild extravagant theories!” began the Psychologist.
+
+“Yes, so it seemed to me, and so I never talked of it until—”
+
+“Experimental verification!” cried I. “You are going to verify that?”
+
+“The experiment!” cried Filby, who was getting brain-weary.
+
+“Let’s see your experiment anyhow,” said the Psychologist, “though it’s all humbug, you know.”
+
+The Time Traveller smiled round at us. Then, still smiling faintly, and with his hands deep in his trousers pockets, he walked slowly out of the room, and we heard his slippers shuffling down the long passage to his laboratory.
+
+The Psychologist looked at us. “I wonder what he’s got?”
+
+“Some sleight-of-hand trick or other,” said the Medical Man, and Filby tried to tell us about a conjuror he had seen at Burslem, but before he had finished his preface the Time Traveller came back, and Filby’s anecdote collapsed.
+
+II.
+The Machine
+The thing the Time Traveller held in his hand was a glittering metallic framework, scarcely larger than a small clock, and very delicately made. There was ivory in it, and some transparent crystalline substance. And now I must be explicit, for this that follows—unless his explanation is to be accepted—is an absolutely unaccountable thing. He took one of the small octagonal tables that were scattered about the room, and set it in front of the fire, with two legs on the hearthrug. On this table he placed the mechanism. Then he drew up a chair, and sat down. The only other object on the table was a small shaded lamp, the bright light of which fell upon the model. There were also perhaps a dozen candles about, two in brass candlesticks upon the mantel and several in sconces, so that the room was brilliantly illuminated. I sat in a low arm-chair nearest the fire, and I drew this forward so as to be almost between the Time Traveller and the fireplace. Filby sat behind him, looking over his shoulder. The Medical Man and the Provincial Mayor watched him in profile from the right, the Psychologist from the left. The Very Young Man stood behind the Psychologist. We were all on the alert. It appears incredible to me that any kind of trick, however subtly conceived and however adroitly done, could have been played upon us under these conditions.
+
+The Time Traveller looked at us, and then at the mechanism. “Well?” said the Psychologist.
+
+“This little affair,” said the Time Traveller, resting his elbows upon the table and pressing his hands together above the apparatus, “is only a model. It is my plan for a machine to travel through time. You will notice that it looks singularly askew, and that there is an odd twinkling appearance about this bar, as though it was in some way unreal.” He pointed to the part with his finger. “Also, here is one little white lever, and here is another.”
+
+The Medical Man got up out of his chair and peered into the thing. “It’s beautifully made,” he said.
+
+“It took two years to make,” retorted the Time Traveller. Then, when we had all imitated the action of the Medical Man, he said: “Now I want you clearly to understand that this lever, being pressed over, sends the machine gliding into the future, and this other reverses the motion. This saddle represents the seat of a time traveller. Presently I am going to press the lever, and off the machine will go. It will vanish, pass into future Time, and disappear. Have a good look at the thing. Look at the table too, and satisfy yourselves there is no trickery. I don’t want to waste this model, and then be told I’m a quack.”
+
+There was a minute’s pause perhaps. The Psychologist seemed about to speak to me, but changed his mind. Then the Time Traveller put forth his finger towards the lever. “No,” he said suddenly. “Lend me your hand.” And turning to the Psychologist, he took that individual’s hand in his own and told him to put out his forefinger. So that it was the Psychologist himself who sent forth the model Time Machine on its interminable voyage. We all saw the lever turn. I am absolutely certain there was no trickery. There was a breath of wind, and the lamp flame jumped. One of the candles on the mantel was blown out, and the little machine suddenly swung round, became indistinct, was seen as a ghost for a second perhaps, as an eddy of faintly glittering brass and ivory; and it was gone—vanished! Save for the lamp the table was bare.
+
+Everyone was silent for a minute. Then Filby said he was damned.
+
+The Psychologist recovered from his stupor, and suddenly looked under the table. At that the Time Traveller laughed cheerfully. “Well?” he said, with a reminiscence of the Psychologist. Then, getting up, he went to the tobacco jar on the mantel, and with his back to us began to fill his pipe.
+
+We stared at each other. “Look here,” said the Medical Man, “are you in earnest about this? Do you seriously believe that that machine has travelled into time?”
+
+“Certainly,” said the Time Traveller, stooping to light a spill at the fire. Then he turned, lighting his pipe, to look at the Psychologist’s face. (The Psychologist, to show that he was not unhinged, helped himself to a cigar and tried to light it uncut.) “What is more, I have a big machine nearly finished in there”—he indicated the laboratory—“and when that is put together I mean to have a journey on my own account.”
+
+“You mean to say that that machine has travelled into the future?” said Filby.
+
+“Into the future or the past—I don’t, for certain, know which.”
+
+After an interval the Psychologist had an inspiration. “It must have gone into the past if it has gone anywhere,” he said.
+
+“Why?” said the Time Traveller.
+
+“Because I presume that it has not moved in space, and if it travelled into the future it would still be here all this time, since it must have travelled through this time.”
+
+“But,” said I, “If it travelled into the past it would have been visible when we came first into this room; and last Thursday when we were here; and the Thursday before that; and so forth!”
+
+“Serious objections,” remarked the Provincial Mayor, with an air of impartiality, turning towards the Time Traveller.
+
+“Not a bit,” said the Time Traveller, and, to the Psychologist: “You think. You can explain that. It’s presentation below the threshold, you know, diluted presentation.”
+
+“Of course,” said the Psychologist, and reassured us. “That’s a simple point of psychology. I should have thought of it. It’s plain enough, and helps the paradox delightfully. We cannot see it, nor can we appreciate this machine, any more than we can the spoke of a wheel spinning, or a bullet flying through the air. If it is travelling through time fifty times or a hundred times faster than we are, if it gets through a minute while we get through a second, the impression it creates will of course be only one-fiftieth or one-hundredth of what it would make if it were not travelling in time. That’s plain enough.” He passed his hand through the space in which the machine had been. “You see?” he said, laughing.
+
+We sat and stared at the vacant table for a minute or so. Then the Time Traveller asked us what we thought of it all.
+
+“It sounds plausible enough tonight,” said the Medical Man; “but wait until tomorrow. Wait for the common sense of the morning.”
+
+“Would you like to see the Time Machine itself?” asked the Time Traveller. And therewith, taking the lamp in his hand, he led the way down the long, draughty corridor to his laboratory. I remember vividly the flickering light, his queer, broad head in silhouette, the dance of the shadows, how we all followed him, puzzled but incredulous, and how there in the laboratory we beheld a larger edition of the little mechanism which we had seen vanish from before our eyes. Parts were of nickel, parts of ivory, parts had certainly been filed or sawn out of rock crystal. The thing was generally complete, but the twisted crystalline bars lay unfinished upon the bench beside some
+The Time Traveller Returns
+I think that at that time none of us quite believed in the Time Machine. The fact is, the Time Traveller was one of those men who are too clever to be believed: you never felt that you saw all round him; you always suspected some subtle reserve, some ingenuity in ambush, behind his lucid frankness. Had Filby shown the model and explained the matter in the Time Traveller’s words, we should have shown him far less scepticism. For we should have perceived his motives: a pork-butcher could understand Filby. But the Time Traveller had more than a touch of whim among his elements, and we distrusted him. Things that would have made the fame of a less clever man seemed tricks in his hands. It is a mistake to do things too easily. The serious people who took him seriously never felt quite sure of his deportment; they were somehow aware that trusting their reputations for judgment with him was like furnishing a nursery with eggshell china. So I don’t think any of us said very much about time travelling in the interval between that Thursday and the next, though its odd potentialities ran, no doubt, in most of our minds: its plausibility, that is, its practical incredibleness, the curious possibilities of anachronism and of utter confusion it suggested. For my own part, I was particularly preoccupied with the trick of the model. That I remember discussing with the Medical Man, whom I met on Friday at the Linnæan. He said he had seen a similar thing at Tübingen, and laid considerable stress on the blowing-out of the candle. But how the trick was done he could not explai
\ No newline at end of file
diff --git a/ui/depr_tests/conftest.py b/ui/depr_tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a65b691d70a0119020edf8f3ade88f09fd0c8c3
--- /dev/null
+++ b/ui/depr_tests/conftest.py
@@ -0,0 +1,52 @@
+from unittest.mock import AsyncMock, Mock
+
+import pytest
+
+
+@pytest.fixture
+async def mock_model_manager():
+ """Mock model manager for UI tests"""
+ manager = AsyncMock()
+ manager.get_backend = Mock(return_value=Mock(device="cpu"))
+ return manager
+
+
+@pytest.fixture
+async def mock_voice_manager():
+ """Mock voice manager for UI tests"""
+ manager = AsyncMock()
+ manager.list_voices = AsyncMock(return_value=["af_heart", "bm_lewis", "af_sarah"])
+ return manager
+
+
+@pytest.fixture
+async def mock_tts_service(mock_model_manager, mock_voice_manager):
+ """Mock TTSService for UI tests"""
+ service = AsyncMock()
+ service.model_manager = mock_model_manager
+ service._voice_manager = mock_voice_manager
+ return service
+
+
+@pytest.fixture(autouse=True)
+async def setup_mocks(
+ monkeypatch, mock_model_manager, mock_voice_manager, mock_tts_service
+):
+ """Setup global mocks for UI tests"""
+
+ async def mock_get_model():
+ return mock_model_manager
+
+ async def mock_get_voice():
+ return mock_voice_manager
+
+ async def mock_create_service():
+ return mock_tts_service
+
+ monkeypatch.setattr("api.src.inference.model_manager.get_manager", mock_get_model)
+ monkeypatch.setattr("api.src.inference.voice_manager.get_manager", mock_get_voice)
+ monkeypatch.setattr(
+ "api.src.services.tts_service.TTSService.create", mock_create_service
+ )
diff --git a/ui/depr_tests/test_api.py b/ui/depr_tests/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..37157f024c1b49e3df7709b93894e268ad6ce3f6
--- /dev/null
+++ b/ui/depr_tests/test_api.py
@@ -0,0 +1,167 @@
+from unittest.mock import mock_open, patch
+
+import pytest
+import requests
+
+from ui.lib import api
+
+
+@pytest.fixture
+def mock_response():
+ class MockResponse:
+ def __init__(self, json_data, status_code=200, content=b"audio data"):
+ self._json = json_data
+ self.status_code = status_code
+ self.content = content
+
+ def json(self):
+ return self._json
+
+ def raise_for_status(self):
+ if self.status_code != 200:
+ raise requests.exceptions.HTTPError(f"HTTP {self.status_code}")
+
+ return MockResponse
+
+
+def test_check_api_status_success(mock_response):
+ """Test successful API status check"""
+ mock_data = {"voices": ["voice1", "voice2"]}
+ with patch("requests.get", return_value=mock_response(mock_data)):
+ status, voices = api.check_api_status()
+ assert status is True
+ assert voices == ["voice1", "voice2"]
+
+
+def test_check_api_status_no_voices(mock_response):
+ """Test API response with no voices"""
+ with patch("requests.get", return_value=mock_response({"voices": []})):
+ status, voices = api.check_api_status()
+ assert status is False
+ assert voices == []
+
+
+def test_check_api_status_timeout():
+ """Test API timeout"""
+ with patch("requests.get", side_effect=requests.exceptions.Timeout):
+ status, voices = api.check_api_status()
+ assert status is False
+ assert voices == []
+
+
+def test_check_api_status_connection_error():
+ """Test API connection error"""
+ with patch("requests.get", side_effect=requests.exceptions.ConnectionError):
+ status, voices = api.check_api_status()
+ assert status is False
+ assert voices == []
+
+
+def test_text_to_speech_success(mock_response, tmp_path):
+ """Test successful speech generation"""
+ with (
+ patch("requests.post", return_value=mock_response({})),
+ patch("ui.lib.api.OUTPUTS_DIR", str(tmp_path)),
+ patch("builtins.open", mock_open()) as mock_file,
+ ):
+ result = api.text_to_speech("test text", "voice1", "mp3", 1.0)
+
+ assert result is not None
+ assert "output_" in result
+ assert result.endswith(".mp3")
+ mock_file.assert_called_once()
+
+
+def test_text_to_speech_empty_text():
+ """Test speech generation with empty text"""
+ result = api.text_to_speech("", "voice1", "mp3", 1.0)
+ assert result is None
+
+
+def test_text_to_speech_timeout():
+ """Test speech generation timeout"""
+ with patch("requests.post", side_effect=requests.exceptions.Timeout):
+ result = api.text_to_speech("test", "voice1", "mp3", 1.0)
+ assert result is None
+
+
+def test_text_to_speech_request_error():
+ """Test speech generation request error"""
+ with patch("requests.post", side_effect=requests.exceptions.RequestException):
+ result = api.text_to_speech("test", "voice1", "mp3", 1.0)
+ assert result is None
+
+
+def test_get_status_html_available():
+ """Test status HTML generation for available service"""
+ html = api.get_status_html(True)
+ assert "green" in html
+ assert "Available" in html
+
+
+def test_get_status_html_unavailable():
+ """Test status HTML generation for unavailable service"""
+ html = api.get_status_html(False)
+ assert "red" in html
+ assert "Unavailable" in html
+
+
+def test_text_to_speech_api_params(mock_response, tmp_path):
+ """Test correct API parameters are sent"""
+ test_cases = [
+ # Single voice as string
+ ("voice1", "voice1"),
+ # Multiple voices as list
+ (["voice1", "voice2"], "voice1+voice2"),
+ # Single voice as list
+ (["voice1"], "voice1"),
+ ]
+
+ for input_voice, expected_voice in test_cases:
+ with (
+ patch("requests.post") as mock_post,
+ patch("ui.lib.api.OUTPUTS_DIR", str(tmp_path)),
+ patch("builtins.open", mock_open()),
+ ):
+ mock_post.return_value = mock_response({})
+ api.text_to_speech("test text", input_voice, "mp3", 1.5)
+
+ mock_post.assert_called_once()
+ args, kwargs = mock_post.call_args
+
+ # Check request body
+ assert kwargs["json"] == {
+ "model": "kokoro",
+ "input": "test text",
+ "voice": expected_voice,
+ "response_format": "mp3",
+ "speed": 1.5,
+ }
+
+ # Check headers and timeout
+ assert kwargs["headers"] == {"Content-Type": "application/json"}
+ assert kwargs["timeout"] == 300
+
+
+def test_text_to_speech_output_filename(mock_response, tmp_path):
+ """Test output filename contains correct voice identifier"""
+ test_cases = [
+ # Single voice
+ ("voice1", lambda f: "voice-voice1" in f),
+ # Multiple voices
+ (["voice1", "voice2"], lambda f: "voice-voice1+voice2" in f),
+ ]
+
+ for input_voice, filename_check in test_cases:
+ with (
+ patch("requests.post", return_value=mock_response({})),
+ patch("ui.lib.api.OUTPUTS_DIR", str(tmp_path)),
+ patch("builtins.open", mock_open()) as mock_file,
+ ):
+ result = api.text_to_speech("test text", input_voice, "mp3", 1.0)
+
+ assert result is not None
+ assert filename_check(result), (
+ f"Expected voice pattern not found in filename: {result}"
+ )
+ mock_file.assert_called_once()
diff --git a/ui/depr_tests/test_components.py b/ui/depr_tests/test_components.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddd831b8c8380da1a310278016a1d30c7a370cff
--- /dev/null
+++ b/ui/depr_tests/test_components.py
@@ -0,0 +1,114 @@
+import gradio as gr
+import pytest
+
+from ui.lib.components.model import create_model_column
+from ui.lib.components.output import create_output_column
+from ui.lib.config import AUDIO_FORMATS
+
+
+def test_create_model_column_structure():
+ """Test that create_model_column returns the expected structure"""
+ voice_ids = ["voice1", "voice2"]
+ column, components = create_model_column(voice_ids)
+
+ # Test return types
+ assert isinstance(column, gr.Column)
+ assert isinstance(components, dict)
+
+ # Test expected components presence
+ expected_components = {"status_btn", "voice", "format", "speed"}
+ assert set(components.keys()) == expected_components
+
+ # Test component types
+ assert isinstance(components["status_btn"], gr.Button)
+ assert isinstance(components["voice"], gr.Dropdown)
+ assert isinstance(components["format"], gr.Dropdown)
+ assert isinstance(components["speed"], gr.Slider)
+
+
+def test_model_column_default_values():
+ """Test the default values of model column components"""
+ voice_ids = ["voice1", "voice2"]
+ _, components = create_model_column(voice_ids)
+
+ # Test voice dropdown
+ # Gradio Dropdown converts choices to (value, label) tuples
+ expected_choices = [(voice_id, voice_id) for voice_id in voice_ids]
+ assert components["voice"].choices == expected_choices
+ # Value is not converted to tuple format for the value property
+ assert components["voice"].value == [voice_ids[0]]
+ assert components["voice"].interactive is True
+ assert components["voice"].multiselect is True
+ assert components["voice"].label == "Voice(s)"
+
+ # Test format dropdown
+ # Gradio Dropdown converts choices to (value, label) tuples
+ expected_format_choices = [(fmt, fmt) for fmt in AUDIO_FORMATS]
+ assert components["format"].choices == expected_format_choices
+ assert components["format"].value == "mp3"
+
+ # Test speed slider
+ assert components["speed"].minimum == 0.5
+ assert components["speed"].maximum == 2.0
+ assert components["speed"].value == 1.0
+ assert components["speed"].step == 0.1
+
+
+def test_model_column_no_voices():
+ """Test model column creation with no voice IDs"""
+ _, components = create_model_column([])
+
+ assert components["voice"].choices == []
+ assert components["voice"].value is None
+
+
+def test_create_output_column_structure():
+ """Test that create_output_column returns the expected structure"""
+ column, components = create_output_column()
+
+ # Test return types
+ assert isinstance(column, gr.Column)
+ assert isinstance(components, dict)
+
+ # Test expected components presence
+ expected_components = {
+ "audio_output",
+ "output_files",
+ "play_btn",
+ "selected_audio",
+ "clear_outputs",
+ }
+ assert set(components.keys()) == expected_components
+
+ # Test component types
+ assert isinstance(components["audio_output"], gr.Audio)
+ assert isinstance(components["output_files"], gr.Dropdown)
+ assert isinstance(components["play_btn"], gr.Button)
+ assert isinstance(components["selected_audio"], gr.Audio)
+ assert isinstance(components["clear_outputs"], gr.Button)
+
+
+def test_output_column_configuration():
+ """Test the configuration of output column components"""
+ _, components = create_output_column()
+
+ # Test audio output configuration
+ assert components["audio_output"].label == "Generated Speech"
+ assert components["audio_output"].type == "filepath"
+
+ # Test output files dropdown
+ assert components["output_files"].label == "Previous Outputs"
+ assert components["output_files"].allow_custom_value is True
+
+ # Test play button
+ assert components["play_btn"].value == "▶️ Play Selected"
+ assert components["play_btn"].size == "sm"
+
+ # Test selected audio configuration
+ assert components["selected_audio"].label == "Selected Output"
+ assert components["selected_audio"].type == "filepath"
+ assert components["selected_audio"].visible is False
+
+ # Test clear outputs button
+ assert components["clear_outputs"].size == "sm"
+ assert components["clear_outputs"].variant == "secondary"
diff --git a/ui/depr_tests/test_files.py b/ui/depr_tests/test_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..30be29319d583a3ab93c137c4aedf2bf5eac542a
--- /dev/null
+++ b/ui/depr_tests/test_files.py
@@ -0,0 +1,198 @@
+import os
+from unittest.mock import patch
+
+import pytest
+
+from ui.lib import files
+from ui.lib.config import AUDIO_FORMATS
+
+
+@pytest.fixture
+def mock_dirs(tmp_path):
+ """Create temporary input and output directories"""
+ inputs_dir = tmp_path / "inputs"
+ outputs_dir = tmp_path / "outputs"
+ inputs_dir.mkdir()
+ outputs_dir.mkdir()
+
+ with (
+ patch("ui.lib.files.INPUTS_DIR", str(inputs_dir)),
+ patch("ui.lib.files.OUTPUTS_DIR", str(outputs_dir)),
+ ):
+ yield inputs_dir, outputs_dir
+
+
+def test_list_input_files_empty(mock_dirs):
+ """Test listing input files from empty directory"""
+ assert files.list_input_files() == []
+
+
+def test_list_input_files(mock_dirs):
+ """Test listing input files with various files"""
+ inputs_dir, _ = mock_dirs
+
+ # Create test files
+ (inputs_dir / "test1.txt").write_text("content1")
+ (inputs_dir / "test2.txt").write_text("content2")
+ (inputs_dir / "nottext.pdf").write_text("should not be listed")
+
+ result = files.list_input_files()
+ assert len(result) == 2
+ assert "test1.txt" in result
+ assert "test2.txt" in result
+ assert "nottext.pdf" not in result
+
+
+def test_list_output_files_empty(mock_dirs):
+ """Test listing output files from empty directory"""
+ assert files.list_output_files() == []
+
+
+def test_list_output_files(mock_dirs):
+ """Test listing output files with various formats"""
+ _, outputs_dir = mock_dirs
+
+ # Create test files for each format
+ for fmt in AUDIO_FORMATS:
+ (outputs_dir / f"test.{fmt}").write_text("dummy content")
+ (outputs_dir / "test.txt").write_text("should not be listed")
+
+ result = files.list_output_files()
+ assert len(result) == len(AUDIO_FORMATS)
+ for fmt in AUDIO_FORMATS:
+ assert any(f".{fmt}" in file for file in result)
+
+
+def test_read_text_file_empty_filename(mock_dirs):
+ """Test reading with empty filename"""
+ assert files.read_text_file("") == ""
+
+
+def test_read_text_file_nonexistent(mock_dirs):
+ """Test reading nonexistent file"""
+ assert files.read_text_file("nonexistent.txt") == ""
+
+
+def test_read_text_file_success(mock_dirs):
+ """Test successful file reading"""
+ inputs_dir, _ = mock_dirs
+ content = "Test content\nMultiple lines"
+ (inputs_dir / "test.txt").write_text(content)
+
+ assert files.read_text_file("test.txt") == content
+
+
+def test_save_text_empty(mock_dirs):
+ """Test saving empty text"""
+ assert files.save_text("") is None
+ assert files.save_text(" ") is None
+
+
+def test_save_text_auto_filename(mock_dirs):
+ """Test saving text with auto-generated filename"""
+ inputs_dir, _ = mock_dirs
+
+ # First save
+ filename1 = files.save_text("content1")
+ assert filename1 == "input_1.txt"
+ assert (inputs_dir / filename1).read_text() == "content1"
+
+ # Second save
+ filename2 = files.save_text("content2")
+ assert filename2 == "input_2.txt"
+ assert (inputs_dir / filename2).read_text() == "content2"
+
+
+def test_save_text_custom_filename(mock_dirs):
+ """Test saving text with custom filename"""
+ inputs_dir, _ = mock_dirs
+
+ filename = files.save_text("content", "custom.txt")
+ assert filename == "custom.txt"
+ assert (inputs_dir / filename).read_text() == "content"
+
+
+def test_save_text_duplicate_filename(mock_dirs):
+ """Test saving text with duplicate filename"""
+ inputs_dir, _ = mock_dirs
+
+ # First save
+ filename1 = files.save_text("content1", "test.txt")
+ assert filename1 == "test.txt"
+
+ # Save with same filename
+ filename2 = files.save_text("content2", "test.txt")
+ assert filename2 == "test_1.txt"
+
+ assert (inputs_dir / "test.txt").read_text() == "content1"
+ assert (inputs_dir / "test_1.txt").read_text() == "content2"
+
+
+def test_delete_all_input_files(mock_dirs):
+ """Test deleting all input files"""
+ inputs_dir, _ = mock_dirs
+
+ # Create test files
+ (inputs_dir / "test1.txt").write_text("content1")
+ (inputs_dir / "test2.txt").write_text("content2")
+ (inputs_dir / "keep.pdf").write_text("should not be deleted")
+
+ assert files.delete_all_input_files() is True
+ remaining_files = list(inputs_dir.iterdir())
+ assert len(remaining_files) == 1
+ assert remaining_files[0].name == "keep.pdf"
+
+
+def test_delete_all_output_files(mock_dirs):
+ """Test deleting all output files"""
+ _, outputs_dir = mock_dirs
+
+ # Create test files
+ for fmt in AUDIO_FORMATS:
+ (outputs_dir / f"test.{fmt}").write_text("dummy content")
+ (outputs_dir / "keep.txt").write_text("should not be deleted")
+
+ assert files.delete_all_output_files() is True
+ remaining_files = list(outputs_dir.iterdir())
+ assert len(remaining_files) == 1
+ assert remaining_files[0].name == "keep.txt"
+
+
+def test_process_uploaded_file_empty_path(mock_dirs):
+ """Test processing empty file path"""
+ assert files.process_uploaded_file("") is False
+
+
+def test_process_uploaded_file_invalid_extension(mock_dirs, tmp_path):
+ """Test processing file with invalid extension"""
+ test_file = tmp_path / "test.pdf"
+ test_file.write_text("content")
+ assert files.process_uploaded_file(str(test_file)) is False
+
+
+def test_process_uploaded_file_success(mock_dirs, tmp_path):
+ """Test successful file upload processing"""
+ inputs_dir, _ = mock_dirs
+
+ # Create source file
+ source_file = tmp_path / "test.txt"
+ source_file.write_text("test content")
+
+ assert files.process_uploaded_file(str(source_file)) is True
+ assert (inputs_dir / "test.txt").read_text() == "test content"
+
+
+def test_process_uploaded_file_duplicate(mock_dirs, tmp_path):
+ """Test processing file with duplicate name"""
+ inputs_dir, _ = mock_dirs
+
+ # Create existing file
+ (inputs_dir / "test.txt").write_text("existing content")
+
+ # Create source file
+ source_file = tmp_path / "test.txt"
+ source_file.write_text("new content")
+
+ assert files.process_uploaded_file(str(source_file)) is True
+ assert (inputs_dir / "test.txt").read_text() == "existing content"
+ assert (inputs_dir / "test_1.txt").read_text() == "new content"
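Reviewer note: the `mock_dirs` fixture these tests depend on is defined earlier in this test module, outside this excerpt. As a sketch of what the assertions above assume it does — the body below is inferred from usage, not copied from the source — it points the `files` module at per-test temporary directories:

```python
import pytest

from ui.lib import files


@pytest.fixture
def mock_dirs(tmp_path, monkeypatch):
    """Point the files module at throwaway input/output dirs (assumed sketch)."""
    inputs_dir = tmp_path / "inputs"
    outputs_dir = tmp_path / "outputs"
    inputs_dir.mkdir()
    outputs_dir.mkdir()
    # files.py binds these constants at import time, so patch the module copies
    monkeypatch.setattr(files, "INPUTS_DIR", str(inputs_dir))
    monkeypatch.setattr(files, "OUTPUTS_DIR", str(outputs_dir))
    return inputs_dir, outputs_dir
```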
diff --git a/ui/depr_tests/test_handlers.py b/ui/depr_tests/test_handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a71b086f849e616f450a7834af5f00e253df92
--- /dev/null
+++ b/ui/depr_tests/test_handlers.py
@@ -0,0 +1,4 @@
+"""
+Drop all tests for now. The Gradio event system is too complex to test properly.
+We'll need to find a better way to test the UI functionality.
+"""
diff --git a/ui/depr_tests/test_input.py b/ui/depr_tests/test_input.py
new file mode 100644
index 0000000000000000000000000000000000000000..2919fd09c407b18c1f8ec4ab9b3e7c07ed441036
--- /dev/null
+++ b/ui/depr_tests/test_input.py
@@ -0,0 +1,75 @@
+import gradio as gr
+
+from ui.lib.components.input import create_input_column
+
+
+def test_create_input_column_structure():
+ """Test that create_input_column returns the expected structure"""
+ column, components = create_input_column()
+
+ # Test the return types
+ assert isinstance(column, gr.Column)
+ assert isinstance(components, dict)
+
+ # Test that all expected components are present
+ expected_components = {
+ "tabs",
+ "text_input",
+ "file_select",
+ "file_upload",
+ "file_preview",
+ "text_submit",
+ "file_submit",
+ "clear_files",
+ }
+ assert set(components.keys()) == expected_components
+
+ # Test component types
+ assert isinstance(components["tabs"], gr.Tabs)
+ assert isinstance(components["text_input"], gr.Textbox)
+ assert isinstance(components["file_select"], gr.Dropdown)
+ assert isinstance(components["file_upload"], gr.File)
+ assert isinstance(components["file_preview"], gr.Textbox)
+ assert isinstance(components["text_submit"], gr.Button)
+ assert isinstance(components["file_submit"], gr.Button)
+ assert isinstance(components["clear_files"], gr.Button)
+
+
+def test_text_input_configuration():
+ """Test the text input component configuration"""
+ _, components = create_input_column()
+ text_input = components["text_input"]
+
+ assert text_input.label == "Text to speak"
+ assert text_input.placeholder == "Enter text here..."
+ assert text_input.lines == 4
+
+
+def test_file_upload_configuration():
+ """Test the file upload component configuration"""
+ _, components = create_input_column()
+ file_upload = components["file_upload"]
+
+ assert file_upload.label == "Upload Text File (.txt)"
+ assert file_upload.file_types == [".txt"]
+
+
+def test_button_configurations():
+ """Test the button configurations"""
+ _, components = create_input_column()
+
+ # Test text submit button
+ assert components["text_submit"].value == "Generate Speech"
+ assert components["text_submit"].variant == "primary"
+ assert components["text_submit"].size == "lg"
+
+ # Test file submit button
+ assert components["file_submit"].value == "Generate Speech"
+ assert components["file_submit"].variant == "primary"
+ assert components["file_submit"].size == "lg"
+
+ # Test clear files button
+ assert components["clear_files"].value == "Clear Files"
+ assert components["clear_files"].variant == "secondary"
+ assert components["clear_files"].size == "lg"
diff --git a/ui/depr_tests/test_interface.py b/ui/depr_tests/test_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c49629806862862ecdae8587845d730a7c5e17
--- /dev/null
+++ b/ui/depr_tests/test_interface.py
@@ -0,0 +1,150 @@
+from unittest.mock import patch
+
+import gradio as gr
+import pytest
+
+from ui.lib.interface import create_interface
+
+
+@pytest.fixture
+def mock_timer():
+ """Create a mock timer with events property"""
+
+ class MockEvent:
+ def __init__(self, fn):
+ self.fn = fn
+
+ class MockTimer:
+ def __init__(self):
+ self._fn = None
+ self.value = 5
+
+ @property
+ def events(self):
+ return [MockEvent(self._fn)] if self._fn else []
+
+ def tick(self, fn, outputs):
+ self._fn = fn
+
+ return MockTimer()
+
+
+def test_create_interface_structure():
+ """Test the basic structure of the created interface"""
+ with patch("ui.lib.api.check_api_status", return_value=(False, [])):
+ demo = create_interface()
+
+ # Test interface type and theme
+ assert isinstance(demo, gr.Blocks)
+ assert demo.title == "Kokoro TTS Demo"
+ assert isinstance(demo.theme, gr.themes.Monochrome)
+
+
+def test_interface_html_links():
+ """Test that HTML links are properly configured"""
+ with patch("ui.lib.api.check_api_status", return_value=(False, [])):
+ demo = create_interface()
+
+ # Find HTML component
+ html_components = [
+ comp for comp in demo.blocks.values() if isinstance(comp, gr.HTML)
+ ]
+ assert len(html_components) > 0
+ html = html_components[0]
+
+ # Check for required links
+ assert 'href="https://huggingface.co/hexgrad/Kokoro-82M"' in html.value
+ assert 'href="https://github.com/remsky/Kokoro-FastAPI"' in html.value
+ assert "Kokoro-82M HF Repo" in html.value
+ assert "Kokoro-FastAPI Repo" in html.value
+
+
+def test_update_status_available(mock_timer):
+ """Test status update when service is available"""
+ voices = ["voice1", "voice2"]
+ with (
+ patch("ui.lib.api.check_api_status", return_value=(True, voices)),
+ patch("gradio.Timer", return_value=mock_timer),
+ ):
+ demo = create_interface()
+
+ # Get the update function
+ update_fn = mock_timer.events[0].fn
+
+ # Test update with available service
+ updates = update_fn()
+
+ assert "Available" in updates[0]["value"]
+ assert updates[1]["choices"] == voices
+ assert updates[1]["value"] == voices[0]
+ assert updates[2]["active"] is False # Timer should stop
+
+
+def test_update_status_unavailable(mock_timer):
+ """Test status update when service is unavailable"""
+ with (
+ patch("ui.lib.api.check_api_status", return_value=(False, [])),
+ patch("gradio.Timer", return_value=mock_timer),
+ ):
+ demo = create_interface()
+ update_fn = mock_timer.events[0].fn
+
+ updates = update_fn()
+
+ assert "Waiting for Service" in updates[0]["value"]
+ assert updates[1]["choices"] == []
+ assert updates[1]["value"] is None
+ assert updates[2]["active"] is True # Timer should continue
+
+
+def test_update_status_error(mock_timer):
+ """Test status update when an error occurs"""
+ with (
+ patch("ui.lib.api.check_api_status", side_effect=Exception("Test error")),
+ patch("gradio.Timer", return_value=mock_timer),
+ ):
+ demo = create_interface()
+ update_fn = mock_timer.events[0].fn
+
+ updates = update_fn()
+
+ assert "Connection Error" in updates[0]["value"]
+ assert updates[1]["choices"] == []
+ assert updates[1]["value"] is None
+ assert updates[2]["active"] is True # Timer should continue
+
+
+def test_timer_configuration(mock_timer):
+ """Test timer configuration"""
+ with (
+ patch("ui.lib.api.check_api_status", return_value=(False, [])),
+ patch("gradio.Timer", return_value=mock_timer),
+ ):
+ demo = create_interface()
+
+ assert mock_timer.value == 5 # Check interval is 5 seconds
+ assert len(mock_timer.events) == 1 # Should have one event handler
+
+
+def test_interface_components_presence():
+ """Test that all required components are present"""
+ with patch("ui.lib.api.check_api_status", return_value=(False, [])):
+ demo = create_interface()
+
+ # Check for main component sections
+ components = {
+ comp.label
+ for comp in demo.blocks.values()
+ if hasattr(comp, "label") and comp.label
+ }
+
+ required_components = {
+ "Text to speak",
+ "Voice(s)",
+ "Audio Format",
+ "Speed",
+ "Generated Speech",
+ "Previous Outputs",
+ }
+
+ assert required_components.issubset(components)
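Reviewer note: `MockTimer` stands in for `gr.Timer`, and the tests pin down a poll-until-ready contract: the tick function returns three updates (status HTML, voice dropdown, timer state) and deactivates the timer only once the service reports voices. A sketch of an update function that satisfies those assertions (`check_api_status` is assumed to come from `ui.lib.api`):

```python
import gradio as gr

from ui.lib.api import check_api_status


def update_status():
    """Poll the API and stop the timer once the service is up (sketch)."""
    try:
        is_available, voices = check_api_status()
    except Exception:
        return [
            gr.update(value="Connection Error"),
            gr.update(choices=[], value=None),
            gr.update(active=True),  # keep polling after an error
        ]
    if is_available and voices:
        return [
            gr.update(value="Available"),
            gr.update(choices=voices, value=voices[0]),
            gr.update(active=False),  # service is up; stop the timer
        ]
    return [
        gr.update(value="Waiting for Service"),
        gr.update(choices=[], value=None),
        gr.update(active=True),
    ]
```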
diff --git a/ui/lib/__init__.py b/ui/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ui/lib/api.py b/ui/lib/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bb8b87c689eea9e56d252466a96af1999adeb71
--- /dev/null
+++ b/ui/lib/api.py
@@ -0,0 +1,92 @@
+import datetime
+import os
+from typing import List, Optional, Tuple
+
+import requests
+
+from .config import API_URL, OUTPUTS_DIR
+
+
+def check_api_status() -> Tuple[bool, List[str]]:
+ """Check TTS service status and get available voices."""
+ try:
+ # Use a longer timeout during startup
+ response = requests.get(
+ f"{API_URL}/v1/audio/voices",
+ timeout=30, # Increased timeout for initial startup period
+ )
+ response.raise_for_status()
+ voices = response.json().get("voices", [])
+ if voices:
+ return True, voices
+ print("No voices found in response")
+ return False, []
+ except requests.exceptions.Timeout:
+ print("API request timed out (waiting for service startup)")
+ return False, []
+ except requests.exceptions.ConnectionError as e:
+ print(f"Connection error (service may be starting up): {str(e)}")
+ return False, []
+ except requests.exceptions.RequestException as e:
+ print(f"API request failed: {str(e)}")
+ return False, []
+ except Exception as e:
+ print(f"Unexpected error checking API status: {str(e)}")
+ return False, []
+
+
+def text_to_speech(
+ text: str, voice_id: str | list, format: str, speed: float
+) -> Optional[str]:
+ """Generate speech from text using TTS API."""
+ if not text.strip():
+ return None
+
+ # Handle multiple voices
+ voice_str = voice_id if isinstance(voice_id, str) else "+".join(voice_id)
+
+ # Create output filename
+ timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+ output_filename = f"output_{timestamp}_voice-{voice_str}_speed-{speed}.{format}"
+ output_path = os.path.join(OUTPUTS_DIR, output_filename)
+
+ try:
+ response = requests.post(
+ f"{API_URL}/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": voice_str,
+ "response_format": format,
+ "speed": float(speed),
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=300, # Longer timeout for speech generation
+ )
+ response.raise_for_status()
+
+ with open(output_path, "wb") as f:
+ f.write(response.content)
+ return output_path
+
+ except requests.exceptions.Timeout:
+ print("Speech generation request timed out")
+ return None
+ except requests.exceptions.RequestException as e:
+ print(f"Speech generation request failed: {str(e)}")
+ return None
+ except Exception as e:
+ print(f"Unexpected error generating speech: {str(e)}")
+ return None
+
+
+def get_status_html(is_available: bool) -> str:
+ """Generate HTML for status indicator."""
+ color = "green" if is_available else "red"
+ status = "Available" if is_available else "Unavailable"
+ return f"""
+
+
+ TTS Service: {status}
+
+ """
diff --git a/ui/lib/components/__init__.py b/ui/lib/components/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d66be38159640c7e43a491d193c8908495be56e
--- /dev/null
+++ b/ui/lib/components/__init__.py
@@ -0,0 +1,5 @@
+from .input import create_input_column
+from .model import create_model_column
+from .output import create_output_column
+
+__all__ = ["create_input_column", "create_model_column", "create_output_column"]
diff --git a/ui/lib/components/input.py b/ui/lib/components/input.py
new file mode 100644
index 0000000000000000000000000000000000000000..b830b5688851b851b495fcd67cba12328adc4075
--- /dev/null
+++ b/ui/lib/components/input.py
@@ -0,0 +1,84 @@
+from typing import Tuple
+
+import gradio as gr
+
+from .. import files
+
+
+def create_input_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]:
+ """Create the input column with text input and file handling."""
+ with gr.Column(scale=1) as col:
+ text_input = gr.Textbox(
+ label="Text to speak", placeholder="Enter text here...", lines=4
+ )
+
+ # Always show file upload but handle differently based on disable_local_saving
+ file_upload = gr.File(label="Upload Text File (.txt)", file_types=[".txt"])
+
+ if not disable_local_saving:
+ # Show full interface with tabs when saving is enabled
+ with gr.Tabs() as tabs:
+ # Set first tab as selected by default
+ tabs.selected = 0
+ # Direct Input Tab
+ with gr.TabItem("Direct Input"):
+ text_submit_direct = gr.Button(
+ "Generate Speech", variant="primary", size="lg"
+ )
+
+ # File Input Tab
+ with gr.TabItem("From File"):
+ # Existing files dropdown
+ input_files_list = gr.Dropdown(
+ label="Select Existing File",
+ choices=files.list_input_files(),
+ value=None,
+ )
+
+ file_preview = gr.Textbox(
+ label="File Content Preview", interactive=False, lines=4
+ )
+
+ with gr.Row():
+ file_submit = gr.Button(
+ "Generate Speech", variant="primary", size="lg"
+ )
+ clear_files = gr.Button(
+ "Clear Files", variant="secondary", size="lg"
+ )
+ else:
+ # Just show the generate button when saving is disabled
+ text_submit_direct = gr.Button(
+ "Generate Speech", variant="primary", size="lg"
+ )
+ tabs = None
+ input_files_list = None
+ file_preview = None
+ file_submit = None
+ clear_files = None
+
+ # Initialize components based on disable_local_saving
+ if disable_local_saving:
+ components = {
+ "tabs": None,
+ "text_input": text_input,
+ "text_submit": text_submit_direct,
+ "file_select": None,
+ "file_upload": file_upload, # Keep file upload even when saving is disabled
+ "file_preview": None,
+ "file_submit": None,
+ "clear_files": None,
+ }
+ else:
+ components = {
+ "tabs": tabs,
+ "text_input": text_input,
+ "text_submit": text_submit_direct,
+ "file_select": input_files_list,
+ "file_upload": file_upload,
+ "file_preview": file_preview,
+ "file_submit": file_submit,
+ "clear_files": clear_files,
+ }
+
+ return col, components
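Reviewer note: `create_input_column` returns the same component keys in both modes, substituting `None` when local saving is disabled, so callers can wire handlers uniformly with `is not None` guards. A minimal sketch of consuming it:

```python
import gradio as gr

from ui.lib.components.input import create_input_column

with gr.Blocks() as demo:
    col, inputs = create_input_column(disable_local_saving=True)
    # File-backed pieces are None placeholders in this mode; guard before wiring.
    assert inputs["tabs"] is None and inputs["file_select"] is None
    assert inputs["text_submit"] is not None and inputs["file_upload"] is not None
```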
diff --git a/ui/lib/components/model.py b/ui/lib/components/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3426bc837e2ed7f01db8ecec0d49223b20e85ba
--- /dev/null
+++ b/ui/lib/components/model.py
@@ -0,0 +1,42 @@
+from typing import Optional, Tuple
+
+import gradio as gr
+
+from .. import api, config
+
+
+def create_model_column(voice_ids: Optional[list] = None) -> Tuple[gr.Column, dict]:
+ """Create the model settings column."""
+ if voice_ids is None:
+ voice_ids = []
+
+ with gr.Column(scale=1) as col:
+ gr.Markdown("### Model Settings")
+
+ # Status button starts in waiting state
+ status_btn = gr.Button(
+ "⌛ TTS Service: Waiting for Service...", variant="secondary"
+ )
+
+ voice_input = gr.Dropdown(
+ choices=voice_ids,
+ label="Voice(s)",
+ value=voice_ids[0] if voice_ids else None,
+ interactive=True,
+ multiselect=True,
+ )
+ format_input = gr.Dropdown(
+ choices=config.AUDIO_FORMATS, label="Audio Format", value="mp3"
+ )
+ speed_input = gr.Slider(
+ minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"
+ )
+
+ components = {
+ "status_btn": status_btn,
+ "voice": voice_input,
+ "format": format_input,
+ "speed": speed_input,
+ }
+
+ return col, components
diff --git a/ui/lib/components/output.py b/ui/lib/components/output.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e7412cd2bcf4a0d480dce0731fb207e04c82724
--- /dev/null
+++ b/ui/lib/components/output.py
@@ -0,0 +1,55 @@
+from typing import Tuple
+
+import gradio as gr
+
+from .. import files
+
+
+def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]:
+ """Create the output column with audio player and file list."""
+ with gr.Column(scale=1) as col:
+ gr.Markdown("### Latest Output")
+ audio_output = gr.Audio(
+ label="Generated Speech",
+ type="filepath",
+ waveform_options={"waveform_color": "#4C87AB"},
+ )
+
+ # Create file-related components with visible=False when local saving is disabled
+ gr.Markdown("### Generated Files", visible=not disable_local_saving)
+ output_files = gr.Dropdown(
+ label="Previous Outputs",
+ choices=files.list_output_files() if not disable_local_saving else [],
+ value=None,
+ allow_custom_value=True,
+ visible=not disable_local_saving,
+ )
+
+ play_btn = gr.Button(
+ "▶️ Play Selected",
+ size="sm",
+ visible=not disable_local_saving,
+ )
+
+ selected_audio = gr.Audio(
+ label="Selected Output",
+ type="filepath",
+ visible=False, # Always initially hidden
+ )
+
+ clear_outputs = gr.Button(
+ "⚠️ Delete All Previously Generated Output Audio 🗑️",
+ size="sm",
+ variant="secondary",
+ visible=not disable_local_saving,
+ )
+
+ components = {
+ "audio_output": audio_output,
+ "output_files": output_files,
+ "play_btn": play_btn,
+ "selected_audio": selected_audio,
+ "clear_outputs": clear_outputs,
+ }
+
+ return col, components
diff --git a/ui/lib/config.py b/ui/lib/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e6cfe89025474fd533f046079ed181128fc1d4c
--- /dev/null
+++ b/ui/lib/config.py
@@ -0,0 +1,43 @@
+import os
+
+# API Configuration
+API_HOST = os.getenv("API_HOST", "kokoro-tts")
+API_PORT = os.getenv("API_PORT", "8880")
+API_URL = f"http://{API_HOST}:{API_PORT}"
+
+# File paths
+INPUTS_DIR = "app/ui/data/inputs"
+OUTPUTS_DIR = "app/ui/data/outputs"
+
+# Create directories if they don't exist
+os.makedirs(INPUTS_DIR, exist_ok=True)
+os.makedirs(OUTPUTS_DIR, exist_ok=True)
+
+# Audio formats
+AUDIO_FORMATS = ["mp3", "wav", "opus", "flac"]
+
+# UI Theme
+THEME = "monochrome"
+CSS = """
+.gradio-container {
+ max-width: 1000px;
+ margin: auto;
+}
+
+.banner-container {
+ background: transparent !important;
+ border: none !important;
+ box-shadow: none !important;
+ margin-bottom: 2rem;
+}
+
+.banner-container img {
+ width: 100%;
+ max-width: 600px;
+ border-radius: 10px;
+ margin: 20px auto;
+ display: block;
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+}
+"""
diff --git a/ui/lib/files.py b/ui/lib/files.py
new file mode 100644
index 0000000000000000000000000000000000000000..f79b88faa8e7bc992aebb392dc2eff33ea2408c0
--- /dev/null
+++ b/ui/lib/files.py
@@ -0,0 +1,124 @@
+import os
+import shutil
+from typing import List, Optional
+
+from .config import AUDIO_FORMATS, INPUTS_DIR, OUTPUTS_DIR
+
+
+def list_input_files() -> List[str]:
+ """List all input text files."""
+ return [f for f in os.listdir(INPUTS_DIR) if f.endswith(".txt")]
+
+
+def list_output_files() -> List[str]:
+ """List all output audio files, sorted by most recent first."""
+ files = [
+ os.path.join(OUTPUTS_DIR, f)
+ for f in os.listdir(OUTPUTS_DIR)
+ if any(f.endswith(ext) for ext in AUDIO_FORMATS)
+ ]
+ # Sort files by modification time, most recent first
+ return sorted(files, key=os.path.getmtime, reverse=True)
+
+
+def read_text_file(filename: str) -> str:
+ """Read content of a text file."""
+ if not filename:
+ return ""
+ try:
+ file_path = os.path.join(INPUTS_DIR, filename)
+ with open(file_path, "r", encoding="utf-8") as f:
+ return f.read()
+ except Exception:
+ # Treat missing or unreadable files as empty content
+ return ""
+
+
+def save_text(text: str, filename: Optional[str] = None) -> Optional[str]:
+ """Save text to a file. Returns the filename if successful."""
+ if not text.strip():
+ return None
+
+ if filename is None:
+ # Use input_1.txt, input_2.txt, etc.
+ base = "input"
+ counter = 1
+ while True:
+ filename = f"{base}_{counter}.txt"
+ if not os.path.exists(os.path.join(INPUTS_DIR, filename)):
+ break
+ counter += 1
+ else:
+ # Handle duplicate filenames by adding _1, _2, etc.
+ base = os.path.splitext(filename)[0]
+ ext = os.path.splitext(filename)[1] or ".txt"
+ counter = 1
+ while os.path.exists(os.path.join(INPUTS_DIR, filename)):
+ filename = f"{base}_{counter}{ext}"
+ counter += 1
+
+ filepath = os.path.join(INPUTS_DIR, filename)
+ try:
+ with open(filepath, "w", encoding="utf-8") as f:
+ f.write(text)
+ return filename
+ except Exception as e:
+ print(f"Error saving file: {e}")
+ return None
+
+
+def delete_all_input_files() -> bool:
+ """Delete all files from the inputs directory. Returns True if successful."""
+ try:
+ for filename in os.listdir(INPUTS_DIR):
+ if filename.endswith(".txt"):
+ file_path = os.path.join(INPUTS_DIR, filename)
+ os.remove(file_path)
+ return True
+ except Exception as e:
+ print(f"Error deleting input files: {e}")
+ return False
+
+
+def delete_all_output_files() -> bool:
+ """Delete all audio files from the outputs directory. Returns True if successful."""
+ try:
+ for filename in os.listdir(OUTPUTS_DIR):
+ if any(filename.endswith(ext) for ext in AUDIO_FORMATS):
+ file_path = os.path.join(OUTPUTS_DIR, filename)
+ os.remove(file_path)
+ return True
+ except Exception as e:
+ print(f"Error deleting output files: {e}")
+ return False
+
+
+def process_uploaded_file(file_path: str) -> bool:
+ """Save uploaded file to inputs directory. Returns True if successful."""
+ if not file_path:
+ return False
+
+ try:
+ filename = os.path.basename(file_path)
+ if not filename.endswith(".txt"):
+ return False
+
+ # Create target path in inputs directory
+ target_path = os.path.join(INPUTS_DIR, filename)
+
+ # If file exists, add number suffix
+ base, ext = os.path.splitext(filename)
+ counter = 1
+ while os.path.exists(target_path):
+ new_name = f"{base}_{counter}{ext}"
+ target_path = os.path.join(INPUTS_DIR, new_name)
+ counter += 1
+
+ # Copy file to inputs directory (shutil is imported at module top)
+ shutil.copy2(file_path, target_path)
+ return True
+
+ except Exception as e:
+ print(f"Error saving uploaded file: {e}")
+ return False
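Reviewer note: the naming rules here are exactly what the `test_save_text_*` tests earlier in this diff pin down. A quick round-trip sketch, assuming a fresh inputs directory:

```python
from ui.lib import files

first = files.save_text("hello")                   # -> "input_1.txt" (auto-numbered)
second = files.save_text("note body", "note.txt")  # -> "note.txt" (custom name)
third = files.save_text("again", "note.txt")       # -> "note_1.txt" (duplicate suffix)
print(first, second, third)
```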
diff --git a/ui/lib/handlers.py b/ui/lib/handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..224f650967e13de63e8773671772c074cbd1eff7
--- /dev/null
+++ b/ui/lib/handlers.py
@@ -0,0 +1,271 @@
+import os
+import shutil
+
+import gradio as gr
+
+from . import api, files
+
+
+def setup_event_handlers(components: dict, disable_local_saving: bool = False):
+ """Set up all event handlers for the UI components."""
+
+ def refresh_status():
+ try:
+ is_available, voices = api.check_api_status()
+ status = "Available" if is_available else "Waiting for Service..."
+
+ if is_available and voices:
+ # Preserve current voice selection if it exists and is still valid
+ current_voice = components["model"]["voice"].value
+ default_voice = current_voice if current_voice in voices else voices[0]
+ return [
+ gr.update(
+ value=f"🔄 TTS Service: {status}",
+ interactive=True,
+ variant="secondary",
+ ),
+ gr.update(choices=voices, value=default_voice),
+ ]
+ return [
+ gr.update(
+ value=f"⌛ TTS Service: {status}",
+ interactive=True,
+ variant="secondary",
+ ),
+ gr.update(choices=[], value=None),
+ ]
+ except Exception as e:
+ print(f"Error in refresh status: {str(e)}")
+ return [
+ gr.update(
+ value="❌ TTS Service: Connection Error",
+ interactive=True,
+ variant="secondary",
+ ),
+ gr.update(choices=[], value=None),
+ ]
+
+ def handle_file_select(filename):
+ if filename:
+ try:
+ text = files.read_text_file(filename)
+ if text:
+ preview = text[:200] + "..." if len(text) > 200 else text
+ return gr.update(value=preview)
+ except Exception as e:
+ print(f"Error reading file: {e}")
+ return gr.update(value="")
+
+ def handle_file_upload(file):
+ if file is None:
+ return (
+ ""
+ if disable_local_saving
+ else [gr.update(choices=files.list_input_files())]
+ )
+
+ try:
+ # Read the file content
+ with open(file.name, "r", encoding="utf-8") as f:
+ text_content = f.read()
+
+ if disable_local_saving:
+ # When saving is disabled, put content directly in text input
+ # Normalize whitespace by replacing newlines with spaces
+ normalized_text = " ".join(text_content.split())
+ return normalized_text
+ else:
+ # When saving is enabled, save file and update dropdown
+ filename = os.path.basename(file.name)
+ target_path = os.path.join(files.INPUTS_DIR, filename)
+
+ # Handle duplicate filenames
+ base, ext = os.path.splitext(filename)
+ counter = 1
+ while os.path.exists(target_path):
+ new_name = f"{base}_{counter}{ext}"
+ target_path = os.path.join(files.INPUTS_DIR, new_name)
+ counter += 1
+
+ shutil.copy2(file.name, target_path)
+ return [gr.update(choices=files.list_input_files())]
+
+ except Exception as e:
+ print(f"Error handling file: {e}")
+ return (
+ ""
+ if disable_local_saving
+ else [gr.update(choices=files.list_input_files())]
+ )
+
+ def generate_from_text(text, voice, format, speed):
+ """Generate speech from direct text input"""
+ is_available, _ = api.check_api_status()
+ if not is_available:
+ gr.Warning("TTS Service is currently unavailable")
+ return [None, gr.update(choices=files.list_output_files())]
+
+ if not text or not text.strip():
+ gr.Warning("Please enter text in the input box")
+ return [None, gr.update(choices=files.list_output_files())]
+
+ # Only save text if local saving is enabled
+ if not disable_local_saving:
+ files.save_text(text)
+
+ result = api.text_to_speech(text, voice, format, speed)
+ if result is None:
+ gr.Warning("Failed to generate speech. Please try again.")
+ return [None, gr.update(choices=files.list_output_files())]
+
+ return [
+ result,
+ gr.update(
+ choices=files.list_output_files(), value=os.path.basename(result)
+ ),
+ ]
+
+ def generate_from_file(selected_file, voice, format, speed):
+ """Generate speech from selected file"""
+ is_available, _ = api.check_api_status()
+ if not is_available:
+ gr.Warning("TTS Service is currently unavailable")
+ return [None, gr.update(choices=files.list_output_files())]
+
+ if not selected_file:
+ gr.Warning("Please select a file")
+ return [None, gr.update(choices=files.list_output_files())]
+
+ text = files.read_text_file(selected_file)
+ result = api.text_to_speech(text, voice, format, speed)
+ if result is None:
+ gr.Warning("Failed to generate speech. Please try again.")
+ return [None, gr.update(choices=files.list_output_files())]
+
+ return [
+ result,
+ gr.update(
+ choices=files.list_output_files(), value=os.path.basename(result)
+ ),
+ ]
+
+ def play_selected(file_path):
+ if file_path and os.path.exists(file_path):
+ return gr.update(value=file_path, visible=True)
+ return gr.update(visible=False)
+
+ def clear_files(voice, format, speed):
+ """Delete all input files and clear UI components while preserving model settings"""
+ files.delete_all_input_files()
+ return [
+ gr.update(value=None, choices=[]), # file_select
+ None, # file_upload
+ gr.update(value=""), # file_preview
+ None, # audio_output
+ gr.update(choices=files.list_output_files()), # output_files
+ gr.update(value=voice), # voice
+ gr.update(value=format), # format
+ gr.update(value=speed), # speed
+ ]
+
+ def clear_outputs():
+ """Delete all output audio files and clear audio components"""
+ files.delete_all_output_files()
+ return [
+ None, # audio_output
+ gr.update(choices=[], value=None), # output_files
+ gr.update(visible=False), # selected_audio
+ ]
+
+ # Connect event handlers
+ components["model"]["status_btn"].click(
+ fn=refresh_status,
+ outputs=[components["model"]["status_btn"], components["model"]["voice"]],
+ )
+
+ # Connect text submit button (always present)
+ components["input"]["text_submit"].click(
+ fn=generate_from_text,
+ inputs=[
+ components["input"]["text_input"],
+ components["model"]["voice"],
+ components["model"]["format"],
+ components["model"]["speed"],
+ ],
+ outputs=[
+ components["output"]["audio_output"],
+ components["output"]["output_files"],
+ ],
+ )
+
+ # Only connect file-related handlers if components exist
+ if components["input"]["file_select"] is not None:
+ components["input"]["file_select"].change(
+ fn=handle_file_select,
+ inputs=[components["input"]["file_select"]],
+ outputs=[components["input"]["file_preview"]],
+ )
+
+ if components["input"]["file_upload"] is not None:
+ # File upload handler - output depends on disable_local_saving
+ components["input"]["file_upload"].upload(
+ fn=handle_file_upload,
+ inputs=[components["input"]["file_upload"]],
+ outputs=[
+ components["input"]["text_input"]
+ if disable_local_saving
+ else components["input"]["file_select"]
+ ],
+ )
+
+ if components["output"]["play_btn"] is not None:
+ components["output"]["play_btn"].click(
+ fn=play_selected,
+ inputs=[components["output"]["output_files"]],
+ outputs=[components["output"]["selected_audio"]],
+ )
+
+ if components["input"]["clear_files"] is not None:
+ components["input"]["clear_files"].click(
+ fn=clear_files,
+ inputs=[
+ components["model"]["voice"],
+ components["model"]["format"],
+ components["model"]["speed"],
+ ],
+ outputs=[
+ components["input"]["file_select"],
+ components["input"]["file_upload"],
+ components["input"]["file_preview"],
+ components["output"]["audio_output"],
+ components["output"]["output_files"],
+ components["model"]["voice"],
+ components["model"]["format"],
+ components["model"]["speed"],
+ ],
+ )
+
+ if components["output"]["clear_outputs"] is not None:
+ components["output"]["clear_outputs"].click(
+ fn=clear_outputs,
+ outputs=[
+ components["output"]["audio_output"],
+ components["output"]["output_files"],
+ components["output"]["selected_audio"],
+ ],
+ )
+
+ if components["input"]["file_submit"] is not None:
+ components["input"]["file_submit"].click(
+ fn=generate_from_file,
+ inputs=[
+ components["input"]["file_select"],
+ components["model"]["voice"],
+ components["model"]["format"],
+ components["model"]["speed"],
+ ],
+ outputs=[
+ components["output"]["audio_output"],
+ components["output"]["output_files"],
+ ],
+ )
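Reviewer note: `setup_event_handlers` expects a dict nested by section ("input", "model", "output"), which is exactly what the three `create_*_column` helpers produce. A minimal assembly sketch (the real wiring lives in `ui/lib/interface.py` below):

```python
import gradio as gr

from ui.lib.components import (
    create_input_column,
    create_model_column,
    create_output_column,
)
from ui.lib.handlers import setup_event_handlers

with gr.Blocks() as demo:
    with gr.Row():
        _, input_components = create_input_column()
        _, model_components = create_model_column()
        _, output_components = create_output_column()
    setup_event_handlers(
        {
            "input": input_components,
            "model": model_components,
            "output": output_components,
        }
    )
```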
diff --git a/ui/lib/interface.py b/ui/lib/interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..b35bee8eac1b89c71a0fc0c508c36eb408e24ef2
--- /dev/null
+++ b/ui/lib/interface.py
@@ -0,0 +1,102 @@
+import os
+
+import gradio as gr
+
+from . import api
+from .components import create_input_column, create_model_column, create_output_column
+from .handlers import setup_event_handlers
+
+
+def create_interface():
+ """Create the main Gradio interface."""
+ # Skip initial status check - let the timer handle it
+ is_available, available_voices = False, []
+
+ # Check if local saving is disabled
+ disable_local_saving = os.getenv("DISABLE_LOCAL_SAVING", "false").lower() == "true"
+
+ with gr.Blocks(title="Kokoro TTS Demo", theme=gr.themes.Monochrome()) as demo:
+ gr.HTML(
+ value='