Michael Hu committed
Commit ac5de5b · 1 Parent(s): 2493d3b

initial check in of the dia tts server
Files changed (20)
  1. .env +35 -0
  2. Dockerfile +36 -0
  3. README.md +1 -0
  4. config.py +295 -0
  5. dia/__init__.py +0 -0
  6. dia/audio.py +280 -0
  7. dia/config.py +206 -0
  8. dia/layers.py +903 -0
  9. dia/model.py +956 -0
  10. docker-compose.yml +23 -0
  11. documentation.md +549 -0
  12. download_model.py +41 -0
  13. engine.py +356 -0
  14. models.py +97 -0
  15. requirements.txt +22 -0
  16. server.py +1061 -0
  17. ui/index.html +916 -0
  18. ui/presets.yaml +57 -0
  19. ui/script.js +593 -0
  20. utils.py +146 -0
.env ADDED
@@ -0,0 +1,35 @@
+ # .env - Configuration for Dia TTS Server
+ # Values in this file override the defaults set in config.py
+
+ # --- Server Settings ---
+ HOST='0.0.0.0'
+ PORT='8003'
+
+ # --- Path Settings ---
+ # Defaults are usually fine unless you want custom locations.
+ DIA_MODEL_CACHE_PATH='./model_cache'
+ REFERENCE_AUDIO_PATH='./reference_audio'
+ OUTPUT_PATH='./outputs'
+
+ # --- Model Source Settings ---
+ # Defaulting to BF16 safetensors. Uncomment and modify lines below to use other models.
+ DIA_MODEL_REPO_ID='ttj/dia-1.6b-safetensors'
+ DIA_MODEL_CONFIG_FILENAME='config.json'
+ DIA_MODEL_WEIGHTS_FILENAME='dia-v0_1_bf16.safetensors'
+
+ # Example: Use full precision safetensors
+ # DIA_MODEL_REPO_ID=ttj/dia-1.6b-safetensors
+ # DIA_MODEL_WEIGHTS_FILENAME=dia-v0_1.safetensors
+
+ # Example: Use original Nari Labs .pth model
+ # DIA_MODEL_REPO_ID=nari-labs/Dia-1.6B
+ # DIA_MODEL_WEIGHTS_FILENAME=dia-v0_1.pth
+
+ # --- Default Generation Parameters ---
+ # These set the initial values loaded in the UI.
+ # They can be changed in the UI and saved back here using the 'Save Generation Defaults' button.
+ GEN_DEFAULT_SPEED_FACTOR='0.9'
+ GEN_DEFAULT_CFG_SCALE='3'
+ GEN_DEFAULT_TEMPERATURE='1.3'
+ GEN_DEFAULT_TOP_P='0.95'
+ GEN_DEFAULT_CFG_FILTER_TOP_K='35'
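A note on precedence: config.py (shown below in this commit) loads this file with override=True, so values in .env win over variables already set in the shell. A minimal sketch, assuming python-dotenv is installed and this .env sits in the working directory:

import os
from dotenv import load_dotenv

os.environ["PORT"] = "9999"          # value inherited from the shell
load_dotenv(".env", override=True)   # mirrors config.py's reload(); .env wins
print(os.environ["PORT"])            # prints '8003', the value from .env above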
Dockerfile ADDED
@@ -0,0 +1,36 @@
+ FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     libsndfile1 \
+     ffmpeg \
+     python3 \
+     python3-pip \
+     python3-dev \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set up working directory
+ WORKDIR /app
+
+ # Install Python dependencies
+ COPY requirements.txt .
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create required directories
+ RUN mkdir -p model_cache reference_audio outputs
+
+ # Expose the port the application will run on (default to 8003 as per config)
+ EXPOSE 8003
+
+ # Command to run the application
+ CMD ["python3", "server.py"]
README.md CHANGED
@@ -5,6 +5,7 @@ colorFrom: indigo
  colorTo: indigo
  sdk: docker
  pinned: false
+ app_port: 8003
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
config.py ADDED
@@ -0,0 +1,295 @@
+ # config.py
+ # Configuration management for Dia TTS server
+
+ import os
+ import logging
+ from dotenv import load_dotenv, find_dotenv, set_key
+ from typing import Dict, Any, Optional
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ # Default configuration values (used if not found in .env or environment)
+ DEFAULT_CONFIG = {
+     # Server Settings
+     "HOST": "0.0.0.0",
+     "PORT": "8003",
+     # Model Source Settings
+     "DIA_MODEL_REPO_ID": "ttj/dia-1.6b-safetensors",  # Default to safetensors repo
+     "DIA_MODEL_CONFIG_FILENAME": "config.json",  # Standard config filename
+     "DIA_MODEL_WEIGHTS_FILENAME": "dia-v0_1_bf16.safetensors",  # Default to BF16 weights
+     # Path Settings
+     "DIA_MODEL_CACHE_PATH": "./model_cache",
+     "REFERENCE_AUDIO_PATH": "./reference_audio",
+     "OUTPUT_PATH": "./outputs",
+     # Default Generation Parameters (can be overridden by user in UI/API)
+     # These are saved to .env via the UI's "Save Generation Defaults" button
+     "GEN_DEFAULT_SPEED_FACTOR": "0.90",  # Default speed slightly slower
+     "GEN_DEFAULT_CFG_SCALE": "3.0",
+     "GEN_DEFAULT_TEMPERATURE": "1.3",
+     "GEN_DEFAULT_TOP_P": "0.95",
+     "GEN_DEFAULT_CFG_FILTER_TOP_K": "35",
+ }
+
+
+ class ConfigManager:
+     """Manages configuration for the TTS server with .env file support."""
+
+     def __init__(self):
+         """Initialize the configuration manager."""
+         self.config = {}
+         self.env_file = find_dotenv()
+
+         if not self.env_file:
+             self.env_file = os.path.join(os.getcwd(), ".env")
+             logger.info(
+                 f"No .env file found, creating one with defaults at {self.env_file}"
+             )
+             self._create_default_env_file()
+         else:
+             logger.info(f"Loading configuration from: {self.env_file}")
+
+         self.reload()
+
+     def _create_default_env_file(self):
+         """Create a default .env file with default values."""
+         try:
+             with open(self.env_file, "w") as f:
+                 for key, value in DEFAULT_CONFIG.items():
+                     f.write(f"{key}={value}\n")
+             logger.info("Created default .env file")
+         except Exception as e:
+             logger.error(f"Failed to create default .env file: {e}")
+
+     def reload(self):
+         """Reload configuration from .env file and environment variables."""
+         load_dotenv(self.env_file, override=True)
+         loaded_config = {}
+         for key, default_value in DEFAULT_CONFIG.items():
+             loaded_config[key] = os.environ.get(key, default_value)
+         self.config = loaded_config
+         logger.info("Configuration loaded/reloaded.")
+         logger.debug(f"Current config: {self.config}")
+         return self.config
+
+     def get(self, key: str, default: Any = None) -> Any:
+         """Get a configuration value by key."""
+         return self.config.get(key, default)
+
+     def set(self, key: str, value: Any) -> None:
+         """Set a configuration value in memory (does not save automatically)."""
+         self.config[key] = value
+         logger.debug(f"Configuration value set in memory: {key}={value}")
+
+     def save(self) -> bool:
+         """Save the current in-memory configuration to the .env file."""
+         if not self.env_file:
+             logger.error("Cannot save configuration, .env file path not set.")
+             return False
+         try:
+             for key in DEFAULT_CONFIG.keys():
+                 if key not in self.config:
+                     logger.warning(
+                         f"Key '{key}' missing from current config, adding default value before saving."
+                     )
+                     self.config[key] = DEFAULT_CONFIG[key]
+             for key, value in self.config.items():
+                 if key in DEFAULT_CONFIG:
+                     set_key(self.env_file, key, str(value))
+             logger.info(f"Configuration saved to {self.env_file}")
+             return True
+         except Exception as e:
+             logger.error(
+                 f"Failed to save configuration to {self.env_file}: {e}", exc_info=True
+             )
+             return False
+
+     def get_all(self) -> Dict[str, Any]:
+         """Get all current configuration values."""
+         return self.config.copy()
+
+     def update(self, new_config: Dict[str, Any]) -> None:
+         """Update multiple configuration values in memory from a dictionary."""
+         updated_keys = []
+         for key, value in new_config.items():
+             if key in DEFAULT_CONFIG:
+                 self.config[key] = value
+                 updated_keys.append(key)
+             else:
+                 logger.warning(
+                     f"Attempted to update unknown config key: {key}. Ignoring."
+                 )
+         if updated_keys:
+             logger.debug(
+                 f"Configuration values updated in memory for keys: {updated_keys}"
+             )
+
+     def get_int(self, key: str, default: Optional[int] = None) -> int:
+         """Get a configuration value as an integer, with error handling."""
+         value_str = self.get(key)  # Get value which might be from env (str) or default
+         if value_str is None:  # Key not found at all
+             if default is not None:
+                 logger.warning(
+                     f"Config key '{key}' not found, using provided default: {default}"
+                 )
+                 return default
+             else:
+                 logger.error(
+                     f"Mandatory config key '{key}' not found and no default provided. Returning 0."
+                 )
+                 return 0  # Or raise error
+
+         try:
+             return int(value_str)
+         except (ValueError, TypeError):
+             logger.warning(
+                 f"Invalid integer value '{value_str}' for config key '{key}', using default: {default}"
+             )
+             if isinstance(default, int):
+                 return default
+             elif default is None:
+                 logger.error(
+                     f"Cannot parse '{value_str}' as int for key '{key}' and no valid default. Returning 0."
+                 )
+                 return 0
+             else:  # Default was provided but not an int
+                 logger.error(
+                     f"Invalid default value type for key '{key}'. Cannot parse '{value_str}'. Returning 0."
+                 )
+                 return 0
+
+     def get_float(self, key: str, default: Optional[float] = None) -> float:
+         """Get a configuration value as a float, with error handling."""
+         value_str = self.get(key)
+         if value_str is None:
+             if default is not None:
+                 logger.warning(
+                     f"Config key '{key}' not found, using provided default: {default}"
+                 )
+                 return default
+             else:
+                 logger.error(
+                     f"Mandatory config key '{key}' not found and no default provided. Returning 0.0."
+                 )
+                 return 0.0
+
+         try:
+             return float(value_str)
+         except (ValueError, TypeError):
+             logger.warning(
+                 f"Invalid float value '{value_str}' for config key '{key}', using default: {default}"
+             )
+             if isinstance(default, float):
+                 return default
+             elif default is None:
+                 logger.error(
+                     f"Cannot parse '{value_str}' as float for key '{key}' and no valid default. Returning 0.0."
+                 )
+                 return 0.0
+             else:
+                 logger.error(
+                     f"Invalid default value type for key '{key}'. Cannot parse '{value_str}'. Returning 0.0."
+                 )
+                 return 0.0
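A short usage sketch for the typed getters above (hypothetical session, not part of the commit), showing the fallback behavior when a stored value cannot be parsed:

cm = ConfigManager()
cm.set("PORT", "not-a-number")   # simulate a malformed .env entry
cm.get_int("PORT", 8003)         # logs a warning, returns the fallback 8003
cm.get_int("PORT")               # no fallback given: logs an error, returns 0
cm.set("PORT", "8003")
cm.save()                        # writes only known DEFAULT_CONFIG keys back to .env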
+ # --- Create a singleton instance for global access ---
+ config_manager = ConfigManager()
+
+
+ # --- Export common getters for easy access ---
+
+
+ # Server Settings
+ def get_host() -> str:
+     """Gets the host address for the server."""
+     return config_manager.get("HOST", DEFAULT_CONFIG["HOST"])
+
+
+ def get_port() -> int:
+     """Gets the port number for the server."""
+     # Ensure default is parsed correctly if get_int fails on env var
+     return config_manager.get_int("PORT", int(DEFAULT_CONFIG["PORT"]))
+
+
+ # Model Source Settings
+ def get_model_repo_id() -> str:
+     """Gets the Hugging Face repository ID for the model."""
+     return config_manager.get("DIA_MODEL_REPO_ID", DEFAULT_CONFIG["DIA_MODEL_REPO_ID"])
+
+
+ def get_model_config_filename() -> str:
+     """Gets the filename for the model's configuration file within the repo."""
+     return config_manager.get(
+         "DIA_MODEL_CONFIG_FILENAME", DEFAULT_CONFIG["DIA_MODEL_CONFIG_FILENAME"]
+     )
+
+
+ def get_model_weights_filename() -> str:
+     """Gets the filename for the model's weights file within the repo."""
+     return config_manager.get(
+         "DIA_MODEL_WEIGHTS_FILENAME", DEFAULT_CONFIG["DIA_MODEL_WEIGHTS_FILENAME"]
+     )
+
+
+ # Path Settings
+ def get_model_cache_path() -> str:
+     """Gets the local directory path for caching downloaded models."""
+     return os.path.abspath(
+         config_manager.get(
+             "DIA_MODEL_CACHE_PATH", DEFAULT_CONFIG["DIA_MODEL_CACHE_PATH"]
+         )
+     )
+
+
+ def get_reference_audio_path() -> str:
+     """Gets the local directory path for storing reference audio files for cloning."""
+     return os.path.abspath(
+         config_manager.get(
+             "REFERENCE_AUDIO_PATH", DEFAULT_CONFIG["REFERENCE_AUDIO_PATH"]
+         )
+     )
+
+
+ def get_output_path() -> str:
+     """Gets the local directory path for saving generated audio outputs."""
+     return os.path.abspath(
+         config_manager.get("OUTPUT_PATH", DEFAULT_CONFIG["OUTPUT_PATH"])
+     )
+
+
+ # Default Generation Parameter Getters
+ def get_gen_default_speed_factor() -> float:
+     """Gets the default speed factor for generation."""
+     return config_manager.get_float(
+         "GEN_DEFAULT_SPEED_FACTOR", float(DEFAULT_CONFIG["GEN_DEFAULT_SPEED_FACTOR"])
+     )
+
+
+ def get_gen_default_cfg_scale() -> float:
+     """Gets the default CFG scale for generation."""
+     return config_manager.get_float(
+         "GEN_DEFAULT_CFG_SCALE", float(DEFAULT_CONFIG["GEN_DEFAULT_CFG_SCALE"])
+     )
+
+
+ def get_gen_default_temperature() -> float:
+     """Gets the default temperature for generation."""
+     return config_manager.get_float(
+         "GEN_DEFAULT_TEMPERATURE", float(DEFAULT_CONFIG["GEN_DEFAULT_TEMPERATURE"])
+     )
+
+
+ def get_gen_default_top_p() -> float:
+     """Gets the default top_p for generation."""
+     return config_manager.get_float(
+         "GEN_DEFAULT_TOP_P", float(DEFAULT_CONFIG["GEN_DEFAULT_TOP_P"])
+     )
+
+
+ def get_gen_default_cfg_filter_top_k() -> int:
+     """Gets the default CFG filter top_k for generation."""
+     return config_manager.get_int(
+         "GEN_DEFAULT_CFG_FILTER_TOP_K",
+         int(DEFAULT_CONFIG["GEN_DEFAULT_CFG_FILTER_TOP_K"]),
+     )
+ 
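For orientation, the rest of the server presumably consumes these module-level getters rather than the singleton directly; a sketch (the real call sites live in server.py and engine.py, whose contents are not displayed in this excerpt):

from config import get_host, get_port, get_model_repo_id, get_gen_default_temperature

print(f"Serving on {get_host()}:{get_port()}")   # '0.0.0.0:8003' with the defaults
print(get_model_repo_id())                       # 'ttj/dia-1.6b-safetensors'
print(get_gen_default_temperature())             # 1.3, parsed from the string default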
dia/__init__.py ADDED
File without changes
dia/audio.py ADDED
@@ -0,0 +1,280 @@
+ import typing as tp
+
+ import torch
+
+ from .config import DataConfig
+
+
+ def build_delay_indices(B: int, T: int, C: int, delay_pattern: tp.List[int]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Precompute (t_idx_BxTxC, indices_BTCx3) so that out[t, c] = in[t - delay[c], c].
+     Negative t_idx => BOS; t_idx >= T => PAD.
+     """
+     delay_arr = torch.tensor(delay_pattern, dtype=torch.int32)
+
+     t_idx_BxT = torch.broadcast_to(
+         torch.arange(T, dtype=torch.int32)[None, :],
+         [B, T],
+     )
+     t_idx_BxTx1 = t_idx_BxT[..., None]
+     t_idx_BxTxC = t_idx_BxTx1 - delay_arr.view(1, 1, C)
+
+     b_idx_BxTxC = torch.broadcast_to(
+         torch.arange(B, dtype=torch.int32).view(B, 1, 1),
+         [B, T, C],
+     )
+     c_idx_BxTxC = torch.broadcast_to(
+         torch.arange(C, dtype=torch.int32).view(1, 1, C),
+         [B, T, C],
+     )
+
+     # We must clamp time indices to [0..T-1] so gather_nd equivalent won't fail
+     t_clamped_BxTxC = torch.clamp(t_idx_BxTxC, 0, T - 1)
+
+     indices_BTCx3 = torch.stack(
+         [
+             b_idx_BxTxC.reshape(-1),
+             t_clamped_BxTxC.reshape(-1),
+             c_idx_BxTxC.reshape(-1),
+         ],
+         dim=1,
+     ).long()  # Ensure indices are long type for indexing
+
+     return t_idx_BxTxC, indices_BTCx3
+
+
+ def apply_audio_delay(
+     audio_BxTxC: torch.Tensor,
+     pad_value: int,
+     bos_value: int,
+     precomp: tp.Tuple[torch.Tensor, torch.Tensor],
+ ) -> torch.Tensor:
+     """
+     Applies the delay pattern to batched audio tokens using precomputed indices,
+     inserting BOS where t_idx < 0 and PAD where t_idx >= T.
+
+     Args:
+         audio_BxTxC: [B, T, C] int16 audio tokens (or int32/float)
+         pad_value: the padding token
+         bos_value: the BOS token
+         precomp: (t_idx_BxTxC, indices_BTCx3) from build_delay_indices
+
+     Returns:
+         result_BxTxC: [B, T, C] delayed audio tokens
+     """
+     device = audio_BxTxC.device  # Get device from input tensor
+     t_idx_BxTxC, indices_BTCx3 = precomp
+     t_idx_BxTxC = t_idx_BxTxC.to(device)  # Move precomputed indices to device
+     indices_BTCx3 = indices_BTCx3.to(device)
+
+     # Equivalent of tf.gather_nd using advanced indexing
+     # Ensure indices are long type if not already (build_delay_indices should handle this)
+     gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
+     gathered_BxTxC = gathered_flat.view(audio_BxTxC.shape)
+
+     # Create masks on the correct device
+     mask_bos = t_idx_BxTxC < 0  # => place bos_value
+     mask_pad = t_idx_BxTxC >= audio_BxTxC.shape[1]  # => place pad_value
+
+     # Create scalar tensors on the correct device
+     bos_tensor = torch.tensor(bos_value, dtype=audio_BxTxC.dtype, device=device)
+     pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
+
+     # If mask_bos, BOS; else if mask_pad, PAD; else original gather
+     # All tensors should now be on the same device
+     result_BxTxC = torch.where(mask_bos, bos_tensor, torch.where(mask_pad, pad_tensor, gathered_BxTxC))
+
+     return result_BxTxC
+
+
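To make the delay pattern concrete, a small sketch on toy data (assuming the two functions above are imported from dia.audio; the real BOS/PAD values 1026/1025 come from DataConfig):

import torch
from dia.audio import build_delay_indices, apply_audio_delay

codes = torch.arange(6).view(1, 3, 2)   # B=1, T=3, C=2
precomp = build_delay_indices(B=1, T=3, C=2, delay_pattern=[0, 1])
delayed = apply_audio_delay(codes, pad_value=1025, bos_value=1026, precomp=precomp)
# Channel 0 (delay 0) is unchanged: [0, 2, 4].
# Channel 1 (delay 1) becomes [1026, 1, 3]: BOS first, then its tokens one step late.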
+ @torch.no_grad()
+ @torch.inference_mode()
+ def audio_to_codebook(
+     model,
+     input_values,
+     data_config: DataConfig,
+     padding_mask=None,
+     sample_rate=44100,
+ ):
+     """
+     Encodes the input audio waveform into discrete codes.
+
+     Args:
+         model: The model to use for encoding.
+         input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+             Float values of the input audio waveform.
+         padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+             Padding mask used to pad the `input_values`.
+         sample_rate (`int`, *optional*):
+             Signal sampling rate.
+
+     Returns:
+         A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
+         factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
+         `codebook` of shape `[batch_size, num_codebooks, frames]`.
+         Scale is not used here.
+     """
+     audio_data = model.preprocess(input_values, sample_rate)
+
+     if padding_mask is None:
+         padding_mask = torch.ones_like(input_values).bool()
+
+     _, encoded_frame, _, _, _ = model.encode(audio_data, n_quantizers=None)  # 1, C, T
+     seq_length = encoded_frame.shape[2]
+
+     t_idx_BxTxC, indices_BTCx3 = build_delay_indices(
+         B=1,
+         T=seq_length,
+         C=data_config.channels,
+         delay_pattern=data_config.delay_pattern,
+     )
+
+     encoded_frame = apply_audio_delay(
+         audio_BxTxC=encoded_frame.transpose(1, 2),  # 1, T, C
+         pad_value=data_config.audio_pad_value,
+         bos_value=data_config.audio_bos_value,
+         precomp=(t_idx_BxTxC, indices_BTCx3),
+     )
+
+     return encoded_frame
+
+
+ def build_revert_indices(B: int, T: int, C: int, delay_pattern: tp.List[int]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Precompute indices for the revert operation using PyTorch.
+
+     Returns:
+         A tuple (t_idx_BxTxC, indices_BTCx3) where:
+             - t_idx_BxTxC is a tensor of shape [B, T, C] computed as time indices plus the delay.
+             - indices_BTCx3 is a tensor of shape [B*T*C, 3] used for gathering, computed from:
+               batch indices, clamped time indices, and channel indices.
+     """
+     # Use default device unless specified otherwise; assumes inputs might define device later
+     device = None  # Or determine dynamically if needed, e.g., from a model parameter
+
+     delay_arr = torch.tensor(delay_pattern, dtype=torch.int32, device=device)
+
+     t_idx_BT1 = torch.broadcast_to(torch.arange(T, device=device).unsqueeze(0), [B, T])
+     t_idx_BT1 = t_idx_BT1.unsqueeze(-1)
+
+     t_idx_BxTxC = torch.minimum(
+         t_idx_BT1 + delay_arr.view(1, 1, C),
+         torch.tensor(T - 1, device=device),
+     )
+     b_idx_BxTxC = torch.broadcast_to(torch.arange(B, device=device).view(B, 1, 1), [B, T, C])
+     c_idx_BxTxC = torch.broadcast_to(torch.arange(C, device=device).view(1, 1, C), [B, T, C])
+
+     indices_BTCx3 = torch.stack(
+         [
+             b_idx_BxTxC.reshape(-1),
+             t_idx_BxTxC.reshape(-1),
+             c_idx_BxTxC.reshape(-1),
+         ],
+         dim=1,
+     ).long()  # Ensure indices are long type
+
+     return t_idx_BxTxC, indices_BTCx3
+
+
+ def revert_audio_delay(
+     audio_BxTxC: torch.Tensor,
+     pad_value: int,
+     precomp: tp.Tuple[torch.Tensor, torch.Tensor],
+     T: int,
+ ) -> torch.Tensor:
+     """
+     Reverts a delay pattern from batched audio tokens using precomputed indices (PyTorch version).
+
+     Args:
+         audio_BxTxC: Input delayed audio tensor
+         pad_value: Padding value for out-of-bounds indices
+         precomp: Precomputed revert indices tuple containing:
+             - t_idx_BxTxC: Time offset indices tensor
+             - indices_BTCx3: Gather indices tensor for original audio
+         T: Original sequence length before padding
+
+     Returns:
+         Reverted audio tensor with same shape as input
+     """
+     t_idx_BxTxC, indices_BTCx3 = precomp
+     device = audio_BxTxC.device  # Get device from input tensor
+
+     # Move precomputed indices to the same device as audio_BxTxC if they aren't already
+     t_idx_BxTxC = t_idx_BxTxC.to(device)
+     indices_BTCx3 = indices_BTCx3.to(device)
+
+     # Using PyTorch advanced indexing (equivalent to tf.gather_nd or np equivalent)
+     gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
+     gathered_BxTxC = gathered_flat.view(audio_BxTxC.size())  # Use .size() for robust reshaping
+
+     # Create pad_tensor on the correct device
+     pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
+     # Create T tensor on the correct device for comparison
+     T_tensor = torch.tensor(T, device=device)
+
+     result_BxTxC = torch.where(t_idx_BxTxC >= T_tensor, pad_tensor, gathered_BxTxC)
+
+     return result_BxTxC
+
+
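A sanity-check sketch of the round trip (again assuming imports from dia.audio): applying a delay and then reverting it restores the original tokens wherever the shifted index stays in range:

import torch
from dia.audio import (
    build_delay_indices, apply_audio_delay,
    build_revert_indices, revert_audio_delay,
)

B, T, C = 1, 8, 2
codes = torch.randint(0, 1024, (B, T, C))
delayed = apply_audio_delay(codes, 1025, 1026, build_delay_indices(B, T, C, [0, 1]))
restored = revert_audio_delay(delayed, 1025, build_revert_indices(B, T, C, [0, 1]), T=T)
# restored[:, t, c] == codes[:, t, c] for t <= T - 1 - delay[c]; the final delay[c]
# steps repeat the last valid frame because the revert indices are clamped to T - 1.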
+ @torch.no_grad()
+ @torch.inference_mode()
+ def decode(
+     model,
+     audio_codes,
+ ):
+     """
+     Decodes the given frames into an output audio waveform.
+     """
+     if len(audio_codes) != 1:
+         raise ValueError(f"Expected one frame, got {len(audio_codes)}")
+
+     try:
+         audio_values = model.quantizer.from_codes(audio_codes)
+         audio_values = model.decode(audio_values[0])
+
+         return audio_values
+     except Exception as e:
+         print(f"Error in decode method: {str(e)}")
+         raise
+
+
+ def codebook_to_audio(generated_codes: torch.Tensor, model, delay_pattern, B=1, T=2600, C=9):
+     """Process a single codebook file to generate audio."""
+     # Remove BOS token
+     generated_codes = generated_codes[:, 1:]
+
+     if generated_codes.shape[1] > T:
+         generated_codes = generated_codes[:, :T]
+
+     seq_length = generated_codes.shape[1]
+
+     # Build revert indices
+     t_idx_BxTxC, indices_BTCx3 = build_revert_indices(B=B, T=seq_length, C=C, delay_pattern=delay_pattern)
+
+     # Transpose and add batch dimension
+     audio_BxTxC = generated_codes.transpose(1, 0).unsqueeze(0)
+     reverted_codebook = revert_audio_delay(
+         audio_BxTxC=audio_BxTxC,
+         pad_value=0,
+         precomp=(t_idx_BxTxC, indices_BTCx3),
+         T=seq_length,
+     )
+     reverted_codebook = reverted_codebook[:, :-30, :]
+
+     codebook = reverted_codebook.transpose(1, 2)
+
+     min_valid_index = 0
+     max_valid_index = 1023
+     invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
+
+     num_invalid = torch.sum(invalid_mask).item()
+     if num_invalid > 0:
+         print(f"Warning: Clamping {num_invalid} indices outside range [{min_valid_index}, {max_valid_index}] to 0.")
+
+     # Set invalid values to 0 (modify the tensor in-place)
+     codebook[invalid_mask] = 0
+     audio_array = decode(model, codebook)
+
+     return audio_array
dia/config.py ADDED
@@ -0,0 +1,206 @@
+ """Configuration management module for the Dia model.
+
+ This module provides comprehensive configuration management for the Dia model,
+ utilizing Pydantic for validation. It defines configurations for data processing,
+ model architecture (encoder and decoder), and training settings.
+
+ Key components:
+ - DataConfig: Parameters for data loading and preprocessing.
+ - EncoderConfig: Architecture details for the encoder module.
+ - DecoderConfig: Architecture details for the decoder module.
+ - ModelConfig: Combined model architecture settings.
+ - TrainingConfig: Training hyperparameters and settings.
+ - DiaConfig: Master configuration combining all components.
+ """
+
+ import os
+ from typing import Annotated
+
+ from pydantic import BaseModel, BeforeValidator, Field
+
+
+ class DataConfig(BaseModel, frozen=True):
+     """Configuration for data loading and preprocessing.
+
+     Attributes:
+         text_length: Maximum length of text sequences (must be multiple of 128).
+         audio_length: Maximum length of audio sequences (must be multiple of 128).
+         channels: Number of audio channels.
+         text_pad_value: Value used for padding text sequences.
+         audio_eos_value: Value representing the end of audio sequences.
+         audio_bos_value: Value representing the beginning of audio sequences.
+         audio_pad_value: Value used for padding audio sequences.
+         delay_pattern: List of delay values for each audio channel.
+     """
+
+     text_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = Field(gt=0, multiple_of=128)
+     audio_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = Field(gt=0, multiple_of=128)
+     channels: int = Field(default=9, gt=0, multiple_of=1)
+     text_pad_value: int = Field(default=0)
+     audio_eos_value: int = Field(default=1024)
+     audio_pad_value: int = Field(default=1025)
+     audio_bos_value: int = Field(default=1026)
+     delay_pattern: list[Annotated[int, Field(ge=0)]] = Field(default_factory=lambda: [0, 8, 9, 10, 11, 12, 13, 14, 15])
+
+     def __hash__(self) -> int:
+         """Generate a hash based on all fields of the config."""
+         return hash(
+             (
+                 self.text_length,
+                 self.audio_length,
+                 self.channels,
+                 self.text_pad_value,
+                 self.audio_pad_value,
+                 self.audio_bos_value,
+                 self.audio_eos_value,
+                 tuple(self.delay_pattern),
+             )
+         )
+
+
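One non-obvious detail above: the BeforeValidator rounds lengths up to the next multiple of 128 before the multiple_of=128 constraint runs, so arbitrary lengths validate instead of failing. A quick illustration:

cfg = DataConfig(text_length=1000, audio_length=3000)
assert cfg.text_length == 1024    # (1000 + 127) // 128 * 128
assert cfg.audio_length == 3072   # (3000 + 127) // 128 * 128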
+ class EncoderConfig(BaseModel, frozen=True):
+     """Configuration for the encoder component of the Dia model.
+
+     Attributes:
+         n_layer: Number of transformer layers.
+         n_embd: Embedding dimension.
+         n_hidden: Hidden dimension size in the MLP layers.
+         n_head: Number of attention heads.
+         head_dim: Dimension per attention head.
+         mlp_activations: List of activation functions for the MLP layers.
+         use_pre_norm: Whether to use pre-normalization (LayerNorm before attention/MLP).
+     """
+
+     n_layer: int = Field(gt=0)
+     n_embd: int = Field(gt=0)
+     n_hidden: int = Field(gt=0)
+     n_head: int = Field(gt=0)
+     head_dim: int = Field(gt=0)
+     mlp_activations: list[str] = Field(default=["silu", "linear"])
+     use_pre_norm: bool = Field(default=False)
+
+
+ class DecoderConfig(BaseModel, frozen=True):
+     """Configuration for the decoder component of the Dia model.
+
+     Attributes:
+         n_layer: Number of transformer layers.
+         n_embd: Embedding dimension.
+         n_hidden: Hidden dimension size in the MLP layers.
+         gqa_query_heads: Number of query heads for grouped-query self-attention.
+         kv_heads: Number of key/value heads for grouped-query self-attention.
+         gqa_head_dim: Dimension per query head for grouped-query self-attention.
+         cross_query_heads: Number of query heads for cross-attention.
+         cross_head_dim: Dimension per cross-attention head.
+         mlp_activations: List of activation functions for the MLP layers.
+         use_pre_norm: Whether to use pre-normalization.
+     """
+
+     n_layer: int = Field(gt=0)
+     n_embd: int = Field(gt=0)
+     n_hidden: int = Field(gt=0)
+     gqa_query_heads: int = Field(gt=0)
+     kv_heads: int = Field(gt=0)
+     gqa_head_dim: int = Field(gt=0)
+     cross_query_heads: int = Field(gt=0)
+     cross_head_dim: int = Field(gt=0)
+     mlp_activations: list[str] = Field(default=["silu", "linear"])
+     use_pre_norm: bool = Field(default=False)
+
+
+ class ModelConfig(BaseModel, frozen=True):
+     """Main configuration container for the Dia model architecture.
+
+     Attributes:
+         encoder: Configuration for the encoder component.
+         decoder: Configuration for the decoder component.
+         src_vocab_size: Size of the source (text) vocabulary.
+         tgt_vocab_size: Size of the target (audio code) vocabulary.
+         dropout: Dropout probability applied within the model.
+         normalization_layer_epsilon: Epsilon value for normalization layers (e.g., LayerNorm).
+         weight_dtype: Data type for model weights (e.g., "float32", "bfloat16").
+         rope_min_timescale: Minimum timescale for Rotary Positional Embeddings (RoPE).
+         rope_max_timescale: Maximum timescale for Rotary Positional Embeddings (RoPE).
+     """
+
+     encoder: EncoderConfig
+     decoder: DecoderConfig
+     src_vocab_size: int = Field(default=128, gt=0)
+     tgt_vocab_size: int = Field(default=1028, gt=0)
+     dropout: float = Field(default=0.0, ge=0.0, lt=1.0)
+     normalization_layer_epsilon: float = Field(default=1.0e-5, ge=0.0)
+     weight_dtype: str = Field(default="float32", description="Weight precision")
+     rope_min_timescale: int = Field(default=1, description="Timescale for global attention")
+     rope_max_timescale: int = Field(default=10_000, description="Timescale for global attention")
+
+
+ class TrainingConfig(BaseModel, frozen=True):
+     """Training process configuration and hyperparameters.
+
+     Note: This configuration currently only includes precision settings.
+     Other training parameters (like batch size, learning rate, optimizer settings)
+     are assumed to be handled externally.
+
+     Attributes:
+         dtype: Data type for activations during training (e.g., "bfloat16", "float32").
+         logits_dot_in_fp32: Whether to compute the final logits dot product in fp32 for stability.
+     """
+
+     dtype: str = Field(default="bfloat16", description="Activation precision")
+     logits_dot_in_fp32: bool = Field(default=False)
+
+
+ class DiaConfig(BaseModel, frozen=True):
+     """Master configuration for the Dia model.
+
+     Combines all sub-configurations into a single validated object.
+
+     Attributes:
+         version: Configuration version string.
+         model: Model architecture configuration.
+         training: Training process configuration (precision settings).
+         data: Data loading and processing configuration.
+     """
+
+     version: str = Field(default="1.0")
+     model: ModelConfig
+     training: TrainingConfig
+     data: DataConfig
+
+     def save(self, path: str) -> None:
+         """Save the current configuration instance to a JSON file.
+
+         Ensures the parent directory exists and the file has a .json extension.
+
+         Args:
+             path: The target file path to save the configuration.
+
+         Raises:
+             ValueError: If the path is not a file with a .json extension.
+         """
+         os.makedirs(os.path.dirname(path), exist_ok=True)
+         config_json = self.model_dump_json(indent=2)
+         with open(path, "w") as f:
+             f.write(config_json)
+
+     @classmethod
+     def load(cls, path: str) -> "DiaConfig | None":
+         """Load and validate a Dia configuration from a JSON file.
+
+         Args:
+             path: The path to the configuration file.
+
+         Returns:
+             A validated DiaConfig instance if the file exists and is valid,
+             otherwise None if the file is not found.
+
+         Raises:
+             ValueError: If the path does not point to an existing .json file.
+             pydantic.ValidationError: If the JSON content fails validation against the DiaConfig schema.
+         """
+         try:
+             with open(path, "r") as f:
+                 content = f.read()
+             return cls.model_validate_json(content)
+         except FileNotFoundError:
+             return None
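A save/load round-trip sketch (the hyperparameter values below are made up purely for illustration; the real values ship in the repo's config.json):

cfg = DiaConfig(
    model=ModelConfig(
        encoder=EncoderConfig(n_layer=2, n_embd=64, n_hidden=128, n_head=4, head_dim=16),
        decoder=DecoderConfig(
            n_layer=2, n_embd=64, n_hidden=128,
            gqa_query_heads=4, kv_heads=2, gqa_head_dim=16,
            cross_query_heads=4, cross_head_dim=16,
        ),
    ),
    training=TrainingConfig(),
    data=DataConfig(text_length=128, audio_length=256),
)
cfg.save("tmp/dia_config.json")                      # hypothetical path
assert DiaConfig.load("tmp/dia_config.json") == cfg  # load() returns None if the file is missing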
dia/layers.py ADDED
@@ -0,0 +1,903 @@
+ from typing import Any
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ from torch.nn import RMSNorm
+
+ from .config import DiaConfig
+
+
+ def _normalize_axes(axes: tuple[int, ...], ndim: int) -> tuple[int, ...]:
+     return tuple(ax if ax >= 0 else ndim + ax for ax in axes)
+
+
+ def _str_to_dtype(dtype_str: str) -> torch.dtype | None:
+     # Allow None for default behavior
+     if dtype_str is None or dtype_str.lower() == "none":
+         return None
+     if dtype_str == "float32":
+         return torch.float32
+     elif dtype_str == "float16":
+         return torch.float16
+     elif dtype_str == "bfloat16":
+         return torch.bfloat16
+     else:
+         raise ValueError(f"Unsupported dtype string: {dtype_str}")
+
+
+ class DenseGeneral(nn.Module):
+     """
+     PyTorch equivalent of flax.linen.DenseGeneral with shapes defined at init.
+
+     Stores weights (`kernel`) in the same layout as Jax and uses torch.tensordot
+     for the generalized matrix multiplication. Weight/bias shapes are calculated
+     and parameters created during initialization based on config.
+     `load_weights` validates shapes and copies data.
+
+     Attributes:
+         axis (Tuple[int, ...]): Input axis or axes to contract.
+         in_shapes (Tuple[int, ...]): Sizes of the input dimensions specified by `axis`.
+         out_features (Tuple[int, ...]): Shape of the output features (non-contracted dims).
+         use_bias (bool): Whether to add a bias term.
+         weight (nn.Parameter): The kernel parameter.
+         bias (Optional[nn.Parameter]): The bias parameter (if use_bias=True).
+     """
+
+     def __init__(
+         self,
+         in_shapes: tuple[int, ...],
+         out_features: tuple[int, ...],
+         axis: tuple[int, ...] = (-1,),
+         dtype: torch.dtype | None = None,
+         weight_dtype: torch.dtype | None = None,
+         device: torch.device | None = None,
+     ):
+         super().__init__()
+         self.in_shapes = in_shapes
+         self.out_features = out_features
+         self.axis = axis
+         self.dtype = dtype
+         self.kernel_shape = self.in_shapes + self.out_features
+
+         factory_kwargs = {"device": device, "dtype": weight_dtype}
+         self.weight = nn.Parameter(torch.empty(self.kernel_shape, **factory_kwargs))
+         self.register_parameter("bias", None)
+
+     def forward(self, inputs: Tensor) -> Tensor:
+         norm_axis = _normalize_axes(self.axis, inputs.ndim)
+         kernel_contract_axes = tuple(range(len(norm_axis)))
+
+         output = torch.tensordot(
+             inputs.float(),
+             self.weight.float(),
+             dims=(norm_axis, kernel_contract_axes),
+         ).to(inputs.dtype)
+         return output
+
+
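For intuition: with axis=(-1,) and a single output dimension, DenseGeneral reduces to a bias-free linear layer with a JAX-style (in, out) kernel layout; the tensordot form only earns its keep when several axes are contracted or produced at once, as in the attention projections further below. A minimal check:

import torch

dg = DenseGeneral(in_shapes=(16,), out_features=(32,), axis=(-1,))
torch.nn.init.normal_(dg.weight)   # the module leaves weights uninitialized (torch.empty)
x = torch.randn(4, 16)
assert torch.allclose(dg(x), x @ dg.weight, atol=1e-5)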
+ def get_activation_fn(activation_string: str) -> nn.Module:  # Return Module instance
+     """Maps activation string to PyTorch activation function module."""
+     if activation_string == "gelu":
+         return nn.GELU()
+     elif activation_string == "relu":
+         return nn.ReLU()
+     elif activation_string == "silu" or activation_string == "swish":
+         return nn.SiLU()
+     elif activation_string == "linear":
+         return nn.Identity()
+     else:
+         raise ValueError(f"Unsupported activation function: {activation_string}")
+
+
+ class MlpBlock(nn.Module):
+     """MLP block using DenseGeneral."""
+
+     def __init__(
+         self,
+         config: DiaConfig,
+         embed_dim: int,
+         intermediate_dim: int,
+         dropout_rate: float,
+         activations: list[str] = ["silu", "linear"],
+         use_pre_norm: bool = False,
+     ):
+         super().__init__()
+         self.use_pre_norm = use_pre_norm
+         num_activations = len(activations)
+         compute_dtype = _str_to_dtype(config.training.dtype)
+         weight_dtype = _str_to_dtype(config.model.weight_dtype)
+         self.dtype = compute_dtype
+         # Assume default device for now, could be passed in config
+
+         if use_pre_norm:
+             self.pre_norm = RMSNorm(
+                 embed_dim,
+                 eps=config.model.normalization_layer_epsilon,
+                 dtype=torch.float32,
+             )
+
+         self.wi_fused = DenseGeneral(
+             in_shapes=(embed_dim,),
+             out_features=(
+                 num_activations,
+                 intermediate_dim,
+             ),
+             axis=(-1,),
+             dtype=compute_dtype,
+             weight_dtype=weight_dtype,
+         )
+
+         self.activation_fn_0 = get_activation_fn(activations[0])  # silu
+         self.activation_fn_1 = get_activation_fn(activations[1])  # linear
+
+         self.dropout = nn.Dropout(dropout_rate)
+
+         # Output layer using DenseGeneral
+         self.wo = DenseGeneral(
+             in_shapes=(intermediate_dim,),
+             out_features=(embed_dim,),
+             axis=(-1,),
+             dtype=compute_dtype,
+             weight_dtype=weight_dtype,
+         )
+
+     def forward(self, x: torch.Tensor, deterministic: bool) -> torch.Tensor:
+         """Forward pass."""
+         if self.use_pre_norm and hasattr(self, "pre_norm"):
+             x = self.pre_norm(x)
+
+         fused_x = self.wi_fused(x)
+
+         gate_input = fused_x[..., 0, :]
+         up_input = fused_x[..., 1, :]
+
+         gate = self.activation_fn_0(gate_input)
+         up = self.activation_fn_1(up_input)
+         hidden = torch.mul(gate, up).to(self.dtype)
+
+         if not deterministic:
+             hidden = self.dropout(hidden)
+
+         output = self.wo(hidden)
+         return output
+
+
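The block above is a gated MLP (the default silu/linear pair makes it SwiGLU-style): one fused projection produces both branches, which are split, activated, and multiplied. A functional restatement, with w_gate/w_up standing in for the two slices of wi_fused:

import torch
import torch.nn.functional as F

def gated_mlp_reference(x, w_gate, w_up, w_out):
    # Equivalent to MlpBlock.forward with activations=["silu", "linear"]
    return (F.silu(x @ w_gate) * (x @ w_up)) @ w_out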
+ class RotaryEmbedding(nn.Module):
+     """Rotary Position Embedding (RoPE) implementation in PyTorch."""
+
+     def __init__(
+         self,
+         embedding_dims: int,
+         min_timescale: int = 1,
+         max_timescale: int = 10000,
+         dtype: torch.dtype = torch.float32,
+     ):
+         super().__init__()
+         if embedding_dims % 2 != 0:
+             raise ValueError("Embedding dim must be even for RoPE.")
+         self.embedding_dims = embedding_dims
+         self.min_timescale = min_timescale
+         self.max_timescale = max_timescale
+         self.dtype = dtype
+
+         half_embedding_dim = embedding_dims // 2
+         fraction = (2.0 * torch.arange(0, half_embedding_dim)) / embedding_dims
+         self.register_buffer(
+             "timescale",
+             self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction,
+             persistent=False,
+         )
+
+     def extra_repr(self) -> str:
+         s = f"{self.timescale.shape}"
+         return s
+
+     def forward(self, inputs: torch.Tensor, position: torch.Tensor):
+         """Applies RoPE."""
+         position = position.unsqueeze(-1).unsqueeze(-1)
+         timescale = self.timescale.to(inputs.device)
+         sinusoid_inp = position / timescale
+         sin = torch.sin(sinusoid_inp).to(inputs.dtype)
+         cos = torch.cos(sinusoid_inp).to(inputs.dtype)
+         first_half, second_half = torch.chunk(inputs, 2, dim=-1)
+         first_part = first_half * cos - second_half * sin
+         second_part = second_half * cos + first_half * sin
+         return torch.cat((first_part, second_part), dim=-1)
+
+
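Two properties worth sanity-checking in the implementation above: position 0 is the identity (sin 0 = 0, cos 0 = 1), and each (first_half, second_half) pair is rotated, so norms are preserved. A quick check:

import torch

rope = RotaryEmbedding(embedding_dims=8)
x = torch.randn(1, 4, 2, 8)              # (B, T, heads, head_dim)
pos = torch.arange(4).unsqueeze(0)       # (B, T)
y = rope(x, position=pos)
assert torch.allclose(y[:, 0], x[:, 0])                            # identity at position 0
assert torch.allclose(y.norm(dim=-1), x.norm(dim=-1), atol=1e-5)   # norms preserved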
+ class KVCache:
+     def __init__(self, num_heads, max_len, head_dim, device, k=None, v=None):
+         self.k = (
+             torch.zeros((2, num_heads, max_len, head_dim), device=device)
+             if k is None
+             else k
+         )
+         self.v = (
+             torch.zeros((2, num_heads, max_len, head_dim), device=device)
+             if v is None
+             else v
+         )
+         self.current_idx = 0
+         self.max_len = max_len
+
+     def get_kv_for_attention(self, current_k, current_v):
+         if self.current_idx == 0:
+             return current_k, current_v
+         else:
+             past_k = self.k[:, :, : self.current_idx, :]
+             past_v = self.v[:, :, : self.current_idx, :]
+             attn_k = torch.cat((past_k, current_k), dim=2)
+             attn_v = torch.cat((past_v, current_v), dim=2)
+             return attn_k, attn_v
+
+     def update_cache(self, k, v):
+         assert self.current_idx < self.max_len
+         self.k[:, :, self.current_idx : self.current_idx + 1, :] = k
+         self.v[:, :, self.current_idx : self.current_idx + 1, :] = v
+         self.current_idx += 1
+
+     def prefill_kv(self, k, v):
+         prefill_len = k.shape[2]
+         assert prefill_len <= self.max_len
+         self.k[:, :, :prefill_len, :] = k
+         self.v[:, :, :prefill_len, :] = v
+         self.current_idx = prefill_len
+
+
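Usage sketch for the two-phase cache above: prefill writes a whole prompt's K/V at once, then each decode step appends one position. The hard-coded leading dimension of 2 appears to correspond to the batched pair of sequences used at inference (see the (2, 1, D) comments in DecoderLayer below):

import torch

cache = KVCache(num_heads=4, max_len=16, head_dim=8, device="cpu")
k = torch.randn(2, 4, 5, 8)
cache.prefill_kv(k, k.clone())            # write a 5-step prompt in one shot
assert cache.current_idx == 5

k1 = torch.randn(2, 4, 1, 8)              # one new decode step
attn_k, attn_v = cache.get_kv_for_attention(k1, k1.clone())
assert attn_k.shape[2] == 6               # 5 cached steps + the current one
cache.update_cache(k1, k1.clone())
assert cache.current_idx == 6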
+ class Attention(nn.Module):
+     """Attention using DenseGeneral."""
+
+     def __init__(
+         self,
+         config: DiaConfig,
+         q_embed_dim: int,
+         kv_embed_dim: int,
+         num_query_heads: int,
+         num_kv_heads: int,
+         head_dim: int,
+         dropout_rate: float,
+         is_cross_attn: bool = False,
+         out_embed_dim: int | None = None,
+     ):
+         super().__init__()
+         self.num_query_heads = num_query_heads
+         self.num_kv_heads = num_kv_heads
+         self.head_dim = head_dim
+         self.is_cross_attn = is_cross_attn
+         self.dropout_rate = dropout_rate
+         compute_dtype = _str_to_dtype(config.training.dtype)
+         weight_dtype = _str_to_dtype(config.model.weight_dtype)
+         self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
+         self.projected_query_dim = num_query_heads * head_dim
+         if num_query_heads % num_kv_heads != 0:
+             raise ValueError(
+                 f"num_query_heads ({num_query_heads}) must be divisible by num_kv_heads ({num_kv_heads})"
+             )
+         self.num_gqa_groups = num_query_heads // num_kv_heads
+
+         # --- Projection Layers using DenseGeneral ---
+         self.q_proj = DenseGeneral(
+             in_shapes=(q_embed_dim,),
+             out_features=(num_query_heads, head_dim),
+             axis=(-1,),
+             dtype=compute_dtype,
+             weight_dtype=weight_dtype,
+         )
+         self.k_proj = DenseGeneral(
+             in_shapes=(kv_embed_dim,),
+             out_features=(num_kv_heads, head_dim),
+             axis=(-1,),
+             dtype=compute_dtype,
+             weight_dtype=weight_dtype,
+         )
+         self.v_proj = DenseGeneral(
+             in_shapes=(kv_embed_dim,),
+             out_features=(num_kv_heads, head_dim),
+             axis=(-1,),
+             dtype=compute_dtype,
+             weight_dtype=weight_dtype,
+         )
+         self.o_proj = DenseGeneral(
+             in_shapes=(num_query_heads, head_dim),
+             out_features=(self.output_dim,),
+             axis=(-2, -1),
+             dtype=compute_dtype,
+             weight_dtype=weight_dtype,
+         )
+
+         # --- Rotary Embedding ---
+         self.rotary_emb = RotaryEmbedding(
+             embedding_dims=self.head_dim,
+             min_timescale=config.model.rope_min_timescale,
+             max_timescale=config.model.rope_max_timescale,
+             dtype=compute_dtype,
+         )
+
+     def forward(
+         self,
+         Xq: torch.Tensor,  # (B, T, D) T = 1 in AR generation
+         Xkv: torch.Tensor,  # (B, S, E) S = 1 in AR generation
+         q_positions: torch.Tensor,  # (B, T)
+         kv_positions: torch.Tensor | None = None,  # (B, S)
+         deterministic: bool = True,
+         attn_mask: (
+             torch.Tensor | None
+         ) = None,  # None in Decoder Self Attention, Valid mask in Others
+         cache: KVCache | None = None,  # None in Encoder, KVCache in Decoder
+         prefill: bool = False,  # True only when prefilling KV Cache
+     ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
+         """
+         Performs attention calculation with optional KV caching.
+
+         Args:
+             Xq: Query tensor (B, T, D). T=1 during single-step decoding.
+             Xkv: Key/Value source tensor (B, S, E). S=1 during single-step decoding for self-attn.
+             q_positions: Positions for queries (B, T).
+             kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
+             deterministic: If True, disable dropout.
+             attn_mask: Attention mask.
+             cache: KVCache.
+             prefill: If True, use prefill mode.
+
+         Returns:
+             A tuple containing:
+             - output: The attention output tensor (B, T, output_dim).
+             - present_kv: The K/V state to be cached for the next step ((B, N, S_new, H), (B, N, S_new, H)). For self-attn, S_new = S_past + S. For cross-attn, S_new = S_kv.
+         """
+         if kv_positions is None:
+             kv_positions = q_positions
+         original_dtype = Xq.dtype
+
+         Xq_BxTxNxH = self.q_proj(Xq)
+         Xq_BxTxNxH = self.rotary_emb(Xq_BxTxNxH, position=q_positions)
+         Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)
+
+         # Input values into attention calculation
+         attn_k: torch.Tensor | None = None
+         attn_v: torch.Tensor | None = None
+         new_kv_cache: tuple[torch.Tensor, torch.Tensor] | None = None
+
+         # Decoder Cross Attention
+         if self.is_cross_attn:
+             # Directly use cache (no need to check index)
+             attn_k, attn_v = cache.k, cache.v
+             if (
+                 attn_k.shape[1] != self.num_query_heads
+                 or attn_v.shape[1] != self.num_query_heads
+             ):
+                 raise ValueError(
+                     f"Cross-attention cache head dimension ({attn_k.shape[1]}) "
+                     f"does not match num_query_heads ({self.num_query_heads}). "
+                     "Cache should be pre-repeated for GQA."
+                 )
+         # Self Attention
+         else:
+             Xk_BxSxKxH = self.k_proj(Xkv)  # (B, S, K, H)
+             Xv_BxSxKxH = self.v_proj(Xkv)  # (B, S, K, H)
+             Xk_BxSxKxH = self.rotary_emb(
+                 Xk_BxSxKxH, position=kv_positions
+             )  # (B, S, K, H)
+
+             Xk_BxKxSxH = Xk_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
+             Xv_BxKxSxH = Xv_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
+             # S=1 for Decode Step
+
+             if self.num_gqa_groups > 1:
+                 Xk_BxNxSxH = Xk_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
+                 Xv_BxNxSxH = Xv_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
+             else:
+                 Xk_BxNxSxH = Xk_BxKxSxH
+                 Xv_BxNxSxH = Xv_BxKxSxH
+
+             # Encoder Self Attention
+             if cache is None:
+                 attn_k = Xk_BxNxSxH
+                 attn_v = Xv_BxNxSxH
+             # Decoder Self Attention
+             else:
+                 # In prefill mode, we fill in cache until prefill length
+                 if prefill:
+                     attn_k, attn_v = Xk_BxNxSxH, Xv_BxNxSxH
+                     cache.prefill_kv(attn_k, attn_v)
+                 # In decode step, we add current K/V to cache step by step
+                 else:
+                     new_kv_cache = Xk_BxNxSxH, Xv_BxNxSxH
+                     attn_k, attn_v = cache.get_kv_for_attention(Xk_BxNxSxH, Xv_BxNxSxH)
+
+         # Add the dtype conversion here - after both cross-attention and self-attention paths
+         if attn_k is not None and attn_v is not None:
+             attn_k = attn_k.to(Xq_BxNxTxH.dtype)
+             attn_v = attn_v.to(Xq_BxNxTxH.dtype)
+
+         attn_output = F.scaled_dot_product_attention(
+             Xq_BxNxTxH,
+             attn_k,
+             attn_v,
+             attn_mask=attn_mask,
+             dropout_p=self.dropout_rate if not deterministic else 0.0,
+             scale=1.0,
+         )
+
+         attn_output = attn_output.transpose(1, 2).contiguous()  # (B, T, N, H)
+         output = self.o_proj(attn_output)
+
+         return output.to(original_dtype), new_kv_cache
+
+
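A small shape check for the GQA expansion in the self-attention path above: repeat_interleave turns each of the K KV heads into num_gqa_groups consecutive copies so that scaled_dot_product_attention sees matching head counts:

import torch

B, K, S, H = 2, 4, 10, 16      # 4 KV heads
groups = 16 // K               # 16 query heads -> 4 queries share each KV head
k = torch.randn(B, K, S, H)
k_expanded = k.repeat_interleave(groups, dim=1)
assert k_expanded.shape == (B, 16, S, H)
# Expanded heads 0-3 are copies of KV head 0, heads 4-7 of KV head 1, and so on.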
+ class EncoderLayer(nn.Module):
+     """Transformer Encoder Layer using DenseGeneral."""
+
+     def __init__(self, config: DiaConfig):
+         super().__init__()
+         self.config = config
+         model_config = config.model
+         enc_config = config.model.encoder
+         embed_dim = enc_config.n_embd
+
+         self.pre_sa_norm = RMSNorm(
+             embed_dim,
+             eps=model_config.normalization_layer_epsilon,
+             dtype=torch.float32,
+         )
+         self.self_attention = Attention(
+             config=config,
+             q_embed_dim=embed_dim,
+             kv_embed_dim=embed_dim,
+             num_query_heads=enc_config.n_head,
+             num_kv_heads=enc_config.n_head,
+             head_dim=enc_config.head_dim,
+             dropout_rate=model_config.dropout,
+             is_cross_attn=False,
+             out_embed_dim=embed_dim,
+         )
+         self.post_sa_norm = RMSNorm(
+             embed_dim,
+             eps=model_config.normalization_layer_epsilon,
+             dtype=torch.float32,
+         )
+         self.mlp = MlpBlock(
+             config=config,
+             embed_dim=embed_dim,
+             intermediate_dim=enc_config.n_hidden,
+             activations=enc_config.mlp_activations,
+             dropout_rate=model_config.dropout,
+             use_pre_norm=enc_config.use_pre_norm,
+         )
+         self.dropout = nn.Dropout(model_config.dropout)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         src_positions: torch.Tensor | None = None,
+         deterministic: bool = True,
+         attn_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         residual = x
+         x_norm = self.pre_sa_norm(x)
+
+         sa_out, _ = self.self_attention(
+             Xq=x_norm,
+             Xkv=x_norm,
+             q_positions=src_positions,
+             kv_positions=src_positions,
+             deterministic=deterministic,
+             attn_mask=attn_mask,
+         )
+         x = residual + sa_out
+
+         residual = x
+         x_norm = self.post_sa_norm(x)
+         mlp_out = self.mlp(x_norm, deterministic=deterministic)
+         x = residual + mlp_out
+
+         if not deterministic:
+             x = self.dropout(x)
+         return x
+
+
+ class Encoder(nn.Module):
+     """Transformer Encoder Stack using DenseGeneral."""
+
+     def __init__(self, config: DiaConfig):
+         super().__init__()
+         self.config = config
+         model_config = config.model
+         enc_config = config.model.encoder
+         compute_dtype = _str_to_dtype(config.training.dtype)
+
+         self.embedding = nn.Embedding(
+             model_config.src_vocab_size,
+             enc_config.n_embd,
+             dtype=compute_dtype,
+         )
+         self.dropout = nn.Dropout(model_config.dropout)
+         self.layers = nn.ModuleList(
+             [EncoderLayer(config=config) for _ in range(enc_config.n_layer)]
+         )
+         self.norm = RMSNorm(
+             enc_config.n_embd,
+             eps=model_config.normalization_layer_epsilon,
+             dtype=torch.float32,
+         )
+
+     def forward(
+         self,
+         x_ids: torch.Tensor,
+         src_positions: torch.Tensor | None = None,
+         deterministic: bool = True,
+         attn_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         x = self.embedding(x_ids)
+
+         if not deterministic:
+             x = self.dropout(x)
+
+         for layer in self.layers:
+             x = layer(
+                 x,
+                 src_positions=src_positions,
+                 deterministic=deterministic,
+                 attn_mask=attn_mask,
+             )
+         x = self.norm(x)
+         if not deterministic:
+             x = self.dropout(x)
+         return x
+
+
550
+ class DecoderLayer(nn.Module):
551
+ """Transformer Decoder Layer using DenseGeneral."""
552
+
553
+ def __init__(self, config: DiaConfig):
554
+ super().__init__()
555
+ self.config = config
556
+ model_config = config.model
557
+ dec_config = config.model.decoder
558
+ enc_config = config.model.encoder
559
+ dec_embed_dim = dec_config.n_embd
560
+ enc_embed_dim = enc_config.n_embd
561
+
562
+ # Norms
563
+ self.pre_sa_norm = RMSNorm(
564
+ dec_embed_dim,
565
+ eps=model_config.normalization_layer_epsilon,
566
+ dtype=torch.float32,
567
+ )
568
+ self.pre_ca_norm = RMSNorm(
569
+ dec_embed_dim,
570
+ eps=model_config.normalization_layer_epsilon,
571
+ dtype=torch.float32,
572
+ )
573
+ self.pre_mlp_norm = RMSNorm(
574
+ dec_embed_dim,
575
+ eps=model_config.normalization_layer_epsilon,
576
+ dtype=torch.float32,
577
+ )
578
+
579
+ # Self-Attention (GQA) with Causal Masking
580
+ self.self_attention = Attention(
581
+ config=config,
582
+ q_embed_dim=dec_embed_dim,
583
+ kv_embed_dim=dec_embed_dim,
584
+ num_query_heads=dec_config.gqa_query_heads,
585
+ num_kv_heads=dec_config.kv_heads,
586
+ head_dim=dec_config.gqa_head_dim,
587
+ dropout_rate=model_config.dropout,
588
+ is_cross_attn=False,
589
+ out_embed_dim=dec_embed_dim,
590
+ )
591
+ # Cross-Attention (MHA)
592
+ self.cross_attention = Attention(
593
+ config=config,
594
+ q_embed_dim=dec_embed_dim,
595
+ kv_embed_dim=enc_embed_dim, # Note kv_embed_dim
596
+ num_query_heads=dec_config.cross_query_heads,
597
+ num_kv_heads=dec_config.cross_query_heads,
598
+ head_dim=dec_config.cross_head_dim,
599
+ dropout_rate=model_config.dropout,
600
+ is_cross_attn=True,
601
+ out_embed_dim=dec_embed_dim,
602
+ )
603
+ # MLP
604
+ self.mlp = MlpBlock(
605
+ config=config,
606
+ embed_dim=dec_embed_dim,
607
+ intermediate_dim=dec_config.n_hidden,
608
+ activations=dec_config.mlp_activations,
609
+ dropout_rate=model_config.dropout,
610
+ use_pre_norm=dec_config.use_pre_norm,
611
+ )
612
+
613
+ def forward(
614
+ self,
615
+ x: torch.Tensor,
616
+ encoder_out: torch.Tensor,
617
+ tgt_positions: torch.Tensor,
618
+ src_positions: torch.Tensor | None,
619
+ deterministic: bool,
620
+ self_attn_mask: torch.Tensor,
621
+ cross_attn_mask: torch.Tensor,
622
+ self_attn_cache: KVCache,
623
+ cross_attn_cache: KVCache,
624
+ prefill: bool = False,
625
+ ) -> torch.Tensor:
626
+ residual = x
627
+ x_norm = self.pre_sa_norm(x)
628
+
629
+ sa_out, new_kv_cache = self.self_attention(
630
+ Xq=x_norm, # (2, 1, D)
631
+ Xkv=x_norm, # (2, 1, D)
632
+ q_positions=tgt_positions, # (2, 1)
633
+ kv_positions=tgt_positions, # (2, 1)
634
+ deterministic=deterministic,
635
+ attn_mask=self_attn_mask, # (2, 1, 1, S_max)
636
+ cache=self_attn_cache,
637
+ prefill=prefill,
638
+ )
639
+
640
+ x = residual + sa_out
641
+
642
+ # 2. Cross-Attention
643
+ residual = x
644
+ x_norm = self.pre_ca_norm(x)
645
+ ca_out, _ = self.cross_attention(
646
+ Xq=x_norm,
647
+ Xkv=encoder_out,
648
+ q_positions=tgt_positions,
649
+ kv_positions=src_positions,
650
+ deterministic=deterministic,
651
+ attn_mask=cross_attn_mask,
652
+ cache=cross_attn_cache,
653
+ )
654
+ x = residual + ca_out
655
+
656
+ # 3. MLP
657
+ residual = x
658
+ x_norm = self.pre_mlp_norm(x)
659
+ mlp_out = self.mlp(x_norm, deterministic=deterministic)
660
+ x = residual + mlp_out
661
+
662
+ return x, new_kv_cache
663
+
664
+
665
+ class Decoder(nn.Module):
666
+ """Transformer Decoder Stack using DenseGeneral."""
667
+
668
+ def __init__(self, config: DiaConfig):
669
+ super().__init__()
670
+ self.config = config
671
+ model_config = config.model
672
+ dec_config = config.model.decoder
673
+ train_config = config.training
674
+ data_config = config.data
675
+ compute_dtype = _str_to_dtype(config.training.dtype)
676
+ weight_dtype = _str_to_dtype(config.model.weight_dtype)
677
+ self.num_channels = data_config.channels
678
+ self.num_layers = dec_config.n_layer
679
+
680
+ self.embeddings = nn.ModuleList(
681
+ [
682
+ nn.Embedding(
683
+ model_config.tgt_vocab_size, dec_config.n_embd, dtype=compute_dtype
684
+ )
685
+ for _ in range(self.num_channels)
686
+ ]
687
+ )
688
+ self.dropout = nn.Dropout(model_config.dropout)
689
+ self.layers = nn.ModuleList(
690
+ [DecoderLayer(config=config) for _ in range(self.num_layers)]
691
+ )
692
+ self.norm = RMSNorm(
693
+ dec_config.n_embd,
694
+ eps=model_config.normalization_layer_epsilon,
695
+ dtype=torch.float32,
696
+ )
697
+
698
+ # Final Logits Projection using DenseGeneral
699
+ self.logits_dense = DenseGeneral(
700
+ in_shapes=(dec_config.n_embd,),
701
+ out_features=(self.num_channels, model_config.tgt_vocab_size),
702
+ axis=(-1,),
703
+ dtype=(torch.float32 if train_config.logits_dot_in_fp32 else compute_dtype),
704
+ weight_dtype=weight_dtype,
705
+ )
706
+ self.logits_in_fp32 = train_config.logits_dot_in_fp32
707
+
708
+ def precompute_cross_attention_kv(
709
+ self,
710
+ max_len: int,
711
+ encoder_out: torch.Tensor, # (B, S, E)
712
+ src_positions: torch.Tensor | None, # (B, S)
713
+ ) -> list[KVCache]:
714
+ """
715
+ Computes the Key and Value tensors for cross-attention for each layer from the encoder output.
716
+ """
717
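+ # The cross-attention keys/values depend only on encoder_out, which is
+ # fixed for the entire generation, so they are projected once here rather
+ # than recomputed at every decode step.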
+ per_layer_kv_cache: list[KVCache] = []
718
+
719
+ for layer in self.layers:
720
+ cross_attn_module = layer.cross_attention
721
+ k_proj = cross_attn_module.k_proj(encoder_out)
722
+ v_proj = cross_attn_module.v_proj(encoder_out)
723
+
724
+ k_proj = cross_attn_module.rotary_emb(k_proj, position=src_positions)
725
+ k = k_proj.transpose(1, 2)
726
+ v = v_proj.transpose(1, 2)
727
+
728
+ per_layer_kv_cache.append(
729
+ KVCache(
730
+ cross_attn_module.num_kv_heads,
731
+ max_len,
732
+ cross_attn_module.head_dim,
733
+ k.device,
734
+ k=k,
735
+ v=v,
736
+ )
737
+ )
738
+
739
+ return per_layer_kv_cache
740
+
741
+ def decode_step(
742
+ self,
743
+ tgt_ids_Bx1xC: torch.Tensor, # [B, 1, C]
744
+ tgt_pos_Bx1: torch.Tensor, # [B, 1]
745
+ encoder_out: torch.Tensor, # [B, S, E]
746
+ self_attn_mask: Any, # None
747
+ cross_attn_mask: torch.Tensor, # [B, 1, 1, S]
748
+ self_attention_cache: list[KVCache],
749
+ cross_attention_cache: list[KVCache],
750
+ ) -> tuple[torch.Tensor, list]:
751
+ """
752
+ Performs a single decoding step, managing KV caches layer by layer.
753
+
754
+ Returns:
755
+ A tuple containing:
756
+ - logits_Bx1xCxV: The final output logits for the current step (B, 1, C, V), cast to float32.
+ - new_cache: A list with each layer's self-attention K/V tensors from this step.
757
+ """
758
+ assert (
759
+ self_attn_mask is None
760
+ ), "Self-attention mask should be None, kept for pattern"
761
+
762
+ x = None
763
+ for i in range(self.num_channels):
764
+ channel_tokens = tgt_ids_Bx1xC[..., i]
765
+ channel_embed = self.embeddings[i](channel_tokens)
766
+ x = channel_embed if x is None else x + channel_embed
767
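+ # Note: the per-channel codebook embeddings are summed into a single
+ # residual stream rather than concatenated (same scheme as in forward()).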
+
768
+ new_cache = []
769
+
770
+ for i, layer in enumerate(self.layers):
771
+ self_cache = self_attention_cache[i]
772
+ cross_cache = cross_attention_cache[i]
773
+ x, new_kv_cache = layer(
774
+ x, # (2, 1, D)
775
+ encoder_out, # (2, S, E)
776
+ src_positions=None, # CA KV is already computed
777
+ tgt_positions=tgt_pos_Bx1, # (2, 1)
778
+ deterministic=True,
779
+ self_attn_mask=None,
780
+ cross_attn_mask=cross_attn_mask,
781
+ self_attn_cache=self_cache,
782
+ cross_attn_cache=cross_cache,
783
+ )
784
+ new_cache.append(new_kv_cache)
785
+
786
+ x = self.norm(x)
787
+ logits_Bx1xCxV = self.logits_dense(x)
788
+
789
+ return logits_Bx1xCxV.to(torch.float32), new_cache
790
+
791
+ def forward(
792
+ self,
793
+ tgt_ids_BxTxC: torch.Tensor,
794
+ encoder_out: torch.Tensor,
795
+ tgt_positions: torch.Tensor,
796
+ src_positions: torch.Tensor,
797
+ deterministic: bool,
798
+ self_attn_mask: torch.Tensor,
799
+ cross_attn_mask: torch.Tensor,
800
+ self_attention_cache: list[KVCache],
801
+ cross_attention_cache: list[KVCache],
802
+ ) -> torch.Tensor:
803
+ """
804
+ Forward pass for the Decoder stack, managing KV caches.
805
+
806
+ Args:
807
+ tgt_ids_BxTxC: Target token IDs (B, T, C).
808
+ encoder_out: Output from the encoder (B, S, E).
809
+ tgt_positions: Positions for target sequence (B, T).
810
+ src_positions: Positions for source sequence (B, S).
811
+ deterministic: Disable dropout if True.
812
+ self_attn_mask: Mask for self-attention.
813
+ cross_attn_mask: Mask for cross-attention.
814
+ self_attention_cache: List containing the self-attention KVCache for each
+ layer; its length must equal num_layers.
+ cross_attention_cache: List containing the pre-computed cross-attention
+ KVCache (derived from encoder_out) for each layer.
+
+ Returns:
+ logits: The final output logits (B, T, C, V), cast to float32.
826
+ """
827
+ _, _, num_channels_in = tgt_ids_BxTxC.shape
828
+ assert num_channels_in == self.num_channels, "Input channels mismatch"
829
+
830
+ # Embeddings
831
+ x = None
832
+ for i in range(self.num_channels):
833
+ channel_tokens = tgt_ids_BxTxC[..., i]
834
+ channel_embed = self.embeddings[i](channel_tokens)
835
+ x = channel_embed if x is None else x + channel_embed
836
+
837
+ if not deterministic:
838
+ x = self.dropout(x)
839
+
840
+ for i, layer in enumerate(self.layers):
841
+ x, _ = layer(
842
+ x,
843
+ encoder_out,
844
+ tgt_positions=tgt_positions,
845
+ src_positions=src_positions,
846
+ deterministic=deterministic,
847
+ self_attn_mask=self_attn_mask,
848
+ cross_attn_mask=cross_attn_mask,
849
+ self_attn_cache=self_attention_cache[i],
850
+ cross_attn_cache=cross_attention_cache[i],
851
+ prefill=True,
852
+ )
853
+
854
+ # Final Norm
855
+ x = self.norm(x)
856
+ logits_BxTxCxV = self.logits_dense(x)
857
+
858
+ return logits_BxTxCxV.to(torch.float32)
859
+
860
+
861
+ class DiaModel(nn.Module):
862
+ """PyTorch Dia Model using DenseGeneral."""
863
+
864
+ def __init__(self, config: DiaConfig):
865
+ super().__init__()
866
+ self.config = config
867
+ self.encoder = Encoder(config)
868
+ self.decoder = Decoder(config)
869
+
870
+ def forward(
871
+ self,
872
+ src_BxS: torch.Tensor,
873
+ tgt_BxTxC: torch.Tensor,
874
+ src_positions: torch.Tensor | None = None,
875
+ tgt_positions: torch.Tensor | None = None,
876
+ enc_self_attn_mask: torch.Tensor | None = None,
877
+ dec_self_attn_mask: torch.Tensor | None = None,
878
+ dec_cross_attn_mask: torch.Tensor | None = None,
879
+ enable_dropout: bool = True,
880
+ ):
881
+ deterministic = not enable_dropout
882
+
883
+ # --- Encoder Pass ---
884
+ encoder_out = self.encoder(
885
+ x_ids=src_BxS,
886
+ src_positions=src_positions,
887
+ deterministic=deterministic,
888
+ attn_mask=enc_self_attn_mask,
889
+ )
890
+
891
+ # --- Decoder Pass ---
892
+ logits = self.decoder(  # Decoder.forward returns logits only
893
+ tgt_ids_BxTxC=tgt_BxTxC,
894
+ encoder_out=encoder_out,
895
+ tgt_positions=tgt_positions,
896
+ src_positions=src_positions,
897
+ deterministic=deterministic,
898
+ self_attn_mask=dec_self_attn_mask,
899
+ cross_attn_mask=dec_cross_attn_mask,
900
+ # NOTE: Decoder.forward also requires self_attention_cache and
+ # cross_attention_cache lists; callers of this full forward pass must
+ # construct and pass them (see the prefill path in Dia.generate).
901
+ )
902
+
903
+ return logits
dia/model.py ADDED
@@ -0,0 +1,956 @@
1
+ # dia/model.py
2
+
3
+ import os
4
+ import logging
5
+ import time
6
+ import dac # Keep this import name
7
+ import numpy as np
8
+ import torch
9
+ import torchaudio
10
+ from huggingface_hub import hf_hub_download
11
+ from safetensors.torch import load_file # <<< ADDED Import for safetensors
12
+
13
+ from .audio import audio_to_codebook, codebook_to_audio
14
+ from .config import (
15
+ DiaConfig,
16
+ ) # Assuming this is the Pydantic config for model structure
17
+ from .layers import DiaModel, KVCache # Assuming these are the nn.Module definitions
18
+
19
+ # --- Get a logger instance for this module ---
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Optional: Add a check after import to verify the library looks correct
23
+ # Note: We now expect 'utils' based on original code
24
+ if (
25
+ not hasattr(dac, "utils")
26
+ or not hasattr(dac.utils, "download")
27
+ or not hasattr(dac, "DAC")
28
+ ):
29
+ logger.warning(
30
+ "The imported 'dac' module does not appear to have the 'utils.download' structure expected by the original Dia code."
31
+ )
32
+ logger.warning(
33
+ "Ensure 'descript-audio-codec' is installed correctly (pip install descript-audio-codec)."
34
+ )
35
+ # If this check fails, _load_dac_model will likely raise an error later anyway.
36
+
37
+
38
+ def _sample_next_token(
39
+ logits_BCxV: torch.Tensor,
40
+ temperature: float,
41
+ top_p: float,
42
+ use_cfg_filter: bool,
43
+ cfg_filter_top_k: int | None = None,
44
+ ) -> torch.Tensor:
45
+ """Samples the next token based on logits, temperature, and top_p."""
46
+ if temperature == 0.0:
47
+ # Greedy sampling
48
+ return torch.argmax(logits_BCxV, dim=-1)
49
+
50
+ # Apply temperature scaling
51
+ logits_BCxV = logits_BCxV / temperature
52
+
53
+ # Apply CFG Top-K filtering (optional)
54
+ if use_cfg_filter and cfg_filter_top_k is not None:
55
+ # Get top K values and indices
56
+ _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=cfg_filter_top_k, dim=-1)
57
+ # Create a mask to keep only top K logits
58
+ mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
59
+ mask.scatter_(
60
+ dim=-1, index=top_k_indices_BCxV, value=False
61
+ ) # Set top K positions to False (don't mask)
62
+ # Mask out logits not in the top K
63
+ logits_BCxV = logits_BCxV.masked_fill(mask, -torch.inf)
64
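+ # Net effect: only the cfg_filter_top_k highest-scoring logits remain
+ # finite; everything else is excluded before top-p filtering below.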
+
65
+ # Apply Top-P (Nucleus) sampling
66
+ if top_p < 1.0:
67
+ # Convert logits to probabilities
68
+ probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
69
+ # Sort probabilities in descending order
70
+ sorted_probs_BCxV, sorted_indices_BCxV = torch.sort(
71
+ probs_BCxV, dim=-1, descending=True
72
+ )
73
+ # Calculate cumulative probabilities
74
+ cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)
75
+
76
+ # Create mask for tokens to remove (those exceeding top_p threshold)
77
+ sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p
78
+ # Shift the mask: keep the first token that crosses the threshold
79
+ sorted_indices_to_remove_BCxV[..., 1:] = sorted_indices_to_remove_BCxV[
80
+ ..., :-1
81
+ ].clone()
82
+ sorted_indices_to_remove_BCxV[..., 0] = 0 # Always keep the most probable token
83
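+ # Illustrative example (not in the original source): with sorted probs
+ # [0.5, 0.3, 0.15, 0.05] and top_p=0.8, cumulative sums are
+ # [0.5, 0.8, 0.95, 1.0]; after the shift, 0.5, 0.3 and 0.15 survive (the
+ # first token crossing the threshold is kept) and 0.05 is masked out.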
+
84
+ # Scatter the mask back to the original order
85
+ indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
86
+ indices_to_remove_BCxV.scatter_(
87
+ dim=-1, index=sorted_indices_BCxV, src=sorted_indices_to_remove_BCxV
88
+ )
89
+ # Apply the mask to the logits
90
+ logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)
91
+
92
+ # Calculate final probabilities after filtering
93
+ final_probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
94
+
95
+ # Sample from the filtered distribution
96
+ # multinomial expects probabilities for each item in the batch
97
+ sampled_indices_BC = torch.multinomial(
98
+ final_probs_BCxV, num_samples=1
99
+ ) # Shape [B*C, 1]
100
+ sampled_indices_C = sampled_indices_BC.squeeze(
101
+ -1
102
+ ) # Shape [B*C] -> should be [C] if input was [C,V]
103
+ return sampled_indices_C
104
+
105
+
106
+ class Dia:
107
+ """
108
+ Main class for the Dia Text-to-Speech model, handling loading and generation.
109
+ """
110
+
111
+ def __init__(self, config: DiaConfig, device: torch.device = torch.device("cuda")):
112
+ """
113
+ Initializes the Dia model structure based on the provided configuration.
114
+ Does not load weights here.
115
+
116
+ Args:
117
+ config: The DiaConfig object defining model parameters.
118
+ device: The torch device (e.g., 'cuda', 'cpu') the model should eventually run on.
119
+ Note: The model is instantiated but not moved to the device here.
120
+ """
121
+ super().__init__()
122
+ logger.info(
123
+ f"Initializing Dia model structure with config version: {config.version}"
124
+ )
125
+ self.config = config
126
+ # Store the target device, but don't move the model yet. Loading weights will handle device placement.
127
+ self.target_device = device
128
+ # Instantiate the underlying PyTorch model based on the config
129
+ self.model = DiaModel(config)
130
+ self.dac_model = None # DAC model will be loaded separately
131
+ logger.info("Dia model structure initialized.")
132
+
133
+ @classmethod
134
+ def load_model_from_files(
135
+ cls,
136
+ config_path: str,
137
+ weights_path: str,
138
+ device: torch.device = torch.device("cuda"),
139
+ ) -> "Dia":
140
+ """
141
+ Loads the Dia model from local configuration and weights files.
142
+ Handles both .pth and .safetensors weight formats.
143
+
144
+ Args:
145
+ config_path: Path to the configuration JSON file (e.g., 'config.json').
146
+ weights_path: Path to the model weights file (e.g., 'model.pth' or 'model.safetensors').
147
+ device: The torch device ('cuda', 'cpu', etc.) to load the model onto.
148
+
149
+ Returns:
150
+ An instance of the Dia model loaded with weights and set to eval mode.
151
+
152
+ Raises:
153
+ FileNotFoundError: If the config or weights file is not found.
154
+ ValueError: If the weights file format is unsupported.
155
+ RuntimeError: If there is an error loading the config, weights, or DAC model.
156
+ """
157
+ logger.info("Loading Dia model from local files:")
158
+ logger.info(f" Config: {config_path}")
159
+ logger.info(f" Weights: {weights_path}")
160
+ logger.info(f" Target Device: {device}")
161
+
162
+ # 1. Load Configuration
163
+ try:
164
+ config = DiaConfig.load(config_path)
165
+ if config is None:
166
+ # DiaConfig.load returns None on FileNotFoundError
167
+ logger.error(f"Configuration file not found at {config_path}")
168
+ raise FileNotFoundError(
169
+ f"Configuration file not found at {config_path}"
170
+ )
171
+ logger.info("Configuration loaded successfully.")
172
+ except Exception as e:
173
+ logger.error(
174
+ f"Error loading or validating configuration from {config_path}: {e}",
175
+ exc_info=True,
176
+ )
177
+ raise RuntimeError(
178
+ f"Failed to load configuration from {config_path}"
179
+ ) from e
180
+
181
+ # 2. Instantiate Model Structure
182
+ # Pass the target device during instantiation if the underlying DiaModel supports it,
183
+ # otherwise, we move it later. Assuming __init__ doesn't take device for now.
184
+ dia_instance = cls(
185
+ config, device
186
+ ) # Pass device mainly for storing target_device
187
+
188
+ # 3. Load Weights (State Dictionary)
189
+ try:
190
+ logger.info(f"Loading weights from: {weights_path}")
191
+ weights_filename = os.path.basename(weights_path)
192
+ state_dict = None
193
+
194
+ if weights_filename.endswith(".safetensors"):
195
+ logger.info(
196
+ "Detected .safetensors format. Loading using safetensors library."
197
+ )
198
+ # load_file loads directly to the specified device
199
+ state_dict = load_file(weights_path, device=str(device))
200
+ logger.info("Safetensors weights loaded.")
201
+ elif weights_filename.endswith(".pth"):
202
+ logger.info("Detected .pth format. Loading using torch.load.")
203
+ # torch.load needs map_location to load onto the correct device
204
+ state_dict = torch.load(weights_path, map_location=device)
205
+ logger.info("PyTorch weights (.pth) loaded.")
206
+ else:
207
+ logger.error(
208
+ f"Unsupported weights file format: {weights_filename}. Expected .pth or .safetensors."
209
+ )
210
+ raise ValueError(f"Unsupported weights file format: {weights_filename}")
211
+
212
+ # Load the state dictionary into the model structure
213
+ logger.info("Applying loaded weights to the model structure...")
214
+ # Use strict=True by default to catch mismatches. Can be set to False if needed for specific conversions (e.g., BF16 -> FP32 partial loads)
215
+ dia_instance.model.load_state_dict(state_dict, strict=True)
216
+ logger.info("Weights applied successfully.")
217
+
218
+ except FileNotFoundError:
219
+ logger.error(f"Weights file not found at {weights_path}")
220
+ raise FileNotFoundError(f"Weights file not found at {weights_path}")
221
+ except Exception as e:
222
+ logger.error(
223
+ f"Error loading weights from {weights_path}: {e}", exc_info=True
224
+ )
225
+ raise RuntimeError(f"Error loading weights from {weights_path}") from e
226
+
227
+ # 4. Move Model to Device and Set Eval Mode
228
+ logger.info(f"Moving model to device: {device}...")
229
+ dia_instance.model.to(device)
230
+ logger.info("Setting model to evaluation mode...")
231
+ dia_instance.model.eval()
232
+
233
+ # 5. Load Associated DAC Model
234
+ logger.info("Loading associated DAC model...")
235
+ dia_instance._load_dac_model() # This will log its own progress/errors
236
+
237
+ logger.info("Dia model fully loaded and ready.")
238
+ return dia_instance
239
+
240
+ # REMOVED from_pretrained - Responsibility moved to engine.py
241
+ # @classmethod
242
+ # def from_pretrained(...) -> "Dia": ...
243
+
244
+ def _load_dac_model(self):
245
+ """Loads the Descript Audio Codec (DAC) model using the original project's method."""
246
+ if self.dac_model is not None:
247
+ logger.info("DAC model already loaded.")
248
+ return
249
+
250
+ # Verify the imported module has the necessary structure expected by original code
251
+ if (
252
+ not hasattr(dac, "utils")
253
+ or not hasattr(dac.utils, "download")
254
+ or not hasattr(dac, "DAC")
255
+ ):
256
+ logger.error(
257
+ "Imported 'dac' module structure mismatch. Expected 'dac.utils.download()' and 'dac.DAC'."
258
+ )
259
+ logger.error(
260
+ "Ensure 'descript-audio-codec' is installed correctly via pip."
261
+ )
262
+ raise RuntimeError(
263
+ "Failed to load DAC model: required functions/structure missing from 'dac' module."
264
+ )
265
+
266
+ try:
267
+ # Use the original method found in the Dia repository
268
+ logger.info("Downloading/finding DAC model using dac.utils.download()...")
269
+ # This assumes dac.utils.download() handles caching internally
270
+ dac_model_path = dac.utils.download()
271
+ logger.info(f"DAC model path determined: {dac_model_path}")
272
+
273
+ logger.info("Loading DAC model from path...")
274
+ # Load DAC model and move it to the same device as the main Dia model
275
+ dac_model = dac.DAC.load(dac_model_path).to(self.target_device)
276
+ logger.info("DAC model loaded successfully.")
277
+
278
+ except AttributeError as ae:
279
+ logger.error(
280
+ f"AttributeError loading DAC model: '{ae}'. The installed 'descript-audio-codec' version might be incompatible with Dia's original code which expects 'dac.utils.download()'."
281
+ )
282
+ logger.error(
283
+ "Please check for specific version requirements of 'descript-audio-codec' for Dia, or potential installation issues."
284
+ )
285
+ raise RuntimeError(
286
+ "Failed to load DAC model due to incompatible library version or structure"
287
+ ) from ae
288
+ except Exception as e:
289
+ logger.error(f"General error loading DAC model: {e}", exc_info=True)
290
+ raise RuntimeError("Failed to load DAC model") from e
291
+
292
+ self.dac_model = dac_model
293
+
294
+ def _create_attn_mask(
295
+ self,
296
+ q_padding_mask_1d: torch.Tensor,
297
+ k_padding_mask_1d: torch.Tensor,
298
+ is_causal: bool = False,
299
+ ) -> torch.Tensor:
300
+ """
301
+ Creates the attention mask (self or cross) based on padding masks.
302
+ Mimics JAX segment ID logic where attention is allowed between non-padding tokens
303
+ OR between padding tokens, but not across the boundary.
304
+
305
+ Args:
306
+ q_padding_mask_1d: Boolean tensor [Batch, SeqLenQ] where True indicates non-padding.
307
+ k_padding_mask_1d: Boolean tensor [Batch, SeqLenK] where True indicates non-padding.
308
+ is_causal: If True, applies an additional causal mask (for decoder self-attention).
309
+
310
+ Returns:
311
+ Boolean attention mask tensor [Batch, 1, SeqLenQ, SeqLenK] ready for F.scaled_dot_product_attention.
312
+ """
313
+ B1, Tq = q_padding_mask_1d.shape
314
+ B2, Tk = k_padding_mask_1d.shape
315
+ if B1 != B2:
316
+ logger.warning(
317
+ f"Query ({B1}) and key ({B2}) batch dimensions do not match in _create_attn_mask"
318
+ )
319
+ assert B1 == B2, "Query and key batch dimensions must match"
320
+
321
+ # Expand masks for broadcasting: [B, Tq, 1] and [B, 1, Tk]
322
+ p_mask_q = q_padding_mask_1d.unsqueeze(2)
323
+ p_mask_k = k_padding_mask_1d.unsqueeze(1)
324
+
325
+ # True where a non-padding query token attends to a non-padding key token
326
+ non_pad_attends_non_pad = p_mask_q & p_mask_k # Shape [B, Tq, Tk]
327
+ # True where a padding query token attends to a padding key token
328
+ pad_attends_pad = (~p_mask_q) & (~p_mask_k) # Shape [B, Tq, Tk]
329
+
330
+ # Combine: Attention is allowed if tokens are both non-padding OR both padding.
331
+ mask = non_pad_attends_non_pad | pad_attends_pad # Shape [B, Tq, Tk]
332
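+ # Worked example (illustrative): with q_padding_mask [T, T, F] and
+ # k_padding_mask [T, F, F], query 0 (non-pad) may attend only to key 0,
+ # while query 2 (pad) may attend to keys 1 and 2 (the other pad positions).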
+
333
+ if is_causal:
334
+ # Apply causal mask for self-attention (query cannot attend to future keys)
335
+ if Tq != Tk:
336
+ logger.warning(f"Causal mask requested but Tq ({Tq}) != Tk ({Tk})")
337
+ assert (
338
+ Tq == Tk
339
+ ), "Causal mask requires query and key sequence lengths to be equal"
340
+ # Create lower triangular matrix (True allows attention)
341
+ causal_mask_2d = torch.tril(
342
+ torch.ones((Tq, Tk), dtype=torch.bool, device=self.target_device)
343
+ )
344
+ # Combine with padding compatibility mask
345
+ mask = mask & causal_mask_2d # Shape [B, Tq, Tk]
346
+
347
+ # Add head dimension for broadcasting: [B, 1, Tq, Tk]
348
+ return mask.unsqueeze(1)
349
+
350
+ def _prepare_text_input(
351
+ self, text: str
352
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
353
+ """
354
+ Encodes text prompt into byte tokens, pads to max length,
355
+ and creates position IDs and padding mask.
356
+
357
+ Args:
358
+ text: The input text string.
359
+
360
+ Returns:
361
+ Tuple containing:
362
+ - src_tokens: Padded token IDs [1, SeqLen].
363
+ - src_positions: Position IDs [1, SeqLen].
364
+ - src_padding_mask: Boolean mask (True=non-pad) [1, SeqLen].
365
+ - enc_self_attn_mask: Attention mask for encoder [1, 1, SeqLen, SeqLen].
366
+ """
367
+ text_pad_value = self.config.data.text_pad_value
368
+ max_len = self.config.data.text_length
369
+ logger.debug(
370
+ f"Preparing text input. Max length: {max_len}, Pad value: {text_pad_value}"
371
+ )
372
+ logger.debug(f"Original text (start): '{text[:100]}...'")
373
+
374
+ # Convert text to bytes and replace special speaker tokens
375
+ byte_text = text.encode("utf-8")
376
+ # Assuming Dia uses byte values 1 and 2 for S1/S2 based on original code context
377
+ replaced_bytes = byte_text.replace(b"[S1]", b"\x01").replace(b"[S2]", b"\x02")
378
+ text_tokens = list(replaced_bytes) # List of integer byte values
379
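+ # Illustrative example: "[S1] Hi" becomes b"\x01 Hi", i.e. byte token
+ # IDs [1, 32, 72, 105].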
+ logger.debug(
380
+ f"Text tokens after byte conversion (first 10): {text_tokens[:10]}"
381
+ )
382
+
383
+ # Pad or truncate sequence
384
+ current_len = len(text_tokens)
385
+ padding_needed = max_len - current_len
386
+ if padding_needed <= 0:
387
+ if current_len > max_len:
388
+ logger.warning(
389
+ f"Input text length ({current_len}) exceeds max length ({max_len}). Truncating."
390
+ )
391
+ text_tokens = text_tokens[:max_len]
392
+ padded_text_np = np.array(text_tokens, dtype=np.uint8)
393
+ else:
394
+ logger.debug(f"Padding text input with {padding_needed} pad tokens.")
395
+ padded_text_np = np.pad(
396
+ text_tokens,
397
+ (0, padding_needed),
398
+ mode="constant",
399
+ constant_values=text_pad_value,
400
+ ).astype(np.uint8)
401
+
402
+ # Convert to tensors and add batch dimension [1, SeqLen]
403
+ src_tokens = (
404
+ torch.from_numpy(padded_text_np)
405
+ .to(torch.long)
406
+ .to(self.target_device)
407
+ .unsqueeze(0)
408
+ )
409
+ src_positions = (
410
+ torch.arange(max_len, device=self.target_device).to(torch.long).unsqueeze(0)
411
+ )
412
+
413
+ # Create padding mask (True where token is NOT the pad value)
414
+ src_padding_mask = src_tokens != text_pad_value # Shape [1, SeqLen]
415
+
416
+ # Create attention mask for the encoder (non-causal self-attention)
417
+ # Needs shape [B, 1, Tq, Tk] -> [1, 1, SeqLen, SeqLen]
418
+ enc_self_attn_mask = self._create_attn_mask(
419
+ src_padding_mask, src_padding_mask, is_causal=False
420
+ )
421
+
422
+ logger.debug(f"Prepared src_tokens shape: {src_tokens.shape}")
423
+ logger.debug(f"Prepared src_positions shape: {src_positions.shape}")
424
+ logger.debug(
425
+ f"Prepared src_padding_mask shape: {src_padding_mask.shape} (True means non-padding)"
426
+ )
427
+ logger.debug(f"Prepared enc_self_attn_mask shape: {enc_self_attn_mask.shape}")
428
+
429
+ return src_tokens, src_positions, src_padding_mask, enc_self_attn_mask
430
+
431
+ @torch.inference_mode()
432
+ def generate(
433
+ self,
434
+ text: str,
435
+ max_tokens: int | None = None,
436
+ cfg_scale: float = 3.0,
437
+ temperature: float = 1.3,
438
+ top_p: float = 0.95,
439
+ use_cfg_filter: bool = True,
440
+ use_torch_compile: bool = False, # Default to False for broader compatibility
441
+ cfg_filter_top_k: int = 35,
442
+ audio_prompt_path: str | None = None,
443
+ ) -> np.ndarray:
444
+ """
445
+ Generates audio waveform from a text prompt, optionally conditioned on an audio prompt.
446
+
447
+ Args:
448
+ text: The input text string. For dialogue, use [S1]/[S2] markers.
449
+ For voice cloning, prepend the transcript of the audio prompt.
450
+ max_tokens: Maximum number of audio tokens (frames) to generate. Defaults to config value.
451
+ cfg_scale: Classifier-Free Guidance scale. Higher values increase adherence to text.
452
+ temperature: Sampling temperature. Higher values increase randomness.
453
+ top_p: Nucleus sampling probability. Filters vocabulary during sampling.
454
+ use_cfg_filter: Whether to apply Top-K filtering based on CFG logits.
455
+ use_torch_compile: If True, attempts to compile the decoder step for potential speedup.
456
+ cfg_filter_top_k: The 'K' value for CFG Top-K filtering.
457
+ audio_prompt_path: Path to an audio file (e.g., WAV, MP3) to use as a voice prompt/clone target.
458
+
459
+ Returns:
460
+ A 1D NumPy array containing the generated audio waveform (float32).
461
+ """
462
+ start_time_gen = time.time()
463
+ logger.info("Starting audio generation...")
464
+ logger.info(f" Text (start): '{text[:100]}...'")
465
+ logger.info(
466
+ f" Max tokens: {max_tokens if max_tokens is not None else 'Model Default'}"
467
+ )
468
+ logger.info(f" CFG Scale: {cfg_scale}")
469
+ logger.info(f" Temperature: {temperature}")
470
+ logger.info(f" Top P: {top_p}")
471
+ logger.info(f" Use CFG Filter: {use_cfg_filter}, Top K: {cfg_filter_top_k}")
472
+ logger.info(
473
+ f" Audio Prompt: {audio_prompt_path if audio_prompt_path else 'None'}"
474
+ )
475
+ logger.info(f" Use torch.compile: {use_torch_compile}")
476
+ logger.info(f" Target Device: {self.target_device}")
477
+
478
+ # --- Parameter Setup ---
479
+ num_channels = self.config.data.channels
480
+ audio_bos_value = self.config.data.audio_bos_value
481
+ audio_eos_value = self.config.data.audio_eos_value
482
+ audio_pad_value = self.config.data.audio_pad_value
483
+ delay_pattern = self.config.data.delay_pattern
484
+ # Use model's default audio length if max_tokens not provided
485
+ effective_max_tokens = (
486
+ max_tokens if max_tokens is not None else self.config.data.audio_length
487
+ )
488
+ logger.info(f" Effective max_tokens for generation: {effective_max_tokens}")
489
+
490
+ # Ensure delay pattern is usable
491
+ if not isinstance(delay_pattern, list) or not delay_pattern:
492
+ logger.warning("Delay pattern is invalid or empty. Using default [0].")
493
+ delay_pattern = [
494
+ 0
495
+ ] * num_channels # Fallback, though config should provide default
496
+
497
+ delay_tensor = torch.tensor(
498
+ delay_pattern, dtype=torch.long, device=self.target_device
499
+ )
500
+ max_delay_pattern = max(delay_pattern) if delay_pattern else 0
501
+ self.model.eval() # Ensure model is in eval mode
502
+
503
+ # --- Prepare Conditional and Unconditional Inputs ---
504
+ logger.info(
505
+ "Preparing text inputs for conditional and unconditional generation..."
506
+ )
507
+ (
508
+ cond_src_BxS,
509
+ cond_src_positions_BxS,
510
+ cond_src_padding_mask_BxS,
511
+ cond_enc_self_attn_mask_Bx1xSxS,
512
+ ) = self._prepare_text_input(text)
513
+
514
+ # Create unconditional input (batch of zeros representing padding)
515
+ # Assuming pad value 0 for text based on config default
516
+ unc_src_BxS = torch.full_like(
517
+ cond_src_BxS, fill_value=self.config.data.text_pad_value
518
+ )
519
+ # Batch conditional and unconditional inputs together [2, SeqLen]
520
+ src_BxS = torch.cat([unc_src_BxS, cond_src_BxS], dim=0)
521
+ # Expand other inputs to match batch size 2
522
+ src_positions_BxS = cond_src_positions_BxS.expand(2, -1)
523
+ src_padding_mask_BxS = torch.cat(
524
+ [
525
+ torch.zeros_like(cond_src_padding_mask_BxS[0:1]),
526
+ cond_src_padding_mask_BxS,
527
+ ],
528
+ dim=0,
529
+ ) # Uncond mask is all False (padding)
530
+ # Encoder mask needs to handle the batched input correctly
531
+ # For CFG, typically the unconditional branch attends to nothing useful from text,
532
+ # but the structure needs to be maintained. We can reuse the conditional mask structure,
533
+ # but the actual attention scores will be based on the zeroed-out unconditional input.
534
+ # Alternatively, create a specific mask for the unconditional part if needed.
535
+ # Let's expand the conditional mask for simplicity, assuming the model handles zero inputs appropriately.
536
+ enc_self_attn_mask_Bx1xSxS = cond_enc_self_attn_mask_Bx1xSxS.expand(
537
+ 2, -1, -1, -1
538
+ )
539
+ logger.info("Text inputs prepared (batch size 2 for CFG).")
540
+
541
+ # --- Encoder Pass ---
542
+ logger.info("Running encoder pass...")
543
+ start_time_enc = time.time()
544
+ # Potentially use autocast for mixed precision if supported and beneficial on device
545
+ # Example: with torch.autocast(device_type=self.target_device.type, dtype=torch.bfloat16 if self.target_device.type == 'cuda' else torch.float32):
546
+ encoder_out = self.model.encoder(
547
+ x_ids=src_BxS, # Shape [2, S]
548
+ src_positions=src_positions_BxS, # Shape [2, S]
549
+ deterministic=True, # No dropout during inference
550
+ attn_mask=enc_self_attn_mask_Bx1xSxS, # Shape [2, 1, S, S]
551
+ )
552
+ logger.info(
553
+ f"Encoder pass completed in {time.time() - start_time_enc:.3f}s. Output shape: {encoder_out.shape}"
554
+ ) # Shape: [2, S, E]
555
+
556
+ # --- Prepare Decoder Inputs & KV Cache ---
557
+ logger.info("Preparing decoder inputs and KV cache...")
558
+ start_time_kv = time.time()
559
+ # 3-1. Precompute Cross-Attention KV Cache (Static) from encoder output
560
+ # This cache is computed once and reused for every decoding step.
561
+ decoder_cross_attention_cache: list[KVCache] = (
562
+ self.model.decoder.precompute_cross_attention_kv(
563
+ effective_max_tokens, encoder_out, src_positions_BxS
564
+ )
565
+ )
566
+ logger.debug(
567
+ f"Precomputed cross-attention KV cache for {len(decoder_cross_attention_cache)} layers."
568
+ )
569
+
570
+ # 3-2. Initialize Self-Attention KV Cache (Dynamic, grows with each step)
571
+ decoder_self_attention_cache: list[KVCache] = []
572
+ for i in range(self.model.decoder.num_layers):
573
+ decoder_self_attention_cache.append(
574
+ KVCache(
575
+ self.config.model.decoder.gqa_query_heads,
576
+ effective_max_tokens, # Max length the cache can hold
577
+ self.config.model.decoder.gqa_head_dim,
578
+ self.target_device, # Cache tensors should be on the target device
579
+ )
580
+ )
581
+ logger.debug(
582
+ f"Initialized self-attention KV cache for {len(decoder_self_attention_cache)} layers."
583
+ )
584
+ logger.info(
585
+ f"KV cache preparation completed in {time.time() - start_time_kv:.3f}s."
586
+ )
587
+
588
+ # 3-3. Initialize Decoder Start Tokens (BOS)
589
+ # Shape [2, 1, C] (Batch=2 for cond/uncond, T=1 for first step, C=channels)
590
+ generated_tokens_history = torch.full(
591
+ (2, 1, num_channels),
592
+ fill_value=audio_bos_value,
593
+ dtype=torch.long,
594
+ device=self.target_device,
595
+ )
596
+ logger.debug(f"Initial decoder input (BOS): {generated_tokens_history.shape}")
597
+
598
+ current_step_index = (
599
+ 0 # Index of the step we are currently generating (starts at 0)
600
+ )
601
+ prompt_len_inc_bos = 1 # Length of the initial prompt (just BOS initially)
602
+
603
+ # 3-4. Handle Audio Prompt (Prefill KV Cache)
604
+ if audio_prompt_path is not None:
605
+ logger.info("Processing audio prompt for prefilling...")
606
+ start_time_prompt = time.time()
607
+ try:
608
+ # Load and potentially resample audio
609
+ audio_prompt_waveform, sr = torchaudio.load(audio_prompt_path)
610
+ logger.debug(
611
+ f"Loaded audio prompt: {audio_prompt_waveform.shape}, Sample Rate: {sr}"
612
+ )
613
+ if sr != 44100:
614
+ logger.info(f"Resampling audio prompt from {sr}Hz to 44100Hz")
615
+ audio_prompt_waveform = torchaudio.functional.resample(
616
+ audio_prompt_waveform, sr, 44100
617
+ )
618
+ # Ensure correct shape [B, C, T_audio] and device
619
+ # Assuming DAC expects channels first, add batch dim
620
+ if audio_prompt_waveform.ndim == 1: # Mono
621
+ audio_prompt_waveform = audio_prompt_waveform.unsqueeze(
622
+ 0
623
+ ) # Add channel dim
624
+ audio_prompt_waveform = audio_prompt_waveform.unsqueeze(0).to(
625
+ self.target_device
626
+ ) # Add batch dim
627
+
628
+ # Encode audio prompt to codes using DAC
629
+ logger.info("Encoding audio prompt to codes using DAC...")
630
+ if self.dac_model is None:
631
+ raise RuntimeError(
632
+ "DAC model not loaded, required for audio prompt."
633
+ )
634
+ # audio_to_codebook returns [B, T_codes, C]
635
+ audio_prompt_codes = audio_to_codebook(
636
+ self.dac_model, audio_prompt_waveform, data_config=self.config.data
637
+ ) # Shape [1, T_codes, C]
638
+ logger.info(
639
+ f"Encoded audio prompt to codes: {audio_prompt_codes.shape}"
640
+ )
641
+
642
+ # Concatenate BOS tokens with prompt codes
643
+ # Expand prompt codes to batch size 2 (for cond/uncond)
644
+ generated_tokens_history = torch.cat(
645
+ [generated_tokens_history, audio_prompt_codes.expand(2, -1, -1)],
646
+ dim=1,
647
+ ) # Shape [2, 1 + T_codes, C]
648
+ logger.debug(
649
+ f"Decoder input history after prompt concatenation: {generated_tokens_history.shape}"
650
+ )
651
+
652
+ prefill_len = generated_tokens_history.shape[
653
+ 1
654
+ ] # Length including BOS + prompt
655
+ prompt_len_inc_bos = prefill_len
656
+ logger.info(f"Prefilling KV cache with length {prefill_len}...")
657
+
658
+ # Prepare inputs for prefill forward pass
659
+ prefill_tgt_pos = (
660
+ torch.arange(prefill_len, device=self.target_device)
661
+ .unsqueeze(0)
662
+ .expand(2, -1)
663
+ ) # Shape [2, T_prefill]
664
+ # Padding mask based on actual tokens (BOS and prompt codes are not PAD)
665
+ # Shape [2, T_prefill] (True where not PAD)
666
+ prefill_tgt_padding_mask = (
667
+ generated_tokens_history != audio_pad_value
668
+ ).any(dim=2)
669
+
670
+ # Create attention masks for prefill
671
+ # Shape [2, 1, T_prefill, T_prefill]
672
+ prefill_self_attn_mask = self._create_attn_mask(
673
+ prefill_tgt_padding_mask,
674
+ prefill_tgt_padding_mask,
675
+ is_causal=True,
676
+ )
677
+ # Shape [2, 1, T_prefill, S]
678
+ prefill_cross_attn_mask = self._create_attn_mask(
679
+ prefill_tgt_padding_mask,
680
+ src_padding_mask_BxS,
681
+ is_causal=False,
682
+ )
683
+
684
+ # Run forward pass through decoder to fill the self-attention KV cache
685
+ # We discard the logits from prefill
686
+ _ = self.model.decoder.forward(
687
+ tgt_ids_BxTxC=generated_tokens_history, # Pass the full history [2, T_prefill, C]
688
+ encoder_out=encoder_out,
689
+ tgt_positions=prefill_tgt_pos,
690
+ src_positions=src_positions_BxS,
691
+ deterministic=True,
692
+ self_attn_mask=prefill_self_attn_mask,
693
+ cross_attn_mask=prefill_cross_attn_mask,
694
+ self_attention_cache=decoder_self_attention_cache, # Pass cache to be filled
695
+ cross_attention_cache=decoder_cross_attention_cache, # Pass precomputed cache
696
+ # prefill=True # Pass prefill flag if decoder layer uses it
697
+ )
698
+
699
+ # Update the current step index. The next token to generate is at index prefill_len.
700
+ current_step_index = prefill_len
701
+ logger.info(
702
+ f"KV cache prefilled in {time.time() - start_time_prompt:.3f}s. Next step index: {current_step_index}"
703
+ )
704
+
705
+ except Exception as e:
706
+ logger.error(f"Error processing audio prompt: {e}", exc_info=True)
707
+ raise RuntimeError("Failed to process audio prompt") from e
708
+
709
+ # --- Autoregressive Generation Loop ---
710
+ logger.info("Starting autoregressive generation loop...")
711
+ start_time_loop = time.time()
712
+
713
+ eos_detected_channel_0 = False
714
+ eos_countdown = -1 # Countdown after EOS detected in channel 0
715
+ extra_steps_after_eos = (
716
+ 30 # Generate a few extra steps for delay pattern completion
717
+ )
718
+
719
+ # Pre-allocate tensor for storing *newly* generated tokens for efficiency
720
+ # We already have the prompt in generated_tokens_history
721
+ num_steps_to_generate = effective_max_tokens
722
+ newly_generated_tokens = torch.full(
723
+ (2, num_steps_to_generate, num_channels),
724
+ fill_value=audio_pad_value, # Fill with pad initially
725
+ dtype=torch.long,
726
+ device=self.target_device,
727
+ )
728
+ logger.debug(
729
+ f"Allocated tensor for newly generated tokens: {newly_generated_tokens.shape}"
730
+ )
731
+
732
+ # --- Compile decode_step if requested ---
733
+ decode_step_fn = self.model.decoder.decode_step
734
+ if use_torch_compile:
735
+ logger.info("Compiling decoder step function with torch.compile...")
736
+ try:
737
+ # Experiment with modes: "default", "reduce-overhead", "max-autotune"
738
+ decode_step_fn = torch.compile(decode_step_fn, mode="reduce-overhead")
739
+ logger.info("Decoder step function compiled.")
740
+ except Exception as e:
741
+ logger.warning(
742
+ f"torch.compile failed: {e}. Using eager mode.", exc_info=True
743
+ )
744
+
745
+ # --- Prepare static cross-attention mask for single-step decoding ---
746
+ # Query mask is always [B, 1] (True, as generated tokens are not PAD)
747
+ step_tgt_padding_mask = torch.ones(
748
+ (2, 1), dtype=torch.bool, device=self.target_device
749
+ )
750
+ # Shape [2, 1, 1, S]
751
+ step_decoder_cross_attn_mask = self._create_attn_mask(
752
+ step_tgt_padding_mask,
753
+ src_padding_mask_BxS,
754
+ is_causal=False,
755
+ )
756
+
757
+ # --- Generation Loop ---
758
+ steps_taken = 0
759
+ for step_offset in range(num_steps_to_generate):
760
+ # Absolute step index considering prompt length
761
+ current_absolute_step = current_step_index + step_offset
762
+
763
+ # Get the token IDs for the *previous* step to predict the current one
764
+ # Shape [2, 1, C]
765
+ # If step_offset is 0, use the last token from the prompt history
766
+ if step_offset == 0:
767
+ input_token_ids = generated_tokens_history[:, -1, :].unsqueeze(1)
768
+ else:
769
+ # Use the token generated in the previous iteration of this loop
770
+ input_token_ids = newly_generated_tokens[
771
+ :, step_offset - 1, :
772
+ ].unsqueeze(1)
773
+
774
+ # Position ID for the current absolute step
775
+ # Shape [2, 1]
776
+ tgt_pos_Bx1 = torch.full(
777
+ (2, 1),
778
+ fill_value=current_absolute_step,
779
+ dtype=torch.long,
780
+ device=self.target_device,
781
+ )
782
+
783
+ # --- Call Decoder Step ---
784
+ # self_attn_mask is None because KV cache handles causality implicitly in single-step decoding
785
+ logits_Bx1xCxV, new_self_kv_cache_list = decode_step_fn(
786
+ tgt_ids_Bx1xC=input_token_ids,
787
+ tgt_pos_Bx1=tgt_pos_Bx1,
788
+ encoder_out=encoder_out,
789
+ self_attn_mask=None,
790
+ cross_attn_mask=step_decoder_cross_attn_mask,
791
+ self_attention_cache=decoder_self_attention_cache,
792
+ cross_attention_cache=decoder_cross_attention_cache,
793
+ ) # Logits shape: [2, 1, C, V]
794
+
795
+ # --- Update Self-Attention KV Cache ---
796
+ for i, layer_cache in enumerate(decoder_self_attention_cache):
797
+ if (
798
+ new_self_kv_cache_list
799
+ and i < len(new_self_kv_cache_list)
800
+ and new_self_kv_cache_list[i] is not None
801
+ ):
802
+ # new_self_kv_cache_list[i] is a tuple (k_tensor, v_tensor) for the current step
803
+ # k_tensor shape: [2, NumHeads, 1, HeadDim]
804
+ # v_tensor shape: [2, NumHeads, 1, HeadDim]
805
+ layer_cache.update_cache(
806
+ new_self_kv_cache_list[i][0], new_self_kv_cache_list[i][1]
807
+ )
808
+ else:
809
+ logger.warning(
810
+ f"Missing KV cache update for layer {i} at step {current_absolute_step}"
811
+ )
812
+
813
+ # --- Sampling ---
814
+ V = self.config.model.tgt_vocab_size
815
+ # Get logits for the generated step [2, C, V]
816
+ logits_last_BxCxV = logits_Bx1xCxV.squeeze(1)
817
+
818
+ # Separate conditional and unconditional logits
819
+ uncond_logits_CxV = logits_last_BxCxV[0, :, :] # Shape [C, V]
820
+ cond_logits_CxV = logits_last_BxCxV[1, :, :] # Shape [C, V]
821
+
822
+ # Apply Classifier-Free Guidance (CFG)
823
+ cfg_logits_CxV = cond_logits_CxV + cfg_scale * (
824
+ cond_logits_CxV - uncond_logits_CxV
825
+ ) # Shape [C, V]
826
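+ # Note: cond + cfg_scale * (cond - uncond) == uncond + (1 + cfg_scale) * (cond - uncond):
+ # the logits are extrapolated past the unconditional prediction in the
+ # direction suggested by the text; cfg_scale = 0 recovers the plain
+ # conditional logits.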
+
827
+ # --- Prevent sampling PAD/EOS/BOS tokens inappropriately ---
828
+ logits_for_sampling_CxV = (
829
+ cfg_logits_CxV.clone()
830
+ ) # Clone to avoid modifying original logits
831
+ logits_for_sampling_CxV[:, audio_pad_value] = -torch.inf # Never sample PAD
832
+ logits_for_sampling_CxV[:, audio_bos_value] = (
833
+ -torch.inf
834
+ ) # Never sample BOS after start
835
+ # Allow EOS only if not already detected or in countdown
836
+ if eos_detected_channel_0 and eos_countdown <= 0:
837
+ logits_for_sampling_CxV[:, audio_eos_value] = -torch.inf
838
+
839
+ # --- Sample the next token for each channel ---
840
+ pred_C = _sample_next_token(
841
+ logits_for_sampling_CxV.float(), # Ensure float32 for sampling stability
842
+ temperature=temperature,
843
+ top_p=top_p,
844
+ use_cfg_filter=use_cfg_filter,
845
+ cfg_filter_top_k=cfg_filter_top_k,
846
+ ) # Shape [C]
847
+
848
+ # --- Handle Delay Pattern (Only if no audio prompt was given) ---
849
+ # If there's no prompt, the first few tokens should be BOS according to delay
850
+ # generation_step_index is how many tokens generated *after* prompt/initial BOS
851
+ generation_step_index = step_offset
852
+ if audio_prompt_path is None:
853
+ is_before_delay = generation_step_index < delay_tensor # Shape [C]
854
+ pred_C = torch.where(
855
+ is_before_delay,
856
+ torch.tensor(
857
+ audio_bos_value, device=self.target_device, dtype=torch.long
858
+ ),
859
+ pred_C,
860
+ )
861
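+ # Rationale (inferred from the decoding path): channel c is pinned to BOS
+ # for its first delay_pattern[c] steps so that, when codebook_to_audio
+ # later reverses the delay pattern, all channels align on the same frame.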
+
862
+ # --- Store the predicted token in the newly_generated_tokens tensor ---
863
+ newly_generated_tokens[:, step_offset, :] = pred_C.unsqueeze(0).expand(
864
+ 2, -1
865
+ )
866
+
867
+ steps_taken += 1 # Increment steps taken in this loop
868
+
869
+ # --- EOS Handling ---
870
+ if not eos_detected_channel_0 and pred_C[0] == audio_eos_value:
871
+ logger.info(
872
+ f"EOS token detected in channel 0 at step {current_absolute_step}. Starting countdown."
873
+ )
874
+ eos_detected_channel_0 = True
875
+ eos_countdown = extra_steps_after_eos
876
+
877
+ if eos_countdown > 0:
878
+ step_after_eos = extra_steps_after_eos - eos_countdown
879
+ logger.debug(
880
+ f"EOS countdown: {eos_countdown}, Step after EOS: {step_after_eos}"
881
+ )
882
+ # Modify the token *just generated* if needed for EOS/PAD forcing
883
+ current_new_tokens = newly_generated_tokens[
884
+ :, step_offset, :
885
+ ] # Shape [2, C]
886
+ for i, d in enumerate(delay_pattern):
887
+ if step_after_eos == d:
888
+ logger.debug(
889
+ f" Forcing EOS in channel {i} at step {current_absolute_step}"
890
+ )
891
+ current_new_tokens[:, i] = audio_eos_value
892
+ elif step_after_eos > d:
893
+ logger.debug(
894
+ f" Forcing PAD in channel {i} at step {current_absolute_step}"
895
+ )
896
+ current_new_tokens[:, i] = audio_pad_value
897
+ # Put the potentially modified tokens back
898
+ newly_generated_tokens[:, step_offset, :] = current_new_tokens
899
+
900
+ eos_countdown -= 1
901
+ if eos_countdown == 0:
902
+ logger.info(
903
+ f"EOS countdown finished at step {current_absolute_step}. Stopping generation."
904
+ )
905
+ break # Stop generation loop
906
+
907
+ # Check if we reached the max *new* tokens requested
908
+ if steps_taken >= num_steps_to_generate:
909
+ logger.info(
910
+ f"Reached max generation steps ({num_steps_to_generate}). Stopping."
911
+ )
912
+ break
913
+
914
+ logger.info(
915
+ f"Autoregressive loop finished after {steps_taken} steps in {time.time() - start_time_loop:.3f}s."
916
+ )
917
+
918
+ # --- Extract Generated Codes ---
919
+ # Get the conditional generation result (index 1) from the *newly* generated tokens
920
+ # Only take the number of steps actually taken
921
+ final_new_codes = newly_generated_tokens[
922
+ 1, :steps_taken, :
923
+ ] # Shape [T_generated, C]
924
+ logger.info(f"Extracted newly generated codes shape: {final_new_codes.shape}")
925
+
926
+ # --- Convert Codes to Audio using DAC ---
927
+ logger.info("Converting generated codes to audio using DAC...")
928
+ start_time_decode = time.time()
929
+ if self.dac_model is None:
930
+ raise RuntimeError("DAC model not loaded, required for audio decoding.")
931
+
932
+ # codebook_to_audio expects codes shape [C, T]
933
+ generated_codes_CxT = final_new_codes.transpose(0, 1) # Shape [C, T_generated]
934
+
935
+ if generated_codes_CxT.numel() == 0:
936
+ logger.warning("No new codes were generated. Returning empty audio.")
937
+ return np.array([], dtype=np.float32)
938
+
939
+ # Call the decoding function (handles delay reversal and DAC decoding)
940
+ audio_waveform = codebook_to_audio(
941
+ generated_codes_CxT,
942
+ self.dac_model,
943
+ delay_pattern,
944
+ B=1, # Batch size for decoding is 1
945
+ T=generated_codes_CxT.shape[1], # Pass the actual length of generated codes
946
+ C=num_channels,
947
+ ) # Returns shape [1, T_audio] or [T_audio]
948
+
949
+ # Ensure output is a 1D numpy array on CPU
950
+ final_audio_np = audio_waveform.squeeze().cpu().numpy()
951
+ logger.info(
952
+ f"Audio decoding completed in {time.time() - start_time_decode:.3f}s. Output shape: {final_audio_np.shape}"
953
+ )
954
+ logger.info(f"Total generation time: {time.time() - start_time_gen:.3f}s")
955
+
956
+ return final_audio_np
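+
+ # Example usage (sketch; file paths are placeholders, not shipped defaults):
+ # model = Dia.load_model_from_files("config.json", "model.safetensors",
+ #                                   device=torch.device("cuda"))
+ # audio = model.generate("[S1] Hello! [S2] Hi there. (laughs)")  # 1D float32 waveform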
docker-compose.yml ADDED
@@ -0,0 +1,23 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ dia-tts-server:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ ports:
9
+ - "${PORT:-8003}:${PORT:-8003}"
10
+ volumes:
11
+ - ./model_cache:/app/model_cache
12
+ - ./reference_audio:/app/reference_audio
13
+ - ./outputs:/app/outputs
14
+ deploy:
15
+ resources:
16
+ reservations:
17
+ devices:
18
+ - driver: nvidia
19
+ count: 1
20
+ capabilities: [gpu]
21
+ restart: unless-stopped
22
+ env_file:
23
+ - .env
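+
+ # Assuming Docker Engine with the NVIDIA Container Toolkit is installed,
+ # the stack can be built and started with: docker compose up --build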
documentation.md ADDED
@@ -0,0 +1,549 @@
1
+ # Dia TTS Server - Technical Documentation
2
+
3
+ **Version:** 1.0.0
4
+ **Date:** 2025-04-22
5
+
6
+ **Table of Contents:**
7
+
8
+ 1. [Overview](#1-overview)
9
+ 2. [Visual Overview](#2-visual-overview)
10
+ * [Directory Structure](#21-directory-structure)
11
+ * [Component Diagram](#22-component-diagram)
12
+ 3. [System Prerequisites](#3-system-prerequisites)
13
+ 4. [Installation and Setup](#4-installation-and-setup)
14
+ * [Cloning the Repository](#41-cloning-the-repository)
15
+ * [Setting up Python Virtual Environment](#42-setting-up-python-virtual-environment)
16
+ * [Windows Setup](#421-windows-setup)
17
+ * [Linux Setup (Debian/Ubuntu Example)](#422-linux-setup-debianubuntu-example)
18
+ * [Installing Dependencies](#43-installing-dependencies)
19
+ * [NVIDIA Driver and CUDA Setup (Required for GPU Acceleration)](#44-nvidia-driver-and-cuda-setup-required-for-gpu-acceleration)
20
+ * [Step 1: Check/Install NVIDIA Drivers](#441-step-1-checkinstall-nvidia-drivers)
21
+ * [Step 2: Install PyTorch with CUDA Support](#442-step-2-install-pytorch-with-cuda-support)
22
+ * [Step 3: Verify PyTorch CUDA Installation](#443-step-3-verify-pytorch-cuda-installation)
23
+ 5. [Configuration](#5-configuration)
24
+ * [Configuration Files (`.env` and `config.py`)](#51-configuration-files-env-and-configpy)
25
+ * [Configuration Parameters](#52-configuration-parameters)
26
+ 6. [Running the Server](#6-running-the-server)
27
+ 7. [Usage](#7-usage)
28
+ * [Web User Interface (Web UI)](#71-web-user-interface-web-ui)
29
+ * [Main Generation Form](#711-main-generation-form)
30
+ * [Presets](#712-presets)
31
+ * [Voice Cloning](#713-voice-cloning)
32
+ * [Generation Parameters](#714-generation-parameters)
33
+ * [Server Configuration (UI)](#715-server-configuration-ui)
34
+ * [Generated Audio Player](#716-generated-audio-player)
35
+ * [Theme Toggle](#717-theme-toggle)
36
+ * [API Endpoints](#72-api-endpoints)
37
+ * [POST /v1/audio/speech (OpenAI Compatible)](#721-post-v1audiospeech-openai-compatible)
38
+ * [POST /tts (Custom Parameters)](#722-post-tts-custom-parameters)
39
+ * [Configuration & Helper Endpoints](#723-configuration--helper-endpoints)
40
+ 8. [Troubleshooting](#8-troubleshooting)
41
+ 9. [Project Architecture](#9-project-architecture)
42
+ 10. [License and Disclaimer](#10-license-and-disclaimer)
43
+
44
+ ---
45
+
46
+ ## 1. Overview
47
+
48
+ The Dia TTS Server provides a backend service and web interface for generating high-fidelity speech, including dialogue with multiple speakers and non-verbal sounds, using the Dia text-to-speech model family (originally from Nari Labs, with support for community conversions like SafeTensors).
49
+
50
+ This server is built using the FastAPI framework and offers both a RESTful API (including an OpenAI-compatible endpoint) and an interactive web UI powered by Jinja2, Tailwind CSS, and JavaScript. It supports voice cloning via audio prompts and allows configuration of various generation parameters.
51
+
52
+ **Key Features:**
53
+
54
+ * **High-Quality TTS:** Leverages the Dia model for realistic speech synthesis.
55
+ * **Dialogue Generation:** Supports `[S1]` and `[S2]` tags for multi-speaker dialogue.
56
+ * **Non-Verbal Sounds:** Can generate sounds like `(laughs)`, `(sighs)`, etc., when included in the text.
57
+ * **Voice Cloning:** Allows conditioning the output voice on a provided reference audio file.
58
+ * **Flexible Model Loading:** Supports loading models from Hugging Face repositories, including both `.pth` and `.safetensors` formats (defaults to BF16 SafeTensors for efficiency).
59
+ * **API Access:** Provides a custom API endpoint (`/tts`) and an OpenAI-compatible endpoint (`/v1/audio/speech`); see the request sketch after this list.
60
+ * **Web Interface:** Offers an easy-to-use UI for text input, parameter adjustment, preset loading, reference audio management, and audio playback.
61
+ * **Configuration:** Server settings, model sources, paths, and default generation parameters are configurable via an `.env` file.
62
+ * **GPU Acceleration:** Utilizes NVIDIA GPUs via CUDA for significantly faster inference when available, falling back to CPU otherwise.
63
+
64
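+ As a quick orientation, a call to the OpenAI-compatible endpoint might look like the sketch below. This is illustrative only: the exact request schema is defined in `models.py`, and the `input` and `response_format` fields shown here are assumed from the OpenAI speech API shape rather than verified against it.
+
+ ```bash
+ curl -X POST http://localhost:8003/v1/audio/speech \
+   -H "Content-Type: application/json" \
+   -d '{"input": "[S1] Hello there! [S2] Hi! (laughs)", "response_format": "wav"}' \
+   --output speech.wav
+ ```
+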
+ ---
65
+
66
+ ## 2. Visual Overview
67
+
68
+ ### 2.1 Directory Structure
69
+
70
+ ```
71
+ dia-tts-server/
72
+
73
+ ├── .env # Local configuration overrides (user-created)
74
+ ├── config.py # Default configuration and management class
75
+ ├── engine.py # Core model loading and generation logic
76
+ ├── models.py # Pydantic models for API requests
77
+ ├── requirements.txt # Python dependencies
78
+ ├── server.py # Main FastAPI application, API endpoints, UI routes
79
+ ├── utils.py # Utility functions (audio encoding, saving, etc.)
80
+
81
+ ├── dia/ # Core Dia model implementation package
82
+ │ ├── __init__.py
83
+ │ ├── audio.py # Audio processing helpers (delay, codebook conversion)
84
+ │ ├── config.py # Pydantic models for Dia model architecture config
85
+ │ ├── layers.py # Custom PyTorch layers for the Dia model
86
+ │ └── model.py # Dia model class wrapper (loading, generation)
87
+
88
+ ├── static/ # Static assets (e.g., favicon.ico)
89
+ │ └── favicon.ico
90
+
91
+ ├── ui/ # Web User Interface files
92
+ │ ├── index.html # Main HTML template (Jinja2)
93
+ │ ├── presets.yaml # Predefined UI examples
94
+ │ ├── script.js # Frontend JavaScript logic
95
+ │ └── style.css # Frontend CSS styling (Tailwind via CDN/build)
96
+
97
+ ├── model_cache/ # Default directory for downloaded model files (configurable)
98
+ ├── outputs/ # Default directory for saved audio output (configurable)
99
+ └── reference_audio/ # Default directory for voice cloning reference files (configurable)
100
+ ```
101
+
102
+ ### 2.2 Component Diagram
103
+
104
+ ```
105
+ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐
106
+ │ User (Web UI / │────→ │ FastAPI Server │────→ │ TTS Engine │────→ │ Dia Model Wrapper │
107
+ │ API Client) │ │ (server.py) │ │ (engine.py) │ │ (dia/model.py) │
108
+ └───────────────────┘ └─────────┬─────────┘ └─────────┬─────────┘ └─────────┬─────────┘
109
+ │ │ │
110
+ │ Uses │ Uses │ Uses
111
+ ▼ ▼ ▼
112
+ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐
113
+ │ Configuration │ ←─── │ .env File │ │ Dia Model Layers │
114
+ │ (config.py) │ └───────────────────┘ │ (dia/layers.py) │
115
+ └───────────────────┘ └───────────────────┘
116
+ │ │ Uses
117
+ │ Uses │
118
+ ▼ │
119
+ ┌───────────────────┐ │ Uses
120
+ │ Utilities │ ▼
121
+ │ (utils.py) │ ┌───────────────────┐
122
+ └───────────────────┘ │ PyTorch / CUDA │
123
+ ▲ └───────────────────┘
124
+ │ Uses │ Uses
125
+ │ ▼
126
+ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐
127
+ │ Web UI Files │ ←─── │ Jinja2 Templates │ │ DAC Model │
128
+ │ (ui/) │ └───────────────────┘ │ (descript-audio..)│
129
+ └───────────────────┘ ▲ └───────────────────┘
130
+ │ Renders ▲
131
+ │ │ Uses
132
+ └────────────────────────────────────────────────┘
133
+ ```
134
+
135
+ **Diagram Legend:**
136
+
137
+ * Boxes represent major components or file groups.
138
+ * Arrows (`→`) indicate primary data flow or control flow.
139
+ * Lines with "Uses" indicate dependencies or function calls.
140
+
141
+ ---
142
+
143
+ ## 3. System Prerequisites
144
+
145
+ Before installing and running the Dia TTS Server, ensure your system meets the following requirements:
146
+
147
+ * **Operating System:**
148
+ * Windows 10/11 (64-bit)
149
+ * Linux (Debian/Ubuntu recommended, other distributions may require adjustments)
150
+ * **Python:** Python 3.10 or later (3.10.x recommended). Ensure Python and Pip are added to your system's PATH.
151
+ * **Version Control:** Git (for cloning the repository).
152
+ * **Internet Connection:** Required for downloading dependencies and model files.
153
+ * **(Optional but Highly Recommended for Performance):**
154
+ * **NVIDIA GPU:** A CUDA-compatible NVIDIA GPU (Maxwell architecture or newer). Check compatibility [here](https://developer.nvidia.com/cuda-gpus). Sufficient VRAM is needed (BF16 model requires ~5-6GB, full precision ~10GB).
155
+ * **NVIDIA Drivers:** Latest appropriate drivers for your GPU and OS.
156
+ * **CUDA Toolkit:** Version compatible with the chosen PyTorch build (e.g., 11.8, 12.1). See [Section 4.4](#44-nvidia-driver-and-cuda-setup-required-for-gpu-acceleration).
157
+ * **(Linux System Libraries):**
158
+ * `libsndfile1`: Required by the `soundfile` Python library for audio I/O. Install using your package manager (e.g., `sudo apt install libsndfile1` on Debian/Ubuntu).
159
+
160
+ ---
161
+
162
+ ## 4. Installation and Setup
163
+
164
+ Follow these steps to set up the project environment and install necessary dependencies.
165
+
166
+ ### 4.1 Cloning the Repository
167
+
168
+ Open your terminal or command prompt and navigate to the directory where you want to store the project. Then, clone the repository:
169
+
170
+ ```bash
171
+ git clone https://github.com/devnen/dia-tts-server.git # Replace with the actual repo URL if different
172
+ cd dia-tts-server
173
+ ```
174
+
175
+ ### 4.2 Setting up Python Virtual Environment
176
+
177
+ Using a virtual environment is strongly recommended to isolate project dependencies.
178
+
179
+ #### 4.2.1 Windows Setup
180
+
181
+ 1. **Open PowerShell or Command Prompt** in the project directory (`dia-tts-server`).
182
+ 2. **Create the virtual environment:**
183
+ ```powershell
184
+ python -m venv venv
185
+ ```
186
+ 3. **Activate the virtual environment:**
187
+ ```powershell
188
+ .\venv\Scripts\activate
189
+ ```
190
+ Your terminal prompt should now be prefixed with `(venv)`.
191
+
192
+ #### 4.2.2 Linux Setup (Debian/Ubuntu Example)
193
+
194
+ 1. **Install prerequisites (if not already present):**
195
+ ```bash
196
+ sudo apt update
197
+ sudo apt install python3 python3-venv python3-pip libsndfile1 -y
198
+ ```
199
+ 2. **Open your terminal** in the project directory (`dia-tts-server`).
200
+ 3. **Create the virtual environment:**
201
+ ```bash
202
+ python3 -m venv venv
203
+ ```
204
+ 4. **Activate the virtual environment:**
205
+ ```bash
206
+ source venv/bin/activate
207
+ ```
208
+ Your terminal prompt should now be prefixed with `(venv)`.
209
+
210
+ ### 4.3 Installing Dependencies
211
+
212
+ With your virtual environment activated (`(venv)` prefix visible), install the required Python packages:
213
+
214
+ ```bash
215
+ # Upgrade pip first (optional but good practice)
216
+ pip install --upgrade pip
217
+
218
+ # Install all dependencies from requirements.txt
219
+ pip install -r requirements.txt
220
+ ```
221
+
222
+ **Note:** Depending on your platform, this command may install a CPU-only build of PyTorch (the default wheel on Windows). If you have a compatible NVIDIA GPU and want acceleration, proceed to [Section 4.4](#44-nvidia-driver-and-cuda-setup-required-for-gpu-acceleration) **before** running the server.
223
+
224
+ ### 4.4 NVIDIA Driver and CUDA Setup (Required for GPU Acceleration)
225
+
226
+ Follow these steps **only if you have a compatible NVIDIA GPU** and want faster inference.
227
+
228
+ #### 4.4.1 Step 1: Check/Install NVIDIA Drivers
229
+
230
+ 1. **Check Existing Driver:** Open Command Prompt (Windows) or Terminal (Linux) and run:
231
+ ```bash
232
+ nvidia-smi
233
+ ```
234
+ 2. **Interpret Output:**
235
+ * If the command runs successfully, note the **Driver Version** and the **CUDA Version** listed in the top right corner. This CUDA version is the *maximum* supported by your current driver.
236
+ * If the command fails ("not recognized"), you need to install or update your NVIDIA drivers.
237
+ 3. **Install/Update Drivers:** Go to the [NVIDIA Driver Downloads](https://www.nvidia.com/Download/index.aspx) page. Select your GPU model and OS, then download and install the latest recommended driver (Game Ready or Studio). **Reboot your computer** after installation. Run `nvidia-smi` again to confirm it works.
238
+
239
+ #### 4.4.2 Step 2: Install PyTorch with CUDA Support
240
+
241
+ 1. **Go to PyTorch Website:** Visit [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/).
242
+ 2. **Configure:** Select:
243
+ * **PyTorch Build:** Stable
244
+ * **Your OS:** Windows or Linux
245
+ * **Package:** Pip
246
+ * **Language:** Python
247
+ * **Compute Platform:** Choose the CUDA version **equal to or lower than** the version reported by `nvidia-smi`. For example, if `nvidia-smi` shows `CUDA Version: 12.4`, select `CUDA 12.1`. If it shows `11.8`, select `CUDA 11.8`. **Do not select a version higher than your driver supports.** (CUDA 12.1 or 11.8 are common stable choices).
248
+ 3. **Copy Command:** Copy the generated installation command. It will look similar to:
249
+ ```bash
250
+ # Example for CUDA 12.1 (Windows/Linux):
251
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
252
+ # Example for CUDA 11.8 (Windows/Linux):
253
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
254
+ ```
255
+ *(Use `pip` instead of `pip3` if that's your command)*
256
+ 4. **Install in Activated venv:**
257
+ * Ensure your `(venv)` is active.
258
+ * **Uninstall CPU PyTorch first:**
259
+ ```bash
260
+ pip uninstall torch torchvision torchaudio -y
261
+ ```
262
+ * **Paste and run the copied command** from the PyTorch website.
263
+
264
+ #### 4.4.3 Step 3: Verify PyTorch CUDA Installation
265
+
266
+ 1. With the `(venv)` still active, start a Python interpreter:
267
+ ```bash
268
+ python
269
+ ```
270
+ 2. Run the following Python code:
271
+ ```python
272
+ import torch
273
+ print(f"PyTorch version: {torch.__version__}")
274
+ cuda_available = torch.cuda.is_available()
275
+ print(f"CUDA available: {cuda_available}")
276
+ if cuda_available:
277
+ print(f"CUDA version used by PyTorch: {torch.version.cuda}")
278
+ print(f"Device count: {torch.cuda.device_count()}")
279
+ print(f"Current device index: {torch.cuda.current_device()}")
280
+ print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
281
+ else:
282
+ print("CUDA not available to PyTorch. Ensure drivers and CUDA-enabled PyTorch are installed correctly.")
283
+ exit()
284
+ ```
285
+ 3. If `CUDA available:` shows `True`, the setup was successful. If `False`, review driver installation and the PyTorch installation command.
286
+
287
+ ---
288
+
289
+ ## 5. Configuration
290
+
291
+ The server's behavior, including model selection, paths, and default generation parameters, is controlled via configuration settings.
292
+
293
+ ### 5.1 Configuration Files (`.env` and `config.py`)
294
+
295
+ * **`config.py`:** Defines the *default* values for all configuration parameters in the `DEFAULT_CONFIG` dictionary. It also contains the `ConfigManager` class and getter functions used by the application.
296
+ * **`.env` File:** This file, located in the project root directory (`dia-tts-server/.env`), allows you to *override* the default values. Create this file if it doesn't exist. Settings are defined as `KEY=VALUE` pairs, one per line. The server reads this file on startup using `python-dotenv`.
297
+
298
+ **Priority:** Values set in the `.env` file take precedence over the defaults in `config.py`. Environment variables set directly in your system also override `.env` file values (though using `.env` is generally recommended for project-specific settings).
299
+
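+ In application code, these settings are read through the getter functions exposed by `config.py`. A minimal sketch (the getter names below are the ones imported by `server.py` and `engine.py` elsewhere in this document):
+
+ ```python
+ # Sketch: reading the effective configuration (defaults merged with .env overrides).
+ from config import get_host, get_port, get_model_repo_id, get_model_weights_filename
+
+ print(f"Server address: {get_host()}:{get_port()}")
+ print(f"Model source: {get_model_repo_id()} / {get_model_weights_filename()}")
+ ```
+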
300
+ ### 5.2 Configuration Parameters
301
+
302
+ The following parameters can be set in your `.env` file:
303
+
304
+ | Parameter Name (in `.env`) | Default Value (`config.py`) | Description | Example `.env` Value |
305
+ | :--------------------------------- | :--------------------------------- | :--------------------------------------------------------------------------------------------------------- | :----------------------------------- |
306
+ | **Server Settings** | | | |
307
+ | `HOST` | `0.0.0.0` | The network interface address the server listens on. `0.0.0.0` makes it accessible on your local network. | `127.0.0.1` (localhost only) |
308
+ | `PORT` | `8003` | The port number the server listens on. | `8080` |
309
+ | **Model Source Settings** | | | |
310
+ | `DIA_MODEL_REPO_ID` | `ttj/dia-1.6b-safetensors` | The Hugging Face repository ID containing the model files. | `nari-labs/Dia-1.6B` |
311
+ | `DIA_MODEL_CONFIG_FILENAME` | `config.json` | The filename of the model's configuration JSON within the repository. | `config.json` |
312
+ | `DIA_MODEL_WEIGHTS_FILENAME` | `dia-v0_1_bf16.safetensors` | The filename of the model weights file (`.safetensors` or `.pth`) within the repository to load. | `dia-v0_1.safetensors` or `dia-v0_1.pth` |
313
+ | **Path Settings** | | | |
314
+ | `DIA_MODEL_CACHE_PATH` | `./model_cache` | Local directory to store downloaded model files. Relative paths are based on the project root. | `/path/to/shared/cache` |
315
+ | `REFERENCE_AUDIO_PATH` | `./reference_audio` | Local directory to store reference audio files (`.wav`, `.mp3`) used for voice cloning. | `./voices` |
316
+ | `OUTPUT_PATH` | `./outputs` | Local directory where generated audio files from the Web UI are saved. | `./generated_speech` |
317
+ | **Default Generation Parameters** | | *(These set the initial UI values and can be saved via the UI)* | |
318
+ | `GEN_DEFAULT_SPEED_FACTOR` | `0.90` | Default playback speed factor applied *after* generation (UI slider initial value). | `1.0` |
319
+ | `GEN_DEFAULT_CFG_SCALE` | `3.0` | Default Classifier-Free Guidance scale (UI slider initial value). | `2.5` |
320
+ | `GEN_DEFAULT_TEMPERATURE` | `1.3` | Default sampling temperature (UI slider initial value). | `1.2` |
321
+ | `GEN_DEFAULT_TOP_P` | `0.95` | Default nucleus sampling probability (UI slider initial value). | `0.9` |
322
+ | `GEN_DEFAULT_CFG_FILTER_TOP_K` | `35` | Default Top-K value for CFG filtering (UI slider initial value). | `40` |
323
+
324
+ **Example `.env` File (Using Original Nari Labs Model):**
325
+
326
+ ```dotenv
327
+ # .env
328
+ # Example configuration to use the original Nari Labs model
329
+
330
+ HOST=0.0.0.0
331
+ PORT=8003
332
+
333
+ DIA_MODEL_REPO_ID=nari-labs/Dia-1.6B
334
+ DIA_MODEL_CONFIG_FILENAME=config.json
335
+ DIA_MODEL_WEIGHTS_FILENAME=dia-v0_1.pth
336
+
337
+ # Keep other paths as default or specify custom ones
338
+ # DIA_MODEL_CACHE_PATH=./model_cache
339
+ # REFERENCE_AUDIO_PATH=./reference_audio
340
+ # OUTPUT_PATH=./outputs
341
+
342
+ # Keep default generation parameters or override them
343
+ # GEN_DEFAULT_SPEED_FACTOR=0.90
344
+ # GEN_DEFAULT_CFG_SCALE=3.0
345
+ # GEN_DEFAULT_TEMPERATURE=1.3
346
+ # GEN_DEFAULT_TOP_P=0.95
347
+ # GEN_DEFAULT_CFG_FILTER_TOP_K=35
348
+ ```
349
+
350
+ **Important:** You must **restart the server** after making changes to the `.env` file for them to take effect.
351
+
352
+ ---
353
+
354
+ ## 6. Running the Server
355
+
356
+ 1. **Activate Virtual Environment:** Ensure your virtual environment is activated (`(venv)` prefix).
357
+ * Windows: `.\venv\Scripts\activate`
358
+ * Linux: `source venv/bin/activate`
359
+ 2. **Navigate to Project Root:** Make sure your terminal is in the `dia-tts-server` directory.
360
+ 3. **Run the Server:**
361
+ ```bash
362
+ python server.py
363
+ ```
364
+ 4. **Server Output:** You should see log messages indicating the server is starting, including:
365
+ * The configuration being used (repo ID, filenames, paths).
366
+ * The device being used (CPU or CUDA).
367
+ * Model loading progress (downloading if necessary).
368
+ * Confirmation that the server is running (e.g., `Uvicorn running on http://0.0.0.0:8003`).
369
+ * URLs for accessing the Web UI and API Docs.
370
+
371
+ 5. **Accessing the Server:**
372
+ * **Web UI:** Open your web browser and go to `http://localhost:PORT` (e.g., `http://localhost:8003` if using the default port). If running on a different machine or VM, replace `localhost` with the server's IP address.
373
+ * **API Docs:** Access the interactive API documentation (Swagger UI) at `http://localhost:PORT/docs`.
374
+ 6. **Stopping the Server:** Press `CTRL+C` in the terminal where the server is running.
375
+
376
+ **Auto-Reload:** The server is configured to run with `reload=True`. This means Uvicorn will automatically restart the server if it detects changes in `.py`, `.html`, `.css`, `.js`, `.env`, or `.yaml` files within the project or `ui` directory. This is useful for development but should generally be disabled in production.
377
+
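+ For reference, the launch logic implied by this behavior presumably resembles the following `uvicorn.run` call (a sketch, not the exact code in `server.py`; `"server:app"` assumes the FastAPI instance is named `app`):
+
+ ```python
+ # Sketch: how server.py likely hands control to Uvicorn with auto-reload enabled.
+ import uvicorn
+ from config import get_host, get_port
+
+ if __name__ == "__main__":
+     uvicorn.run("server:app", host=get_host(), port=get_port(), reload=True)
+ ```
+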
378
+ ---
379
+
380
+ ## 7. Usage
381
+
382
+ The Dia TTS Server can be used via its Web UI or its API endpoints.
383
+
384
+ ### 7.1 Web User Interface (Web UI)
385
+
386
+ Access the UI by navigating to the server's base URL (e.g., `http://localhost:8003`).
387
+
388
+ #### 7.1.1 Main Generation Form
389
+
390
+ * **Text to speak:** Enter the text you want to synthesize (see the example after this list).
391
+ * Use `[S1]` and `[S2]` tags to indicate speaker turns for dialogue.
392
+ * Include non-verbal cues like `(laughs)`, `(sighs)`, `(clears throat)` directly in the text where desired.
393
+ * For voice cloning, **prepend the exact transcript** of the selected reference audio before the text you want generated (e.g., `[S1] Reference transcript text. [S1] This is the new text to generate in the cloned voice.`).
394
+ * **Voice Mode:** Select the desired generation mode:
395
+ * **Single / Dialogue (Use [S1]/[S2]):** Use this for single-speaker text (you can use `[S1]` or omit tags if the model handles it) or multi-speaker dialogue (using `[S1]` and `[S2]`).
396
+ * **Voice Clone (from Reference):** Enables voice cloning based on a selected audio file. Requires selecting a file below and prepending its transcript to the text input.
397
+ * **Generate Speech Button:** Submits the text and settings to the server to start generation.
398
+
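+ As an illustration, a dialogue input combining speaker tags and non-verbal cues might look like this (hypothetical sample text):
+
+ ```
+ [S1] Did you catch the demo this morning? [S2] I did. (laughs) The cloned voice was uncanny. [S1] (sighs) We still need to tune the guidance scale, though.
+ ```
+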
399
+ #### 7.1.2 Presets
400
+
401
+ * Located below the Voice Mode selection.
402
+ * Clicking a preset button (e.g., "Standard Dialogue", "Expressive Narration") will automatically populate the "Text to speak" area and the "Generation Parameters" sliders with predefined values, demonstrating different use cases.
403
+
404
+ #### 7.1.3 Voice Cloning
405
+
406
+ * This section appears only when "Voice Clone" mode is selected.
407
+ * **Reference Audio File Dropdown:** Lists available `.wav` and `.mp3` files found in the configured `REFERENCE_AUDIO_PATH`. Select the file whose voice you want to clone. Remember to prepend its transcript to the main text input.
408
+ * **Load Button:** Click this to open your system's file browser. You can select one or more `.wav` or `.mp3` files to upload. The selected files will be copied to the server's `REFERENCE_AUDIO_PATH`, and the dropdown list will refresh automatically. The first newly uploaded file will be selected in the dropdown.
409
+
410
+ #### 7.1.4 Generation Parameters
411
+
412
+ * Expand this section to fine-tune the generation process. These values correspond to the parameters used by the underlying Dia model.
413
+ * **Sliders:** Adjust Speed Factor, CFG Scale, Temperature, Top P, and CFG Filter Top K. The current value is displayed next to the label.
414
+ * **Save Generation Defaults Button:** Saves the *current* values of these sliders to the `.env` file (as `GEN_DEFAULT_...` keys). These saved values will become the default settings loaded into the UI the next time the server starts.
415
+
416
+ #### 7.1.5 Server Configuration (UI)
417
+
418
+ * Expand this section to view and modify server-level settings stored in the `.env` file.
419
+ * **Fields:** Edit Model Repo ID, Config/Weights Filenames, Cache/Reference/Output Paths, Host, and Port.
420
+ * **Save Server Configuration Button:** Saves the values currently shown in these fields to the `.env` file. **A server restart is required** for most of these changes (especially model source or paths) to take effect.
421
+ * **Restart Server Button:** (Appears after saving) Attempts to trigger a server restart. This works best if the server was started with `reload=True` or is managed by a process manager like systemd or Supervisor.
422
+
423
+ #### 7.1.6 Generated Audio Player
424
+
425
+ * Appears below the main form after a successful generation.
426
+ * **Waveform:** Visual representation of the generated audio.
427
+ * **Play/Pause Button:** Controls audio playback.
428
+ * **Download WAV Button:** Downloads the generated audio as a `.wav` file.
429
+ * **Info:** Displays the voice mode used, generation time, and audio duration.
430
+
431
+ #### 7.1.7 Theme Toggle
432
+
433
+ * Located in the top-right navigation bar.
434
+ * Click the Sun/Moon icon to switch between Light and Dark themes. Your preference is saved in your browser's `localStorage`.
435
+
436
+ ### 7.2 API Endpoints
437
+
438
+ Access the interactive API documentation via the `/docs` path (e.g., `http://localhost:8003/docs`).
439
+
440
+ #### 7.2.1 POST `/v1/audio/speech` (OpenAI Compatible)
441
+
442
+ * **Purpose:** Provides an endpoint compatible with the basic OpenAI TTS API for easier integration with existing tools.
443
+ * **Request Body:** (`application/json`) - Uses the `OpenAITTSRequest` model.
444
+ | Field | Type | Required | Description | Example |
445
+ | :---------------- | :----------------------- | :------- | :---------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------- |
446
+ | `model` | string | No | Ignored by this server (always uses Dia). Included for compatibility. Defaults to `dia-1.6b`. | `"dia-1.6b"` |
447
+ | `input` | string | Yes | The text to synthesize. Use `[S1]`/`[S2]` tags for dialogue. For cloning, prepend reference transcript. | `"Hello [S1] world."` |
448
+ | `voice` | string | No | Maps to Dia modes. Use `"S1"`, `"S2"`, `"dialogue"`, or the filename of a reference audio (e.g., `"my_ref.wav"`) for cloning. Defaults to `S1`. | `"dialogue"` or `"ref.mp3"` |
449
+ | `response_format` | `"opus"` \| `"wav"` | No | Desired audio output format. Defaults to `opus`. | `"wav"` |
450
+ | `speed` | float | No | Playback speed factor (0.5-2.0). Applied *after* generation. Defaults to `1.0`. | `0.9` |
451
+ * **Response:**
452
+ * **Success (200 OK):** `StreamingResponse` containing the binary audio data (`audio/opus` or `audio/wav`).
453
+ * **Error:** Standard FastAPI JSON error response (e.g., 400, 404, 500).
454
+
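+ A minimal Python client sketch using the `requests` package (already listed in `requirements.txt`); the host, port, and output filename below are assumptions:
+
+ ```python
+ # Sketch: calling the OpenAI-compatible endpoint and saving the audio.
+ # Assumes the server is running locally on the default port 8003.
+ import requests
+
+ payload = {
+     "input": "[S1] Hello there! [S2] Hi! (laughs)",
+     "voice": "dialogue",        # or "S1", "S2", or a reference filename for cloning
+     "response_format": "wav",
+     "speed": 1.0,
+ }
+ resp = requests.post("http://localhost:8003/v1/audio/speech", json=payload)
+ resp.raise_for_status()
+ with open("speech.wav", "wb") as f:
+     f.write(resp.content)  # raw audio bytes (audio/wav in this case)
+ ```
+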
455
+ #### 7.2.2 POST `/tts` (Custom Parameters)
456
+
457
+ * **Purpose:** Allows generation using all specific Dia generation parameters.
458
+ * **Request Body:** (`application/json`) - Uses the `CustomTTSRequest` model.
459
+ | Field | Type | Required | Description | Default |
460
+ | :------------------------- | :------------------------------------- | :------- | :---------------------------------------------------------------------------------------------------------------------------------------- | :---------- |
461
+ | `text` | string | Yes | The text to synthesize. Use `[S1]`/`[S2]` tags. Prepend transcript for cloning. | |
462
+ | `voice_mode` | `"dialogue"` \| `"clone"` | No | Generation mode. Note: `single_s1`/`single_s2` are handled via `dialogue` mode with appropriate tags in the text. | `dialogue` |
463
+ | `clone_reference_filename` | string \| null | No | Filename of reference audio in `REFERENCE_AUDIO_PATH`. **Required if `voice_mode` is `clone`**. | `null` |
464
+ | `output_format` | `"opus"` \| `"wav"` | No | Desired audio output format. | `opus` |
465
+ | `max_tokens` | integer \| null | No | Maximum audio tokens to generate. `null` uses the model's default. | `null` |
466
+ | `cfg_scale` | float | No | Classifier-Free Guidance scale. | `3.0` |
467
+ | `temperature` | float | No | Sampling temperature. | `1.3` |
468
+ | `top_p` | float | No | Nucleus sampling probability. | `0.95` |
469
+ | `speed_factor` | float | No | Playback speed factor (0.5-2.0). Applied *after* generation. | `0.90` |
470
+ | `cfg_filter_top_k` | integer | No | Top-K value for CFG filtering. | `35` |
471
+ * **Response:**
472
+ * **Success (200 OK):** `StreamingResponse` containing the binary audio data (`audio/opus` or `audio/wav`).
473
+ * **Error:** Standard FastAPI JSON error response (e.g., 400, 404, 500).
474
+
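+ A corresponding sketch for the custom endpoint, exercising the Dia-specific parameters (values mirror the documented defaults; the reference filename is hypothetical):
+
+ ```python
+ # Sketch: calling /tts with explicit generation parameters (clone mode).
+ import requests
+
+ payload = {
+     "text": "[S1] Reference transcript. [S1] New text in the cloned voice.",
+     "voice_mode": "clone",
+     "clone_reference_filename": "my_ref.wav",  # must exist in REFERENCE_AUDIO_PATH
+     "output_format": "wav",
+     "cfg_scale": 3.0,
+     "temperature": 1.3,
+     "top_p": 0.95,
+     "speed_factor": 0.9,
+     "cfg_filter_top_k": 35,
+ }
+ resp = requests.post("http://localhost:8003/tts", json=payload)
+ resp.raise_for_status()
+ with open("cloned.wav", "wb") as f:
+     f.write(resp.content)
+ ```
+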
475
+ #### 7.2.3 Configuration & Helper Endpoints
476
+
477
+ * **GET `/get_config`:** Returns the current server configuration as JSON.
478
+ * **POST `/save_config`:** Saves server configuration settings provided in the JSON request body to the `.env` file. Requires server restart.
479
+ * **POST `/save_generation_defaults`:** Saves default generation parameters provided in the JSON request body to the `.env` file. Affects UI defaults on next load.
480
+ * **POST `/restart_server`:** Attempts to trigger a server restart (reliability depends on execution environment).
481
+ * **POST `/upload_reference`:** Uploads one or more audio files (`.wav`, `.mp3`) as `multipart/form-data` to the reference audio directory. Returns JSON with status and updated file list.
482
+ * **GET `/health`:** Basic health check endpoint. Returns `{"status": "healthy", "model_loaded": true/false}`.
483
+
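+ For example, uploading a reference clip and then checking server health could look like this (a sketch; the file path and the multipart field name `files` are assumptions):
+
+ ```python
+ # Sketch: upload a reference audio file, then poll the health endpoint.
+ import requests
+
+ base = "http://localhost:8003"
+
+ with open("my_voice_sample.wav", "rb") as f:
+     # /upload_reference expects multipart/form-data per the list above;
+     # the form field name "files" is an assumption.
+     r = requests.post(
+         f"{base}/upload_reference",
+         files={"files": ("my_voice_sample.wav", f, "audio/wav")},
+     )
+ r.raise_for_status()
+ print(r.json())  # status plus the refreshed reference file list
+
+ print(requests.get(f"{base}/health").json())  # {"status": "healthy", "model_loaded": ...}
+ ```
+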
484
+ ---
485
+
486
+ ## 8. Troubleshooting
487
+
488
+ * **Error: `CUDA available: False` or Slow Performance:**
489
+ * Verify NVIDIA drivers are installed correctly (`nvidia-smi` command).
490
+ * Ensure you installed the correct PyTorch version with CUDA support matching your driver (See [Section 4.4](#44-nvidia-driver-and-cuda-setup-required-for-gpu-acceleration)). Reinstall PyTorch using the command from the official website if unsure.
491
+ * Check if another process is using all GPU VRAM.
492
+ * **Error: `ImportError: No module named 'dac'` (or `safetensors`, `yaml`, etc.):**
493
+ * Make sure your virtual environment is activated.
494
+ * Run `pip install -r requirements.txt` again to install missing dependencies.
495
+ * Specifically for `dac`, ensure you installed `descript-audio-codec` and not a different package named `dac`. Run `pip uninstall dac -y && pip install descript-audio-codec`.
496
+ * **Error: `libsndfile library not found` (or similar `soundfile` error, mainly on Linux):**
497
+ * Install the system library: `sudo apt update && sudo apt install libsndfile1` (Debian/Ubuntu) or the equivalent for your distribution.
498
+ * **Error: Model Download Fails (e.g., `HTTPError`, `ConnectionError`):**
499
+ * Check your internet connection.
500
+ * Verify the `DIA_MODEL_REPO_ID`, `DIA_MODEL_CONFIG_FILENAME`, and `DIA_MODEL_WEIGHTS_FILENAME` in your `.env` file (or defaults in `config.py`) are correct and accessible on Hugging Face Hub.
501
+ * Check Hugging Face Hub status if multiple downloads fail.
502
+ * Ensure the cache directory (`DIA_MODEL_CACHE_PATH`) is writable.
503
+ * **Error: `RuntimeError: Failed to load DAC model...`:**
504
+ * This usually indicates an issue with the `descript-audio-codec` installation or version incompatibility. Ensure it's installed correctly (see `ImportError` above).
505
+ * Check logs for specific `AttributeError` messages (like missing `utils` or `download`) which might indicate version mismatches between the Dia code's expectation and the installed library. The current code expects `dac.utils.download()`.
506
+ * **Error: `FileNotFoundError` during generation (Reference Audio):**
507
+ * Ensure the filename selected/provided for voice cloning exists in the configured `REFERENCE_AUDIO_PATH`.
508
+ * Check that the path in `config.py` or `.env` is correct and the server has permission to read from it.
509
+ * **Error: Cannot Save Output/Reference Files (`PermissionError`, etc.):**
510
+ * Ensure the directories specified by `OUTPUT_PATH` and `REFERENCE_AUDIO_PATH` exist and the server process has write permissions to them.
511
+ * **Web UI Issues (Buttons don't work, styles missing):**
512
+ * Clear your browser cache.
513
+ * Check the browser's developer console (usually F12) for JavaScript errors.
514
+ * Ensure `ui/script.js` and `ui/style.css` are being loaded correctly (check network tab in developer tools).
515
+ * **Generation Cancel Button Doesn't Stop Process:**
516
+ * This is expected ("Fake Cancel"). The button currently only prevents the UI from processing the result when it eventually arrives. True cancellation is complex and not implemented. Clicking "Generate" again *will* cancel the *previous UI request's result processing* before starting the new one.
517
+
518
+ ---
519
+
520
+ ## 9. Project Architecture
521
+
522
+ * **`server.py`:** The main entry point using FastAPI. Defines API routes, serves the Web UI using Jinja2, handles requests, and orchestrates calls to the engine.
523
+ * **`engine.py`:** Responsible for loading the Dia model (including downloading files via `huggingface_hub`), managing the model instance, preparing inputs for the model's `generate` method based on user requests (handling voice modes), and calling the model's generation function. Also handles post-processing like speed adjustment.
524
+ * **`config.py`:** Manages all configuration settings using default values and overrides from a `.env` file. Provides getter functions for easy access to settings.
525
+ * **`dia/` package:** Contains the core implementation of the Dia model itself.
526
+ * `model.py`: Defines the `Dia` class, which wraps the underlying PyTorch model (`DiaModel`). It handles loading weights (`.pth` or `.safetensors`), loading the required DAC model, preparing inputs specifically for the `DiaModel` forward pass (including CFG logic), and running the autoregressive generation loop.
527
+ * `config.py` (within `dia/`): Defines Pydantic models representing the *structure* and hyperparameters of the Dia model architecture (encoder, decoder, data parameters). This is loaded from the `config.json` file associated with the model weights.
528
+ * `layers.py`: Contains custom PyTorch `nn.Module` implementations used within the `DiaModel` (e.g., Attention blocks, MLP blocks, RoPE).
529
+ * `audio.py`: Includes helper functions for audio processing specific to the model's tokenization and delay patterns (e.g., `audio_to_codebook`, `codebook_to_audio`, `apply_audio_delay`).
530
+ * **`ui/` directory:** Contains all files related to the Web UI.
531
+ * `index.html`: The main Jinja2 template.
532
+ * `script.js`: Frontend JavaScript for interactivity, API calls, theme switching, etc.
533
+ * `presets.yaml`: Definitions for the UI preset examples.
534
+ * **`utils.py`:** General utility functions, such as audio encoding (`encode_audio`) and saving (`save_audio_to_file`) using the `soundfile` library.
535
+ * **Dependencies:** Relies heavily on `FastAPI`, `Uvicorn`, `PyTorch`, `torchaudio`, `huggingface_hub`, `safetensors`, `descript-audio-codec`, `soundfile`, `PyYAML`, `python-dotenv`, `pydantic`, and `Jinja2`.
536
+
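+ The pieces above can also be driven without the HTTP layer. A minimal programmatic sketch based on the `engine.py` functions shown later in this document (`load_model`, `generate_speech`, `EXPECTED_SAMPLE_RATE`):
+
+ ```python
+ # Sketch: using the engine directly, bypassing FastAPI.
+ import soundfile as sf  # already a project dependency
+ from engine import load_model, generate_speech, EXPECTED_SAMPLE_RATE
+
+ if load_model():  # downloads/loads the Dia model per the .env configuration
+     result = generate_speech(
+         "[S1] A quick smoke test. [S2] Sounds good to me.",
+         voice_mode="dialogue",
+         speed_factor=0.94,
+     )
+     if result is not None:
+         audio, sample_rate = result  # sample_rate == EXPECTED_SAMPLE_RATE (44100)
+         sf.write("smoke_test.wav", audio, sample_rate)
+ ```
+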
537
+ ---
538
+
539
+ ## 10. License and Disclaimer
540
+
541
+ * **License:** This project is licensed under the MIT License.
542
+ * **Disclaimer:** This project offers a high-fidelity speech generation model intended solely for research and educational use. The following uses are **strictly forbidden**:
543
+ * **Identity Misuse**: Do not produce audio resembling real individuals without permission.
544
+ * **Deceptive Content**: Do not use this model to generate misleading content (e.g., fake news).
545
+ * **Illegal or Malicious Use**: Do not use this model for activities that are illegal or intended to cause harm.
546
+
547
+ By using this model, you agree to uphold relevant legal standards and ethical responsibilities. The creators **are not responsible** for any misuse and firmly oppose any unethical usage of this technology.
548
+
549
+ ---
download_model.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # download_model.py
2
+ # Utility script to download the Dia model and dependencies without starting the server.
3
+
4
+ import logging
5
+ import os
6
+ import engine # Import the engine module to trigger its loading logic
7
+
8
+ # Configure basic logging for the script
9
+ logging.basicConfig(
10
+ level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
11
+ )
12
+ logger = logging.getLogger("ModelDownloader")
13
+
14
+ if __name__ == "__main__":
15
+ logger.info("--- Starting Dia Model Download ---")
16
+
17
+ # Ensure cache directory exists (redundant if engine.load_model does it, but safe)
18
+ try:
19
+ from config import get_model_cache_path
20
+
21
+ cache_path = get_model_cache_path()
22
+ os.makedirs(cache_path, exist_ok=True)
23
+ logger.info(
24
+ f"Ensured model cache directory exists: {os.path.abspath(cache_path)}"
25
+ )
26
+ except Exception as e:
27
+ logger.warning(f"Could not ensure cache directory exists: {e}")
28
+
29
+ # Trigger the model loading function from the engine
30
+ logger.info("Calling engine.load_model() to initiate download if necessary...")
31
+ success = engine.load_model()
32
+
33
+ if success:
34
+ logger.info("--- Model download/load process completed successfully ---")
35
+ else:
36
+ logger.error(
37
+ "--- Model download/load process failed. Check logs for details. ---"
38
+ )
39
+ exit(1) # Exit with error code
40
+
41
+ logger.info("You can now start the server using 'python server.py'")
engine.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine.py
2
+ # Core Dia TTS model loading and generation logic
3
+
4
+ import logging
5
+ import time
6
+ import os
7
+ import torch
8
+ import numpy as np
9
+ from typing import Optional, Tuple
10
+ from huggingface_hub import hf_hub_download # Import downloader
11
+
12
+ # Import Dia model class and config
13
+ try:
14
+ from dia.model import Dia
15
+ from dia.config import DiaConfig
16
+ except ImportError as e:
17
+ # Log critical error if core components are missing
18
+ logging.critical(
19
+ f"Failed to import Dia model components: {e}. Ensure the 'dia' package exists and is importable.",
20
+ exc_info=True,
21
+ )
22
+
23
+ # Define dummy classes/functions to prevent server crash on import,
24
+ # but generation will fail later if these are used.
25
+ class Dia:
26
+ @staticmethod
27
+ def load_model_from_files(*args, **kwargs):
28
+ raise RuntimeError("Dia model package not available or failed to import.")
29
+
30
+ def generate(*args, **kwargs):
31
+ raise RuntimeError("Dia model package not available or failed to import.")
32
+
33
+ class DiaConfig:
34
+ pass
35
+
36
+
37
+ # Import configuration getters from our project's config.py
38
+ from config import (
39
+ get_model_repo_id,
40
+ get_model_cache_path,
41
+ get_reference_audio_path,
42
+ get_model_config_filename,
43
+ get_model_weights_filename,
44
+ )
45
+
46
+ logger = logging.getLogger(__name__) # Use standard logger name
47
+
48
+ # --- Global Variables ---
49
+ dia_model: Optional[Dia] = None
50
+ # model_config is now loaded within Dia.load_model_from_files, maybe remove global?
51
+ # Let's keep it for now if needed elsewhere, but populate it after loading.
52
+ model_config_instance: Optional[DiaConfig] = None
53
+ model_device: Optional[torch.device] = None
54
+ MODEL_LOADED = False
55
+ EXPECTED_SAMPLE_RATE = 44100 # Dia model and DAC typically operate at 44.1kHz
56
+
57
+ # --- Model Loading ---
58
+
59
+
60
+ def get_device() -> torch.device:
61
+ """Determines the optimal torch device (CUDA > MPS > CPU)."""
62
+ if torch.cuda.is_available():
63
+ logger.info("CUDA is available, using GPU.")
64
+ return torch.device("cuda")
65
+ # Add MPS check for Apple Silicon GPUs
66
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
67
+ # Basic check is usually sufficient
68
+ logger.info("MPS is available, using Apple Silicon GPU.")
69
+ return torch.device("mps")
70
+ else:
71
+ logger.info("CUDA and MPS not available, using CPU.")
72
+ return torch.device("cpu")
73
+
74
+
75
+ def load_model():
76
+ """
77
+ Loads the Dia TTS model and associated DAC model.
78
+ Downloads model files based on configuration if they don't exist locally.
79
+ Handles both .pth and .safetensors formats.
80
+ """
81
+ global dia_model, model_config_instance, model_device, MODEL_LOADED
82
+
83
+ if MODEL_LOADED:
84
+ logger.info("Dia model already loaded.")
85
+ return True
86
+
87
+ # Get configuration values
88
+ repo_id = get_model_repo_id()
89
+ config_filename = get_model_config_filename()
90
+ weights_filename = get_model_weights_filename()
91
+ cache_path = get_model_cache_path() # Already absolute path
92
+ model_device = get_device()
93
+
94
+ logger.info(f"Attempting to load Dia model:")
95
+ logger.info(f" Repo ID: {repo_id}")
96
+ logger.info(f" Config File: {config_filename}")
97
+ logger.info(f" Weights File: {weights_filename}")
98
+ logger.info(f" Cache Directory: {cache_path}")
99
+ logger.info(f" Target Device: {model_device}")
100
+
101
+ # Ensure cache directory exists
102
+ try:
103
+ os.makedirs(cache_path, exist_ok=True)
104
+ except OSError as e:
105
+ logger.error(
106
+ f"Failed to create cache directory '{cache_path}': {e}", exc_info=True
107
+ )
108
+ # Depending on severity, might want to return False here
109
+ # return False
110
+ pass # Continue and let hf_hub_download handle potential issues
111
+
112
+ try:
113
+ start_time = time.time()
114
+
115
+ # --- Download Model Files ---
116
+ logger.info(
117
+ f"Downloading/finding configuration file '{config_filename}' from repo '{repo_id}'..."
118
+ )
119
+ local_config_path = hf_hub_download(
120
+ repo_id=repo_id,
121
+ filename=config_filename,
122
+ cache_dir=cache_path,
123
+ # force_download=False, # Default: only download if missing or outdated
124
+ # resume_download=True, # Default: resume interrupted downloads
125
+ )
126
+ logger.info(f"Configuration file path: {local_config_path}")
127
+
128
+ logger.info(
129
+ f"Downloading/finding weights file '{weights_filename}' from repo '{repo_id}'..."
130
+ )
131
+ local_weights_path = hf_hub_download(
132
+ repo_id=repo_id,
133
+ filename=weights_filename,
134
+ cache_dir=cache_path,
135
+ )
136
+ logger.info(f"Weights file path: {local_weights_path}")
137
+
138
+ # --- Load Model using the class method ---
139
+ # The Dia class method now handles config loading, instantiation, weight loading, etc.
140
+ dia_model = Dia.load_model_from_files(
141
+ config_path=local_config_path,
142
+ weights_path=local_weights_path,
143
+ device=model_device,
144
+ )
145
+
146
+ # Store the config instance if needed globally (optional)
147
+ model_config_instance = dia_model.config
148
+
149
+ end_time = time.time()
150
+ logger.info(
151
+ f"Dia model loaded successfully in {end_time - start_time:.2f} seconds."
152
+ )
153
+ MODEL_LOADED = True
154
+ return True
155
+
156
+ except FileNotFoundError as e:
157
+ logger.error(
158
+ f"Model loading failed: Required file not found. {e}", exc_info=True
159
+ )
160
+ MODEL_LOADED = False
161
+ return False
162
+ except ImportError:
163
+ # This catches if the 'dia' package itself is missing
164
+ logger.critical(
165
+ "Failed to load model: Dia package or its core dependencies not found.",
166
+ exc_info=True,
167
+ )
168
+ MODEL_LOADED = False
169
+ return False
170
+ except Exception as e:
171
+ # Catch other potential errors during download or loading
172
+ logger.error(
173
+ f"Error loading Dia model from repo '{repo_id}': {e}", exc_info=True
174
+ )
175
+ dia_model = None
176
+ model_config_instance = None
177
+ MODEL_LOADED = False
178
+ return False
179
+
180
+
181
+ # --- Speech Generation ---
182
+
183
+
184
+ def generate_speech(
185
+ text: str,
186
+ voice_mode: str = "single_s1",
187
+ clone_reference_filename: Optional[str] = None,
188
+ max_tokens: Optional[int] = None,
189
+ cfg_scale: float = 3.0,
190
+ temperature: float = 1.3,
191
+ top_p: float = 0.95,
192
+ speed_factor: float = 0.94, # Keep speed factor separate from model generation params
193
+ cfg_filter_top_k: int = 35,
194
+ ) -> Optional[Tuple[np.ndarray, int]]:
195
+ """
196
+ Generates speech using the loaded Dia model, handling voice modes and speed adjustment.
197
+
198
+ Args:
199
+ text: Text to synthesize.
200
+ voice_mode: 'dialogue', 'single_s1', 'single_s2', 'clone'.
201
+ clone_reference_filename: Filename for voice cloning (if mode is 'clone'). Located in reference audio path.
202
+ max_tokens: Max generation tokens for the model's generate method.
203
+ cfg_scale: CFG scale for the model's generate method.
204
+ temperature: Sampling temperature for the model's generate method.
205
+ top_p: Nucleus sampling p for the model's generate method.
206
+ speed_factor: Factor to adjust the playback speed *after* generation (e.g., 0.9 = slower, 1.1 = faster).
207
+ cfg_filter_top_k: CFG filter top K for the model's generate method.
208
+
209
+ Returns:
210
+ Tuple of (numpy_audio_array, sample_rate), or None on failure.
211
+ """
212
+ if not MODEL_LOADED or dia_model is None:
213
+ logger.error("Dia model is not loaded. Cannot generate speech.")
214
+ return None
215
+
216
+ logger.info(f"Generating speech with mode: {voice_mode}")
217
+ logger.debug(f"Input text (start): '{text[:100]}...'")
218
+ # Log model generation parameters
219
+ logger.debug(
220
+ f"Model Params: max_tokens={max_tokens}, cfg={cfg_scale}, temp={temperature}, top_p={top_p}, top_k={cfg_filter_top_k}"
221
+ )
222
+ # Log post-processing parameters
223
+ logger.debug(f"Post-processing Params: speed_factor={speed_factor}")
224
+
225
+ audio_prompt_path = None
226
+ processed_text = text # Start with original text
227
+
228
+ # --- Handle Voice Mode ---
229
+ if voice_mode == "clone":
230
+ if not clone_reference_filename:
231
+ logger.error("Clone mode selected but no reference filename provided.")
232
+ return None
233
+ ref_base_path = get_reference_audio_path() # Gets absolute path
234
+ potential_path = os.path.join(ref_base_path, clone_reference_filename)
235
+ if os.path.isfile(potential_path):
236
+ audio_prompt_path = potential_path
237
+ logger.info(f"Using audio prompt for cloning: {audio_prompt_path}")
238
+ # Dia requires the transcript of the clone audio to be prepended to the target text.
239
+ # The UI/API caller is responsible for constructing this combined text.
240
+ logger.warning(
241
+ "Clone mode active. Ensure the 'text' input includes the transcript of the reference audio for best results (e.g., '[S1] Reference transcript. [S1] Target text...')."
242
+ )
243
+ processed_text = text # Use the combined text provided by the caller
244
+ else:
245
+ logger.error(f"Reference audio file not found: {potential_path}")
246
+ return None # Fail generation if reference file is missing
247
+ elif voice_mode == "dialogue":
248
+ # Assume text already contains [S1]/[S2] tags as required by the model
249
+ logger.info("Using dialogue mode. Expecting [S1]/[S2] tags in input text.")
250
+ if "[S1]" not in text and "[S2]" not in text:
251
+ logger.warning(
252
+ "Dialogue mode selected, but no [S1] or [S2] tags found in the input text."
253
+ )
254
+ processed_text = text # Pass directly
255
+ elif voice_mode == "single_s1":
256
+ logger.info("Using single voice mode (S1).")
257
+ # Check if text *already* contains tags, warn if so, as it might confuse the model
258
+ if "[S1]" in text or "[S2]" in text:
259
+ logger.warning(
260
+ "Input text contains dialogue tags ([S1]/[S2]), but 'single_s1' mode was selected. Model behavior might be unexpected."
261
+ )
262
+ # Dia likely expects tags even for single speaker. Prepending [S1] might be safer.
263
+ # Let's assume for now the model handles untagged text as S1, but this could be adjusted.
264
+ # Consider: processed_text = f"[S1] {text}" # Option to enforce S1 tag
265
+ processed_text = text # Pass directly for now
266
+ elif voice_mode == "single_s2":
267
+ logger.info("Using single voice mode (S2).")
268
+ if "[S1]" in text or "[S2]" in text:
269
+ logger.warning(
270
+ "Input text contains dialogue tags ([S1]/[S2]), but 'single_s2' mode was selected."
271
+ )
272
+ # Similar to S1, how to signal S2? Prepending [S2] seems logical if needed.
273
+ # Consider: processed_text = f"[S2] {text}" # Option to enforce S2 tag
274
+ processed_text = text # Pass directly for now
275
+ else:
276
+ logger.error(
277
+ f"Unsupported voice_mode: {voice_mode}. Defaulting to 'single_s1'."
278
+ )
279
+ processed_text = text # Fallback
280
+
281
+ # --- Call Dia Generate ---
282
+ try:
283
+ start_time = time.time()
284
+ logger.info("Calling Dia model generate method...")
285
+
286
+ # Call the model's generate method with appropriate parameters
287
+ generated_audio_np = dia_model.generate(
288
+ text=processed_text,
289
+ audio_prompt_path=audio_prompt_path,
290
+ max_tokens=max_tokens, # Pass None if not specified, Dia uses its default
291
+ cfg_scale=cfg_scale,
292
+ temperature=temperature,
293
+ top_p=top_p,
294
+ use_cfg_filter=True, # Default from Dia's app.py, seems reasonable
295
+ cfg_filter_top_k=cfg_filter_top_k,
296
+ use_torch_compile=False, # Keep False for stability unless specifically tested/enabled
297
+ )
298
+ gen_end_time = time.time()
299
+ logger.info(
300
+ f"Dia model generation finished in {gen_end_time - start_time:.2f} seconds."
301
+ )
302
+
303
+ if generated_audio_np is None or generated_audio_np.size == 0:
304
+ logger.warning("Dia model returned None or empty audio array.")
305
+ return None
306
+
307
+ # --- Apply Speed Factor (Post-processing) ---
308
+ # This mimics the logic in Dia's original app.py
309
+ if speed_factor != 1.0:
310
+ logger.info(f"Applying speed factor: {speed_factor}")
311
+ original_len = len(generated_audio_np)
312
+ # Ensure speed_factor is within a reasonable range to avoid extreme distortion
313
+ # Adjust range based on observed quality (e.g., 0.5 to 2.0)
314
+ speed_factor = max(0.5, min(speed_factor, 2.0))
315
+ target_len = int(original_len / speed_factor)
316
+
317
+ if target_len > 0 and target_len != original_len:
318
+ logger.debug(
319
+ f"Resampling audio from {original_len} to {target_len} samples."
320
+ )
321
+ # Create time axes for original and resampled audio
322
+ x_original = np.linspace(0, original_len - 1, original_len)
323
+ x_resampled = np.linspace(0, original_len - 1, target_len)
324
+ # Interpolate using numpy
325
+ resampled_audio_np = np.interp(
326
+ x_resampled, x_original, generated_audio_np
327
+ )
328
+ final_audio_np = resampled_audio_np.astype(np.float32) # Ensure float32
329
+ logger.info(f"Audio resampled for {speed_factor:.2f}x speed.")
330
+ else:
331
+ logger.warning(
332
+ f"Skipping speed adjustment (factor: {speed_factor:.2f}). Target length invalid ({target_len}) or no change needed."
333
+ )
334
+ final_audio_np = generated_audio_np # Use original audio
335
+ else:
336
+ logger.info("Speed factor is 1.0, no speed adjustment needed.")
337
+ final_audio_np = generated_audio_np # No speed change needed
338
+
339
+ # Ensure output is float32 (DAC output should be, but good practice)
340
+ if final_audio_np.dtype != np.float32:
341
+ logger.warning(
342
+ f"Generated audio was not float32 ({final_audio_np.dtype}), converting."
343
+ )
344
+ final_audio_np = final_audio_np.astype(np.float32)
345
+
346
+ logger.info(
347
+ f"Final audio ready. Shape: {final_audio_np.shape}, dtype: {final_audio_np.dtype}"
348
+ )
349
+ # Return the processed audio and the expected sample rate
350
+ return final_audio_np, EXPECTED_SAMPLE_RATE
351
+
352
+ except Exception as e:
353
+ logger.error(
354
+ f"Error during Dia generation or post-processing: {e}", exc_info=True
355
+ )
356
+ return None # Return None on failure
models.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models.py
2
+ # Pydantic models for API requests and potentially responses
3
+
4
+ from pydantic import BaseModel, Field
5
+ from typing import Optional, Literal
6
+
7
+ # --- Request Models ---
8
+
9
+
10
+ class OpenAITTSRequest(BaseModel):
11
+ """Request model compatible with the OpenAI TTS API."""
12
+
13
+ model: str = Field(
14
+ default="dia-1.6b",
15
+ description="Model identifier (ignored by this server, always uses Dia). Included for compatibility.",
16
+ )
17
+ input: str = Field(..., description="The text to synthesize.")
18
+ voice: str = Field(
19
+ default="S1",
20
+ description="Voice mode or reference audio filename. Examples: 'S1', 'S2', 'dialogue', 'my_reference.wav'.",
21
+ )
22
+ response_format: Literal["opus", "wav"] = Field(
23
+ default="opus", description="The desired audio output format."
24
+ )
25
+ speed: float = Field(
26
+ default=1.0,
27
+ ge=0.8,
28
+ le=1.2, # Dia speed factor range seems narrower
29
+ description="Adjusts the speed of the generated audio (0.8 to 1.2).",
30
+ )
31
+
32
+
33
+ class CustomTTSRequest(BaseModel):
34
+ """Request model for the custom /tts endpoint."""
35
+
36
+ text: str = Field(
37
+ ...,
38
+ description="The text to synthesize. For 'dialogue' mode, include [S1]/[S2] tags.",
39
+ )
40
+ voice_mode: Literal["dialogue", "single_s1", "single_s2", "clone"] = Field(
41
+ default="single_s1", description="Specifies the generation mode."
42
+ )
43
+ clone_reference_filename: Optional[str] = Field(
44
+ default=None,
45
+ description="Filename of the reference audio within the configured reference path (required if voice_mode is 'clone').",
46
+ )
47
+ output_format: Literal["opus", "wav"] = Field(
48
+ default="opus", description="The desired audio output format."
49
+ )
50
+ # Dia-specific generation parameters
51
+ max_tokens: Optional[int] = Field(
52
+ default=None,
53
+ gt=0,
54
+ description="Maximum number of audio tokens to generate (defaults to model's internal config value).",
55
+ )
56
+ cfg_scale: float = Field(
57
+ default=3.0,
58
+ ge=1.0,
59
+ le=5.0,
60
+ description="Classifier-Free Guidance scale (1.0-5.0).",
61
+ )
62
+ temperature: float = Field(
63
+ default=1.3, ge=1.0, le=1.5, description="Sampling temperature (1.0-1.5)."
64
+ )
65
+ top_p: float = Field(
66
+ default=0.95,
67
+ ge=0.8,
68
+ le=1.0,
69
+ description="Nucleus sampling probability (0.8-1.0).",
70
+ )
71
+ speed_factor: float = Field(
72
+ default=0.94,
73
+ ge=0.8,
74
+ le=1.0, # Dia's default range seems to be <= 1.0
75
+ description="Adjusts the speed of the generated audio (0.8 to 1.0).",
76
+ )
77
+ cfg_filter_top_k: int = Field(
78
+ default=35, ge=15, le=50, description="Top k filter for CFG guidance (15-50)."
79
+ )
80
+
81
+
82
+ # --- Response Models (Optional, can be simple dicts too) ---
83
+
84
+
85
+ class TTSResponse(BaseModel):
86
+ """Basic response model for successful generation (if returning JSON)."""
87
+
88
+ request_id: str
89
+ status: str = "completed"
90
+ generation_time_sec: float
91
+ output_url: Optional[str] = None # If saving file and returning URL
92
+
93
+
94
+ class ErrorResponse(BaseModel):
95
+ """Error response model."""
96
+
97
+ detail: str
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+
3
+ # Core Web Framework
4
+ fastapi
5
+ uvicorn[standard]
6
+
7
+ # Machine Learning & Audio
8
+ torch
9
+ torchaudio
10
+ numpy
11
+ soundfile # Requires libsndfile system library (e.g., sudo apt-get install libsndfile1 on Debian/Ubuntu)
12
+ huggingface_hub
13
+ descript-audio-codec
14
+ safetensors
15
+
16
+ # Configuration & Utilities
17
+ pydantic
18
+ python-dotenv
19
+ Jinja2
20
+ python-multipart # For potential file uploads in UI
21
+ requests # For health checks or other potential uses
22
+ PyYAML # For parsing presets.yaml
server.py ADDED
@@ -0,0 +1,1061 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server.py
2
+ # Main FastAPI server for Dia TTS
3
+
4
+ import sys
5
+ import logging
6
+ import time
7
+ import os
8
+ import io
9
+ import uuid
11
+ import shutil # For file copying
12
+ import yaml # For loading presets
13
+ from datetime import datetime
14
+ from contextlib import asynccontextmanager
15
+ from typing import Optional, Literal, List, Dict, Any
16
+ import webbrowser
17
+ import threading
19
+
20
+ from fastapi import (
21
+ FastAPI,
22
+ HTTPException,
23
+ Request,
24
+ Response,
25
+ Form,
26
+ UploadFile,
27
+ File,
28
+ BackgroundTasks,
29
+ )
30
+ from fastapi.responses import (
31
+ StreamingResponse,
32
+ JSONResponse,
33
+ HTMLResponse,
34
+ RedirectResponse,
35
+ )
36
+ from fastapi.staticfiles import StaticFiles
37
+ from fastapi.templating import Jinja2Templates
38
+ import uvicorn
39
+ import numpy as np
40
+
41
+ # Internal imports
42
+ from config import (
43
+ config_manager,
44
+ get_host,
45
+ get_port,
46
+ get_output_path,
47
+ get_reference_audio_path,
48
+ # register_config_routes is now defined locally
49
+ get_model_cache_path,
50
+ get_model_repo_id,
51
+ get_model_config_filename,
52
+ get_model_weights_filename,
53
+ # Generation default getters
54
+ get_gen_default_speed_factor,
55
+ get_gen_default_cfg_scale,
56
+ get_gen_default_temperature,
57
+ get_gen_default_top_p,
58
+ get_gen_default_cfg_filter_top_k,
59
+ DEFAULT_CONFIG,
60
+ )
61
+ from models import OpenAITTSRequest, CustomTTSRequest, ErrorResponse
62
+ import engine
63
+ from engine import (
64
+ load_model as load_dia_model,
65
+ generate_speech,
66
+ EXPECTED_SAMPLE_RATE,
67
+ )
68
+ from utils import encode_audio, save_audio_to_file, PerformanceMonitor
69
+
70
+ # Configure logging (Basic setup, can be enhanced)
71
+ logging.basicConfig(
72
+ level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
73
+ )
74
+ # Reduce verbosity of noisy libraries if needed
75
+ # logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
76
+ # logging.getLogger("watchfiles").setLevel(logging.WARNING)
77
+ logger = logging.getLogger(__name__) # Logger for this module
78
+
79
+ # --- Global Variables & Constants ---
80
+ PRESETS_FILE = "ui/presets.yaml"
81
+ loaded_presets: List[Dict[str, Any]] = [] # Cache presets in memory
82
+ startup_complete_event = threading.Event()
83
+
84
+ # --- Helper Functions ---
85
+
86
+
87
+ def load_presets():
88
+ """Loads presets from the YAML file."""
89
+ global loaded_presets
90
+ try:
91
+ if os.path.exists(PRESETS_FILE):
92
+ with open(PRESETS_FILE, "r", encoding="utf-8") as f:
93
+ loaded_presets = yaml.safe_load(f)
94
+ if not isinstance(loaded_presets, list):
95
+ logger.error(
96
+ f"Presets file '{PRESETS_FILE}' should contain a list, but found {type(loaded_presets)}. No presets loaded."
97
+ )
98
+ loaded_presets = []
99
+ else:
100
+ logger.info(
101
+ f"Successfully loaded {len(loaded_presets)} presets from {PRESETS_FILE}."
102
+ )
103
+ else:
104
+ logger.warning(
105
+ f"Presets file not found at '{PRESETS_FILE}'. No presets will be available."
106
+ )
107
+ loaded_presets = []
108
+ except yaml.YAMLError as e:
109
+ logger.error(
110
+ f"Error parsing presets YAML file '{PRESETS_FILE}': {e}", exc_info=True
111
+ )
112
+ loaded_presets = []
113
+ except Exception as e:
114
+ logger.error(f"Error loading presets file '{PRESETS_FILE}': {e}", exc_info=True)
115
+ loaded_presets = []
116
+
117
+
118
+ def get_valid_reference_files() -> list[str]:
+     """Gets a list of valid audio files (.wav, .mp3) from the reference directory."""
+     ref_path = get_reference_audio_path()
+     valid_files = []
+     allowed_extensions = (".wav", ".mp3")
+     try:
+         if os.path.isdir(ref_path):
+             for filename in os.listdir(ref_path):
+                 if filename.lower().endswith(allowed_extensions):
+                     # Optional: add a check for file size or basic validity if needed
+                     valid_files.append(filename)
+         else:
+             logger.warning(f"Reference audio directory not found: {ref_path}")
+     except Exception as e:
+         logger.error(
+             f"Error reading reference audio directory '{ref_path}': {e}", exc_info=True
+         )
+     return sorted(valid_files)
+
+
+ def sanitize_filename(filename: str) -> str:
+     """Removes potentially unsafe characters and path components from a filename."""
+     # Remove directory separators
+     filename = os.path.basename(filename)
+     # Keep only alphanumeric, underscore, hyphen, dot. Replace others with underscore.
+     safe_chars = set(
+         "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-"
+     )
+     sanitized = "".join(c if c in safe_chars else "_" for c in filename)
+     # Prevent names starting with a dot or consisting only of dots/spaces
+     if not sanitized or sanitized.lstrip("._ ") == "":
+         return f"uploaded_file_{uuid.uuid4().hex[:8]}"  # Generate a safe fallback name
+     # Limit length
+     max_len = 100
+     if len(sanitized) > max_len:
+         name, ext = os.path.splitext(sanitized)
+         sanitized = name[: max_len - len(ext)] + ext
+     return sanitized
+
+
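As a quick illustration of the helper above (an aside, not part of the module; the fallback suffix is a random hex string, so the last value varies per call):

    from server import sanitize_filename  # assumes the module is importable as 'server'

    print(sanitize_filename("../../etc/passwd"))   # 'passwd' (path components stripped)
    print(sanitize_filename("my voice (v2).wav"))  # 'my_voice__v2_.wav' (unsafe chars become '_')
    print(sanitize_filename("...."))               # 'uploaded_file_<8 hex chars>' fallback
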
158
+ # --- Application Lifespan (Startup/Shutdown) ---
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """Application lifespan manager for startup/shutdown."""
+     model_loaded_successfully = False  # Flag to track success
+     try:
+         logger.info("Starting Dia TTS server initialization...")
+         # Ensure base directories exist
+         os.makedirs(get_output_path(), exist_ok=True)
+         os.makedirs(get_reference_audio_path(), exist_ok=True)
+         os.makedirs(get_model_cache_path(), exist_ok=True)
+         os.makedirs("ui", exist_ok=True)
+         os.makedirs("static", exist_ok=True)
+
+         # Load presets from the YAML file
+         load_presets()
+
+         # Load the main TTS model during startup
+         if not load_dia_model():
+             # Model loading failed
+             error_msg = (
+                 "CRITICAL: Failed to load Dia model on startup. Server cannot start."
+             )
+             logger.critical(error_msg)
+             # Option 1: Raise an exception to stop Uvicorn startup cleanly
+             raise RuntimeError(error_msg)
+             # Option 2: Force exit (less clean, might bypass some Uvicorn shutdown)
+             # sys.exit(1)
+         else:
+             logger.info("Dia model loaded successfully.")
+             model_loaded_successfully = True
+
+         # Create and start a delayed browser-opening thread.
+         # IMPORTANT: create this thread AFTER model loading completes.
+         host = get_host()
+         port = get_port()
+         browser_thread = threading.Thread(
+             target=lambda: _delayed_browser_open(host, port), daemon=True
+         )
+         browser_thread.start()
+
+         # --- Signal completion AFTER potentially long operations ---
+         logger.info("Application startup sequence finished. Signaling readiness.")
+         startup_complete_event.set()
+
+         yield  # Application runs here
+
+     except Exception as e:
+         # Catch the RuntimeError we raised or any other startup error
+         logger.error(f"Fatal error during application startup: {e}", exc_info=True)
+         # Do NOT set the event here if startup failed.
+         # Re-raise the exception (or exit) to ensure the server stops.
+         raise e  # Re-raising ensures Uvicorn knows startup failed
+         # Alternatively: sys.exit(1)
+     finally:
+         # Cleanup on shutdown
+         logger.info("Application shutdown initiated...")
+         # Add any specific cleanup needed
+         logger.info("Application shutdown complete.")
+
+
+ def _delayed_browser_open(host, port):
+     """Opens the browser after a short delay to ensure the server is ready."""
+     try:
+         # Small delay to ensure Uvicorn is fully ready
+         time.sleep(2)
+
+         display_host = "localhost" if host == "0.0.0.0" else host
+         browser_url = f"http://{display_host}:{port}/"
+
+         # Log to a file for debugging
+         with open("browser_thread_debug.log", "a") as f:
+             f.write(f"[{time.time()}] Opening browser at {browser_url}\n")
+
+         # Try to use the logger as well (it might work at this point)
+         try:
+             logger.info(f"Opening browser at {browser_url}")
+         except Exception:
+             pass
+
+         # Open the browser directly without health checks
+         webbrowser.open(browser_url)
+
+     except Exception as e:
+         with open("browser_thread_debug.log", "a") as f:
+             f.write(f"[{time.time()}] Browser open error: {str(e)}\n")
+
+
+ # --- FastAPI App Initialization ---
+ app = FastAPI(
+     title="Dia TTS Server",
+     description="Text-to-Speech server using the Dia model, providing API and Web UI.",
+     version="1.1.0",  # Incremented version
+     lifespan=lifespan,
+ )
+
+ # Ensure required directories exist before the static mounts below run at import
+ # time. Use the configured paths (not hard-coded names) so custom locations work.
+ for folder in [get_reference_audio_path(), get_model_cache_path(), get_output_path()]:
+     if not os.path.exists(folder):
+         os.makedirs(folder, exist_ok=True)
+         logger.info(f"Created directory: {folder}")
+
+ # --- Static Files and Templates ---
+ # Serve generated audio files from the configured output path
+ app.mount("/outputs", StaticFiles(directory=get_output_path()), name="outputs")
+ # Serve UI files (CSS, JS) from the 'ui' directory
+ app.mount("/ui", StaticFiles(directory="ui"), name="ui_static")
+ # Initialize Jinja2 templates to look in the 'ui' directory
+ templates = Jinja2Templates(directory="ui")
+
+
+ # --- Configuration Routes Definition ---
+ # Defined locally now instead of importing from config.py
+ def register_config_routes(app: FastAPI):
+     """Adds configuration management endpoints to the FastAPI app."""
+     logger.info(
+         "Registering configuration routes (/get_config, /save_config, /restart_server, /save_generation_defaults)."
+     )
+
+     @app.get(
+         "/get_config",
+         tags=["Configuration"],
+         summary="Get current server configuration",
+     )
+     async def get_current_config():
+         """Returns the current server configuration values (from .env or defaults)."""
+         logger.info("Request received for /get_config")
+         return JSONResponse(content=config_manager.get_all())
+
+     @app.post(
+         "/save_config", tags=["Configuration"], summary="Save server configuration"
+     )
+     async def save_new_config(request: Request):
+         """
+         Saves updated server configuration values (Host, Port, Model paths, etc.)
+         to the .env file. Requires a server restart to apply most changes.
+         """
+         logger.info("Request received for /save_config")
+         try:
+             new_config_data = await request.json()
+             if not isinstance(new_config_data, dict):
+                 raise ValueError("Request body must be a JSON object.")
+             logger.debug(f"Received server config data to save: {new_config_data}")
+
+             # Filter data to only include keys present in DEFAULT_CONFIG
+             filtered_data = {
+                 k: v for k, v in new_config_data.items() if k in DEFAULT_CONFIG
+             }
+             unknown_keys = set(new_config_data.keys()) - set(filtered_data.keys())
+             if unknown_keys:
+                 logger.warning(
+                     f"Ignoring unknown keys in save_config request: {unknown_keys}"
+                 )
+
+             config_manager.update(filtered_data)  # Update in memory first
+             if config_manager.save():  # Attempt to save to .env
+                 logger.info("Server configuration saved successfully to .env.")
+                 return JSONResponse(
+                     content={
+                         "message": "Server configuration saved. Restart server to apply changes."
+                     }
+                 )
+             else:
+                 logger.error("Failed to save server configuration to .env file.")
+                 raise HTTPException(
+                     status_code=500, detail="Failed to save configuration file."
+                 )
+         except ValueError as ve:
+             logger.error(f"Invalid data format for /save_config: {ve}")
+             raise HTTPException(
+                 status_code=400, detail=f"Invalid request data: {str(ve)}"
+             )
+         except Exception as e:
+             logger.error(f"Error processing /save_config request: {e}", exc_info=True)
+             raise HTTPException(
+                 status_code=500, detail=f"Internal server error during save: {str(e)}"
+             )
+
+     @app.post(
+         "/save_generation_defaults",
+         tags=["Configuration"],
+         summary="Save default generation parameters",
+     )
+     async def save_generation_defaults(request: Request):
+         """
+         Saves the provided generation parameters (speed, cfg, temp, etc.)
+         as the new defaults in the .env file. These are loaded by the UI on startup.
+         """
+         logger.info("Request received for /save_generation_defaults")
+         try:
+             gen_params = await request.json()
+             if not isinstance(gen_params, dict):
+                 raise ValueError("Request body must be a JSON object.")
+             logger.debug(f"Received generation defaults to save: {gen_params}")
+
+             # Map received keys (e.g., 'speed_factor') to .env keys (e.g., 'GEN_DEFAULT_SPEED_FACTOR')
+             defaults_to_save = {}
+             key_map = {
+                 "speed_factor": "GEN_DEFAULT_SPEED_FACTOR",
+                 "cfg_scale": "GEN_DEFAULT_CFG_SCALE",
+                 "temperature": "GEN_DEFAULT_TEMPERATURE",
+                 "top_p": "GEN_DEFAULT_TOP_P",
+                 "cfg_filter_top_k": "GEN_DEFAULT_CFG_FILTER_TOP_K",
+             }
+             valid_keys_found = False
+             for ui_key, env_key in key_map.items():
+                 if ui_key in gen_params:
+                     # Basic validation could be added here (e.g., check if float/int)
+                     defaults_to_save[env_key] = str(gen_params[ui_key])  # Ensure saving as string
+                     valid_keys_found = True
+                 else:
+                     logger.warning(
+                         f"Missing expected key '{ui_key}' in save_generation_defaults request."
+                     )
+
+             if not valid_keys_found:
+                 raise ValueError("No valid generation parameters found in the request.")
+
+             config_manager.update(defaults_to_save)  # Update in memory
+             if config_manager.save():  # Save all current config (including these) to .env
+                 logger.info("Generation defaults saved successfully to .env.")
+                 return JSONResponse(content={"message": "Generation defaults saved."})
+             else:
+                 logger.error("Failed to save generation defaults to .env file.")
+                 raise HTTPException(
+                     status_code=500, detail="Failed to save configuration file."
+                 )
+         except ValueError as ve:
+             logger.error(f"Invalid data format for /save_generation_defaults: {ve}")
+             raise HTTPException(
+                 status_code=400, detail=f"Invalid request data: {str(ve)}"
+             )
+         except Exception as e:
+             logger.error(
+                 f"Error processing /save_generation_defaults request: {e}",
+                 exc_info=True,
+             )
+             raise HTTPException(
+                 status_code=500, detail=f"Internal server error during save: {str(e)}"
+             )
+
406
+     @app.post(
+         "/restart_server",
+         tags=["Configuration"],
+         summary="Attempt to restart the server",
+     )
+     async def trigger_server_restart(background_tasks: BackgroundTasks):
+         """
+         Attempts to restart the server process.
+         NOTE: This is highly dependent on how the server is run (e.g., with uvicorn --reload,
+         or managed by systemd/supervisor). A simple exit might just stop the process.
+         This implementation attempts a clean exit, relying on the runner to restart it.
+         """
+         logger.warning("Received request to restart server via API.")
+
+         def _do_restart():
+             time.sleep(1)  # Short delay to allow the response to be sent
+             logger.warning("Attempting clean exit for restart...")
+             # Option 1: Clean exit (relies on Uvicorn reload or a process manager)
+             sys.exit(0)
+             # Option 2: Forceful re-execution (use with caution, might not work as expected)
+             # try:
+             #     logger.warning("Attempting os.execv for restart...")
+             #     os.execv(sys.executable, ['python'] + sys.argv)
+             # except Exception as exec_e:
+             #     logger.error(f"os.execv failed: {exec_e}. Server may not restart automatically.")
+             #     # Fall back to sys.exit if execv fails
+             #     sys.exit(1)
+
+         background_tasks.add_task(_do_restart)
+         return JSONResponse(
+             content={
+                 "message": "Restart signal sent. Server should restart shortly if run with auto-reload."
+             }
+         )
+
+
+ # --- Register Configuration Routes ---
+ register_config_routes(app)
+
+
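To illustrate the configuration routes just registered, a small client sketch (editorial, not part of this file; the host and port are the defaults from .env, and the values are simply the shipped generation defaults):

    import requests

    BASE = "http://localhost:8003"

    # Read the current configuration.
    print(requests.get(f"{BASE}/get_config", timeout=10).json())

    # Persist new UI generation defaults to .env.
    resp = requests.post(
        f"{BASE}/save_generation_defaults",
        json={
            "speed_factor": 0.9,
            "cfg_scale": 3.0,
            "temperature": 1.3,
            "top_p": 0.95,
            "cfg_filter_top_k": 35,
        },
        timeout=10,
    )
    resp.raise_for_status()
    print(resp.json()["message"])  # "Generation defaults saved."
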
446
+ # --- API Endpoints ---
+
+
+ @app.post(
+     "/v1/audio/speech",
+     response_class=StreamingResponse,
+     tags=["TTS Generation"],
+     summary="Generate speech (OpenAI compatible)",
+ )
+ async def openai_tts_endpoint(request: OpenAITTSRequest):
+     """
+     Generates speech audio from text, compatible with the OpenAI TTS API structure.
+     Maps the 'voice' parameter to Dia's voice modes ('S1', 'S2', 'dialogue', or a filename for cloning).
+     """
+     monitor = PerformanceMonitor()
+     monitor.record("Request received")
+     logger.info(
+         f"Received OpenAI request: voice='{request.voice}', speed={request.speed}, format='{request.response_format}'"
+     )
+     logger.debug(f"Input text (start): '{request.input[:100]}...'")
+
+     voice_mode = "single_s1"  # Default if mapping fails
+     clone_ref_file = None
+     ref_path = get_reference_audio_path()
+
+     # --- Map OpenAI 'voice' parameter to Dia's modes ---
+     voice_param = request.voice.strip()
+     if voice_param.lower() == "dialogue":
+         voice_mode = "dialogue"
+     elif voice_param.lower() == "s1":
+         voice_mode = "single_s1"
+     elif voice_param.lower() == "s2":
+         voice_mode = "single_s2"
+     # Check if it looks like a filename for cloning (allow .wav or .mp3)
+     elif voice_param.lower().endswith((".wav", ".mp3")):
+         potential_path = os.path.join(ref_path, voice_param)
+         # Check if the file actually exists in the reference directory
+         if os.path.isfile(potential_path):
+             voice_mode = "clone"
+             clone_ref_file = voice_param  # Use the provided filename
+             logger.info(
+                 f"OpenAI request mapped to clone mode with file: {clone_ref_file}"
+             )
+         else:
+             logger.warning(
+                 f"Reference file '{voice_param}' specified in OpenAI request not found in '{ref_path}'. Defaulting voice mode."
+             )
+             # Fall back to the default 'single_s1' if the file is not found
+     else:
+         logger.warning(
+             f"Unrecognized OpenAI voice parameter '{voice_param}'. Defaulting voice mode to 'single_s1'."
+         )
+         # Fallback for any other value
+
+     monitor.record("Parameters processed")
+
+     try:
+         # Call the core engine function using the mapped parameters
+         result = generate_speech(
+             text=request.input,
+             voice_mode=voice_mode,
+             clone_reference_filename=clone_ref_file,
+             speed_factor=request.speed,  # Pass speed factor for post-processing
+             # Use Dia's configured defaults for other generation params unless mapped
+             max_tokens=None,  # Let Dia use its default unless specified otherwise
+             cfg_scale=get_gen_default_cfg_scale(),  # Use saved defaults
+             temperature=get_gen_default_temperature(),
+             top_p=get_gen_default_top_p(),
+             cfg_filter_top_k=get_gen_default_cfg_filter_top_k(),
+         )
+         monitor.record("Generation complete")
+
+         if result is None:
+             logger.error("Speech generation failed (engine returned None).")
+             raise HTTPException(status_code=500, detail="Speech generation failed.")
+
+         audio_array, sample_rate = result
+
+         if sample_rate != EXPECTED_SAMPLE_RATE:
+             logger.warning(
+                 f"Engine returned sample rate {sample_rate}, but expected {EXPECTED_SAMPLE_RATE}. Encoding might assume {EXPECTED_SAMPLE_RATE}."
+             )
+             # Use EXPECTED_SAMPLE_RATE for encoding, as that is what the model is trained for
+             sample_rate = EXPECTED_SAMPLE_RATE
+
+         # Encode the audio in memory to the requested format
+         encoded_audio = encode_audio(audio_array, sample_rate, request.response_format)
+         monitor.record("Audio encoding complete")
+
+         if encoded_audio is None:
+             logger.error(f"Failed to encode audio to format: {request.response_format}")
+             raise HTTPException(
+                 status_code=500,
+                 detail=f"Failed to encode audio to {request.response_format}",
+             )
+
+         # Determine the correct media type for the response header
+         media_type = "audio/opus" if request.response_format == "opus" else "audio/wav"
+         # Note: OpenAI uses audio/opus, not audio/ogg;codecs=opus, so match OpenAI here.
+
+         logger.info(
+             f"Successfully generated {len(encoded_audio)} bytes in format {request.response_format}"
+         )
+         logger.debug(monitor.report())
+
+         # Stream the encoded audio back to the client
+         return StreamingResponse(io.BytesIO(encoded_audio), media_type=media_type)
+
+     except HTTPException as http_exc:
+         # Re-raise HTTPExceptions directly (e.g., from parameter validation)
+         logger.error(f"HTTP exception during OpenAI request: {http_exc.detail}")
+         raise http_exc
+     except Exception as e:
+         logger.error(f"Error processing OpenAI TTS request: {e}", exc_info=True)
+         logger.debug(monitor.report())
+         # Return a generic server error for unexpected issues
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+
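A minimal client sketch for the OpenAI-compatible endpoint above (an editorial illustration, not part of this file): the field names follow the handler's use of 'request.input', 'request.voice', 'request.speed', and 'request.response_format'; the host, port, and reference filename are assumptions, and OpenAITTSRequest may accept additional fields not shown here.

    import requests

    resp = requests.post(
        "http://localhost:8003/v1/audio/speech",
        json={
            "input": "[S1] Hello from the OpenAI-compatible endpoint. [S2] Hi there!",
            "voice": "dialogue",       # or "S1", "S2", or e.g. "my_ref.wav" to clone
            "response_format": "wav",  # "opus" is the other format handled above
            "speed": 0.9,
        },
        timeout=300,  # generation can take a while on first run
    )
    resp.raise_for_status()
    with open("speech.wav", "wb") as f:
        f.write(resp.content)
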
565
+ @app.post(
+     "/tts",
+     response_class=StreamingResponse,
+     tags=["TTS Generation"],
+     summary="Generate speech (Custom parameters)",
+ )
+ async def custom_tts_endpoint(request: CustomTTSRequest):
+     """
+     Generates speech audio from text using explicit Dia parameters.
+     """
+     monitor = PerformanceMonitor()
+     monitor.record("Request received")
+     logger.info(
+         f"Received custom TTS request: mode='{request.voice_mode}', format='{request.output_format}'"
+     )
+     logger.debug(f"Input text (start): '{request.text[:100]}...'")
+     logger.debug(
+         f"Params: max_tokens={request.max_tokens}, cfg={request.cfg_scale}, temp={request.temperature}, top_p={request.top_p}, speed={request.speed_factor}, top_k={request.cfg_filter_top_k}"
+     )
+
+     clone_ref_file = None
+     if request.voice_mode == "clone":
+         if not request.clone_reference_filename:
+             raise HTTPException(
+                 status_code=400,  # Bad request
+                 detail="Missing 'clone_reference_filename', which is required for clone mode.",
+             )
+         ref_path = get_reference_audio_path()
+         potential_path = os.path.join(ref_path, request.clone_reference_filename)
+         if not os.path.isfile(potential_path):
+             logger.error(
+                 f"Reference audio file not found for clone mode: {potential_path}"
+             )
+             raise HTTPException(
+                 status_code=404,  # Not found
+                 detail=f"Reference audio file not found: {request.clone_reference_filename}",
+             )
+         clone_ref_file = request.clone_reference_filename
+         logger.info(f"Custom request using clone mode with file: {clone_ref_file}")
+
+     monitor.record("Parameters processed")
+
+     try:
+         # Call the core engine function with parameters from the request
+         result = generate_speech(
+             text=request.text,
+             voice_mode=request.voice_mode,
+             clone_reference_filename=clone_ref_file,
+             max_tokens=request.max_tokens,  # Pass the user value or None
+             cfg_scale=request.cfg_scale,
+             temperature=request.temperature,
+             top_p=request.top_p,
+             speed_factor=request.speed_factor,  # For post-processing
+             cfg_filter_top_k=request.cfg_filter_top_k,
+         )
+         monitor.record("Generation complete")
+
+         if result is None:
+             logger.error("Speech generation failed (engine returned None).")
+             raise HTTPException(status_code=500, detail="Speech generation failed.")
+
+         audio_array, sample_rate = result
+
+         if sample_rate != EXPECTED_SAMPLE_RATE:
+             logger.warning(
+                 f"Engine returned sample rate {sample_rate}, expected {EXPECTED_SAMPLE_RATE}. Encoding will use {EXPECTED_SAMPLE_RATE}."
+             )
+             sample_rate = EXPECTED_SAMPLE_RATE
+
+         # Encode the audio in memory
+         encoded_audio = encode_audio(audio_array, sample_rate, request.output_format)
+         monitor.record("Audio encoding complete")
+
+         if encoded_audio is None:
+             logger.error(f"Failed to encode audio to format: {request.output_format}")
+             raise HTTPException(
+                 status_code=500,
+                 detail=f"Failed to encode audio to {request.output_format}",
+             )
+
+         # Determine the media type
+         media_type = "audio/opus" if request.output_format == "opus" else "audio/wav"
+
+         logger.info(
+             f"Successfully generated {len(encoded_audio)} bytes in format {request.output_format}"
+         )
+         logger.debug(monitor.report())
+
+         # Stream the response
+         return StreamingResponse(io.BytesIO(encoded_audio), media_type=media_type)
+
+     except HTTPException as http_exc:
+         logger.error(f"HTTP exception during custom TTS request: {http_exc.detail}")
+         raise http_exc
+     except Exception as e:
+         logger.error(f"Error processing custom TTS request: {e}", exc_info=True)
+         logger.debug(monitor.report())
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+
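Likewise for the custom endpoint, a client sketch with explicit Dia parameters (editorial; the parameter values are the project's shipped defaults, and the host and port are assumptions):

    import requests

    payload = {
        "text": "[S1] Custom endpoint test. (laughs)",
        "voice_mode": "dialogue",  # "clone" also requires clone_reference_filename
        "output_format": "wav",    # or "opus"
        "speed_factor": 0.9,
        "cfg_scale": 3.0,
        "temperature": 1.3,
        "top_p": 0.95,
        "cfg_filter_top_k": 35,
        "max_tokens": None,        # let the engine use its default
    }
    resp = requests.post("http://localhost:8003/tts", json=payload, timeout=300)
    resp.raise_for_status()
    with open("custom_tts.wav", "wb") as f:
        f.write(resp.content)
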
665
+ # --- Web UI Endpoints ---
+
+
+ @app.get("/", response_class=HTMLResponse, include_in_schema=False)
+ async def get_web_ui(request: Request):
+     """Serves the main TTS web interface."""
+     logger.info("Serving TTS Web UI (index.html)")
+     # Get the current list of reference files for the clone dropdown
+     reference_files = get_valid_reference_files()
+     # Get the current server config and default generation params
+     current_config = config_manager.get_all()
+     default_gen_params = {
+         "speed_factor": get_gen_default_speed_factor(),
+         "cfg_scale": get_gen_default_cfg_scale(),
+         "temperature": get_gen_default_temperature(),
+         "top_p": get_gen_default_top_p(),
+         "cfg_filter_top_k": get_gen_default_cfg_filter_top_k(),
+     }
+
+     return templates.TemplateResponse(
+         "index.html",  # Use the renamed file
+         {
+             "request": request,
+             "reference_files": reference_files,
+             "config": current_config,  # Pass current server config
+             "presets": loaded_presets,  # Pass loaded presets
+             "default_gen_params": default_gen_params,  # Pass default gen params
+             # Other variables needed by the template for its initial state
+             "error": None,
+             "success": None,
+             "output_file_url": None,
+             "generation_time": None,
+             "submitted_text": "",
+             "submitted_voice_mode": "dialogue",  # Default to combined mode
+             "submitted_clone_file": None,
+             # Initial generation params are taken from default_gen_params
+         },
+     )
+
+
+ @app.post("/web/generate", response_class=HTMLResponse, include_in_schema=False)
+ async def handle_web_ui_generate(
+     request: Request,
+     text: str = Form(...),
+     voice_mode: Literal["dialogue", "clone"] = Form(...),  # Updated modes
+     clone_reference_select: Optional[str] = Form(None),
+     # Generation parameters from the form
+     speed_factor: float = Form(...),  # Required; use Depends with a default if preferred
+     cfg_scale: float = Form(...),
+     temperature: float = Form(...),
+     top_p: float = Form(...),
+     cfg_filter_top_k: int = Form(...),
+ ):
+     """Handles the generation request from the web UI form."""
+     logger.info(f"Web UI generation request: mode='{voice_mode}'")
+     monitor = PerformanceMonitor()
+     monitor.record("Web request received")
+
+     output_file_url = None
+     generation_time = None
+     error_message = None
+     success_message = None
+     output_filename_base = "dia_output"  # Default base name
+
+     # --- Pre-generation Validation ---
+     if not text.strip():
+         error_message = "Please enter some text to synthesize."
+
+     clone_ref_file = None
+     if voice_mode == "clone":
+         if not clone_reference_select or clone_reference_select == "none":
+             error_message = "Please select a reference audio file for clone mode."
+         else:
+             # Verify the selected file still exists (important if files can be deleted)
+             ref_path = get_reference_audio_path()
+             potential_path = os.path.join(ref_path, clone_reference_select)
+             if not os.path.isfile(potential_path):
+                 error_message = f"Selected reference file '{clone_reference_select}' no longer exists. Please refresh or upload."
+                 # Invalidate the selection
+                 clone_ref_file = None
+                 clone_reference_select = None  # Clear submitted value for re-rendering
+             else:
+                 clone_ref_file = clone_reference_select
+                 logger.info(f"Using selected reference file: {clone_ref_file}")
+
+     # If validation failed, re-render the page with the error and submitted values
+     if error_message:
+         logger.warning(f"Web UI validation error: {error_message}")
+         reference_files = get_valid_reference_files()
+         current_config = config_manager.get_all()
+         default_gen_params = {  # Pass defaults again for consistency
+             "speed_factor": get_gen_default_speed_factor(),
+             "cfg_scale": get_gen_default_cfg_scale(),
+             "temperature": get_gen_default_temperature(),
+             "top_p": get_gen_default_top_p(),
+             "cfg_filter_top_k": get_gen_default_cfg_filter_top_k(),
+         }
+         # Pass back the values the user submitted
+         submitted_gen_params = {
+             "speed_factor": speed_factor,
+             "cfg_scale": cfg_scale,
+             "temperature": temperature,
+             "top_p": top_p,
+             "cfg_filter_top_k": cfg_filter_top_k,
+         }
+
+         return templates.TemplateResponse(
+             "index.html",
+             {
+                 "request": request,
+                 "error": error_message,
+                 "reference_files": reference_files,
+                 "config": current_config,
+                 "presets": loaded_presets,
+                 "default_gen_params": default_gen_params,  # Base defaults
+                 # Submitted values to repopulate the form
+                 "submitted_text": text,
+                 "submitted_voice_mode": voice_mode,
+                 "submitted_clone_file": clone_reference_select,  # Possibly invalidated value
+                 "submitted_gen_params": submitted_gen_params,  # Pass submitted params back
+                 # Ensure other necessary template variables are passed
+                 "success": None,
+                 "output_file_url": None,
+                 "generation_time": None,
+             },
+         )
+
+     # --- Generation ---
+     try:
+         monitor.record("Parameters processed")
+         # Call the core engine function
+         result = generate_speech(
+             text=text,
+             voice_mode=voice_mode,
+             clone_reference_filename=clone_ref_file,
+             speed_factor=speed_factor,
+             cfg_scale=cfg_scale,
+             temperature=temperature,
+             top_p=top_p,
+             cfg_filter_top_k=cfg_filter_top_k,
+             max_tokens=None,  # Use the model default for UI simplicity
+         )
+         monitor.record("Generation complete")
+
+         if result:
+             audio_array, sample_rate = result
+             output_path_base = get_output_path()
+             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+             # Create a more descriptive filename
+             mode_tag = voice_mode
+             if voice_mode == "clone" and clone_ref_file:
+                 safe_ref_name = sanitize_filename(os.path.splitext(clone_ref_file)[0])
+                 mode_tag = f"clone_{safe_ref_name[:20]}"  # Limit length
+             output_filename = f"{mode_tag}_{timestamp}.wav"  # Always save as WAV for simplicity
+             output_filepath = os.path.join(output_path_base, output_filename)
+
+             # Save the audio to a WAV file
+             saved = save_audio_to_file(audio_array, sample_rate, output_filepath)
+             monitor.record("Audio saved")
+
+             if saved:
+                 output_file_url = f"/outputs/{output_filename}"  # URL path for browser access
+                 generation_time = monitor.events[-1][1] - monitor.start_time  # Time until save completed
+                 success_message = "Audio generated successfully!"
+                 logger.info(f"Web UI generated audio saved to: {output_filepath}")
+             else:
+                 error_message = "Failed to save generated audio file."
+                 logger.error("Failed to save audio file from web UI request.")
+         else:
+             error_message = "Speech generation failed (engine returned None)."
+             logger.error("Speech generation failed for web UI request.")
+
+     except Exception as e:
+         logger.error(f"Error processing web UI TTS request: {e}", exc_info=True)
+         error_message = f"An unexpected error occurred: {str(e)}"
+
+     logger.debug(monitor.report())
+
+     # --- Re-render Template with Results ---
+     reference_files = get_valid_reference_files()
+     current_config = config_manager.get_all()
+     default_gen_params = {
+         "speed_factor": get_gen_default_speed_factor(),
+         "cfg_scale": get_gen_default_cfg_scale(),
+         "temperature": get_gen_default_temperature(),
+         "top_p": get_gen_default_top_p(),
+         "cfg_filter_top_k": get_gen_default_cfg_filter_top_k(),
+     }
+     # Pass back submitted values to repopulate the form correctly
+     submitted_gen_params = {
+         "speed_factor": speed_factor,
+         "cfg_scale": cfg_scale,
+         "temperature": temperature,
+         "top_p": top_p,
+         "cfg_filter_top_k": cfg_filter_top_k,
+     }
+
+     return templates.TemplateResponse(
+         "index.html",
+         {
+             "request": request,
+             "error": error_message,
+             "success": success_message,
+             "output_file_url": output_file_url,
+             "generation_time": f"{generation_time:.2f}" if generation_time else None,
+             "reference_files": reference_files,
+             "config": current_config,
+             "presets": loaded_presets,
+             "default_gen_params": default_gen_params,  # Base defaults
+             # Pass back submitted values
+             "submitted_text": text,
+             "submitted_voice_mode": voice_mode,
+             "submitted_clone_file": clone_ref_file,  # Pass the validated filename back
+             "submitted_gen_params": submitted_gen_params,  # Pass submitted params back
+         },
+     )
+
+
889
+ # --- Reference Audio Upload Endpoint ---
+ @app.post(
+     "/upload_reference", tags=["Web UI Helpers"], summary="Upload reference audio files"
+ )
+ async def upload_reference_audio(files: List[UploadFile] = File(...)):
+     """Handles uploading of reference audio files (.wav, .mp3) for voice cloning."""
+     logger.info(f"Received request to upload {len(files)} reference audio file(s).")
+     ref_path = get_reference_audio_path()
+     uploaded_filenames = []
+     errors = []
+     allowed_mime_types = [
+         "audio/wav",
+         "audio/mpeg",
+         "audio/x-wav",
+     ]  # Common WAV/MP3 types
+     allowed_extensions = [".wav", ".mp3"]
+
+     for file in files:
+         try:
+             # Basic validation
+             if not file.filename:
+                 errors.append("Received file with no filename.")
+                 continue
+
+             # Sanitize the filename
+             safe_filename = sanitize_filename(file.filename)
+             _, ext = os.path.splitext(safe_filename)
+             if ext.lower() not in allowed_extensions:
+                 errors.append(
+                     f"File '{file.filename}' has unsupported extension '{ext}'. Allowed: {allowed_extensions}"
+                 )
+                 continue
+
+             # Check the client-reported MIME type as an additional filter on top of the extension
+             if file.content_type not in allowed_mime_types:
+                 errors.append(
+                     f"File '{file.filename}' has unsupported content type '{file.content_type}'. Allowed: {allowed_mime_types}"
+                 )
+                 continue
+
+             # Construct the full save path
+             destination_path = os.path.join(ref_path, safe_filename)
+
+             # Prevent overwriting existing files (optional; a counter suffix would also work)
+             if os.path.exists(destination_path):
+                 # Simple approach: skip if the file exists
+                 logger.warning(
+                     f"Reference file '{safe_filename}' already exists. Skipping upload."
+                 )
+                 # Add to the list so the UI knows it's available, even if not newly uploaded this time
+                 if safe_filename not in uploaded_filenames:
+                     uploaded_filenames.append(safe_filename)
+                 continue
+                 # Alternative: add a counter, e.g. file_1.wav, file_2.wav
+
+             # Save the file using shutil.copyfileobj for efficiency with large files
+             try:
+                 with open(destination_path, "wb") as buffer:
+                     shutil.copyfileobj(file.file, buffer)
+                 logger.info(f"Successfully saved reference file: {destination_path}")
+                 uploaded_filenames.append(safe_filename)
+             except Exception as save_exc:
+                 errors.append(f"Failed to save file '{safe_filename}': {save_exc}")
+                 logger.error(
+                     f"Failed to save uploaded file '{safe_filename}' to '{destination_path}': {save_exc}",
+                     exc_info=True,
+                 )
+             finally:
+                 # Ensure the UploadFile resource is closed
+                 await file.close()
+
+         except Exception as e:
+             errors.append(
+                 f"Error processing file '{getattr(file, 'filename', 'unknown')}': {e}"
+             )
+             logger.error(
+                 f"Unexpected error processing uploaded file: {e}", exc_info=True
+             )
+             # Ensure the file is closed even if other errors occur
+             if file:
+                 await file.close()
+
+     # Get the updated list of all valid files in the directory
+     updated_file_list = get_valid_reference_files()
+
+     response_data = {
+         "message": f"Processed {len(files)} file(s).",
+         "uploaded_files": uploaded_filenames,  # New files successfully saved in this request
+         "all_reference_files": updated_file_list,  # Complete current list
+         "errors": errors,
+     }
+
+     status_code = (
+         200 if not errors or len(errors) < len(files) else 400
+     )  # OK if at least one file succeeded, else Bad Request
+     if errors:
+         logger.warning(f"Upload completed with errors: {errors}")
+
+     return JSONResponse(content=response_data, status_code=status_code)
+
+
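A sketch of uploading a reference clip to the endpoint above (editorial; the multipart field must be named 'files' to match the List[UploadFile] parameter, and the filename, host, and port are assumptions):

    import requests

    with open("my_voice.wav", "rb") as f:
        resp = requests.post(
            "http://localhost:8003/upload_reference",
            files=[("files", ("my_voice.wav", f, "audio/wav"))],
            timeout=60,
        )
    resp.raise_for_status()
    print(resp.json()["all_reference_files"])  # current contents of reference_audio/
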
990
+ # --- Health Check Endpoint ---
+ @app.get("/health", tags=["Server Status"], summary="Check server health")
+ async def health_check():
+     """Basic health check; indicates whether the server is running and the model is loaded."""
+     # Access the MODEL_LOADED variable *directly* from the engine module each
+     # time the endpoint is called, so we always report the current status.
+     current_model_status = getattr(engine, "MODEL_LOADED", False)  # Safely get status
+     logger.debug(
+         f"Health check returning model_loaded status: {current_model_status}"
+     )
+     return {"status": "healthy", "model_loaded": current_model_status}
+
+
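Because model loading happens in the lifespan hook, scripts can poll /health as a readiness gate before sending generation requests. A small sketch (editorial; host and port assumed):

    import time

    import requests

    while True:
        try:
            status = requests.get("http://localhost:8003/health", timeout=5).json()
            if status.get("model_loaded"):
                break
        except requests.RequestException:
            pass  # server not up yet
        time.sleep(2)
    print("Server ready.")
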
1003
+ # --- Main Execution ---
+ if __name__ == "__main__":
+     host = get_host()
+     port = get_port()
+     logger.info(f"Starting Dia TTS server on {host}:{port}")
+     logger.info(f"Model Repository: {get_model_repo_id()}")
+     logger.info(f"Model Config File: {get_model_config_filename()}")
+     logger.info(f"Model Weights File: {get_model_weights_filename()}")
+     logger.info(f"Model Cache Path: {get_model_cache_path()}")
+     logger.info(f"Reference Audio Path: {get_reference_audio_path()}")
+     logger.info(f"Output Path: {get_output_path()}")
+     # Determine the host to display in logs and use for browser opening
+     display_host = "localhost" if host == "0.0.0.0" else host
+     logger.info(f"Web UI will be available at http://{display_host}:{port}/")
+     logger.info(f"API Docs available at http://{display_host}:{port}/docs")
+
+     # Ensure the UI directory and index.html exist
+     ui_dir = "ui"
+     index_file = os.path.join(ui_dir, "index.html")
+     if not os.path.isdir(ui_dir) or not os.path.isfile(index_file):
+         logger.warning(
+             f"'{ui_dir}' directory or '{index_file}' not found. Web UI may not work."
+         )
+         # Optionally create dummy files/dirs if needed for startup
+         os.makedirs(ui_dir, exist_ok=True)
+         if not os.path.isfile(index_file):
+             try:
+                 with open(index_file, "w") as f:
+                     f.write(
+                         "<html><body>Web UI template missing. See project source for index.html.</body></html>"
+                     )
+                 logger.info(f"Created dummy {index_file}.")
+             except Exception as e:
+                 logger.error(f"Failed to create dummy {index_file}: {e}")
+
+     # The module-level 'startup_complete_event' (created near the top of this
+     # file) is set by the lifespan manager once startup, including model
+     # loading, is complete; there is no need to re-create it here.
+
+     # Run the Uvicorn server.
+     # The lifespan context manager ('lifespan="on"') runs during startup and is
+     # responsible for loading the model and setting 'startup_complete_event'.
+     uvicorn.run(
+         "server:app",  # Use the format 'module:app_instance'
+         host=host,
+         port=port,
+         reload=False,  # Set reload as needed for development/production
+         # reload_dirs=[".", "ui"],  # Only use reload=True with reload_dirs/includes for development
+         # reload_includes=[
+         #     "*.py",
+         #     "*.html",
+         #     "*.css",
+         #     "*.js",
+         #     ".env",
+         #     "*.yaml",
+         # ],
+         lifespan="on",  # Use the lifespan context manager defined in this file
+         # workers=1  # Keep workers=1 when using reload=True or complex global state/models
+     )
ui/index.html ADDED
@@ -0,0 +1,916 @@
+ <!DOCTYPE html>
2
+ <html lang="en" class="dark"> <!-- Default to dark mode class -->
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Dia TTS Server | Text-to-Dialogue</title>
8
+ <link rel="icon" href="/static/favicon.ico" type="image/x-icon">
9
+ <!-- Tailwind CSS (CDN for simplicity, processes styles in <style type="text/tailwindcss"> below) -->
10
+ <script src="https://cdn.tailwindcss.com"></script>
11
+ <script>
12
+ // Configure Tailwind CSS
13
+ tailwind.config = {
14
+ darkMode: 'class', // Enable class-based dark mode
15
+ theme: {
16
+ extend: {
17
+ colors: {
18
+ // Define color palettes used in style.css
19
+ // Light Mode Colors (Examples - Adjust as needed)
20
+ gray: { 50: '#f9fafb', 100: '#f3f4f6', 200: '#e5e7eb', 300: '#d1d5db', 400: '#9ca3af', 500: '#6b7280', 600: '#4b5563', 700: '#374151', 800: '#1f2937', 900: '#111827' },
21
+ sky: { 50: '#f0f9ff', 100: '#e0f2fe', 200: '#bae6fd', 300: '#7dd3fc', 400: '#38bdf8', 500: '#0ea5e9', 600: '#0284c7', 700: '#0369a1', 800: '#075985', 900: '#0c4a6e' },
22
+ indigo: { 50: '#eef2ff', 100: '#e0e7ff', 200: '#c7d2fe', 300: '#a5b4fc', 400: '#818cf8', 500: '#6366f1', 600: '#4f46e5', 700: '#4338ca', 800: '#3730a3', 900: '#312e81' },
23
+ red: { 100: '#fee2e2', 300: '#fca5a5', 500: '#ef4444', 600: '#dc2626', 800: '#991b1b', 900: '#7f1d1d' },
24
+ green: { 100: '#dcfce7', 300: '#86efac', 500: '#22c55e', 800: '#166534', 900: '#14532d' },
25
+ yellow: { 100: '#fef9c3', 300: '#fcd34d', 500: '#eab308', 700: '#b45309', 900: '#78350f' },
26
+
27
+ // Dark Mode Colors (Copied from previous inline config)
28
+ primary: { 50: '#f0f9ff', 100: '#e0f2fe', 200: '#bae6fd', 300: '#7dd3fc', 400: '#38bdf8', 500: '#0ea5e9', 600: '#0284c7', 700: '#0369a1', 800: '#075985', 900: '#0c4a6e' },
29
+ purple: { 50: '#faf5ff', 100: '#f3e8ff', 200: '#e9d5ff', 300: '#d8b4fe', 400: '#c084fc', 500: '#a855f7', 600: '#9333ea', 700: '#7e22ce', 800: '#6b21a8', 900: '#581c87' },
30
+ dark: { 50: '#f9fafb', 100: '#f3f4f6', 200: '#e5e7eb', 300: '#d1d5db', 400: '#9ca3af', 500: '#6b7280', 600: '#4b5563', 700: '#374151', 800: '#1f2937', 900: '#111827', 950: '#030712', 1000: '#0f1729' }
31
+ }
32
+ }
33
+ }
34
+ }
35
+ </script>
36
+ <!-- Removed External Stylesheet Link: <link rel="stylesheet" href="/ui/style.css"> -->
37
+ <!-- Wavesurfer for audio visualization -->
38
+ <script src="https://unpkg.com/wavesurfer.js@7"></script>
39
+
40
+ <style type="text/tailwindcss">
41
+ /* ui/style.css */
42
+
43
+ /* Import Tailwind base, components, and utilities */
44
+ @tailwind base;
45
+ @tailwind components;
46
+ @tailwind utilities;
47
+
48
+ /* Define custom components/utilities */
49
+ @layer components {
50
+
51
+ /* Base styles (Light Mode) */
52
+ .body-base {
53
+ @apply h-full bg-gray-100 text-gray-900;
54
+ }
55
+
56
+ .nav-base {
57
+ @apply bg-gradient-to-r from-white to-sky-100 border-b border-sky-200 shadow-md;
58
+ }
59
+
60
+ .nav-link {
61
+ @apply text-sky-700 hover:text-sky-900 px-3 py-2 rounded-md text-sm font-medium;
62
+ }
63
+
64
+ .title-link {
65
+ @apply text-gray-900 text-xl font-bold;
66
+ }
67
+
68
+ .card-base {
69
+ @apply bg-white shadow-lg rounded-lg overflow-hidden border border-gray-200;
70
+ }
71
+
72
+ .card-header {
73
+ @apply text-lg font-medium text-gray-900 mb-4;
74
+ }
75
+
76
+ .card-footer {
77
+ @apply bg-gray-50 px-6 py-4 flex items-center justify-between border-t border-gray-200;
78
+ }
79
+
80
+ .label-base {
81
+ @apply block text-sm font-medium text-gray-700 mb-1;
82
+ }
83
+
84
+ .input-base {
85
+ @apply block w-full rounded-md border-gray-300 shadow-sm focus:border-sky-500 focus:ring-sky-500 sm:text-sm px-3 py-2 bg-white text-gray-900 placeholder-gray-400;
86
+ }
87
+
88
+ .textarea-base {
89
+ @apply input-base;
90
+ /* Inherit base input styles */
91
+ }
92
+
93
+ .select-base {
94
+ @apply input-base appearance-none pr-8;
95
+ /* Add padding for arrow */
96
+ /* Consider adding a background SVG for the dropdown arrow */
97
+ }
98
+
99
+ .button-base {
100
+ @apply inline-flex items-center justify-center px-4 py-2 border border-transparent rounded-md shadow-sm text-sm font-medium focus:outline-none focus:ring-2 focus:ring-offset-2 transition-colors disabled:opacity-50 disabled:cursor-not-allowed whitespace-nowrap flex-shrink-0;
101
+ /* Added whitespace-nowrap and flex-shrink-0 for button text */
102
+ }
103
+
104
+ .btn-primary {
105
+ @apply button-base bg-sky-600 text-white hover:bg-sky-700 focus:ring-sky-500;
106
+ }
107
+
108
+ .btn-secondary {
109
+ @apply button-base bg-gray-200 text-gray-700 border-gray-300 hover:bg-gray-300 focus:ring-indigo-500;
110
+ /* Example secondary */
111
+ }
112
+
113
+ .btn-danger {
114
+ @apply button-base bg-red-600 text-white hover:bg-red-700 focus:ring-red-500;
115
+ }
116
+
117
+ .btn-purple {
118
+ @apply button-base bg-purple-600 text-white hover:bg-purple-700 focus:ring-purple-500;
119
+ }
120
+
121
+ .slider-base {
122
+ @apply w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer;
123
+ /* Need to style the thumb separately per browser */
124
+ }
125
+
126
+ .slider-thumb {
127
+ /* Basic thumb styling */
128
+ @apply appearance-none w-5 h-5 bg-sky-600 rounded-full cursor-pointer;
129
+ }
130
+
131
+ .radio-label {
132
+ @apply flex items-center space-x-2 cursor-pointer border border-gray-300 bg-white hover:border-sky-400 p-3 rounded-md transition-colors;
133
+ }
134
+
135
+ .radio-label-text {
136
+ @apply text-gray-700;
137
+ }
138
+
139
+ /* Apply checked styles directly using peer-checked utility on the container/text span */
140
+ /* .radio-label input:checked+span {
141
+ @apply text-sky-600 font-semibold;
142
+ }
143
+
144
+ .radio-label-checked {
145
+ @apply border-sky-500 ring-2 ring-sky-500;
146
+ } */
147
+ /* Replaced these custom classes with Tailwind peer utilities in the HTML */
148
+
149
+
150
+ .preset-button {
151
+ @apply button-base bg-indigo-100 text-indigo-700 border-indigo-200 hover:bg-indigo-200 focus:ring-indigo-500 text-xs px-3 py-1;
152
+ }
153
+
154
+ .notification-base {
155
+ @apply px-4 py-3 rounded relative shadow-md flex items-center mb-3;
156
+ /* Reduced margin bottom */
157
+ }
158
+
159
+ .notification-success {
160
+ @apply notification-base bg-green-100 border border-green-300 text-green-800;
161
+ }
162
+
163
+ .notification-error {
164
+ @apply notification-base bg-red-100 border border-red-300 text-red-800;
165
+ }
166
+
167
+ .notification-warning {
168
+ @apply notification-base bg-yellow-100 border border-yellow-300 text-yellow-800;
169
+ }
170
+
171
+ .notification-info {
172
+ /* Added info style */
173
+ @apply notification-base bg-sky-100 border border-sky-300 text-sky-800;
174
+ }
175
+
176
+ .code-inline {
177
+ @apply bg-gray-200 px-1 rounded text-sm font-mono text-gray-800;
178
+ }
179
+
180
+ .tooltip {
181
+ /* Basic tooltip styling */
182
+ @apply absolute hidden group-hover:block bg-gray-700 text-white text-xs rounded py-1 px-2 z-10 -mt-8;
183
+ }
184
+
185
+ .loading-overlay-base {
186
+ @apply fixed inset-0 bg-gray-600 bg-opacity-75 flex items-center justify-center z-50 transition-opacity duration-300;
187
+ }
188
+
189
+ .loading-box-base {
190
+ @apply bg-white p-6 rounded-lg shadow-xl flex flex-col items-center border border-gray-300;
191
+ }
192
+
193
+ .loading-spinner {
194
+ @apply animate-spin h-10 w-10 text-sky-600 mb-4;
195
+ }
196
+
197
+ .loading-text {
198
+ @apply text-gray-900 text-lg mb-2;
199
+ }
200
+
201
+ .loading-status {
202
+ @apply text-gray-600 text-sm mb-4 text-center max-w-xs;
203
+ /* Limit width */
204
+ }
205
+
206
+ .waveform-container {
207
+ @apply w-full h-24 bg-gray-100 rounded;
208
+ }
209
+
210
+ .audio-player-card {
211
+ @apply card-base mt-8;
212
+ /* Margin top for spacing */
213
+ }
214
+
215
+ .audio-player-controls {
216
+ @apply flex flex-wrap items-center justify-between gap-4;
217
+ }
218
+
219
+ .audio-player-buttons {
220
+ @apply flex items-center space-x-2 sm:space-x-4;
221
+ /* Adjust spacing */
222
+ }
223
+
224
+ .audio-player-info {
225
+ @apply text-sm text-gray-600 text-right;
226
+ }
227
+
228
+ .theme-switch {
229
+ @apply p-2 rounded-md text-gray-600 hover:bg-gray-200 hover:text-gray-800 focus:outline-none focus:ring-2 focus:ring-sky-500 focus:ring-offset-2;
230
+ }
231
+
232
+
233
+ /* Dark Mode Overrides using 'dark:' prefix */
234
+ .dark .body-base {
235
+ @apply bg-[#0f1729] text-white;
236
+ /* Original dark bg */
237
+ }
238
+
239
+ .dark .nav-base {
240
+ @apply bg-gradient-to-r from-dark-900 to-purple-900 border-b border-purple-800 shadow-lg;
241
+ }
242
+
243
+ .dark .nav-link {
244
+ @apply text-primary-300 hover:text-white;
245
+ }
246
+
247
+ .dark .title-link {
248
+ @apply text-white;
249
+ }
250
+
251
+ .dark .card-base {
252
+ @apply bg-dark-800 border border-dark-700;
253
+ }
254
+
255
+ .dark .card-header {
256
+ @apply text-white;
257
+ }
258
+
259
+ .dark .card-footer {
260
+ @apply bg-dark-900 border-t border-dark-700;
261
+ }
262
+
263
+ .dark .label-base {
264
+ @apply text-gray-300;
265
+ /* Lighter gray for dark */
266
+ }
267
+
268
+ .dark .input-base {
269
+ @apply border-dark-600 bg-dark-700 text-white placeholder-gray-500 focus:ring-offset-dark-800;
270
+ }
271
+
272
+ .dark .select-base {
273
+ /* Dark mode arrow styling if needed */
274
+ }
275
+
276
+ .dark .btn-primary {
277
+ @apply bg-primary-600 text-white hover:bg-primary-700 focus:ring-primary-500 focus:ring-offset-dark-800;
278
+ }
279
+
280
+ .dark .btn-secondary {
281
+ @apply bg-dark-700 text-white border-dark-600 hover:bg-dark-600 focus:ring-purple-500 focus:ring-offset-dark-800;
282
+ }
283
+
284
+ .dark .btn-danger {
285
+ @apply bg-red-600 text-white hover:bg-red-700 focus:ring-red-500 focus:ring-offset-dark-800;
286
+ }
287
+
288
+ .dark .btn-purple {
289
+ @apply bg-purple-600 text-white hover:bg-purple-700 focus:ring-purple-500 focus:ring-offset-dark-800;
290
+ }
291
+
292
+ .dark .slider-base {
293
+ @apply bg-dark-600;
294
+ }
295
+
296
+ .dark .slider-thumb {
297
+ @apply bg-primary-500;
298
+ }
299
+
300
+ .dark .radio-label {
301
+ @apply border-dark-600 bg-dark-800 hover:border-primary-400;
302
+ }
303
+
304
+ .dark .radio-label-text {
305
+ @apply text-gray-300;
306
+ }
307
+
308
+ /* Apply checked styles directly using peer-checked utility on the container/text span */
309
+ /* .dark .radio-label input:checked+span {
310
+ @apply text-primary-400;
311
+ }
312
+
313
+ .dark .radio-label-checked {
314
+ @apply border-primary-500 ring-primary-500;
315
+ } */
316
+ /* Replaced these custom classes with Tailwind peer utilities in the HTML */
317
+
318
+
319
+ .dark .preset-button {
320
+ @apply bg-indigo-900 text-indigo-200 border-indigo-700 hover:bg-indigo-800 focus:ring-indigo-500 focus:ring-offset-dark-800;
321
+ }
322
+
323
+ .dark .notification-success {
324
+ @apply notification-base bg-green-900 border border-green-700 text-green-100;
325
+ }
326
+
327
+ .dark .notification-error {
328
+ @apply notification-base bg-red-900 border border-red-700 text-red-100;
329
+ }
330
+
331
+ .dark .notification-warning {
332
+ @apply notification-base bg-yellow-900 border border-yellow-700 text-yellow-100;
333
+ }
334
+
335
+ .dark .notification-info {
336
+ /* Added info style */
337
+ @apply notification-base bg-sky-900 border border-sky-700 text-sky-100;
338
+ }
339
+
340
+ .dark .code-inline {
341
+ @apply bg-dark-900 text-purple-300;
342
+ }
343
+
344
+ .dark .tooltip {
345
+ @apply bg-dark-950;
346
+ }
347
+
348
+ .dark .loading-overlay-base {
349
+ @apply bg-dark-900 bg-opacity-75;
350
+ }
351
+
352
+ .dark .loading-box-base {
353
+ @apply bg-dark-800 border border-dark-700;
354
+ }
355
+
356
+ .dark .loading-spinner {
357
+ @apply text-primary-500;
358
+ }
359
+
360
+ .dark .loading-text {
361
+ @apply text-white;
362
+ }
363
+
364
+ .dark .loading-status {
365
+ @apply text-gray-400;
366
+ }
367
+
368
+ .dark .waveform-container {
369
+ @apply bg-dark-900;
370
+ }
371
+
372
+ .dark .audio-player-info {
373
+ @apply text-purple-300;
374
+ }
375
+
376
+ .dark .theme-switch {
377
+ @apply text-gray-400 hover:bg-dark-700 hover:text-white focus:ring-offset-dark-900;
378
+ }
379
+
380
+ }
381
+
382
+ /* Specific slider thumb styling per browser */
383
+ /* Apply these within the <style> tag as they target pseudo-elements */
384
+ input[type="range"].slider-base::-webkit-slider-thumb {
385
+ @apply slider-thumb;
386
+ }
387
+
388
+ input[type="range"].slider-base::-moz-range-thumb {
389
+ @apply slider-thumb;
390
+ }
391
+
392
+ /* Dark mode thumbs need specific overrides if needed */
393
+ .dark input[type="range"].slider-base::-webkit-slider-thumb {
394
+ /* Apply dark mode thumb styles directly */
395
+ background-color: theme('colors.primary.500');
396
+ /* Replaced @apply dark:slider-thumb */
397
+ /* Inherit other base thumb styles if needed (like size, border-radius) or re-apply */
398
+ @apply appearance-none w-5 h-5 rounded-full cursor-pointer;
399
+ }
400
+
401
+ .dark input[type="range"].slider-base::-moz-range-thumb {
402
+ /* Apply dark mode thumb styles directly */
403
+ background-color: theme('colors.primary.500');
404
+ /* Replaced @apply dark:slider-thumb */
405
+ /* Inherit other base thumb styles if needed or re-apply */
406
+ @apply appearance-none w-5 h-5 rounded-full cursor-pointer;
407
+ }
408
+ </style>
409
+ </head>
410
+
411
+ <body class="body-base">
412
+ <div class="min-h-full">
413
+ <!-- Navigation -->
414
+ <nav class="nav-base">
415
+ <div class="mx-auto max-w-7xl px-4 sm:px-6 lg:px-8">
416
+ <div class="flex h-16 items-center justify-between">
417
+ <div class="flex items-center">
418
+ <div class="flex-shrink-0">
419
+ <!-- Make title clickable -->
420
+ <a href="/" class="title-link">Dia TTS Server</a>
421
+ </div>
422
+ </div>
423
+ <div class="flex items-center space-x-2 sm:space-x-4">
424
+ <a href="/docs" target="_blank" class="nav-link">API Docs</a>
425
+ <!-- Theme Toggle Button -->
426
+ <button id="theme-toggle-btn" type="button"
427
+ class="relative inline-flex items-center p-1 rounded-full bg-gray-200 dark:bg-dark-700 h-8 w-16 transition-colors"
428
+ title="Toggle light/dark mode">
429
+ <span class="sr-only">Toggle theme</span>
430
+ <span class="absolute inset-0 rounded-full transition-colors"></span>
431
+ <!-- Toggle thumb with icons -->
432
+ <span
433
+ class="relative rounded-full w-6 h-6 bg-white dark:bg-purple-600 transform transition-transform duration-200 ease-in-out translate-x-0 dark:translate-x-8 flex items-center justify-center shadow-md">
434
+ <!-- Sun icon (for light mode) -->
435
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor"
436
+ class="w-4 h-4 text-yellow-500 dark:opacity-0 transition-opacity">
437
+ <path
438
+ d="M10 2a.75.75 0 0 1 .75.75v1.5a.75.75 0 0 1-1.5 0v-1.5A.75.75 0 0 1 10 2ZM10 15a.75.75 0 0 1 .75.75v1.5a.75.75 0 0 1-1.5 0v-1.5A.75.75 0 0 1 10 15ZM10 7a3 3 0 1 0 0 6 3 3 0 0 0 0-6Z" />
439
+ </svg>
440
+ <!-- Moon icon (for dark mode) -->
441
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor"
442
+ class="w-4 h-4 text-white opacity-0 dark:opacity-100 transition-opacity">
443
+ <path
444
+ d="M7.455 1.75A8.5 8.5 0 0 1 18.25 12.55a8.5 8.5 0 0 1-8.46 8.46A8.5 8.5 0 0 1 1.75 12.55a8.5 8.5 0 0 1 5.705-10.8Z" />
445
+ </svg>
446
+ </span>
447
+ </button>
448
+ </div>
449
+ </div>
450
+ </div>
451
+ </nav>
452
+
453
+ <!-- Main content -->
454
+ <main>
455
+ <div class="mx-auto max-w-7xl px-4 py-8 sm:px-6 lg:px-8">
456
+
457
+ <!-- Notification area -->
458
+ <div id="notification-area" class="mb-6 space-y-3">
459
+ {% if error %}
460
+ <div class="notification-error" role="alert">
461
+ <svg class="h-5 w-5 text-red-500 mr-2 flex-shrink-0" viewBox="0 0 20 20" fill="currentColor">
462
+ <path fill-rule="evenodd"
463
+ d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z"
464
+ clip-rule="evenodd" />
465
+ </svg>
466
+ <span class="block sm:inline">{{ error }}</span>
467
+ </div>
468
+ {% endif %}
469
+ {% if success %}
470
+ <div class="notification-success" role="alert">
471
+ <svg class="h-5 w-5 text-green-500 mr-2 flex-shrink-0" viewBox="0 0 20 20" fill="currentColor">
472
+ <path fill-rule="evenodd"
473
+ d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z"
474
+ clip-rule="evenodd" />
475
+ </svg>
476
+ <span class="block sm:inline">{{ success }}</span>
477
+ </div>
478
+ {% endif %}
479
+ </div>
480
+
481
+ <!-- TTS form -->
482
+ <div class="card-base">
483
+ <form id="tts-form" action="/web/generate" method="post" class="flex flex-col">
484
+ <div class="p-6">
485
+ <h2 class="card-header">Generate Speech with Dia</h2>
486
+
487
+ <!-- Text input -->
488
+ <div class="mb-6">
489
+ <label for="text" class="label-base">Text to speak</label>
490
+ <p class="text-xs text-purple-500 dark:text-purple-300 mb-2">
491
+ Use <code class="code-inline">[S1]</code> and <code class="code-inline">[S2]</code>
492
+ tags for speaker turns. Add non-verbals like <code
493
+ class="code-inline">(laughs)</code>.
494
+ </p>
495
+ <div class="relative">
496
+ <textarea name="text" id="text" rows="5" maxlength="8192" class="textarea-base"
497
+ placeholder="Example: [S1] Hello there! [S2] Hi! How are you? [S1] I'm doing well, thanks. (laughs)"
498
+ required>{{ submitted_text if submitted_text else "" }}</textarea>
499
+ <div class="absolute bottom-2 right-2 text-xs text-gray-500 dark:text-purple-300">
500
+ <span id="char-count">0</span> / 8192
501
+ </div>
502
+ </div>
503
+ </div>
504
+
505
+ <!-- Voice Mode Selection -->
506
+ <div class="mb-6">
507
+ <label class="label-base mb-2">Voice Mode</label>
508
+ <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
509
+ <!-- Combined Dialogue / Single Speaker Mode -->
510
+ <label
511
+ class="radio-label peer-checked:border-sky-500 peer-checked:dark:border-primary-500 peer-checked:ring-2 peer-checked:ring-sky-500 peer-checked:dark:ring-primary-500">
512
+ <input type="radio" name="voice_mode" value="dialogue" class="hidden peer" {% if
513
+ submitted_voice_mode=='dialogue' or not submitted_voice_mode %}checked{%
514
+ endif %} onchange="toggleCloneOptions()">
515
+ <span
516
+ class="radio-label-text peer-checked:text-sky-600 dark:peer-checked:text-primary-400 peer-checked:font-semibold">
517
+ Single / Dialogue (Use [S1]/[S2])
518
+ </span>
519
+ </label>
520
+ <!-- Clone Mode -->
521
+ <label
522
+ class="radio-label peer-checked:border-sky-500 peer-checked:dark:border-primary-500 peer-checked:ring-2 peer-checked:ring-sky-500 peer-checked:dark:ring-primary-500">
523
+ <input type="radio" name="voice_mode" value="clone" class="hidden peer" {% if
524
+ submitted_voice_mode=='clone' %}checked{% endif %}
525
+ onchange="toggleCloneOptions()">
526
+ <span
527
+ class="radio-label-text peer-checked:text-sky-600 dark:peer-checked:text-primary-400 peer-checked:font-semibold">
528
+ Voice Clone (from Reference)
529
+ </span>
530
+ </label>
531
+ </div>
532
+ </div>
533
+
534
+ <!-- Presets Section -->
535
+ <div class="mb-6">
536
+ <label class="label-base mb-2">Load Example Preset</label>
537
+ <div id="presets-container" class="flex flex-wrap gap-2">
538
+ {% if presets %}
539
+ {% for preset in presets %}
540
+ <button type="button" id="preset-btn-{{ loop.index0 }}" class="preset-button"
541
+ title="Load '{{ preset.name }}' text and settings">
542
+ {{ preset.name }}
543
+ </button>
544
+ {% endfor %}
545
+ {% else %}
546
+ <p class="text-sm text-gray-500 dark:text-gray-400">No presets loaded. Check
547
+ presets.yaml.</p>
548
+ {% endif %}
549
+ </div>
550
+ </div>
551
+
552
+
553
+ <!-- Clone Options (Hidden by default) -->
554
+ <div id="clone-options" class="mb-6 hidden">
555
+ <label for="clone_reference_select" class="label-base">Reference Audio File</label>
556
+ <p class="text-xs text-purple-500 dark:text-purple-300 mb-2">
557
+ Select a <code class="code-inline">.wav</code> or <code
558
+ class="code-inline">.mp3</code> file from the <code
559
+ class="code-inline">reference_audio</code> folder.
560
+ <strong class="dark:text-yellow-300 text-yellow-600">Important:</strong> Prepend the
561
+ exact transcript of this audio to your text input above for best results.
562
+ </p>
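+ <!-- Hypothetical clone-mode input, illustrating the note above:
+      "[S1] Exact transcript of the selected reference audio. [S1] New sentence to speak in that voice." -->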
563
+ <div class="flex items-center gap-2">
564
+ <select id="clone_reference_select" name="clone_reference_select"
565
+ class="select-base flex-grow">
566
+ <option value="none" {% if not submitted_clone_file %}selected{% endif %}>--
567
+ Select Reference File --</option>
568
+ {% for filename in reference_files %}
569
+ <option value="{{ filename }}" {% if submitted_clone_file==filename %}selected{%
570
+ endif %}>{{ filename }}</option>
571
+ {% endfor %}
572
+ </select>
573
+ <!-- Hidden file input triggered by the button -->
574
+ <input type="file" id="clone-file-input" class="hidden" multiple accept=".wav,.mp3"
575
+ aria-label="Upload reference audio file">
576
+ <!-- Modified Load Button -->
577
+ <button type="button" id="clone-load-button" class="btn-secondary hidden"
578
+ title="Upload new reference files">
579
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor"
580
+ class="w-5 h-5 mr-1">
581
+ <path
582
+ d="M9.25 13.25a.75.75 0 0 0 1.5 0V4.636l2.955 3.129a.75.75 0 0 0 1.09-1.03l-4.25-4.5a.75.75 0 0 0-1.09 0l-4.25 4.5a.75.75 0 1 0 1.09 1.03L9.25 4.636v8.614Z" />
583
+ <path
584
+ d="M3.5 12.75a.75.75 0 0 0-1.5 0v2.5A2.75 2.75 0 0 0 4.75 18h10.5A2.75 2.75 0 0 0 18 15.25v-2.5a.75.75 0 0 0-1.5 0v2.5c0 .69-.56 1.25-1.25 1.25H4.75c-.69 0-1.25-.56-1.25-1.25v-2.5Z" />
585
+ </svg>
586
+ Load
587
+ </button>
588
+ </div>
589
+ </div>
590
+
591
+
592
+ <!-- Generation Parameters -->
593
+ <div class="mb-6">
594
+ <details class="group">
595
+ <summary class="list-none flex cursor-pointer items-center">
596
+ <span class="text-sm font-medium label-base">Generation Parameters</span>
597
+ <span class="ml-2 text-purple-500 dark:text-purple-300">
598
+ <svg class="group-open:rotate-180 h-5 w-5 transition-transform"
599
+ viewBox="0 0 20 20" fill="currentColor">
600
+ <path fill-rule="evenodd"
601
+ d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z"
602
+ clip-rule="evenodd" />
603
+ </svg>
604
+ </span>
605
+ </summary>
606
+ <div class="mt-4 grid grid-cols-1 md:grid-cols-2 gap-x-6 gap-y-4">
607
+ <!-- Use default_gen_params passed from server for initial values -->
608
+ {% set current_gen_params = submitted_gen_params if submitted_gen_params else
609
+ default_gen_params %}
610
+ <!-- Speed Factor -->
611
+ <div>
612
+ <label for="speed_factor" class="label-base">Speed Factor (<span
613
+ id="speed_factor_value">{{ current_gen_params.speed_factor
614
+ }}</span>)</label>
615
+ <input type="range" id="speed_factor" name="speed_factor" min="0.5"
616
+ max="2.0" step="0.01" value="{{ current_gen_params.speed_factor }}"
617
+ class="slider-base">
618
+ </div>
619
+ <!-- CFG Scale -->
620
+ <div>
621
+ <label for="cfg_scale" class="label-base">CFG Scale (<span
622
+ id="cfg_scale_value">{{ current_gen_params.cfg_scale
623
+ }}</span>)</label>
624
+ <input type="range" id="cfg_scale" name="cfg_scale" min="1.0" max="5.0"
625
+ step="0.1" value="{{ current_gen_params.cfg_scale }}"
626
+ class="slider-base">
627
+ </div>
628
+ <!-- Temperature -->
629
+ <div>
630
+ <label for="temperature" class="label-base">Temperature (<span
631
+ id="temperature_value">{{ current_gen_params.temperature
632
+ }}</span>)</label>
633
+ <input type="range" id="temperature" name="temperature" min="1.0" max="1.5"
634
+ step="0.05" value="{{ current_gen_params.temperature }}"
635
+ class="slider-base">
636
+ </div>
637
+ <!-- Top P -->
638
+ <div>
639
+ <label for="top_p" class="label-base">Top P (<span id="top_p_value">{{
640
+ current_gen_params.top_p }}</span>)</label>
641
+ <input type="range" id="top_p" name="top_p" min="0.8" max="1.0" step="0.01"
642
+ value="{{ current_gen_params.top_p }}" class="slider-base">
643
+ </div>
644
+ <!-- CFG Filter Top K -->
645
+ <div>
646
+ <label for="cfg_filter_top_k" class="label-base">CFG Filter Top K (<span
647
+ id="cfg_filter_top_k_value">{{ current_gen_params.cfg_filter_top_k
648
+ }}</span>)</label>
649
+ <input type="range" id="cfg_filter_top_k" name="cfg_filter_top_k" min="15"
650
+ max="50" step="1" value="{{ current_gen_params.cfg_filter_top_k }}"
651
+ class="slider-base">
652
+ </div>
653
+ <!-- Save Gen Defaults Button -->
654
+ <div class="col-span-1 md:col-span-2 mt-4 flex items-center gap-4">
655
+ <button id="save-gen-defaults-btn" type="button" class="btn-secondary">
656
+ Save Generation Defaults
657
+ </button>
658
+ <span id="gen-defaults-status" class="text-xs hidden"></span>
659
+ </div>
660
+ </div>
661
+ </details>
662
+ </div>
663
+
664
+ <!-- Server Configuration (Collapsible) -->
665
+ <div class="mb-6">
666
+ <details class="group">
667
+ <summary class="list-none flex cursor-pointer items-center">
668
+ <span class="text-sm font-medium label-base">Server Configuration</span>
669
+ <span class="ml-2 text-purple-500 dark:text-purple-300">
670
+ <svg class="group-open:rotate-180 h-5 w-5 transition-transform"
671
+ viewBox="0 0 20 20" fill="currentColor">
672
+ <path fill-rule="evenodd"
673
+ d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z"
674
+ clip-rule="evenodd" />
675
+ </svg>
676
+ </span>
677
+ </summary>
678
+ <div id="server-config-form"
679
+ class="mt-4 border-t border-gray-200 dark:border-dark-700 pt-4">
680
+ <p class="text-xs text-purple-500 dark:text-purple-300 mb-3">
681
+ These settings are saved to the <code class="code-inline">.env</code> file.
682
+ Restart the server to apply changes.
683
+ </p>
684
+ <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
685
+ <!-- Dia Model Repo ID -->
686
+ <div>
687
+ <label for="config_model_repo" class="label-base text-xs">Model Repo
688
+ ID</label>
689
+ <input type="text" id="config_model_repo" name="DIA_MODEL_REPO_ID"
690
+ value="{{ config.DIA_MODEL_REPO_ID }}"
691
+ placeholder="ttj/dia-1.6b-safetensors" class="input-base text-sm">
692
+ </div>
693
+ <!-- Model Config Filename -->
694
+ <div>
695
+ <label for="config_model_config" class="label-base text-xs">Model Config
696
+ Filename</label>
697
+ <input type="text" id="config_model_config"
698
+ name="DIA_MODEL_CONFIG_FILENAME"
699
+ value="{{ config.DIA_MODEL_CONFIG_FILENAME }}"
700
+ placeholder="config.json" class="input-base text-sm">
701
+ </div>
702
+ <!-- Model Weights Filename -->
703
+ <div>
704
+ <label for="config_model_weights" class="label-base text-xs">Model
705
+ Weights Filename</label>
706
+ <input type="text" id="config_model_weights"
707
+ name="DIA_MODEL_WEIGHTS_FILENAME"
708
+ value="{{ config.DIA_MODEL_WEIGHTS_FILENAME }}"
709
+ placeholder="dia-v0_1_bf16.safetensors" class="input-base text-sm">
710
+ </div>
711
+ <!-- Model Cache Path -->
712
+ <div>
713
+ <label for="config_model_cache" class="label-base text-xs">Model Cache
714
+ Path</label>
715
+ <input type="text" id="config_model_cache" name="DIA_MODEL_CACHE_PATH"
716
+ value="{{ config.DIA_MODEL_CACHE_PATH }}"
717
+ placeholder="./model_cache" class="input-base text-sm">
718
+ </div>
719
+ <!-- Reference Audio Path -->
720
+ <div>
721
+ <label for="config_ref_audio" class="label-base text-xs">Reference Audio
722
+ Path</label>
723
+ <input type="text" id="config_ref_audio" name="REFERENCE_AUDIO_PATH"
724
+ value="{{ config.REFERENCE_AUDIO_PATH }}"
725
+ placeholder="./reference_audio" class="input-base text-sm">
726
+ </div>
727
+ <!-- Output Path -->
728
+ <div>
729
+ <label for="config_output_path" class="label-base text-xs">Output
730
+ Path</label>
731
+ <input type="text" id="config_output_path" name="OUTPUT_PATH"
732
+ value="{{ config.OUTPUT_PATH }}" placeholder="./outputs"
733
+ class="input-base text-sm">
734
+ </div>
735
+ <!-- Server Host -->
736
+ <div>
737
+ <label for="config_host" class="label-base text-xs">Server Host</label>
738
+ <input type="text" id="config_host" name="HOST"
739
+ value="{{ config.HOST }}" placeholder="0.0.0.0"
740
+ class="input-base text-sm">
741
+ </div>
742
+ <!-- Server Port -->
743
+ <div>
744
+ <label for="config_port" class="label-base text-xs">Server Port</label>
745
+ <input type="number" id="config_port" name="PORT"
746
+ value="{{ config.PORT }}" min="1024" max="65535" step="1"
747
+ class="input-base text-sm">
748
+ </div>
749
+ <!-- Save/Restart Buttons -->
750
+ <div
751
+ class="col-span-1 md:col-span-2 mt-4 flex flex-col md:flex-row gap-4 items-center">
752
+ <button id="save-config-btn" type="button"
753
+ class="btn-purple w-full md:w-auto">
754
+ Save Server Configuration
755
+ </button>
756
+ <button id="restart-server-btn" type="button"
757
+ class="btn-danger w-full md:w-auto hidden">
758
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none"
759
+ viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor"
760
+ class="w-5 h-5 mr-1 inline-block">
761
+ <path stroke-linecap="round" stroke-linejoin="round"
762
+ d="M16.023 9.348h4.992v-.001M2.985 19.644v-4.992m0 0h4.992m-4.993 0 3.181 3.183a8.25 8.25 0 0 0 13.803-3.7M4.031 9.865a8.25 8.25 0 0 1 13.803-3.7l3.181 3.182m0-4.991v4.99" />
763
+ </svg>
764
+ Restart Server
765
+ </button>
766
+ <span id="config-status" class="text-xs ml-2 hidden"></span>
767
+ </div>
768
+ </div>
769
+ </div>
770
+ </details>
771
+ </div>
772
+
773
+ </div> <!-- End p-6 -->
774
+
775
+ <!-- Form Actions -->
776
+ <div class="card-footer">
777
+ <div class="text-sm text-gray-600 dark:text-purple-300">
778
+ <p>Use <code class="code-inline">[S1]</code>/<code class="code-inline">[S2]</code> for
779
+ dialogue. Add <code class="code-inline">(laughs)</code> etc.</p>
780
+ </div>
781
+ <button type="submit" id="generate-btn" class="btn-primary">
782
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24"
783
+ stroke-width="1.5" stroke="currentColor" class="w-5 h-5 mr-1 inline-block">
784
+ <path stroke-linecap="round" stroke-linejoin="round"
785
+ d="M19.114 5.636a9 9 0 0 1 0 12.728M16.463 8.288a5.25 5.25 0 0 1 0 7.424M6.75 8.25l4.72-4.72a.75.75 0 0 1 1.28.53v15.88a.75.75 0 0 1-1.28.53l-4.72-4.72H4.51c-.88 0-1.704-.507-1.938-1.354A9.009 9.009 0 0 1 2.25 12c0-.83.112-1.633.322-2.396C2.806 8.756 3.63 8.25 4.51 8.25H6.75Z" />
786
+ </svg>
787
+ Generate Speech
788
+ </button>
789
+ </div>
790
+ </form>
791
+ </div> <!-- End TTS Form Card -->
792
+
793
+ <!-- Audio player container - Populated by JavaScript if generation is successful -->
794
+ <div id="audio-player-container" class="mt-8">
795
+ {% if output_file_url %}
796
+ <!-- Template for initial load if result is passed from server -->
797
+ <!-- Add data attribute to signal JS that result is present -->
798
+ <div id="output-file-url-data" data-initial-audio-url="{{ output_file_url }}" class="hidden"></div>
799
+ <div class="audio-player-card">
800
+ <div class="p-6">
801
+ <h2 class="card-header">Generated Audio</h2>
802
+ <div class="mb-4">
803
+ <div id="waveform" class="waveform-container"></div>
804
+ </div>
805
+ <div class="audio-player-controls">
806
+ <div class="audio-player-buttons">
807
+ <button id="play-btn" class="btn-primary" disabled>
808
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor"
809
+ class="w-5 h-5 mr-1">
810
+ <path fill-rule="evenodd"
811
+ d="M2 10a8 8 0 1 1 16 0 8 8 0 0 1-16 0Zm6.39-2.908a.75.75 0 0 1 .766.027l3.5 2.25a.75.75 0 0 1 0 1.262l-3.5 2.25A.75.75 0 0 1 8 12.25v-4.5a.75.75 0 0 1 .39-.658Z"
812
+ clip-rule="evenodd" />
813
+ </svg>
814
+ Play
815
+ </button>
816
+ <a id="download-link" href="{{ output_file_url }}"
817
+ download="{{ output_file_url.split('/')[-1] }}" class="btn-secondary">
818
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor"
819
+ class="w-5 h-5 mr-1">
820
+ <path
821
+ d="M10.75 2.75a.75.75 0 0 0-1.5 0v8.614L6.295 8.235a.75.75 0 1 0-1.09 1.03l4.25 4.5a.75.75 0 0 0 1.09 0l4.25-4.5a.75.75 0 0 0-1.09-1.03l-2.955 3.129V2.75Z" />
822
+ <path
823
+ d="M3.5 12.75a.75.75 0 0 0-1.5 0v2.5A2.75 2.75 0 0 0 4.75 18h10.5A2.75 2.75 0 0 0 18 15.25v-2.5a.75.75 0 0 0-1.5 0v2.5c0 .69-.56 1.25-1.25 1.25H4.75c-.69 0-1.25-.56-1.25-1.25v-2.5Z" />
824
+ </svg>
825
+ Download WAV
826
+ </a>
827
+ </div>
828
+ <div class="audio-player-info">
829
+ Mode: <span class="font-medium">{{ submitted_voice_mode }}</span>
830
+ {% if submitted_voice_mode == 'clone' and submitted_clone_file %}
831
+ (<span class="font-medium">{{ submitted_clone_file }}</span>)
832
+ {% endif %}
833
+ • Gen Time: <span class="font-medium">{{ generation_time }}s</span>
834
+ • Duration: <span id="audio-duration" class="font-medium">--:--</span>
835
+ </div>
836
+ </div>
837
+ </div>
838
+ </div>
839
+ {% endif %}
840
+ </div>
841
+
842
+ <!-- Tips Section -->
843
+ <div class="mt-8">
844
+ <h2 class="card-header mb-4">Tips & Tricks for Dia</h2>
845
+ <div class="card-base">
846
+ <div class="p-6">
847
+ <ul class="list-disc pl-5 text-sm text-gray-700 dark:text-purple-300 space-y-2">
848
+ <li>For <strong>Dialogue</strong> mode, clearly mark speaker turns using <code
849
+ class="code-inline">[S1]</code> and <code class="code-inline">[S2]</code>.</li>
850
+ <li>Add non-verbal sounds like <code class="code-inline">(laughs)</code>, <code
851
+ class="code-inline">(sighs)</code>, <code
852
+ class="code-inline">(clears throat)</code> within the text where desired.</li>
853
+ <li>For <strong>Voice Clone</strong> mode, upload a clean reference audio file (<code
854
+ class="code-inline">.wav</code>/<code class="code-inline">.mp3</code>) using the
855
+ "Load" button. <strong class="dark:text-yellow-300 text-yellow-600">Crucially,
856
+ include the exact transcript of the reference audio at the beginning of your
857
+ text input</strong> (e.g., <code
858
+ class="code-inline">[S1] Reference transcript. [S1] Target text...</code>).</li>
859
+ <li>Experiment with <strong>CFG Scale</strong> (higher = more adherence to text, potentially less
860
+ natural) and <strong>Temperature</strong> (higher = more random/varied).</li>
861
+ <li>The <strong>Speed Factor</strong> adjusts playback speed (values below 1.0 slow playback; 1.0 keeps the original speed).</li>
862
+ <li>Use the <code class="code-inline">/v1/audio/speech</code> endpoint for OpenAI
863
+ compatibility. Use the <code class="code-inline">voice</code> parameter to specify
864
+ mode ('S1', 'S2', 'dialogue', 'reference_file.wav').</li>
865
+ </ul>
866
+ </div>
867
+ </div>
868
+ </div>
869
+ </div>
870
+ </main>
871
+
872
+ <footer class="nav-base py-6 mt-12">
873
+ <div class="mx-auto max-w-7xl px-4 sm:px-6 lg:px-8">
874
+ <div class="flex justify-center">
875
+ <a href="https://github.com/devnen/Dia-TTS-Server"
876
+ class="flex items-center gap-2 text-gray-600 dark:text-purple-300 text-sm hover:text-sky-600 dark:hover:text-primary-400 transition-colors">
877
+ <!-- GitHub icon -->
878
+ <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="currentColor"
879
+ viewBox="0 0 16 16" class="flex-shrink-0">
880
+ <path
881
+ d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z" />
882
+ </svg>
883
+ <span>Dia TTS Server | Powered by FastAPI</span>
884
+ </a>
885
+ </div>
886
+ </div>
887
+ </footer>
888
+ </div>
889
+
890
+ <!-- Loading spinner template (hidden by default) -->
891
+ <div id="loading-overlay" class="loading-overlay-base hidden">
892
+ <div class="loading-box-base">
893
+ <svg class="loading-spinner" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
894
+ <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
895
+ <path class="opacity-75" fill="currentColor"
896
+ d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z">
897
+ </path>
898
+ </svg>
899
+ <p id="loading-message" class="loading-text">Generating audio...</p>
900
+ <p id="loading-status" class="loading-status">Please wait.</p>
901
+ <button id="loading-cancel-btn" type="button" class="btn-secondary mt-4">Cancel</button>
902
+ </div>
903
+ </div>
904
+
905
+ <!-- Pass data from server to JavaScript -->
906
+ <script>
907
+ // Make presets data available to script.js
908
+ // Ensure this is correctly populated by your Jinja2 template context
909
+ window.appPresets = {{ presets | tojson | safe }};
910
+ </script>
911
+
912
+ <!-- Link External JavaScript (Ensure it's loaded AFTER the DOM) -->
913
+ <script src="/ui/script.js" defer></script>
914
+ </body>
915
+
916
+ </html>
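
The tips above reference the OpenAI-compatible /v1/audio/speech endpoint and its voice parameter. A minimal client sketch follows; the endpoint path and voice values come from the tips, while the remaining payload fields are assumptions modeled on the OpenAI audio API schema -- see documentation.md and server.py for the authoritative contract.

# Hypothetical client for the /v1/audio/speech endpoint mentioned in the tips.
# Field names other than `voice` are assumptions based on the OpenAI schema.
import requests

resp = requests.post(
    "http://localhost:8003/v1/audio/speech",  # host/port from the default .env
    json={
        "input": "[S1] Hello there! [S2] Hi, how are you? (laughs)",
        "voice": "dialogue",       # 'S1', 'S2', 'dialogue', or 'reference_file.wav'
        "response_format": "wav",  # assumed; utils.encode_audio supports 'wav'/'opus'
        "speed": 0.9,              # assumed to map to the speed_factor parameter
    },
    timeout=300,
)
resp.raise_for_status()
with open("speech.wav", "wb") as f:
    f.write(resp.content)
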
ui/presets.yaml ADDED
@@ -0,0 +1,57 @@
1
+ # ui/presets.yaml
2
+ # Predefined examples for the Dia TTS UI
3
+
4
+ - name: "Standard Dialogue"
5
+ voice_mode: "dialogue"
6
+ text: |
7
+ [S1] Hey, how's it going?
8
+ [S2] Pretty good! Just grabbing some coffee. You?
9
+ [S1] Same here. Need the fuel! (laughs)
10
+ params:
11
+ cfg_scale: 3.0
12
+ temperature: 1.3
13
+ top_p: 0.95
14
+ cfg_filter_top_k: 35
15
+ # speed_factor uses the saved default
16
+
17
+ - name: "Expressive Narration"
18
+ voice_mode: "dialogue" # Use dialogue mode with single speaker tag
19
+ text: |
20
+ [S1] The old house stood on a windswept hill, its windows like empty eyes staring out at the stormy sea. (sighs) It felt... lonely.
21
+ params:
22
+ cfg_scale: 3.0
23
+ temperature: 1.2 # Slightly lower temp for clarity
24
+ top_p: 0.95
25
+ cfg_filter_top_k: 35
26
+
27
+ - name: "Quick Announcement"
28
+ voice_mode: "dialogue" # Use dialogue mode with single speaker tag
29
+ text: |
30
+ [S1] Attention shoppers! The store will be closing in 15 minutes. Please bring your final purchases to the checkout.
31
+ params:
32
+ cfg_scale: 2.8 # Slightly lower CFG for potentially more natural tone
33
+ temperature: 1.3
34
+ top_p: 0.95
35
+ cfg_filter_top_k: 35
36
+
37
+ - name: "Funny Exchange"
38
+ voice_mode: "dialogue"
39
+ text: |
40
+ [S1] Did you remember to buy the alien repellent?
41
+ [S2] The what now? (laughs) I thought you were joking!
42
+ [S1] Joking? They're landing tonight! (clears throat) Probably.
43
+ params:
44
+ cfg_scale: 3.2 # Slightly higher CFG
45
+ temperature: 1.35 # Slightly higher temp
46
+ top_p: 0.95
47
+ cfg_filter_top_k: 35
48
+
49
+ - name: "Simple Sentence"
50
+ voice_mode: "dialogue" # Use dialogue mode with single speaker tag
51
+ text: |
52
+ [S1] This is a test of the text to speech system.
53
+ params:
54
+ cfg_scale: 3.0
55
+ temperature: 1.3
56
+ top_p: 0.95
57
+ cfg_filter_top_k: 35
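
For context, a minimal sketch of how this file could be parsed into the `presets` list that the template serializes into window.appPresets. The real loader lives in server.py (not shown in this section), so the function below is an illustrative assumption, not the actual implementation.

# Illustrative loader for ui/presets.yaml (the real one lives in server.py).
import yaml  # PyYAML; listed in requirements.txt

def load_presets(path="ui/presets.yaml"):
    """Load UI presets; an empty list makes the UI show 'No presets loaded.'"""
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f) or []
    except (FileNotFoundError, yaml.YAMLError):
        return []
    # Keep only well-formed entries; each preset needs at least a name and text.
    return [p for p in data if isinstance(p, dict) and "name" in p and "text" in p]
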
ui/script.js ADDED
@@ -0,0 +1,593 @@
1
+ // ui/script.js
2
+
3
+ document.addEventListener('DOMContentLoaded', function () {
4
+ // --- Global Flags ---
5
+ let isGenerating = false;
6
+ let isGenerationCancelled = false;
7
+ let wavesurfer = null; // Global wavesurfer instance
8
+
9
+ // --- Element Selectors ---
10
+ const ttsForm = document.getElementById('tts-form');
11
+ const textArea = document.getElementById('text');
12
+ const charCount = document.getElementById('char-count');
13
+ const voiceModeRadios = document.querySelectorAll('input[name="voice_mode"]');
14
+ const cloneOptionsDiv = document.getElementById('clone-options');
15
+ const cloneReferenceSelect = document.getElementById('clone_reference_select');
16
+ const cloneLoadButton = document.getElementById('clone-load-button'); // New ID
17
+ const cloneFileInput = document.getElementById('clone-file-input'); // New ID
18
+ const generateBtn = document.getElementById('generate-btn');
19
+ const loadingOverlay = document.getElementById('loading-overlay');
20
+ const loadingMessage = document.getElementById('loading-message');
21
+ const loadingStatus = document.getElementById('loading-status'); // New element for status
22
+ const loadingCancelBtn = document.getElementById('loading-cancel-btn'); // New ID
23
+ const notificationArea = document.getElementById('notification-area');
24
+ const audioPlayerContainer = document.getElementById('audio-player-container');
25
+ const configSaveBtn = document.getElementById('save-config-btn');
26
+ const configRestartBtn = document.getElementById('restart-server-btn');
27
+ const configStatus = document.getElementById('config-status');
28
+ const genDefaultsSaveBtn = document.getElementById('save-gen-defaults-btn'); // New ID
29
+ const genDefaultsStatus = document.getElementById('gen-defaults-status'); // New ID
30
+ const themeToggleButton = document.getElementById('theme-toggle-btn'); // New ID
31
+ const themeIconLight = document.getElementById('theme-icon-light'); // New ID
32
+ const themeIconDark = document.getElementById('theme-icon-dark'); // New ID
33
+ const presetsContainer = document.getElementById('presets-container'); // New ID
34
+
35
+ // --- Initial Setup ---
36
+
37
+ // Character counter
38
+ function updateCharCount() {
39
+ if (textArea && charCount) {
40
+ charCount.textContent = textArea.value.length;
41
+ }
42
+ }
43
+ if (textArea) {
44
+ textArea.addEventListener('input', updateCharCount);
45
+ updateCharCount(); // Initial count
46
+ }
47
+
48
+ // Toggle Clone Options Visibility & Required Attribute
49
+ function toggleCloneOptions() {
50
+ const selectedMode = document.querySelector('input[name="voice_mode"]:checked')?.value;
51
+ if (cloneOptionsDiv && cloneReferenceSelect && cloneLoadButton) {
52
+ if (selectedMode === 'clone') {
53
+ cloneOptionsDiv.classList.remove('hidden');
54
+ cloneReferenceSelect.required = true;
55
+ cloneLoadButton.classList.remove('hidden');
56
+ } else {
57
+ cloneOptionsDiv.classList.add('hidden');
58
+ cloneReferenceSelect.required = false;
59
+ // cloneReferenceSelect.value = 'none'; // Don't reset if user might switch back
60
+ cloneLoadButton.classList.add('hidden');
61
+ }
62
+ }
63
+ }
64
+ voiceModeRadios.forEach(radio => radio.addEventListener('change', toggleCloneOptions));
65
+ toggleCloneOptions(); // Initial check
66
+
67
+ // Update slider value displays dynamically
68
+ const sliders = [
69
+ { id: 'speed_factor', valueId: 'speed_factor_value' },
70
+ { id: 'cfg_scale', valueId: 'cfg_scale_value' },
71
+ { id: 'temperature', valueId: 'temperature_value' },
72
+ { id: 'top_p', valueId: 'top_p_value' },
73
+ { id: 'cfg_filter_top_k', valueId: 'cfg_filter_top_k_value' },
74
+ ];
75
+ sliders.forEach(sliderInfo => {
76
+ const slider = document.getElementById(sliderInfo.id);
77
+ const valueDisplay = document.getElementById(sliderInfo.valueId);
78
+ if (slider && valueDisplay) {
79
+ // Set initial display from slider's current value (set by template)
80
+ valueDisplay.textContent = slider.value;
81
+ // Add event listener to update display on change
82
+ slider.addEventListener('input', () => valueDisplay.textContent = slider.value);
83
+ }
84
+ });
85
+
86
+ // --- Notifications ---
87
+ function showNotification(message, type = 'success', duration = 5000) {
88
+ if (!notificationArea) return;
89
+ // notificationArea.innerHTML = ''; // Clear previous? Or allow multiple? Let's allow multiple for now.
90
+ const colors = {
91
+ success: 'notification-success',
92
+ error: 'notification-error',
93
+ warning: 'notification-warning',
94
+ info: 'notification-info' // Add info style if needed
95
+ };
96
+ const icons = { // SVG icons or classes
97
+ success: '<svg class="h-5 w-5 text-green-500 mr-2 flex-shrink-0" viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" /></svg>',
98
+ error: '<svg class="h-5 w-5 text-red-500 mr-2 flex-shrink-0" viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clip-rule="evenodd" /></svg>',
99
+ warning: '<svg class="h-5 w-5 text-yellow-500 mr-2 flex-shrink-0" viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M8.485 2.495c.673-1.167 2.357-1.167 3.03 0l6.28 10.875c.673 1.167-.17 2.625-1.516 2.625H3.72c-1.347 0-2.189-1.458-1.515-2.625L8.485 2.495zM10 5a.75.75 0 01.75.75v3.5a.75.75 0 01-1.5 0v-3.5A.75.75 0 0110 5zm0 9a1 1 0 100-2 1 1 0 000 2z" clip-rule="evenodd" /></svg>',
100
+ info: '<svg class="h-5 w-5 text-sky-500 mr-2 flex-shrink-0" viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a.75.75 0 000 1.5h.253a.25.25 0 01.244.304l-.459 2.066A1.75 1.75 0 0010.747 15H11a.75.75 0 000-1.5h-.253a.25.25 0 01-.244-.304l.459-2.066A1.75 1.75 0 009.253 9H9z" clip-rule="evenodd" /></svg>'
101
+ };
102
+
103
+ const notificationDiv = document.createElement('div');
104
+ notificationDiv.className = colors[type] || colors['info']; // Default to info style
105
+ notificationDiv.innerHTML = `${icons[type] || icons['info']} <span class="block sm:inline">${message}</span>`;
106
+ notificationArea.appendChild(notificationDiv);
107
+
108
+ // Auto-hide after specified duration
109
+ if (duration > 0) {
110
+ setTimeout(() => {
111
+ notificationDiv.style.transition = 'opacity 0.5s ease-out';
112
+ notificationDiv.style.opacity = '0';
113
+ setTimeout(() => notificationDiv.remove(), 500);
114
+ }, duration);
115
+ }
116
+ return notificationDiv; // Return the element if manual removal is needed
117
+ }
118
+
119
+ // --- Presets ---
120
+ function applyPreset(presetData) {
121
+ console.log("Applying preset:", presetData);
122
+ if (!presetData) return;
123
+
124
+ // Update text area
125
+ if (textArea && presetData.text !== undefined) {
126
+ textArea.value = presetData.text;
127
+ updateCharCount(); // Update counter
128
+ }
129
+
130
+ // Update voice mode
131
+ if (presetData.voice_mode) {
132
+ const radio = document.querySelector(`input[name="voice_mode"][value="${presetData.voice_mode}"]`);
133
+ if (radio) {
134
+ radio.checked = true;
135
+ toggleCloneOptions(); // Update UI based on new mode
136
+ }
137
+ }
138
+
139
+ // Update generation parameters
140
+ if (presetData.params) {
141
+ for (const [key, value] of Object.entries(presetData.params)) {
142
+ const slider = document.getElementById(key); // Assumes slider ID matches param key
143
+ const valueDisplay = document.getElementById(`${key}_value`);
144
+ if (slider) {
145
+ slider.value = value;
146
+ if (valueDisplay) {
147
+ valueDisplay.textContent = value; // Update display
148
+ }
149
+ } else {
150
+ console.warn(`Slider element not found for preset parameter: ${key}`);
151
+ }
152
+ }
153
+ }
154
+ showNotification(`Preset "${presetData.name}" loaded.`, 'info', 3000);
155
+ }
156
+
157
+ // Add event listeners to preset buttons (assuming they exist)
158
+ // Presets data should be available globally, e.g., from template `window.appPresets = {{ presets | tojson }};`
159
+ if (window.appPresets && presetsContainer) {
160
+ window.appPresets.forEach((preset, index) => {
161
+ const button = document.getElementById(`preset-btn-${index}`);
162
+ if (button) {
163
+ button.addEventListener('click', () => applyPreset(preset));
164
+ }
165
+ });
166
+ } else if (presetsContainer) {
167
+ console.warn("Presets data (window.appPresets) not found, preset buttons will not work.");
168
+ }
169
+
170
+
171
+ // --- Audio Player ---
172
+ function initializeWaveSurfer(audioUrl) {
173
+ if (wavesurfer) {
174
+ wavesurfer.destroy();
175
+ }
176
+ const waveformDiv = document.getElementById('waveform');
177
+ const playBtn = document.getElementById('play-btn');
178
+ const durationSpan = document.getElementById('audio-duration');
179
+
180
+ if (!waveformDiv || !playBtn || !durationSpan) {
181
+ console.error("Audio player elements not found in the container.");
182
+ // Clear the container if elements are missing after generation
183
+ if (audioPlayerContainer) audioPlayerContainer.innerHTML = '<p class="text-red-500 dark:text-red-400">Error displaying audio player.</p>';
184
+ return;
185
+ }
186
+
187
+ // Ensure button text doesn't wrap
188
+ playBtn.classList.add('whitespace-nowrap', 'flex-shrink-0');
189
+ const downloadLink = document.getElementById('download-link');
190
+ if (downloadLink) downloadLink.classList.add('whitespace-nowrap', 'flex-shrink-0');
191
+
192
+
193
+ wavesurfer = WaveSurfer.create({
194
+ container: waveformDiv,
195
+ waveColor: document.documentElement.classList.contains('dark') ? '#38bdf8' : '#0ea5e9', // primary-400(dark) / primary-500(light)
196
+ progressColor: document.documentElement.classList.contains('dark') ? '#0284c7' : '#0369a1', // primary-600(dark) / primary-700(light)
197
+ cursorColor: document.documentElement.classList.contains('dark') ? '#a855f7' : '#9333ea', // purple-500(dark) / purple-600(light)
198
+ barWidth: 3,
199
+ barRadius: 3,
200
+ cursorWidth: 1,
201
+ height: 80,
202
+ barGap: 2,
203
+ responsive: true,
204
+ url: audioUrl,
205
+ mediaControls: false, // Use custom controls
206
+ normalize: true,
207
+ });
208
+
209
+ wavesurfer.on('ready', () => {
210
+ const duration = wavesurfer.getDuration();
211
+ const minutes = Math.floor(duration / 60);
212
+ const seconds = Math.floor(duration % 60);
213
+ durationSpan.textContent = `${minutes}:${seconds < 10 ? '0' : ''}${seconds}`;
214
+ playBtn.disabled = false;
215
+ playBtn.innerHTML = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5 mr-1"><path fill-rule="evenodd" d="M2 10a8 8 0 1 1 16 0 8 8 0 0 1-16 0Zm6.39-2.908a.75.75 0 0 1 .766.027l3.5 2.25a.75.75 0 0 1 0 1.262l-3.5 2.25A.75.75 0 0 1 8 12.25v-4.5a.75.75 0 0 1 .39-.658Z" clip-rule="evenodd" /></svg> Play`;
216
+ });
217
+
218
+ wavesurfer.on('play', () => {
219
+ playBtn.innerHTML = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5 mr-1"><path fill-rule="evenodd" d="M2 10a8 8 0 1 1 16 0 8 8 0 0 1-16 0Zm5-2.25A.75.75 0 0 1 7.75 7h4.5a.75.75 0 0 1 .75.75v4.5a.75.75 0 0 1-.75.75h-4.5a.75.75 0 0 1-.75-.75v-4.5Z" clip-rule="evenodd" /></svg> Pause`;
220
+ });
221
+
222
+ wavesurfer.on('pause', () => {
223
+ playBtn.innerHTML = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5 mr-1"><path fill-rule="evenodd" d="M2 10a8 8 0 1 1 16 0 8 8 0 0 1-16 0Zm6.39-2.908a.75.75 0 0 1 .766.027l3.5 2.25a.75.75 0 0 1 0 1.262l-3.5 2.25A.75.75 0 0 1 8 12.25v-4.5a.75.75 0 0 1 .39-.658Z" clip-rule="evenodd" /></svg> Play`;
224
+ });
225
+ wavesurfer.on('finish', () => {
226
+ playBtn.innerHTML = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5 mr-1"><path fill-rule="evenodd" d="M2 10a8 8 0 1 1 16 0 8 8 0 0 1-16 0Zm6.39-2.908a.75.75 0 0 1 .766.027l3.5 2.25a.75.75 0 0 1 0 1.262l-3.5 2.25A.75.75 0 0 1 8 12.25v-4.5a.75.75 0 0 1 .39-.658Z" clip-rule="evenodd" /></svg> Play`;
227
+ });
228
+
229
+ playBtn.onclick = () => {
230
+ wavesurfer.playPause();
231
+ };
232
+
233
+ // Scroll to the player after initialization
234
+ setTimeout(() => {
235
+ audioPlayerContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
236
+ }, 100); // Short delay to ensure rendering
237
+ }
238
+
239
+ // Initialization of a player for a result rendered on page load happens in
+ // the "Result Handling" section below, which reads the same
+ // data-initial-audio-url attribute and also honors the cancel flag.
247
+
248
+
249
+ // --- Form Submission & Cancellation ---
250
+ if (ttsForm) {
251
+ ttsForm.addEventListener('submit', function (event) {
252
+ // Client-side validation
253
+ const text = textArea.value.trim();
254
+ const mode = document.querySelector('input[name="voice_mode"]:checked')?.value;
255
+ const cloneFile = cloneReferenceSelect?.value;
256
+
257
+ if (!text) {
258
+ showNotification("Please enter some text.", 'error');
259
+ event.preventDefault(); return;
260
+ }
261
+ if (mode === 'clone' && (!cloneFile || cloneFile === 'none')) {
262
+ showNotification("Please select a reference file for clone mode.", 'error');
263
+ event.preventDefault(); return;
264
+ }
265
+
266
+ // Handle cancellation of previous request if Generate is clicked again
267
+ if (isGenerating) {
268
+ console.log("Generate clicked while previous generation in progress. Setting cancel flag.");
269
+ showNotification("Cancelling previous request...", 'warning', 2000);
270
+ isGenerationCancelled = true;
271
+ // We don't actually stop the backend here (Fake Cancel)
272
+ // but the result processing will ignore the previous result.
273
+ }
274
+
275
+ // Reset flags and show loading overlay for the new request
276
+ isGenerating = true;
277
+ isGenerationCancelled = false; // Reset cancel flag for the new request
278
+ if (loadingOverlay && generateBtn && loadingCancelBtn) {
279
+ loadingMessage.textContent = 'Generating audio...'; // Initial status
280
+ loadingStatus.textContent = 'Please wait.';
281
+ loadingOverlay.classList.remove('hidden');
282
+ generateBtn.disabled = true;
283
+ generateBtn.classList.add('opacity-50', 'cursor-not-allowed');
284
+ loadingCancelBtn.disabled = false; // Enable cancel button
285
+ }
286
+ // Allow default form submission to proceed
287
+ // The page will reload with results rendered by the template
288
+ });
289
+ }
290
+
291
+ // Handle Cancel button click
292
+ if (loadingCancelBtn) {
293
+ loadingCancelBtn.addEventListener('click', () => {
294
+ if (isGenerating) {
295
+ console.log("Cancel button clicked.");
296
+ isGenerationCancelled = true;
297
+ isGenerating = false; // Stop considering it "generating" from UI perspective
298
+ if (loadingOverlay && generateBtn) {
299
+ loadingOverlay.classList.add('hidden'); // Hide overlay
300
+ generateBtn.disabled = false; // Re-enable generate button
301
+ generateBtn.classList.remove('opacity-50', 'cursor-not-allowed');
302
+ }
303
+ showNotification("Generation cancelled by user.", 'info');
304
+ // Note: Backend request continues, but result will be ignored on page reload/update
305
+ }
306
+ });
307
+ }
308
+
309
+ // --- Result Handling (on page load after form submission) ---
310
+ // This logic runs every time the page loads. We check if specific elements
311
+ // indicating a successful generation are present.
312
+ const outputUrlElement = document.getElementById('output-file-url-data'); // Rendered by the template when a result is present
313
+ if (outputUrlElement && outputUrlElement.dataset.initialAudioUrl) { // key matches data-initial-audio-url in index.html
314
+ const outputUrl = outputUrlElement.dataset.initialAudioUrl;
315
+ console.log("Page loaded with generation result:", outputUrl);
316
+
317
+ if (isGenerationCancelled) {
318
+ console.log("Generation was cancelled, ignoring result.");
319
+ showNotification("Previous generation was cancelled.", "warning");
320
+ // Reset flag after checking
321
+ isGenerationCancelled = false;
322
+ } else {
323
+ console.log("Processing successful generation result.");
324
+ // The audio player structure should be rendered by the template.
325
+ // We just need to initialize wavesurfer for it.
326
+ initializeWaveSurfer(outputUrl);
327
+ }
328
+ }
329
+ // Always reset generating flag on page load, as any active generation is now finished or irrelevant
330
+ isGenerating = false;
331
+ if (generateBtn) { // Re-enable button if page reloads for any reason
332
+ generateBtn.disabled = false;
333
+ generateBtn.classList.remove('opacity-50', 'cursor-not-allowed');
334
+ }
335
+
336
+
337
+ // --- Configuration Management ---
338
+ async function updateConfigStatus(button, statusElement, message, success = true, duration = 5000) {
339
+ const successClass = 'text-green-500 dark:text-green-400';
340
+ const errorClass = 'text-red-500 dark:text-red-400';
341
+ const savingClass = 'text-yellow-500 dark:text-yellow-400';
342
+
343
+ statusElement.textContent = message;
344
+ statusElement.className = `text-xs ml-2 ${success ? successClass : (message.startsWith('Saving') || message.startsWith('Restarting') ? savingClass : errorClass)}`;
345
+ statusElement.classList.remove('hidden');
346
+ if (button) button.disabled = true; // Disable button while processing
347
+
348
+ // Clear status after duration, re-enable button
349
+ if (duration > 0) {
350
+ setTimeout(() => {
351
+ statusElement.classList.add('hidden');
352
+ if (button) button.disabled = false;
353
+ }, duration);
354
+ }
355
+ }
356
+
357
+ // Save Server Configuration
358
+ if (configSaveBtn) {
359
+ configSaveBtn.addEventListener('click', async () => {
360
+ const configData = {};
361
+ document.querySelectorAll('#server-config-form input[name]').forEach(input => { // Assume inputs are within a form/div
362
+ configData[input.name] = input.value;
363
+ });
364
+
365
+ updateConfigStatus(configSaveBtn, configStatus, 'Saving...', true, 0); // Indefinite until success/error
366
+
367
+ try {
368
+ const response = await fetch('/save_config', {
369
+ method: 'POST',
370
+ headers: { 'Content-Type': 'application/json' },
371
+ body: JSON.stringify(configData)
372
+ });
373
+ const result = await response.json();
374
+ if (!response.ok) throw new Error(result.detail || 'Failed to save');
375
+
376
+ updateConfigStatus(configSaveBtn, configStatus, result.message, true);
377
+ if (configRestartBtn) configRestartBtn.classList.remove('hidden'); // Show restart button
378
+
379
+ } catch (error) {
380
+ console.error('Error saving server config:', error);
381
+ updateConfigStatus(configSaveBtn, configStatus, `Error: ${error.message}`, false);
382
+ }
383
+ });
384
+ }
385
+
386
+ // Restart Server
387
+ if (configRestartBtn) {
388
+ configRestartBtn.addEventListener('click', async () => {
389
+ configRestartBtn.disabled = true;
390
+ configRestartBtn.innerHTML = `
391
+ <svg class="animate-spin h-5 w-5 mr-1 inline-block" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
392
+ <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
393
+ <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
394
+ </svg>
395
+ Restarting...`;
396
+ updateConfigStatus(configRestartBtn, configStatus, 'Restarting...', true, 0); // Indefinite
397
+
398
+ try {
399
+ const response = await fetch('/restart_server', { method: 'POST' });
400
+ const result = await response.json();
401
+ if (!response.ok) throw new Error(result.detail || 'Failed to trigger restart');
402
+
403
+ updateConfigStatus(configRestartBtn, configStatus, result.message + " Page will attempt reload.", true, 15000); // Show longer
404
+ // Show main loading overlay during restart check
405
+ if (loadingOverlay) {
406
+ loadingMessage.textContent = 'Server restarting...';
407
+ loadingStatus.textContent = 'Waiting for server to respond...';
408
+ loadingCancelBtn.disabled = true; // Disable cancel during restart
409
+ loadingOverlay.classList.remove('hidden');
410
+ }
411
+
412
+ // Poll for server readiness
413
+ let attempts = 0;
414
+ const maxAttempts = 45; // Wait up to 45 seconds
415
+ function checkServerReady() {
416
+ attempts++;
417
+ console.log(`Checking server readiness (Attempt ${attempts}/${maxAttempts})...`);
418
+ loadingStatus.textContent = `Waiting for server... (${attempts}/${maxAttempts})`;
419
+ fetch('/health?cache=' + Date.now(), { cache: 'no-store', headers: { 'pragma': 'no-cache' } })
420
+ .then(res => {
421
+ if (res.ok) {
422
+ console.log("Server is ready. Reloading page.");
423
+ window.location.reload(true); // Force reload from server
424
+ } else if (attempts < maxAttempts) {
425
+ setTimeout(checkServerReady, 1000); // Check again in 1 second
426
+ } else {
427
+ // Throwing here would surface as an unhandled promise rejection
+ // (we are outside the try/catch above), so report the failure directly.
+ showNotification('Server did not become ready after restart. Please reload the page manually.', 'error');
+ if (loadingOverlay) loadingOverlay.classList.add('hidden');
428
+ }
429
+ })
430
+ .catch(() => {
431
+ if (attempts < maxAttempts) {
432
+ setTimeout(checkServerReady, 1000); // Check again on connection error
433
+ } else {
434
+ showNotification('Server did not respond after restart. Please reload the page manually.', 'error');
+ if (loadingOverlay) loadingOverlay.classList.add('hidden');
435
+ }
436
+ });
437
+ }
438
+ setTimeout(checkServerReady, 3000); // Start checking after 3 seconds
439
+
440
+ } catch (error) {
441
+ console.error('Error restarting server:', error);
442
+ updateConfigStatus(configRestartBtn, configStatus, `Restart Error: ${error.message}`, false);
443
+ configRestartBtn.disabled = false; // Re-enable button on error
444
+ configRestartBtn.innerHTML = `
445
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="w-5 h-5 mr-1 inline-block"><path stroke-linecap="round" stroke-linejoin="round" d="M16.023 9.348h4.992v-.001M2.985 19.644v-4.992m0 0h4.992m-4.993 0 3.181 3.183a8.25 8.25 0 0 0 13.803-3.7M4.031 9.865a8.25 8.25 0 0 1 13.803-3.7l3.181 3.182m0-4.991v4.99" /></svg>
446
+ Restart Server`;
447
+ if (loadingOverlay) loadingOverlay.classList.add('hidden');
448
+ }
449
+ });
450
+ }
451
+
452
+ // Save Generation Defaults
453
+ if (genDefaultsSaveBtn) {
454
+ genDefaultsSaveBtn.addEventListener('click', async () => {
455
+ const genParams = {};
456
+ sliders.forEach(s => {
457
+ const slider = document.getElementById(s.id);
458
+ if (slider) genParams[s.id] = slider.value;
459
+ });
460
+
461
+ updateConfigStatus(genDefaultsSaveBtn, genDefaultsStatus, 'Saving...', true, 0);
462
+
463
+ try {
464
+ const response = await fetch('/save_generation_defaults', {
465
+ method: 'POST',
466
+ headers: { 'Content-Type': 'application/json' },
467
+ body: JSON.stringify(genParams)
468
+ });
469
+ const result = await response.json();
470
+ if (!response.ok) throw new Error(result.detail || 'Failed to save');
471
+ updateConfigStatus(genDefaultsSaveBtn, genDefaultsStatus, result.message, true);
472
+
473
+ } catch (error) {
474
+ console.error('Error saving generation defaults:', error);
475
+ updateConfigStatus(genDefaultsSaveBtn, genDefaultsStatus, `Error: ${error.message}`, false);
476
+ }
477
+ });
478
+ }
479
+
480
+ // --- Reference Audio Upload ---
481
+ if (cloneLoadButton && cloneFileInput && cloneReferenceSelect) {
482
+ cloneLoadButton.addEventListener('click', () => {
483
+ cloneFileInput.click(); // Trigger hidden file input
484
+ });
485
+
486
+ cloneFileInput.addEventListener('change', async (event) => {
487
+ const files = event.target.files;
488
+ if (!files || files.length === 0) {
489
+ return; // No files selected
490
+ }
491
+
492
+ cloneLoadButton.disabled = true;
493
+ cloneLoadButton.textContent = 'Uploading...';
494
+ showNotification(`Uploading ${files.length} file(s)...`, 'info', 0); // Indefinite
495
+
496
+ const formData = new FormData();
497
+ for (const file of files) {
498
+ formData.append('files', file);
499
+ }
500
+
501
+ try {
502
+ const response = await fetch('/upload_reference', {
503
+ method: 'POST',
504
+ body: formData
505
+ // Content-Type is set automatically for FormData
506
+ });
507
+
508
+ const result = await response.json();
509
+
510
+ // Clear existing notifications before showing results
511
+ notificationArea.innerHTML = '';
512
+
513
+ if (!response.ok) {
514
+ throw new Error(result.message || `Upload failed with status ${response.status}`);
515
+ }
516
+
517
+ // Process results
518
+ if (result.errors && result.errors.length > 0) {
519
+ result.errors.forEach(err => showNotification(err, 'error'));
520
+ }
521
+ if (result.uploaded_files && result.uploaded_files.length > 0) {
522
+ showNotification(`Successfully uploaded: ${result.uploaded_files.join(', ')}`, 'success');
523
+ } else if (!result.errors || result.errors.length === 0) {
524
+ showNotification("Files processed, but no new files were added (might already exist).", 'info');
525
+ }
526
+
527
+
528
+ // Update dropdown
529
+ const currentSelection = cloneReferenceSelect.value;
530
+ cloneReferenceSelect.innerHTML = '<option value="none">-- Select Reference File --</option>'; // Clear existing options
531
+ result.all_reference_files.forEach(filename => {
532
+ const option = document.createElement('option');
533
+ option.value = filename;
534
+ option.textContent = filename;
535
+ cloneReferenceSelect.appendChild(option);
536
+ });
537
+
538
+ // Select the first newly uploaded file, or keep current selection if still valid
539
+ const firstUploaded = result.uploaded_files ? result.uploaded_files[0] : null;
540
+ if (firstUploaded) {
541
+ cloneReferenceSelect.value = firstUploaded;
542
+ } else if (result.all_reference_files.includes(currentSelection)) {
543
+ cloneReferenceSelect.value = currentSelection; // Restore previous valid selection
544
+ } else {
545
+ cloneReferenceSelect.value = 'none'; // Default if nothing else matches
546
+ }
547
+
548
+ } catch (error) {
549
+ console.error('Error uploading reference files:', error);
550
+ showNotification(`Upload Error: ${error.message}`, 'error');
551
+ } finally {
552
+ cloneLoadButton.disabled = false;
553
+ cloneLoadButton.textContent = 'Load';
554
+ cloneFileInput.value = ''; // Reset file input
555
+ }
556
+ });
557
+ }
558
+
559
+ // --- Theme Toggle ---
560
+ function applyTheme(theme) {
561
+ if (theme === 'light') {
562
+ document.documentElement.classList.remove('dark');
563
+ if (themeIconLight) themeIconLight.classList.remove('hidden');
564
+ if (themeIconDark) themeIconDark.classList.add('hidden');
565
+ } else {
566
+ document.documentElement.classList.add('dark');
567
+ if (themeIconLight) themeIconLight.classList.add('hidden');
568
+ if (themeIconDark) themeIconDark.classList.remove('hidden');
569
+ }
570
+ // Update wavesurfer colors if it exists
571
+ if (wavesurfer) {
572
+ wavesurfer.setOptions({
573
+ waveColor: theme === 'light' ? '#0ea5e9' : '#38bdf8',
574
+ progressColor: theme === 'light' ? '#0369a1' : '#0284c7',
575
+ cursorColor: theme === 'light' ? '#9333ea' : '#a855f7',
576
+ });
577
+ }
578
+ }
579
+
580
+ if (themeToggleButton) {
581
+ // Check localStorage on load
582
+ const savedTheme = localStorage.getItem('theme') || 'dark'; // Default to dark
583
+ applyTheme(savedTheme);
584
+
585
+ themeToggleButton.addEventListener('click', () => {
586
+ const isDark = document.documentElement.classList.contains('dark');
587
+ const newTheme = isDark ? 'light' : 'dark';
588
+ applyTheme(newTheme);
589
+ localStorage.setItem('theme', newTheme); // Save preference
590
+ });
591
+ }
592
+
593
+ }); // End DOMContentLoaded
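
The fetch() calls above assume a set of server routes (/health, /save_config, /save_generation_defaults, /restart_server, /upload_reference) implemented in server.py, outside this section. A FastAPI stub sketching the response shapes the script relies on -- routes inferred from the client code, not copied from the server:

# Stub of the contract script.js expects; shapes inferred from the fetch() calls.
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
async def health():
    # Polled by checkServerReady() after a restart; any 200 response counts as ready.
    return {"status": "ok"}

@app.post("/save_generation_defaults")
async def save_generation_defaults(params: dict):
    # Receives {slider_id: value, ...}; the UI displays the returned `message`.
    return {"message": "Generation defaults saved."}

@app.post("/restart_server")
async def restart_server():
    # The UI shows `message`, then polls /health until the server responds again.
    return {"message": "Server restart initiated."}
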
utils.py ADDED
@@ -0,0 +1,146 @@
1
+ # utils.py
2
+ # Utility functions for the Dia TTS server
3
+
4
+ import logging
5
+ import time
6
+ import os
7
+ import io
8
+ import numpy as np
9
+ import soundfile as sf
10
+ from typing import Optional, Tuple
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # --- Audio Processing ---
15
+
16
+
17
+ def encode_audio(
18
+ audio_array: np.ndarray, sample_rate: int, output_format: str = "opus"
19
+ ) -> Optional[bytes]:
20
+ """
21
+ Encodes a NumPy audio array into the specified format in memory.
22
+
23
+ Args:
24
+ audio_array: NumPy array containing audio data (float32, range [-1, 1]).
25
+ sample_rate: Sample rate of the audio data.
26
+ output_format: Desired output format ('opus' or 'wav').
27
+
28
+ Returns:
29
+ Bytes object containing the encoded audio, or None on failure.
30
+ """
31
+ if audio_array is None or audio_array.size == 0:
32
+ logger.warning("encode_audio received empty or None audio array.")
33
+ return None
34
+
35
+ start_time = time.time()
36
+ output_buffer = io.BytesIO()
37
+
38
+ try:
39
+ if output_format == "opus":
40
+ # Soundfile expects int16 for Opus usually, but let's try float32 first
41
+ # It might convert internally or require specific subtypes.
42
+ # If this fails, we might need to convert to int16 first:
43
+ # audio_int16 = (audio_array * 32767).astype(np.int16)
44
+ # sf.write(output_buffer, audio_int16, sample_rate, format='ogg', subtype='opus')
45
+ sf.write(
46
+ output_buffer, audio_array, sample_rate, format="ogg", subtype="opus"
47
+ )
48
+ content_type = "audio/ogg; codecs=opus"
49
+ elif output_format == "wav":
50
+ # WAV typically uses int16
51
+ audio_int16 = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)  # clip to avoid int16 wraparound
52
+ sf.write(
53
+ output_buffer, audio_int16, sample_rate, format="wav", subtype="pcm_16"
54
+ )
55
+ content_type = "audio/wav"
56
+ else:
57
+ logger.error(f"Unsupported output format requested: {output_format}")
58
+ return None
59
+
60
+ encoded_bytes = output_buffer.getvalue()
61
+ end_time = time.time()
62
+ logger.info(
63
+ f"Encoded {len(encoded_bytes)} bytes to {output_format} in {end_time - start_time:.3f} seconds."
64
+ )
65
+ return encoded_bytes
66
+
67
+ except ImportError:
68
+ logger.critical(
69
+ "`soundfile` or its dependency `libsndfile` not found/installed correctly. Cannot encode audio."
70
+ )
71
+ raise # Re-raise critical error
72
+ except Exception as e:
73
+ logger.error(f"Error encoding audio to {output_format}: {e}", exc_info=True)
74
+ return None
75
+
76
+
77
+ def save_audio_to_file(
78
+ audio_array: np.ndarray, sample_rate: int, file_path: str
79
+ ) -> bool:
80
+ """
81
+ Saves a NumPy audio array to a WAV file.
82
+
83
+ Args:
84
+ audio_array: NumPy array containing audio data (float32, range [-1, 1]).
85
+ sample_rate: Sample rate of the audio data.
86
+ file_path: Path to save the WAV file.
87
+
88
+ Returns:
89
+ True if saving was successful, False otherwise.
90
+ """
91
+ if audio_array is None or audio_array.size == 0:
92
+ logger.warning("save_audio_to_file received empty or None audio array.")
93
+ return False
94
+ if not file_path.lower().endswith(".wav"):
95
+ logger.warning(
96
+ f"File path '{file_path}' does not end with .wav. Saving as WAV anyway."
97
+ )
98
+ # Optionally change the extension: file_path += ".wav"
99
+
100
+ start_time = time.time()
101
+ try:
102
+ # Ensure output directory exists
103
+ output_dir = os.path.dirname(file_path)
+ if output_dir:  # dirname is "" for bare filenames; os.makedirs("") would raise
+     os.makedirs(output_dir, exist_ok=True)
104
+
105
+ # WAV typically uses int16
106
+ audio_int16 = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)  # clip to avoid int16 wraparound
107
+ sf.write(file_path, audio_int16, sample_rate, format="wav", subtype="pcm_16")
108
+
109
+ end_time = time.time()
110
+ logger.info(
111
+ f"Saved WAV file to {file_path} in {end_time - start_time:.3f} seconds."
112
+ )
113
+ return True
114
+ except ImportError:
115
+ logger.critical(
116
+ "`soundfile` or its dependency `libsndfile` not found/installed correctly. Cannot save audio."
117
+ )
118
+ return False # Indicate failure
119
+ except Exception as e:
120
+ logger.error(f"Error saving WAV file to {file_path}: {e}", exc_info=True)
121
+ return False
122
+
123
+
124
+ # --- Other Utilities (Optional) ---
125
+
126
+
127
+ class PerformanceMonitor:
128
+ """Simple performance monitoring."""
129
+
130
+ def __init__(self):
131
+ self.start_time = time.time()
132
+ self.events = []
133
+
134
+ def record(self, event_name: str):
135
+ self.events.append((event_name, time.time()))
136
+
137
+ def report(self) -> str:
138
+ report_lines = ["Performance Report:"]
139
+ last_time = self.start_time
140
+ total_duration = time.time() - self.start_time
141
+ for name, timestamp in self.events:
142
+ duration = timestamp - last_time
143
+ report_lines.append(f" - {name}: {duration:.3f}s")
144
+ last_time = timestamp
145
+ report_lines.append(f"Total Duration: {total_duration:.3f}s")
146
+ return "\n".join(report_lines)