Upload 14 files

- app.py +561 -0
- qhash/autoencoder.py +26 -0
- qhash/backbone.py +50 -0
- qhash/codebook_pattern.py +12 -0
- qhash/conditioning.py +373 -0
- qhash/config.py +38 -0
- qhash/model.py +270 -0
- qhash/sampling.py +141 -0
- qhash/speaker_cloning.py +406 -0
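At its core, the added app.py (shown in full below) loads one of the two Qhash checkpoints through the uploaded qhash package and turns a conditioning dictionary into audio. The following is a minimal sketch distilled from the calls in app.py, not a verbatim excerpt: it assumes a CUDA device, the qhash modules from this commit on the path, and leaves all other conditioning values at their defaults; the hyperparameter values are the UI defaults from the app.

    # Minimal usage sketch, distilled from app.py below (CUDA device assumed).
    import torch
    from qhash.model import Zonos
    from qhash.conditioning import make_cond_dict

    model = Zonos.from_pretrained("Quantumhash/Qhash-v0.1-transformer", device="cuda")
    model.requires_grad_(False).eval()

    # Build conditioning; other keys (speaker, emotion, vqscore_8, ...) keep their defaults here.
    cond_dict = make_cond_dict(
        text="Qhash uses eSpeak for text to phoneme conversion!",
        language="en-us",
        device="cuda",
    )
    conditioning = model.prepare_conditioning(cond_dict)

    # UI defaults: cfg_scale=2.0, min_p=0.15, up to ~30 s of audio at 86 tokens/s.
    codes = model.generate(
        prefix_conditioning=conditioning,
        max_new_tokens=86 * 30,
        cfg_scale=2.0,
        batch_size=1,
        sampling_params=dict(min_p=0.15),
    )
    wav = model.autoencoder.decode(codes).cpu()  # decode codes back to a waveform tensor
    sr = model.autoencoder.sampling_rate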
 
    	
app.py  ADDED
@@ -0,0 +1,561 @@
import os
import shlex
import subprocess

subprocess.run(
    shlex.split("pip install flash-attn --no-build-isolation"),
    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)

import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv

from qhash.model import Zonos
from qhash.conditioning import make_cond_dict, supported_language_codes

device = "cuda"
MODEL_NAMES = ["Quantumhash/Qhash-v0.1-transformer", "Quantumhash/Qhash-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
for model in MODELS.values():
    model.requires_grad_(False).eval()


def update_ui(model_choice):
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
    """
    model = MODELS[model_choice]
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    text_update = gr.update(visible=("espeak" in cond_names))
    language_update = gr.update(visible=("espeak" in cond_names))
    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
    prefix_audio_update = gr.update(visible=True)
    emotion1_update = gr.update(visible=("emotion" in cond_names))
    emotion2_update = gr.update(visible=("emotion" in cond_names))
    emotion3_update = gr.update(visible=("emotion" in cond_names))
    emotion4_update = gr.update(visible=("emotion" in cond_names))
    emotion5_update = gr.update(visible=("emotion" in cond_names))
    emotion6_update = gr.update(visible=("emotion" in cond_names))
    emotion7_update = gr.update(visible=("emotion" in cond_names))
    emotion8_update = gr.update(visible=("emotion" in cond_names))
    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
    unconditional_keys_update = gr.update(
        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
    )

    return (
        text_update,
        language_update,
        speaker_audio_update,
        prefix_audio_update,
        emotion1_update,
        emotion2_update,
        emotion3_update,
        emotion4_update,
        emotion5_update,
        emotion6_update,
        emotion7_update,
        emotion8_update,
        vq_single_slider_update,
        fmax_slider_update,
        pitch_std_slider_update,
        speaking_rate_slider_update,
        dnsmos_slider_update,
        speaker_noised_checkbox_update,
        unconditional_keys_update,
    )


@spaces.GPU(duration=120)
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    vq_single,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    speaker_noised,
    cfg_scale,
    min_p,
    seed,
    randomize_seed,
    unconditional_keys,
    progress=gr.Progress(),
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.
    """
    selected_model = MODELS[model_choice]

    speaker_noised_bool = bool(speaker_noised)
    fmax = float(fmax)
    pitch_std = float(pitch_std)
    speaking_rate = float(speaking_rate)
    dnsmos_ovrl = float(dnsmos_ovrl)
    cfg_scale = float(cfg_scale)
    min_p = float(min_p)
    seed = int(seed)
    max_new_tokens = 86 * 30

    if randomize_seed:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
    torch.manual_seed(seed)

    speaker_embedding = None
    if speaker_audio is not None and "speaker" not in unconditional_keys:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)

    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))

    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)

    vq_val = float(vq_single)
    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)

    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=speaker_noised_bool,
        device=device,
        unconditional_keys=unconditional_keys,
    )
    conditioning = selected_model.prepare_conditioning(cond_dict)

    estimated_generation_duration = 30 * len(text) / 400
    estimated_total_steps = int(estimated_generation_duration * 86)

    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
        progress((step, estimated_total_steps))
        return True

    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p),
        callback=update_progress,
    )

    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    sr_out = selected_model.autoencoder.sampling_rate
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return (sr_out, wav_out.squeeze().numpy()), seed


# Custom CSS for pastel gradient background and enhanced UI
custom_css = """
.gradio-container {
    background: #0C101B;
    background-size: 400% 400%;
    animation: gradient 15s ease infinite;
}

@keyframes gradient {
    0% {
        background-position: 0% 50%;
    }
    50% {
        background-position: 100% 50%;
    }
    100% {
        background-position: 0% 50%;
    }
}

.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
}

.panel {
    background-color: rgba(159, 153, 96, 0.9);
    border-radius: 16px;
    padding: 20px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
    margin-bottom: 16px;
    backdrop-filter: blur(5px);
    transition: all 0.3s ease;
}

.panel p {
    font-size: 1.1em;
    color: black;
}
.panel:hover {
    box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
    transform: translateY(-2px);
}

.title {
    font-size: 1.2em;
    font-weight: 600;
    margin-bottom: 12px;
    color: #6a3ea1;
    border-bottom: 2px solid #f0e6ff;
    padding-bottom: 8px;
}

.slider-container {
    background-color: rgba(255, 255, 255, 0.5);
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}

/* Make sliders more appealing */
input[type=range] {
    height: 5px;
    appearance: none;
    width: 100%;
    border-radius: 3px;
    background: linear-gradient(90deg, #9c83e0, #83b1e0);
}

.generate-button {
    background: linear-gradient(90deg, #a673ff, #7c4dff);
    color: white;
    border: none;
    border-radius: 8px;
    padding: 12px 24px;
    font-size: 16px;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
    display: block;
    width: 100%;
    margin: 20px 0;
}

.generate-button:hover {
    background: linear-gradient(90deg, #9c5eff, #6a3aff);
    box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
    transform: translateY(-2px);
}

/* Tabs styling */
.tabs {
    display: flex;
    border-bottom: 1px solid #e0e0e0;
    margin-bottom: 20px;
}

.tab {
    padding: 10px 20px;
    cursor: pointer;
    transition: all 0.3s ease;
    background-color: transparent;
    border: none;
    color: #666;
}

.tab.active {
    color: #7c4dff;
    border-bottom: 3px solid #7c4dff;
    font-weight: 600;
}

/* Emotion sliders container */
.emotion-grid {
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 12px;
}

/* Header styling */
.app-header {
    text-align: center;
    margin-bottom: 25px;
}

.app-header h1 {
    font-size: 2.5em;
    color: #6a3ea1;
    margin-bottom: 8px;
    font-weight: 700;
}

.app-header p {
    font-size: 1.1em;
    color: #6a3ea1;
    margin-bottom: 20px;
}

/* Audio player styling */
.audio-output {
    margin-top: 20px;
}

/* Make output area more prominent */
.output-container {
    background-color: rgba(24, 82, 79, 0.85);
    border-radius: 16px;
    padding: 24px;
    box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
    margin-top: 20px;
}
"""


def build_interface():
    # Build interface with enhanced visual elements and layout
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
        # Header section
        with gr.Column(elem_classes="app-header"):
            gr.Markdown("# ✨ Qhash Text-to-Speech Clone ✨")
            gr.Markdown("Create natural-sounding speech with customizable voice characteristics")

        # Main content container
        with gr.Column(elem_classes="container"):
            # First panel - Text & Model Selection
            with gr.Column(elem_classes="panel"):
                gr.Markdown("💬 Text & Model Configuration")
                with gr.Row():
                    with gr.Column(scale=2):
                        model_choice = gr.Dropdown(
                            choices=MODEL_NAMES,
                            value="Quantumhash/Qhash-v0.1-transformer",
                            label="Qhash Model Type",
                            info="Select the model variant to use.",
                        )
                        text = gr.Textbox(
                            label="Text to Synthesize",
                            value="Qhash uses eSpeak for text to phoneme conversion!",
                            lines=4,
                            max_length=500,
                        )
                        language = gr.Dropdown(
                            choices=supported_language_codes,
                            value="en-us",
                            label="Language Code",
                            info="Select a language code.",
                        )
                    with gr.Column(scale=1):
                        prefix_audio = gr.Audio(
                            value="assets/silence_100ms.wav",
                            label="Optional Prefix Audio (continue from this audio)",
                            type="filepath",
                        )

            # Second panel - Voice Characteristics
            with gr.Column(elem_classes="panel"):
                gr.Markdown("🎤 Voice Characteristics")
                with gr.Row():
                    with gr.Column(scale=1):
                        speaker_audio = gr.Audio(
                            label="Optional Speaker Audio (for voice cloning)",
                            type="filepath",
                        )
                        speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)

                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column():
                                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
                                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
                                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
                            with gr.Column():
                                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
                                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")

            # Third panel - Generation Parameters
            with gr.Column(elem_classes="panel"):
                gr.Markdown("⚙️ Generation Parameters")
                with gr.Row():
                    with gr.Column():
                        cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
                        min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
                    with gr.Column():
                        seed_number = gr.Number(label="Seed", value=420, precision=0)
                        randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)

            # Emotion Panel with Tabbed Interface
            with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "Adjust these sliders to control the emotional tone of the generated speech.\n"
                    "For a neutral voice, keep 'Neutral' high and other emotions low."
                )
                with gr.Row(elem_classes="emotion-grid"):
                    emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
                    emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
                    emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
                    emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
                with gr.Row(elem_classes="emotion-grid"):
                    emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
                    emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
                    emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
                    emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")

            # Advanced Settings Panel
            with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "### Unconditional Toggles\n"
                    "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
                    'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
                )
                unconditional_keys = gr.CheckboxGroup(
                    [
                        "speaker",
                        "emotion",
                        "vqscore_8",
                        "fmax",
                        "pitch_std",
                        "speaking_rate",
                        "dnsmos_ovrl",
                        "speaker_noised",
                    ],
                    value=["emotion"],
                    label="Unconditional Keys",
                )

            # Generate Button and Output Area
            with gr.Column(elem_classes="panel output-container"):
                gr.Markdown("🔊 Generate & Output")
                generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
                output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")

        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )

        # On page load, trigger the same UI refresh
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )

        # Generate audio on button click
         
     | 
| 524 | 
         
            +
                    generate_button.click(
         
     | 
| 525 | 
         
            +
                        fn=generate_audio,
         
     | 
| 526 | 
         
            +
                        inputs=[
         
     | 
| 527 | 
         
            +
                            model_choice,
         
     | 
| 528 | 
         
            +
                            text,
         
     | 
| 529 | 
         
            +
                            language,
         
     | 
| 530 | 
         
            +
                            speaker_audio,
         
     | 
| 531 | 
         
            +
                            prefix_audio,
         
     | 
| 532 | 
         
            +
                            emotion1,
         
     | 
| 533 | 
         
            +
                            emotion2,
         
     | 
| 534 | 
         
            +
                            emotion3,
         
     | 
| 535 | 
         
            +
                            emotion4,
         
     | 
| 536 | 
         
            +
                            emotion5,
         
     | 
| 537 | 
         
            +
                            emotion6,
         
     | 
| 538 | 
         
            +
                            emotion7,
         
     | 
| 539 | 
         
            +
                            emotion8,
         
     | 
| 540 | 
         
            +
                            vq_single_slider,
         
     | 
| 541 | 
         
            +
                            fmax_slider,
         
     | 
| 542 | 
         
            +
                            pitch_std_slider,
         
     | 
| 543 | 
         
            +
                            speaking_rate_slider,
         
     | 
| 544 | 
         
            +
                            dnsmos_slider,
         
     | 
| 545 | 
         
            +
                            speaker_noised_checkbox,
         
     | 
| 546 | 
         
            +
                            cfg_scale_slider,
         
     | 
| 547 | 
         
            +
                            min_p_slider,
         
     | 
| 548 | 
         
            +
                            seed_number,
         
     | 
| 549 | 
         
            +
                            randomize_seed_toggle,
         
     | 
| 550 | 
         
            +
                            unconditional_keys,
         
     | 
| 551 | 
         
            +
                        ],
         
     | 
| 552 | 
         
            +
                        outputs=[output_audio, seed_number],
         
     | 
| 553 | 
         
            +
                    )
         
     | 
| 554 | 
         
            +
             
     | 
| 555 | 
         
            +
                return demo
         
     | 
| 556 | 
         
            +
             
     | 
| 557 | 
         
            +
             
     | 
| 558 | 
         
            +
            if __name__ == "__main__":
         
     | 
| 559 | 
         
            +
                demo = build_interface()
         
     | 
| 560 | 
         
            +
                share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
         
     | 
| 561 | 
         
            +
                demo.launch(server_name="0.0.0.0", server_port=7860, share=share, mcp_server=True)
         
     | 
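Note (not part of the uploaded files): demo.launch binds the app to 0.0.0.0:7860; setting the GRADIO_SHARE environment variable to "true", "1", or "t" (case-insensitive) enables a public share link, and mcp_server=True additionally exposes the app's endpoints through Gradio's MCP server support.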
    	
qhash/autoencoder.py
ADDED
@@ -0,0 +1,26 @@
import math

import torch
import torchaudio
from transformers.models.dac import DacModel


class DACAutoencoder:
    def __init__(self):
        super().__init__()
        self.dac = DacModel.from_pretrained("Quantumhash/dac_44khz")
        self.dac.eval().requires_grad_(False)
        self.codebook_size = self.dac.config.codebook_size
        self.num_codebooks = self.dac.quantizer.n_codebooks
        self.sampling_rate = self.dac.config.sampling_rate

    def preprocess(self, wav: torch.Tensor, sr: int) -> torch.Tensor:
        wav = torchaudio.functional.resample(wav, sr, 44_100)
        right_pad = math.ceil(wav.shape[-1] / 512) * 512 - wav.shape[-1]
        return torch.nn.functional.pad(wav, (0, right_pad))

    def encode(self, wav: torch.Tensor) -> torch.Tensor:
        return self.dac.encode(wav).audio_codes

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        return self.dac.decode(audio_codes=codes).audio_values.unsqueeze(1)
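Usage sketch (not part of the uploaded files) for DACAutoencoder above, assuming the Quantumhash/dac_44khz checkpoint is downloadable: preprocess resamples to 44.1 kHz and right-pads to a multiple of 512 samples, encode produces discrete codes, and decode reconstructs audio.

import torch

from qhash.autoencoder import DACAutoencoder

dac = DACAutoencoder()
wav = torch.zeros(1, 1, 16_000)        # one second of silence at 16 kHz: [batch, channels, samples]
wav = dac.preprocess(wav, sr=16_000)   # resample to 44.1 kHz and pad to a multiple of 512
codes = dac.encode(wav)                # discrete codes: [batch, num_codebooks, frames]
audio = dac.decode(codes)              # reconstructed waveform: [batch, 1, samples] at 44.1 kHz
print(codes.shape, audio.shape)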
    	
qhash/backbone.py
ADDED
@@ -0,0 +1,50 @@
import torch
import torch.nn as nn
from mamba_ssm.models.mixer_seq_simple import create_block
from mamba_ssm.ops.triton.layer_norm import layer_norm_fn
from mamba_ssm.utils.generation import InferenceParams

from qhash.config import BackboneConfig


class ZonosBackbone(nn.Module):
    def __init__(self, config: BackboneConfig):
        super().__init__()
        self.config = config

        self.layers = nn.ModuleList(
            [
                create_block(
                    d_model=config.d_model,
                    d_intermediate=config.d_intermediate
                    if (i not in config.attn_layer_idx)
                    else config.attn_mlp_d_intermediate,
                    ssm_cfg=config.ssm_cfg,
                    layer_idx=i,
                    attn_layer_idx=config.attn_layer_idx,
                    attn_cfg=config.attn_cfg,
                    norm_epsilon=config.norm_epsilon,
                    residual_in_fp32=config.residual_in_fp32,
                    fused_add_norm=True,
                    rms_norm=config.rms_norm,
                )
                for i in range(config.n_layer)
            ]
        )

        self.norm_f = nn.LayerNorm(config.d_model, eps=config.norm_epsilon)

    def forward(self, hidden_states: torch.Tensor, inference_params: InferenceParams | None = None):
        residual = None
        for layer in self.layers:
            hidden_states, residual = layer(hidden_states, residual, inference_params)

        return layer_norm_fn(
            hidden_states,
            self.norm_f.weight,
            self.norm_f.bias,
            residual,
            eps=self.norm_f.eps,
            residual_in_fp32=self.config.residual_in_fp32,
            is_rms_norm=self.config.rms_norm,
        )
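Forward-pass sketch (not part of the uploaded files) for ZonosBackbone above. It assumes BackboneConfig in qhash/config.py provides usable defaults and that a CUDA device with mamba-ssm and its Triton kernels is available, since the fused add-norm path is GPU-only.

import torch

from qhash.backbone import ZonosBackbone
from qhash.config import BackboneConfig

config = BackboneConfig()                      # assumption: the config dataclass defines default values
backbone = ZonosBackbone(config).cuda().eval()

hidden = torch.randn(1, 16, config.d_model, device="cuda")  # [batch, seq_len, d_model]
with torch.no_grad():
    out = backbone(hidden)                     # normalized hidden states, same shape as the input
print(out.shape)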
    	
qhash/codebook_pattern.py
ADDED
@@ -0,0 +1,12 @@
import torch
import torch.nn.functional as F


def apply_delay_pattern(codes: torch.Tensor, mask_token: int):
    codes = F.pad(codes, (0, codes.shape[1]), value=mask_token)
    return torch.stack([codes[:, k].roll(k + 1) for k in range(codes.shape[1])], dim=1)


def revert_delay_pattern(codes: torch.Tensor):
    _, n_q, seq_len = codes.shape
    return torch.stack([codes[:, k, k + 1 : seq_len - n_q + k + 1] for k in range(n_q)], dim=1)
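Round-trip sketch (not part of the uploaded files) for the two helpers above: apply_delay_pattern shifts the k-th codebook right by k+1 steps and pads with a mask token (a MusicGen-style delay pattern), and revert_delay_pattern realigns the codebooks and drops the padding, recovering the original codes.

import torch

from qhash.codebook_pattern import apply_delay_pattern, revert_delay_pattern

mask_token = 1024                                  # assumed mask id; the model supplies its own value
codes = torch.randint(0, 1024, (1, 9, 4))          # [batch, n_codebooks, seq_len]
delayed = apply_delay_pattern(codes, mask_token)   # [1, 9, 4 + 9]
restored = revert_delay_pattern(delayed)           # [1, 9, 4]
assert torch.equal(restored, codes)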
    	
        qhash/conditioning.py
    ADDED
    
    | 
         @@ -0,0 +1,373 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from functools import cache
         
     | 
| 2 | 
         
            +
            from typing import Any, Literal, Iterable
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            import torch
         
     | 
| 5 | 
         
            +
            import torch.nn as nn
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            from qhash.config import PrefixConditionerConfig
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            class Conditioner(nn.Module):
         
     | 
| 11 | 
         
            +
                def __init__(
         
     | 
| 12 | 
         
            +
                    self,
         
     | 
| 13 | 
         
            +
                    output_dim: int,
         
     | 
| 14 | 
         
            +
                    name: str,
         
     | 
| 15 | 
         
            +
                    cond_dim: int | None = None,
         
     | 
| 16 | 
         
            +
                    projection: Literal["none", "linear", "mlp"] = "none",
         
     | 
| 17 | 
         
            +
                    uncond_type: Literal["learned", "none"] = "none",
         
     | 
| 18 | 
         
            +
                    **kwargs,
         
     | 
| 19 | 
         
            +
                ):
         
     | 
| 20 | 
         
            +
                    super().__init__()
         
     | 
| 21 | 
         
            +
                    self.name = name
         
     | 
| 22 | 
         
            +
                    self.output_dim = output_dim
         
     | 
| 23 | 
         
            +
                    self.cond_dim = cond_dim = cond_dim or output_dim
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
                    if projection == "linear":
         
     | 
| 26 | 
         
            +
                        self.project = nn.Linear(cond_dim, output_dim)
         
     | 
| 27 | 
         
            +
                    elif projection == "mlp":
         
     | 
| 28 | 
         
            +
                        self.project = nn.Sequential(
         
     | 
| 29 | 
         
            +
                            nn.Linear(cond_dim, output_dim),
         
     | 
| 30 | 
         
            +
                            nn.SiLU(),
         
     | 
| 31 | 
         
            +
                            nn.Linear(output_dim, output_dim),
         
     | 
| 32 | 
         
            +
                        )
         
     | 
| 33 | 
         
            +
                    else:
         
     | 
| 34 | 
         
            +
                        self.project = nn.Identity()
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
                    self.uncond_vector = None
         
     | 
| 37 | 
         
            +
                    if uncond_type == "learned":
         
     | 
| 38 | 
         
            +
                        self.uncond_vector = nn.Parameter(torch.zeros(output_dim))
         
     | 
| 39 | 
         
            +
             
     | 
| 40 | 
         
            +
                def apply_cond(self, *inputs: Any) -> torch.Tensor:
         
     | 
| 41 | 
         
            +
                    raise NotImplementedError()
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
                def forward(self, inputs: tuple[Any, ...] | None) -> torch.Tensor:
         
     | 
| 44 | 
         
            +
                    if inputs is None:
         
     | 
| 45 | 
         
            +
                        assert self.uncond_vector is not None
         
     | 
| 46 | 
         
            +
                        return self.uncond_vector.data.view(1, 1, -1)
         
     | 
| 47 | 
         
            +
             
     | 
| 48 | 
         
            +
                    cond = self.apply_cond(*inputs)
         
     | 
| 49 | 
         
            +
                    cond = self.project(cond)
         
     | 
| 50 | 
         
            +
                    return cond
         
     | 
| 51 | 
         
            +
             
     | 
| 52 | 
         
            +
             
     | 
| 53 | 
         
            +
            # ------- ESPEAK CONTAINMENT ZONE ------------------------------------------------------------------------------------------------------------------------------------------------
         
     | 
| 54 | 
         
            +
            import re
         
     | 
| 55 | 
         
            +
            import unicodedata
         
     | 
| 56 | 
         
            +
             
     | 
| 57 | 
         
            +
            import inflect
         
     | 
| 58 | 
         
            +
            import torch
         
     | 
| 59 | 
         
            +
            import torch.nn as nn
         
     | 
| 60 | 
         
            +
            from kanjize import number2kanji
         
     | 
| 61 | 
         
            +
            from phonemizer.backend import EspeakBackend
         
     | 
| 62 | 
         
            +
            from sudachipy import Dictionary, SplitMode
         
     | 
| 63 | 
         
            +
             
     | 
| 64 | 
         
            +
            # --- Number normalization code from https://github.com/daniilrobnikov/vits2/blob/main/text/normalize_numbers.py ---
         
     | 
| 65 | 
         
            +
             
     | 
| 66 | 
         
            +
            _inflect = inflect.engine()
         
     | 
| 67 | 
         
            +
            _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
         
     | 
| 68 | 
         
            +
            _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
         
     | 
| 69 | 
         
            +
            _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
         
     | 
| 70 | 
         
            +
            _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
         
     | 
| 71 | 
         
            +
            _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
         
     | 
| 72 | 
         
            +
            _number_re = re.compile(r"[0-9]+")
         
     | 
| 73 | 
         
            +
             
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
            def _remove_commas(m: re.Match) -> str:
         
     | 
| 76 | 
         
            +
                return m.group(1).replace(",", "")
         
     | 
| 77 | 
         
            +
             
     | 
| 78 | 
         
            +
             
     | 
| 79 | 
         
            +
            def _expand_decimal_point(m: re.Match) -> str:
         
     | 
| 80 | 
         
            +
                return m.group(1).replace(".", " point ")
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
             
     | 
| 83 | 
         
            +
            def _expand_dollars(m: re.Match) -> str:
         
     | 
| 84 | 
         
            +
                match = m.group(1)
         
     | 
| 85 | 
         
            +
                parts = match.split(".")
         
     | 
| 86 | 
         
            +
                if len(parts) > 2:
         
     | 
| 87 | 
         
            +
                    return match + " dollars"  # Unexpected format
         
     | 
| 88 | 
         
            +
                dollars = int(parts[0]) if parts[0] else 0
         
     | 
| 89 | 
         
            +
                cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
         
     | 
| 90 | 
         
            +
                if dollars and cents:
         
     | 
| 91 | 
         
            +
                    dollar_unit = "dollar" if dollars == 1 else "dollars"
         
     | 
| 92 | 
         
            +
                    cent_unit = "cent" if cents == 1 else "cents"
         
     | 
| 93 | 
         
            +
                    return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
         
     | 
| 94 | 
         
            +
                elif dollars:
         
     | 
| 95 | 
         
            +
                    dollar_unit = "dollar" if dollars == 1 else "dollars"
         
     | 
| 96 | 
         
            +
                    return "%s %s" % (dollars, dollar_unit)
         
     | 
| 97 | 
         
            +
                elif cents:
         
     | 
| 98 | 
         
            +
                    cent_unit = "cent" if cents == 1 else "cents"
         
     | 
| 99 | 
         
            +
                    return "%s %s" % (cents, cent_unit)
         
     | 
| 100 | 
         
            +
                else:
         
     | 
| 101 | 
         
            +
                    return "zero dollars"
         
     | 
| 102 | 
         
            +
             
     | 
| 103 | 
         
            +
             
     | 
| 104 | 
         
            +
            def _expand_ordinal(m: re.Match) -> str:
         
     | 
| 105 | 
         
            +
                return _inflect.number_to_words(m.group(0))
         
     | 
| 106 | 
         
            +
             
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
            def _expand_number(m: re.Match) -> str:
         
     | 
| 109 | 
         
            +
                num = int(m.group(0))
         
     | 
| 110 | 
         
            +
                if num > 1000 and num < 3000:
         
     | 
| 111 | 
         
            +
                    if num == 2000:
         
     | 
| 112 | 
         
            +
                        return "two thousand"
         
     | 
| 113 | 
         
            +
                    elif num > 2000 and num < 2010:
         
     | 
| 114 | 
         
            +
                        return "two thousand " + _inflect.number_to_words(num % 100)
         
     | 
| 115 | 
         
            +
                    elif num % 100 == 0:
         
     | 
| 116 | 
         
            +
                        return _inflect.number_to_words(num // 100) + " hundred"
         
     | 
| 117 | 
         
            +
                    else:
         
     | 
| 118 | 
         
            +
                        return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
         
     | 
| 119 | 
         
            +
                else:
         
     | 
| 120 | 
         
            +
                    return _inflect.number_to_words(num, andword="")
         
     | 
| 121 | 
         
            +
             
     | 
| 122 | 
         
            +
             
     | 
| 123 | 
         
            +
            def normalize_numbers(text: str) -> str:
         
     | 
| 124 | 
         
            +
                text = re.sub(_comma_number_re, _remove_commas, text)
         
     | 
| 125 | 
         
            +
                text = re.sub(_pounds_re, r"\1 pounds", text)
         
     | 
| 126 | 
         
            +
                text = re.sub(_dollars_re, _expand_dollars, text)
         
     | 
| 127 | 
         
            +
                text = re.sub(_decimal_number_re, _expand_decimal_point, text)
         
     | 
| 128 | 
         
            +
                text = re.sub(_ordinal_re, _expand_ordinal, text)
         
     | 
| 129 | 
         
            +
                text = re.sub(_number_re, _expand_number, text)
         
     | 
| 130 | 
         
            +
                return text
         
     | 
| 131 | 
         
            +
             
     | 
| 132 | 
         
            +
             
     | 
| 133 | 
         
            +
            # --- Number normalization code end ---
         
     | 
| 134 | 
         
            +
             
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            PAD_ID, UNK_ID, BOS_ID, EOS_ID = 0, 1, 2, 3
         
     | 
| 137 | 
         
            +
            SPECIAL_TOKEN_IDS = [PAD_ID, UNK_ID, BOS_ID, EOS_ID]
         
     | 
| 138 | 
         
            +
             
     | 
| 139 | 
         
            +
            _punctuation = ';:,.!?¡¿—…"«»“”() *~-/\\&'
         
     | 
| 140 | 
         
            +
            _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
         
     | 
| 141 | 
         
            +
            _letters_ipa = (
         
     | 
| 142 | 
         
            +
                "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
         
     | 
| 143 | 
         
            +
            )
         
     | 
| 144 | 
         
            +
             
     | 
| 145 | 
         
            +
            symbols = [*_punctuation, *_letters, *_letters_ipa]
         
     | 
| 146 | 
         
            +
            _symbol_to_id = {s: i for i, s in enumerate(symbols, start=len(SPECIAL_TOKEN_IDS))}
         
     | 
| 147 | 
         
            +
             
     | 
| 148 | 
         
            +
             
     | 
| 149 | 
         
            +
            def _get_symbol_id(s: str) -> int:
         
     | 
| 150 | 
         
            +
                return _symbol_to_id.get(s, 1)
         
     | 
| 151 | 
         
            +
             
     | 
| 152 | 
         
            +
             
     | 
| 153 | 
         
            +
            def get_symbol_ids(text: str) -> list[int]:
         
     | 
| 154 | 
         
            +
                return list(map(_get_symbol_id, text))
         
     | 
| 155 | 
         
            +
             
     | 
| 156 | 
         
            +
             
     | 
| 157 | 
         
            +
            def tokenize_phonemes(phonemes: list[str]) -> tuple[torch.Tensor, list[int]]:
         
     | 
| 158 | 
         
            +
                phoneme_ids = [[BOS_ID, *get_symbol_ids(phonemes), EOS_ID] for phonemes in phonemes]
         
     | 
| 159 | 
         
            +
                lengths = list(map(len, phoneme_ids))
         
     | 
| 160 | 
         
            +
                longest = max(lengths)
         
     | 
| 161 | 
         
            +
                phoneme_ids = [[PAD_ID] * (longest - len(ids)) + ids for ids in phoneme_ids]
         
     | 
| 162 | 
         
            +
                return torch.tensor(phoneme_ids), lengths
         
     | 
| 163 | 
         
            +
             
     | 
| 164 | 
         
            +
             
     | 
| 165 | 
         
            +
            def normalize_jp_text(text: str, tokenizer=Dictionary(dict="full").create()) -> str:
         
     | 
| 166 | 
         
            +
                text = unicodedata.normalize("NFKC", text)
         
     | 
| 167 | 
         
            +
                text = re.sub(r"\d+", lambda m: number2kanji(int(m[0])), text)
         
     | 
| 168 | 
         
            +
                final_text = " ".join([x.reading_form() for x in tokenizer.tokenize(text, SplitMode.A)])
         
     | 
| 169 | 
         
            +
                return final_text
         
     | 
| 170 | 
         
            +
             
     | 
| 171 | 
         
            +
             
     | 
| 172 | 
         
            +
            def clean(texts: list[str], languages: list[str]) -> list[str]:
         
     | 
| 173 | 
         
            +
                texts_out = []
         
     | 
| 174 | 
         
            +
                for text, language in zip(texts, languages):
         
     | 
| 175 | 
         
            +
                    if "ja" in language:
         
     | 
| 176 | 
         
            +
                        text = normalize_jp_text(text)
         
     | 
| 177 | 
         
            +
                    else:
         
     | 
| 178 | 
         
            +
                        text = normalize_numbers(text)
         
     | 
| 179 | 
         
            +
                    texts_out.append(text)
         
     | 
| 180 | 
         
            +
                return texts_out
         
     | 
| 181 | 
         
            +
             
     | 
| 182 | 
         
            +
             
     | 
| 183 | 
         
            +
            @cache
         
     | 
| 184 | 
         
            +
            def get_backend(language: str) -> "EspeakBackend":
         
     | 
| 185 | 
         
            +
                import logging
         
     | 
| 186 | 
         
            +
             
     | 
| 187 | 
         
            +
                from phonemizer.backend import EspeakBackend
         
     | 
| 188 | 
         
            +
             
     | 
| 189 | 
         
            +
                logger = logging.getLogger("phonemizer")
         
     | 
| 190 | 
         
            +
                backend = EspeakBackend(
         
     | 
| 191 | 
         
            +
                    language,
         
     | 
| 192 | 
         
            +
                    preserve_punctuation=True,
         
     | 
| 193 | 
         
            +
                    with_stress=True,
         
     | 
| 194 | 
         
            +
                    punctuation_marks=_punctuation,
         
     | 
| 195 | 
         
            +
                    logger=logger,
         
     | 
| 196 | 
         
            +
                )
         
     | 
| 197 | 
         
            +
                logger.setLevel(logging.ERROR)
         
     | 
| 198 | 
         
            +
                return backend
         
     | 
| 199 | 
         
            +
             
     | 
| 200 | 
         
            +
             
     | 
| 201 | 
         
            +
            def phonemize(texts: list[str], languages: list[str]) -> list[str]:
         
     | 
| 202 | 
         
            +
                texts = clean(texts, languages)
         
     | 
| 203 | 
         
            +
             
     | 
| 204 | 
         
            +
                batch_phonemes = []
         
     | 
| 205 | 
         
            +
                for text, language in zip(texts, languages):
         
     | 
| 206 | 
         
            +
                    backend = get_backend(language)
         
     | 
| 207 | 
         
            +
                    phonemes = backend.phonemize([text], strip=True)
         
     | 
| 208 | 
         
            +
                    batch_phonemes.append(phonemes[0])
         
     | 
| 209 | 
         
            +
             
     | 
| 210 | 
         
            +
                return batch_phonemes
         
     | 
| 211 | 
         
            +
             
     | 
| 212 | 
         
            +
             
     | 
| 213 | 
         
            +
            class EspeakPhonemeConditioner(Conditioner):
         
     | 
| 214 | 
         
            +
                def __init__(self, output_dim: int, **kwargs):
         
     | 
| 215 | 
         
            +
                    super().__init__(output_dim, **kwargs)
         
     | 
| 216 | 
         
            +
                    self.phoneme_embedder = nn.Embedding(len(SPECIAL_TOKEN_IDS) + len(symbols), output_dim)
         
     | 
| 217 | 
         
            +
             
     | 
| 218 | 
         
            +
                def apply_cond(self, texts: list[str], languages: list[str]) -> torch.Tensor:
         
     | 
| 219 | 
         
            +
                    """
         
     | 
| 220 | 
         
            +
                    Args:
         
     | 
| 221 | 
         
            +
                        texts: list of texts to convert to phonemes
         
     | 
| 222 | 
         
            +
                        languages: ISO 639-1 -or otherwise eSpeak compatible- language code
         
     | 
| 223 | 
         
            +
                    """
         
     | 
| 224 | 
         
            +
                    device = self.phoneme_embedder.weight.device
         
     | 
| 225 | 
         
            +
             
     | 
| 226 | 
         
            +
                    phonemes = phonemize(texts, languages)
         
     | 
| 227 | 
         
            +
                    phoneme_ids, _ = tokenize_phonemes(phonemes)
         
     | 
| 228 | 
         
            +
                    phoneme_embeds = self.phoneme_embedder(phoneme_ids.to(device))
         
     | 
| 229 | 
         
            +
             
     | 
| 230 | 
         
            +
                    return phoneme_embeds
         
     | 
| 231 | 
         
            +
             
     | 
| 232 | 
         
            +
             
     | 
| 233 | 
         
            +
            # ------- ESPEAK CONTAINMENT ZONE ------------------------------------------------------------------------------------------------------------------------------------------------
         
     | 
| 234 | 
         
            +
             
     | 
| 235 | 
         
            +
             
     | 
| 236 | 
         
            +
            class FourierConditioner(Conditioner):
         
     | 
| 237 | 
         
            +
                def __init__(
         
     | 
| 238 | 
         
            +
                    self,
         
     | 
| 239 | 
         
            +
                    output_dim: int,
         
     | 
| 240 | 
         
            +
                    input_dim: int = 1,
         
     | 
| 241 | 
         
            +
                    std: float = 1.0,
         
     | 
| 242 | 
         
            +
                    min_val: float = 0.0,
         
     | 
| 243 | 
         
            +
                    max_val: float = 1.0,
         
     | 
| 244 | 
         
            +
                    **kwargs,
         
     | 
| 245 | 
         
            +
                ):
         
     | 
| 246 | 
         
            +
                    assert output_dim % 2 == 0
         
     | 
| 247 | 
         
            +
                    super().__init__(output_dim, **kwargs)
         
     | 
| 248 | 
         
            +
                    self.register_buffer("weight", torch.randn([output_dim // 2, input_dim]) * std)
         
     | 
| 249 | 
         
            +
                    self.input_dim, self.min_val, self.max_val = input_dim, min_val, max_val
         
     | 
| 250 | 
         
            +
             
     | 
| 251 | 
         
            +
                def apply_cond(self, x: torch.Tensor) -> torch.Tensor:
         
     | 
| 252 | 
         
            +
                    assert x.shape[-1] == self.input_dim
         
     | 
| 253 | 
         
            +
                    x = (x - self.min_val) / (self.max_val - self.min_val)  # [batch_size, seq_len, input_dim]
         
     | 
| 254 | 
         
            +
                    f = 2 * torch.pi * x.to(self.weight.dtype) @ self.weight.T  # [batch_size, seq_len, output_dim // 2]
         
     | 
| 255 | 
         
            +
                    return torch.cat([f.cos(), f.sin()], dim=-1)  # [batch_size, seq_len, output_dim]
         
     | 
| 256 | 
         
            +
             
     | 
| 257 | 
         
            +
             
     | 
| 258 | 
         
            +
            class IntegerConditioner(Conditioner):
         
     | 
| 259 | 
         
            +
                def __init__(self, output_dim: int, min_val: int = 0, max_val: int = 512, **kwargs):
         
     | 
| 260 | 
         
            +
                    super().__init__(output_dim, **kwargs)
         
     | 
| 261 | 
         
            +
                    self.min_val = min_val
         
     | 
| 262 | 
         
            +
                    self.max_val = max_val
         
     | 
| 263 | 
         
            +
                    self.int_embedder = nn.Embedding(max_val - min_val + 1, output_dim)
         
     | 
| 264 | 
         
            +
             
     | 
| 265 | 
         
            +
                def apply_cond(self, x: torch.Tensor) -> torch.Tensor:
         
     | 
| 266 | 
         
            +
                    assert x.shape[-1] == 1
         
     | 
| 267 | 
         
            +
                    return self.int_embedder(x.squeeze(-1) - self.min_val)  # [batch_size, seq_len, output_dim]
         
     | 
| 268 | 
         
            +
             
     | 
| 269 | 
         
            +
             
     | 
| 270 | 
         
            +
            class PassthroughConditioner(Conditioner):
         
     | 
| 271 | 
         
            +
                def __init__(self, output_dim: int, **kwargs):
         
     | 
| 272 | 
         
            +
                    super().__init__(output_dim, **kwargs)
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
                def apply_cond(self, x: torch.Tensor) -> torch.Tensor:
         
     | 
| 275 | 
         
            +
                    assert x.shape[-1] == self.cond_dim
         
     | 
| 276 | 
         
            +
                    return x
         
     | 
| 277 | 
         
            +
             
     | 
| 278 | 
         
            +
             
     | 
| 279 | 
         
            +
            _cond_cls_map = {
         
     | 
| 280 | 
         
            +
                "PassthroughConditioner": PassthroughConditioner,
         
     | 
| 281 | 
         
            +
                "EspeakPhonemeConditioner": EspeakPhonemeConditioner,
         
     | 
| 282 | 
         
            +
                "FourierConditioner": FourierConditioner,
         
     | 
| 283 | 
         
            +
                "IntegerConditioner": IntegerConditioner,
         
     | 
| 284 | 
         
            +
            }
         
     | 
| 285 | 
         
            +
             
     | 
| 286 | 
         
            +
             
     | 
| 287 | 
         
            +
            def build_conditioners(conditioners: list[dict], output_dim: int) -> list[Conditioner]:
         
     | 
| 288 | 
         
            +
                return [_cond_cls_map[config["type"]](output_dim, **config) for config in conditioners]
         
     | 
| 289 | 
         
            +
             
     | 
| 290 | 
         
            +
             
     | 
| 291 | 
         
            +
            class PrefixConditioner(Conditioner):
         
     | 
| 292 | 
         
            +
                def __init__(self, config: PrefixConditionerConfig, output_dim: int):
         
     | 
| 293 | 
         
            +
                    super().__init__(output_dim, "prefix", projection=config.projection)
         
     | 
| 294 | 
         
            +
                    self.conditioners = nn.ModuleList(build_conditioners(config.conditioners, output_dim))
         
     | 
| 295 | 
         
            +
                    self.norm = nn.LayerNorm(output_dim)
         
     | 
| 296 | 
         
            +
                    self.required_keys = {c.name for c in self.conditioners if c.uncond_vector is None}
         
     | 
| 297 | 
         
            +
             
     | 
| 298 | 
         
            +
                def forward(self, cond_dict: dict) -> torch.Tensor:
         
     | 
| 299 | 
         
            +
                    if not set(cond_dict).issuperset(self.required_keys):
         
     | 
| 300 | 
         
            +
                        raise ValueError(f"Missing required keys: {self.required_keys - set(cond_dict)}")
         
     | 
| 301 | 
         
            +
                    conds = []
         
     | 
| 302 | 
         
            +
                    for conditioner in self.conditioners:
         
     | 
| 303 | 
         
            +
                        conds.append(conditioner(cond_dict.get(conditioner.name)))
         
     | 
| 304 | 
         
            +
                    max_bsz = max(map(len, conds))
         
     | 
| 305 | 
         
            +
                    assert all(c.shape[0] in (max_bsz, 1) for c in conds)
         
     | 
| 306 | 
         
            +
                    conds = [c.expand(max_bsz, -1, -1) for c in conds]
         
     | 
| 307 | 
         
            +
                    return self.norm(self.project(torch.cat(conds, dim=-2)))
         
     | 
| 308 | 
         
            +
             
     | 
| 309 | 
         
            +
             
     | 
| 310 | 
         
            +
supported_language_codes = [
    'af', 'am', 'an', 'ar', 'as', 'az', 'ba', 'bg', 'bn', 'bpy', 'bs', 'ca', 'cmn',
    'cs', 'cy', 'da', 'de', 'el', 'en-029', 'en-gb', 'en-gb-scotland', 'en-gb-x-gbclan',
    'en-gb-x-gbcwmd', 'en-gb-x-rp', 'en-us', 'eo', 'es', 'es-419', 'et', 'eu', 'fa',
    'fa-latn', 'fi', 'fr-be', 'fr-ch', 'fr-fr', 'ga', 'gd', 'gn', 'grc', 'gu', 'hak',
    'hi', 'hr', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'is', 'it', 'ja', 'jbo', 'ka',
    'kk', 'kl', 'kn', 'ko', 'kok', 'ku', 'ky', 'la', 'lfn', 'lt', 'lv', 'mi', 'mk',
    'ml', 'mr', 'ms', 'mt', 'my', 'nb', 'nci', 'ne', 'nl', 'om', 'or', 'pa', 'pap',
    'pl', 'pt', 'pt-br', 'py', 'quc', 'ro', 'ru', 'ru-lv', 'sd', 'shn', 'si', 'sk',
    'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'tn', 'tr', 'tt', 'ur', 'uz', 'vi',
    'vi-vn-x-central', 'vi-vn-x-south', 'yue'
]  # fmt: off


def make_cond_dict(
    text: str = "It would be nice to have time for testing, indeed.",
    language: str = "en-us",
    speaker: torch.Tensor | None = None,
    emotion: list[float] = [0.3077, 0.0256, 0.0256, 0.0256, 0.0256, 0.0256, 0.2564, 0.3077],
    fmax: float = 22050.0,
    pitch_std: float = 20.0,
    speaking_rate: float = 15.0,
    vqscore_8: list[float] = [0.78] * 8,
    ctc_loss: float = 0.0,
    dnsmos_ovrl: float = 4.0,
    speaker_noised: bool = False,
    unconditional_keys: Iterable[str] = {"vqscore_8", "dnsmos_ovrl"},
    device: str = "cuda",
) -> dict:
    """
    A helper to build the 'cond_dict' that the model expects.
    By default, it will generate a random speaker embedding
    """
    assert language.lower() in supported_language_codes, "Please pick a supported language"

    language_code_to_id = {lang: i for i, lang in enumerate(supported_language_codes)}

    cond_dict = {
        "espeak": ([text], [language]),
        "speaker": speaker,
        "emotion": emotion,
        "fmax": fmax,
        "pitch_std": pitch_std,
        "speaking_rate": speaking_rate,
        "language_id": language_code_to_id[language],
        "vqscore_8": vqscore_8,
        "ctc_loss": ctc_loss,
        "dnsmos_ovrl": dnsmos_ovrl,
        "speaker_noised": int(speaker_noised),
    }

    for k in unconditional_keys:
        cond_dict.pop(k, None)

    for k, v in cond_dict.items():
        if isinstance(v, (float, int, list)):
            v = torch.tensor(v)
        if isinstance(v, torch.Tensor):
            cond_dict[k] = v.view(1, 1, -1).to(device)

        if k == "emotion":
            cond_dict[k] /= cond_dict[k].sum(dim=-1)

    return cond_dict
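For orientation, a minimal usage sketch of this helper. The input text and the pitch_std override below are illustrative values chosen for this example, not anything shipped with this upload:

# Illustrative only: build a conditioning dict on CPU.
from qhash.conditioning import make_cond_dict

cond = make_cond_dict(
    text="Hello from the demo.",  # hypothetical input text
    language="en-us",
    pitch_std=45.0,               # hypothetical override of the default 20.0
    device="cpu",
)
# Scalars and lists were tensorized and reshaped to [1, 1, -1] by the loop above;
# "vqscore_8" and "dnsmos_ovrl" were dropped via the default unconditional_keys.
print(sorted(cond.keys()))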
    	
qhash/config.py
ADDED
@@ -0,0 +1,38 @@
from dataclasses import dataclass, field
from typing import Literal


@dataclass
class BackboneConfig:
    d_model: int = 1024
    d_intermediate: int = 0
    attn_mlp_d_intermediate: int = 0
    n_layer: int = 16
    ssm_cfg: dict = field(default_factory=dict)
    attn_layer_idx: list = field(default_factory=list)
    attn_cfg: dict = field(default_factory=dict)
    rms_norm: bool = False
    residual_in_fp32: bool = False
    norm_epsilon: float = 1e-5


@dataclass
class PrefixConditionerConfig:
    conditioners: list[dict]
    projection: Literal["none", "linear", "mlp"]


@dataclass
class ZonosConfig:
    backbone: BackboneConfig
    prefix_conditioner: PrefixConditionerConfig
    eos_token_id: int = 1024
    masked_token_id: int = 1025

    @classmethod
    def from_dict(cls, d: dict) -> "ZonosConfig":
        d = d.copy()
        backbone_config = BackboneConfig(**d.pop("backbone"))
        prefix_conditioner_config = PrefixConditionerConfig(**d.pop("prefix_conditioner"))
        config = cls(backbone_config, prefix_conditioner_config, **d)
        return config
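To illustrate how from_dict splits a raw dict into the nested dataclasses, here is a small sketch; the field values are made up for the example, the real ones come from the checkpoint's config.json:

# Illustrative config dict; the shipped config.json will differ.
from qhash.config import BackboneConfig, ZonosConfig

raw = {
    "backbone": {"d_model": 1024, "n_layer": 16},
    "prefix_conditioner": {"conditioners": [], "projection": "none"},
    "eos_token_id": 1024,
}
cfg = ZonosConfig.from_dict(raw)
assert isinstance(cfg.backbone, BackboneConfig)
assert cfg.masked_token_id == 1025  # falls back to the dataclass default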
    	
qhash/model.py
ADDED
@@ -0,0 +1,270 @@
import json
from typing import Callable

import safetensors
import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from mamba_ssm.utils.generation import InferenceParams
from tqdm import tqdm

from qhash.backbone import ZonosBackbone
from qhash.autoencoder import DACAutoencoder
from qhash.codebook_pattern import apply_delay_pattern, revert_delay_pattern
from qhash.conditioning import PrefixConditioner
from qhash.config import ZonosConfig
from qhash.sampling import sample_from_logits
from qhash.speaker_cloning import SpeakerEmbeddingLDA


class Zonos(nn.Module):
    def __init__(self, config: ZonosConfig):
        super().__init__()
        self.config = config
        dim = config.backbone.d_model
        self.eos_token_id = config.eos_token_id
        self.masked_token_id = config.masked_token_id

        self.autoencoder = DACAutoencoder()
        self.backbone = ZonosBackbone(config.backbone)
        self.prefix_conditioner = PrefixConditioner(config.prefix_conditioner, dim)
        self.spk_clone_model = None

        # TODO: pad to multiple of at least 8
        self.embeddings = nn.ModuleList([nn.Embedding(1026, dim) for _ in range(self.autoencoder.num_codebooks)])
        self.heads = nn.ModuleList([nn.Linear(dim, 1025, bias=False) for _ in range(self.autoencoder.num_codebooks)])

        self._cg_graph = None
        self._cg_batch_size = None
        self._cg_input_ids = None
        self._cg_logits = None
        self._cg_inference_params = None
        self._cg_scale = None

    @classmethod
    def from_pretrained(cls, repo_id: str, revision: str | None = None, device: str = "cuda") -> "Zonos":
        config_path = hf_hub_download(repo_id=repo_id, filename="config.json", revision=revision)
        model_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors", revision=revision)
        return cls.from_local(config_path, model_path, device)

    @classmethod
    def from_local(cls, config_path: str, model_path: str, device: str = "cuda") -> "Zonos":
        config = ZonosConfig.from_dict(json.load(open(config_path)))
        model = cls(config).to(device, torch.bfloat16)
        model.autoencoder.dac.to(device)

        sd = model.state_dict()
        with safetensors.safe_open(model_path, framework="pt") as f:
            for k in f.keys():
                sd[k] = f.get_tensor(k)
        model.load_state_dict(sd)

        return model

    def make_speaker_embedding(self, wav: torch.Tensor, sr: int) -> torch.Tensor:
        """Generate a speaker embedding from an audio clip."""
        if self.spk_clone_model is None:
            self.spk_clone_model = SpeakerEmbeddingLDA()
        _, spk_embedding = self.spk_clone_model(wav.to(self.spk_clone_model.device), sr)
        return spk_embedding.unsqueeze(0).bfloat16()

    def embed_codes(self, codes: torch.Tensor) -> torch.Tensor:
        return sum(emb(codes[:, i]) for i, emb in enumerate(self.embeddings))

    def apply_heads(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return torch.stack([head(hidden_states) for head in self.heads], dim=1)

    def _compute_logits(
        self, hidden_states: torch.Tensor, inference_params: InferenceParams, cfg_scale: float
    ) -> torch.Tensor:
        """
        Pass `hidden_states` into `backbone` and `multi_head`, applying
        classifier-free guidance if `cfg_scale != 1.0`.
        """
        last_hidden_states = self.backbone(hidden_states, inference_params)[:, -1, :].unsqueeze(1)
        logits = self.apply_heads(last_hidden_states).squeeze(2).float()
        if cfg_scale != 1.0:
            cond_logits, uncond_logits = logits.chunk(2)
            logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
        return logits

    def _decode_one_token(
        self,
        input_ids: torch.Tensor,
        inference_params: InferenceParams,
        cfg_scale: float,
    ) -> torch.Tensor:
        """
        Single-step decode. Prepares the hidden states, possibly replicates them
        for CFG, and then delegates to `_compute_logits`.

        Below we wrap this function with a simple CUDA Graph capturing mechanism,
        doing 3 warmup steps if needed and then capturing or replaying the graph.
        We only recapture if the batch size changes.
        """
        # TODO: support cfg_scale==1
        if cfg_scale == 1.0:
            hidden_states = self.embed_codes(input_ids)
            return self._compute_logits(hidden_states, inference_params, cfg_scale)

        bsz = input_ids.size(0)

        need_capture = (self._cg_graph is None) or (self._cg_batch_size != bsz)

        if need_capture:
            self._cg_graph = None

            self._cg_batch_size = bsz
            self._cg_inference_params = inference_params
            self._cg_scale = cfg_scale

            for _ in range(3):
                hidden_states = self.embed_codes(input_ids)
                hidden_states = hidden_states.repeat(2, 1, 1)  # because cfg != 1.0
                logits = self._compute_logits(hidden_states, inference_params, cfg_scale)

            self._cg_input_ids = input_ids.clone()
            self._cg_logits = torch.empty_like(logits)

            g = torch.cuda.CUDAGraph()

            def capture_region():
                hidden_states_local = self.embed_codes(self._cg_input_ids)
                hidden_states_local = hidden_states_local.repeat(2, 1, 1)
                self._cg_logits = self._compute_logits(hidden_states_local, self._cg_inference_params, self._cg_scale)

            with torch.cuda.graph(g):
                capture_region()

            self._cg_graph = g

        else:
            self._cg_input_ids.copy_(input_ids)

        self._cg_graph.replay()

        return self._cg_logits

    def _prefill(
        self,
        prefix_hidden_states: torch.Tensor,
        input_ids: torch.Tensor,
        inference_params: InferenceParams,
        cfg_scale: float,
    ) -> torch.Tensor:
        """
        "Prefill" mode: we already have `prefix_hidden_states`, and we want
        to append new embeddings, then compute the logits.
        """
        # Replicate input_ids if CFG is enabled
        if cfg_scale != 1.0:
            input_ids = input_ids.expand(prefix_hidden_states.shape[0], -1, -1)
        hidden_states = torch.cat([prefix_hidden_states, self.embed_codes(input_ids)], dim=1)
        return self._compute_logits(hidden_states, inference_params, cfg_scale)

    def setup_cache(self, batch_size: int, max_seqlen: int, dtype: torch.dtype = torch.bfloat16) -> InferenceParams:
        key_value_memory_dict = {
            i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype)
            for i, layer in enumerate(self.backbone.layers)
        }
        lengths_per_sample = torch.full((batch_size,), 0, dtype=torch.int32, device="cuda")
        return InferenceParams(max_seqlen, batch_size, 0, 0, key_value_memory_dict, lengths_per_sample)

    def prepare_conditioning(self, cond_dict: dict, uncond_dict: dict | None = None) -> torch.Tensor:
        if uncond_dict is None:
            uncond_dict = {k: cond_dict[k] for k in self.prefix_conditioner.required_keys}
        return torch.cat(
            [
                self.prefix_conditioner(cond_dict),
                self.prefix_conditioner(uncond_dict),
            ]
        )

    @torch.inference_mode()
    def generate(
        self,
        prefix_conditioning: torch.Tensor,  # [bsz, cond_seq_len, d_model]
        audio_prefix_codes: torch.Tensor | None = None,  # [bsz, 9, prefix_audio_seq_len]
        max_new_tokens: int = 86 * 30,
        cfg_scale: float = 2.0,
        batch_size: int = 1,
        sampling_params: dict = dict(min_p=0.1),
        progress_bar: bool = True,
        callback: Callable[[torch.Tensor, int, int], bool] | None = None,
    ):
        assert cfg_scale != 1, "TODO: add support for cfg_scale=1"
        prefix_audio_len = 0 if audio_prefix_codes is None else audio_prefix_codes.shape[2]

        unknown_token = -1
        audio_seq_len = prefix_audio_len + max_new_tokens
        seq_len = prefix_conditioning.shape[1] + audio_seq_len

        inference_params = self.setup_cache(batch_size=batch_size * 2, max_seqlen=seq_len)

        codes = torch.full((batch_size, 9, audio_seq_len), unknown_token, device="cuda")
        if audio_prefix_codes is not None:
            codes[..., :prefix_audio_len] = audio_prefix_codes

        delayed_codes = apply_delay_pattern(codes, self.masked_token_id)

        delayed_prefix_audio_codes = delayed_codes[..., : prefix_audio_len + 1]

        logits = self._prefill(prefix_conditioning, delayed_prefix_audio_codes, inference_params, cfg_scale)
        next_token = sample_from_logits(logits, **sampling_params)

        offset = delayed_prefix_audio_codes.shape[2]
        frame = delayed_codes[..., offset : offset + 1]
        frame.masked_scatter_(frame == unknown_token, next_token)

        prefix_length = prefix_conditioning.shape[1] + prefix_audio_len + 1
        inference_params.seqlen_offset += prefix_length
        inference_params.lengths_per_sample[:] += prefix_length

        logit_bias = torch.zeros_like(logits)
        logit_bias[:, 1:, self.eos_token_id] = -torch.inf  # only allow codebook 0 to predict EOS

        stopping = torch.zeros(batch_size, dtype=torch.bool, device="cuda")
        max_steps = delayed_codes.shape[2] - offset
        remaining_steps = torch.full((batch_size,), max_steps, device="cuda")
        progress = tqdm(total=max_steps, desc="Generating", disable=not progress_bar)

        step = 0
        while torch.max(remaining_steps) > 0:
            offset += 1
            input_ids = delayed_codes[..., offset - 1 : offset]
            logits = self._decode_one_token(input_ids, inference_params, cfg_scale)

            next_token = sample_from_logits(logits, generated_tokens=delayed_codes[..., :offset], **sampling_params)
            eos_in_cb0 = next_token[:, 0] == self.eos_token_id

            remaining_steps[eos_in_cb0[:, 0]] = torch.minimum(remaining_steps[eos_in_cb0[:, 0]], torch.tensor(9))
            stopping |= eos_in_cb0[:, 0]

            eos_codebook_idx = 9 - remaining_steps
            eos_codebook_idx = torch.clamp(eos_codebook_idx, max=9 - 1)
            for i in range(next_token.shape[0]):
                if stopping[i]:
                    idx = eos_codebook_idx[i].item()
                    next_token[i, :idx] = self.masked_token_id
                    next_token[i, idx] = self.eos_token_id

            frame = delayed_codes[..., offset : offset + 1]
            frame.masked_scatter_(frame == unknown_token, next_token)
            inference_params.seqlen_offset += 1
            inference_params.lengths_per_sample[:] += 1

            remaining_steps -= 1

            progress.update()
            step += 1

            if callback is not None and not callback(frame, step, max_steps):
                break

        out_codes = revert_delay_pattern(delayed_codes)
        out_codes.masked_fill_(out_codes >= 1024, 0)
        out_codes = out_codes[..., : offset - 9]

        self._cg_graph = None  # reset cuda graph to avoid cache changes

        return out_codes
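Putting the pieces together, a rough end-to-end sketch. The repo id below is a placeholder, and the decode()/sampling_rate members of DACAutoencoder are assumptions about qhash/autoencoder.py, which is not reproduced in this section:

import torchaudio

from qhash.conditioning import make_cond_dict
from qhash.model import Zonos

model = Zonos.from_pretrained("your-org/your-zonos-checkpoint")  # placeholder repo id

cond_dict = make_cond_dict(text="A short test sentence.", language="en-us")
conditioning = model.prepare_conditioning(cond_dict)           # [2, cond_seq_len, d_model]: cond + uncond rows
codes = model.generate(conditioning, max_new_tokens=86 * 10)   # roughly 10 s at 86 frames per second

# Assumed DACAutoencoder interface (decode, sampling_rate); see qhash/autoencoder.py.
wav = model.autoencoder.decode(codes).cpu()
torchaudio.save("sample.wav", wav[0], model.autoencoder.sampling_rate)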
    	
qhash/sampling.py
ADDED
@@ -0,0 +1,141 @@
import torch


def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None):
    """torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.

    Args:
        input (torch.Tensor): The input tensor containing probabilities.
        num_samples (int): Number of samples to draw.
        replacement (bool): Whether to draw with replacement or not.
    Keyword args:
        generator (torch.Generator): A pseudorandom number generator for sampling.
    Returns:
        torch.Tensor: Last dimension contains num_samples indices
            sampled from the multinomial probability distribution
            located in the last dimension of tensor input.
    """

    if num_samples == 1:
        q = torch.empty_like(input).exponential_(1, generator=generator)
        return torch.argmax(input / q, dim=-1, keepdim=True).to(torch.int64)

    input_ = input.reshape(-1, input.shape[-1])
    output_ = torch.multinomial(input_, num_samples=num_samples, replacement=replacement, generator=generator)
    output = output_.reshape(*list(input.shape[:-1]), -1)
    return output


def apply_top_k(
    probs: torch.Tensor,
    k: int,
) -> torch.Tensor:
    """Keep only the top K values along the last dimension of the input probs tensor and renormalize.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
        k (int): The k in “top-k”.
    Returns:
        torch.Tensor: Renormalized probabilities with all but the top-k candidates zeroed out.
    """
    v, _ = torch.topk(probs, min(k, probs.size(-1)))
    pivot = v.select(-1, -1).unsqueeze(-1)
    probs = torch.where(probs < pivot, 0.0, probs)
    probs.div_(probs.sum(dim=-1, keepdim=True))
    return probs


def apply_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
    """Keep the smallest set of tokens whose cumulative probability exceeds P along the last dimension and renormalize.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
        p (float): The p in “top-p”.
    Returns:
        torch.Tensor: Renormalized probabilities after top-p (nucleus) filtering.
    """
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort *= (~mask).float()
    probs = probs.scatter(-1, probs_idx, probs_sort)
    probs.div_(probs.sum(dim=-1, keepdim=True))
    return probs


def apply_min_p(probs: torch.Tensor, min_p: float) -> torch.Tensor:
    """Zero out tokens whose probability falls below min_p times that of the most likely token, then renormalize.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
        min_p (float): Minimum token probability, scaled by the probability of the most likely token.
                       Must be between 0 and 1. Typical values are in the 0.01-0.2 range.
    Returns:
        torch.Tensor: Renormalized probabilities after min-p filtering.
    """
    top_probs, _ = probs.max(dim=-1, keepdim=True)
    tokens_to_remove = probs < (min_p * top_probs)
    probs = probs.masked_fill(tokens_to_remove, 0.0)
    probs.div_(probs.sum(dim=-1, keepdim=True))
    return probs


def modify_logit_for_repetition_penalty(
    logits: torch.Tensor,
    generated_tokens: torch.Tensor,
    repetition_penalty: float,
    repetition_penalty_window: int,
):
    """See https://arxiv.org/abs/1909.05858
    Apply repetition penalty over a sliding window of the last `repetition_penalty_window` tokens.
    logits: (batch_size, n_codebooks, vocab_size)
    generated_tokens: (batch_size, n_codebooks, seq_len)
    """
    generated_tokens = generated_tokens[..., -repetition_penalty_window:]
    generated_tokens = generated_tokens.clamp_max(logits.shape[-1] - 1).to(torch.int64)
    rp = torch.full_like(logits, repetition_penalty)
    factors = torch.ones_like(logits).scatter_reduce(2, generated_tokens, rp, reduce="prod")
    return torch.where(logits <= 0, logits * factors, logits / factors)


def sample_from_logits(
    logits: torch.Tensor,
    temperature: float = 1.0,
    top_p: float = 0.0,
    top_k: int = 0,
    min_p: float = 0.0,
    generated_tokens: torch.Tensor | None = None,
    repetition_penalty: float = 3.0,
    repetition_penalty_window: int = 2,
) -> torch.Tensor:
    """Sample next token from logits using temperature, top-p, top-k, or min-p sampling.

    Args:
        logits (torch.Tensor): Input logits with token candidates on the last dimension.
        temperature (float): Sampling temperature. Lower temperature results in more deterministic samples.
        top_p (float): The p in “top-p”.
        top_k (int): The k in “top-k”.
        min_p (float): Minimum token probability, scaled by the probability of the most likely token.
                       Must be between 0 and 1. Typical values are in the 0.01-0.2 range.

    Returns:
        torch.Tensor: Sampled tokens.
    """
    if repetition_penalty != 1.0 and generated_tokens is not None:
        logits = modify_logit_for_repetition_penalty(logits, generated_tokens, repetition_penalty, repetition_penalty_window)

    if temperature > 0:
        probs = torch.softmax(logits / temperature, dim=-1)

        if top_p > 0:
            probs = apply_top_p(probs, top_p)
        if top_k > 0:
            probs = apply_top_k(probs, top_k)
        if min_p > 0:
            probs = apply_min_p(probs, min_p)

        next_token = multinomial(probs, num_samples=1)
    else:
        next_token = torch.argmax(logits, dim=-1, keepdim=True)

    return next_token  # [batch_size, num_codebooks, 1]
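Finally, a self-contained sketch of the sampling entry point on random logits, shaped to the [batch, codebooks, vocab] convention used by the model above; the seed and shapes are chosen only for illustration:

import torch

from qhash.sampling import sample_from_logits

torch.manual_seed(0)
logits = torch.randn(1, 9, 1025)  # [batch_size, n_codebooks, vocab_size]

# min-p keeps only tokens whose probability is at least 10% of the top token's.
tok = sample_from_logits(logits, min_p=0.1)
print(tok.shape)  # torch.Size([1, 9, 1])

# temperature=0 falls back to greedy argmax decoding.
greedy = sample_from_logits(logits, temperature=0.0)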
    	
qhash/speaker_cloning.py
ADDED
@@ -0,0 +1,406 @@
import math
from functools import cache

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from huggingface_hub import hf_hub_download
import os


class logFbankCal(nn.Module):
    def __init__(
        self,
        sample_rate: int = 16_000,
        n_fft: int = 512,
        win_length: float = 0.025,
        hop_length: float = 0.01,
        n_mels: int = 80,
    ):
        super().__init__()
        self.fbankCal = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            win_length=int(win_length * sample_rate),
            hop_length=int(hop_length * sample_rate),
            n_mels=n_mels,
        )

    def forward(self, x):
        out = self.fbankCal(x)
        out = torch.log(out + 1e-6)
        out = out - out.mean(axis=2).unsqueeze(dim=2)
        return out

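An aside, not part of the file being added: a quick shape check of what logFbankCal produces under its defaults, assuming one second of 16 kHz mono audio (25 ms window, 10 ms hop, 80 mel bins).

import torch

fbank = logFbankCal()           # defaults from the class above
wav = torch.randn(1, 16_000)    # (batch, samples): one second of fake audio at 16 kHz
feats = fbank(wav)
print(feats.shape)              # roughly torch.Size([1, 80, 101]) — (batch, n_mels, frames)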
class ASP(nn.Module):
    # Attentive statistics pooling
    def __init__(self, in_planes, acoustic_dim):
        super(ASP, self).__init__()
        outmap_size = int(acoustic_dim / 8)
        self.out_dim = in_planes * 8 * outmap_size * 2

        self.attention = nn.Sequential(
            nn.Conv1d(in_planes * 8 * outmap_size, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, in_planes * 8 * outmap_size, kernel_size=1),
            nn.Softmax(dim=2),
        )

    def forward(self, x):
        x = x.reshape(x.size()[0], -1, x.size()[-1])
        w = self.attention(x)
        mu = torch.sum(x * w, dim=2)
        sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
        x = torch.cat((mu, sg), 1)

        x = x.view(x.size()[0], -1)
        return x


class SimAMBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, ConvLayer, NormLayer, in_planes, planes, stride=1, block_id=1):
        super(SimAMBasicBlock, self).__init__()
        self.conv1 = ConvLayer(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = NormLayer(planes)
        self.conv2 = ConvLayer(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = NormLayer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.sigmoid = nn.Sigmoid()

        self.downsample = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.downsample = nn.Sequential(
                ConvLayer(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                NormLayer(self.expansion * planes),
            )

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = self.SimAM(out)
        out += self.downsample(x)
        out = self.relu(out)
        return out

    def SimAM(self, X, lambda_p=1e-4):
        n = X.shape[2] * X.shape[3] - 1
        d = (X - X.mean(dim=[2, 3], keepdim=True)).pow(2)
        v = d.sum(dim=[2, 3], keepdim=True) / n
        E_inv = d / (4 * (v + lambda_p)) + 0.5
        return X * self.sigmoid(E_inv)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, ConvLayer, NormLayer, in_planes, planes, stride=1, block_id=1):
        super(BasicBlock, self).__init__()
        self.conv1 = ConvLayer(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = NormLayer(planes)
        self.conv2 = ConvLayer(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = NormLayer(planes)
        self.relu = nn.ReLU(inplace=True)

        self.downsample = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.downsample = nn.Sequential(
                ConvLayer(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                NormLayer(self.expansion * planes),
            )

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.downsample(x)
        out = self.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, ConvLayer, NormLayer, in_planes, planes, stride=1, block_id=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes),
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, in_planes, block, num_blocks, in_ch=1, feat_dim="2d", **kwargs):
        super(ResNet, self).__init__()
        if feat_dim == "1d":
            self.NormLayer = nn.BatchNorm1d
            self.ConvLayer = nn.Conv1d
        elif feat_dim == "2d":
            self.NormLayer = nn.BatchNorm2d
            self.ConvLayer = nn.Conv2d
        elif feat_dim == "3d":
            self.NormLayer = nn.BatchNorm3d
            self.ConvLayer = nn.Conv3d
        else:
            print("error")

        self.in_planes = in_planes

        self.conv1 = self.ConvLayer(in_ch, in_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = self.NormLayer(in_planes)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, in_planes, num_blocks[0], stride=1, block_id=1)
        self.layer2 = self._make_layer(block, in_planes * 2, num_blocks[1], stride=2, block_id=2)
        self.layer3 = self._make_layer(block, in_planes * 4, num_blocks[2], stride=2, block_id=3)
        self.layer4 = self._make_layer(block, in_planes * 8, num_blocks[3], stride=2, block_id=4)

    def _make_layer(self, block, planes, num_blocks, stride, block_id=1):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.ConvLayer, self.NormLayer, self.in_planes, planes, stride, block_id))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x


def ResNet293(in_planes: int, **kwargs):
    return ResNet(in_planes, SimAMBasicBlock, [10, 20, 64, 3], **kwargs)


class ResNet293_based(nn.Module):
    def __init__(
        self,
        in_planes: int = 64,
        embd_dim: int = 256,
        acoustic_dim: int = 80,
        featCal=None,
        dropout: float = 0,
        **kwargs,
    ):
        super(ResNet293_based, self).__init__()
        self.featCal = featCal
        self.front = ResNet293(in_planes)
        block_expansion = SimAMBasicBlock.expansion
        self.pooling = ASP(in_planes * block_expansion, acoustic_dim)
        self.bottleneck = nn.Linear(self.pooling.out_dim, embd_dim)
        self.drop = nn.Dropout(dropout) if dropout else None

    def forward(self, x):
        x = self.featCal(x)
        x = self.front(x.unsqueeze(dim=1))
        x = self.pooling(x)
        if self.drop:
            x = self.drop(x)
        x = self.bottleneck(x)
        return x

class SEModule(nn.Module):
    def __init__(self, channels, bottleneck=128):
        super(SEModule, self).__init__()
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0),
            nn.ReLU(),
            # nn.BatchNorm1d(bottleneck), # Removed
            nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0),
            nn.Sigmoid(),
        )

    def forward(self, input):
        x = self.se(input)
        return input * x


class Bottle2neck(nn.Module):
    def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale=8):
        super(Bottle2neck, self).__init__()
        width = int(math.floor(planes / scale))
        self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(width * scale)
        self.nums = scale - 1
        convs = []
        bns = []
        num_pad = math.floor(kernel_size / 2) * dilation
        for i in range(self.nums):
            convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad))
            bns.append(nn.BatchNorm1d(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1)
        self.bn3 = nn.BatchNorm1d(planes)
        self.relu = nn.ReLU()
        self.width = width
        self.se = SEModule(planes)

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(sp)
            sp = self.bns[i](sp)
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)
        out = torch.cat((out, spx[self.nums]), 1)

        out = self.conv3(out)
        out = self.relu(out)
        out = self.bn3(out)

        out = self.se(out)
        out += residual
        return out


class ECAPA_TDNN(nn.Module):
    def __init__(self, C, featCal):
        super(ECAPA_TDNN, self).__init__()
        self.featCal = featCal
        self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(C)
        self.layer1 = Bottle2neck(C, C, kernel_size=3, dilation=2, scale=8)
        self.layer2 = Bottle2neck(C, C, kernel_size=3, dilation=3, scale=8)
        self.layer3 = Bottle2neck(C, C, kernel_size=3, dilation=4, scale=8)
        # I fixed the shape of the output from MFA layer, that is close to the setting from ECAPA paper.
        self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1)
        self.attention = nn.Sequential(
            nn.Conv1d(4608, 256, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Tanh(),  # Added
            nn.Conv1d(256, 1536, kernel_size=1),
            nn.Softmax(dim=2),
        )
        self.bn5 = nn.BatchNorm1d(3072)
        self.fc6 = nn.Linear(3072, 192)
        self.bn6 = nn.BatchNorm1d(192)

    def forward(self, x):
        x = self.featCal(x)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)

        x1 = self.layer1(x)
        x2 = self.layer2(x + x1)
        x3 = self.layer3(x + x1 + x2)

        x = self.layer4(torch.cat((x1, x2, x3), dim=1))
        x = self.relu(x)

        t = x.size()[-1]

        global_x = torch.cat(
            (
                x,
                torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t),
                torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t),
            ),
            dim=1,
        )

        w = self.attention(global_x)

        mu = torch.sum(x * w, dim=2)
        sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4))

        x = torch.cat((mu, sg), 1)
        x = self.bn5(x)
        x = self.fc6(x)
        x = self.bn6(x)

        return x


class SpeakerEmbedding(nn.Module):
    def __init__(self, ckpt_path: str = "ResNet293_SimAM_ASP_base.pt", device: str = "cuda"):
        super().__init__()
        self.device = device
        with torch.device(device):
            self.model = ResNet293_based()
            self.model.load_state_dict(torch.load(ckpt_path, weights_only=True, mmap=True))
            self.model.featCal = logFbankCal()

        self.requires_grad_(False).eval()

    @property
    def dtype(self):
        return next(self.parameters()).dtype

    @cache
    def _get_resampler(self, orig_sample_rate: int):
        return torchaudio.transforms.Resample(orig_sample_rate, 16_000).to(self.device)

    def prepare_input(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
        assert wav.ndim < 3
        if wav.ndim == 2:
            wav = wav.mean(0, keepdim=True)
        wav = self._get_resampler(sample_rate)(wav)
        return wav

    def forward(self, wav: torch.Tensor, sample_rate: int):
        wav = self.prepare_input(wav, sample_rate).to(self.device, self.dtype)
        return self.model(wav).to(wav.device)


class SpeakerEmbeddingLDA(nn.Module):
    def __init__(
        self,
        device: str = "cuda",
    ):
        super().__init__()
        spk_model_path = hf_hub_download(repo_id="Quantumhash/Qhash-v0.1-speaker-embedding", filename="ResNet293_SimAM_ASP_base.pt")
        lda_spk_model_path = hf_hub_download(repo_id="Quantumhash/Qhash-v0.1-speaker-embedding", filename="ResNet293_SimAM_ASP_base_LDA-128.pt")

        self.device = device
        with torch.device(device):
            self.model = SpeakerEmbedding(spk_model_path, device)
            lda_sd = torch.load(lda_spk_model_path, weights_only=True)
            out_features, in_features = lda_sd["weight"].shape
            self.lda = nn.Linear(in_features, out_features, bias=True, dtype=torch.float32)
            self.lda.load_state_dict(lda_sd)

        self.requires_grad_(False).eval()

    def forward(self, wav: torch.Tensor, sample_rate: int):
        emb = self.model(wav, sample_rate).to(torch.float32)
        return emb, self.lda(emb)
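A minimal usage sketch for the embedder defined above, assuming a CUDA device and a local reference clip; reference_voice.wav and the variable names are illustrative, and the 128-dim LDA output size is inferred from the checkpoint filename rather than stated in the code.

import torch
import torchaudio

spk = SpeakerEmbeddingLDA(device="cuda")            # downloads both checkpoints via hf_hub_download
wav, sr = torchaudio.load("reference_voice.wav")    # hypothetical reference clip; any sample rate
wav = wav.to("cuda")                                # resampling runs on the embedder's device
with torch.no_grad():
    emb, emb_lda = spk(wav, sr)                     # 256-dim embedding and its LDA projection (presumably 128-dim)
print(emb.shape, emb_lda.shape)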