Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed on 27 days ago

Commit

63d778a

1 Parent(s): 02daaea

ui

Browse files

Files changed (1) hide show

app.py +210 -855

app.py CHANGED Viewed

@@ -1,876 +1,231 @@
-import gradio as gr
 import torch
-import numpy as np
-import os
-import locale
-# Set UTF-8 encoding
-locale.getpreferredencoding = lambda: "UTF-8"
-# Try different import methods for unsloth
-try:
-    from unsloth import FastLanguageModel
-    UNSLOTH_AVAILABLE = True
-except ImportError:
-    try:
-        # Fallback import
-        import unsloth
-        from unsloth import FastLanguageModel
-        UNSLOTH_AVAILABLE = True
-    except ImportError:
-        print("Warning: Unsloth not available, using transformers fallback")
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-        UNSLOTH_AVAILABLE = False
-# Import SNAC
-try:
-    from snac import SNAC
-    SNAC_AVAILABLE = True
-except ImportError:
-    print("Error: SNAC not available")
-    SNAC_AVAILABLE = False
-class TTSKhmerModel:
-    def __init__(self):
-        self.model = None
-        self.tokenizer = None
-        self.snac_model = None
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.current_model = None
-        print(f"Using device: {self.device}")
-        print(f"Unsloth available: {UNSLOTH_AVAILABLE}")
-        print(f"SNAC available: {SNAC_AVAILABLE}")
-    def load_models(self, model_name="mrrtmob/tts-khm"):
-        """Load the TTS model and SNAC model"""
-        try:
-            if not SNAC_AVAILABLE:
-                return False, "SNAC model not available"
-            # Check if we need to reload the model
-            if self.current_model != model_name:
-                print(f"Loading TTS model: {model_name}...")
-                if UNSLOTH_AVAILABLE:
-                    # Use unsloth
-                    self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-                        model_name=model_name,
-                        max_seq_length=2048,
-                        dtype=None,
-                        load_in_4bit=False if self.device == "cuda" else True,
-                    )
-                    # Enable inference mode
-                    FastLanguageModel.for_inference(self.model)
-                else:
-                    # Fallback to transformers
-                    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-                    self.model = AutoModelForCausalLM.from_pretrained(
-                        model_name,
-                        torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-                        device_map="auto" if self.device == "cuda" else None
-                    )
-                self.current_model = model_name
-                print(f"TTS model '{model_name}' loaded successfully!")
-            # Load SNAC model if not already loaded
-            if self.snac_model is None:
-                print("Loading SNAC model...")
-                self.snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-                # Keep SNAC on CPU to save VRAM
-                self.snac_model = self.snac_model.to("cpu")
-                print("SNAC model loaded successfully!")
-            return True, f"Model '{model_name}' loaded successfully"
-        except Exception as e:
-            error_msg = f"Error loading model '{model_name}': {e}"
-            print(error_msg)
-            return False, error_msg
-    def redistribute_codes(self, code_list):
-        """Convert code list to audio using SNAC decoder"""
-        layer_1 = []
-        layer_2 = []
-        layer_3 = []
-        for i in range((len(code_list)+1)//7):
-            if 7*i < len(code_list):
-                layer_1.append(code_list[7*i])
-            if 7*i+1 < len(code_list):
-                layer_2.append(code_list[7*i+1]-4096)
-            if 7*i+2 < len(code_list):
-                layer_3.append(code_list[7*i+2]-(2*4096))
-            if 7*i+3 < len(code_list):
-                layer_3.append(code_list[7*i+3]-(3*4096))
-            if 7*i+4 < len(code_list):
-                layer_2.append(code_list[7*i+4]-(4*4096))
-            if 7*i+5 < len(code_list):
-                layer_3.append(code_list[7*i+5]-(5*4096))
-            if 7*i+6 < len(code_list):
-                layer_3.append(code_list[7*i+6]-(6*4096))
-        codes = [
-            torch.tensor(layer_1).unsqueeze(0),
-            torch.tensor(layer_2).unsqueeze(0),
-            torch.tensor(layer_3).unsqueeze(0)
-        ]
-        # Move SNAC to GPU temporarily for decoding if available
-        if self.device == "cuda":
-            self.snac_model = self.snac_model.to("cuda")
-            codes = [c.to("cuda") for c in codes]
-        # Decode audio
-        with torch.no_grad():
-            audio_hat = self.snac_model.decode(codes)
-        # Move back to CPU to save memory
-        if self.device == "cuda":
-            audio_hat = audio_hat.cpu()
-            self.snac_model = self.snac_model.to("cpu")
-            torch.cuda.empty_cache()
-        return audio_hat
-    def generate_speech(self, text, voice="Elise", temperature=0.6, top_p=0.95):
-        """Generate speech from text"""
-        if not self.model or not self.tokenizer or not self.snac_model:
-            return None, "Models not loaded properly"
-        try:
-            # Prepare prompt
-            prompt = f"{voice}: {text}" if voice else text
-            # Tokenize
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
-            # Add special tokens
-            start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
-            end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
-            # Combine tokens
-            modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
-            # Create attention mask
-            attention_mask = torch.ones_like(modified_input_ids)
-            # Move to device
-            input_ids = modified_input_ids.to(self.device)
-            attention_mask = attention_mask.to(self.device)
-            # Generate
-            with torch.no_grad():
-                generated_ids = self.model.generate(
-                    input_ids=input_ids,
-                    attention_mask=attention_mask,
-                    max_new_tokens=1200,
-                    do_sample=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                    repetition_penalty=1.1,
-                    num_return_sequences=1,
-                    eos_token_id=128258,
-                    use_cache=True,
-                    pad_token_id=self.tokenizer.eos_token_id
-                )
-            # Clear GPU cache
-            if self.device == "cuda":
-                torch.cuda.empty_cache()
-            # Process generated tokens
-            token_to_find = 128257
-            token_to_remove = 128258
-            # Find last occurrence of token_to_find
-            token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
-            if len(token_indices[1]) > 0:
-                last_occurrence_idx = token_indices[1][-1].item()
-                cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
-            else:
-                cropped_tensor = generated_ids
-            # Remove unwanted tokens
-            row = cropped_tensor[0]
-            row = row[row != token_to_remove]
-            # Process codes
-            row_length = row.size(0)
-            new_length = (row_length // 7) * 7
-            trimmed_row = row[:new_length]
-            code_list = [t.item() - 128266 for t in trimmed_row]
-            if len(code_list) == 0:
-                return None, "No valid audio tokens generated"
-            # Generate audio
-            audio_tensor = self.redistribute_codes(code_list)
-            audio_array = audio_tensor.detach().squeeze().cpu().numpy()
-            # Convert to proper format for Gradio
-            sample_rate = 24000
-            return (sample_rate, audio_array), "✅ Speech generated successfully!"
-        except Exception as e:
-            return None, f"❌ Error generating speech: {str(e)}"
-# Initialize the model
-tts_model = TTSKhmerModel()
-def initialize_models(model_name):
-    """Initialize models on startup"""
-    print("Initializing models...")
-    success, message = tts_model.load_models(model_name)
-    gpu_info = f"GPU available: {torch.cuda.is_available()}"
-    if torch.cuda.is_available():
-        gpu_info += f" ({torch.cuda.get_device_name(0)})"
-    if success:
-        return f"✅ {message}! {gpu_info}"
-    else:
-        return f"❌ {message}. {gpu_info}"
-def change_model(model_name):
-    """Change the TTS model"""
-    if not model_name.strip():
-        return "⚠️ Please enter a valid model name"
-    success, message = tts_model.load_models(model_name.strip())
-    return message
-def text_to_speech(text, voice, temperature, top_p):
-    """Gradio interface function"""
-    if not text.strip():
-        return None, "⚠️ Please enter some text"
-    if not SNAC_AVAILABLE:
-        return None, "❌ SNAC model not available. Please check installation."
-    print(f"Generating speech for: {text[:50]}...")
-    audio_output, message = tts_model.generate_speech(text, voice, temperature, top_p)
-    return audio_output, message
-# Elegant and smooth CSS
-custom_css = """
-/* Import Google Fonts */
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
-/* Root variables for consistent theming */
-:root {
-    --primary-gradient: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-    --secondary-gradient: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
-    --tertiary-gradient: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
-    --quaternary-gradient: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
-    --glass-bg: rgba(255, 255, 255, 0.1);
-    --glass-border: rgba(255, 255, 255, 0.2);
-    --text-primary: #2d3748;
-    --text-secondary: #4a5568;
-    --shadow-light: 0 4px 20px rgba(0, 0, 0, 0.08);
-    --shadow-medium: 0 8px 30px rgba(0, 0, 0, 0.12);
-    --shadow-heavy: 0 12px 40px rgba(0, 0, 0, 0.15);
-    --border-radius: 16px;
-    --transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
-}
-/* Global styling */
-* {
-    font-family: 'Inter', 'Segoe UI', system-ui, -apple-system, sans-serif !important;
-    transition: var(--transition);
-}
-.gradio-container {
-    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-    min-height: 100vh;
-    padding: 20px;
-}
-/* Header Section */
-.header-container {
-    text-align: center;
-    background: var(--glass-bg);
-    backdrop-filter: blur(20px);
-    border: 1px solid var(--glass-border);
-    border-radius: 24px;
-    padding: 2.5rem;
-    margin-bottom: 2rem;
-    box-shadow: var(--shadow-medium);
-    position: relative;
-    overflow: hidden;
-}
-.header-container::before {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    right: 0;
-    bottom: 0;
-    background: var(--primary-gradient);
-    opacity: 0.1;
-    z-index: -1;
-}
-.main-title {
-    font-size: 3rem;
-    font-weight: 700;
-    background: var(--primary-gradient);
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: transparent;
-    background-clip: text;
-    margin: 0 0 1rem 0;
-    line-height: 1.2;
-}
-.subtitle {
-    font-size: 1.25rem;
-    color: var(--text-secondary);
-    margin: 0 0 0.5rem 0;
-    font-weight: 500;
-}
-.feature-badges {
-    display: flex;
-    justify-content: center;
-    gap: 1rem;
-    flex-wrap: wrap;
-    margin-top: 1.5rem;
-}
-.badge {
-    background: var(--glass-bg);
-    backdrop-filter: blur(10px);
-    border: 1px solid var(--glass-border);
-    padding: 0.5rem 1rem;
-    border-radius: 50px;
-    font-size: 0.875rem;
-    font-weight: 500;
-    color: var(--text-primary);
-    box-shadow: var(--shadow-light);
-}
-/* Card styling */
-.glass-card {
-    background: var(--glass-bg);
-    backdrop-filter: blur(20px);
-    border: 1px solid var(--glass-border);
-    border-radius: var(--border-radius);
-    padding: 1.5rem;
-    margin: 1rem 0;
-    box-shadow: var(--shadow-light);
-    transition: var(--transition);
-}
-.glass-card:hover {
-    box-shadow: var(--shadow-medium);
-    transform: translateY(-2px);
-}
-.card-title {
-    font-size: 1.25rem;
-    font-weight: 600;
-    color: var(--text-primary);
-    margin: 0 0 1rem 0;
-    display: flex;
-    align-items: center;
-    gap: 0.5rem;
-}
-/* Input styling */
-.smooth-input textarea,
-.smooth-input input {
-    background: rgba(255, 255, 255, 0.7) !important;
-    backdrop-filter: blur(10px) !important;
-    border: 2px solid transparent !important;
-    border-radius: 12px !important;
-    padding: 1rem !important;
-    font-size: 1rem !important;
-    transition: var(--transition) !important;
-    box-shadow: var(--shadow-light) !important;
-}
-.smooth-input textarea:focus,
-.smooth-input input:focus {
-    border-color: #667eea !important;
-    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
-    transform: translateY(-1px) !important;
-}
-/* Button styling */
-.generate-button {
-    background: var(--primary-gradient) !important;
-    border: none !important;
-    border-radius: 50px !important;
-    padding: 1rem 2rem !important;
-    font-size: 1rem !important;
-    font-weight: 600 !important;
-    color: white !important;
-    box-shadow: var(--shadow-medium) !important;
-    transition: var(--transition) !important;
-    text-transform: none !important;
-    letter-spacing: 0.5px !important;
-    min-height: 50px !important;
-}
-.generate-button:hover {
-    transform: translateY(-2px) !important;
-    box-shadow: var(--shadow-heavy) !important;
-}
-.model-button {
-    background: var(--tertiary-gradient) !important;
-    border: none !important;
-    border-radius: 12px !important;
-    padding: 0.75rem 1.5rem !important;
-    font-size: 0.875rem !important;
-    font-weight: 500 !important;
-    color: white !important;
-    box-shadow: var(--shadow-light) !important;
-    transition: var(--transition) !important;
-}
-.model-button:hover {
-    transform: translateY(-1px) !important;
-    box-shadow: var(--shadow-medium) !important;
-}
-/* Dropdown styling */
-.smooth-dropdown select {
-    background: rgba(255, 255, 255, 0.7) !important;
-    backdrop-filter: blur(10px) !important;
-    border: 2px solid transparent !important;
-    border-radius: 12px !important;
-    padding: 0.75rem 1rem !important;
-    font-weight: 500 !important;
-    color: var(--text-primary) !important;
-    transition: var(--transition) !important;
-}
-.smooth-dropdown select:focus {
-    border-color: #667eea !important;
-    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
-}
-/* Slider styling */
-.smooth-slider {
-    background: rgba(255, 255, 255, 0.5) !important;
-    border-radius: 12px !important;
-    padding: 1rem !important;
-    margin: 0.5rem 0 !important;
-}
-.smooth-slider input[type="range"] {
-    background: var(--quaternary-gradient) !important;
-    height: 6px !important;
-    border-radius: 3px !important;
-}
-/* Status display */
-.status-display {
-    background: rgba(255, 255, 255, 0.8) !important;
-    border: none !important;
-    border-radius: 12px !important;
-    padding: 1rem !important;
-    font-weight: 500 !important;
-    text-align: center !important;
-    box-shadow: var(--shadow-light) !important;
-}
-/* Audio player */
-.audio-container {
-    background: rgba(255, 255, 255, 0.6) !important;
-    border-radius: 16px !important;
-    padding: 1rem !important;
-    box-shadow: var(--shadow-light) !important;
-    backdrop-filter: blur(10px) !important;
-}
-/* Examples section */
-.examples-grid {
-    display: grid;
-    gap: 1rem;
-    margin-top: 1rem;
-}
-.example-card {
-    background: rgba(255, 255, 255, 0.4);
-    border: 1px solid var(--glass-border);
-    border-radius: 12px;
-    padding: 1rem;
-    cursor: pointer;
-    transition: var(--transition);
-    backdrop-filter: blur(5px);
-}
-.example-card:hover {
-    background: rgba(255, 255, 255, 0.6);
-    transform: translateY(-1px);
-    box-shadow: var(--shadow-light);
-}
-/* Info section */
-.info-grid {
-    display: grid;
-    grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
-    gap: 1rem;
-    margin-top: 1rem;
-}
-.info-item {
-    background: rgba(255, 255, 255, 0.3);
-    border-radius: 12px;
-    padding: 1rem;
-    backdrop-filter: blur(5px);
-    border: 1px solid var(--glass-border);
-}
-.info-title {
-    font-size: 1rem;
-    font-weight: 600;
-    margin: 0 0 0.5rem 0;
-    color: var(--text-primary);
-}
-.info-content {
-    font-size: 0.875rem;
-    color: var(--text-secondary);
-    line-height: 1.5;
-}
-/* Accordion styling */
-.accordion-container {
-    background: rgba(255, 255, 255, 0.3) !important;
-    border-radius: 12px !important;
-    border: 1px solid var(--glass-border) !important;
-    box-shadow: var(--shadow-light) !important;
-}
-/* Animation for loading states */
-@keyframes pulse {
-    0%, 100% { opacity: 1; }
-    50% { opacity: 0.7; }
-}
-.loading {
-    animation: pulse 2s infinite;
-}
-/* Responsive adjustments */
-@media (max-width: 768px) {
-    .main-title {
-        font-size: 2rem;
-    }
-    .feature-badges {
-        flex-direction: column;
-        align-items: center;
-    }
-    .info-grid {
-        grid-template-columns: 1fr;
-    }
-}
-/* Smooth scrolling */
-html {
-    scroll-behavior: smooth;
-}
-/* Custom scrollbar */
-::-webkit-scrollbar {
-    width: 8px;
-}
-::-webkit-scrollbar-track {
-    background: rgba(255, 255, 255, 0.1);
-    border-radius: 4px;
-}
-::-webkit-scrollbar-thumb {
-    background: var(--primary-gradient);
-    border-radius: 4px;
-}
-::-webkit-scrollbar-thumb:hover {
-    background: var(--secondary-gradient);
-}
-"""
-# Create the enhanced Gradio interface
-with gr.Blocks(
-    title="🎤 Advanced Khmer TTS Studio",
-    theme=gr.themes.Soft(
-        primary_hue="blue",
-        secondary_hue="emerald",
-        neutral_hue="slate",
-        font=gr.themes.GoogleFont("Inter")
-    ),
-    css=custom_css
-) as demo:
-    # Beautiful header
-    gr.HTML("""
-    <div class="header-container">
-        <h1 class="main-title">🎤 Advanced Khmer TTS Studio</h1>
-        <p class="subtitle">Professional AI-Powered Khmer Speech Synthesis Platform</p>
-        <div class="feature-badges">
-            <span class="badge">🎯 Multi-Model Support</span>
-            <span class="badge">🚀 Real-time Processing</span>
-            <span class="badge">🎭 Multiple Voices</span>
-            <span class="badge">⚡ GPU Accelerated</span>
-        </div>
-    </div>
-    """)
-    # Model selection section
-    with gr.Row():
-        model_input = gr.Textbox(
-            label="🤖 Model Selection",
-            placeholder="Enter HuggingFace model name (e.g., mrrtmob/tts-khm)",
-            value="mrrtmob/tts-khm",
-            elem_classes=["smooth-input"],
-            info="Enter any compatible TTS model from HuggingFace"
-        )
-        model_load_btn = gr.Button(
-            "🔄 Load Model",
-            elem_classes=["model-button"],
-            scale=0
-        )
     with gr.Row():
-        # Input Section
-        with gr.Column(scale=2):
-            gr.HTML('<div class="glass-card"><h2 class="card-title">📝 Text Input & Configuration</h2>')
             text_input = gr.Textbox(
-                label="📖 Text to Synthesize",
-                placeholder="សូមបញ្ចូលអត្ថបទភាសាខ្មែរនៅទីនេះ...",
-                lines=5,
-                value="សួស្ដី ខ្ញុំគឺជា AI អាចនិយាយភាសាខ្មែរបាន",
-                elem_classes=["smooth-input"]
             )
-            with gr.Row():
-                voice_dropdown = gr.Dropdown(
-                    label="🎭 Voice Model",
-                    choices=["Elise", "Jing", "Default"],
-                    value="Elise",
-                    elem_classes=["smooth-dropdown"],
-                    info="Select your preferred voice character"
-                )
-            with gr.Accordion("⚙️ Advanced Parameters", open=False, elem_classes=["accordion-container"]):
-                gr.HTML('<div style="padding: 1rem;">')
-                with gr.Row():
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.6,
-                        step=0.1,
-                        label="🌡️ Temperature",
-                        info="Controls randomness (0.1 = consistent, 1.0 = creative)",
-                        elem_classes=["smooth-slider"]
-                    )
-                    top_p = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.95,
-                        step=0.05,
-                        label="🎯 Top P",
-                        info="Controls diversity (0.1 = focused, 1.0 = diverse)",
-                        elem_classes=["smooth-slider"]
-                    )
-                gr.HTML('</div>')
-            generate_btn = gr.Button(
-                "🎵 Generate Speech",
-                size="lg",
-                elem_classes=["generate-button"]
             )
-            gr.HTML('</div>')
-        # Output Section
-        with gr.Column(scale=1):
-            gr.HTML('<div class="glass-card"><h2 class="card-title">🔊 Audio Output</h2>')
-            status_text = gr.Textbox(
-                label="📊 System Status",
-                value="🔄 Ready to load model...",
-                interactive=False,
-                elem_classes=["status-display"]
-            )
-            audio_output = gr.Audio(
-                label="🎵 Generated Speech",
-                type="numpy",
-                elem_classes=["audio-container"]
-            )
-            gr.HTML("""
-            <div style="background: rgba(255, 255, 255, 0.2); backdrop-filter: blur(10px);
-                        border-radius: 12px; padding: 1rem; margin-top: 1rem; text-align: center;">
-                <h4 style="margin: 0 0 0.5rem 0; color: #2d3748;">💡 Quick Tips</h4>
-                <p style="margin: 0; font-size: 0.875rem; color: #4a5568; line-height: 1.5;">
-                    🎧 Use headphones for optimal experience<br>
-                    ⚡ Processing typically takes 15-45 seconds<br>
-                    🔧 Adjust parameters for different results
-                </p>
-            </div>
-            """)
-            gr.HTML('</div>')
-    # Event handlers
-    model_load_btn.click(
-        fn=change_model,
-        inputs=[model_input],
-        outputs=[status_text]
     )
-    generate_btn.click(
-        fn=text_to_speech,
-        inputs=[text_input, voice_dropdown, temperature, top_p],
-        outputs=[audio_output, status_text]
     )
-    # Initialize with default model
-    demo.load(
-        fn=lambda: initialize_models("mrrtmob/tts-khm"),
-        outputs=[status_text]
-    )
-    # Enhanced Examples Section
-    gr.HTML("""
-    <div class="glass-card" style="margin-top: 2rem;">
-        <h2 class="card-title">📚 Example Texts</h2>
-        <p style="color: #4a5568; margin-bottom: 1rem;">Click any example below to try it instantly!</p>
-    </div>
-    """)
-    with gr.Row():
-        with gr.Column():
-            gr.Examples(
-                examples=[
-                    # Basic greetings
-                    "សួស្ដី អ្នកសុខសប្បាយទេ? ខ្ញុំគឺជា AI",
-                    "ជំរាបសួរ សូមស្វាគមន៍មកកាន់ប្រព័ន្ធ TTS",
-                    # Cultural content
-                    "ប្រទេសកម្ពុជាមានប្រាសាទអង្គរវត្តដ៏ល្បី",
-                    "បុណ្យចូលឆ្នាំខ្មែរគឺជាបុណ្យធំបំផុត",
-                    # Educational
-                    "ការអប់រំគឺជាមូលដ្ឋានសំខាន់នៃការអភិវឌ្ឍន៍",
-                    "បច្ចេកវិទ្យាកំពុងផ្លាស់ប្ដូរពិភពលោក",
-                ],
-                inputs=[text_input],
-                label="🌟 Popular Examples"
-            )
-        with gr.Column():
-            gr.Examples(
-                examples=[
-                    # Technology
-                    "ការរៀនម៉ាស៊ីននិង AI កំពុងរីកចម្រើន",
-                    "បណ្ដាញសង្គមបានផ្លាស់ប្ដូរជីវិតយើង",
-                    # Literature
-                    "ព្រះអាទិត្យរះនៅពេលព្រឹក ធ្វើឱ្យផ្ទៃទឹកស្រស់ស្អាត",
-                    "ក្រុមសត្វស្លាបបានហោះហើរនៅលំអង",
-                    # Information
-                    "ការពារបរិស្ថានគឺជាទំនួលខុសត្រូវរួម",
-                    "ព័ត៌មានគឺជាកម្លាំងនៃការអភិវឌ្ឍន៍",
-                ],
-                inputs=[text_input],
-                label="🎭 Creative Examples"
-            )
-    # Enhanced Information Section
-    gr.HTML("""
-    <div class="glass-card" style="margin-top: 2rem;">
-        <h2 class="card-title">📊 System Information & Guidelines</h2>
-        <div class="info-grid">
-            <div class="info-item">
-                <div class="info-title">🔧 System Status</div>
-                <div class="info-content">
-                    <strong>Unsloth:</strong> """ + ('✅ Available' if UNSLOTH_AVAILABLE else '❌ Not Available') + """<br>
-                    <strong>SNAC:</strong> """ + ('✅ Available' if SNAC_AVAILABLE else '❌ Not Available') + """<br>
-                    <strong>GPU:</strong> """ + ('✅ Available' if torch.cuda.is_available() else '❌ CPU Only') + """<br>
-                    <strong>Device:</strong> """ + ('CUDA' if torch.cuda.is_available() else 'CPU') + """
-                </div>
-            </div>
-            <div class="info-item">
-                <div class="info-title">🎭 Voice Profiles</div>
-                <div class="info-content">
-                    <strong>Elise:</strong> Clear, professional, news-style<br>
-                    <strong>Jing:</strong> Warm, conversational, friendly<br>
-                    <strong>Default:</strong> Standard neutral synthesis<br>
-                    <em>Each voice has unique characteristics</em>
-                </div>
-            </div>
-            <div class="info-item">
-                <div class="info-title">🤖 Model Support</div>
-                <div class="info-content">
-                    <strong>Current:</strong> mrrtmob/tts-khm (default)<br>
-                    <strong>Custom:</strong> Any HuggingFace TTS model<br>
-                    <strong>Format:</strong> username/model-name<br>
-                    <em>Models are cached after first load</em>
-                </div>
-            </div>
-            <div class="info-item">
-                <div class="info-title">💡 Best Practices</div>
-                <div class="info-content">
-                    • Use proper Khmer Unicode text<br>
-                    • Keep sentences under 100 characters<br>
-                    • Lower temperature = more consistent<br>
-                    • Higher Top P = more natural variation<br>
-                    • Test different voice models for variety
-                </div>
-            </div>
-            <div class="info-item">
-                <div class="info-title">⚡ Performance Tips</div>
-                <div class="info-content">
-                    • GPU acceleration automatically detected<br>
-                    • Models are loaded once and cached<br>
-                    • First generation may take longer<br>
-                    • SNAC decoding optimized for memory<br>
-                    • Batch processing not yet supported
-                </div>
-            </div>
-            <div class="info-item">
-                <div class="info-title">🔧 Technical Details</div>
-                <div class="info-content">
-                    <strong>Sample Rate:</strong> 24 kHz<br>
-                    <strong>Format:</strong> WAV (numpy array)<br>
-                    <strong>Max Tokens:</strong> 1200 new tokens<br>
-                    <strong>Sequence Length:</strong> 2048 tokens<br>
-                    <strong>Audio Quality:</strong> High-fidelity
-                </div>
-            </div>
-        </div>
-    </div>
-    """)
-    # Footer
-    gr.HTML("""
-    <div style="text-align: center; margin-top: 2rem; padding: 2rem;
-                background: rgba(255, 255, 255, 0.1); backdrop-filter: blur(10px);
-                border-radius: 16px; border: 1px solid rgba(255, 255, 255, 0.2);">
-        <h3 style="color: #2d3748; margin-bottom: 1rem;">🌟 Advanced Khmer TTS Studio</h3>
-        <p style="color: #4a5568; margin: 0; font-size: 0.875rem;">
-            Built with ❤️ for the Khmer community • Powered by state-of-the-art AI<br>
-            Supporting multiple models • Professional-grade speech synthesis
-        </p>
-    </div>
-    """)
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_api=False,
-        share=False,
-        favicon_path=None,
-        ssl_verify=False,
-        inbrowser=True
-    )

+import spaces
+from snac import SNAC
 import torch
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import snapshot_download
+from dotenv import load_dotenv
+load_dotenv()
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# SNAC is the audio codec whose decoder turns generated codes into a waveform.
print("Loading SNAC model...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)

model_name = "mrrtmob/tts-khm"

# Pre-fetch only the model config and safetensors weights; optimizer state,
# legacy checkpoints and tokenizer files are excluded from this download.
snapshot_download(
    repo_id=model_name,
    allow_patterns=[
        "config.json",
        "*.safetensors",
        "model.safetensors.index.json",
    ],
    ignore_patterns=[
        "optimizer.pt",
        "pytorch_model.bin",
        "training_args.bin",
        "scheduler.pt",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt",
        "tokenizer.*",
    ],
)

# Load the language model in bfloat16 and place it on the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model = model.to(device)

# The tokenizer is loaded separately, by name, from the same repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Khmer TTS model loaded to {device}")
def process_prompt(prompt, voice, tokenizer, device):
    """Turn a text prompt into model-ready input IDs and an attention mask.

    The text is optionally prefixed with ``"<voice>: "``, tokenized, and then
    wrapped in the special marker tokens the model expects around a request.

    Args:
        prompt: Text to synthesize.
        voice: Speaker name to prepend. When empty or ``None`` the text is
            used unchanged, so no stray ``": "`` or ``"None: "`` prefix
            reaches the model.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``.input_ids`` of shape ``(1, seq_len)``.
        device: Target device for the returned tensors.

    Returns:
        Tuple ``(input_ids, attention_mask)``, both of shape
        ``(1, seq_len + 3)`` and both moved to ``device``.
    """
    # Only add the speaker prefix when a voice was actually supplied.
    text = f"{voice}: {prompt}" if voice else prompt
    input_ids = tokenizer(text, return_tensors="pt").input_ids

    start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human

    # Final layout: start-of-human, tokenized text, end-of-text, end-of-human.
    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)

    # A single un-padded sequence: every position is attended to.
    attention_mask = torch.ones_like(modified_input_ids)
    return modified_input_ids.to(device), attention_mask.to(device)
def parse_output(generated_ids):
    """Extract the audio code list for the first sequence of a generation.

    Steps:
      1. Drop everything up to and including the last occurrence of token
         128257; when that token is absent the whole sequence is kept.
      2. Remove every occurrence of token 128258 from the first row.
      3. Truncate to a whole number of 7-token frames.
      4. Subtract the 128266 offset from each remaining token.

    Args:
        generated_ids: 2-D integer tensor of shape ``(batch, seq_len)``.

    Returns:
        A list of plain Python ints whose length is a multiple of 7 (possibly
        empty). Only the first row of the batch is returned.
    """
    token_to_find = 128257
    token_to_remove = 128258
    code_offset = 128266
    frame_size = 7

    # Column indices of every marker hit; the last one decides the crop point.
    marker_columns = (generated_ids == token_to_find).nonzero(as_tuple=True)[1]
    if marker_columns.numel() > 0:
        cropped = generated_ids[:, marker_columns[-1].item() + 1:]
    else:
        cropped = generated_ids

    # Single-sample use: only the first row is ever consumed by the caller.
    row = cropped[0]
    row = row[row != token_to_remove]

    usable = (row.size(0) // frame_size) * frame_size
    # One vectorised subtraction and a single host transfer; yields Python
    # ints rather than a list of 0-d (possibly GPU-resident) tensors.
    return (row[:usable] - code_offset).tolist()
def redistribute_codes(code_list, snac_model):
    """Regroup flat 7-token frames into three code layers and decode audio.

    Within each frame of 7 codes, position 0 goes to layer 1, positions 1 and
    4 go to layer 2, and positions 2, 3, 5 and 6 go to layer 3. Position ``k``
    has ``k * 4096`` subtracted before use.

    Args:
        code_list: Flat sequence of integer codes. Only complete frames are
            used; trailing codes that do not fill a frame are ignored.
        snac_model: Decoder exposing ``parameters()`` and ``decode(codes)``.

    Returns:
        The decoded audio as a squeezed NumPy array on the CPU.

    Raises:
        ValueError: If ``code_list`` holds fewer than 7 codes, so there is
            nothing to decode.
    """
    # Build the code tensors on whichever device the decoder lives on.
    device = next(snac_model.parameters()).device

    # Count complete frames only. The previous bound, (len + 1) // 7, ran one
    # frame too far when len % 7 == 6 and indexed past the end of the list.
    frame_count = len(code_list) // 7
    if frame_count == 0:
        raise ValueError("No complete 7-code audio frame to decode")

    layer_1 = []
    layer_2 = []
    layer_3 = []
    for frame_index in range(frame_count):
        base = 7 * frame_index
        layer_1.append(code_list[base])
        layer_2.append(code_list[base + 1] - 4096)
        layer_3.append(code_list[base + 2] - (2 * 4096))
        layer_3.append(code_list[base + 3] - (3 * 4096))
        layer_2.append(code_list[base + 4] - (4 * 4096))
        layer_3.append(code_list[base + 5] - (5 * 4096))
        layer_3.append(code_list[base + 6] - (6 * 4096))

    codes = [
        torch.tensor(layer_1, device=device).unsqueeze(0),
        torch.tensor(layer_2, device=device).unsqueeze(0),
        torch.tensor(layer_3, device=device).unsqueeze(0),
    ]

    # Inference only: avoid recording an autograd graph during decoding.
    with torch.no_grad():
        audio_hat = snac_model.decode(codes)
    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
+# Main generation function
@spaces.GPU()
def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
    """Generate speech audio for ``text`` in the given ``voice``.

    The prompt is built with ``process_prompt``, tokens are sampled with
    ``model.generate`` (stopping on token 128258), converted to codes with
    ``parse_output`` and decoded with ``redistribute_codes``.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only input yields
            ``None``.
        voice: Voice name prefixed to the prompt.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus sampling threshold passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        max_new_tokens: Upper bound on generated tokens; coerced to ``int``.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        ``(24000, audio_samples)`` on success, or ``None`` when there is
        nothing to speak or when generation fails (the error is printed).
    """
    # Guard against None as well as blank text: ``None.strip()`` would raise
    # outside the try block below. NOTE(review): the Clear handler writes None
    # into the textbox, which is presumably how None can reach this function.
    if text is None or not text.strip():
        return None
    try:
        progress(0.1, "Processing text...")
        input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
        progress(0.3, "Generating speech tokens...")
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # Slider values may arrive as floats; a token count must be an int.
                max_new_tokens=int(max_new_tokens),
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                num_return_sequences=1,
                eos_token_id=128258,
            )
        progress(0.6, "Processing speech tokens...")
        code_list = parse_output(generated_ids)
        progress(0.8, "Converting to audio...")
        audio_samples = redistribute_codes(code_list, snac_model)
        return (24000, audio_samples)  # Return sample rate and audio
    except Exception as e:
        # Top-level UI boundary: report the failure and return no audio rather
        # than propagating the exception to the caller.
        print(f"Error generating speech: {e}")
        return None
+# Examples for the UI - Khmer text examples
# Each row: [text, voice, temperature, top_p, repetition_penalty, max_new_tokens],
# in the same order as the inputs wired to the example gallery.
examples = [
    ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា ហើយខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។", "tara", 0.6, 0.95, 1.1, 1200],
    ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។", "dan", 0.7, 0.95, 1.1, 1200],
    ["ខ្ញុំរស់នៅក្នុងទីក្រុងភ្នំពេញ ហើយមានប៉ារ៉ាម៉ែត្រ <gasp> ច្រើនណាស់។", "leah", 0.6, 0.9, 1.2, 1200],
    # This text previously contained U+FFFD replacement characters (a corrupted
    # Khmer vowel); restored as "ច្រើនពេក" ("too much").
    ["ពេលខ្លះ ពេលខ្ញុំនិយាយច្រើនពេក ខ្ញុំត្រូវ <cough> សុំទោស។", "leo", 0.65, 0.9, 1.1, 1200],
    ["ការនិយាយនៅចំពោះមុខសាធារណៈ អាចមានការពិបាក។ <groan> ប៉ុន្តែបើហាត់ហាន គេអាចធ្វើបាន។", "jess", 0.7, 0.95, 1.1, 1200],
    ["ការឡើងភ្នំពិតជាហត់ណត់ ប៉ុន្តែទេសភាពពីលើនេះ ពិតជាស្រស់ស្អាត! <sigh> គួរឱ្យធ្វើ។", "mia", 0.65, 0.9, 1.15, 1200],
    ["តើអ្នកបានឮរឿងកំប្លែងនេះយ៉ាងណា? <laugh> ខ្ញុំមិនអាចបញ្ឈប់ការសើចបាននោះទេ។", "zac", 0.7, 0.95, 1.1, 1200],
    ["បន្ទាប់ពីរត់ម៉ារ៉ាតុងរួច ខ្ញុំហត់ណាស់ <yawn> ហើយត្រូវការសម្រាក។", "zoe", 0.6, 0.95, 1.1, 1200],
]
# Available voices
VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
# Available Emotive Tags (wrapped in backticks so they render as inline code in Markdown)
EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
+# Create Gradio interface
with gr.Blocks(title="Khmer Text-to-Speech") as demo:
    # Intro and usage tips (English + Khmer); the tag list is interpolated
    # from EMOTIVE_TAGS.
    gr.Markdown(f"""
    # 🎵 Khmer Text-to-Speech (ម៉ូដែលបម្លែងអត្ថបទជាសំលេង)
    Enter your Khmer text below and hear it converted to natural-sounding speech.
    បញ្ចូលអត្ថបទខ្មែររបស់អ្នកខាងក្រោម ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយធម្មជាតិ។
    ## Tips for better prompts (គន្លឹះសម្រាប់ការប្រើប្រាស់ដ៏ល្អ):
    - Add paralinguistic elements like {", ".join(EMOTIVE_TAGS)} for more human-like speech
    - Longer text prompts generally work better than very short phrases
    - អត្ថបទវែងជាទូទៅមានលទ្ធផលល្អជាងអត្ថបទខ្លី
    - Increasing `repetition_penalty` and `temperature` makes the model speak faster
    """)

    with gr.Row():
        # Left column: text, voice and sampling controls.
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                lines=5,
                label="Text to speak (អត្ថបទដើម្បីនិយាយ)",
                placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
            )
            voice = gr.Dropdown(
                label="Voice (សំលេង)",
                choices=VOICES,
                value="tara",
            )

            # Sampling parameters, collapsed by default.
            with gr.Accordion("Advanced Settings (ការកំណត់កម្រិតខ្ពស់)", open=False):
                temperature = gr.Slider(
                    label="Temperature",
                    info="Higher values (0.7-1.0) create more expressive but less stable speech",
                    minimum=0.1,
                    maximum=1.5,
                    step=0.05,
                    value=0.6,
                )
                top_p = gr.Slider(
                    label="Top P",
                    info="Nucleus sampling threshold",
                    minimum=0.1,
                    maximum=1.0,
                    step=0.05,
                    value=0.95,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition Penalty",
                    info="Higher values discourage repetitive patterns",
                    minimum=1.0,
                    maximum=2.0,
                    step=0.05,
                    value=1.1,
                )
                max_new_tokens = gr.Slider(
                    label="Max Length",
                    info="Maximum length of generated audio (in tokens)",
                    minimum=100,
                    maximum=2000,
                    step=100,
                    value=1200,
                )

            with gr.Row():
                submit_btn = gr.Button("Generate Speech (បង្កើតសំលេង)", variant="primary")
                clear_btn = gr.Button("Clear (លុប)")

        # Right column: the generated audio.
        with gr.Column(scale=2):
            audio_output = gr.Audio(type="numpy", label="Generated Speech (សំលេងដែលបង្កើតឡើង)")

    # Example gallery; rows follow the same order as the inputs listed here.
    gr.Examples(
        fn=generate_speech,
        examples=examples,
        inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output,
        cache_examples=True,
    )

    # Generate button: run synthesis with the current control values.
    submit_btn.click(
        generate_speech,
        inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output,
    )

    # Clear button: write None into both the text box and the audio player.
    clear_btn.click(
        lambda: (None, None),
        inputs=[],
        outputs=[text_input, audio_output],
    )
# Launch the app
if __name__ == "__main__":
    # Turn on request queuing first, then start the server without a public
    # share link and with server-side rendering disabled.
    queued_demo = demo.queue()
    queued_demo.launch(share=False, ssr_mode=False)