Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed 30 days ago

Commit

844e3a3

1 Parent(s): 21ac5de

Remove flash-attn from requirements

Browse files

Files changed (2) hide show

app.py +277 -234
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -4,17 +4,14 @@ import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download, login
 import os
 import re
 import numpy as np
-from torch.nn.attention import SDPABackend, sdpa_kernel
-import torch.nn.functional as F
-# Enable optimizations
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-torch.set_float32_matmul_precision('medium')  # or 'high' for better speed
 def setup_auth():
     hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
     if hf_token:
@@ -25,14 +22,19 @@ def setup_auth():
         except Exception as e:
             print(f"⚠️ Failed to login to Hugging Face: {e}")
             return False
-    return False
 auth_success = setup_auth()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
-# Global model variables
 snac_model = None
 model = None
 tokenizer = None
@@ -43,204 +45,227 @@ def load_models():
     print("Loading SNAC model...")
     snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
     snac_model = snac_model.to(device)
-    snac_model.eval()  # Set to eval mode
-    # Optimize SNAC model
-    if device == "cuda":
-        snac_model = torch.compile(snac_model, mode="reduce-overhead")
     model_name = "mrrtmob/tts-khm-4"
     print("Loading main model...")
     if device == "cuda":
         model = AutoModelForCausalLM.from_pretrained(
-            model_name,
             torch_dtype=torch.bfloat16,
-            low_cpu_mem_usage=True,
-            attn_implementation="flash_attention_2",  # Use Flash Attention if available
         )
         model = model.to(device)
-        # Optimize main model with torch.compile
-        model = torch.compile(model, mode="reduce-overhead")
     else:
         model = AutoModelForCausalLM.from_pretrained(
-            model_name,
             torch_dtype=torch.float32
         )
-    model.eval()
     print("Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
-    print(f"Models loaded and optimized")
-# Load models
 load_models()
-# Optimized text processing with caching
-text_cache = {}
-audio_cache = {}
-def smart_split_text(text, max_chars=120):
-    """Optimized text splitting for better performance"""
-    if len(text) <= max_chars:
-        return [text]
-    # Use simple sentence splitting for speed
-    sentences = re.split(r'([។!?])', text)
-    chunks = []
-    current_chunk = ""
     for i in range(0, len(sentences), 2):
         sentence = sentences[i]
         if i + 1 < len(sentences):
             sentence += sentences[i + 1]
-        if len(current_chunk + sentence) <= max_chars:
-            current_chunk += sentence
         else:
             if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence
     if current_chunk:
-        chunks.append(current_chunk.strip())
-    return [chunk for chunk in chunks if chunk.strip()]
-def process_prompt_fast(prompt, voice, tokenizer, device):
-    """Optimized prompt processing"""
-    # Cache tokenization if same prompt
-    cache_key = f"{voice}:{prompt}"
-    if cache_key in text_cache:
-        return text_cache[cache_key]
     prompt = f"{voice}: {prompt}"
-    # Batch tokenize for efficiency
-    encoded = tokenizer(
-        prompt,
-        return_tensors="pt",
-        padding=False,
-        truncation=True,
-        max_length=512
-    )
-    input_ids = encoded.input_ids
     start_token = torch.tensor([[128259]], dtype=torch.int64)
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
     attention_mask = torch.ones_like(modified_input_ids)
-    result = (modified_input_ids.to(device), attention_mask.to(device))
-    text_cache[cache_key] = result
-    return result
-def parse_output_fast(generated_ids):
-    """Optimized output parsing"""
-    # Vectorized operations for speed
     token_to_find = 128257
     token_to_remove = 128258
-    # Find last occurrence efficiently
-    mask = (generated_ids == token_to_find)
-    if mask.any():
-        indices = torch.where(mask)
-        if len(indices[1]) > 0:
-            last_idx = indices[1][-1].item()
-            cropped = generated_ids[:, last_idx+1:]
-        else:
-            cropped = generated_ids
     else:
-        cropped = generated_ids
-    # Remove unwanted tokens
-    for row in cropped:
-        filtered = row[row != token_to_remove]
-        if len(filtered) >= 7:
-            # Trim to multiple of 7
-            new_length = (len(filtered) // 7) * 7
-            trimmed = filtered[:new_length]
-            # Vectorized subtraction and clipping
-            codes = torch.clamp(trimmed - 128266, min=0)
-            return codes.tolist()
-    return []
-def redistribute_codes_fast(code_list, snac_model):
-    """Optimized code redistribution"""
     if not code_list or len(code_list) < 7:
-        return np.zeros(6000, dtype=np.float32)  # Shorter silence
     device = next(snac_model.parameters()).device
     try:
-        # Vectorized processing
-        num_frames = len(code_list) // 7
-        codes_array = np.array(code_list[:num_frames * 7]).reshape(-1, 7)
-        # Vectorized layer extraction
-        layer_1 = codes_array[:, 0]
-        layer_2_indices = [1, 4]
-        layer_3_indices = [2, 3, 5, 6]
-        layer_2 = []
-        layer_3 = []
-        for i in range(num_frames):
-            layer_2.extend([
-                max(0, codes_array[i, 1] - 4096),
-                max(0, codes_array[i, 4] - (4*4096))
-            ])
-            layer_3.extend([
-                max(0, codes_array[i, 2] - (2*4096)),
-                max(0, codes_array[i, 3] - (3*4096)),
-                max(0, codes_array[i, 5] - (5*4096)),
-                max(0, codes_array[i, 6] - (6*4096))
-            ])
-        # Create tensors efficiently
         codes = [
-            torch.tensor(layer_1, device=device, dtype=torch.long).unsqueeze(0),
-            torch.tensor(layer_2, device=device, dtype=torch.long).unsqueeze(0),
-            torch.tensor(layer_3, device=device, dtype=torch.long).unsqueeze(0)
         ]
-        # Generate audio with optimizations
-        with torch.no_grad(), torch.autocast(device_type='cuda' if device == 'cuda' else 'cpu'):
             audio_hat = snac_model.decode(codes)
-        return audio_hat.detach().squeeze().cpu().numpy().astype(np.float32)
     except Exception as e:
-        print(f"Error in redistribute_codes_fast: {e}")
-        return np.zeros(6000, dtype=np.float32)
-@spaces.GPU(duration=45)  # Shorter duration for faster allocation
-def generate_speech_chunk_fast(text_chunk, temperature=0.7, top_p=0.9, repetition_penalty=1.1,
-                              max_new_tokens=400, voice="Elise"):
-    """Optimized speech generation"""
     global model, tokenizer, snac_model
     if not text_chunk.strip():
-        return np.array([], dtype=np.float32)
-    # Check cache first
-    cache_key = f"{text_chunk}:{temperature}:{top_p}:{max_new_tokens}"
-    if cache_key in audio_cache:
-        return audio_cache[cache_key]
     try:
-        input_ids, attention_mask = process_prompt_fast(text_chunk, voice, tokenizer, device)
-        # Optimized generation parameters
-        with torch.no_grad(), torch.autocast(device_type='cuda' if device == 'cuda' else 'cpu'):
-            # Use optimized generation settings
             generated_ids = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
@@ -248,164 +273,182 @@ def generate_speech_chunk_fast(text_chunk, temperature=0.7, top_p=0.9, repetitio
                 do_sample=True,
                 temperature=temperature,
                 top_p=top_p,
-                top_k=50,  # Add top_k for faster sampling
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=128258,
                 pad_token_id=tokenizer.eos_token_id,
-                use_cache=True,
-                # Optimization flags
-                num_beams=1,  # Greedy-like but with sampling
             )
-        code_list = parse_output_fast(generated_ids)
         if not code_list:
-            return np.array([], dtype=np.float32)
-        audio_samples = redistribute_codes_fast(code_list, snac_model)
-        # Cache result if successful
-        if len(audio_samples) > 0:
-            audio_cache[cache_key] = audio_samples
-            # Limit cache size
-            if len(audio_cache) > 100:
-                # Remove oldest entries
-                keys = list(audio_cache.keys())
-                for k in keys[:20]:
-                    del audio_cache[k]
         return audio_samples
     except Exception as e:
-        print(f"Error in chunk generation: {e}")
-        return np.array([], dtype=np.float32)
-def combine_audio_fast(audio_chunks, pause_duration=0.2):
-    """Fast audio combination"""
-    if not audio_chunks:
-        return np.array([], dtype=np.float32)
-    # Shorter pauses for faster speech
-    pause_samples = int(24000 * pause_duration)
-    pause = np.zeros(pause_samples, dtype=np.float32)
-    # Pre-calculate total length for efficiency
-    total_length = sum(len(chunk) for chunk in audio_chunks) + pause_samples * (len(audio_chunks) - 1)
-    combined = np.empty(total_length, dtype=np.float32)
-    pos = 0
-    for i, chunk in enumerate(audio_chunks):
-        if len(chunk) > 0:
-            combined[pos:pos+len(chunk)] = chunk
-            pos += len(chunk)
-            if i < len(audio_chunks) - 1:
-                combined[pos:pos+pause_samples] = pause
-                pos += pause_samples
-    return combined[:pos]  # Trim to actual length
-def generate_speech_fast(text, temperature=0.7, top_p=0.9, repetition_penalty=1.1,
-                        max_new_tokens=400, voice="Elise", split_method="punctuation",
-                        max_chars=120, pause_duration=0.2, progress=gr.Progress()):
-    """Optimized main generation function"""
     if not text.strip():
         return None
     try:
-        progress(0.05, "Processing...")
-        # Fast text splitting
-        if split_method == "punctuation" and len(text) > max_chars:
-            chunks = smart_split_text(text, max_chars)
         else:
-            chunks = [text]
-        progress(0.1, f"Generating {len(chunks)} chunks...")
-        print(f"Processing {len(chunks)} chunks")
-        # Parallel-like processing (sequential but optimized)
         audio_chunks = []
-        for i, chunk in enumerate(chunks):
-            progress(0.1 + 0.8 * (i / len(chunks)), f"Chunk {i+1}/{len(chunks)}")
-            audio = generate_speech_chunk_fast(
                 chunk, temperature, top_p, repetition_penalty, max_new_tokens, voice
             )
             if len(audio) > 0:
                 audio_chunks.append(audio)
         if not audio_chunks:
             return None
-        progress(0.95, "Combining...")
-        final_audio = combine_audio_fast(audio_chunks, pause_duration)
-        progress(1.0, "Done!")
-        print(f"Generated {len(final_audio)/24000:.1f}s of audio")
         return (24000, final_audio)
     except Exception as e:
-        print(f"Generation error: {e}")
         return None
-# Simplified Gradio interface for speed
 examples = [
-    ["ជំរាបសួរ ខ្ញុំឈ្មោះតារា។"],
-    ["ខ្ញុំអាចនិយាយភាសាខ្មែរ។"],
 ]
-with gr.Blocks(title="Fast Khmer TTS", theme="soft") as demo:
-    gr.Markdown("""
-    # ⚡ Fast Khmer Text-to-Speech
-    **Optimized for speed and efficiency**
     """)
     text_input = gr.Textbox(
-        label="Khmer Text",
-        placeholder="Enter Khmer text here...",
-        lines=3
     )
-    with gr.Row():
-        max_chars = gr.Slider(80, 200, 120, step=20, label="Chunk Size")
-        pause_duration = gr.Slider(0.1, 0.5, 0.2, step=0.1, label="Pause Duration")
     with gr.Row():
-        generate_btn = gr.Button("🎤 Generate", variant="primary")
-        clear_btn = gr.Button("Clear")
-    audio_output = gr.Audio(label="Generated Speech", type="numpy")
     gr.Examples(
         examples=examples,
         inputs=[text_input],
         outputs=audio_output,
-        fn=lambda text: generate_speech_fast(text),
         cache_examples=False,
     )
-    generate_btn.click(
-        fn=generate_speech_fast,
-        inputs=[text_input, gr.State(0.7), gr.State(0.9), gr.State(1.1),
-                gr.State(400), gr.State("Elise"), gr.State("punctuation"),
-                max_chars, pause_duration],
         outputs=audio_output
     )
     clear_btn.click(
         fn=lambda: (None, None),
         outputs=[text_input, audio_output]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=3, api_open=False).launch(
         share=False,
         server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True
     )

 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download, login
+from dotenv import load_dotenv
 import os
 import re
 import numpy as np
+load_dotenv()
+# Setup Hugging Face authentication
 def setup_auth():
     hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
     if hf_token:
         except Exception as e:
             print(f"⚠️ Failed to login to Hugging Face: {e}")
             return False
+    else:
+        print("⚠️ No HF token found. Running as anonymous user.")
+        return False
+# Setup authentication before anything else
 auth_success = setup_auth()
+# Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
+print(f"Authentication status: {'✅ Logged in' if auth_success else '❌ Anonymous'}")
+# Global variables to store models
 snac_model = None
 model = None
 tokenizer = None
     print("Loading SNAC model...")
     snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
     snac_model = snac_model.to(device)
     model_name = "mrrtmob/tts-khm-4"
+    print("Downloading model files...")
+    snapshot_download(
+        repo_id=model_name,
+        allow_patterns=[
+            "config.json",
+            "*.safetensors",
+            "model.safetensors.index.json",
+            "tokenizer.json",
+            "tokenizer_config.json",
+            "special_tokens_map.json",
+            "vocab.json",
+            "merges.txt"
+        ],
+        ignore_patterns=[
+            "optimizer.pt",
+            "pytorch_model.bin",
+            "training_args.bin",
+            "scheduler.pt"
+        ]
+    )
     print("Loading main model...")
     if device == "cuda":
         model = AutoModelForCausalLM.from_pretrained(
+            model_name,
             torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True
         )
         model = model.to(device)
     else:
         model = AutoModelForCausalLM.from_pretrained(
+            model_name,
             torch_dtype=torch.float32
         )
     print("Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
+    print(f"Khmer TTS model loaded to {device}")
+# Load models at startup
 load_models()
def _split_keeping_delimiters(text, delimiter_pattern):
    """Split *text* after every delimiter match, keeping the delimiter attached.

    Returns the stripped, non-empty segments in their original order.
    """
    # With a single capturing group, re.split alternates text (even indices)
    # and the delimiter that followed it (odd indices).
    parts = re.split(f'({delimiter_pattern})', text)
    segments = []
    for i in range(0, len(parts), 2):
        segment = parts[i]
        if i + 1 < len(parts):
            segment += parts[i + 1]
        segment = segment.strip()
        if segment:
            segments.append(segment)
    return segments


def _hard_wrap(text, limit):
    """Cut whitespace-free *text* into pieces of at most *limit* characters."""
    import unicodedata

    pieces = []
    start = 0
    while len(text) - start > limit:
        cut = start + limit
        # Back off so the cut does not land before a combining mark or right
        # after the Khmer coeng sign (U+17D2), which would tear a written
        # cluster apart.
        while cut > start and (
            unicodedata.category(text[cut]).startswith("M")
            or text[cut - 1] == "\u17d2"
        ):
            cut -= 1
        if cut == start:
            # The cluster itself is longer than the limit: cut at the limit.
            cut = start + limit
        pieces.append(text[start:cut])
        start = cut
    if start < len(text):
        pieces.append(text[start:])
    return pieces


def split_text_by_punctuation(text, max_chars=200):
    """Split *text* into chunks of at most *max_chars* characters.

    The text is first split after sentence endings (។ ! ?). If that yields at
    most one sentence, it is split after clause separators (, ; :) instead.
    Segments that are still too long are packed greedily word by word, and a
    single whitespace-free run longer than the limit is hard-wrapped so that
    no returned chunk exceeds the limit.

    Args:
        text: Input text.
        max_chars: Maximum characters per chunk; values below 1 are treated
            as 1.

    Returns:
        List of non-empty, stripped chunks in reading order.
    """
    limit = max(1, int(max_chars))
    sentence_endings = r'[។!?]'
    clause_separators = r'[,;:]'

    segments = _split_keeping_delimiters(text, sentence_endings)
    if len(segments) <= 1:
        segments = _split_keeping_delimiters(text, clause_separators)

    final_chunks = []
    for segment in segments:
        if len(segment) <= limit:
            final_chunks.append(segment)
            continue

        current_chunk = ""
        for word in segment.split():
            if len(word) > limit:
                # Text written without spaces arrives here as one huge "word";
                # wrap it instead of emitting an oversized chunk.
                if current_chunk:
                    final_chunks.append(current_chunk)
                *full_pieces, current_chunk = _hard_wrap(word, limit)
                final_chunks.extend(full_pieces)
                continue
            test_chunk = f"{current_chunk} {word}" if current_chunk else word
            if len(test_chunk) <= limit:
                current_chunk = test_chunk
            else:
                final_chunks.append(current_chunk)
                current_chunk = word
        if current_chunk:
            final_chunks.append(current_chunk)

    return [chunk for chunk in final_chunks if chunk.strip()]
def split_text_by_tokens(text, max_tokens=150):
    """Split *text* on whitespace into chunks that encode to at most *max_tokens* tokens.

    Token counts come from the module-level ``tokenizer``. Text that already
    fits is returned unchanged as a single-element list. A single word that
    alone exceeds the budget is still emitted as its own chunk.
    """
    if len(tokenizer.encode(text)) <= max_tokens:
        return [text]

    pieces = []
    pending = ""
    for word in text.split():
        candidate = f"{pending} {word}" if pending else word
        # Re-encode the whole candidate each time: token counts are taken
        # from the joined string, not summed per word.
        if len(tokenizer.encode(candidate)) <= max_tokens:
            pending = candidate
            continue
        if pending:
            pieces.append(pending)
        pending = word

    if pending:
        pieces.append(pending)
    return pieces
def process_prompt(prompt, voice, tokenizer, device):
    """Build model inputs for one prompt.

    The text ``"{voice}: {prompt}"`` is tokenized, then wrapped with the fixed
    ids 128259 (front) and 128009, 128260 (back).

    Returns:
        ``(input_ids, attention_mask)``, both moved to *device*; the mask is
        all ones.
    """
    text_ids = tokenizer(f"{voice}: {prompt}", return_tensors="pt").input_ids
    prefix = torch.tensor([[128259]], dtype=torch.int64)
    suffix = torch.tensor([[128009, 128260]], dtype=torch.int64)
    framed_ids = torch.cat((prefix, text_ids, suffix), dim=1)
    mask = torch.ones_like(framed_ids)
    return framed_ids.to(device), mask.to(device)
def parse_output(generated_ids):
    """Extract the audio code list from generated token ids.

    Everything up to and including the last 128257 token is discarded, every
    128258 token is removed, the remainder is trimmed to a multiple of 7, and
    128266 is subtracted from each id (clamped at 0).

    Args:
        generated_ids: 2-D integer tensor of generated ids. Only the first
            row is converted.

    Returns:
        A list of plain Python ints, or ``[]`` when no complete group of 7
        codes is available.
    """
    token_to_find = 128257
    token_to_remove = 128258
    code_offset = 128266
    frame_size = 7

    marker_columns = (generated_ids == token_to_find).nonzero(as_tuple=True)[1]
    if marker_columns.numel() > 0:
        last_occurrence_idx = marker_columns[-1].item()
        cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
    else:
        cropped_tensor = generated_ids

    if cropped_tensor.size(0) == 0:
        return []

    row = cropped_tensor[0]
    row = row[row != token_to_remove]
    usable_length = (row.size(0) // frame_size) * frame_size

    # clamp + tolist runs on the tensor in one step and yields real ints.
    # Iterating the tensor element by element would yield 0-d tensors.
    return torch.clamp(row[:usable_length] - code_offset, min=0).tolist()
def redistribute_codes(code_list, snac_model):
    """Regroup a flat code list into three layers and decode it to audio.

    Codes are consumed in frames of 7. Slot 0 goes to layer 1, slots 1 and 4
    to layer 2, and slots 2, 3, 5 and 6 to layer 3. The code in slot ``k`` has
    ``k * 4096`` subtracted, and every value is floored at 0.

    Returns:
        The decoded audio as a NumPy array, or 12000 zeros when fewer than 7
        codes are given or decoding raises.
    """
    if not code_list or len(code_list) < 7:
        return np.zeros(12000)

    device = next(snac_model.parameters()).device
    layers = ([], [], [])
    # Target layer index for each of the 7 slots in a frame.
    slot_to_layer = (0, 1, 2, 2, 1, 2, 2)

    try:
        for frame in range(len(code_list) // 7):
            base = 7 * frame
            for slot, layer in enumerate(slot_to_layer):
                value = code_list[base + slot]
                if slot:
                    value = value - slot * 4096
                layers[layer].append(max(0, value))

        codes = [torch.tensor(layer, device=device).unsqueeze(0) for layer in layers]
        with torch.no_grad():
            audio_hat = snac_model.decode(codes)
        return audio_hat.detach().squeeze().cpu().numpy()
    except Exception as e:
        print(f"Error in redistribute_codes: {e}")
        return np.zeros(12000)
def combine_audio_chunks(audio_chunks, pause_duration=0.3):
    """Concatenate audio chunks with silence between consecutive chunks.

    Empty chunks are dropped before joining, so silence is inserted only
    between two chunks that both contain audio and never after the last one.
    The silence uses the chunks' common dtype, so float32 audio is not
    upcast to float64.

    Args:
        audio_chunks: Sequence of 1-D sample arrays.
        pause_duration: Silence between chunks in seconds, at 24000 samples
            per second. Negative values are treated as 0.

    Returns:
        One 1-D array, or an empty array when there is no audio.
    """
    if not audio_chunks:
        return np.array([])

    voiced = [np.asarray(chunk) for chunk in audio_chunks if len(chunk) > 0]
    if not voiced:
        return np.array([])

    pause_samples = max(0, int(24000 * pause_duration))
    pause = np.zeros(pause_samples, dtype=np.result_type(*voiced))

    pieces = [voiced[0]]
    for chunk in voiced[1:]:
        pieces.append(pause)
        pieces.append(chunk)
    return np.concatenate(pieces)
+@spaces.GPU(duration=60)  # Reduced duration to be more conservative
+def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600, voice="Elise"):
+    """Generate speech for a single chunk"""
     global model, tokenizer, snac_model
     if not text_chunk.strip():
+        return np.array([])
     try:
+        input_ids, attention_mask = process_prompt(text_chunk, voice, tokenizer, device)
+        with torch.no_grad():
             generated_ids = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 do_sample=True,
                 temperature=temperature,
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=128258,
                 pad_token_id=tokenizer.eos_token_id,
+                use_cache=True
             )
+        code_list = parse_output(generated_ids)
         if not code_list:
+            return np.array([])
+        audio_samples = redistribute_codes(code_list, snac_model)
         return audio_samples
     except Exception as e:
+        print(f"Error generating speech chunk: {e}")
+        return np.array([])
def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600,
                   voice="Elise", split_method="punctuation", max_chars=150, max_tokens=100,
                   pause_duration=0.3, progress=gr.Progress()):
    """Generate speech for *text*, splitting it into chunks first.

    Args:
        text: Input text; ``None`` or blank input yields ``None``.
        temperature, top_p, repetition_penalty, max_new_tokens, voice:
            Passed through to ``generate_speech_chunk`` for every chunk.
        split_method: ``"punctuation"`` uses ``split_text_by_punctuation``
            with *max_chars*; ``"tokens"`` uses ``split_text_by_tokens`` with
            *max_tokens*; any other value processes the text as one chunk.
        pause_duration: Seconds of silence placed between chunks.
        progress: Gradio progress tracker.

    Returns:
        ``(24000, samples)``, or ``None`` when nothing was generated or an
        error occurred.
    """
    # The Clear handler writes None into the textbox, so guard against None
    # here instead of letting .strip() raise outside the try block.
    if not text or not text.strip():
        return None

    try:
        progress(0.05, "Splitting text...")
        if split_method == "punctuation":
            text_chunks = split_text_by_punctuation(text, max_chars)
        elif split_method == "tokens":
            text_chunks = split_text_by_tokens(text, max_tokens)
        else:
            text_chunks = [text]

        progress(0.1, f"Processing {len(text_chunks)} chunks...")
        print(f"Split text into {len(text_chunks)} chunks:")
        for i, chunk in enumerate(text_chunks):
            print(f"Chunk {i+1}: {chunk[:50]}...")

        audio_chunks = []
        for i, chunk in enumerate(text_chunks):
            progress(0.1 + 0.7 * (i / len(text_chunks)), f"Generating chunk {i+1}/{len(text_chunks)}...")
            audio = generate_speech_chunk(
                chunk, temperature, top_p, repetition_penalty, max_new_tokens, voice
            )
            # Chunks that produced no audio are skipped.
            if len(audio) > 0:
                audio_chunks.append(audio)
                print(f"Generated audio for chunk {i+1}: {len(audio)} samples ({len(audio)/24000:.2f}s)")

        if not audio_chunks:
            return None

        progress(0.9, "Combining audio chunks...")
        final_audio = combine_audio_chunks(audio_chunks, pause_duration)
        progress(1.0, "Complete!")
        print(f"Final audio: {len(final_audio)} samples ({len(final_audio)/24000:.2f}s)")
        return (24000, final_audio)
    except Exception as e:
        # Top-level UI boundary: log the full traceback and return no audio.
        print(f"Error generating speech: {e}")
        import traceback
        traceback.print_exc()
        return None
+# Gradio interface: example prompts, UI controls, and event wiring
 examples = [
+    ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា។ ខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។"],
+    ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។"],
 ]
+EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
# Gradio UI: text input, splitting and sampling controls, and event wiring.
with gr.Blocks(title="Khmer Text-to-Speech") as demo:
    gr.Markdown(f"""
    # 🎵 Khmer Text-to-Speech
    **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
    Authentication: {'✅ Pro Account' if auth_success else '❌ Anonymous (Limited)'}
    បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
    💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
    ✨ **New**: Supports long text with automatic splitting!
    """)
    text_input = gr.Textbox(
        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
        placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ... (អាចវែងបាន)",
        lines=6
    )
    # Controls for how the input text is divided before generation.
    with gr.Accordion("📝 Text Splitting Options", open=True):
        split_method = gr.Radio(
            choices=[
                ("Split by punctuation (recommended)", "punctuation"),
                ("Split by token count", "tokens"),
                ("No splitting", "none")
            ],
            value="punctuation",
            label="Text splitting method"
        )
        with gr.Row():
            max_chars = gr.Slider(
                minimum=50, maximum=300, value=150, step=25,
                label="Max characters per chunk"
            )
            max_tokens = gr.Slider(
                minimum=50, maximum=200, value=100, step=25,
                label="Max tokens per chunk"
            )
        pause_duration = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.3, step=0.1,
            label="Pause between chunks (seconds)"
        )
    # Sampling parameters passed to generation.
    with gr.Accordion("🔧 Advanced Settings", open=False):
        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                label="Top P"
            )
        with gr.Row():
            repetition_penalty = gr.Slider(
                minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                label="Repetition Penalty"
            )
            # Labelled differently from the text-splitting "Max tokens per
            # chunk" slider: this one is passed as max_new_tokens.
            max_new_tokens = gr.Slider(
                minimum=100, maximum=800, value=600, step=100,
                label="Max new tokens per chunk (generation length)"
            )
    with gr.Row():
        submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
        clear_btn = gr.Button("🗑️ Clear", size="lg")
    audio_output = gr.Audio(
        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
        type="numpy",
        show_label=True
    )
    # Examples run with the default settings of generate_speech.
    gr.Examples(
        examples=examples,
        inputs=[text_input],
        outputs=audio_output,
        fn=lambda text: generate_speech(text),
        cache_examples=False,
    )
    # Input order must match the positional parameters of generate_speech;
    # the voice is fixed to "Elise".
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens,
                gr.State("Elise"), split_method, max_chars, max_tokens, pause_duration],
        outputs=audio_output
    )
    clear_btn.click(
        fn=lambda: (None, None),
        inputs=[],
        outputs=[text_input, audio_output]
    )
 if __name__ == "__main__":
+    demo.queue(max_size=5).launch(
         share=False,
         server_name="0.0.0.0",
+        server_port=7860
     )

requirements.txt CHANGED Viewed

@@ -9,5 +9,4 @@ gradio
 scipy
 openai
 huggingface-hub
-accelerate
-flash-attn

 scipy
 openai
 huggingface-hub
+accelerate