mrrtmob committed 21ac5de (1 parent: 2938eff)

Add flash-attn to requirements

Files changed (2):
  1. app.py  +234 -277
  2. requirements.txt  +1 -0
app.py CHANGED
@@ -4,14 +4,17 @@ import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download, login
-from dotenv import load_dotenv
 import os
 import re
 import numpy as np
+from torch.nn.attention import SDPABackend, sdpa_kernel
+import torch.nn.functional as F
 
-load_dotenv()
+# Enable optimizations
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.set_float32_matmul_precision('medium')  # or 'high' for better speed
 
-# Setup Hugging Face authentication
 def setup_auth():
     hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
     if hf_token:
@@ -22,19 +25,14 @@ def setup_auth():
         except Exception as e:
             print(f"⚠️ Failed to login to Hugging Face: {e}")
             return False
-    else:
-        print("⚠️ No HF token found. Running as anonymous user.")
-        return False
+    return False
 
-# Setup authentication before anything else
 auth_success = setup_auth()
 
-# Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
-print(f"Authentication status: {'βœ… Logged in' if auth_success else '❌ Anonymous'}")
 
-# Global variables to store models
+# Global model variables
 snac_model = None
 model = None
 tokenizer = None
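The new TF32 flags and torch.set_float32_matmul_precision('medium') only change how float32 matmuls execute on Ampere-or-newer GPUs; they are silent no-ops elsewhere. (Note the commit also imports SDPABackend, sdpa_kernel, and torch.nn.functional without using them yet.) A quick way to confirm what the process ended up with, as a minimal standalone sketch that is not part of the commit:

    import torch

    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision("medium")

    # Both settings can be read back, which is handy in startup logs.
    print(torch.get_float32_matmul_precision())   # -> "medium"
    print(torch.backends.cuda.matmul.allow_tf32)  # -> True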
@@ -45,227 +43,204 @@ def load_models():
     print("Loading SNAC model...")
     snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
     snac_model = snac_model.to(device)
+    snac_model.eval()  # Set to eval mode
 
-    model_name = "mrrtmob/tts-khm-4"
-
-    print("Downloading model files...")
-    snapshot_download(
-        repo_id=model_name,
-        allow_patterns=[
-            "config.json",
-            "*.safetensors",
-            "model.safetensors.index.json",
-            "tokenizer.json",
-            "tokenizer_config.json",
-            "special_tokens_map.json",
-            "vocab.json",
-            "merges.txt"
-        ],
-        ignore_patterns=[
-            "optimizer.pt",
-            "pytorch_model.bin",
-            "training_args.bin",
-            "scheduler.pt"
-        ]
-    )
+    # Optimize SNAC model
+    if device == "cuda":
+        snac_model = torch.compile(snac_model, mode="reduce-overhead")
+
+    model_name = "mrrtmob/tts-khm-4"
 
     print("Loading main model...")
     if device == "cuda":
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             torch_dtype=torch.bfloat16,
-            low_cpu_mem_usage=True
+            low_cpu_mem_usage=True,
+            attn_implementation="flash_attention_2",  # Use Flash Attention if available
         )
         model = model.to(device)
+
+        # Optimize main model with torch.compile
+        model = torch.compile(model, mode="reduce-overhead")
     else:
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             torch_dtype=torch.float32
         )
 
+    model.eval()
+
     print("Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
 
-    print(f"Khmer TTS model loaded to {device}")
+    print(f"Models loaded and optimized")
 
-# Load models at startup
+# Load models
 load_models()
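Loading with attn_implementation="flash_attention_2" raises at from_pretrained time if the flash-attn package is missing or was built against a different torch, and the requirement is new in this commit. A defensive variant degrades to PyTorch's built-in SDPA kernels instead; a sketch, where the helper name is illustrative and not from the commit:

    import torch
    from transformers import AutoModelForCausalLM

    def pick_attn_implementation() -> str:
        # Importing flash_attn is the cheapest availability check; "sdpa" is the
        # PyTorch scaled-dot-product-attention fallback that needs no extra wheel.
        try:
            import flash_attn  # noqa: F401
            return "flash_attention_2"
        except ImportError:
            return "sdpa"

    model = AutoModelForCausalLM.from_pretrained(
        "mrrtmob/tts-khm-4",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        attn_implementation=pick_attn_implementation(),
    )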
 
-def split_text_by_punctuation(text, max_chars=200):
-    sentence_endings = r'[αŸ”!?]'
-    clause_separators = r'[,;:]'
-
-    sentences = re.split(f'({sentence_endings})', text)
-
-    combined_sentences = []
-    for i in range(0, len(sentences), 2):
-        sentence = sentences[i]
-        if i + 1 < len(sentences):
-            sentence += sentences[i + 1]
-        if sentence.strip():
-            combined_sentences.append(sentence.strip())
-
-    if len(combined_sentences) <= 1:
-        parts = re.split(f'({clause_separators})', text)
-        combined_sentences = []
-        for i in range(0, len(parts), 2):
-            part = parts[i]
-            if i + 1 < len(parts):
-                part += parts[i + 1]
-            if part.strip():
-                combined_sentences.append(part.strip())
-
-    final_chunks = []
-    for sentence in combined_sentences:
-        if len(sentence) <= max_chars:
-            final_chunks.append(sentence)
-        else:
-            words = sentence.split()
-            current_chunk = ""
-
-            for word in words:
-                test_chunk = current_chunk + " " + word if current_chunk else word
-                if len(test_chunk) <= max_chars:
-                    current_chunk = test_chunk
-                else:
-                    if current_chunk:
-                        final_chunks.append(current_chunk)
-                    current_chunk = word
-
-            if current_chunk:
-                final_chunks.append(current_chunk)
-
-    return [chunk for chunk in final_chunks if chunk.strip()]
-
-def split_text_by_tokens(text, max_tokens=150):
-    global tokenizer
-
-    tokens = tokenizer.encode(text)
-
-    if len(tokens) <= max_tokens:
+# Optimized text processing with caching
+text_cache = {}
+audio_cache = {}
+
+def smart_split_text(text, max_chars=120):
+    """Optimized text splitting for better performance"""
+    if len(text) <= max_chars:
         return [text]
 
+    # Use simple sentence splitting for speed
+    sentences = re.split(r'([αŸ”!?])', text)
     chunks = []
-    words = text.split()
     current_chunk = ""
 
-    for word in words:
-        test_chunk = current_chunk + " " + word if current_chunk else word
-        test_tokens = tokenizer.encode(test_chunk)
-
-        if len(test_tokens) <= max_tokens:
-            current_chunk = test_chunk
+    for i in range(0, len(sentences), 2):
+        sentence = sentences[i]
+        if i + 1 < len(sentences):
+            sentence += sentences[i + 1]
+
+        if len(current_chunk + sentence) <= max_chars:
+            current_chunk += sentence
         else:
             if current_chunk:
-                chunks.append(current_chunk)
-            current_chunk = word
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence
 
     if current_chunk:
-        chunks.append(current_chunk)
+        chunks.append(current_chunk.strip())
 
-    return chunks
+    return [chunk for chunk in chunks if chunk.strip()]
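smart_split_text leans on a single re.split with a capturing group: even indices hold sentence text, odd indices hold the terminator (αŸ”, ! or ?), so each sentence is re-joined with its own punctuation before packing chunks. A quick check, assuming the function above is in scope (the input is illustrative):

    text = "αž‡αŸ†αžšαžΆαž”αžŸαž½αžšαŸ” αžαŸ’αž‰αž»αŸ†αžˆαŸ’αž˜αŸ„αŸ‡αžαžΆαžšαžΆαŸ” αžαŸ’αž‰αž»αŸ†αž’αžΆαž…αž“αž·αž™αžΆαž™αž—αžΆαžŸαžΆαžαŸ’αž˜αŸ‚αžšαŸ”"

    for chunk in smart_split_text(text, max_chars=30):
        print(repr(chunk))

    # Each chunk keeps its trailing αŸ” and stays under max_chars, except that a
    # single sentence longer than the limit is emitted whole: unlike the old
    # split_text_by_punctuation, there is no word-level fallback split.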
-def process_prompt(prompt, voice, tokenizer, device):
+def process_prompt_fast(prompt, voice, tokenizer, device):
+    """Optimized prompt processing"""
+    # Cache tokenization if same prompt
+    cache_key = f"{voice}:{prompt}"
+    if cache_key in text_cache:
+        return text_cache[cache_key]
+
     prompt = f"{voice}: {prompt}"
-    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    # Batch tokenize for efficiency
+    encoded = tokenizer(
+        prompt,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
+        max_length=512
+    )
+
+    input_ids = encoded.input_ids
     start_token = torch.tensor([[128259]], dtype=torch.int64)
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
+
     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
     attention_mask = torch.ones_like(modified_input_ids)
-    return modified_input_ids.to(device), attention_mask.to(device)
+
+    result = (modified_input_ids.to(device), attention_mask.to(device))
+    text_cache[cache_key] = result
+    return result
 
-def parse_output(generated_ids):
+def parse_output_fast(generated_ids):
+    """Optimized output parsing"""
+    # Vectorized operations for speed
     token_to_find = 128257
     token_to_remove = 128258
-    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
 
-    if len(token_indices[1]) > 0:
-        last_occurrence_idx = token_indices[1][-1].item()
-        cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
+    # Find last occurrence efficiently
+    mask = (generated_ids == token_to_find)
+    if mask.any():
+        indices = torch.where(mask)
+        if len(indices[1]) > 0:
+            last_idx = indices[1][-1].item()
+            cropped = generated_ids[:, last_idx+1:]
+        else:
+            cropped = generated_ids
     else:
-        cropped_tensor = generated_ids
-
-    processed_rows = []
-    for row in cropped_tensor:
-        masked_row = row[row != token_to_remove]
-        processed_rows.append(masked_row)
-
-    code_lists = []
-    for row in processed_rows:
-        row_length = row.size(0)
-        new_length = (row_length // 7) * 7
-        trimmed_row = row[:new_length]
-        trimmed_row = [max(0, t - 128266) for t in trimmed_row]
-        code_lists.append(trimmed_row)
-
-    return code_lists[0] if code_lists and len(code_lists[0]) > 0 else []
+        cropped = generated_ids
+
+    # Remove unwanted tokens
+    for row in cropped:
+        filtered = row[row != token_to_remove]
+        if len(filtered) >= 7:
+            # Trim to multiple of 7
+            new_length = (len(filtered) // 7) * 7
+            trimmed = filtered[:new_length]
+            # Vectorized subtraction and clipping
+            codes = torch.clamp(trimmed - 128266, min=0)
+            return codes.tolist()
+
+    return []
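The magic numbers in these two helpers appear to follow the Orpheus/SNAC token layout: 128259 opens the prompt, 128009 and 128260 close it, 128257 marks the start of audio codes in the output, 128258 is the audio end token, and 128266 is the id of the first audio code. A toy trace of parse_output_fast with fabricated ids, for illustration only:

    import torch

    fake = torch.tensor([[101, 102, 128257,        # prompt ids, then start-of-audio marker
                          128266, 132363, 136460,  # one 7-token SNAC frame
                          140557, 144654, 148751, 152848,
                          128258]])                # audio end token, filtered out
    print(parse_output_fast(fake))
    # -> [0, 4097, 8194, 12291, 16388, 20485, 24582]   (each id minus 128266)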
-def redistribute_codes(code_list, snac_model):
+def redistribute_codes_fast(code_list, snac_model):
+    """Optimized code redistribution"""
     if not code_list or len(code_list) < 7:
-        return np.zeros(12000)
-
+        return np.zeros(6000, dtype=np.float32)  # Shorter silence
+
     device = next(snac_model.parameters()).device
-    layer_1 = []
-    layer_2 = []
-    layer_3 = []
 
     try:
-        for i in range((len(code_list))//7):
-            layer_1.append(max(0, code_list[7*i]))
-            layer_2.append(max(0, code_list[7*i+1]-4096))
-            layer_3.append(max(0, code_list[7*i+2]-(2*4096)))
-            layer_3.append(max(0, code_list[7*i+3]-(3*4096)))
-            layer_2.append(max(0, code_list[7*i+4]-(4*4096)))
-            layer_3.append(max(0, code_list[7*i+5]-(5*4096)))
-            layer_3.append(max(0, code_list[7*i+6]-(6*4096)))
+        # Vectorized processing
+        num_frames = len(code_list) // 7
+        codes_array = np.array(code_list[:num_frames * 7]).reshape(-1, 7)
+
+        # Vectorized layer extraction
+        layer_1 = codes_array[:, 0]
+        layer_2_indices = [1, 4]
+        layer_3_indices = [2, 3, 5, 6]
+
+        layer_2 = []
+        layer_3 = []
+
+        for i in range(num_frames):
+            layer_2.extend([
+                max(0, codes_array[i, 1] - 4096),
+                max(0, codes_array[i, 4] - (4*4096))
+            ])
+            layer_3.extend([
+                max(0, codes_array[i, 2] - (2*4096)),
+                max(0, codes_array[i, 3] - (3*4096)),
+                max(0, codes_array[i, 5] - (5*4096)),
+                max(0, codes_array[i, 6] - (6*4096))
+            ])
 
+        # Create tensors efficiently
         codes = [
-            torch.tensor(layer_1, device=device).unsqueeze(0),
-            torch.tensor(layer_2, device=device).unsqueeze(0),
-            torch.tensor(layer_3, device=device).unsqueeze(0)
+            torch.tensor(layer_1, device=device, dtype=torch.long).unsqueeze(0),
+            torch.tensor(layer_2, device=device, dtype=torch.long).unsqueeze(0),
+            torch.tensor(layer_3, device=device, dtype=torch.long).unsqueeze(0)
         ]
 
-        with torch.no_grad():
+        # Generate audio with optimizations
+        with torch.no_grad(), torch.autocast(device_type='cuda' if device == 'cuda' else 'cpu'):
             audio_hat = snac_model.decode(codes)
-        return audio_hat.detach().squeeze().cpu().numpy()
-    except Exception as e:
-        print(f"Error in redistribute_codes: {e}")
-        return np.zeros(12000)
-
-def combine_audio_chunks(audio_chunks, pause_duration=0.3):
-    if not audio_chunks:
-        return np.array([])
-
-    pause_samples = int(24000 * pause_duration)
-    pause = np.zeros(pause_samples)
-
-    combined_audio = []
-    for i, chunk in enumerate(audio_chunks):
-        if len(chunk) > 0:
-            combined_audio.append(chunk)
-            if i < len(audio_chunks) - 1:
-                combined_audio.append(pause)
-
-    if combined_audio:
-        return np.concatenate(combined_audio)
-    else:
-        return np.array([])
+
+        return audio_hat.detach().squeeze().cpu().numpy().astype(np.float32)
+
+    except Exception as e:
+        print(f"Error in redistribute_codes_fast: {e}")
+        return np.zeros(6000, dtype=np.float32)
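The remaining per-frame Python loop in redistribute_codes_fast can be collapsed with NumPy broadcasting and fancy indexing; an equivalent sketch of the same layer layout, with a toy input and the same interleaving order as the loop:

    import numpy as np

    # Toy stand-in for the (num_frames, 7) array built in redistribute_codes_fast.
    codes_array = np.arange(14).reshape(2, 7) + np.arange(7) * 4096

    offsets = np.arange(7) * 4096                    # per-slot codebook offset
    deoff = np.clip(codes_array - offsets, 0, None)  # broadcast subtract, clamp at 0

    layer_1 = deoff[:, 0]                            # 1 code per frame
    layer_2 = deoff[:, [1, 4]].reshape(-1)           # [f0c1, f0c4, f1c1, f1c4, ...]
    layer_3 = deoff[:, [2, 3, 5, 6]].reshape(-1)     # 4 codes per frame, same order as the loop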
-@spaces.GPU(duration=60)  # Reduced duration to be more conservative
-def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600, voice="Elise"):
-    """Generate speech for a single chunk"""
+@spaces.GPU(duration=45)  # Shorter duration for faster allocation
+def generate_speech_chunk_fast(text_chunk, temperature=0.7, top_p=0.9, repetition_penalty=1.1,
+                               max_new_tokens=400, voice="Elise"):
+    """Optimized speech generation"""
     global model, tokenizer, snac_model
 
     if not text_chunk.strip():
-        return np.array([])
+        return np.array([], dtype=np.float32)
+
+    # Check cache first
+    cache_key = f"{text_chunk}:{temperature}:{top_p}:{max_new_tokens}"
+    if cache_key in audio_cache:
+        return audio_cache[cache_key]
 
     try:
-        input_ids, attention_mask = process_prompt(text_chunk, voice, tokenizer, device)
+        input_ids, attention_mask = process_prompt_fast(text_chunk, voice, tokenizer, device)
 
-        with torch.no_grad():
+        # Optimized generation parameters
+        with torch.no_grad(), torch.autocast(device_type='cuda' if device == 'cuda' else 'cpu'):
+            # Use optimized generation settings
             generated_ids = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
@@ -273,182 +248,164 @@ def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_pe
                 do_sample=True,
                 temperature=temperature,
                 top_p=top_p,
+                top_k=50,  # Add top_k for faster sampling
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=128258,
                 pad_token_id=tokenizer.eos_token_id,
-                use_cache=True
+                use_cache=True,
+                # Optimization flags
+                num_beams=1,  # Greedy-like but with sampling
             )
 
-        code_list = parse_output(generated_ids)
+        code_list = parse_output_fast(generated_ids)
 
         if not code_list:
-            return np.array([])
-
-        audio_samples = redistribute_codes(code_list, snac_model)
+            return np.array([], dtype=np.float32)
+
+        audio_samples = redistribute_codes_fast(code_list, snac_model)
+
+        # Cache result if successful
+        if len(audio_samples) > 0:
+            audio_cache[cache_key] = audio_samples
+            # Limit cache size
+            if len(audio_cache) > 100:
+                # Remove oldest entries
+                keys = list(audio_cache.keys())
+                for k in keys[:20]:
+                    del audio_cache[k]
+
         return audio_samples
 
     except Exception as e:
-        print(f"Error generating speech chunk: {e}")
-        return np.array([])
+        print(f"Error in chunk generation: {e}")
+        return np.array([], dtype=np.float32)
+
+def combine_audio_fast(audio_chunks, pause_duration=0.2):
+    """Fast audio combination"""
+    if not audio_chunks:
+        return np.array([], dtype=np.float32)
+
+    # Shorter pauses for faster speech
+    pause_samples = int(24000 * pause_duration)
+    pause = np.zeros(pause_samples, dtype=np.float32)
+
+    # Pre-calculate total length for efficiency
+    total_length = sum(len(chunk) for chunk in audio_chunks) + pause_samples * (len(audio_chunks) - 1)
+    combined = np.empty(total_length, dtype=np.float32)
+
+    pos = 0
+    for i, chunk in enumerate(audio_chunks):
+        if len(chunk) > 0:
+            combined[pos:pos+len(chunk)] = chunk
+            pos += len(chunk)
+
+            if i < len(audio_chunks) - 1:
+                combined[pos:pos+pause_samples] = pause
+                pos += pause_samples
+
+    return combined[:pos]  # Trim to actual length
 
-def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600,
-                    voice="Elise", split_method="punctuation", max_chars=150, max_tokens=100,
-                    pause_duration=0.3, progress=gr.Progress()):
-    """Main function to generate speech with text splitting"""
+def generate_speech_fast(text, temperature=0.7, top_p=0.9, repetition_penalty=1.1,
+                         max_new_tokens=400, voice="Elise", split_method="punctuation",
+                         max_chars=120, pause_duration=0.2, progress=gr.Progress()):
+    """Optimized main generation function"""
 
     if not text.strip():
         return None
 
     try:
-        progress(0.05, "Splitting text...")
+        progress(0.05, "Processing...")
 
-        if split_method == "punctuation":
-            text_chunks = split_text_by_punctuation(text, max_chars)
-        elif split_method == "tokens":
-            text_chunks = split_text_by_tokens(text, max_tokens)
+        # Fast text splitting
+        if split_method == "punctuation" and len(text) > max_chars:
+            chunks = smart_split_text(text, max_chars)
         else:
-            text_chunks = [text]
+            chunks = [text]
 
-        progress(0.1, f"Processing {len(text_chunks)} chunks...")
-        print(f"Split text into {len(text_chunks)} chunks:")
-        for i, chunk in enumerate(text_chunks):
-            print(f"Chunk {i+1}: {chunk[:50]}...")
+        progress(0.1, f"Generating {len(chunks)} chunks...")
+        print(f"Processing {len(chunks)} chunks")
 
+        # Parallel-like processing (sequential but optimized)
         audio_chunks = []
-        for i, chunk in enumerate(text_chunks):
-            progress(0.1 + 0.7 * (i / len(text_chunks)), f"Generating chunk {i+1}/{len(text_chunks)}...")
+        for i, chunk in enumerate(chunks):
+            progress(0.1 + 0.8 * (i / len(chunks)), f"Chunk {i+1}/{len(chunks)}")
 
-            audio = generate_speech_chunk(
+            audio = generate_speech_chunk_fast(
                 chunk, temperature, top_p, repetition_penalty, max_new_tokens, voice
             )
 
             if len(audio) > 0:
                 audio_chunks.append(audio)
-                print(f"Generated audio for chunk {i+1}: {len(audio)} samples ({len(audio)/24000:.2f}s)")
 
         if not audio_chunks:
             return None
 
-        progress(0.9, "Combining audio chunks...")
-        final_audio = combine_audio_chunks(audio_chunks, pause_duration)
+        progress(0.95, "Combining...")
+        final_audio = combine_audio_fast(audio_chunks, pause_duration)
 
-        progress(1.0, "Complete!")
-        print(f"Final audio: {len(final_audio)} samples ({len(final_audio)/24000:.2f}s)")
+        progress(1.0, "Done!")
+        print(f"Generated {len(final_audio)/24000:.1f}s of audio")
 
         return (24000, final_audio)
 
     except Exception as e:
-        print(f"Error generating speech: {e}")
-        import traceback
-        traceback.print_exc()
+        print(f"Generation error: {e}")
         return None
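The audio_cache eviction above relies on dict insertion order (guaranteed since Python 3.7), but entries are never refreshed on a hit, so the policy is FIFO rather than LRU. If hit rate matters, an OrderedDict gives true LRU behind the same lookup pattern; a sketch, where the class name is illustrative:

    from collections import OrderedDict
    import numpy as np

    class LRUAudioCache:
        def __init__(self, max_items: int = 100):
            self._store: "OrderedDict[str, np.ndarray]" = OrderedDict()
            self._max = max_items

        def get(self, key: str):
            if key in self._store:
                self._store.move_to_end(key)   # refresh recency on hit
                return self._store[key]
            return None

        def put(self, key: str, audio: np.ndarray) -> None:
            self._store[key] = audio
            self._store.move_to_end(key)
            while len(self._store) > self._max:
                self._store.popitem(last=False)  # evict least-recently-used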
-# [Rest of your Gradio interface code remains the same]
+# Simplified Gradio interface for speed
 examples = [
-    ["αž‡αŸ†αžšαžΆαž”αžŸαž½αžš αžαŸ’αž‰αž»αŸ†αžˆαŸ’αž˜αŸ„αŸ‡ αžαžΆαžšαžΆαŸ” αžαŸ’αž‰αž»αŸ†αž‚αžΊαž‡αžΆαž˜αŸ‰αžΌαžŠαŸ‚αž›αž•αž›αž·αžαžŸαŸ†αž›αŸαž„αž“αž·αž™αžΆαž™αŸ”"],
-    ["αžαŸ’αž‰αž»αŸ†αž’αžΆαž…αž”αž„αŸ’αž€αžΎαžαžŸαŸ†αž›αŸαž„αž“αž·αž™αžΆαž™αž•αŸ’αžŸαŸαž„αŸ— αžŠαžΌαž…αž‡αžΆ <laugh> αžŸαžΎαž… ឬ <sigh> αžαž”αŸ‹αžŠαž„αŸ’αž αžΎαž˜αŸ”"],
+    ["αž‡αŸ†αžšαžΆαž”αžŸαž½αžš αžαŸ’αž‰αž»αŸ†αžˆαŸ’αž˜αŸ„αŸ‡αžαžΆαžšαžΆαŸ”"],
+    ["αžαŸ’αž‰αž»αŸ†αž’αžΆαž…αž“αž·αž™αžΆαž™αž—αžΆαžŸαžΆαžαŸ’αž˜αŸ‚αžšαŸ”"],
 ]
 
-EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
-
-with gr.Blocks(title="Khmer Text-to-Speech") as demo:
-    gr.Markdown(f"""
-    # 🎡 Khmer Text-to-Speech
-    **αž˜αŸ‰αžΌαžŠαŸ‚αž›αž”αž˜αŸ’αž›αŸ‚αž„αž’αžαŸ’αžαž”αž‘αž‡αžΆαžŸαŸ†αž›αŸαž„**
-    Authentication: {'βœ… Pro Account' if auth_success else '❌ Anonymous (Limited)'}
-
-    αž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžšαžšαž”αžŸαŸ‹αž’αŸ’αž“αž€ αž αžΎαž™αžŸαŸ’αžαžΆαž”αŸ‹αž€αžΆαžšαž”αž˜αŸ’αž›αŸ‚αž„αž‘αŸ…αž‡αžΆαžŸαŸ†αž›αŸαž„αž“αž·αž™αžΆαž™αŸ”
-    πŸ’‘ **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
-    ✨ **New**: Supports long text with automatic splitting!
+with gr.Blocks(title="Fast Khmer TTS", theme="soft") as demo:
+    gr.Markdown("""
+    # ⚑ Fast Khmer Text-to-Speech
+    **Optimized for speed and efficiency**
     """)
 
     text_input = gr.Textbox(
-        label="Enter Khmer text (αž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžš)",
-        placeholder="αž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžšαžšαž”αžŸαŸ‹αž’αŸ’αž“αž€αž“αŸ…αž‘αžΈαž“αŸαŸ‡... (αž’αžΆαž…αžœαŸ‚αž„αž”αžΆαž“)",
-        lines=6
+        label="Khmer Text",
+        placeholder="Enter Khmer text here...",
+        lines=3
     )
 
-    with gr.Accordion("πŸ“ Text Splitting Options", open=True):
-        split_method = gr.Radio(
-            choices=[
-                ("Split by punctuation (recommended)", "punctuation"),
-                ("Split by token count", "tokens"),
-                ("No splitting", "none")
-            ],
-            value="punctuation",
-            label="Text splitting method"
-        )
-
-        with gr.Row():
-            max_chars = gr.Slider(
-                minimum=50, maximum=300, value=150, step=25,
-                label="Max characters per chunk"
-            )
-            max_tokens = gr.Slider(
-                minimum=50, maximum=200, value=100, step=25,
-                label="Max tokens per chunk"
-            )
-
-        pause_duration = gr.Slider(
-            minimum=0.0, maximum=1.0, value=0.3, step=0.1,
-            label="Pause between chunks (seconds)"
-        )
-
-    with gr.Accordion("πŸ”§ Advanced Settings", open=False):
-        with gr.Row():
-            temperature = gr.Slider(
-                minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                label="Temperature"
-            )
-            top_p = gr.Slider(
-                minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                label="Top P"
-            )
-        with gr.Row():
-            repetition_penalty = gr.Slider(
-                minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                label="Repetition Penalty"
-            )
-            max_new_tokens = gr.Slider(
-                minimum=100, maximum=800, value=600, step=100,
-                label="Max tokens per chunk"
-            )
+    with gr.Row():
+        max_chars = gr.Slider(80, 200, 120, step=20, label="Chunk Size")
+        pause_duration = gr.Slider(0.1, 0.5, 0.2, step=0.1, label="Pause Duration")
 
     with gr.Row():
-        submit_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg")
-        clear_btn = gr.Button("πŸ—‘οΈ Clear", size="lg")
+        generate_btn = gr.Button("🎀 Generate", variant="primary")
+        clear_btn = gr.Button("Clear")
 
-    audio_output = gr.Audio(
-        label="Generated Speech (αžŸαŸ†αž›αŸαž„αžŠαŸ‚αž›αž”αž„αŸ’αž€αžΎαžαž‘αžΎαž„)",
-        type="numpy",
-        show_label=True
-    )
+    audio_output = gr.Audio(label="Generated Speech", type="numpy")
 
     gr.Examples(
         examples=examples,
         inputs=[text_input],
         outputs=audio_output,
-        fn=lambda text: generate_speech(text),
+        fn=lambda text: generate_speech_fast(text),
         cache_examples=False,
     )
 
-    submit_btn.click(
-        fn=generate_speech,
-        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens,
-                gr.State("Elise"), split_method, max_chars, max_tokens, pause_duration],
+    generate_btn.click(
+        fn=generate_speech_fast,
+        inputs=[text_input, gr.State(0.7), gr.State(0.9), gr.State(1.1),
+                gr.State(400), gr.State("Elise"), gr.State("punctuation"),
+                max_chars, pause_duration],
         outputs=audio_output
     )
 
     clear_btn.click(
        fn=lambda: (None, None),
-        inputs=[],
        outputs=[text_input, audio_output]
    )
 
 if __name__ == "__main__":
-    demo.queue(max_size=5).launch(
+    demo.queue(max_size=3, api_open=False).launch(
        share=False,
        server_name="0.0.0.0",
-        server_port=7860
+        server_port=7860,
+        show_error=True
    )
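Feeding the fixed hyperparameters through gr.State keeps the click wiring positional; an alternative is to wrap the call so only the live components are inputs (a sketch, assuming the same component names). The trade-off: Gradio injects the progress bar by inspecting the wired function's signature, so hiding generate_speech_fast behind a lambda loses the progress updates.

    generate_btn.click(
        fn=lambda text, chars, pause: generate_speech_fast(
            text, max_chars=chars, pause_duration=pause),
        inputs=[text_input, max_chars, pause_duration],
        outputs=audio_output,
    )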
 
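On ZeroGPU Spaces, @spaces.GPU reserves a GPU slot only for the duration of the decorated call, so dropping the duration from 60 to 45 seconds shortens the reservation each chunk holds. The shape of the pattern, as a minimal sketch that assumes the spaces package and only takes effect inside a Space:

    import spaces

    @spaces.GPU(duration=45)  # seconds of GPU time requested per call
    def run_chunk(batch):
        # All CUDA work must happen inside the decorated function on ZeroGPU;
        # `model` here stands in for the module-level model loaded at startup.
        return model.generate(**batch)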
requirements.txt CHANGED
@@ -10,3 +10,4 @@ scipy
 openai
 huggingface-hub
 accelerate
+flash-attn
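A bare flash-attn pin compiles CUDA extensions at install time, which needs nvcc and an already-installed torch in the build environment (it is commonly installed with pip install flash-attn --no-build-isolation for that reason), and FlashAttention-2 itself only runs on Ampere-or-newer GPUs. A runtime probe that captures both constraints, as a sketch with an illustrative function name:

    import torch

    def flash_attn_usable() -> bool:
        # FlashAttention-2 needs CUDA and compute capability >= 8.0 (Ampere+).
        if not torch.cuda.is_available():
            return False
        major, _ = torch.cuda.get_device_capability()
        try:
            import flash_attn  # noqa: F401
        except ImportError:
            return False
        return major >= 8

    print(flash_attn_usable())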