Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed 25 days ago

Commit

3d2ce0a

1 Parent(s): 63b6422

Implement rate limiting for speech generation and enhance text validation; improve UI with character count and custom CSS

Browse files

Files changed (1) hide show

app.py +149 -27

app.py CHANGED Viewed

@@ -1,18 +1,40 @@
 import os
 import spaces
 from snac import SNAC
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import snapshot_download, login, whoami
 from dotenv import load_dotenv
 load_dotenv()
 # Get HF token from environment variables
 hf_token = os.getenv("HF_TOKEN")
-# Simplified authentication - no debug prints
 if hf_token:
     login(token=hf_token)
@@ -22,9 +44,11 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
 model_name = "mrrtmob/tts-khm-kore"
 # Download only model config and safetensors with token
 snapshot_download(
     repo_id=model_name,
@@ -46,7 +70,9 @@ snapshot_download(
         "scheduler.pt"
     ]
 )
 # Load model and tokenizer with token
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
@@ -55,6 +81,7 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model = model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
     token=hf_token
@@ -133,14 +160,43 @@ def redistribute_codes(code_list, snac_model):
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()
-# Main generation function - KEY CHANGES HERE
-@spaces.GPU(duration=60)  # Reduced duration
-def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=800, voice="Elise", progress=gr.Progress()):  # Reduced max tokens
     if not text.strip():
         return None
     try:
         progress(0.1, "Processing text...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
@@ -162,21 +218,26 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
         code_list = parse_output(generated_ids)
         if not code_list:
             return None
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         if audio_samples is None:
             return None
         return (24000, audio_samples)
     except Exception as e:
-        print(f"Error generating speech: {e}")
         return None
-# Examples - reduced to save quota
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
@@ -192,50 +253,92 @@ examples = [
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 # Create Gradio interface
-with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
     """)
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
-                label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
-                placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
                 lines=4,
-                max_lines=6  # Limited input size
             )
             # Advanced Settings
             with gr.Accordion("🔧 Advanced Settings", open=False):
                 with gr.Row():
                     temperature = gr.Slider(
                         minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                        label="Temperature"
                     )
                     top_p = gr.Slider(
                         minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                        label="Top P"
                     )
                 with gr.Row():
                     repetition_penalty = gr.Slider(
                         minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                        label="Repetition Penalty"
                     )
                     max_new_tokens = gr.Slider(
-                        minimum=100, maximum=800, value=800, step=50,  # Reduced max
-                        label="Max Length"
                     )
             with gr.Row():
-                submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
-                clear_btn = gr.Button("🗑️ Clear", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
@@ -245,14 +348,21 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
                 interactive=False
             )
-    # Examples with NO CACHE to save quota
     gr.Examples(
         examples=examples,
         inputs=[text_input],
         outputs=audio_output,
         fn=lambda text: generate_speech(text),
-        cache_examples=False,  # Important: no caching
-        label="📝 Example Texts"
     )
     # Set up event handlers
@@ -264,18 +374,30 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     )
     clear_btn.click(
-        fn=lambda: (None, None),
         inputs=[],
-        outputs=[text_input, audio_output]
     )
-# Launch with optimizations
 if __name__ == "__main__":
     demo.queue(
-        max_size=10,  # Reduced queue size
-        default_concurrency_limit=2  # Reduced concurrent users
     ).launch(
         share=False,
         show_error=True,
-        ssr_mode=False  # Important for quota
     )

 import os
+import time
+from functools import wraps
 import spaces
 from snac import SNAC
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import snapshot_download, login
 from dotenv import load_dotenv
 load_dotenv()
+# Rate limiting
+# Maps a caller key to the time.time() timestamp of its last accepted request.
+last_request_time = {}
+# Minimum number of seconds between two accepted requests for the same key.
+REQUEST_COOLDOWN = 30
+def rate_limit(func):
+    """Decorate *func* so it is invoked at most once per REQUEST_COOLDOWN seconds.
+
+    A call that arrives during the cooldown is not forwarded to *func*:
+    a ``gr.Warning`` with the remaining wait time is emitted and ``None``
+    is returned instead. Accepted calls return whatever *func* returns.
+
+    Args:
+        func: The callable to protect.
+
+    Returns:
+        A wrapper with the same metadata as *func* (via ``functools.wraps``).
+    """
+    import math  # local import: only needed to round the remaining wait up
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        # NOTE: every caller shares this single key, so the cooldown is
+        # global rather than per user.
+        user_id = "anonymous"
+        current_time = time.time()
+        previous_time = last_request_time.get(user_id)
+        if previous_time is not None:
+            time_since_last = current_time - previous_time
+            # A negative delta means the wall clock stepped backwards; treat
+            # the old timestamp as expired instead of locking callers out.
+            if 0 <= time_since_last < REQUEST_COOLDOWN:
+                # Round up so the message never says "wait 0 seconds" while
+                # the request is still being rejected.
+                remaining = math.ceil(REQUEST_COOLDOWN - time_since_last)
+                gr.Warning(f"Please wait {remaining} seconds before next request.")
+                return None
+        # The timestamp is recorded before func runs, so a failing call
+        # still starts a new cooldown window.
+        last_request_time[user_id] = current_time
+        return func(*args, **kwargs)
+    return wrapper
 # Get HF token from environment variables
 hf_token = os.getenv("HF_TOKEN")
 if hf_token:
     login(token=hf_token)
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
+print("SNAC model loaded successfully")
 model_name = "mrrtmob/tts-khm-kore"
+print(f"Downloading model files from {model_name}...")
 # Download only model config and safetensors with token
 snapshot_download(
     repo_id=model_name,
         "scheduler.pt"
     ]
 )
+print("Model files downloaded successfully")
+print("Loading main model...")
 # Load model and tokenizer with token
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
 )
 model = model.to(device)
+print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
     token=hf_token
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()
+# Text validation function
+def validate_text(text, max_length=200):
+    """Return *text* limited to at most *max_length* characters.
+
+    Args:
+        text: The user-supplied string; ``None`` is treated as empty text.
+        max_length: Maximum number of characters (code points) to keep.
+            Defaults to 200, the limit shown in the textbox label.
+
+    Returns:
+        *text* unchanged when it already fits, otherwise its first
+        *max_length* characters. An empty string for empty or ``None`` input.
+    """
+    if not text:
+        return ""
+    # Slicing is a no-op for strings that are already short enough.
+    return text[:max_length]
+# Text change handler
+def on_text_change(text):
+    """Clamp the textbox content to 200 characters and build the counter label.
+
+    Args:
+        text: Current textbox value; ``None`` is treated as empty text.
+
+    Returns:
+        A ``(text, info)`` tuple: the (possibly truncated) text to write back
+        into the textbox, and a ``"Characters: N/200"`` string for the
+        character counter. A ``gr.Warning`` is emitted when truncation occurs.
+    """
+    MAX_LENGTH = 200
+    # Normalise a missing value so len() below cannot fail.
+    text = text or ""
+    if len(text) > MAX_LENGTH:
+        text = text[:MAX_LENGTH]
+        gr.Warning(f"Text truncated to {MAX_LENGTH} characters")
+    # Return the (potentially truncated) text and update info
+    return text, f"Characters: {len(text)}/{MAX_LENGTH}"
+# Main generation function with rate limiting
+@rate_limit
+@spaces.GPU(duration=45)
+def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=800, voice="Elise", progress=gr.Progress()):
     if not text.strip():
+        gr.Warning("Please enter some text to generate speech.")
         return None
+    # Validate text length
+    text = validate_text(text)
     try:
         progress(0.1, "Processing text...")
+        print(f"Generating speech for text: {text[:50]}...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
         code_list = parse_output(generated_ids)
         if not code_list:
+            gr.Warning("Failed to generate valid audio codes.")
             return None
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         if audio_samples is None:
+            gr.Warning("Failed to convert codes to audio.")
             return None
+        print("Speech generation completed successfully")
         return (24000, audio_samples)
     except Exception as e:
+        error_msg = f"Error generating speech: {str(e)}"
+        print(error_msg)
+        gr.Error(error_msg)
         return None
+# Examples - reduced for quota management
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
+# Create custom CSS
+# Stylesheet handed to gr.Blocks(css=...). The class selectors are attached
+# further down through elem_classes (.generate-btn, .clear-btn, .char-counter)
+# and through the <div class="main-header"> wrapper in the header Markdown.
+css = """
+.gradio-container {
+    max-width: 1200px;
+    margin: auto;
+    padding-top: 1.5rem;
+}
+.main-header {
+    text-align: center;
+    margin-bottom: 2rem;
+}
+.generate-btn {
+    background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
+    border: none !important;
+    color: white !important;
+    font-weight: bold !important;
+}
+.clear-btn {
+    background: linear-gradient(45deg, #95A5A6, #BDC3C7) !important;
+    border: none !important;
+    color: white !important;
+}
+.char-counter {
+    font-size: 12px;
+    color: #666;
+    text-align: right;
+    margin-top: 5px;
+}
+"""
 # Create Gradio interface
+with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"""
+    <div class="main-header">
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
+    </div>
     """)
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
+                label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ) - Max 200 characters",
+                placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ... (អតិបរមា ២០០ តួអក្សរ)",
                 lines=4,
+                max_lines=6,
+                interactive=True
             )
+            # Character counter
+            char_info = gr.Markdown("Characters: 0/200", elem_classes=["char-counter"])
             # Advanced Settings
             with gr.Accordion("🔧 Advanced Settings", open=False):
                 with gr.Row():
                     temperature = gr.Slider(
                         minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                        label="Temperature",
+                        info="Higher values create more expressive speech"
                     )
                     top_p = gr.Slider(
                         minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                        label="Top P",
+                        info="Nucleus sampling threshold"
                     )
                 with gr.Row():
                     repetition_penalty = gr.Slider(
                         minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                        label="Repetition Penalty",
+                        info="Higher values discourage repetitive patterns"
                     )
                     max_new_tokens = gr.Slider(
+                        minimum=100, maximum=600, value=600, step=50,
+                        label="Max Length",
+                        info="Maximum length of generated audio"
                     )
             with gr.Row():
+                submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
+                clear_btn = gr.Button("🗑️ Clear", size="lg", elem_classes=["clear-btn"])
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 interactive=False
             )
+    # Set up examples (NO CACHE to save quota)
     gr.Examples(
         examples=examples,
         inputs=[text_input],
         outputs=audio_output,
         fn=lambda text: generate_speech(text),
+        cache_examples=False,
+        label="📝 Example Texts (អត្ថបទគំរូ)"
+    )
+    # Text change event handler
+    text_input.change(
+        fn=on_text_change,
+        inputs=[text_input],
+        outputs=[text_input, char_info]
     )
     # Set up event handlers
     )
     clear_btn.click(
+        fn=lambda: ("", "Characters: 0/200", None),
         inputs=[],
+        outputs=[text_input, char_info, audio_output]
+    )
+    # Add keyboard shortcut
+    text_input.submit(
+        fn=generate_speech,
+        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
+        outputs=audio_output,
+        show_progress=True
     )
+# Launch with embed-friendly optimizations
 if __name__ == "__main__":
+    print("Starting Gradio interface...")
     demo.queue(
+        max_size=3,  # Small queue for embeds
+        default_concurrency_limit=1  # One user at a time
     ).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
         share=False,
         show_error=True,
+        ssr_mode=False,
+        auth_message="Login to HuggingFace recommended for better GPU quota"
     )