Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed on Jul 8

Commit

0100eae

1 Parent(s): 8a44aae

Update Khmer TTS model to version 2 and simplify UI components

Browse files

Files changed (1) hide show

app.py +42 -81

app.py CHANGED Viewed

@@ -6,15 +6,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
-model_name = "mrrtmob/tts-khm-1"
 # Download only model config and safetensors
 snapshot_download(
     repo_id=model_name,
@@ -36,12 +33,10 @@ snapshot_download(
         "tokenizer.*"
     ]
 )
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
 model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 print(f"Khmer TTS model loaded to {device}")
 # Process text prompt
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
@@ -56,7 +51,6 @@ def process_prompt(prompt, voice, tokenizer, device):
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
 # Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
@@ -83,7 +77,6 @@ def parse_output(generated_ids):
         code_lists.append(trimmed_row)
     return code_lists[0]  # Return just the first one for single sample
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     device = next(snac_model.parameters()).device  # Get the device of SNAC model
@@ -109,10 +102,9 @@ def redistribute_codes(code_list, snac_model):
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 # Main generation function
 @spaces.GPU()
-def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
     if not text.strip():
         return None
@@ -144,95 +136,65 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
-# Examples for the UI - Khmer text examples
 examples = [
-    ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា ហើយខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។", "tara", 0.6, 0.95, 1.1, 1200],
-    ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។", "dan", 0.7, 0.95, 1.1, 1200],
-    ["ខ្ញុំរស់នៅក្នុងទីក្រុងភ្នំពេញ ហើយមានប៉ារ៉ាម៉ែត្រ <gasp> ច្រើនណាស់។", "leah", 0.6, 0.9, 1.2, 1200],
-    ["ពេលខ្លះ ពេលខ្ញុំនិយាយច្រើនពេក ខ្ញុំត្រូវ <cough> សុំទោស។", "leo", 0.65, 0.9, 1.1, 1200],
-    ["ការនិយាយនៅចំពោះមុខសាធារណៈ អាចមានការពិបាក។ <groan> ប៉ុន្តែបើហាត់ហាន គេអាចធ្វើបាន។", "jess", 0.7, 0.95, 1.1, 1200],
-    ["ការឡើងភ្នំពិតជាហត់ណត់ ប៉ុន្តែទេសភាពពីលើនេះ ពិតជាស្រស់ស្អាត! <sigh> គួរឱ្យធ្វើ។", "mia", 0.65, 0.9, 1.15, 1200],
-    ["តើអ្នកបានឮរឿងកំប្លែងនេះយ៉ាងណា? <laugh> ខ្ញុំមិនអាចបញ្ឈប់ការសើចបាននោះទេ។", "zac", 0.7, 0.95, 1.1, 1200],
-    ["បន្ទាប់ពីរត់ម៉ារ៉ាតុងរួច ខ្ញុំហត់ណាស់ <yawn> ហើយត្រូវការសម្រាក។", "zoe", 0.6, 0.95, 1.1, 1200]
 ]
-# Available voices
-VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
 # Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
-# Create Gradio interface
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
-    # 🎵 Khmer Text-to-Speech (ម៉ូដែលបម្លែងអត្ថបទជាសំលេង)
-    Enter your Khmer text below and hear it converted to natural-sounding speech.
-    បញ្ចូលអត្ថបទខ្មែររបស់អ្នកខាងក្រោម ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយធម្មជាតិ។
-    ## Tips for better prompts (គន្លឹះសម្រាប់ការប្រើប្រាស់ដ៏ល្អ):
-    - Add paralinguistic elements like {", ".join(EMOTIVE_TAGS)} for more human-like speech
-    - Longer text prompts generally work better than very short phrases
-    - អត្ថបទវែងជាទូទៅមានលទ្ធផលល្អជាងអត្ថបទខ្លី
-    - Increasing `repetition_penalty` and `temperature` makes the model speak faster
     """)
     with gr.Row():
-        with gr.Column(scale=3):
-            text_input = gr.Textbox(
-                label="Text to speak (អត្ថបទដើម្បីនិយាយ)",
-                placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
-                lines=5
-            )
-            voice = gr.Dropdown(
-                choices=VOICES,
-                value="tara",
-                label="Voice (សំលេង)"
-            )
-            with gr.Accordion("Advanced Settings (ការកំណត់កម្រិតខ្ពស់)", open=False):
-                temperature = gr.Slider(
-                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                    label="Temperature",
-                    info="Higher values (0.7-1.0) create more expressive but less stable speech"
-                )
-                top_p = gr.Slider(
-                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                    label="Top P",
-                    info="Nucleus sampling threshold"
-                )
-                repetition_penalty = gr.Slider(
-                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                    label="Repetition Penalty",
-                    info="Higher values discourage repetitive patterns"
-                )
-                max_new_tokens = gr.Slider(
-                    minimum=100, maximum=2000, value=1200, step=100,
-                    label="Max Length",
-                    info="Maximum length of generated audio (in tokens)"
-                )
-            with gr.Row():
-                submit_btn = gr.Button("Generate Speech (បង្កើតសំលេង)", variant="primary")
-                clear_btn = gr.Button("Clear (លុប)")
-        with gr.Column(scale=2):
-            audio_output = gr.Audio(label="Generated Speech (សំលេងដែលបង្កើតឡើង)", type="numpy")
-    # Set up examples
     gr.Examples(
         examples=examples,
-        inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output,
-        fn=generate_speech,
         cache_examples=True,
     )
     # Set up event handlers
     submit_btn.click(
-        fn=generate_speech,
-        inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
@@ -241,7 +203,6 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
     demo.queue().launch(share=False)

 from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
+model_name = "mrrtmob/tts-khm-2"
 # Download only model config and safetensors
 snapshot_download(
     repo_id=model_name,
         "tokenizer.*"
     ]
 )
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
 model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 print(f"Khmer TTS model loaded to {device}")
 # Process text prompt
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
 # Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
         code_lists.append(trimmed_row)
     return code_lists[0]  # Return just the first one for single sample
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     device = next(snac_model.parameters()).device  # Get the device of SNAC model
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 # Main generation function
 @spaces.GPU()
+def generate_speech(text, voice="tara", temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, progress=gr.Progress()):
     if not text.strip():
         return None
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
+# Examples for the UI - Khmer text examples (simplified)
 examples = [
+    ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា ហើយខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។"],
+    ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។"],
+    ["ខ្ញុំរស់នៅក្នុងទីក្រុងភ្នំពេញ ហើយមានប៉ារ៉ាម៉ែត្រ <gasp> ច្រើនណាស់។"],
+    ["ពេលខ្លះ ពេលខ្ញុំនិយាយច្រើនពេក ខ្ញុំត្រូវ <cough> សុំទោស។"],
+    ["ការនិយាយនៅចំពោះមុខសាធារណៈ អាចមានការពិបាក។ <groan> ប៉ុន្តែបើហាត់ហាន គេអាចធ្វើបាន។"],
 ]
+# Available voices (commented out for simpler UI)
+# VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
 # Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
+# Create Gradio interface (simplified)
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
+    # 🎵 Khmer Text-to-Speech
+    **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
+    បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
+    💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
     """)
+    text_input = gr.Textbox(
+        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
+        placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
+        lines=4
+    )
+    # Voice selector (commented out)
+    # voice = gr.Dropdown(
+    #     choices=VOICES,
+    #     value="tara",
+    #     label="Voice (សំលេង)"
+    # )
     with gr.Row():
+        submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
+        clear_btn = gr.Button("🗑️ Clear", size="lg")
+    audio_output = gr.Audio(
+        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
+        type="numpy",
+        show_label=True
+    )
+    # Set up examples (simplified)
     gr.Examples(
         examples=examples,
+        inputs=[text_input],
         outputs=audio_output,
+        fn=lambda text: generate_speech(text),
         cache_examples=True,
     )
     # Set up event handlers
     submit_btn.click(
+        fn=lambda text: generate_speech(text),
+        inputs=[text_input],
         outputs=audio_output
     )
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
     demo.queue().launch(share=False)