Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed 25 days ago

Commit

c0c6352

1 Parent(s): 5193c5e

Update requirements to include necessary dependencies

Browse files

Files changed (1) hide show

app.py +185 -75

app.py CHANGED Viewed

@@ -4,65 +4,83 @@ from snac import SNAC
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import snapshot_download, login
 from dotenv import load_dotenv
 load_dotenv()
 # Get HF token from environment variables
 hf_token = os.getenv("HF_TOKEN")
 if hf_token:
     login(token=hf_token)
-    print("Successfully logged in to Hugging Face")
 else:
     print("Warning: HF_TOKEN not found in environment variables")
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
 model_name = "mrrtmob/tts-khm-kore"
 # Download only model config and safetensors with token
 snapshot_download(
     repo_id=model_name,
-    token=hf_token,  # Add token here
     allow_patterns=[
         "config.json",
         "*.safetensors",
         "model.safetensors.index.json",
     ],
     ignore_patterns=[
         "optimizer.pt",
         "pytorch_model.bin",
         "training_args.bin",
-        "scheduler.pt",
-        "tokenizer.json",
-        "tokenizer_config.json",
-        "special_tokens_map.json",
-        "vocab.json",
-        "merges.txt",
-        "tokenizer.*"
     ]
 )
 # Load model and tokenizer with token
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
-    token=hf_token  # Add token here
 )
-model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
-    token=hf_token  # Add token here
 )
-print(f"Khmer TTS model loaded to {device}")
 # Process text prompt
 def process_prompt(prompt, voice, tokenizer, device):
@@ -96,22 +114,37 @@ def parse_output(generated_ids):
         trimmed_row = row[:new_length]
         trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
-    return code_lists[0]  # Return just the first one for single sample
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     device = next(snac_model.parameters()).device  # Get the device of SNAC model
     layer_1 = []
     layer_2 = []
     layer_3 = []
     for i in range((len(code_list)+1)//7):
-        layer_1.append(code_list[7*i])
-        layer_2.append(code_list[7*i+1]-4096)
-        layer_3.append(code_list[7*i+2]-(2*4096))
-        layer_3.append(code_list[7*i+3]-(3*4096))
-        layer_2.append(code_list[7*i+4]-(4*4096))
-        layer_3.append(code_list[7*i+5]-(5*4096))
-        layer_3.append(code_list[7*i+6]-(6*4096))
     # Move tensors to the same device as the SNAC model
     codes = [
         torch.tensor(layer_1, device=device).unsqueeze(0),
@@ -122,13 +155,18 @@ def redistribute_codes(code_list, snac_model):
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 # Main generation function
-@spaces.GPU()
 def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
     if not text.strip():
         return None
     try:
         progress(0.1, "Processing text...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
             generated_ids = model.generate(
@@ -141,27 +179,43 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=128258,
             )
         progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
-        print(f"Error generating speech: {e}")
         return None
 # Examples for the UI - Khmer text examples
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
-    ["ម្សិលមិញ ខ្ញុំឃើញឆ្មាមួយក្បាលដេញចាប់កន្ទុយខ្លួនឯង។ <laugh> វាគួរឲ្យអស់សំណើចណាស់។"], # Yesterday, I saw a cat chasing its own tail. <laugh> It was so funny.
-    ["ខ្ញុំរៀបចំម្ហូប ស្រាប់តែធ្វើជ្រុះគ្រឿងទេសពេញឥដ្ឋ។ <chuckle> វាប្រឡាក់អស់ហើយ។"], # I was preparing food when suddenly I dropped spices all over the floor. <chuckle> It's all messed up.
-    ["ថ្ងៃនេះហត់ណាស់ ធ្វើការពេញមួយថ្ងៃ។ <sigh> ចង់ទៅផ្ទះសម្រាកហើយ។"], # So tired today, worked all day. <sigh> Want to go home and rest now.
-    ["អាកាសធាតុត្រជាក់ ធ្វើឲ្យខ្ញុំផ្តាសាយតិចៗ។ <sniffle> ខ្ញុំក៏ក្អកដែរ។ <cough>"], # The cold weather made me get a bit of a cold. <sniffle> I also cough. <cough>
-    ["ការប្រឡងមិនបានល្អដូចការរំពឹងទុកទេ។ <groan> ខ្ញុំត្រូវរៀនឲ្យខ្លាំងជាងនេះ។"], # The exam didn't go as well as expected. <groan> I need to study harder.
-    ["កិច្ចប្រជុំនេះវែងអន្លាយពេកហើយ។ <yawn> ខ្ញុំចាប់ផ្តើមងងុយគេងហើយ។"], # This meeting is too long. <yawn> I'm starting to get sleepy.
-    ["ខ្ញុំដើរទៅទិញអីញ៉ាំ ស្រាប់តែឃើញឆ្កែធំមួយរត់មករកខ្ញុំ។ <gasp> ខ្ញុំភ័យណាស់! តែវារត់ទៅបាត់វិញ។ <sigh>"], # I was walking to buy something when suddenly I saw a big dog running towards me. <gasp> I was so scared! But then it ran away. <sigh>
     ["អរគុណច្រើនសម្រាប់ជំនួយ។ <chuckle> បើគ្មានអ្នកទេ ខ្ញុំមិនដឹងធ្វើយ៉ាងម៉េចទេ។"],
 ]
@@ -171,57 +225,91 @@ examples = [
 # Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 # Create Gradio interface
-with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
     """)
-    text_input = gr.Textbox(
-        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
-        placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
-        lines=4
-    )
-    # Voice selector (commented out)
-    # voice = gr.Dropdown(
-    #     choices=VOICES,
-    #     value="tara",
-    #     label="Voice (សំលេង)"
-    # )
-    # Advanced Settings
-    with gr.Accordion("🔧 Advanced Settings", open=False):
-        with gr.Row():
-            temperature = gr.Slider(
-                minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                label="Temperature",
-                info="Higher values create more expressive speech"
-            )
-            top_p = gr.Slider(
-                minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                label="Top P",
-                info="Nucleus sampling threshold"
-            )
-        with gr.Row():
-            repetition_penalty = gr.Slider(
-                minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                label="Repetition Penalty",
-                info="Higher values discourage repetitive patterns"
             )
-            max_new_tokens = gr.Slider(
-                minimum=100, maximum=2000, value=1200, step=100,
-                label="Max Length",
-                info="Maximum length of generated audio"
             )
-    with gr.Row():
-        submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
-        clear_btn = gr.Button("🗑️ Clear", size="lg")
-    audio_output = gr.Audio(
-        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
-        type="numpy",
-        show_label=True
-    )
     # Set up examples (NO CACHE)
     gr.Examples(
         examples=examples,
@@ -229,19 +317,41 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         outputs=audio_output,
         fn=lambda text: generate_speech(text),
         cache_examples=False,
     )
     # Set up event handlers
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
-        outputs=audio_output
     )
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
-    demo.queue().launch(share=False)

 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import snapshot_download, login, whoami
 from dotenv import load_dotenv
 load_dotenv()
 # Get HF token from environment variables
 hf_token = os.getenv("HF_TOKEN")
+# Debug and authentication
+print("=== DEBUG INFO ===")
+print(f"HF_TOKEN exists: {bool(hf_token)}")
 if hf_token:
     login(token=hf_token)
+    try:
+        user_info = whoami(token=hf_token)
+        print(f"Successfully logged in as: {user_info.get('name', 'Unknown')}")
+        print(f"User type: {user_info.get('type', 'Unknown')}")
+        print(f"User ID: {user_info.get('id', 'Unknown')}")
+    except Exception as e:
+        print(f"Authentication error: {e}")
 else:
     print("Warning: HF_TOKEN not found in environment variables")
+print("=== END DEBUG ===")
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
+print("SNAC model loaded successfully")
 model_name = "mrrtmob/tts-khm-kore"
+print(f"Downloading model files from {model_name}...")
 # Download only model config and safetensors with token
 snapshot_download(
     repo_id=model_name,
+    token=hf_token,
     allow_patterns=[
         "config.json",
         "*.safetensors",
         "model.safetensors.index.json",
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "special_tokens_map.json",
+        "vocab.json",
+        "merges.txt"
     ],
     ignore_patterns=[
         "optimizer.pt",
         "pytorch_model.bin",
         "training_args.bin",
+        "scheduler.pt"
     ]
 )
+print("Model files downloaded successfully")
+print("Loading main model...")
 # Load model and tokenizer with token
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
+    token=hf_token,
+    device_map="auto"
 )
+print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
+    token=hf_token
 )
+print(f"Khmer TTS model loaded successfully to {device}")
 # Process text prompt
 def process_prompt(prompt, voice, tokenizer, device):
         trimmed_row = row[:new_length]
         trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
+    return code_lists[0] if code_lists else []  # Return just the first one for single sample
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
+    if not code_list:
+        return None
     device = next(snac_model.parameters()).device  # Get the device of SNAC model
     layer_1 = []
     layer_2 = []
     layer_3 = []
     for i in range((len(code_list)+1)//7):
+        if 7*i < len(code_list):
+            layer_1.append(code_list[7*i])
+        if 7*i+1 < len(code_list):
+            layer_2.append(code_list[7*i+1]-4096)
+        if 7*i+2 < len(code_list):
+            layer_3.append(code_list[7*i+2]-(2*4096))
+        if 7*i+3 < len(code_list):
+            layer_3.append(code_list[7*i+3]-(3*4096))
+        if 7*i+4 < len(code_list):
+            layer_2.append(code_list[7*i+4]-(4*4096))
+        if 7*i+5 < len(code_list):
+            layer_3.append(code_list[7*i+5]-(5*4096))
+        if 7*i+6 < len(code_list):
+            layer_3.append(code_list[7*i+6]-(6*4096))
+    if not layer_1:
+        return None
     # Move tensors to the same device as the SNAC model
     codes = [
         torch.tensor(layer_1, device=device).unsqueeze(0),
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 # Main generation function
+@spaces.GPU(duration=120)
 def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
     if not text.strip():
+        gr.Warning("Please enter some text to generate speech.")
         return None
     try:
         progress(0.1, "Processing text...")
+        print(f"Generating speech for text: {text[:50]}...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
             generated_ids = model.generate(
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=128258,
+                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.pad_token_id
             )
         progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
+        if not code_list:
+            gr.Warning("Failed to generate valid audio codes.")
+            return None
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
+        if audio_samples is None:
+            gr.Warning("Failed to convert codes to audio.")
+            return None
+        print("Speech generation completed successfully")
         return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
+        error_msg = f"Error generating speech: {str(e)}"
+        print(error_msg)
+        gr.Error(error_msg)
         return None
 # Examples for the UI - Khmer text examples
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
+    ["ម្សិលមិញ ខ្ញុំឃើញឆ្មាមួយក្បាលដេញចាប់កន្ទុយខ្លួនឯង។ <laugh> វាគួរឲ្យអស់សំណើចណាស់។"],
+    ["ខ្ញុំរៀបចំម្ហូប ស្រាប់តែធ្វើជ្រុះគ្រឿងទេសពេញឥដ្ឋ។ <chuckle> វាប្រឡាក់អស់ហើយ។"],
+    ["ថ្ងៃនេះហត់ណាស់ ធ្វើការពេញមួយថ្ងៃ។ <sigh> ចង់ទៅផ្ទះសម្រាកហើយ។"],
+    ["អាកាសធាតុត្រជាក់ ធ្វើឲ្យខ្ញុំផ្តាសាយតិចៗ។ <sniffle> ខ្ញុំក៏ក្អកដែរ។ <cough>"],
+    ["ការប្រឡងមិនបានល្អដូចការរំពឹងទុកទេ។ <groan> ខ្ញុំត្រូវរៀនឲ្យខ្លាំងជាងនេះ។"],
+    ["កិច្ចប្រជុំនេះវែងអន្លាយពេកហើយ។ <yawn> ខ្ញុំចាប់ផ្តើមងងុយគេងហើយ។"],
+    ["ខ្ញុំដើរទៅទិញអីញ៉ាំ ស្រាប់តែឃើញឆ្កែធំមួយរត់មករកខ្ញុំ។ <gasp> ខ្ញុំភ័យណាស់! តែវារត់ទៅបាត់វិញ។ <sigh>"],
     ["អរគុណច្រើនសម្រាប់ជំនួយ។ <chuckle> បើគ្មានអ្នកទេ ខ្ញុំមិនដឹងធ្វើយ៉ាងម៉េចទេ។"],
 ]
 # Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
+# Create custom CSS
+css = """
+.gradio-container {
+    max-width: 1200px;
+    margin: auto;
+    padding-top: 1.5rem;
+}
+.main-header {
+    text-align: center;
+    margin-bottom: 2rem;
+}
+.generate-btn {
+    background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
+    border: none !important;
+    color: white !important;
+    font-weight: bold !important;
+}
+.clear-btn {
+    background: linear-gradient(45deg, #95A5A6, #BDC3C7) !important;
+    border: none !important;
+    color: white !important;
+}
+"""
 # Create Gradio interface
+with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"""
+    <div class="main-header">
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
+    </div>
     """)
+    with gr.Row():
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(
+                label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
+                placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
+                lines=4,
+                max_lines=8
             )
+            # Advanced Settings
+            with gr.Accordion("🔧 Advanced Settings", open=False):
+                with gr.Row():
+                    temperature = gr.Slider(
+                        minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                        label="Temperature",
+                        info="Higher values create more expressive speech"
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                        label="Top P",
+                        info="Nucleus sampling threshold"
+                    )
+                with gr.Row():
+                    repetition_penalty = gr.Slider(
+                        minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                        label="Repetition Penalty",
+                        info="Higher values discourage repetitive patterns"
+                    )
+                    max_new_tokens = gr.Slider(
+                        minimum=100, maximum=2000, value=1200, step=100,
+                        label="Max Length",
+                        info="Maximum length of generated audio"
+                    )
+            with gr.Row():
+                submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
+                clear_btn = gr.Button("🗑️ Clear", size="lg", elem_classes=["clear-btn"])
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(
+                label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
+                type="numpy",
+                show_label=True,
+                interactive=False
             )
     # Set up examples (NO CACHE)
     gr.Examples(
         examples=examples,
         outputs=audio_output,
         fn=lambda text: generate_speech(text),
         cache_examples=False,
+        label="📝 Example Texts (អត្ថបទគំរូ)"
     )
     # Set up event handlers
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
+        outputs=audio_output,
+        show_progress=True
     )
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
+    # Add keyboard shortcut
+    text_input.submit(
+        fn=generate_speech,
+        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
+        outputs=audio_output,
+        show_progress=True
+    )
 # Launch the app
 if __name__ == "__main__":
+    print("Starting Gradio interface...")
+    demo.queue(
+        max_size=20,
+        default_concurrency_limit=5
+    ).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True,
+        quiet=False
+    )