mrrtmob committed
Commit 5193c5e · 1 Parent(s): 25f78d4
Files changed (1):
  1. app.py +47 -38
app.py CHANGED
@@ -1,20 +1,35 @@
+import os
 import spaces
 from snac import SNAC
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, login
 from dotenv import load_dotenv
+
 load_dotenv()
+
+# Get HF token from environment variables
+hf_token = os.getenv("HF_TOKEN")
+if hf_token:
+    login(token=hf_token)
+    print("Successfully logged in to Hugging Face")
+else:
+    print("Warning: HF_TOKEN not found in environment variables")
+
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
+
 model_name = "mrrtmob/tts-khm-kore"
-# Download only model config and safetensors
+
+# Download only model config and safetensors with token
 snapshot_download(
     repo_id=model_name,
+    token=hf_token,  # Add token here
     allow_patterns=[
         "config.json",
         "*.safetensors",
@@ -33,41 +48,47 @@ snapshot_download(
         "tokenizer.*"
     ]
 )
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+
+# Load model and tokenizer with token
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    token=hf_token  # Add token here
+)
 model.to(device)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    token=hf_token  # Add token here
+)
+
 print(f"Khmer TTS model loaded to {device}")
+
 # Process text prompt
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-
     start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
-
     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
-
     # No padding needed for single input
     attention_mask = torch.ones_like(modified_input_ids)
-
     return modified_input_ids.to(device), attention_mask.to(device)
+
 # Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
     token_to_remove = 128258
-
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
     if len(token_indices[1]) > 0:
         last_occurrence_idx = token_indices[1][-1].item()
         cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
     else:
         cropped_tensor = generated_ids
-
     processed_rows = []
     for row in cropped_tensor:
         masked_row = row[row != token_to_remove]
         processed_rows.append(masked_row)
-
     code_lists = []
     for row in processed_rows:
         row_length = row.size(0)
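The special-token framing in `process_prompt` (unchanged context in the hunk above) can be checked in isolation. A small sketch with a dummy `input_ids` tensor standing in for real tokenizer output; the IDs 128259, 128009, and 128260 are the ones used in the file:

```python
# Sketch of the SOH/EOT/EOH framing from process_prompt, with dummy text tokens.
import torch

input_ids = torch.tensor([[5, 6, 7]], dtype=torch.int64)           # hypothetical text tokens
start_token = torch.tensor([[128259]], dtype=torch.int64)          # Start of human
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)   # End of text, End of human

framed = torch.cat([start_token, input_ids, end_tokens], dim=1)
attention_mask = torch.ones_like(framed)                           # single input, no padding
print(framed)  # tensor([[128259, 5, 6, 7, 128009, 128260]])
```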
@@ -75,12 +96,11 @@ def parse_output(generated_ids):
         trimmed_row = row[:new_length]
         trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
-
     return code_lists[0]  # Return just the first one for single sample
+
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     device = next(snac_model.parameters()).device  # Get the device of SNAC model
-
     layer_1 = []
     layer_2 = []
     layer_3 = []
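For reference, the cropping that `parse_output` performs can be traced on a synthetic sequence. The line computing `new_length` falls in unshown context between the hunks, so the trim-to-whole-frames step below is an assumption inferred from the 7-way indexing in `redistribute_codes`:

```python
# Synthetic walk-through of parse_output's cropping (new_length is assumed).
import torch

generated_ids = torch.tensor([[1, 2, 128257,                  # marker 128257
                               128266, 128267, 128268, 128269,
                               128270, 128271, 128272,        # 7 audio code tokens
                               128258]])                      # EOS token 128258
idx = (generated_ids == 128257).nonzero(as_tuple=True)
row = generated_ids[:, idx[1][-1].item() + 1:][0]  # keep tokens after the last marker
row = row[row != 128258]                           # drop the EOS token
new_length = (row.size(0) // 7) * 7                # assumed: whole 7-token frames only
codes = [int(t) - 128266 for t in row[:new_length]]
print(codes)  # [0, 1, 2, 3, 4, 5, 6]
```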
@@ -92,26 +112,23 @@ def redistribute_codes(code_list, snac_model):
         layer_2.append(code_list[7*i+4]-(4*4096))
         layer_3.append(code_list[7*i+5]-(5*4096))
         layer_3.append(code_list[7*i+6]-(6*4096))
-
     # Move tensors to the same device as the SNAC model
     codes = [
         torch.tensor(layer_1, device=device).unsqueeze(0),
         torch.tensor(layer_2, device=device).unsqueeze(0),
         torch.tensor(layer_3, device=device).unsqueeze(0)
    ]
-
    audio_hat = snac_model.decode(codes)
    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
+
 # Main generation function
 @spaces.GPU()
 def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
     if not text.strip():
         return None
-
     try:
         progress(0.1, "Processing text...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
-
         progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
             generated_ids = model.generate(
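The offsets in `redistribute_codes` undo a flattened layout of 7 codes per frame, where the code in slot j of a frame is stored as code + j*4096. Only slots 4, 5, and 6 are visible in this hunk; the sketch below assumes the unshown slots follow the same pattern (slot 0 to layer 1; slots 1 and 4 to layer 2; slots 2, 3, 5, 6 to layer 3), which is consistent with the visible lines:

```python
# Hedged sketch of the assumed 7-codes-per-frame layout behind redistribute_codes.
frame = [10, 20, 30, 40, 50, 60, 70]                  # hypothetical raw SNAC codes
flat = [c + j * 4096 for j, c in enumerate(frame)]    # how one frame sits in code_list

layer_1 = [flat[0]]                                   # coarse layer: 1 code per frame
layer_2 = [flat[1] - 1 * 4096, flat[4] - 4 * 4096]    # mid layer: 2 codes per frame
layer_3 = [flat[j] - j * 4096 for j in (2, 3, 5, 6)]  # fine layer: 4 codes per frame
print(layer_1, layer_2, layer_3)  # [10] [20, 50] [30, 40, 60, 70]
```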
@@ -125,17 +142,15 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
                 num_return_sequences=1,
                 eos_token_id=128258,
             )
-
         progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
-
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
-
         return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
+
 # Examples for the UI - Khmer text examples
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],  # Hello, my name is Kiri, and I am an AI that can convert text into speech.
@@ -149,69 +164,64 @@ examples = [
     ["ខ្ញុំដើរទៅទិញអីញ៉ាំ ស្រាប់តែឃើញឆ្កែធំមួយរត់មករកខ្ញុំ។ <gasp> ខ្ញុំភ័យណាស់! តែវារត់ទៅបាត់វិញ។ <sigh>"],  # I was walking to buy something when suddenly I saw a big dog running towards me. <gasp> I was so scared! But then it ran away. <sigh>
     ["អរគុណច្រើនសម្រាប់ជំនួយ។ <chuckle> បើគ្មានអ្នកទេ ខ្ញុំមិនដឹងធ្វើយ៉ាងម៉េចទេ។"],  # Thank you so much for the help. <chuckle> Without you, I wouldn't know what to do.
 ]
+
 # Available voices (commented out for simpler UI)
 # VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
+
 # Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
+
 # Create Gradio interface
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
-
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
-
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
     """)
-
     text_input = gr.Textbox(
-        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
+        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
         placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
         lines=4
     )
-
     # Voice selector (commented out)
     # voice = gr.Dropdown(
-    #     choices=VOICES,
-    #     value="tara",
+    #     choices=VOICES,
+    #     value="tara",
     #     label="Voice (សំលេង)"
     # )
-
     # Advanced Settings
     with gr.Accordion("🔧 Advanced Settings", open=False):
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                label="Temperature",
+                label="Temperature",
                 info="Higher values create more expressive speech"
             )
             top_p = gr.Slider(
                 minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                label="Top P",
+                label="Top P",
                 info="Nucleus sampling threshold"
             )
         with gr.Row():
             repetition_penalty = gr.Slider(
                 minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                label="Repetition Penalty",
+                label="Repetition Penalty",
                 info="Higher values discourage repetitive patterns"
             )
             max_new_tokens = gr.Slider(
                 minimum=100, maximum=2000, value=1200, step=100,
-                label="Max Length",
+                label="Max Length",
                 info="Maximum length of generated audio"
             )
-
     with gr.Row():
         submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
         clear_btn = gr.Button("🗑️ Clear", size="lg")
-
     audio_output = gr.Audio(
-        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
+        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
         type="numpy",
         show_label=True
     )
-
     # Set up examples (NO CACHE)
     gr.Examples(
         examples=examples,
@@ -220,19 +230,18 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         fn=lambda text: generate_speech(text),
         cache_examples=False,
     )
-
     # Set up event handlers
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
-
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
+
 # Launch the app
 if __name__ == "__main__":
     demo.queue().launch(share=False)
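Two details of the wiring above are easy to miss: `submit_btn.click` passes only five inputs, which bind positionally to the first five parameters of `generate_speech`, so `voice` is never exposed in the UI and always keeps its default `"Elise"`; and on success the function returns `(24000, numpy_array)`, with `None` on failure. A hedged sketch of driving it directly, assuming the module's globals (`model`, `tokenizer`, `snac_model`) are loaded, that the `@spaces.GPU()` decorator and the `gr.Progress()` default are inert outside a running Space/UI, and that `soundfile` (not a dependency shown in this diff) is available:

```python
# Hypothetical direct call to generate_speech, bypassing the Gradio UI.
import soundfile as sf  # extra dependency, not part of this commit

result = generate_speech("ជំរាបសួរ", 0.6, 0.95, 1.1, 1200)  # "Hello"; voice stays "Elise"
if result is not None:
    sample_rate, audio = result                 # (24000, float numpy array)
    sf.write("output.wav", audio, sample_rate)  # save the 24 kHz waveform
```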
 