Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed on Jul 8

Commit

8a44aae

1 Parent(s): 8f1133d

tts-khm-1

Browse files

Files changed (1) hide show

app.py +17 -1

app.py CHANGED Viewed

@@ -6,12 +6,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
 model_name = "mrrtmob/tts-khm-1"
 # Download only model config and safetensors
 snapshot_download(
     repo_id=model_name,
@@ -33,10 +36,12 @@ snapshot_download(
         "tokenizer.*"
     ]
 )
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
 model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 print(f"Khmer TTS model loaded to {device}")
 # Process text prompt
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
@@ -51,6 +56,7 @@ def process_prompt(prompt, voice, tokenizer, device):
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
 # Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
@@ -62,10 +68,12 @@ def parse_output(generated_ids):
         cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
     else:
         cropped_tensor = generated_ids
     processed_rows = []
     for row in cropped_tensor:
         masked_row = row[row != token_to_remove]
         processed_rows.append(masked_row)
     code_lists = []
     for row in processed_rows:
         row_length = row.size(0)
@@ -75,6 +83,7 @@ def parse_output(generated_ids):
         code_lists.append(trimmed_row)
     return code_lists[0]  # Return just the first one for single sample
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     device = next(snac_model.parameters()).device  # Get the device of SNAC model
@@ -100,6 +109,7 @@ def redistribute_codes(code_list, snac_model):
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 # Main generation function
 @spaces.GPU()
 def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
@@ -134,6 +144,7 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
 # Examples for the UI - Khmer text examples
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា ហើយខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។", "tara", 0.6, 0.95, 1.1, 1200],
@@ -145,10 +156,13 @@ examples = [
     ["តើអ្នកបានឮរឿងកំប្លែងនេះយ៉ាងណា? <laugh> ខ្ញុំមិនអាចបញ្ឈប់ការសើចបាននោះទេ។", "zac", 0.7, 0.95, 1.1, 1200],
     ["បន្ទាប់ពីរត់ម៉ារ៉ាតុងរួច ខ្ញុំហត់ណាស់ <yawn> ហើយត្រូវការសម្រាក។", "zoe", 0.6, 0.95, 1.1, 1200]
 ]
 # Available voices
 VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
 # Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 # Create Gradio interface
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
@@ -163,6 +177,7 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     - អត្ថបទវែងជាទូទៅមានលទ្ធផលល្អជាងអត្ថបទខ្លី
     - Increasing `repetition_penalty` and `temperature` makes the model speak faster
     """)
     with gr.Row():
         with gr.Column(scale=3):
             text_input = gr.Textbox(
@@ -226,6 +241,7 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
-    demo.queue().launch(share=False, ssr_mode=False)

 from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
 model_name = "mrrtmob/tts-khm-1"
 # Download only model config and safetensors
 snapshot_download(
     repo_id=model_name,
         "tokenizer.*"
     ]
 )
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
 model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 print(f"Khmer TTS model loaded to {device}")
 # Process text prompt
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
 # Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
         cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
     else:
         cropped_tensor = generated_ids
     processed_rows = []
     for row in cropped_tensor:
         masked_row = row[row != token_to_remove]
         processed_rows.append(masked_row)
     code_lists = []
     for row in processed_rows:
         row_length = row.size(0)
         code_lists.append(trimmed_row)
     return code_lists[0]  # Return just the first one for single sample
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     device = next(snac_model.parameters()).device  # Get the device of SNAC model
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 # Main generation function
 @spaces.GPU()
 def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
 # Examples for the UI - Khmer text examples
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា ហើយខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។", "tara", 0.6, 0.95, 1.1, 1200],
     ["តើអ្នកបានឮរឿងកំប្លែងនេះយ៉ាងណា? <laugh> ខ្ញុំមិនអាចបញ្ឈប់ការសើចបាននោះទេ។", "zac", 0.7, 0.95, 1.1, 1200],
     ["បន្ទាប់ពីរត់ម៉ារ៉ាតុងរួច ខ្ញុំហត់ណាស់ <yawn> ហើយត្រូវការសម្រាក។", "zoe", 0.6, 0.95, 1.1, 1200]
 ]
 # Available voices
 VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
 # Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 # Create Gradio interface
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     - អត្ថបទវែងជាទូទៅមានលទ្ធផលល្អជាងអត្ថបទខ្លី
     - Increasing `repetition_penalty` and `temperature` makes the model speak faster
     """)
     with gr.Row():
         with gr.Column(scale=3):
             text_input = gr.Textbox(
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
+    demo.queue().launch(share=False)