Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Community

mrrtmob committed on Jul 9

Commit

2938eff

1 Parent(s): 1deb3e2

Add Hugging Face authentication and improve speech generation settings

Browse files

Files changed (1) hide show

app.py +62 -70

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ from snac import SNAC
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 import os
 import re
@@ -11,9 +11,28 @@ import numpy as np
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 # Global variables to store models
 snac_model = None
@@ -29,7 +48,6 @@ def load_models():
     model_name = "mrrtmob/tts-khm-4"
-    # Download specific files
     print("Downloading model files...")
     snapshot_download(
         repo_id=model_name,
@@ -52,7 +70,6 @@ def load_models():
     )
     print("Loading main model...")
-    # Simplified model loading without device_map
     if device == "cuda":
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
@@ -78,24 +95,19 @@ def load_models():
 load_models()
 def split_text_by_punctuation(text, max_chars=200):
-    """Split text by punctuation marks, keeping sentences together when possible"""
-    # Khmer and common punctuation
     sentence_endings = r'[។!?]'
     clause_separators = r'[,;:]'
-    # First try to split by sentence endings
     sentences = re.split(f'({sentence_endings})', text)
-    # Recombine sentences with their punctuation
     combined_sentences = []
     for i in range(0, len(sentences), 2):
         sentence = sentences[i]
         if i + 1 < len(sentences):
-            sentence += sentences[i + 1]  # Add the punctuation back
         if sentence.strip():
             combined_sentences.append(sentence.strip())
-    # If no sentence endings found, split by clauses
     if len(combined_sentences) <= 1:
         parts = re.split(f'({clause_separators})', text)
         combined_sentences = []
@@ -106,13 +118,11 @@ def split_text_by_punctuation(text, max_chars=200):
             if part.strip():
                 combined_sentences.append(part.strip())
-    # Further split if sentences are too long
     final_chunks = []
     for sentence in combined_sentences:
         if len(sentence) <= max_chars:
             final_chunks.append(sentence)
         else:
-            # Split long sentences by words
             words = sentence.split()
             current_chunk = ""
@@ -131,10 +141,8 @@ def split_text_by_punctuation(text, max_chars=200):
     return [chunk for chunk in final_chunks if chunk.strip()]
 def split_text_by_tokens(text, max_tokens=150):
-    """Split text by token count"""
     global tokenizer
-    # Tokenize the entire text first
     tokens = tokenizer.encode(text)
     if len(tokens) <= max_tokens:
@@ -197,7 +205,7 @@ def parse_output(generated_ids):
 def redistribute_codes(code_list, snac_model):
     if not code_list or len(code_list) < 7:
-        return np.zeros(12000)  # 0.5 seconds of silence
     device = next(snac_model.parameters()).device
     layer_1 = []
@@ -227,8 +235,27 @@ def redistribute_codes(code_list, snac_model):
         print(f"Error in redistribute_codes: {e}")
         return np.zeros(12000)
-@spaces.GPU(duration=120)
-def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=800, voice="Elise"):
     """Generate speech for a single chunk"""
     global model, tokenizer, snac_model
@@ -265,30 +292,8 @@ def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_pe
         print(f"Error generating speech chunk: {e}")
         return np.array([])
-def combine_audio_chunks(audio_chunks, pause_duration=0.3):
-    """Combine audio chunks with pauses between them"""
-    if not audio_chunks:
-        return np.array([])
-    # Create pause (silence)
-    pause_samples = int(24000 * pause_duration)  # 24kHz sample rate
-    pause = np.zeros(pause_samples)
-    combined_audio = []
-    for i, chunk in enumerate(audio_chunks):
-        if len(chunk) > 0:
-            combined_audio.append(chunk)
-            # Add pause between chunks (except after the last chunk)
-            if i < len(audio_chunks) - 1:
-                combined_audio.append(pause)
-    if combined_audio:
-        return np.concatenate(combined_audio)
-    else:
-        return np.array([])
-def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=800,
-                   voice="Elise", split_method="punctuation", max_chars=200, max_tokens=150,
                    pause_duration=0.3, progress=gr.Progress()):
     """Main function to generate speech with text splitting"""
@@ -296,14 +301,13 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
         return None
     try:
-        # Split text based on selected method
         progress(0.05, "Splitting text...")
         if split_method == "punctuation":
             text_chunks = split_text_by_punctuation(text, max_chars)
         elif split_method == "tokens":
             text_chunks = split_text_by_tokens(text, max_tokens)
-        else:  # "none"
             text_chunks = [text]
         progress(0.1, f"Processing {len(text_chunks)} chunks...")
@@ -311,7 +315,6 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
         for i, chunk in enumerate(text_chunks):
             print(f"Chunk {i+1}: {chunk[:50]}...")
-        # Generate audio for each chunk
         audio_chunks = []
         for i, chunk in enumerate(text_chunks):
             progress(0.1 + 0.7 * (i / len(text_chunks)), f"Generating chunk {i+1}/{len(text_chunks)}...")
@@ -327,7 +330,6 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
         if not audio_chunks:
             return None
-        # Combine all audio chunks
         progress(0.9, "Combining audio chunks...")
         final_audio = combine_audio_chunks(audio_chunks, pause_duration)
@@ -342,19 +344,20 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
         traceback.print_exc()
         return None
-# Examples
 examples = [
-    ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។"],
 ]
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
-# Create Gradio interface
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
     ✨ **New**: Supports long text with automatic splitting!
@@ -366,7 +369,6 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         lines=6
     )
-    # Text splitting options
     with gr.Accordion("📝 Text Splitting Options", open=True):
         split_method = gr.Radio(
             choices=[
@@ -375,51 +377,42 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
                 ("No splitting", "none")
             ],
             value="punctuation",
-            label="Text splitting method",
-            info="For long texts, splitting helps avoid the 15s limit"
         )
         with gr.Row():
             max_chars = gr.Slider(
-                minimum=50, maximum=500, value=200, step=25,
-                label="Max characters per chunk (punctuation mode)",
-                info="Shorter chunks = more natural breaks but more processing time"
             )
             max_tokens = gr.Slider(
-                minimum=50, maximum=300, value=150, step=25,
-                label="Max tokens per chunk (token mode)",
-                info="Controls chunk size based on model tokenization"
             )
         pause_duration = gr.Slider(
             minimum=0.0, maximum=1.0, value=0.3, step=0.1,
-            label="Pause between chunks (seconds)",
-            info="Silence duration between text chunks"
         )
-    # Advanced Settings
     with gr.Accordion("🔧 Advanced Settings", open=False):
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                label="Temperature",
-                info="Higher values create more expressive speech"
             )
             top_p = gr.Slider(
                 minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                label="Top P",
-                info="Nucleus sampling threshold"
             )
         with gr.Row():
             repetition_penalty = gr.Slider(
                 minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                label="Repetition Penalty",
-                info="Higher values discourage repetitive patterns"
             )
             max_new_tokens = gr.Slider(
-                minimum=100, maximum=1200, value=800, step=100,
-                label="Max tokens per chunk",
-                info="Lower values for shorter, more reliable generation"
             )
     with gr.Row():
@@ -453,9 +446,8 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         outputs=[text_input, audio_output]
     )
-# Launch the app
 if __name__ == "__main__":
-    demo.queue(max_size=10).launch(
         share=False,
         server_name="0.0.0.0",
         server_port=7860

 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import snapshot_download, login
 from dotenv import load_dotenv
 import os
 import re
 load_dotenv()
+# Set up Hugging Face authentication
+def setup_auth():
+    hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+    if hf_token:
+        try:
+            login(token=hf_token, add_to_git_credential=False)
+            print("✅ Successfully logged in to Hugging Face")
+            return True
+        except Exception as e:
+            print(f"⚠️ Failed to login to Hugging Face: {e}")
+            return False
+    else:
+        print("⚠️ No HF token found. Running as anonymous user.")
+        return False
+# Set up authentication before anything else
+auth_success = setup_auth()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
+print(f"Authentication status: {'✅ Logged in' if auth_success else '❌ Anonymous'}")
 # Global variables to store models
 snac_model = None
     model_name = "mrrtmob/tts-khm-4"
     print("Downloading model files...")
     snapshot_download(
         repo_id=model_name,
     )
     print("Loading main model...")
     if device == "cuda":
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
 load_models()
 def split_text_by_punctuation(text, max_chars=200):
     sentence_endings = r'[។!?]'
     clause_separators = r'[,;:]'
     sentences = re.split(f'({sentence_endings})', text)
     combined_sentences = []
     for i in range(0, len(sentences), 2):
         sentence = sentences[i]
         if i + 1 < len(sentences):
+            sentence += sentences[i + 1]
         if sentence.strip():
             combined_sentences.append(sentence.strip())
     if len(combined_sentences) <= 1:
         parts = re.split(f'({clause_separators})', text)
         combined_sentences = []
             if part.strip():
                 combined_sentences.append(part.strip())
     final_chunks = []
     for sentence in combined_sentences:
         if len(sentence) <= max_chars:
             final_chunks.append(sentence)
         else:
             words = sentence.split()
             current_chunk = ""
     return [chunk for chunk in final_chunks if chunk.strip()]
 def split_text_by_tokens(text, max_tokens=150):
     global tokenizer
     tokens = tokenizer.encode(text)
     if len(tokens) <= max_tokens:
 def redistribute_codes(code_list, snac_model):
     if not code_list or len(code_list) < 7:
+        return np.zeros(12000)
     device = next(snac_model.parameters()).device
     layer_1 = []
         print(f"Error in redistribute_codes: {e}")
         return np.zeros(12000)
+def combine_audio_chunks(audio_chunks, pause_duration=0.3):
+    if not audio_chunks:
+        return np.array([])
+    pause_samples = int(24000 * pause_duration)
+    pause = np.zeros(pause_samples)
+    combined_audio = []
+    for i, chunk in enumerate(audio_chunks):
+        if len(chunk) > 0:
+            combined_audio.append(chunk)
+            if i < len(audio_chunks) - 1:
+                combined_audio.append(pause)
+    if combined_audio:
+        return np.concatenate(combined_audio)
+    else:
+        return np.array([])
+@spaces.GPU(duration=60)  # Reduced duration to be more conservative
+def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600, voice="Elise"):
     """Generate speech for a single chunk"""
     global model, tokenizer, snac_model
         print(f"Error generating speech chunk: {e}")
         return np.array([])
+def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600,
+                   voice="Elise", split_method="punctuation", max_chars=150, max_tokens=100,
                    pause_duration=0.3, progress=gr.Progress()):
     """Main function to generate speech with text splitting"""
         return None
     try:
         progress(0.05, "Splitting text...")
         if split_method == "punctuation":
             text_chunks = split_text_by_punctuation(text, max_chars)
         elif split_method == "tokens":
             text_chunks = split_text_by_tokens(text, max_tokens)
+        else:
             text_chunks = [text]
         progress(0.1, f"Processing {len(text_chunks)} chunks...")
         for i, chunk in enumerate(text_chunks):
             print(f"Chunk {i+1}: {chunk[:50]}...")
         audio_chunks = []
         for i, chunk in enumerate(text_chunks):
             progress(0.1 + 0.7 * (i / len(text_chunks)), f"Generating chunk {i+1}/{len(text_chunks)}...")
         if not audio_chunks:
             return None
         progress(0.9, "Combining audio chunks...")
         final_audio = combine_audio_chunks(audio_chunks, pause_duration)
         traceback.print_exc()
         return None
+# Example prompts, emotive tags, and the Gradio interface definition
 examples = [
+    ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា។ ខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។"],
 ]
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
+    Authentication: {'✅ Pro Account' if auth_success else '❌ Anonymous (Limited)'}
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
     ✨ **New**: Supports long text with automatic splitting!
         lines=6
     )
     with gr.Accordion("📝 Text Splitting Options", open=True):
         split_method = gr.Radio(
             choices=[
                 ("No splitting", "none")
             ],
             value="punctuation",
+            label="Text splitting method"
         )
         with gr.Row():
             max_chars = gr.Slider(
+                minimum=50, maximum=300, value=150, step=25,
+                label="Max characters per chunk"
             )
             max_tokens = gr.Slider(
+                minimum=50, maximum=200, value=100, step=25,
+                label="Max tokens per chunk"
             )
         pause_duration = gr.Slider(
             minimum=0.0, maximum=1.0, value=0.3, step=0.1,
+            label="Pause between chunks (seconds)"
         )
     with gr.Accordion("🔧 Advanced Settings", open=False):
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                label="Temperature"
             )
             top_p = gr.Slider(
                 minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                label="Top P"
             )
         with gr.Row():
             repetition_penalty = gr.Slider(
                 minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                label="Repetition Penalty"
             )
             max_new_tokens = gr.Slider(
+                minimum=100, maximum=800, value=600, step=100,
+                label="Max tokens per chunk"
             )
     with gr.Row():
         outputs=[text_input, audio_output]
     )
 if __name__ == "__main__":
+    demo.queue(max_size=5).launch(
         share=False,
         server_name="0.0.0.0",
         server_port=7860