Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed 6 days ago

Commit

e173776

1 Parent(s): 731d214

Refactor authentication and model loading; simplify UI and reduce resource usage

Browse files

Files changed (1) hide show

app.py +28 -104

app.py CHANGED Viewed

@@ -12,36 +12,19 @@ load_dotenv()
 # Get HF token from environment variables
 hf_token = os.getenv("HF_TOKEN")
-# Debug and authentication
-print("=== DEBUG INFO ===")
-print(f"HF_TOKEN exists: {bool(hf_token)}")
 if hf_token:
     login(token=hf_token)
-    try:
-        user_info = whoami(token=hf_token)
-        print(f"Successfully logged in as: {user_info.get('name', 'Unknown')}")
-        print(f"User type: {user_info.get('type', 'Unknown')}")
-        print(f"User ID: {user_info.get('id', 'Unknown')}")
-    except Exception as e:
-        print(f"Authentication error: {e}")
-else:
-    print("Warning: HF_TOKEN not found in environment variables")
-print("=== END DEBUG ===")
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
-print("SNAC model loaded successfully")
 model_name = "mrrtmob/tts-khm-kore"
-print(f"Downloading model files from {model_name}...")
 # Download only model config and safetensors with token
 snapshot_download(
     repo_id=model_name,
@@ -63,18 +46,15 @@ snapshot_download(
         "scheduler.pt"
     ]
 )
-print("Model files downloaded successfully")
-print("Loading main model...")
-# Load model and tokenizer with token (removed device_map)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
     token=hf_token
 )
-model = model.to(device)  # Move to device manually
-print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
     token=hf_token
@@ -114,14 +94,14 @@ def parse_output(generated_ids):
         trimmed_row = row[:new_length]
         trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
-    return code_lists[0] if code_lists else []  # Return just the first one for single sample
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     if not code_list:
         return None
-    device = next(snac_model.parameters()).device  # Get the device of SNAC model
     layer_1 = []
     layer_2 = []
     layer_3 = []
@@ -145,26 +125,22 @@ def redistribute_codes(code_list, snac_model):
     if not layer_1:
         return None
-    # Move tensors to the same device as the SNAC model
     codes = [
         torch.tensor(layer_1, device=device).unsqueeze(0),
         torch.tensor(layer_2, device=device).unsqueeze(0),
         torch.tensor(layer_3, device=device).unsqueeze(0)
     ]
     audio_hat = snac_model.decode(codes)
-    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
-# Main generation function
-@spaces.GPU(duration=120)
-def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
     if not text.strip():
-        gr.Warning("Please enter some text to generate speech.")
         return None
     try:
         progress(0.1, "Processing text...")
-        print(f"Generating speech for text: {text[:50]}...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
@@ -186,26 +162,21 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
         code_list = parse_output(generated_ids)
         if not code_list:
-            gr.Warning("Failed to generate valid audio codes.")
             return None
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         if audio_samples is None:
-            gr.Warning("Failed to convert codes to audio.")
             return None
-        print("Speech generation completed successfully")
-        return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
-        error_msg = f"Error generating speech: {str(e)}"
-        print(error_msg)
-        gr.Error(error_msg)
         return None
-# Examples for the UI - Khmer text examples
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
@@ -219,49 +190,17 @@ examples = [
     ["អរគុណច្រើនសម្រាប់ជំនួយ។ <chuckle> បើគ្មានអ្នកទេ ខ្ញុំមិនដឹងធ្វើយ៉ាងម៉េចទេ។"],
 ]
-# Available voices (commented out for simpler UI)
-# VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
-# Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
-# Create custom CSS
-css = """
-.gradio-container {
-    max-width: 1200px;
-    margin: auto;
-    padding-top: 1.5rem;
-}
-.main-header {
-    text-align: center;
-    margin-bottom: 2rem;
-}
-.generate-btn {
-    background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
-    border: none !important;
-    color: white !important;
-    font-weight: bold !important;
-}
-.clear-btn {
-    background: linear-gradient(45deg, #95A5A6, #BDC3C7) !important;
-    border: none !important;
-    color: white !important;
-}
-"""
 # Create Gradio interface
-with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"""
-    <div class="main-header">
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
-    </div>
     """)
     with gr.Row():
@@ -270,7 +209,7 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
                 label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
                 placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
                 lines=4,
-                max_lines=8
             )
             # Advanced Settings
@@ -278,29 +217,25 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
                 with gr.Row():
                     temperature = gr.Slider(
                         minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                        label="Temperature",
-                        info="Higher values create more expressive speech"
                     )
                     top_p = gr.Slider(
                         minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                        label="Top P",
-                        info="Nucleus sampling threshold"
                     )
                 with gr.Row():
                     repetition_penalty = gr.Slider(
                         minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                        label="Repetition Penalty",
-                        info="Higher values discourage repetitive patterns"
                     )
                     max_new_tokens = gr.Slider(
-                        minimum=100, maximum=2000, value=1200, step=100,
-                        label="Max Length",
-                        info="Maximum length of generated audio"
                     )
             with gr.Row():
-                submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
-                clear_btn = gr.Button("🗑️ Clear", size="lg", elem_classes=["clear-btn"])
         with gr.Column(scale=1):
             audio_output = gr.Audio(
@@ -310,14 +245,14 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
                 interactive=False
             )
-    # Set up examples (NO CACHE)
     gr.Examples(
         examples=examples,
         inputs=[text_input],
         outputs=audio_output,
         fn=lambda text: generate_speech(text),
-        cache_examples=False,
-        label="📝 Example Texts (អត្ថបទគំរូ)"
     )
     # Set up event handlers
@@ -333,25 +268,14 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
         inputs=[],
         outputs=[text_input, audio_output]
     )
-    # Add keyboard shortcut
-    text_input.submit(
-        fn=generate_speech,
-        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
-        outputs=audio_output,
-        show_progress=True
-    )
-# Launch the app
 if __name__ == "__main__":
-    print("Starting Gradio interface...")
     demo.queue(
-        max_size=20,
-        default_concurrency_limit=5
     ).launch(
-        server_name="0.0.0.0",
-        server_port=7860,
         share=False,
         show_error=True,
-        quiet=False
     )

 # Get HF token from environment variables
 hf_token = os.getenv("HF_TOKEN")
+# Simplified authentication - no debug prints
 if hf_token:
     login(token=hf_token)
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
 model_name = "mrrtmob/tts-khm-kore"
 # Download only model config and safetensors with token
 snapshot_download(
     repo_id=model_name,
         "scheduler.pt"
     ]
 )
+# Load model and tokenizer with token
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
     token=hf_token
 )
+model = model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
     token=hf_token
         trimmed_row = row[:new_length]
         trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
+    return code_lists[0] if code_lists else []
 # Redistribute codes for audio generation
 def redistribute_codes(code_list, snac_model):
     if not code_list:
         return None
+    device = next(snac_model.parameters()).device
     layer_1 = []
     layer_2 = []
     layer_3 = []
     if not layer_1:
         return None
     codes = [
         torch.tensor(layer_1, device=device).unsqueeze(0),
         torch.tensor(layer_2, device=device).unsqueeze(0),
         torch.tensor(layer_3, device=device).unsqueeze(0)
     ]
     audio_hat = snac_model.decode(codes)
+    return audio_hat.detach().squeeze().cpu().numpy()
+# Main generation function - KEY CHANGES HERE
+@spaces.GPU(duration=60)  # Reduced duration
+def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=800, voice="Elise", progress=gr.Progress()):  # Reduced max tokens
     if not text.strip():
         return None
     try:
         progress(0.1, "Processing text...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
         code_list = parse_output(generated_ids)
         if not code_list:
             return None
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         if audio_samples is None:
             return None
+        return (24000, audio_samples)
     except Exception as e:
+        print(f"Error generating speech: {e}")
         return None
+# Examples - reduced to save quota
 examples = [
     ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
     ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
     ["អរគុណច្រើនសម្រាប់ជំនួយ។ <chuckle> បើគ្មានអ្នកទេ ខ្ញុំមិនដឹងធ្វើយ៉ាងម៉េចទេ។"],
 ]
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 # Create Gradio interface
+with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
     """)
     with gr.Row():
                 label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
                 placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
                 lines=4,
+                max_lines=6  # Limited input size
             )
             # Advanced Settings
                 with gr.Row():
                     temperature = gr.Slider(
                         minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                        label="Temperature"
                     )
                     top_p = gr.Slider(
                         minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                        label="Top P"
                     )
                 with gr.Row():
                     repetition_penalty = gr.Slider(
                         minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                        label="Repetition Penalty"
                     )
                     max_new_tokens = gr.Slider(
+                        minimum=100, maximum=800, value=800, step=50,  # Reduced max
+                        label="Max Length"
                     )
             with gr.Row():
+                submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
+                clear_btn = gr.Button("🗑️ Clear", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 interactive=False
             )
+    # Examples with NO CACHE to save quota
     gr.Examples(
         examples=examples,
         inputs=[text_input],
         outputs=audio_output,
         fn=lambda text: generate_speech(text),
+        cache_examples=False,  # Important: no caching
+        label="📝 Example Texts"
     )
     # Set up event handlers
         inputs=[],
         outputs=[text_input, audio_output]
     )
+# Launch with optimizations
 if __name__ == "__main__":
     demo.queue(
+        max_size=10,  # Reduced queue size
+        default_concurrency_limit=2  # Reduced concurrent users
     ).launch(
         share=False,
         show_error=True,
+        ssr_mode=False  # Important for quota
     )