Spaces:

ramimu
/

voice_cloning

Running

App Files Files Community

ramimu committed on Jun 19

Commit

ee39abc

verified ·

1 Parent(s): 74dbc75

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -38

app.py CHANGED Viewed

@@ -26,6 +26,10 @@ except ImportError as e:
 model = None
 model_loaded = False
 def download_model_files():
     """Download model files with error handling."""
     print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
@@ -65,7 +69,6 @@ def load_model_on_gpu():
     try:
         print("Loading model inside GPU context...")
-        # Now we can safely use CUDA operations
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Loading model on device: {device}")
@@ -80,10 +83,8 @@ def load_model_on_gpu():
                 print("✓ Model loaded successfully with from_pretrained.")
             except Exception as e2:
                 print(f"from_pretrained failed: {e2}")
-                # Manual loading as fallback
                 model = load_model_manually(device)
-        # Move model to device and set to eval mode
         if model and hasattr(model, 'to'):
             model = model.to(device)
         if model and hasattr(model, 'eval'):
@@ -108,7 +109,6 @@ def load_model_manually(device):
     model_path = pathlib.Path(LOCAL_MODEL_PATH)
     print("Manual loading with correct constructor signature...")
-    # Load components to CPU first, then move to device
     s3gen_path = model_path / "s3gen.pt"
     ve_path = model_path / "ve.pt"
     tokenizer_path = model_path / "tokenizer.json"
@@ -127,7 +127,6 @@ def load_model_manually(device):
     except Exception:
         tokenizer = tokenizer_data
-    # Create model instance
     model = ChatterboxTTS(
         t3=t3_cfg,
         s3gen=s3gen,
@@ -141,10 +140,35 @@ def load_model_manually(device):
 def cleanup_gpu_memory():
     """Clean up GPU memory - only call within GPU context."""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.synchronize()
-        gc.collect()
 # Download model files during startup (CPU only)
 if chatterbox_available:
@@ -169,8 +193,16 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
     if reference_audio_path is None:
         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
     try:
-        # Load model if not already loaded (inside GPU context)
         if not model_loaded:
             print("Loading model for the first time...")
             if not load_model_on_gpu():
@@ -180,7 +212,9 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
             return None, "Error: Model not loaded. Please check the logs for details."
         print(f"Processing request:")
-        print(f"  Text length: {len(text_to_speak)} characters")
         print(f"  Audio: '{reference_audio_path}'")
         print(f"  Parameters: exag={exaggeration}, cfg={cfg_pace}, seed={random_seed}, temp={temperature}")
@@ -199,33 +233,19 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
         # Generate audio with error handling
         try:
-            with torch.no_grad():  # Disable gradient computation to save memory
                 output_wav_data = model.generate(
-                    text=text_to_speak,
                     audio_prompt_path=reference_audio_path,
                     exaggeration=exaggeration,
                     cfg_weight=cfg_pace,
                     temperature=temperature
                 )
         except RuntimeError as e:
-            if "CUDA" in str(e) or "out of memory" in str(e):
                 print(f"CUDA error during generation: {e}")
-                # Try to recover by cleaning memory and retrying
                 cleanup_gpu_memory()
-                try:
-                    with torch.no_grad():
-                        output_wav_data = model.generate(
-                            text=text_to_speak,
-                            audio_prompt_path=reference_audio_path,
-                            exaggeration=exaggeration,
-                            cfg_weight=cfg_pace,
-                            temperature=temperature
-                        )
-                    print("✓ Recovery successful after memory cleanup")
-                except Exception as retry_error:
-                    print(f"✗ Recovery failed: {retry_error}")
-                    cleanup_gpu_memory()
-                    return None, f"CUDA error: {str(e)}. GPU memory issue - please try again in a moment."
             else:
                 raise e
@@ -253,7 +273,13 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
             print(f"CUDA memory after generation: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")
         print("✓ Audio generated successfully")
-        return result, "Success: Audio generated successfully!"
     except Exception as e:
         print(f"ERROR during audio generation: {e}")
@@ -268,14 +294,14 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
         # Provide specific error messages
         error_msg = str(e)
         if "CUDA" in error_msg or "device-side assert" in error_msg:
-            return None, f"CUDA error: {error_msg}. This is usually a temporary GPU issue. Please try again in a moment."
         elif "out of memory" in error_msg:
-            return None, f"GPU memory error: {error_msg}. Please try with shorter text or try again later."
         else:
             return None, f"Error during audio generation: {error_msg}. Check logs for more details."
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
-    """API wrapper function - this will call the GPU function."""
     import requests
     import tempfile
     import os
@@ -317,21 +343,28 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
             except:
                 pass
-# Your existing Gradio interface code goes here...
 def main():
     print("Starting Advanced Gradio interface...")
-    # Your existing Gradio interface code
     with gr.Blocks(title="🎙️ Advanced Chatterbox Voice Cloning") as demo:
         gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
         gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
-                    label="Text to Speak",
                     placeholder="Enter the text you want the cloned voice to say...",
-                    lines=3
                 )
                 audio_input = gr.Audio(
                     type="filepath",
@@ -362,7 +395,7 @@ def main():
             with gr.Column(scale=1):
                 audio_output = gr.Audio(label="Generated Audio", type="numpy")
-                status_output = gr.Textbox(label="Status", lines=2)
         # Connect the interface
         generate_btn.click(

 model = None
 model_loaded = False
+# Text length limits for the model
+MAX_CHARS_PER_GENERATION = 1000  # Safe limit for single generation
+MAX_CHARS_TOTAL = 5000           # Maximum we'll accept via API
 def download_model_files():
     """Download model files with error handling."""
     print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
     try:
         print("Loading model inside GPU context...")
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Loading model on device: {device}")
                 print("✓ Model loaded successfully with from_pretrained.")
             except Exception as e2:
                 print(f"from_pretrained failed: {e2}")
                 model = load_model_manually(device)
         if model and hasattr(model, 'to'):
             model = model.to(device)
         if model and hasattr(model, 'eval'):
     model_path = pathlib.Path(LOCAL_MODEL_PATH)
     print("Manual loading with correct constructor signature...")
     s3gen_path = model_path / "s3gen.pt"
     ve_path = model_path / "ve.pt"
     tokenizer_path = model_path / "tokenizer.json"
     except Exception:
         tokenizer = tokenizer_data
     model = ChatterboxTTS(
         t3=t3_cfg,
         s3gen=s3gen,
 def cleanup_gpu_memory():
     """Clean up GPU memory - only call within GPU context."""
+    try:
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            gc.collect()
+    except Exception as e:
+        print(f"Warning: GPU cleanup failed: {e}")
def truncate_text_safely(text, max_chars=None):
    """Truncate text to a safe length, preferring sentence and word boundaries.

    Args:
        text: The text to shorten.
        max_chars: Maximum length of the returned text. When omitted (or
            ``None``) the module-level ``MAX_CHARS_PER_GENERATION`` is used,
            looked up at call time.

    Returns:
        A ``(text, was_truncated)`` tuple. ``text`` is returned unchanged with
        ``False`` when it already fits. Otherwise the result is cut, in order
        of preference, at:

        1. the *latest* sentence end (``.``, ``!`` or ``?`` followed by a
           space) located past 70% of ``max_chars``;
        2. the last space located past 80% of ``max_chars``;
        3. exactly ``max_chars`` characters (hard cut).

        The truncated text is stripped of surrounding whitespace and is never
        longer than ``max_chars``.
    """
    if max_chars is None:
        max_chars = MAX_CHARS_PER_GENERATION

    if len(text) <= max_chars:
        return text, False

    # Search one character past the limit so that punctuation (or a word)
    # ending exactly at the limit is still recognised by the space after it.
    # Every cut below keeps at most max_chars characters.
    window = text[:max_chars + 1]

    # Use the latest sentence boundary of any kind; checking the punctuation
    # marks one after another would stop at the first kind that qualifies and
    # could discard later complete sentences.
    last_sentence = max(window.rfind(ending) for ending in ('. ', '! ', '? '))
    if last_sentence > max_chars * 0.7:  # Don't truncate too aggressively
        return text[:last_sentence + 1].strip(), True

    # Fallback to word boundary
    last_space = window.rfind(' ')
    if last_space > max_chars * 0.8:
        return text[:last_space].strip(), True

    # Last resort: hard truncate
    return text[:max_chars].strip(), True
 # Download model files during startup (CPU only)
 if chatterbox_available:
     if reference_audio_path is None:
         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
+    # Check text length and truncate if necessary
+    original_length = len(text_to_speak)
+    if original_length > MAX_CHARS_TOTAL:
+        return None, f"Error: Text is too long ({original_length:,} characters). Maximum allowed is {MAX_CHARS_TOTAL:,} characters. Please use the chunked generation API for longer texts."
+    # Truncate to safe generation length
+    text_to_use, was_truncated = truncate_text_safely(text_to_speak, MAX_CHARS_PER_GENERATION)
     try:
+        # Load model if not already loaded
         if not model_loaded:
             print("Loading model for the first time...")
             if not load_model_on_gpu():
             return None, "Error: Model not loaded. Please check the logs for details."
         print(f"Processing request:")
+        print(f"  Original text length: {original_length:,} characters")
+        print(f"  Processing length: {len(text_to_use):,} characters")
+        print(f"  Truncated: {was_truncated}")
         print(f"  Audio: '{reference_audio_path}'")
         print(f"  Parameters: exag={exaggeration}, cfg={cfg_pace}, seed={random_seed}, temp={temperature}")
         # Generate audio with error handling
         try:
+            with torch.no_grad():
                 output_wav_data = model.generate(
+                    text=text_to_use,
                     audio_prompt_path=reference_audio_path,
                     exaggeration=exaggeration,
                     cfg_weight=cfg_pace,
                     temperature=temperature
                 )
         except RuntimeError as e:
+            if "CUDA" in str(e) or "out of memory" in str(e) or "device-side assert" in str(e):
                 print(f"CUDA error during generation: {e}")
                 cleanup_gpu_memory()
+                return None, f"CUDA error: Text may be too long for single generation. Try shorter text (under {MAX_CHARS_PER_GENERATION} characters) or use the chunked generation API for longer content."
             else:
                 raise e
             print(f"CUDA memory after generation: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")
         print("✓ Audio generated successfully")
+        # Prepare success message
+        success_msg = "Success: Audio generated successfully!"
+        if was_truncated:
+            success_msg += f" Note: Text was truncated from {original_length:,} to {len(text_to_use):,} characters for optimal generation. Use the chunked generation API for longer texts."
+        return result, success_msg
     except Exception as e:
         print(f"ERROR during audio generation: {e}")
         # Provide specific error messages
         error_msg = str(e)
         if "CUDA" in error_msg or "device-side assert" in error_msg:
+            return None, f"CUDA error: {error_msg}. Try shorter text (under {MAX_CHARS_PER_GENERATION} characters) or use the chunked generation API."
         elif "out of memory" in error_msg:
+            return None, f"GPU memory error: {error_msg}. Please try with shorter text."
         else:
             return None, f"Error during audio generation: {error_msg}. Check logs for more details."
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+    """API wrapper function."""
     import requests
     import tempfile
     import os
             except:
                 pass
 def main():
     print("Starting Advanced Gradio interface...")
     with gr.Blocks(title="🎙️ Advanced Chatterbox Voice Cloning") as demo:
         gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
         gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
+        # Add warning about text length
+        gr.Markdown(f"""
+        **⚠️ Text Length Limits:**
+        - **Single Generation**: Up to {MAX_CHARS_PER_GENERATION:,} characters (optimal quality)
+        - **API Maximum**: Up to {MAX_CHARS_TOTAL:,} characters (may be truncated)
+        - **For longer texts**: Use the chunked generation API in your application
+        """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
+                    label=f"Text to Speak (max {MAX_CHARS_TOTAL:,} characters)",
                     placeholder="Enter the text you want the cloned voice to say...",
+                    lines=5,
+                    max_lines=10
                 )
                 audio_input = gr.Audio(
                     type="filepath",
             with gr.Column(scale=1):
                 audio_output = gr.Audio(label="Generated Audio", type="numpy")
+                status_output = gr.Textbox(label="Status", lines=3)
         # Connect the interface
         generate_btn.click(