Spaces:

ceymox
/

TTS_Streaming-AP

Sleeping

App Files Files Community

ceymox committed 24 days ago

Commit

f5147dc

verified ·

1 Parent(s): 1523ffa

Update app.py

Browse files

Files changed (1) hide show

app.py +210 -87

app.py CHANGED Viewed

@@ -12,6 +12,10 @@ import gradio as gr
 from transformers import AutoModel, logging as trf_logging
 from huggingface_hub import login, hf_hub_download, scan_cache_dir
 # Enable verbose logging for transformers
 trf_logging.set_verbosity_info()
@@ -33,34 +37,73 @@ model = None
 # Define the repository ID
 repo_id = "ai4bharat/IndicF5"
-# Improved model loading with error handling
-try:
-    print(f"Loading {repo_id} model...")
-    # Try direct loading first
-    model = AutoModel.from_pretrained(
-        repo_id,
-        trust_remote_code=True,
-        revision="main"
-    ).to(device)
-    print(f"Model loaded successfully! Type: {type(model)}")
-    # Check model attributes
-    model_methods = [method for method in dir(model) if not method.startswith('_') and callable(getattr(model, method))]
-    print(f"Available model methods: {model_methods[:10]}...")
-except Exception as e:
-    print(f"⚠️ Error loading model directly: {e}")
     try:
-        # Try loading with local_files_only if model is cached
         model = AutoModel.from_pretrained(
             repo_id,
             trust_remote_code=True,
-            local_files_only=True
         ).to(device)
-        print("Model loaded from cache!")
     except Exception as e2:
         print(f"❌ All attempts to load model failed: {e2}")
 # Advanced audio processing functions
 def remove_noise(audio_data, threshold=0.01):
@@ -147,54 +190,79 @@ def enhance_audio(audio_data):
     return audio_data
-# Load audio from URL with improved error handling
-def load_audio_from_url(url):
     print(f"Downloading reference audio from {url}")
-    try:
-        response = requests.get(url)
-        if response.status_code == 200:
-            try:
-                # Save content to a temp file
-                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
-                temp_file.write(response.content)
-                temp_file.close()
-                print(f"Saved reference audio to temp file: {temp_file.name}")
-                # Try different methods to read the audio file
-                audio_data = None
-                sample_rate = None
-                # Try SoundFile first
                 try:
-                    audio_data, sample_rate = sf.read(temp_file.name)
-                    print(f"Audio loaded with SoundFile: {sample_rate}Hz, {len(audio_data)} samples")
-                except Exception as sf_error:
-                    print(f"SoundFile failed: {sf_error}")
-                    # Try librosa as fallback
                     try:
-                        audio_data, sample_rate = librosa.load(temp_file.name, sr=None)
-                        print(f"Audio loaded with librosa: {sample_rate}Hz, shape={audio_data.shape}")
-                    except Exception as lr_error:
-                        print(f"Librosa also failed: {lr_error}")
-                # Clean up temp file
-                os.unlink(temp_file.name)
-                if audio_data is not None:
-                    # Apply audio enhancement to the reference
-                    audio_data = enhance_audio(audio_data)
-                    return sample_rate, audio_data
-            except Exception as e:
-                print(f"Failed to process audio data: {e}")
-        else:
-            print(f"Failed to download audio: status code {response.status_code}")
-    except Exception as e:
-        print(f"Error downloading audio: {e}")
-    # Return default values as fallback
     print("⚠️ Returning default silence as reference audio")
     return 24000, np.zeros(int(24000))  # 1 second of silence at 24kHz
 # Split text into chunks for streaming
@@ -241,7 +309,7 @@ def split_into_chunks(text, max_length=30):
     print(f"Split text into {len(final_chunks)} chunks")
     return final_chunks
-# Improved model wrapper
 class ModelWrapper:
     def __init__(self, model):
         self.model = model
@@ -274,9 +342,14 @@ class ModelWrapper:
     def generate(self, text, ref_audio_path, ref_text, **kwargs):
         """Generate speech with improved error handling and preprocessing"""
         print(f"\n==== MODEL INFERENCE ====")
-        print(f"Text input: '{text}'")
         print(f"Reference audio path: {ref_audio_path}")
         # Check if files exist
         if not os.path.exists(ref_audio_path):
             print(f"⚠️ Reference audio file not found")
@@ -292,25 +365,31 @@ class ModelWrapper:
             {"text": text, "ref_audio_path": ref_audio_path, "ref_text": ref_text},
             # Second try: alternative parameter names
             {"text": text, "reference_audio": ref_audio_path, "speaker_text": ref_text},
-            # Third try: just text and audio
             {"text": text, "reference_audio": ref_audio_path},
-            # Fourth try: just text
             {"text": text},
-            # Fifth try: positional arguments
             {}  # Will use positional below
         ]
-        # Try each parameter combination
         for i, params in enumerate(param_combinations):
             try:
                 method = getattr(self.model, method_name)
                 print(f"Attempt {i+1}: Calling model.{method_name} with {list(params.keys())} parameters")
-                # For the positional arguments case
-                if not params:
-                    result = method(text, ref_audio_path, ref_text, **kwargs)
-                else:
-                    result = method(**params, **kwargs)
                 print(f"✓ Call succeeded with parameters: {list(params.keys())}")
                 break  # Exit loop if successful
@@ -344,7 +423,7 @@ class ModelWrapper:
 # Create model wrapper
 model_wrapper = ModelWrapper(model) if model is not None else None
-# Streaming TTS class with improved audio quality
 class StreamingTTS:
     def __init__(self):
         self.is_generating = False
@@ -354,10 +433,15 @@ class StreamingTTS:
         self.output_file = None
         self.all_chunks = []
         self.sample_rate = 24000  # Default sample rate
         # Create temp directory
-        self.temp_dir = tempfile.mkdtemp()
-        print(f"Created temp directory: {self.temp_dir}")
     def prepare_ref_audio(self, ref_audio, ref_sr):
         """Prepare reference audio with enhanced quality"""
@@ -400,13 +484,17 @@ class StreamingTTS:
                 print(f"Error cleaning up: {e}")
     def generate(self, text, ref_audio, ref_sr, ref_text):
-        """Start generation in a new thread"""
         if self.is_generating:
             print("Already generating speech, please wait")
             return
         # Check model is loaded
-        if model_wrapper is None:
             print("⚠️ Model is not loaded. Cannot generate speech.")
             return
@@ -424,9 +512,18 @@ class StreamingTTS:
     def _process_streaming(self, text, ref_audio, ref_sr, ref_text):
         """Process text in chunks with high-quality audio generation"""
         try:
             # Prepare reference audio
             self.prepare_ref_audio(ref_audio, ref_sr)
             # Split text into smaller chunks for faster processing
             chunks = split_into_chunks(text)
             print(f"Processing {len(chunks)} chunks")
@@ -441,15 +538,19 @@ class StreamingTTS:
                     break
                 chunk_start = time.time()
-                print(f"Processing chunk {i+1}/{len(chunks)}: {chunk}")
                 # Generate speech for this chunk
                 try:
                     with torch.inference_mode():
                         chunk_audio = model_wrapper.generate(
-                            chunk,
-                            self.ref_audio_path,
-                            ref_text
                         )
                         if chunk_audio is None or (hasattr(chunk_audio, 'size') and chunk_audio.size == 0):
@@ -489,7 +590,15 @@ class StreamingTTS:
             print(f"Total generation time: {total_time:.2f}s")
         except Exception as e:
-            print(f"Error in streaming TTS: {str(e)[:100]}")
         finally:
             self.is_generating = False
             print("Generation complete")
@@ -511,7 +620,7 @@ class StreamingTTS:
 EXAMPLES = [{
     "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
     "ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
-    "synth_text": "ബ്രാഹ്മീയ ലിപികുടുംബത്തിൽ ഉൾപ്പെടുന്ന ഒരു ലിപിയാണ് മലയാള ലിപി."
 }]
 print("\nPreloading reference audio...")
@@ -530,7 +639,7 @@ def stop_generation():
     streaming_tts.stop()
     return "Generation stopped"
-# Gradio interface
 with gr.Blocks() as iface:
     gr.Markdown("## 🚀 IndicF5 Malayalam TTS")
@@ -574,21 +683,33 @@ with gr.Blocks() as iface:
         if ref_audio is None:
             return None, "⚠️ Reference audio not loaded. Cannot generate speech.", "Error: Reference audio not loaded"
         # Capture stdout for debug purposes
         import io
         from contextlib import redirect_stdout
         f = io.StringIO()
         with redirect_stdout(f):
-            streaming_tts.generate(text, ref_audio, ref_sr, EXAMPLES[0]["ref_text"] if EXAMPLES else "")
         debug_log = f.getvalue()
         # Add a delay to ensure file is created
-        time.sleep(1.5)
         audio_path = streaming_tts.get_current_audio()
         if audio_path and os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
-            return audio_path, "Generation started - audio playing", debug_log
         else:
             return None, "Starting generation... please wait", debug_log
@@ -602,5 +723,7 @@ def exit_handler():
 import atexit
 atexit.register(exit_handler)
 print("Starting Gradio interface...")
 iface.launch()

 from transformers import AutoModel, logging as trf_logging
 from huggingface_hub import login, hf_hub_download, scan_cache_dir
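+# Raise the huggingface_hub download timeout. NOTE(review): this is set after huggingface_hub is imported above — confirm the library still picks it up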
+import os
+os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "300"  # 5 minutes timeout
 # Enable verbose logging for transformers
 trf_logging.set_verbosity_info()
 # Define the repository ID
 repo_id = "ai4bharat/IndicF5"
+# Improved model loading with error handling and cache checking
+def load_model_with_retry(max_retries=3, retry_delay=5):
+    global model
+    # First, check if model is already in cache
+    print("Checking if model is in cache...")
+    try:
+        cache_info = scan_cache_dir()
+        model_in_cache = any(repo_id in repo.repo_id for repo in cache_info.repos)
+        if model_in_cache:
+            print(f"Model {repo_id} found in cache, loading locally...")
+            model = AutoModel.from_pretrained(
+                repo_id,
+                trust_remote_code=True,
+                local_files_only=True
+            ).to(device)
+            print("Model loaded from cache successfully!")
+            return
+    except Exception as e:
+        print(f"Cache check failed: {e}")
+    # If not in cache or cache check failed, try loading with retries
+    for attempt in range(max_retries):
+        try:
+            print(f"Loading {repo_id} model (attempt {attempt+1}/{max_retries})...")
+            model = AutoModel.from_pretrained(
+                repo_id,
+                trust_remote_code=True,
+                revision="main",
+                use_auth_token=hf_token,  # Use token if available
+                low_cpu_mem_usage=True    # Reduce memory usage
+            ).to(device)
+            print(f"Model loaded successfully! Type: {type(model)}")
+            # Check model attributes
+            model_methods = [method for method in dir(model) if not method.startswith('_') and callable(getattr(model, method))]
+            print(f"Available model methods: {model_methods[:10]}...")
+            return  # Success, exit function
+        except Exception as e:
+            print(f"⚠️ Attempt {attempt+1}/{max_retries} failed: {e}")
+            if attempt < max_retries - 1:
+                print(f"Waiting {retry_delay} seconds before retrying...")
+                time.sleep(retry_delay)
+                retry_delay *= 1.5  # Exponential backoff
+    # If all attempts failed, try one last time with fallback options
     try:
+        print("Trying with fallback options...")
         model = AutoModel.from_pretrained(
             repo_id,
             trust_remote_code=True,
+            revision="main",
+            local_files_only=False,
+            use_auth_token=hf_token,
+            force_download=False,
+            resume_download=True
         ).to(device)
+        print("Model loaded with fallback options!")
     except Exception as e2:
         print(f"❌ All attempts to load model failed: {e2}")
+        print("Will continue without model loaded.")
+# Call the improved loading function
+load_model_with_retry()
 # Advanced audio processing functions
 def remove_noise(audio_data, threshold=0.01):
     return audio_data
+# Load audio from URL with improved error handling and retries
+def load_audio_from_url(url, max_retries=3):
     print(f"Downloading reference audio from {url}")
+    for attempt in range(max_retries):
+        try:
+            # Use a longer timeout
+            response = requests.get(url, timeout=60)  # 60 second timeout
+            if response.status_code == 200:
                 try:
+                    # Save content to a temp file
+                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+                    temp_file.write(response.content)
+                    temp_file.close()
+                    print(f"Saved reference audio to temp file: {temp_file.name}")
+                    # Try different methods to read the audio file
+                    audio_data = None
+                    sample_rate = None
+                    # Try SoundFile first
                     try:
+                        audio_data, sample_rate = sf.read(temp_file.name)
+                        print(f"Audio loaded with SoundFile: {sample_rate}Hz, {len(audio_data)} samples")
+                    except Exception as sf_error:
+                        print(f"SoundFile failed: {sf_error}")
+                        # Try librosa as fallback
+                        try:
+                            audio_data, sample_rate = librosa.load(temp_file.name, sr=None)
+                            print(f"Audio loaded with librosa: {sample_rate}Hz, shape={audio_data.shape}")
+                        except Exception as lr_error:
+                            print(f"Librosa also failed: {lr_error}")
+                    # Clean up temp file
+                    os.unlink(temp_file.name)
+                    if audio_data is not None:
+                        # Apply audio enhancement to the reference
+                        audio_data = enhance_audio(audio_data)
+                        return sample_rate, audio_data
+                except Exception as e:
+                    print(f"Failed to process audio data: {e}")
+            else:
+                print(f"Failed to download audio: status code {response.status_code}")
+        except requests.exceptions.Timeout:
+            if attempt < max_retries - 1:
+                wait_time = (attempt + 1) * 5  # Linear backoff: 5s after the first timeout, 10s after the second, ...
+                print(f"Request timed out. Retrying in {wait_time} seconds...")
+                time.sleep(wait_time)
+            else:
+                print("All retry attempts failed due to timeout.")
+        except Exception as e:
+            print(f"Error downloading audio: {e}")
+            if attempt < max_retries - 1:
+                time.sleep(5)
+    # If we reach here, all attempts failed
     print("⚠️ Returning default silence as reference audio")
+    # Try to load a local backup audio if provided
+    backup_path = "backup_reference.wav"
+    if os.path.exists(backup_path):
+        try:
+            audio_data, sample_rate = sf.read(backup_path)
+            print(f"Loaded backup reference audio: {sample_rate}Hz")
+            return sample_rate, audio_data
+        except Exception as e:
+            print(f"Failed to load backup audio: {e}")
     return 24000, np.zeros(int(24000))  # 1 second of silence at 24kHz
 # Split text into chunks for streaming
     print(f"Split text into {len(final_chunks)} chunks")
     return final_chunks
+# Improved model wrapper with timeout handling
 class ModelWrapper:
     def __init__(self, model):
         self.model = model
     def generate(self, text, ref_audio_path, ref_text, **kwargs):
         """Generate speech with improved error handling and preprocessing"""
         print(f"\n==== MODEL INFERENCE ====")
+        print(f"Text to generate: '{text}'")  # Make sure this is the text we want to generate
         print(f"Reference audio path: {ref_audio_path}")
+        # Check if model is loaded
+        if self.model is None:
+            print("⚠️ Model is not loaded. Cannot generate speech.")
+            return np.zeros(int(24000))  # Return silence
         # Check if files exist
         if not os.path.exists(ref_audio_path):
             print(f"⚠️ Reference audio file not found")
             {"text": text, "ref_audio_path": ref_audio_path, "ref_text": ref_text},
             # Second try: alternative parameter names
             {"text": text, "reference_audio": ref_audio_path, "speaker_text": ref_text},
+            # Third try: alternative parameter names 2
+            {"text": text, "reference_audio": ref_audio_path, "reference_text": ref_text},
+            # Fourth try: just text and audio
             {"text": text, "reference_audio": ref_audio_path},
+            # Fifth try: just text
             {"text": text},
+            # Sixth try: positional arguments
             {}  # Will use positional below
         ]
+        # Try each parameter combination in order until one call succeeds
         for i, params in enumerate(param_combinations):
             try:
                 method = getattr(self.model, method_name)
                 print(f"Attempt {i+1}: Calling model.{method_name} with {list(params.keys())} parameters")
+                # Run the call inside torch.inference_mode(); no timeout is enforced here
+                with torch.inference_mode():
+                    # For the positional arguments case
+                    if not params:
+                        print(f"Using positional args with text='{text}'")
+                        result = method(text, ref_audio_path, ref_text, **kwargs)
+                    else:
+                        print(f"Using keyword args with text='{params.get('text')}'")
+                        result = method(**params, **kwargs)
                 print(f"✓ Call succeeded with parameters: {list(params.keys())}")
                 break  # Exit loop if successful
 # Create model wrapper
 model_wrapper = ModelWrapper(model) if model is not None else None
+# Streaming TTS class with improved audio quality and error handling
 class StreamingTTS:
     def __init__(self):
         self.is_generating = False
         self.output_file = None
         self.all_chunks = []
         self.sample_rate = 24000  # Default sample rate
+        self.current_text = ""    # Track current text being processed
         # Create temp directory
+        try:
+            self.temp_dir = tempfile.mkdtemp()
+            print(f"Created temp directory: {self.temp_dir}")
+        except Exception as e:
+            print(f"Error creating temp directory: {e}")
+            self.temp_dir = "."  # Use current directory as fallback
     def prepare_ref_audio(self, ref_audio, ref_sr):
         """Prepare reference audio with enhanced quality"""
                 print(f"Error cleaning up: {e}")
     def generate(self, text, ref_audio, ref_sr, ref_text):
+        """Start generation in a new thread with validation"""
         if self.is_generating:
             print("Already generating speech, please wait")
             return
+        # Store the text for verification
+        self.current_text = text
+        print(f"Setting current text to: '{self.current_text}'")
         # Check model is loaded
+        if model_wrapper is None or model is None:
             print("⚠️ Model is not loaded. Cannot generate speech.")
             return
     def _process_streaming(self, text, ref_audio, ref_sr, ref_text):
         """Process text in chunks with high-quality audio generation"""
         try:
+            # Double check text matches what we expect
+            if text != self.current_text:
+                print(f"⚠️ Text mismatch detected! Expected: '{self.current_text}', Got: '{text}'")
+                # Use the stored text to be safe
+                text = self.current_text
             # Prepare reference audio
             self.prepare_ref_audio(ref_audio, ref_sr)
+            # Print the text we're actually going to process
+            print(f"Processing text: '{text}'")
             # Split text into smaller chunks for faster processing
             chunks = split_into_chunks(text)
             print(f"Processing {len(chunks)} chunks")
                     break
                 chunk_start = time.time()
+                print(f"Processing chunk {i+1}/{len(chunks)}: '{chunk}'")
                 # Generate speech for this chunk
                 try:
+                    # Intended per-chunk inference timeout. NOTE(review): it is not passed to generate() below — confirm it is enforced elsewhere
+                    chunk_timeout = 30  # 30 seconds timeout per chunk
                     with torch.inference_mode():
+                        # Explicitly pass the chunk text
                         chunk_audio = model_wrapper.generate(
+                            text=chunk,  # Make sure we're using the current chunk
+                            ref_audio_path=self.ref_audio_path,
+                            ref_text=ref_text
                         )
                         if chunk_audio is None or (hasattr(chunk_audio, 'size') and chunk_audio.size == 0):
             print(f"Total generation time: {total_time:.2f}s")
         except Exception as e:
+            print(f"Error in streaming TTS: {str(e)[:200]}")
+            # Try to write whatever we have so far
+            if len(self.all_chunks) > 0:
+                try:
+                    combined = np.concatenate(self.all_chunks)
+                    sf.write(self.output_file, combined, 24000, format='WAV', subtype='FLOAT')
+                    print("Saved partial output")
+                except Exception as e2:
+                    print(f"Failed to save partial output: {e2}")
         finally:
             self.is_generating = False
             print("Generation complete")
 EXAMPLES = [{
     "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
     "ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
+    "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
 }]
 print("\nPreloading reference audio...")
     streaming_tts.stop()
     return "Generation stopped"
+# Gradio interface with offline mode
 with gr.Blocks() as iface:
     gr.Markdown("## 🚀 IndicF5 Malayalam TTS")
         if ref_audio is None:
             return None, "⚠️ Reference audio not loaded. Cannot generate speech.", "Error: Reference audio not loaded"
+        # Print the text being processed
+        print(f"🔍 User input text: '{text}'")
         # Capture stdout for debug purposes
         import io
         from contextlib import redirect_stdout
         f = io.StringIO()
         with redirect_stdout(f):
+            try:
+                # Pass every argument by keyword so the text cannot be bound to the wrong parameter
+                streaming_tts.generate(
+                    text=text,  # Explicitly name parameter
+                    ref_audio=ref_audio,
+                    ref_sr=ref_sr,
+                    ref_text=EXAMPLES[0]["ref_text"] if EXAMPLES else ""
+                )
+            except Exception as e:
+                print(f"Error starting generation: {e}")
         debug_log = f.getvalue()
         # Add a delay to ensure file is created
+        time.sleep(2.0)
         audio_path = streaming_tts.get_current_audio()
         if audio_path and os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
+            return audio_path, f"Generated speech for: {text[:30]}...", debug_log
         else:
             return None, "Starting generation... please wait", debug_log
 import atexit
 atexit.register(exit_handler)
+# Start the interface with flexible port selection
 print("Starting Gradio interface...")
+# No server_port is passed, so port selection is left to Gradio's defaults (it looks for a free port starting at 7860)
 iface.launch()