hashhac committed · Commit dbf60e3 · 1 Parent(s): de7876c

updates!

Files changed:
- app.py (+144 -55)
- requirements.txt (+2 -1)
app.py
CHANGED
@@ -52,32 +52,38 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
+    # First load the tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # ...
-    if (tokenizer.pad_token is None or
-            (tokenizer.pad_token == tokenizer.eos_token)):
-        ...
-        # ...
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True
-        )
+    # Print current token configuration
+    print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
+
+    # Load the model first
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True
+    )
+
+    # Set pad token if needed
+    if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
+        # Add a new special token as padding token
+        special_tokens = {'pad_token': '[PAD]'}
+        num_added = tokenizer.add_special_tokens(special_tokens)
+
+        # Must resize the token embeddings when adding tokens
         model.resize_token_embeddings(len(tokenizer))
+
+        # Update the model's config to explicitly set the pad token ID
+        model.config.pad_token_id = tokenizer.pad_token_id
+
+        print(f"Added pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"Different from EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
     else:
-        print(f"Pad token ...")
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True
-        )
+        print(f"Pad token already set: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
 
+    # Move model to the right device
     model.to(device)
 
     return model, tokenizer
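Note: the pad-token recipe above is easy to get subtly wrong; the token must be added to the tokenizer before the embedding matrix is resized, and the model config must then record the new ID. A minimal standalone sketch of the same recipe (gpt2 is assumed here purely because it is small and ships without a pad token; the commit itself targets facebook/opt-1.3b):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # gpt2 stands in for facebook/opt-1.3b; it defines no pad token at all
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # grow the vocab first
        model.resize_token_embeddings(len(tokenizer))         # then grow the embeddings
        model.config.pad_token_id = tokenizer.pad_token_id    # then record the new ID

    assert tokenizer.pad_token_id != tokenizer.eos_token_id
    print(tokenizer.pad_token_id, model.get_input_embeddings().weight.shape[0])

Whether the branch fires depends on the checkpoint: OPT tokenizers ship with a distinct <pad> token, so for facebook/opt-1.3b the else branch should be the path actually taken.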
@@ -85,72 +91,150 @@ def load_llm_model():
 
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
 def gtts_text_to_speech(text):
     """Convert text to speech using gTTS and ensure proper WAV format."""
-    # Create temporary files
-    mp3_fd, mp3_filename = tempfile.mkstemp(suffix='.mp3')
-    os.close(mp3_fd)
-
-    wav_fd, wav_filename = tempfile.mkstemp(suffix='.wav')
-    os.close(wav_fd)
+    # Create absolute paths for temporary files
+    temp_dir = tempfile.gettempdir()
+    mp3_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.mp3")
+    wav_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.wav")
 
     try:
-        # ...
+        # Make sure text is not empty
+        if not text or text.isspace():
+            text = "I don't have a response for that."
+
+        # Create gTTS object and save to MP3
         tts = gTTS(text=text, lang='en', slow=False)
         tts.save(mp3_filename)
 
-        ...
+        print(f"MP3 file created: {mp3_filename}, size: {os.path.getsize(mp3_filename)}")
+
+        # Try multiple methods to convert MP3 to WAV
+        wav_created = False
+
+        # Method 1: Try ffmpeg (most reliable)
         try:
             import subprocess
+            cmd = ['ffmpeg', '-y', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', wav_filename]
+            print(f"Running ffmpeg command: {' '.join(cmd)}")
+
             result = subprocess.run(
-                ...,
+                cmd,
                 stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
                 check=True
             )
 
-            ...
+            if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                print(f"WAV file successfully created with ffmpeg: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                wav_created = True
+            else:
+                print(f"ffmpeg ran but WAV file is missing or too small: {wav_filename}")
+
+        except Exception as e:
+            print(f"ffmpeg conversion failed: {str(e)}")
 
-        # ...
-        if ...:
-            # Read the WAV file with scipy
-            try:
-                sample_rate, audio_data = wavfile.read(wav_filename)
-                # Convert to expected format
-                audio_data = audio_data.reshape(1, -1).astype(np.int16)
-                return (sample_rate, audio_data)
-            except Exception as e:
-                print(f"Error reading WAV file ...")
-                # Try alternative approach with pydub
-                try:
-                    from pydub import AudioSegment
-                    sound = AudioSegment.from_file(wav_filename, format="wav")
-                    audio_data = np.array(sound.get_array_of_samples(), dtype=np.int16)
-                    audio_data = audio_data.reshape(1, -1)
-                    return (sound.frame_rate, audio_data)
-                except Exception as e2:
-                    print(f"Error with pydub fallback: {e2}")
+        # Method 2: Try pydub if ffmpeg failed
+        if not wav_created:
+            try:
+                from pydub import AudioSegment
+                print("Converting MP3 to WAV using pydub...")
+                sound = AudioSegment.from_mp3(mp3_filename)
+                sound = sound.set_frame_rate(24000).set_channels(1)
+                sound.export(wav_filename, format="wav")
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created with pydub: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+                else:
+                    print(f"pydub ran but WAV file is missing or too small")
+
+            except Exception as e:
+                print(f"pydub conversion failed: {str(e)}")
+
+        # Method 3: Direct WAV creation with gTTS-like library (last resort)
+        if not wav_created:
+            try:
+                import numpy as np
+                from scipy.io import wavfile
+
+                print("Generating synthetic speech directly...")
+                # Generate a simple speech-like tone pattern
+                sample_rate = 24000
+                duration = len(text) * 0.075  # Approx timing
+                t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+
+                # Create a speech-like tone with some variation
+                frequencies = [220, 440, 330, 550]
+                audio = np.zeros_like(t)
+                for i, freq in enumerate(frequencies):
+                    audio += 0.2 * np.sin(2 * np.pi * freq * t + i)
+
+                # Add some envelope
+                envelope = np.ones_like(t)
+                attack = int(0.01 * sample_rate)
+                release = int(0.1 * sample_rate)
+                envelope[:attack] = np.linspace(0, 1, attack)
+                envelope[-release:] = np.linspace(1, 0, release)
+                audio = audio * envelope
+
+                # Normalize and convert to int16
+                audio = audio / np.max(np.abs(audio))
+                audio = (audio * 32767).astype(np.int16)
+
+                # Save as WAV
+                wavfile.write(wav_filename, sample_rate, audio)
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created directly: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+
+            except Exception as e:
+                print(f"Direct WAV creation failed: {str(e)}")
+
+        # Read the WAV file if it was created
+        if wav_created:
+            try:
+                # Add a small delay to ensure the file is fully written
+                time.sleep(0.1)
+
+                # Read WAV file with scipy
+                print(f"Reading WAV file: {wav_filename}")
+                sample_rate, audio_data = wavfile.read(wav_filename)
+
+                # Convert to expected format
+                audio_data = audio_data.reshape(1, -1).astype(np.int16)
+                print(f"WAV file read successfully, shape: {audio_data.shape}, sample rate: {sample_rate}")
+                return (sample_rate, audio_data)
+
+            except Exception as e:
+                print(f"Error reading WAV file: {str(e)}")
 
         # If all else fails, generate a simple tone
-        print("Falling back to synthetic audio tone")
+        print("All methods failed. Falling back to synthetic audio tone")
         sample_rate = 24000
-        duration_sec = len(text) * 0.1
+        duration_sec = max(1, len(text) * 0.1)
         tone_length = int(sample_rate * duration_sec)
         audio_data = np.sin(2 * np.pi * np.arange(tone_length) * 440 / sample_rate)
         audio_data = (audio_data * 32767).astype(np.int16)
         audio_data = audio_data.reshape(1, -1)
         return (sample_rate, audio_data)
 
+    except Exception as e:
+        print(f"Unexpected error in text-to-speech: {str(e)}")
+        # Generate a simple tone as last resort
+        sample_rate = 24000
+        audio_data = np.sin(2 * np.pi * np.arange(sample_rate) * 440 / sample_rate)
+        audio_data = (audio_data * 32767).astype(np.int16)
+        audio_data = audio_data.reshape(1, -1)
+        return (sample_rate, audio_data)
+
     finally:
         # Clean up temporary files
         for filename in [mp3_filename, wav_filename]:
             try:
                 if os.path.exists(filename):
                     os.remove(filename)
-            except:
-                pass
+            except Exception as e:
+                print(f"Failed to remove temporary file {filename}: {str(e)}")
 
 # Initialize models
 print("Loading ASR model...")
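Note: the conversion chain tries ffmpeg, then pydub, then raw synthesis. The ffmpeg leg can be exercised on its own with a sketch like the following; it assumes ffmpeg is on PATH and that gTTS can reach Google's TTS endpoint:

    import os
    import subprocess
    import tempfile

    from gtts import gTTS
    from scipy.io import wavfile

    mp3 = os.path.join(tempfile.gettempdir(), "tts_check.mp3")
    wav = os.path.join(tempfile.gettempdir(), "tts_check.wav")

    gTTS(text="hello world", lang="en").save(mp3)

    # Same flags as the commit: 16-bit PCM, 24 kHz, mono
    subprocess.run(
        ["ffmpeg", "-y", "-i", mp3, "-acodec", "pcm_s16le", "-ar", "24000", "-ac", "1", wav],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )

    rate, data = wavfile.read(wav)
    print(rate, data.dtype, data.shape)  # expect: 24000 int16 (n_samples,)

One design caveat: pydub itself shells out to ffmpeg for MP3 decoding, so on a machine where Method 1 fails because ffmpeg is absent, Method 2 will usually fail too, and the synthetic fallback is what actually runs.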
@@ -183,10 +267,13 @@ def generate_response(prompt):
     full_prompt += "Assistant: "
 
     # Generate response with proper attention mask
+    # Ensure padding is done correctly with explicit parameters
     tokenized_inputs = llm_tokenizer(
         full_prompt,
         return_tensors="pt",
-        padding=...,
+        padding="max_length",
+        max_length=512,  # Fixed length helps with attention masks
+        truncation=True,
         return_attention_mask=True
     )
 
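Note: to see what the explicit padding arguments produce, a short check (gpt2 again stands in for the real checkpoint, and max_length=16 instead of 512 keeps the printout readable):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # illustration only; app.py adds a dedicated [PAD]

    enc = tok(
        "User: Hello\nAssistant: ",
        return_tensors="pt",
        padding="max_length",
        max_length=16,
        truncation=True,
        return_attention_mask=True,
    )
    print(enc["input_ids"].shape)    # torch.Size([1, 16])
    print(enc["attention_mask"][0])  # 1s over real tokens, 0s over the padding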
@@ -194,7 +281,7 @@ def generate_response(prompt):
     input_ids = tokenized_inputs["input_ids"].to(device)
     attention_mask = tokenized_inputs["attention_mask"].to(device)
 
-    # Generate response
+    # Generate response - explicitly pass all needed parameters
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
@@ -202,7 +289,9 @@ def generate_response(prompt):
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
-            top_p=0.9
+            top_p=0.9,
+            pad_token_id=llm_tokenizer.pad_token_id,  # Explicitly set pad token ID
+            eos_token_id=llm_tokenizer.eos_token_id   # Explicitly set EOS token ID
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
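Note: passing pad_token_id and eos_token_id to generate() explicitly also silences the familiar "Setting pad_token_id to eos_token_id" warning. A self-contained sketch of the same call shape (gpt2 assumed, as above):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # illustration only

    enc = tok("User: Hi\nAssistant: ", return_tensors="pt", return_attention_mask=True)

    with torch.no_grad():
        out = model.generate(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"],
            max_new_tokens=32,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )

    print(tok.decode(out[0], skip_special_tokens=True))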
requirements.txt
CHANGED
@@ -10,4 +10,5 @@ fastrtc[vad,tts]
 torchaudio
 gtts
 pydub
-scipy
+scipy
+time
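Note: time is a Python standard-library module, not a PyPI package, so this new last line will most likely make pip install -r requirements.txt fail to resolve. The time.time() and time.sleep() calls added in app.py only need an import time at the top of that file; the requirements entry can almost certainly be dropped.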