hashhac committed on
Commit 289ad8b · 1 Parent(s): 3931f99
Files changed (2):
  1. app.py +77 -66
  2. requirements.txt +4 -1
app.py CHANGED
@@ -6,16 +6,16 @@ import gradio as gr
 import numpy as np
 import torch
 import os
+import tempfile
 from transformers import (
     AutoModelForSpeechSeq2Seq,
     AutoProcessor,
     pipeline,
     AutoTokenizer,
-    AutoModelForCausalLM,
-    AutoModelForSeq2SeqLM
+    AutoModelForCausalLM
 )
-from datasets import load_dataset
-import scipy
+from gtts import gTTS
+from scipy.io import wavfile

 # Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -23,7 +23,7 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

 # Step 1: Audio transcription with Whisper
 def load_asr_model():
-    model_id = "openai/whisper-small"  # Smaller version that's more efficient
+    model_id = "openai/whisper-small"

     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         model_id,
@@ -50,7 +50,7 @@ def load_asr_model():

 # Step 2: Text generation with a smaller LLM
 def load_llm_model():
-    model_id = "facebook/opt-1.3b"  # A smaller language model
+    model_id = "facebook/opt-1.3b"

     tokenizer = AutoTokenizer.from_pretrained(model_id)
     model = AutoModelForCausalLM.from_pretrained(
@@ -62,64 +62,50 @@ def load_llm_model():

     return model, tokenizer

-# Step 3: Text-to-Speech with a free model
-# Step 3: Text-to-Speech with a free model
-def load_tts_model():
-    # Import the specific SpeechT5 classes
-    from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-
-    model_id = "microsoft/speecht5_tts"
-    processor = SpeechT5Processor.from_pretrained(model_id)
-    model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
-    model.to(device)
-
-    # Load vocoder for waveform generation
-    vocoder_id = "microsoft/speecht5_hifigan"
-    vocoder = SpeechT5HifiGan.from_pretrained(vocoder_id)
-    vocoder.to(device)
-
-    # Load speaker embeddings
-    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-    speaker_embeddings = torch.tensor(embeddings_dataset[7]["xvector"]).unsqueeze(0)
-
-    return model, processor, vocoder, speaker_embeddings
-
-def text_to_speech(text):
-    # Prepare inputs
-    inputs = tts_processor(text=text, return_tensors="pt")
-
-    # Generate speech with SpeechT5
-    with torch.no_grad():
-        # Convert speaker embeddings to correct dtype and move to device
-        speaker_embeddings_device = speaker_embeddings.to(device).to(torch_dtype)
-
-        # Generate speech
-        speech = tts_model.generate_speech(
-            inputs["input_ids"].to(device),
-            speaker_embeddings_device,
-            vocoder=tts_vocoder
-        )
-
-    # Convert to numpy array
-    audio_array = speech.cpu().numpy().astype(np.float32)
-
-    # Normalize the audio
-    audio_array = audio_array / np.max(np.abs(audio_array) + 1e-6)
-
-    audio_array = audio_array.reshape(1, -1).astype(np.float32)
-
-    return (16000, audio_array)  # SpeechT5 uses 16kHz sample rate
-
-# Initialize all models
+# Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
+def gtts_text_to_speech(text):
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+        tmp_filename = f.name
+
+    # Use gTTS to convert text to speech
+    tts = gTTS(text=text, lang='en', slow=False)
+
+    # Save as MP3 first (gTTS only outputs MP3)
+    mp3_filename = tmp_filename.replace('.wav', '.mp3')
+    tts.save(mp3_filename)
+
+    # Convert MP3 to WAV using FFmpeg if available, otherwise use a fallback
+    try:
+        import subprocess
+        subprocess.run(['ffmpeg', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', tmp_filename],
+                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except (ImportError, FileNotFoundError):
+        # Fallback if FFmpeg is not available
+        from pydub import AudioSegment
+        sound = AudioSegment.from_mp3(mp3_filename)
+        sound = sound.set_frame_rate(24000).set_channels(1)
+        sound.export(tmp_filename, format="wav")
+
+    # Read the WAV file
+    sample_rate, audio_data = wavfile.read(tmp_filename)
+
+    # Clean up temporary files
+    os.remove(mp3_filename)
+    os.remove(tmp_filename)
+
+    # Convert to expected format
+    audio_data = audio_data.reshape(1, -1).astype(np.int16)
+
+    return (sample_rate, audio_data)
+
+# Initialize models
 print("Loading ASR model...")
 asr_pipeline = load_asr_model()

 print("Loading LLM model...")
 llm_model, llm_tokenizer = load_llm_model()

-print("Loading TTS model...")
-tts_model, tts_processor, tts_vocoder, speaker_embeddings = load_tts_model()
-
 # Chat history management
 chat_history = []

@@ -167,21 +153,31 @@ def generate_response(prompt):

     return response_text

-
-
 def response(audio: tuple[int, np.ndarray]):
-    # Step 1: Speech-to-Text
-    transcript = asr_pipeline({"sampling_rate": audio[0], "raw": audio[1].flatten()})
+    # Step 1: Convert audio to float32 before passing to ASR
+    sample_rate, audio_data = audio
+
+    # Convert int16 audio to float32
+    audio_float32 = audio_data.flatten().astype(np.float32) / 32768.0  # Normalize to [-1.0, 1.0]
+
+    # Speech-to-Text with correct data type
+    transcript = asr_pipeline({
+        "sampling_rate": sample_rate,
+        "raw": audio_float32
+    })
+
     prompt = transcript["text"]
+    print(f"Transcribed: {prompt}")

     # Step 2: Generate text response
     response_text = generate_response(prompt)
+    print(f"Response: {response_text}")

-    # Step 3: Text-to-Speech
-    sample_rate, audio_array = text_to_speech(response_text)
+    # Step 3: Text-to-Speech using gTTS
+    sample_rate, audio_array = gtts_text_to_speech(response_text)

-    # Convert to expected format
-    chunk_size = 4800  # 200ms chunks at 24kHz
+    # Convert to expected format and yield chunks
+    chunk_size = int(sample_rate * 0.2)  # 200ms chunks
     for i in range(0, audio_array.shape[1], chunk_size):
         chunk = audio_array[:, i:i+chunk_size]
         if chunk.size > 0:  # Ensure we don't yield empty chunks
@@ -205,14 +201,22 @@ def demo():
             return None

         sample_rate, audio_array = audio
-        transcript = asr_pipeline({"sampling_rate": sample_rate, "raw": audio_array.flatten()})
+
+        # Convert to float32 for ASR
+        audio_float32 = audio_array.flatten().astype(np.float32) / 32768.0
+
+        transcript = asr_pipeline({
+            "sampling_rate": sample_rate,
+            "raw": audio_float32
+        })
+
         prompt = transcript["text"]
         print(f"Transcribed: {prompt}")

         response_text = generate_response(prompt)
         print(f"Response: {response_text}")

-        sample_rate, audio_array = text_to_speech(response_text)
+        sample_rate, audio_array = gtts_text_to_speech(response_text)
         return (sample_rate, audio_array[0])

     audio_input.change(process_audio, inputs=[audio_input], outputs=[audio_output])
@@ -224,5 +228,12 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--demo", action="store_true", help="Run Gradio demo instead of WebRTC")
     args = parser.parse_args()
-    # would be faster with webRTC but needs to intialize the model to get it to work
+    # hugging face issues
     demo()
+
+    # if args.demo:
+    #     demo()
+    # else:
+    #     # For running with FastRTC
+    #     # You would need to add your FastRTC server code here
+    #     pass
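The new TTS path in this commit can be exercised on its own. Below is a minimal, self-contained sketch of the gTTS-to-numpy route the app now uses, followed by the same int16-to-float32 normalization it applies to microphone audio before Whisper. It assumes gtts, pydub, and scipy are installed, ffmpeg is on PATH for MP3 decoding, and network access to the Google TTS endpoint is available; the file names and the 24 kHz rate are illustrative, not part of the app.

```python
# Standalone sketch of the gTTS -> WAV -> numpy path (mirrors gtts_text_to_speech),
# plus the int16 -> float32 normalization applied before the Whisper pipeline.
# Assumptions: gtts, pydub, scipy installed; ffmpeg available for MP3 decoding;
# network access for the gTTS request. File names and 24 kHz are illustrative.
import numpy as np
from gtts import gTTS
from pydub import AudioSegment
from scipy.io import wavfile

# 1) Synthesize speech to MP3 (gTTS only emits MP3).
gTTS(text="Hello from the voice assistant.", lang="en").save("reply.mp3")

# 2) Decode to mono 24 kHz WAV so downstream code sees a fixed format.
sound = AudioSegment.from_mp3("reply.mp3").set_frame_rate(24000).set_channels(1)
sound.export("reply.wav", format="wav")

# 3) Read back as int16 samples, shaped (1, num_samples) like the app expects.
sample_rate, audio_int16 = wavfile.read("reply.wav")
audio_int16 = audio_int16.reshape(1, -1).astype(np.int16)

# 4) The normalization the commit applies to int16 audio before ASR.
audio_float32 = audio_int16.flatten().astype(np.float32) / 32768.0

# 5) 200 ms chunks at the actual sample rate (4800 samples at 24 kHz).
chunk_size = int(sample_rate * 0.2)
chunks = [audio_int16[:, i:i + chunk_size]
          for i in range(0, audio_int16.shape[1], chunk_size)]
print(sample_rate, audio_float32.shape, len(chunks))
```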
requirements.txt CHANGED
@@ -7,4 +7,7 @@ gradio
 accelerate
 sentencepiece
 fastrtc[vad,tts]
-torchaudio
+torchaudio
+gtts
+pydub
+scipy
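A quick way to sanity-check the newly added dependencies before launching the app is sketched below; it is purely illustrative and not part of the commit. The shutil.which call only probes for the ffmpeg binary, which both the direct subprocess path and the pydub fallback rely on for MP3 decoding.

```python
# Environment check for the dependencies added in this commit (illustrative only).
import shutil

import gtts              # Google Text-to-Speech client (needs network at runtime)
import pydub             # MP3 -> WAV conversion fallback
import scipy.io.wavfile  # reads the converted WAV back as a numpy array

# pydub also needs an MP3 decoder, so ffmpeg (or avconv) should be on PATH.
print("ffmpeg found:", shutil.which("ffmpeg") is not None)
print("gtts, pydub, scipy import OK")
```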