Spaces:

bobpopboom
/

chaty

Sleeping

App Files Community

hashhac committed on Mar 15

Commit

519f37a

1 Parent(s): 23c481b

debugging?

Browse files

Files changed (1) hide show

app.py +32 -31

app.py CHANGED Viewed

@@ -62,16 +62,20 @@ def load_llm_model():
     return model, tokenizer
 # Step 3: Text-to-Speech with a free model
 def load_tts_model():
     model_id = "microsoft/speecht5_tts"
-    processor = AutoProcessor.from_pretrained(model_id)
-    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
     model.to(device)
     # Load vocoder for waveform generation
     vocoder_id = "microsoft/speecht5_hifigan"
-    vocoder = AutoModelForCausalLM.from_pretrained(vocoder_id)
     vocoder.to(device)
     # Load speaker embeddings
@@ -80,6 +84,30 @@ def load_tts_model():
     return model, processor, vocoder, speaker_embeddings
 # Initialize all models
 print("Loading ASR model...")
 asr_pipeline = load_asr_model()
@@ -137,34 +165,7 @@ def generate_response(prompt):
     return response_text
-def text_to_speech(text):
-    # Prepare inputs
-    inputs = tts_processor(text=text, return_tensors="pt")
-    # Add speaker embeddings
-    inputs["speaker_embeddings"] = speaker_embeddings.to(device)
-    # Generate speech
-    with torch.no_grad():
-        speech = tts_model.generate_speech(
-            inputs["input_ids"].to(device),
-            speaker_embeddings.to(device)
-        )
-    # Convert to waveform using vocoder
-    with torch.no_grad():
-        waveform = tts_vocoder(speech)
-    # Convert to numpy array
-    audio_array = waveform.cpu().numpy().squeeze()
-    # Normalize and convert to int16
-    audio_array = (audio_array / np.max(np.abs(audio_array)) * 32767).astype(np.int16)
-    # Reshape for fastrtc
-    audio_array = audio_array.reshape(1, -1)
-    return (24000, audio_array)  # Using 24kHz sample rate
 def response(audio: tuple[int, np.ndarray]):
     # Step 1: Speech-to-Text

     return model, tokenizer
+# Step 3: Text-to-Speech with a free model
 # Step 3: Text-to-Speech with a free model
 def load_tts_model():
+    # Import the specific SpeechT5 classes
+    from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
     model_id = "microsoft/speecht5_tts"
+    processor = SpeechT5Processor.from_pretrained(model_id)
+    model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
     model.to(device)
     # Load vocoder for waveform generation
     vocoder_id = "microsoft/speecht5_hifigan"
+    vocoder = SpeechT5HifiGan.from_pretrained(vocoder_id)
     vocoder.to(device)
     # Load speaker embeddings
     return model, processor, vocoder, speaker_embeddings
+def text_to_speech(text):
+    # Prepare inputs
+    inputs = tts_processor(text=text, return_tensors="pt")
+    # Generate speech with SpeechT5
+    with torch.no_grad():
+        # Generate speech
+        speech = tts_model.generate_speech(
+            inputs["input_ids"].to(device),
+            speaker_embeddings.to(device),
+            vocoder=tts_vocoder
+        )
+    # Convert to numpy array
+    audio_array = speech.cpu().numpy()
+    # Normalize and convert to int16
+    audio_array = (audio_array / np.max(np.abs(audio_array)) * 32767).astype(np.int16)
+    # Reshape for fastrtc
+    audio_array = audio_array.reshape(1, -1)
+    return (16000, audio_array)  # SpeechT5 uses 16kHz sample rate
 # Initialize all models
 print("Loading ASR model...")
 asr_pipeline = load_asr_model()
     return response_text
 def response(audio: tuple[int, np.ndarray]):
     # Step 1: Speech-to-Text