hashhac committed on
Commit
ca032b0
·
1 Parent(s): 36420ca

added ids, removed datasets

Browse files
Files changed (1) hide show
  1. app.py +4 -7
app.py CHANGED
@@ -5,7 +5,6 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5For
5
  import soundfile as sf
6
  import tempfile
7
  import os
8
- from datasets import load_dataset
9
 
10
  # Check if CUDA is available, otherwise use CPU
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -15,10 +14,6 @@ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
15
  asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
16
  tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
17
 
18
- # Load speaker embeddings
19
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
20
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
21
-
22
  # Function to convert speech to text
23
  def speech_to_text(audio_dict):
24
  # Extract the audio array from the dictionary
@@ -37,10 +32,12 @@ def speech_to_text(audio_dict):
37
  # Function to convert text to speech
38
  def text_to_speech(text):
39
  inputs = processor(text=text, return_tensors="pt").input_ids.to(device)
 
 
40
  with torch.no_grad():
41
  speech = tts_model.generate_speech(
42
- inputs,
43
- speaker_embeddings=speaker_embeddings
44
  )
45
  return speech
46
 
 
5
  import soundfile as sf
6
  import tempfile
7
  import os
 
8
 
9
  # Check if CUDA is available, otherwise use CPU
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
14
  asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
15
  tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
16
 
 
 
 
 
17
  # Function to convert speech to text
18
  def speech_to_text(audio_dict):
19
  # Extract the audio array from the dictionary
 
32
# Function to convert text to speech
def text_to_speech(text):
    """Synthesize a speech waveform for `text` with the SpeechT5 TTS model.

    Args:
        text: Input string to synthesize.

    Returns:
        A 1-D torch tensor containing the generated waveform
        (as produced by `generate_speech`).
    """
    inputs = processor(text=text, return_tensors="pt").input_ids.to(device)
    # BUG FIX: `generate_speech` has no `decoder_input_ids` parameter, so the
    # previous call raised TypeError. It expects `speaker_embeddings` — a
    # (1, 512) x-vector. Use a neutral all-zeros embedding as a fallback that
    # needs no external dataset (voice quality is degraded vs. a real x-vector).
    speaker_embeddings = torch.zeros((1, 512), dtype=torch.float32).to(device)
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs,
            speaker_embeddings=speaker_embeddings,
        )
    return speech
43