Spaces:

bobpopboom
/

audio1test

Sleeping

hashhac committed on Mar 15

Commit

36420ca

1 Parent(s): be00791

embeddings added

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5For
 import soundfile as sf
 import tempfile
 import os
 # Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -14,6 +15,10 @@ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
 asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 # Function to convert speech to text
 def speech_to_text(audio_dict):
     # Extract the audio array from the dictionary
@@ -33,7 +38,10 @@ def speech_to_text(audio_dict):
 def text_to_speech(text):
     inputs = processor(text=text, return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
-        speech = tts_model.generate_speech(inputs)
     return speech
 # Gradio demo

 import soundfile as sf
 import tempfile
 import os
+from datasets import load_dataset
 # Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
 asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+# Load speaker embeddings
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
 # Function to convert speech to text
 def speech_to_text(audio_dict):
     # Extract the audio array from the dictionary
 def text_to_speech(text):
     inputs = processor(text=text, return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
+        speech = tts_model.generate_speech(
+            inputs,
+            speaker_embeddings=speaker_embeddings
+        )
     return speech
 # Gradio demo