hashhac committed on
Commit
ca032b0
·
1 Parent(s): 36420ca

added ids, removed datasets

Browse files
Files changed (1) hide show
  1. app.py +4 -7
app.py CHANGED
@@ -5,7 +5,6 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5For
5
  import soundfile as sf
6
  import tempfile
7
  import os
8
- from datasets import load_dataset
9
 
10
  # Check if CUDA is available, otherwise use CPU
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -15,10 +14,6 @@ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
15
  asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
16
  tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
17
 
18
- # Load speaker embeddings
19
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
20
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
21
-
22
  # Function to convert speech to text
23
  def speech_to_text(audio_dict):
24
  # Extract the audio array from the dictionary
@@ -37,10 +32,12 @@ def speech_to_text(audio_dict):
37
  # Function to convert text to speech
38
  def text_to_speech(text):
39
  inputs = processor(text=text, return_tensors="pt").input_ids.to(device)
 
 
40
  with torch.no_grad():
41
  speech = tts_model.generate_speech(
42
- inputs,
43
- speaker_embeddings=speaker_embeddings
44
  )
45
  return speech
46
 
 
5
  import soundfile as sf
6
  import tempfile
7
  import os
 
8
 
9
  # Check if CUDA is available, otherwise use CPU
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
14
  asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
15
  tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
16
 
 
 
 
 
17
  # Function to convert speech to text
18
  def speech_to_text(audio_dict):
19
  # Extract the audio array from the dictionary
 
32
# Function to convert text to speech
def text_to_speech(text):
    """Synthesize a speech waveform for `text` with the SpeechT5 TTS model.

    Args:
        text: Input string to synthesize.

    Returns:
        A 1-D torch tensor containing the generated waveform
        (as produced by `generate_speech`).
    """
    inputs = processor(text=text, return_tensors="pt").input_ids.to(device)
    # BUG FIX: `generate_speech` has no `decoder_input_ids` parameter, so the
    # previous call raised TypeError. It expects `speaker_embeddings` — a
    # (1, 512) x-vector. Use a neutral all-zeros embedding as a fallback that
    # needs no external dataset (voice quality is degraded vs. a real x-vector).
    speaker_embeddings = torch.zeros((1, 512), dtype=torch.float32).to(device)
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs,
            speaker_embeddings=speaker_embeddings,
        )
    return speech
43