Daniel981215 committed on
Commit
1b9a7a0
verified
1 Parent(s): dbfdf1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -8
app.py CHANGED
@@ -9,21 +9,35 @@ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Proce
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
10
 
11
  # load speech translation checkpoint
12
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
 
17
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
18
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
 
20
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
21
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def translate(audio):
25
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
26
- return outputs["text"]
 
 
27
 
28
 
29
  def synthesise(text):
@@ -41,10 +55,7 @@ def speech_to_speech_translation(audio):
41
 
42
  title = "Cascaded STST"
43
  description = """
44
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
45
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
46
-
47
- ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
48
  """
49
 
50
  demo = gr.Blocks()
 
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
10
 
11
  # load speech translation checkpoint
12
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
 
17
+ model = SpeechT5ForTextToSpeech.from_pretrained("Daniel981215/speecht5_tts_finetuned_voxpopuli_es").to(device)
18
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
 
20
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
21
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
22
 
23
 
24
+ replacements = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u', '¿': '', '?': '', '1': 'uno', '2':'dos','3':'tres', '4':'cuatro', '5':'cinco',
25
+ '6', 'seis', '7':'siete', '8':'ocho', '9':'nueve', '0':'cero'}
26
+
27
+ def normalize_replace_string(input_string, replacements):
28
+ normalized_string = re.sub(r'\s+', ' ', input_string).strip().lower()
29
+
30
+ for old, new in replacements.items():
31
+ normalized_string = normalized_string.replace(old, new)
32
+
33
+ return normalized_string
34
+
35
+
36
  def translate(audio):
37
+ outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
38
+ output_txt = normalize_replace_string(outputs["text"], replacements)
39
+
40
+ return output_txt
41
 
42
 
43
  def synthesise(text):
 
55
 
56
  title = "Cascaded STST"
57
  description = """
58
+ speech-to-speech translation (STST)
 
 
 
59
  """
60
 
61
  demo = gr.Blocks()