speech-to-speech-translation

Runtime error

App Files Files Community

Daniel981215 committed on Apr 15, 2024

Commit

1b9a7a0

verified ·

1 Parent(s): dbfdf1a

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -8

app.py CHANGED Viewed

@@ -9,21 +9,35 @@ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Proce
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
-    return outputs["text"]
 def synthesise(text):
@@ -41,10 +55,7 @@ def speech_to_speech_translation(audio):
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 demo = gr.Blocks()

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("Daniel981215/speecht5_tts_finetuned_voxpopuli_es").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# Substitution table handed to normalize_replace_string(): accented vowels are
# mapped to their unaccented form, Spanish question marks are removed, and each
# digit is spelled out as its Spanish word. Substitutions are per character, so
# a multi-digit number becomes the concatenation of its digit words.
replacements = {
    'á': 'a',
    'é': 'e',
    'í': 'i',
    'ó': 'o',
    'ú': 'u',
    '¿': '',
    '?': '',
    '1': 'uno',
    '2': 'dos',
    '3': 'tres',
    '4': 'cuatro',
    '5': 'cinco',
    '6': 'seis',  # was `'6', 'seis'` (comma instead of colon) -> SyntaxError
    '7': 'siete',
    '8': 'ocho',
    '9': 'nueve',
    '0': 'cero',
}
def normalize_replace_string(input_string, replacements):
    """Return *input_string* normalised and with substitutions applied.

    The text is first normalised: every run of whitespace becomes a single
    space, leading/trailing whitespace is removed, and the result is
    lower-cased. Each key of *replacements* is then replaced by its value,
    in the mapping's iteration order.

    Args:
        input_string: The text to clean up.
        replacements: Mapping of substring -> replacement string. Keys are
            matched against the already lower-cased text.

    Returns:
        The normalised, substituted string.
    """
    # str.split() with no argument splits on runs of whitespace and discards
    # leading/trailing runs, so re-joining with one space collapses and trims
    # in a single step. This yields the same result as
    # re.sub(r'\s+', ' ', s).strip() without depending on the `re` module,
    # which this file is not shown to import.
    normalized_string = ' '.join(input_string.split()).lower()
    for old, new in replacements.items():
        normalized_string = normalized_string.replace(old, new)
    return normalized_string
 def translate(audio):
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
+    output_txt = normalize_replace_string(outputs["text"], replacements)
+    return output_txt
 def synthesise(text):
 title = "Cascaded STST"
 description = """
+speech-to-speech translation (STST)
 """
 demo = gr.Blocks()