Spaces · Runtime error
Rudy Ong committed · Commit 3038346 · Parent(s): 9c5dc07
Initial commit

app.py CHANGED
@@ -1,71 +1,133 @@
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
-model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-    outputs =
-    speech = model.generate_speech(
-    return 16000,
-description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Dutch. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-!
-    title=title,
-    description=description,
-    examples=[["./example.wav"]],
-    title=title,
-    description=description,
-demo.launch()
+# -*- coding: utf-8 -*-
+"""Built_Speech-to-Speech_Translation.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
+"""
+
+! pip install git+https://github.com/huggingface/transformers.git
+
+! pip install torch
+
+! pip install --upgrade accelerate
+
+! pip install datasets soundfile speechbrain

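The `! pip install` lines above are notebook magic syntax; in a plain `app.py` they are a syntax error, so a Space would normally declare these dependencies in a `requirements.txt` instead. A minimal sketch, with the package list taken from the installs in this file (including the later `gradio` install) and version pins omitted:

```
# requirements.txt (sketch, not part of this commit)
git+https://github.com/huggingface/transformers.git
torch
accelerate
datasets
soundfile
speechbrain
gradio
```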
+"""### Speech Translation to Text"""

+from huggingface_hub import notebook_login
+
+notebook_login()
+
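`notebook_login()` prompts for a token interactively, which only works in a notebook session; a headless Space would typically authenticate from a stored secret instead. A minimal sketch, assuming a Space secret named `HF_TOKEN` (the secret name is an assumption, not part of this commit):

```python
import os
from huggingface_hub import login

# Non-interactive login; HF_TOKEN is an assumed environment variable / Space secret.
login(token=os.environ["HF_TOKEN"])
```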
+import torch
+from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
+pipe = pipeline(
+    "automatic-speech-recognition", model="openai/whisper-base", device=device
+)

+from datasets import load_dataset

+dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
+sample = next(iter(dataset))

+from IPython.display import Audio

+Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

+# Keep task="transcribe" but force language="nl", so Whisper outputs Dutch text
+# regardless of the source language (plain "transcribe" would be ordinary speech recognition)
def translate(audio):
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
    return outputs["text"]

+"""Whisper can also be ‘tricked’ into translating from speech in any language X to any language Y. Simply set the task to "transcribe" and the "language" to your target language in the generation keyword arguments, e.g. for Spanish, one would set:
+
+generate_kwargs={"task": "transcribe", "language": "es"}
+"""
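For comparison with the note above, a small sketch (not part of the commit) contrasting Whisper's built-in any-language-to-English translation with the forced-language trick; it reuses the `pipe` and `sample` objects defined earlier in this file:

```python
# Built-in behaviour: task="translate" always targets English.
to_english = pipe(sample["audio"].copy(), max_new_tokens=256,
                  generate_kwargs={"task": "translate"})

# The 'trick': keep task="transcribe" and declare the *target* language, e.g. Spanish.
to_spanish = pipe(sample["audio"].copy(), max_new_tokens=256,
                  generate_kwargs={"task": "transcribe", "language": "es"})

print(to_english["text"])
print(to_spanish["text"])
```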

+# See the translation result
+translate(sample["audio"].copy())
+
+# Compare to raw text
+sample["raw_text"]
+
+"""### Text-to-Speech"""
+
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+"""Here we're using a SpeechT5 checkpoint trained specifically for Dutch TTS, Bolakubus/speecht5_finetuned_voxpopuli_nl. Should you wish to translate into a language other than Dutch, either swap the checkpoint for a SpeechT5 TTS model fine-tuned on your language of choice, or use an MMS TTS checkpoint pre-trained in your target language."""
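As the docstring above suggests, the Dutch checkpoint can be swapped for another target language. A hedged sketch of the MMS TTS route via the transformers `text-to-speech` pipeline; the checkpoint id `facebook/mms-tts-nld` is an assumption used for illustration, not part of this commit:

```python
from transformers import pipeline

# MMS-TTS publishes one single-language checkpoint per target language.
mms_tts = pipeline("text-to-speech", model="facebook/mms-tts-nld")
out = mms_tts("Dit is een test")  # dict with "audio" and "sampling_rate"
```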
+
+# Put the model and vocoder on the GPU accelerator device if we have one
+model.to(device)
+vocoder.to(device)
+
+# Load speaker embeddings
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+"""We can now write a function that takes a text prompt as input, and generates the corresponding speech. We'll first pre-process the text input using the SpeechT5 processor, tokenizing the text to get our input ids. We'll then pass the input ids and speaker embeddings to the SpeechT5 model, placing each on the accelerator device if available. Finally, we'll return the generated speech, bringing it back to the CPU so that we can play it back in our ipynb notebook:"""
+
+def synthesize(text):
    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(
+        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+    )
    return speech.cpu()

+# Dummy check
+speech = synthesize("This is a test")
+
+Audio(speech, rate=16000)
+
+"""### Creating Speech-to-Speech Translation (STST) Demo"""
+
+import numpy as np
+
+# Normalize the audio array by the dynamic range of the target dtype (int16),
+# then convert from the default NumPy dtype (float64) to the target dtype (int16)
+target_dtype = np.int16
+max_range = np.iinfo(target_dtype).max

def speech_to_speech_translation(audio):
    translated_text = translate(audio)
+    synthesized_speech = synthesize(translated_text)
+    synthesized_speech = (synthesized_speech.numpy() * max_range).astype(np.int16)
+    return 16000, synthesized_speech
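The `max_range` scaling maps the float waveform, which sits roughly in [-1, 1], onto the int16 range before it is handed to the Gradio numpy audio output. A tiny sketch on dummy data (not part of the commit) showing the effect:

```python
import numpy as np

wave = np.array([0.0, 0.5, -1.0])                         # float64 samples in [-1, 1]
scaled = (wave * np.iinfo(np.int16).max).astype(np.int16)
print(scaled)                                             # [0 16383 -32767]
```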

+sampling_rate, synthesized_speech = speech_to_speech_translation(sample["audio"])

+Audio(synthesized_speech, rate=sampling_rate)

+! pip install gradio
+
+import gradio as gr
+from gradio.mix import Series

demo = gr.Blocks()
+description = "Speech-to-Speech Translation En->Nl"
+title = "Building Demo for Audio Course"

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

+demo.launch(share=True, debug=False)
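The Space's "Runtime error" status is consistent with version drift in this file: besides the `! pip install` lines, `gradio.mix` (imported but unused here) was removed in later Gradio releases, and `gr.Audio(source=...)` became `sources=[...]` in Gradio 4. A hedged sketch of how the interface block might look under a Gradio 4 SDK; parameter names are assumed for gradio>=4, and `title`/`description` are the variables defined above:

```python
import gradio as gr

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

with gr.Blocks() as demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()
```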