Spaces · Runtime error
Rudy Ong committed · Commit 3038346 · Parent(s): 9c5dc07
Initial commit

app.py CHANGED
@@ -1,71 +1,133 @@
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
-model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-    outputs =
-    speech = model.generate_speech(
-    return 16000,
-description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Dutch. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-!
-    title=title,
-    description=description,
-    examples=[["./example.wav"]],
-    title=title,
-    description=description,
-demo.launch()
+# -*- coding: utf-8 -*-
+"""Built_Speech-to-Speech_Translation.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
+"""
+
+! pip install git+https://github.com/huggingface/transformers.git
+
+! pip install torch
+
+! pip install --upgrade accelerate
+
+! pip install datasets soundfile speechbrain

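The `! pip install` lines above are notebook magic syntax; in a plain `app.py` they are a syntax error, so a Space would normally declare these dependencies in a `requirements.txt` instead. A minimal sketch, with the package list taken from the installs in this file (including the later `gradio` install) and version pins omitted:

```
# requirements.txt (sketch, not part of this commit)
git+https://github.com/huggingface/transformers.git
torch
accelerate
datasets
soundfile
speechbrain
gradio
```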
+"""### Speech Translation to Text"""

+from huggingface_hub import notebook_login
+
+notebook_login()
+
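`notebook_login()` prompts for a token interactively, which only works in a notebook session; a headless Space would typically authenticate from a stored secret instead. A minimal sketch, assuming a Space secret named `HF_TOKEN` (the secret name is an assumption, not part of this commit):

```python
import os
from huggingface_hub import login

# Non-interactive login; HF_TOKEN is an assumed environment variable / Space secret.
login(token=os.environ["HF_TOKEN"])
```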
+import torch
+from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
+pipe = pipeline(
+    "automatic-speech-recognition", model="openai/whisper-base", device=device
+)

+from datasets import load_dataset

+dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
+sample = next(iter(dataset))

+from IPython.display import Audio

+Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

+# Keep task="transcribe" but force language="nl", so Whisper outputs Dutch text
+# regardless of the source language (plain "transcribe" would be ordinary speech recognition)
def translate(audio):
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
    return outputs["text"]

+"""Whisper can also be ‘tricked’ into translating from speech in any language X to any language Y. Simply set the task to "transcribe" and the "language" to your target language in the generation keyword arguments, e.g. for Spanish, one would set:
+
+generate_kwargs={"task": "transcribe", "language": "es"}
+"""
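For comparison with the note above, a small sketch (not part of the commit) contrasting Whisper's built-in any-language-to-English translation with the forced-language trick; it reuses the `pipe` and `sample` objects defined earlier in this file:

```python
# Built-in behaviour: task="translate" always targets English.
to_english = pipe(sample["audio"].copy(), max_new_tokens=256,
                  generate_kwargs={"task": "translate"})

# The 'trick': keep task="transcribe" and declare the *target* language, e.g. Spanish.
to_spanish = pipe(sample["audio"].copy(), max_new_tokens=256,
                  generate_kwargs={"task": "transcribe", "language": "es"})

print(to_english["text"])
print(to_spanish["text"])
```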

+# See the translation result
+translate(sample["audio"].copy())
+
+# Compare to raw text
+sample["raw_text"]
+
+"""### Text-to-Speech"""
+
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+"""Here we're using a SpeechT5 checkpoint trained specifically for Dutch TTS, Bolakubus/speecht5_finetuned_voxpopuli_nl. Should you wish to translate into a language other than Dutch, either swap the checkpoint for a SpeechT5 TTS model fine-tuned on your language of choice, or use an MMS TTS checkpoint pre-trained in your target language."""
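As the docstring above suggests, the Dutch checkpoint can be swapped for another target language. A hedged sketch of the MMS TTS route via the transformers `text-to-speech` pipeline; the checkpoint id `facebook/mms-tts-nld` is an assumption used for illustration, not part of this commit:

```python
from transformers import pipeline

# MMS-TTS publishes one single-language checkpoint per target language.
mms_tts = pipeline("text-to-speech", model="facebook/mms-tts-nld")
out = mms_tts("Dit is een test")  # dict with "audio" and "sampling_rate"
```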
+
+# Put the model and vocoder on the GPU accelerator device if we have one
+model.to(device)
+vocoder.to(device)
+
+# Load speaker embeddings
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+"""We can now write a function that takes a text prompt as input, and generates the corresponding speech. We'll first pre-process the text input using the SpeechT5 processor, tokenizing the text to get our input ids. We'll then pass the input ids and speaker embeddings to the SpeechT5 model, placing each on the accelerator device if available. Finally, we'll return the generated speech, bringing it back to the CPU so that we can play it back in our ipynb notebook:"""
+
+def synthesize(text):
    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(
+        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+    )
    return speech.cpu()

+# Dummy check
+speech = synthesize("This is a test")
+
+Audio(speech, rate=16000)
+
+"""### Creating Speech-to-Speech Translation (STST) Demo"""
+
+import numpy as np
+
+# Normalize the audio array by the dynamic range of the target dtype (int16),
+# then convert from the default NumPy dtype (float64) to the target dtype (int16)
+target_dtype = np.int16
+max_range = np.iinfo(target_dtype).max

def speech_to_speech_translation(audio):
    translated_text = translate(audio)
+    synthesized_speech = synthesize(translated_text)
+    synthesized_speech = (synthesized_speech.numpy() * max_range).astype(np.int16)
+    return 16000, synthesized_speech
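The `max_range` scaling maps the float waveform, which sits roughly in [-1, 1], onto the int16 range before it is handed to the Gradio numpy audio output. A tiny sketch on dummy data (not part of the commit) showing the effect:

```python
import numpy as np

wave = np.array([0.0, 0.5, -1.0])                         # float64 samples in [-1, 1]
scaled = (wave * np.iinfo(np.int16).max).astype(np.int16)
print(scaled)                                             # [0 16383 -32767]
```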

+sampling_rate, synthesized_speech = speech_to_speech_translation(sample["audio"])

+Audio(synthesized_speech, rate=sampling_rate)

+! pip install gradio
+
+import gradio as gr
+from gradio.mix import Series

demo = gr.Blocks()
+description = "Speech-to-Speech Translation En->Nl"
+title = "Building Demo for Audio Course"

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

+demo.launch(share=True, debug=False)
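The Space's "Runtime error" status is consistent with version drift in this file: besides the `! pip install` lines, `gradio.mix` (imported but unused here) was removed in later Gradio releases, and `gr.Audio(source=...)` became `sources=[...]` in Gradio 4. A hedged sketch of how the interface block might look under a Gradio 4 SDK; parameter names are assumed for gradio>=4, and `title`/`description` are the variables defined above:

```python
import gradio as gr

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

with gr.Blocks() as demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()
```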