Rudy Ong committed on
Commit 3038346 · 1 Parent(s): 9c5dc07

Initial commit

Files changed (1)
  1. app.py +93 -31
app.py CHANGED
@@ -1,71 +1,133 @@
- import gradio as gr
- import numpy as np
- import torch
- from datasets import load_dataset
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

  device = "cuda:0" if torch.cuda.is_available() else "cpu"

- # load speech translation checkpoint
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

- # load text-to-speech checkpoint and speaker embeddings
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl").to(device)
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

  def translate(audio):
-     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
      return outputs["text"]

- def synthesise(text):
      inputs = processor(text=text, return_tensors="pt")
-     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
      return speech.cpu()

  def speech_to_speech_translation(audio):
      translated_text = translate(audio)
-     synthesised_speech = synthesise(translated_text)
-     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-     return 16000, synthesised_speech

- title = "Cascaded STST"
- description = """
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Dutch. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-
- ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
- """

  demo = gr.Blocks()

  mic_translate = gr.Interface(
      fn=speech_to_speech_translation,
      inputs=gr.Audio(source="microphone", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     title=title,
-     description=description,
  )

  file_translate = gr.Interface(
      fn=speech_to_speech_translation,
      inputs=gr.Audio(source="upload", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     examples=[["./example.wav"]],
-     title=title,
-     description=description,
  )

  with demo:
      gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

- demo.launch()
+ # -*- coding: utf-8 -*-
+ """Built_Speech-to-Speech_Translation.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
+ """
+
+ ! pip install git+https://github.com/huggingface/transformers.git
+ ! pip install torch
+ ! pip install --upgrade accelerate
+ ! pip install datasets soundfile speechbrain
+
+ """### Speech Translation to Text"""
+
+ from huggingface_hub import notebook_login
+
+ notebook_login()
+
+ import torch
+ from transformers import pipeline
+
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ pipe = pipeline(
+     "automatic-speech-recognition", model="openai/whisper-base", device=device
+ )
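The pipeline accepts a path to an audio file directly, so a quick smoke test is possible before loading any dataset. A minimal sketch, not part of this commit, assuming some audio clip `my_clip.wav` exists on disk (the file name is hypothetical):

```python
# Smoke-test the ASR pipeline on a local audio file (hypothetical path)
print(pipe("my_clip.wav", max_new_tokens=64)["text"])
```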
+
+ from datasets import load_dataset
+
+ dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
+ sample = next(iter(dataset))
+
+ from IPython.display import Audio
+
+ Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
+
+ # With generate_kwargs={"task": "transcribe", "language": "nl"}, Whisper emits
+ # Dutch text whatever the source language; the built-in "translate" task, by
+ # contrast, always targets English
  def translate(audio):
+     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
      return outputs["text"]
+
+ """Whisper can also be 'tricked' into translating from speech in any language X into any other language Y it supports: simply set the task to "transcribe" and "language" to your target language in the generation keyword arguments. For Spanish, for example, one would set:
+
+     generate_kwargs={"task": "transcribe", "language": "es"}
+ """
+
+ # See the translation result
+ translate(sample["audio"].copy())
+
+ # Compare to raw text
+ sample["raw_text"]
+
+ """### Text-to-Speech"""
+
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+ model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+ """Here we're using a SpeechT5 checkpoint fine-tuned specifically for Dutch TTS, Bolakubus/speecht5_finetuned_voxpopuli_nl. Should you wish to translate into a language other than Dutch, either swap the checkpoint for a SpeechT5 TTS model fine-tuned on your language of choice, or use an MMS TTS checkpoint pre-trained in your target language."""
+
+ # Move the model and vocoder to the GPU accelerator device, if we have one
+ model.to(device)
+ vocoder.to(device)
+
+ # Load the speaker embeddings (x-vectors)
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
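Row 7306 is just one speaker from the CMU Arctic x-vector set; any other row yields a different voice. A one-line sketch with an arbitrary alternative index:

```python
# Swap in a different x-vector to change the synthesized voice (index is arbitrary)
alt_speaker_embeddings = torch.tensor(embeddings_dataset[5000]["xvector"]).unsqueeze(0)
```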
+
+ """We can now write a function that takes a text prompt as input and generates the corresponding speech. We'll first pre-process the text input using the SpeechT5 processor, tokenizing the text to get our input ids. We'll then pass the input ids and speaker embeddings to the SpeechT5 model, placing each on the accelerator device if available. Finally, we'll return the generated speech, bringing it back to the CPU so that we can play it back in the notebook:"""
+
+ def synthesize(text):
      inputs = processor(text=text, return_tensors="pt")
+     speech = model.generate_speech(
+         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+     )
      return speech.cpu()
+
+ # Sanity check
+ speech = synthesize("This is a test")
+
+ Audio(speech, rate=16000)
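Since soundfile was installed at the top of the notebook, the sanity-check output can also be written to disk rather than only played inline; a small sketch (the file name is arbitrary):

```python
# Persist the generated audio; SpeechT5 produces 16 kHz mono output
import soundfile as sf

sf.write("test_tts.wav", speech.numpy(), samplerate=16000)
```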
+
+ """### Creating Speech-to-Speech Translation (STST) Demo"""
+
+ import numpy as np
+
+ # Normalize the audio array by the dynamic range of the target dtype (int16),
+ # then convert from the default NumPy dtype (float64) to the target dtype
+ target_dtype = np.int16
+ max_range = np.iinfo(target_dtype).max
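To see what that conversion does, a quick illustration on dummy data (not part of the commit):

```python
# Float audio in [-1.0, 1.0] maps onto the int16 range [-32767, 32767]
dummy = np.array([-1.0, 0.0, 0.5, 1.0])
print((dummy * max_range).astype(target_dtype))  # [-32767      0  16383  32767]
```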
 
  def speech_to_speech_translation(audio):
      translated_text = translate(audio)
+     synthesized_speech = synthesize(translated_text)
+     synthesized_speech = (synthesized_speech.numpy() * max_range).astype(np.int16)
+     return 16000, synthesized_speech
+
+ sampling_rate, synthesized_speech = speech_to_speech_translation(sample["audio"])
+
+ Audio(synthesized_speech, rate=sampling_rate)
 
 
 
+
+ ! pip install gradio
+
+ import gradio as gr
+
  demo = gr.Blocks()
+ description = "Speech-to-Speech Translation En->Nl"
+ title = "Building Demo for Audio Course"
+
  mic_translate = gr.Interface(
      fn=speech_to_speech_translation,
      inputs=gr.Audio(source="microphone", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
  )

  file_translate = gr.Interface(
      fn=speech_to_speech_translation,
      inputs=gr.Audio(source="upload", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
  )

  with demo:
      gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

+ demo.launch(share=True, debug=False)
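One version caveat, hedged since the pip install above is unpinned: in Gradio 4.x the `source=` argument of `gr.Audio` became a `sources=` list, so on a 4.x install the microphone interface would be declared roughly as:

```python
# Gradio 4.x spelling (assumes a 4.x release was installed)
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)
```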