import torch
import numpy as np
import gradio as gr
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor

# Run everything on the CPU; switch to "cuda:0" if a GPU is available.
device = "cpu"

# Whisper large-v2 covers the speech-recognition/translation leg of the cascade.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)

# Bark covers the text-to-speech leg; keep its native sample rate for playback.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate

def translate(audio):
    # Whisper's built-in "translate" task only targets English, so we force
    # task="transcribe" with language="chinese" to make it decode directly
    # into Chinese regardless of the source language.
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
    return outputs["text"]

def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    # Bark conditions generation on a speaker preset and returns a float
    # waveform tensor of shape (batch, samples).
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output
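
# A quick standalone check of the TTS leg (a sketch, commented out so it does
# not run at import time; the Chinese sample text is just an illustration):
# waveform = synthesise("你好，世界")  # -> float tensor of shape (1, num_samples)
# print(waveform.shape, waveform.dtype)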

def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Scale the float waveform to 16-bit PCM for Gradio's numpy audio format,
    # and return Bark's actual sample rate rather than a hardcoded one.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return synthesised_rate, synthesised_speech

def speech_to_speech_translation_fix(audio, voice_preset="v2/zh_speaker_1"):
    # Transpose (1, samples) -> (samples, 1) so Gradio treats the array as mono audio.
    synthesised_rate, synthesised_speech = speech_to_speech_translation(audio, voice_preset)
    return synthesised_rate, synthesised_speech.T
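
# Minimal offline sketch of the full cascade (uses one of the bundled example
# clips below; the scipy dependency and output filename are assumptions for
# illustration, not part of the app):
# import scipy.io.wavfile as wavfile
# rate, audio = speech_to_speech_translation_fix("./mama (1).mp3")
# wavfile.write("translated.wav", rate, audio)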

title = "Multilanguage to Chinese (Mandarin) Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in multiple languages to target speech in Chinese (Mandarin). The demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and Suno's [Bark](https://huggingface.co/suno/bark) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
examples = [
    ["./mama (1).mp3", None],
    ["./mama (2).mp3", None],
    ["./mama (3).mp3", None],
    ["./mama (4).mp3", None],
    ["./mama (5).mp3", None],
    ["./mama (6).mp3", None],
    ["./mama (7).mp3", None],
    ["./mama (8).mp3", None],
]
demo = gr.Blocks()

# Muti_translate=gr.Interface(
#     fn=speech_to_speech_translation_fix,
#     inputs=[
#         gr.Audio(label="Upload Speech", source="upload", type="filepath"),
#         gr.Audio(label="Record Speech", source="microphone", type="filepath"),
#     ],
#     outputs=[
#         gr.Audio(label="Generated Speech", type="numpy"),
#         gr.Text(label="Transcription"),
#     ],
#     title=title,
#     description=description,
#     examples=examples,
# )

mic_translate = gr.Interface(
    fn=speech_to_speech_translation_fix,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation_fix,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=examples,
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
    #gr.TabbedInterface([Muti_translate], ["Record or upload your speech"])

demo.launch(share=True)