speech-to-speech

Paused

File size: 3,378 Bytes

226475c
5307e6b
0041ae6
dbc99da
5307e6b
 
226475c
374bd91
5307e6b
dbc99da
 
 
0041ae6
dbc99da
da9d4b3
dbc99da
 
226475c
0041ae6
 
5307e6b
0041ae6
 
 
 
 
dbc99da
 
 
 
 
0041ae6
dbc99da
 
0041ae6
226475c
dbc99da
226475c
dbc99da
226475c
 
0bc8a9a
69c7afe
 
 
 
 
 
 
 
 
 
0bc8a9a
a2d9db4
226475c
b9359f0
 
0041ae6
b9359f0
 
 
 
0041ae6
b9359f0
9e2b006
 
b9359f0
9e2b006
b9359f0
0041ae6
b9359f0
 
 
 
0041ae6
b9359f0
9e2b006
 
 
226475c
b9359f0
 
 
 
226475c
8e194d1

import torch
import numpy as np
import soundfile as sf
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor

# Run everything on the first CUDA GPU when torch can see one, else on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Speech recogniser used by translate() to turn input audio into text.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device=device,
)

# Audio classifier used by translate() to score the language of the input audio.
label = pipeline(
    task="audio-classification",
    model="facebook/mms-lid-126",
    device=device,
)

# Bark text-to-speech: the processor prepares the text prompt and voice preset,
# the model generates the waveform.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark").to(device)

# Sample rate reported by the Bark generation config; returned with the audio.
synthesised_rate = model.generation_config.sample_rate

def translate(audio_file):
    """Transcribe an audio file and score which language is being spoken.

    Args:
        audio_file: Anything ``soundfile.read`` accepts (here, a file path
            handed over by the Gradio audio component).

    Returns:
        A ``(text, label_outputs)`` tuple: ``text`` is the ``"text"`` field of
        the speech-recognition pipeline output, and ``label_outputs`` maps each
        predicted language label to its score.
    """
    # Read as float32 so the waveform is already in the dtype the models use.
    audio, sampling_rate = sf.read(audio_file, dtype="float32")

    # soundfile returns a (frames, channels) array for multi-channel files;
    # the pipelines expect a single channel, so average the channels to mono.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Pass the sampling rate together with the samples so the recogniser can
    # resample, instead of assuming the file is already at the model's rate.
    # A fresh dict is built for each pipeline call in case a pipeline consumes
    # the keys of the dict it is given.
    outputs = pipe(
        {"array": audio, "sampling_rate": sampling_rate},
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "chinese"},
    )
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})

    label_outputs = {pred["label"]: pred["score"] for pred in language_prediction}
    return outputs["text"], label_outputs
def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
    """Generate speech for ``text_prompt`` with the Bark model.

    Args:
        text_prompt: Text to be spoken.
        voice_preset: Bark voice preset name forwarded to the processor.

    Returns:
        Whatever ``model.generate`` returns for the prepared inputs; it is
        left on ``device`` and not converted.
    """
    bark_inputs = processor(text_prompt, voice_preset=voice_preset).to(device)
    # NOTE(review): pad_token_id=10000 is presumably Bark's semantic pad token;
    # confirm against the Bark generation config.
    return model.generate(**bark_inputs, pad_token_id=10000)
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    """Run the full cascade: recognise the input speech, then re-synthesise it.

    Args:
        audio: Audio file path (or file-like object) forwarded to ``translate``.
        voice_preset: Bark voice preset name forwarded to ``synthesise``.

    Returns:
        A 3-tuple ``((sample_rate, samples), text, label_outputs)`` where
        ``samples`` is an int16 NumPy array (the transposed model output),
        ``text`` is the recognised text, and ``label_outputs`` maps language
        labels to scores.
    """
    translated_text, label_outputs = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)

    # The generated tensor lives on `device`; a CUDA tensor cannot be turned
    # into a NumPy array directly, so move it to host memory first (this is a
    # no-op when already on the CPU).
    waveform = synthesised_speech.cpu().numpy()

    # Clip before scaling so samples slightly outside [-1, 1] saturate instead
    # of wrapping around when cast to int16.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # NOTE(review): the transpose presumably turns (batch, samples) into
    # (samples, channels) for the Gradio audio output — confirm.
    return (synthesised_rate, pcm.T), translated_text, label_outputs

# Title passed to both Gradio interfaces below.
title = "Multilanguage to Chinese(mandarin) Cascaded STST"

# Markdown description passed to both Gradio interfaces. The model names and
# links match the checkpoints actually loaded above (whisper-large-v2 and
# suno/bark).
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in Multilanguage to target speech in Chinese(mandarin). Demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and the [suno/bark](https://huggingface.co/suno/bark) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Locale codes of the sample clips expected next to this script as
# "./<locale>.mp3".
_EXAMPLE_LOCALES = (
    "cs-CZ",
    "de-DE",
    "es-ES",
    "fr-FR",
    "it-IT",
    "ko-KR",
    "nl-NL",
    "pl-PL",
    "pt-PT",
    "ru-RU",
)

# One row per clip: [audio path, None].
# NOTE(review): the trailing None has no matching input component in the
# interface below — presumably a placeholder for the voice preset; confirm.
examples = [[f"./{locale}.mp3", None] for locale in _EXAMPLE_LOCALES]
import gradio as gr

def _make_interface(audio_source, example_rows=None):
    """Build one speech-to-speech interface fed by the given audio source.

    Args:
        audio_source: Value for the ``source`` argument of the input
            ``gr.Audio`` component ("upload" or "microphone").
        example_rows: Optional example rows forwarded as ``examples``.

    Returns:
        The configured ``gr.Interface``.
    """
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=[
            gr.Audio(label="Generated Speech", type="numpy"),
            gr.Text(label="Transcription"),
            gr.Label(label="Language prediction"),
        ],
        title=title,
        description=description,
        examples=example_rows,
    )


demo = gr.Blocks()

# Same pipeline behind both tabs; only the file-upload tab offers the examples.
file_transcribe = _make_interface("upload", examples)
mic_transcribe = _make_interface("microphone")

tab_interfaces = [file_transcribe, mic_transcribe]
tab_titles = ["Transcribe Audio File", "Transcribe Microphone"]

with demo:
    gr.TabbedInterface(tab_interfaces, tab_titles)

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)