speech-to-speech

Paused

File size: 2,860 Bytes

226475c
5307e6b
dbc99da
5307e6b
 
226475c
dbc99da
5307e6b
dbc99da
 
 
 
da9d4b3
dbc99da
 
226475c
5307e6b
 
 
dbc99da
 
 
 
226475c
 
 
 
 
dbc99da
 
 
 
 
 
 
 
226475c
dbc99da
226475c
dbc99da
226475c
 
a2d9db4
226475c
 
 
dbc99da
226475c
 
 
 
 
 
 
dbc99da
226475c
 
8e194d1
226475c
 
 
 
 
 
 
8e194d1

import torch
import numpy as np
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor

# All inference in this script runs on this device.
device = "cpu"

# Speech recognition: Whisper large-v2 wrapped in a transformers pipeline.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device=device,
)

# Text-to-speech: Bark processor (text + voice preset -> model inputs) and model.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark").to(device)

# Sample rate of the audio Bark generates, as declared by its generation config.
synthesised_rate = model.generation_config.sample_rate

def translate(audio):
    """Run the speech-recognition pipeline on *audio* and return its text.

    Decoding is pinned to ``task="transcribe"`` with ``language="chinese"``
    and capped at 256 new tokens.

    NOTE(review): despite the function name, the Whisper task is
    "transcribe"; presumably forcing the language is what yields Chinese
    text for non-Chinese speech -- confirm this is intentional.

    Args:
        audio: Input handed straight to the pipeline (the UI in this file
            passes a file path).

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    decoding_options = {"task": "transcribe", "language": "chinese"}
    result = pipe(audio, max_new_tokens=256, generate_kwargs=decoding_options)
    return result["text"]
def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    """Generate speech for *text_prompt* with the Bark model.

    Args:
        text_prompt: Text to be spoken.
        voice_preset: Voice preset name passed to the Bark processor.

    Returns:
        Whatever ``model.generate`` returns for the processed prompt
        (callers in this file treat it as a tensor and call ``.numpy()``).
    """
    # Tokenise the prompt together with the voice preset, then move the
    # resulting inputs to the inference device.
    encoded = processor(text_prompt, voice_preset=voice_preset).to(device)
    return model.generate(**encoded, pad_token_id=10000)
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    """Convert input speech into synthesised speech as 16-bit PCM.

    The audio is first turned into text with ``translate`` and that text is
    then spoken with ``synthesise``.

    Args:
        audio: Input passed through to ``translate``.
        voice_preset: Voice preset name forwarded to ``synthesise``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is the
        module-level ``synthesised_rate`` and ``samples`` is an ``np.int16``
        array with the same shape as the synthesiser output.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text, voice_preset)

    # .cpu() is a no-op for CPU tensors; it keeps the NumPy conversion valid
    # if the module-level device is ever changed to an accelerator.
    samples = waveform.cpu().numpy()

    # Clip before scaling so any sample outside [-1, 1] saturates instead of
    # overflowing when cast to int16.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    return synthesised_rate, pcm
def speech_to_speech_translation_fix(audio, voice_preset="v2/zh_speaker_1"):
    """Run ``speech_to_speech_translation`` and transpose the waveform.

    NOTE(review): presumably the synthesiser output is laid out as
    (batch, samples) while the Gradio audio output wants samples on the
    first axis -- confirm against the Bark and Gradio documentation.

    Args:
        audio: Input passed through to ``speech_to_speech_translation``.
        voice_preset: Voice preset name forwarded unchanged.

    Returns:
        A ``(sample_rate, samples)`` tuple with ``samples`` transposed.
    """
    rate, pcm = speech_to_speech_translation(audio, voice_preset)
    return rate, pcm.T

# Heading shown at the top of each Gradio interface.
title = "Multilanguage to Chinese(mandarin) Cascaded STST"

# Markdown blurb shown under the title. The model names and links match the
# checkpoints actually loaded by this script: openai/whisper-large-v2 and
# suno/bark.
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in Multilanguage to target speech in Chinese(mandarin). Demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and a [suno/bark](https://huggingface.co/suno/bark) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
import gradio as gr
# Top-level container; the tabbed layout is attached to it further down.
demo = gr.Blocks()

# Keyword arguments common to both interfaces: same callback, same header text.
_shared_interface_options = {
    "fn": speech_to_speech_translation_fix,
    "title": title,
    "description": description,
}

# Tab 1: audio recorded from the microphone, handed to the callback as a file path.
mic_translate = gr.Interface(
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    **_shared_interface_options,
)

# Tab 2: an uploaded audio file, with one bundled example.
file_translate = gr.Interface(
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    **_shared_interface_options,
)

# Mount both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

demo.launch(share=True)