import torch
import numpy as np
import gradio as gr
from transformers import pipeline, BarkModel, AutoProcessor

device = "cpu"
# Whisper large-v2 handles the speech-recognition/translation half of the cascade.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)

# Bark handles the Chinese text-to-speech half.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate  # 24 kHz for Bark
def translate(audio):
    # Gradio's type="numpy" audio arrives as a (sample_rate, int16 array) tuple;
    # repackage it into the dict format the ASR pipeline expects.
    if isinstance(audio, tuple):
        sampling_rate, raw = audio
        audio = {"sampling_rate": sampling_rate, "raw": raw.astype(np.float32) / 32768.0}
    # task="transcribe" with language="chinese" forces Whisper to emit Mandarin text.
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
    return outputs["text"]

def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output
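# Bark ships ten Mandarin presets (v2/zh_speaker_0 ... v2/zh_speaker_9); a quick
# way to audition a different voice, assuming the models above are loaded
# (uncomment to try):
#
# sample = synthesise("你好，世界", voice_preset="v2/zh_speaker_3")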
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Convert the float waveform to 16-bit PCM for Gradio's audio component.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return synthesised_rate, synthesised_speech, translated_text
def speech_to_speech_translation_fix(uploaded_audio, recorded_audio, voice_preset="v2/zh_speaker_1"):
    # Prefer the microphone recording when present; otherwise use the uploaded file.
    audio = recorded_audio if recorded_audio is not None else uploaded_audio
    synthesised_rate, synthesised_speech, translated_text = speech_to_speech_translation(audio, voice_preset)
    # Bark returns shape (1, num_samples); transpose to (num_samples, 1) for Gradio.
    return synthesised_rate, synthesised_speech.T, translated_text
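# End-to-end smoke test outside Gradio, a minimal sketch assuming scipy is
# installed and one of the example clips below exists (uncomment to try):
#
# import scipy.io.wavfile as wavfile
# rate, speech, text = speech_to_speech_translation_fix("./mama (1).mp3", None)
# print(text)
# wavfile.write("translated.wav", rate, speech)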
title = "Multilanguage to Chinese (Mandarin) Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in multiple languages to target speech in Chinese (Mandarin). The demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation and Suno's [Bark](https://huggingface.co/suno/bark) model for text-to-speech.
"""
examples = [
    ["./mama (1).mp3", None],
    ["./mama (2).mp3", None],
    ["./mama (3).mp3", None],
    ["./mama (4).mp3", None],
    ["./mama (5).mp3", None],
    ["./mama (6).mp3", None],
    ["./mama (7).mp3", None],
    ["./mama (8).mp3", None],
]
demo = gr.Blocks()

multi_translate = gr.Interface(
    fn=speech_to_speech_translation_fix,
    inputs=[
        gr.Audio(label="Upload Speech", source="upload", type="numpy"),
        gr.Audio(label="Record Speech", source="microphone", type="numpy"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
    ],
    title=title,
    description=description,
    examples=examples,
)
# mic_translate = gr.Interface(
# fn=speech_to_speech_translation_fix,
# inputs=gr.Audio(source="microphone", type="filepath"),
# outputs=gr.Audio(label="Generated Speech", type="numpy"),
# title=title,
# description=description,
# )
# file_translate = gr.Interface(
# fn=speech_to_speech_translation_fix,
# inputs=gr.Audio(source="upload", type="filepath"),
# outputs=gr.Audio(label="Generated Speech", type="numpy"),
# examples=examples,
# title=title,
# description=description,
# )
with demo:
    # gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
    gr.TabbedInterface([multi_translate], ["Record or upload your speech"])

demo.launch(share=True)