import torch
import numpy as np
import gradio as gr
from transformers import pipeline, BarkModel, AutoProcessor

device = "cpu"
# Whisper large-v2 handles the speech-recognition/translation half of the cascade.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)

# Bark handles the Chinese text-to-speech half.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate  # 24 kHz for Bark
def translate(audio):
    # Gradio's type="numpy" audio arrives as a (sample_rate, int16 array) tuple;
    # repackage it into the dict format the ASR pipeline expects.
    if isinstance(audio, tuple):
        sampling_rate, raw = audio
        audio = {"sampling_rate": sampling_rate, "raw": raw.astype(np.float32) / 32768.0}
    # task="transcribe" with language="chinese" forces Whisper to emit Mandarin text.
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
    return outputs["text"]

def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output
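# Bark ships ten Mandarin presets (v2/zh_speaker_0 ... v2/zh_speaker_9); a quick
# way to audition a different voice, assuming the models above are loaded
# (uncomment to try):
#
# sample = synthesise("你好，世界", voice_preset="v2/zh_speaker_3")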
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Convert the float waveform to 16-bit PCM for Gradio's audio component.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return synthesised_rate, synthesised_speech, translated_text
def speech_to_speech_translation_fix(uploaded_audio, recorded_audio, voice_preset="v2/zh_speaker_1"):
    # Prefer the microphone recording when present; otherwise use the uploaded file.
    audio = recorded_audio if recorded_audio is not None else uploaded_audio
    synthesised_rate, synthesised_speech, translated_text = speech_to_speech_translation(audio, voice_preset)
    # Bark returns shape (1, num_samples); transpose to (num_samples, 1) for Gradio.
    return synthesised_rate, synthesised_speech.T, translated_text
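# End-to-end smoke test outside Gradio, a minimal sketch assuming scipy is
# installed and one of the example clips below exists (uncomment to try):
#
# import scipy.io.wavfile as wavfile
# rate, speech, text = speech_to_speech_translation_fix("./mama (1).mp3", None)
# print(text)
# wavfile.write("translated.wav", rate, speech)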
title = "Multilanguage to Chinese (Mandarin) Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in multiple languages to target speech in Chinese (Mandarin). The demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation and Suno's [Bark](https://huggingface.co/suno/bark) model for text-to-speech.
"""
examples = [
    ["./mama (1).mp3", None],
    ["./mama (2).mp3", None],
    ["./mama (3).mp3", None],
    ["./mama (4).mp3", None],
    ["./mama (5).mp3", None],
    ["./mama (6).mp3", None],
    ["./mama (7).mp3", None],
    ["./mama (8).mp3", None],
]
demo = gr.Blocks()

multi_translate = gr.Interface(
    fn=speech_to_speech_translation_fix,
    inputs=[
        gr.Audio(label="Upload Speech", source="upload", type="numpy"),
        gr.Audio(label="Record Speech", source="microphone", type="numpy"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
    ],
    title=title,
    description=description,
    examples=examples,
)
# mic_translate = gr.Interface(
# fn=speech_to_speech_translation_fix,
# inputs=gr.Audio(source="microphone", type="filepath"),
# outputs=gr.Audio(label="Generated Speech", type="numpy"),
# title=title,
# description=description,
# )
# file_translate = gr.Interface(
# fn=speech_to_speech_translation_fix,
# inputs=gr.Audio(source="upload", type="filepath"),
# outputs=gr.Audio(label="Generated Speech", type="numpy"),
# examples=examples,
# title=title,
# description=description,
# )
with demo:
    # gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
    gr.TabbedInterface([multi_translate], ["Record or upload your speech"])

demo.launch(share=True)