import torch
import numpy as np
import gradio as gr
from transformers import pipeline, BarkModel, AutoProcessor

device = "cpu"
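# Speech-recognition/translation stage of the cascade: Whisper large-v2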
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
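# Text-to-speech stage of the cascade: Bark and its processor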
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
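# Bark's native output sample rate, read from its generation config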
synthesised_rate = model.generation_config.sample_rate
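# Whisper officially supports only X -> English translation; forcing task="transcribe"
# with language="chinese" makes it emit Mandarin text for whatever language is spoken.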
def translate(audio):
    outputs = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "chinese"},
    )
    return outputs["text"]
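# Synthesise Mandarin speech from text using a Chinese Bark voice preset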
def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output
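# Full cascade: translate the input to Mandarin text, then synthesise speech with Bark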
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Scale the float waveform to 16-bit PCM for Gradio's numpy audio output
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return synthesised_rate, synthesised_speech
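# Bark returns audio shaped (1, num_samples); transpose to (num_samples, 1) so
# Gradio treats it as a mono waveform.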
def speech_to_speech_translation_fix(audio, voice_preset="v2/zh_speaker_1"):
    synthesised_rate, synthesised_speech = speech_to_speech_translation(audio, voice_preset)
    return synthesised_rate, synthesised_speech.T
title = "Multilanguage to Chinese (Mandarin) Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in multiple languages to target speech in Chinese (Mandarin). The demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and Suno's [Bark](https://huggingface.co/suno/bark) model for text-to-speech.
"""
examples = [
    "./mama (1).mp3",
    "./mama (2).mp3",
    "./mama (3).mp3",
    "./mama (4).mp3",
    "./mama (5).mp3",
    "./mama (6).mp3",
    "./mama (7).mp3",
    "./mama (8).mp3",
]
demo = gr.Blocks()
# Muti_translate=gr.Interface(
# fn=speech_to_speech_translation_fix,
# inputs=[
# gr.Audio(label="Upload Speech", source="upload", type="filepath"),
# gr.Audio(label="Record Speech", source="microphone", type="filepath"),
# ],
# outputs=[
# gr.Audio(label="Generated Speech", type="numpy"),
# gr.Text(label="Transcription"),
# ],
# title=title,
# description=description,
# examples=examples,
# )
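# Tab 1: translate speech recorded from the microphone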
mic_translate = gr.Interface(
    fn=speech_to_speech_translation_fix,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)
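# Tab 2: translate an uploaded audio file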
file_translate = gr.Interface(
    fn=speech_to_speech_translation_fix,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=examples,
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
    # gr.TabbedInterface([Muti_translate], ["Record or upload your speech"])

demo.launch(share=True)