import gradio as gr
import numpy as np
from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
import torch
import tempfile
import soundfile as sf
from nemo.collections.asr.models import EncDecMultiTaskModel
# Load the ASR model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
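# Note: canary-1b is NVIDIA's multilingual ASR/translation model; the first run
# downloads the checkpoint from the Hugging Face Hub, which can take a while.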
# Update decoding parameters
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
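# A beam size of 1 means greedy decoding: fastest, at a small accuracy cost
# compared with wider beams.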
# Initialize LLM pipeline
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
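# trust_remote_code lets transformers run the custom modeling code shipped with
# the Phi-3 repository; on a machine without a GPU this pipeline runs on CPU.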
# Initialize TTS tokenizer and model
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
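# MMS-TTS is a VITS model; its output sampling rate is stored in
# model.config.sampling_rate (16 kHz for the English checkpoint).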
def transcribe_generate_and_speak(audio):
    sr, y = audio
    # Gradio delivers raw PCM; convert to mono float32 in [-1, 1]
    y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Transcribe audio. NeMo's transcribe() accepts audio file paths, so write
    # the recording to a temporary WAV first; NeMo resamples it to the model's
    # expected 16 kHz rate when loading.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        sf.write(tmp.name, y, sr)
    asr_output = canary_model.transcribe([tmp.name])
    transcript = asr_output[0]  # transcribe() returns a list of transcripts

    # Generate a reply from the transcript. return_full_text=False keeps the
    # prompt out of the reply; max_new_tokens caps generation time.
    generated_text = generator(transcript, max_new_tokens=100,
                               return_full_text=False)[0]['generated_text']

    # Synthesize speech from the generated text
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)  # fixed seed so the stochastic VITS synthesis is reproducible
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]
    waveform_path = "output.wav"
    sf.write(waveform_path, waveform.numpy(), model.config.sampling_rate)
    return waveform_path
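
# Local smoke test without the UI (hypothetical file name "sample.wav"):
#   data, sr = sf.read("sample.wav", dtype="float32")
#   print(transcribe_generate_and_speak((sr, data)))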
# Define the Gradio interface
demo = gr.Interface(
    fn=transcribe_generate_and_speak,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Launch the interface
demo.launch()
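
# Note: with three models per request, CPU-only hosting is slow; one common
# mitigation (an assumption, not in the original) is demo.queue().launch(),
# which processes long-running requests one at a time.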