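# Gradio demo: speech recognition for Guarani, built on Whisper Large V3 with a PEFT adapter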
import spaces
import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline
peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"
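# Load the base Whisper checkpoint referenced by the adapter config, then
# apply the fine-tuned adapter on top of it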
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="cuda:0"
)
model = PeftModel.from_pretrained(model, peft_model_id)
# merge_and_unload() returns the merged base model, so assign the result back
model = model.merge_and_unload()
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
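# Note: Whisper's predefined language tokens do not include Guarani, which is
# presumably why the decoder prompt falls back to English tokens here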
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)
pipeline = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
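# On ZeroGPU Spaces, @spaces.GPU allocates a GPU slot for the duration of each call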
@spaces.GPU
def transcribe(audio):
    if audio is None:
        return "Espera a que la grabación termine de subirse al servidor. ¡Inténtelo de nuevo en unos segundos!"
    sr, y = audio
    y = y.astype(np.float32)
    # Gradio may deliver stereo audio; the pipeline expects a mono waveform
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Normalize to [-1, 1], guarding against division by zero on silent recordings
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    with torch.autocast("cuda"):
        return pipeline({"sampling_rate": sr, "raw": y},
                        generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
                        max_new_tokens=255)["text"]
examples = [
"./examples/audio_1.mp3",
"./examples/audio_2.mp3",
"./examples/audio_3.mp3",
"./examples/audio_4.mp3"
]
title = "# 🇵🇾 Reconocimiento de Voz en Guaraní"
description = """Esta es una demostración del reconocimiento de voz en Guaraní utilizando el modelo speech-to-text [Whisper](https://arxiv.org/pdf/2212.04356.pdf).
Autores:
- Mateo Andrés Fidabel Gill
- Santiago Ruben Acevedo Zarza
"""
audio_input = gr.Audio(value="./examples/audio_1.mp3",
                       sources=["upload", "microphone"],
                       label="🎤 Audio a transcribir",
                       interactive=True)

transcription = gr.Textbox(label="📝 Transcripción",
                           interactive=False)
with gr.Blocks() as demo:
    with gr.Row():
        # Model Title and Description
        gr.Markdown(title)
        gr.Markdown(description)

    with gr.Row():
        # Audio Input
        audio_input.render()

    with gr.Row():
        # Text Output
        transcription.render()

    with gr.Row():
        # Submit Button
        submit = gr.Button("📝 Transcribir el Audio")

    with gr.Row():
        gr.Examples(examples=examples,
                    inputs=[audio_input],
                    outputs=[transcription],
                    fn=transcribe,
                    label="Ejemplos")

    submit.click(transcribe,
                 inputs=[audio_input],
                 outputs=[transcription])
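# Queue requests so concurrent users are served in order, and expose a public share link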
demo.queue()
demo.launch(share=True)