import spaces
import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline
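
# Guarani fine-tune of Whisper Large V3, published as a PEFT (LoRA) adapter.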
peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"
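
# Load the full-precision base model onto the GPU, attach the adapter, and
# merge the LoRA weights so the model runs as a plain Whisper checkpoint.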
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="cuda:0"
)
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.merge_and_unload()
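
# Tokenizer and processor come from the base checkpoint; the processor bundles
# the feature extractor that turns raw audio into log-mel spectrograms.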
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
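# Whisper's vocabulary has no Guarani language token, so decoding is forced to
# English prompt ids here, presumably matching how the adapter was fine-tuned.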
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

pipeline = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
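
# On Hugging Face Spaces, @spaces.GPU allocates a GPU for the duration of the call.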
@spaces.GPU
def transcribe(audio):
    if audio is None:
        return "Wait for the recording to finish uploading to the server! Try again in a few seconds."

    sr, y = audio
    y = y.astype(np.float32)
    if y.ndim > 1:  # stereo uploads arrive as (samples, channels); downmix to mono
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid dividing by zero on a silent clip
        y /= peak

    with torch.autocast("cuda"):
        return pipeline(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
            max_new_tokens=255,
        )["text"]
examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3",
]

title = "# 🇵🇾 Speech Recognition in Guarani"

description = """This is a demo of speech recognition in Guarani using the [Whisper](https://arxiv.org/pdf/2212.04356.pdf) speech-to-text model.

Authors:
- Mateo Andrés Fidabel Gill
- Santiago Ruben Acevedo Zarza
"""

audio_input = gr.Audio(value="./examples/audio_1.mp3",
                       sources=["upload", "microphone"],
                       label="🎤 Audio to transcribe",
                       interactive=True)

transcription = gr.Textbox(label="📝 Transcription",
                           interactive=False)
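
# Lay out the demo: header, audio input, transcription output, submit button, examples.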
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(title)
        gr.Markdown(description)

    with gr.Row():
        audio_input.render()

    with gr.Row():
        transcription.render()

    with gr.Row():
        submit = gr.Button("📝 Transcribe the Audio")

    with gr.Row():
        gr.Examples(examples=examples,
                    inputs=[audio_input],
                    outputs=[transcription],
                    fn=transcribe,
                    label="Examples")

    submit.click(transcribe,
                 inputs=[audio_input],
                 outputs=[transcription])

demo.queue()
demo.launch(share=True)