"""Gradio demo: speech recognition in Guaraní with a LoRA fine-tune of Whisper Large V3."""
import spaces
import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline

peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"
# Load the base Whisper model and merge the LoRA adapter weights into it
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="cuda:0"
)
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.merge_and_unload()  # merge_and_unload() returns the merged model; the result must be kept
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
# English prompt tokens are used here, presumably because Whisper has no Guaraní language token
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

pipeline = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
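
# A minimal sketch of calling the pipeline directly, kept commented out so it
# does not run at import time. The {"sampling_rate", "raw"} dict is the standard
# transformers ASR pipeline input format; the silent waveform is only a placeholder:
#
#   silence = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence
#   out = pipeline({"sampling_rate": 16000, "raw": silence},
#                  generate_kwargs={"forced_decoder_ids": forced_decoder_ids})
#   print(out["text"])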

@spaces.GPU
def transcribe(audio):
    # gr.Audio may hand us None while an upload is still in flight
    if audio is None:
        return "Wait for the recording to finish uploading to the server! Try again in a few seconds."

    sr, y = audio
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid dividing by zero on silent input
        y /= peak
    with torch.autocast("cuda"):
        return pipeline({"sampling_rate": sr, "raw": y}, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]
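
# Usage sketch outside Gradio (hypothetical; assumes librosa with ffmpeg support
# is installed so the mp3 example files can be decoded):
#
#   import librosa
#   y, sr = librosa.load("./examples/audio_1.mp3", sr=16000)
#   print(transcribe((sr, y)))  # gr.Audio delivers (sample_rate, waveform) tuples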

examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3"
]

title = "# 🇵🇾 Speech Recognition in Guaraní"

description = """This is a demo of speech recognition in Guaraní using the [Whisper](https://arxiv.org/pdf/2212.04356.pdf) speech-to-text model.

                Authors:
                - Mateo Andrés Fidabel Gill
                - Santiago Ruben Acevedo Zarza
              """

audio_input = gr.Audio(value="./examples/audio_1.mp3",
                       sources=["upload", "microphone"],
                       label="🎤 Audio to transcribe",
                       interactive=True)

transcription = gr.Textbox(label="📝 Transcription",
                           interactive=False)

with gr.Blocks() as demo:
    
    with gr.Row():
        # Model Title and Description
        gr.Markdown(title)
        gr.Markdown(description)
        
    with gr.Row():
        # Audio Input
        audio_input.render()
        
    with gr.Row():
        # Text Output
        transcription.render()
        
    with gr.Row():
        # Submit and Clear Buttons
        submit = gr.Button("📝 Transcribe the audio")

    with gr.Row():
        gr.Examples(examples=examples,
                    inputs=[audio_input],
                    outputs=[transcription],
                    fn=transcribe,
                    label="Examples")

    submit.click(transcribe,
                 inputs=[audio_input],
                 outputs=[transcription])


demo.queue()
demo.launch(share=True)
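
# To run locally (a sketch): `python app.py`, then open the printed local URL.
# share=True additionally creates a temporary public *.gradio.live link.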