"""Gradio demo: speech recognition in Guaraní with a LoRA fine-tune of Whisper Large V3."""
import spaces
import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline

peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"
# Load the base Whisper model and merge the LoRA adapter weights into it
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="cuda:0"
)
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.merge_and_unload()  # merge_and_unload() returns the merged model; the result must be kept
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
# English prompt tokens are used here, presumably because Whisper has no Guaraní language token
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

pipeline = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
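
# A minimal sketch of calling the pipeline directly, kept commented out so it
# does not run at import time. The {"sampling_rate", "raw"} dict is the standard
# transformers ASR pipeline input format; the silent waveform is only a placeholder:
#
#   silence = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence
#   out = pipeline({"sampling_rate": 16000, "raw": silence},
#                  generate_kwargs={"forced_decoder_ids": forced_decoder_ids})
#   print(out["text"])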

@spaces.GPU
def transcribe(audio):
    # gr.Audio may hand us None while an upload is still in flight
    if audio is None:
        return "Wait for the recording to finish uploading to the server! Try again in a few seconds."

    sr, y = audio
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid dividing by zero on silent input
        y /= peak
    with torch.autocast("cuda"):
        return pipeline({"sampling_rate": sr, "raw": y}, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]
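
# Usage sketch outside Gradio (hypothetical; assumes librosa with ffmpeg support
# is installed so the mp3 example files can be decoded):
#
#   import librosa
#   y, sr = librosa.load("./examples/audio_1.mp3", sr=16000)
#   print(transcribe((sr, y)))  # gr.Audio delivers (sample_rate, waveform) tuples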

examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3"
]

title = "# 🇵🇾 Speech Recognition in Guaraní"

description = """This is a demo of speech recognition in Guaraní using the [Whisper](https://arxiv.org/pdf/2212.04356.pdf) speech-to-text model.

                Authors:
                - Mateo Andrés Fidabel Gill
                - Santiago Ruben Acevedo Zarza
              """

audio_input = gr.Audio(value="./examples/audio_1.mp3",
                       sources=["upload", "microphone"],
                       label="🎤 Audio to transcribe",
                       interactive=True)

transcription = gr.Textbox(label="📝 Transcription",
                           interactive=False)

with gr.Blocks() as demo:
    
    with gr.Row():
        # Model Title and Description
        gr.Markdown(title)
        gr.Markdown(description)
        
    with gr.Row():
        # Audio Input
        audio_input.render()
        
    with gr.Row():
        # Text Output
        transcription.render()
        
    with gr.Row():
        # Submit and Clear Buttons
        submit = gr.Button("📝 Transcribe the audio")

    with gr.Row():
        gr.Examples(examples=examples,
                    inputs=[audio_input],
                    outputs=[transcription],
                    fn=transcribe,
                    label="Examples")

    submit.click(transcribe,
                 inputs=[audio_input],
                 outputs=[transcription])


demo.queue()
demo.launch(share=True)
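
# To run locally (a sketch): `python app.py`, then open the printed local URL.
# share=True additionally creates a temporary public *.gradio.live link.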