import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import numpy as np
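# Model checkpoint and batching configuration for the ASR pipeline.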
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
BATCH_SIZE = 8
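# Run on the first CUDA GPU if one is available, otherwise fall back to the CPU.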
device = 0 if torch.cuda.is_available() else "cpu"
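# Build the speech-recognition pipeline; audio longer than chunk_length_s
# is split into chunks and transcribed in batches.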
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=9,
    device=device,
    model_kwargs={
        "attn_implementation": "eager"
    },
)
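# Transcribe a single uploaded or recorded audio file and return plain text.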
def transcribe(audio_file):
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")
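    # Decode the file with ffmpeg into a float waveform at the model's sampling rate.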
    with open(audio_file, "rb") as f:
        audio_data = f.read()
    audio_array = ffmpeg_read(audio_data, sampling_rate=pipe.feature_extractor.sampling_rate)
    duration = len(audio_array) / pipe.feature_extractor.sampling_rate
    print(f"Audio duration: {duration:.2f} seconds")
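    # Run chunked inference; the thresholds below follow Whisper's long-form
    # decoding heuristics for suppressing output on silent or low-confidence segments.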
    result = pipe(
        inputs=audio_array,
        batch_size=BATCH_SIZE,
        return_timestamps=False,
        generate_kwargs={
            "task": "transcribe",
            "no_speech_threshold": 0.4,
            "logprob_threshold": -1.0,
            "compression_ratio_threshold": 2.4
        }
    )
    return result["text"] if isinstance(result, dict) else result
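# Gradio UI: a single file/microphone transcription tab inside a Blocks layout.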
demo = gr.Blocks()
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Audio file"),
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description="Whisper Large V3 fine-tuned for the Uzbek language by Dataprizma",
)
with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])
demo.launch()
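# Usage sketch (assuming this file is saved as app.py and ffmpeg is installed):
#   python app.py
# then open the printed local URL (http://127.0.0.1:7860 by default).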