# NOTE: HuggingFace Spaces page header ("Spaces: Running") was captured with the
# source during extraction; it is not part of the program.
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr
import torch
import torchaudio
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# Fine-tuned Whisper checkpoint for Uzbek speech recognition.
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"

# Load the processor (feature extractor + tokenizer) and model once at startup,
# so each request only pays for inference, not model loading.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
def transcribe(audio_file):
    """Transcribe an audio file to Uzbek text with the fine-tuned Whisper model.

    Args:
        audio_file: Path to an audio file in any format torchaudio can load.

    Returns:
        The decoded transcription as a single string.
    """
    # `model` is reassigned below (moved to device), so it needs the global
    # declaration; `processor` is only read and does not.
    global model

    # Move the model to GPU if one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Load the audio and resample to the 16 kHz rate Whisper expects.
    waveform, sample_rate = torchaudio.load(audio_file)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # Downmix multi-channel audio to mono by averaging channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Extract log-mel input features. NOTE: `language` is NOT a feature-extraction
    # argument — passing it here (as the original did) has no effect on decoding.
    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    # Language selection is a generation-time option, so it belongs here.
    with torch.no_grad():
        predicted_ids = model.generate(input_features, language="uz")

    # Decode token ids back to text, dropping Whisper's special tokens.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
# Build the Gradio UI: a Blocks app containing one tab that wraps the
# file-transcription interface.
demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Audio file"),
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
)

with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])

demo.launch()