import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from urllib.parse import urlparse, parse_qs
import tempfile
import time
import os
import numpy as np
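# NOTE: yt_dlp, urlparse/parse_qs, tempfile, time, and os are imported but not
# used in this version of the script.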
# Constants
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600 # 1 hour limit
# Device selection
device = 0 if torch.cuda.is_available() else "cpu"
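# (the transformers pipeline accepts a CUDA device index such as 0, or the string "cpu")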
# Load Whisper pipeline
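# chunk_length_s enables chunked long-form inference; "eager" selects the
# standard attention implementation (no FlashAttention/SDPA requirement).
# Uncommenting torch_dtype=torch.float16 would roughly halve GPU memory use
# (float16 generally isn't supported for CPU inference).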
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=9,
    device=device,
    model_kwargs={
        # "torch_dtype": torch.float16,
        "attn_implementation": "eager",
    },
)
# Transcription function
def transcribe(audio_file, task):
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")

    # Read the file as raw bytes so ffmpeg_read receives the format it expects
    with open(audio_file, "rb") as f:
        audio_data = f.read()

    # Decode to a float array at the model's sampling rate
    audio_array = ffmpeg_read(audio_data, pipe.feature_extractor.sampling_rate)
    duration = len(audio_array) / pipe.feature_extractor.sampling_rate
    print(f"Audio duration: {duration:.2f} seconds")

    # Package the audio in the dict format the ASR pipeline accepts
    inputs = {
        "array": np.array(audio_array),
        "sampling_rate": pipe.feature_extractor.sampling_rate,
    }

    generate_kwargs = {
        "task": task,
        "no_speech_threshold": 0.3,  # treat likely-silent segments as no speech
        "logprob_threshold": -1.0,  # flag low-confidence decoding
        "compression_ratio_threshold": 2.4,  # filter repetitive/degenerate output
    }

    # Perform transcription
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )

    return result["text"]
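# Quick sanity check (hypothetical path; assumes a local "sample.wav" exists):
#   print(transcribe("sample.wav", "transcribe"))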
# Gradio UI
demo = gr.Blocks()
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description="Whisper Large V3 fine-tuned for the Uzbek language by Dataprizma",
    flagging_mode="never",
)
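# Mount the interface as a single tab inside the Blocks container; further tabs
# (e.g. microphone input) could be appended to these lists later.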
with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])
demo.launch()
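# launch() blocks and serves the app; on Hugging Face Spaces the defaults are
# sufficient, while locally demo.launch(share=True) would also create a
# temporary public link.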