File size: 2,336 Bytes
9ba2a1c
a67942c
9ba2a1c
 
 
bce555f
9ba2a1c
 
a5bf333
9ba2a1c
c3e624b
9ba2a1c
bce555f
3b5175f
9ba2a1c
 
bce555f
9ba2a1c
bce555f
9ba2a1c
 
bce555f
9ba2a1c
 
 
31a57d8
9ba2a1c
31a57d8
 
 
 
9ba2a1c
a67942c
9334a23
 
 
891b8fc
a67942c
891b8fc
 
 
9334a23
891b8fc
 
9e4dfaa
 
 
891b8fc
 
9334a23
9e4dfaa
9334a23
 
9e4dfaa
 
 
 
 
 
 
 
9334a23
 
891b8fc
 
9e4dfaa
31a57d8
9334a23
 
 
9ba2a1c
bce555f
9ba2a1c
 
 
 
 
a5bf333
 
9ba2a1c
 
42f6a29
bce555f
 
9ba2a1c
 
 
9e4dfaa
9ba2a1c
a5bf333
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import torch
import gradio as gr
import yt_dlp as youtube_dl  # NOTE(review): unused in visible code — presumably leftover from a removed YouTube-download feature; verify before deleting
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from urllib.parse import urlparse, parse_qs  # NOTE(review): unused in visible code — likely leftover from YouTube URL parsing

import tempfile  # NOTE(review): unused in visible code
import time  # NOTE(review): unused in visible code
import os
import numpy as np

# Constants
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"  # HF Hub model id loaded below
BATCH_SIZE = 8  # batch size passed to the ASR pipeline call
FILE_LIMIT_MB = 1000  # intended cap on uploaded-file size, in megabytes
YT_LENGTH_LIMIT_S = 3600  # 1 hour limit  (NOTE(review): unused — presumably for the removed YouTube path)

# Device selection: CUDA device index 0 when available, otherwise CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Load Whisper pipeline at import time (downloads/loads model weights — heavyweight side effect).
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=9,  # NOTE(review): Whisper commonly uses 30 s chunks; 9 s looks deliberate but unusual — confirm
    device=device,
    model_kwargs={
#        "torch_dtype": torch.float16,
        "attn_implementation": "eager"
    },
)

def transcribe(audio_file, task):
    """Transcribe or translate an uploaded audio file with the Whisper pipeline.

    Args:
        audio_file: Filesystem path to the uploaded/recorded audio
            (Gradio ``type="filepath"``), or None when nothing was submitted.
        task: Either "transcribe" or "translate" (passed to Whisper generation).

    Returns:
        The recognized text for the whole file.

    Raises:
        gr.Error: If no file was submitted or the file exceeds FILE_LIMIT_MB.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")

    # Enforce the module-level upload cap — FILE_LIMIT_MB was defined but never checked.
    file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)
    if file_size_mb > FILE_LIMIT_MB:
        raise gr.Error(
            f"File size exceeds limit: {file_size_mb:.1f} MB > {FILE_LIMIT_MB} MB."
        )

    # Read raw bytes; ffmpeg_read decodes any container/codec ffmpeg supports
    # and resamples to the model's expected sampling rate.
    with open(audio_file, "rb") as f:
        audio_data = f.read()

    # Hoist the repeated attribute lookup.
    sampling_rate = pipe.feature_extractor.sampling_rate
    audio_array = ffmpeg_read(audio_data, sampling_rate)

    duration = len(audio_array) / sampling_rate
    print(f"Audio duration: {duration:.2f} seconds")

    # ffmpeg_read already returns a numpy float array — no extra np.array() copy needed.
    inputs = {
        "array": audio_array,
        "sampling_rate": sampling_rate,
    }

    generate_kwargs = {
        "task": task,
        "no_speech_threshold": 0.3,          # skip segments classified as silence
        "logprob_threshold": -1.0,           # fallback trigger on low mean log-prob
        "compression_ratio_threshold": 2.4,  # guard against repetitive hallucination
    }

    # Perform transcription.
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )

    return result["text"]

# Gradio UI
# Top-level app wiring: a Blocks container holding a single tabbed Interface.
demo = gr.Blocks()

# File-upload tab: audio file + task selector -> plain-text transcription.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Audio file"),  # filepath mode matches transcribe()'s open() call
        gr.Radio(["transcribe", "translate"], label="Task"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
    flagging_mode="never",  # NOTE(review): requires Gradio >= 5 (older versions use allow_flagging) — confirm pinned version
)

with demo:
    # Single tab today; list form allows adding e.g. a YouTube tab later.
    gr.TabbedInterface([file_transcribe], ["Audio file"])

# Blocking call: starts the web server (module-level side effect).
demo.launch()