File size: 3,927 Bytes
7ab2876
8fa375c
9f507de
 
68d0f03
 
9f507de
087adaa
 
 
 
9f507de
 
 
ca9beed
087adaa
7ab2876
 
 
 
 
182bd23
68d0f03
087adaa
68d0f03
 
087adaa
68d0f03
087adaa
c8949a6
087adaa
 
c8949a6
 
75f4da9
68d0f03
 
087adaa
68d0f03
087adaa
68d0f03
 
 
087adaa
9ef6b5c
 
68d0f03
 
087adaa
8fa375c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
977590d
8fa375c
 
68d0f03
 
 
087adaa
 
 
 
 
 
 
 
 
 
 
 
 
 
68d0f03
8fa375c
 
 
 
087adaa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68d0f03
 
 
 
 
f3509ea
68d0f03
 
 
f3509ea
68d0f03
47d93d8
f3509ea
 
 
 
 
68d0f03
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import re
import warnings

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

# Silence FutureWarning process-wide so deprecation notices do not clutter the app logs.
warnings.simplefilter("ignore", FutureWarning)

# —— CPU performance tweaks ——
# Cap the OpenMP / MKL / PyTorch thread pools at 4 threads.
# NOTE(review): these environment variables are assigned after `torch` and
# `numpy` have already been imported at the top of the file; whether the
# OpenMP/MKL runtimes still honour them at this point should be confirmed.
# torch.set_num_threads(4) below is called regardless.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)

# `logging` here is the one imported from `transformers` (not the stdlib module);
# only error-level messages from the library are kept.
logging.set_verbosity_error()

# —— Model & device setup ——
# Hub identifier used for both the model weights and the processor.
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

# Load in fp32 and quantize to int8
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    use_safetensors=True,
)
# Switch to eval mode before quantizing; the order of the next three
# statements (eval -> quantize -> compile) is intentional and must be kept.
model.eval()
# Dynamic quantization is applied to torch.nn.Linear modules only (int8 weights).
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
# NOTE(review): the pipeline below receives the compiled wrapper; confirm that
# its generation path actually runs through the compiled forward, otherwise
# this call only adds start-up cost.
model = torch.compile(model)
# The processor supplies the tokenizer and feature extractor handed to the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Build a CPU-based pipeline with chunking
# Audio is processed in 20 s chunks; stride_length_s=(5, 5) is presumably the
# left/right overlap in seconds between neighbouring chunks — see the
# transformers ASR pipeline documentation.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=20,
    stride_length_s=(5, 5),
)


# Common Urdu conjunctions that receive an Urdu comma (،) right after them.
_URDU_CONJUNCTIONS = ("اور", "لیکن", "مگر", "یا", "پھر", "جبکہ", "کیونکہ", "تاہم")

# Single alternation compiled once instead of one re.sub pass per conjunction.
# \b keeps matches to whole words; the negative lookahead avoids doubling a
# comma that is already present.
_CONJUNCTION_RE = re.compile(r"\b(" + "|".join(_URDU_CONJUNCTIONS) + r")\b(?!،)")

# Trailing run of whitespace and/or Urdu commas at the very end of a line.
_DANGLING_COMMA_RE = re.compile(r"[\s،]+\Z")

# A line ending in any of these already has sentence-final punctuation.
_SENTENCE_TERMINATORS = ("۔", "؟", "!", "?", ".")


def add_urdu_punctuation(text):
    """Apply simple heuristic Urdu punctuation to transcribed text.

    An Urdu comma (،) is inserted after each whole-word conjunction that is
    not already followed by one. The text is then split on newlines; every
    non-blank line is trimmed and given an Urdu full stop (۔) unless it
    already ends with sentence-final punctuation (۔ ؟ ! ? .).

    A comma left dangling at the end of a line (for example because the line
    ends with a conjunction) is dropped before the full stop is added, so the
    output never ends in "،۔".

    Args:
        text: Raw transcription text; may span several lines.

    Returns:
        The punctuated text with blank lines removed and the remaining lines
        joined by a single newline. Empty or falsy input yields "".
    """
    if not text:
        return ""

    text = _CONJUNCTION_RE.sub(r"\1،", text)

    processed = []
    # Blank segments are skipped below, so a plain split on "\n" is
    # equivalent to splitting on runs of newlines.
    for raw_line in text.split("\n"):
        line = _DANGLING_COMMA_RE.sub("", raw_line.strip())
        if not line:
            continue
        # str.endswith accepts a tuple: one call covers every terminator.
        if not line.endswith(_SENTENCE_TERMINATORS):
            line += "۔"
        processed.append(line)
    return "\n".join(processed)


def transcribe(audio):
    """Transcribe one Gradio audio input to punctuated Urdu text.

    Args:
        audio: ``None`` when nothing was recorded/uploaded, otherwise a
            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array (the UI requests ``type="numpy"``).

    Returns:
        The transcription with heuristic Urdu punctuation applied, or a
        user-facing message when the audio is missing, empty or silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio
    # Down-mix multi-channel audio to mono.
    # NOTE(review): assumes samples are laid out as (frames, channels) — confirm.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # astype returns a fresh float32 copy, so the in-place scaling below never
    # touches the caller's array.
    y = y.astype(np.float32)

    # np.max raises ValueError on a zero-length array; treat an empty
    # recording the same way as a silent one instead of crashing.
    peak = np.max(np.abs(y)) if y.size else 0.0
    # "not peak > 0" (rather than "peak <= 0") keeps a NaN peak on the
    # silent path, exactly as the original if/else did.
    if not peak > 0:
        return "Audio appears to be silent. Please try again."
    # Peak-normalize to the [-1, 1] range.
    y /= peak

    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})
    text = result.get("text", "")
    # Add Urdu punctuation
    return add_urdu_punctuation(text)


# —— Gradio UI ——
# HTML snippet handed to gr.Interface as `description`.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
</p>
"""

# HTML snippet handed to gr.Interface as `article` (link to the project repo).
article = """
<p style='text-align: center; color: #34C759;'>
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
🌿 Explore the project on GitHub 📚
</a>
</p>
"""

# Example inputs: one single-element row per sample clip.
examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]

# Input component: microphone or file upload, delivered to `transcribe`
# as NumPy data (type="numpy").
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Record or Upload Audio (Urdu)",
)

# Output component: plain text box for the transcription.
_text_output = gr.Textbox(
    label="Transcribed Text (Urdu)",
    placeholder="Transcribed Urdu text will appear here...",
)

# Wire the transcription function to the components above.
demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs=_text_output,
    title="Urdu Speech Recognition",
    description=description,
    examples=examples,
    article=article,
    allow_flagging="never",
    theme="JohnSmith9982/small_and_pretty",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()