Abid Ali Awan
Adjust chunk and stride lengths in the transcription pipeline in app.py to optimize processing efficiency and improve transcription accuracy.
9ef6b5c
import os
import re
import warnings
import gradio as gr
import numpy as np
import torch
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoProcessor,
logging,
pipeline,
)
# Suppress every FutureWarning raised from this point on.
warnings.simplefilter("ignore", FutureWarning)
# —— CPU performance tweaks ——
# Request 4 threads from OpenMP / MKL and set PyTorch's CPU thread count to 4.
# NOTE(review): these environment variables are assigned after torch and numpy
# have already been imported at the top of the file; native libraries commonly
# read them only once at load time, so the two assignments may have no effect
# here — confirm, or move them above the imports.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)
# `logging` is transformers.logging (see imports): only errors are reported.
logging.set_verbosity_error()
# —— Model & device setup ——
# Checkpoint identifier handed to both from_pretrained() calls below.
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"
# Load the weights in fp32, switch to eval mode, then dynamically quantize
# every torch.nn.Linear layer to int8.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id,
torch_dtype=torch.float32,
use_safetensors=True,
)
model.eval()
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
# Wrap the quantized model with torch.compile before handing it to the pipeline.
# NOTE(review): confirm that torch.compile is supported for (and actually
# speeds up) a dynamically quantized model driven through pipeline().
model = torch.compile(model)
# The processor supplies the tokenizer and feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)
# Build a CPU-based pipeline with chunking: audio is processed in 20 s chunks
# with a 5 s stride on each side of every chunk.
transcriber = pipeline(
task="automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
device=-1, # CPU
chunk_length_s=20,
stride_length_s=(5, 5),
)
def add_urdu_punctuation(text):
    """Apply simple heuristic Urdu punctuation to transcribed text.

    Two passes are made over *text*:

    1. An Urdu comma (،) is appended to each stand-alone conjunction, unless
       the conjunction is already followed by punctuation or is the last word
       on its line, so a comma is never placed directly before a sentence
       terminator.
    2. The text is split on newlines; every non-blank line is stripped and
       given an Urdu full stop (۔) unless it already ends with a sentence
       terminator (Urdu full stop, Urdu or ASCII question mark, "!" or ".").

    This is a heuristic and will not be correct for every sentence.

    Args:
        text: Raw transcription; may be empty or span several lines.

    Returns:
        The punctuated lines joined with a newline. Blank lines are dropped,
        so an empty or whitespace-only input yields an empty string.
    """
    # Common Urdu conjunctions that receive a trailing comma.
    conjunctions = ("اور", "لیکن", "مگر", "یا", "پھر", "جبکہ", "کیونکہ", "تاہم")
    # Marks that already close a sentence; no full stop is added after them.
    terminators = ("۔", "؟", "?", "!", ".")
    # Separators that make an extra comma redundant.
    separators = ("،", ",", "؛")

    blocked = "".join(re.escape(mark) for mark in terminators + separators)
    words = "|".join(re.escape(word) for word in conjunctions)
    # One pass over the text instead of one re.sub per conjunction. The
    # lookahead skips horizontal whitespace and refuses to insert a comma when
    # punctuation or the end of the line follows the conjunction.
    conjunction_re = re.compile(
        rf"\b({words})\b(?![^\S\n]*(?:[{blocked}]|$))",
        re.MULTILINE,
    )
    text = conjunction_re.sub(r"\1،", text)

    # Treat each non-blank line as one sentence.
    processed = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        if not line.endswith(terminators):
            line += "۔"
        processed.append(line)
    return "\n".join(processed)
def transcribe(audio):
    """Transcribe a Gradio audio input into punctuated Urdu text.

    Args:
        audio: ``None`` when nothing was recorded or uploaded, otherwise a
            ``(sampling_rate, samples)`` pair. ``samples`` is array-like; when
            it has more than one dimension it is averaged over axis 1 to
            obtain a mono signal.

    Returns:
        The transcription with heuristic Urdu punctuation applied, or a
        user-facing message when no audio was given or the audio is empty or
        silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio
    y = np.asarray(y)

    # A zero-length buffer has no peak: np.max would raise ValueError on it,
    # so report it the same way as silent audio.
    if y.size == 0:
        return "Audio appears to be silent. Please try again."

    # Down-mix to mono and convert to float32 before peak-normalising.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    peak = np.max(np.abs(y))
    if not peak > 0:
        return "Audio appears to be silent. Please try again."
    y /= peak

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})

    text = result.get("text", "")
    return add_urdu_punctuation(text)
# —— Gradio UI ——
# HTML blurb passed to the interface as its description.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
</p>
"""

# HTML footer passed to the interface as its article (link to the project).
article = """
<p style='text-align: center; color: #34C759;'>
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
🌿 Explore the project on GitHub 📚
</a>
</p>
"""

# Sample clips offered as examples; each row holds one audio path.
examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]

# Input component: microphone recording or file upload, delivered to
# transcribe() in Gradio's "numpy" format.
audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Record or Upload Audio (Urdu)",
)

# Output component: text box that shows the transcription.
text_output = gr.Textbox(
    label="Transcribed Text (Urdu)",
    placeholder="Transcribed Urdu text will appear here...",
)

# Wire the components and the transcribe() callback into a single interface.
demo = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=text_output,
    title="Urdu Speech Recognition",
    description=description,
    examples=examples,
    article=article,
    allow_flagging="never",
    theme="JohnSmith9982/small_and_pretty",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()