Spaces:
Running
Running
Abid Ali Awan
Adjust chunk and stride lengths in the transcription pipeline in app.py to optimize processing efficiency and improve transcription accuracy.
9ef6b5c
import os | |
import re | |
import warnings | |
import gradio as gr | |
import numpy as np | |
import torch | |
from transformers import ( | |
AutoModelForSpeechSeq2Seq, | |
AutoProcessor, | |
logging, | |
pipeline, | |
) | |
# Hide FutureWarning noise emitted while the model stack loads.
warnings.simplefilter("ignore", FutureWarning)

# —— CPU performance tweaks ——
# Ask OpenMP/MKL for 4 threads and cap torch's own thread pool at 4.
# NOTE(review): these env vars are written after torch/numpy were imported;
# confirm the native libraries still honour them at this point.
os.environ.update({"OMP_NUM_THREADS": "4", "MKL_NUM_THREADS": "4"})
torch.set_num_threads(4)

# Only let transformers report errors.
logging.set_verbosity_error()

# —— Model & device setup ——
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

# Load the checkpoint in fp32 and switch it to eval mode before quantizing.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    torch_dtype=torch.float32,
).eval()

# Dynamically quantize the Linear layers to int8, then compile the result.
model = torch.compile(
    torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear},
        dtype=torch.qint8,
    )
)

# The processor supplies both the tokenizer and the feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# CPU pipeline that transcribes in 20 s chunks with 5 s of stride on each side.
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=20,
    stride_length_s=(5, 5),
)
# Common Urdu conjunctions that receive a trailing Urdu comma (،).
_URDU_CONJUNCTIONS = ("اور", "لیکن", "مگر", "یا", "پھر", "جبکہ", "کیونکہ", "تاہم")

# Matches any whole-word conjunction that is not already followed by "،".
# Compiled once so the text is scanned in a single pass per call.
_CONJUNCTION_RE = re.compile(r"\b(" + "|".join(_URDU_CONJUNCTIONS) + r")\b(?!،)")

# Marks that already terminate a sentence; no Urdu full stop is appended
# after any of them (covers Urdu and ASCII punctuation).
_SENTENCE_ENDINGS = ("۔", "؟", "!", "?", ".")


def add_urdu_punctuation(text):
    """
    Add simple heuristic Urdu punctuation to transcribed text.

    An Urdu comma (،) is inserted after each known conjunction that is not
    already followed by one. The text is then split on newlines; every
    non-empty line is stripped and, unless it already ends with a
    sentence-final mark (۔ ؟ ! ? .), an Urdu full stop (۔) is appended.

    Args:
        text: The raw transcription. Empty or ``None`` input yields ``""``.

    Returns:
        The punctuated lines joined with a single newline; blank lines are
        dropped. This is a heuristic and will not be perfect for all cases.
    """
    if not text:
        return ""

    # Comma after conjunctions (the lookahead avoids doubling an existing one).
    text = _CONJUNCTION_RE.sub(r"\1،", text)

    # Treat each non-blank line as one sentence.
    sentences = (line.strip() for line in text.split("\n"))
    return "\n".join(
        s if s.endswith(_SENTENCE_ENDINGS) else s + "۔"
        for s in sentences
        if s
    )
def transcribe(audio):
    """
    Transcribe a recorded or uploaded clip and return punctuated Urdu text.

    Args:
        audio: ``None`` when nothing was provided, otherwise a
            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array. Arrays with more than one dimension are averaged over
            axis 1 (assumes a ``(samples, channels)`` layout — TODO confirm).

    Returns:
        The transcription with Urdu punctuation added, or a user-facing
        message when the audio is missing, empty, or silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio

    # Downmix to mono and work on a float32 copy so the in-place
    # normalization below never touches the caller's array.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # np.max raises on a zero-length array, so treat an empty clip as silent.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if not peak > 0:
        return "Audio appears to be silent. Please try again."

    # Peak-normalize to the [-1, 1] range.
    y /= peak

    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})
    text = result.get("text", "")

    # Add Urdu punctuation
    return add_urdu_punctuation(text)
# —— Gradio UI ——
# HTML snippet passed to the interface as its description.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
</p>
"""

# HTML snippet passed to the interface as its article (link to the project repo).
article = """
<p style='text-align: center; color: #34C759;'>
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
🌿 Explore the project on GitHub 📚
</a>
</p>
"""

# One single-input example row per bundled sample clip.
examples = [[f"samples/audio{idx}.mp3"] for idx in (1, 2, 3)]

# Input component: microphone recording or file upload, delivered as NumPy data.
_audio_input = gr.Audio(
    label="Record or Upload Audio (Urdu)",
    type="numpy",
    sources=["microphone", "upload"],
)

# Output component: plain text box holding the transcription.
_text_output = gr.Textbox(
    placeholder="Transcribed Urdu text will appear here...",
    label="Transcribed Text (Urdu)",
)

demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs=_text_output,
    title="Urdu Speech Recognition",
    description=description,
    article=article,
    examples=examples,
    theme="JohnSmith9982/small_and_pretty",
    allow_flagging="never",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()