"""Gradio demo: Urdu speech-to-text with a quantized Whisper model on CPU."""

import os
import re
import warnings

# —— CPU performance tweaks ——
# Set before numpy/torch are imported so the OpenMP/MKL runtimes can pick the
# values up when they initialise; assigning them after the import may be too
# late to have any effect.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"

import gradio as gr  # noqa: E402
import numpy as np  # noqa: E402
import torch  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

warnings.simplefilter("ignore", FutureWarning)

torch.set_num_threads(4)
logging.set_verbosity_error()

# —— Model & device setup ——
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

# Load in fp32 and quantize to int8
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    use_safetensors=True,
)
model.eval()
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
model = torch.compile(model)

processor = AutoProcessor.from_pretrained(model_id)

# Build a CPU-based pipeline with chunking
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=20,
    stride_length_s=(5, 5),
)

# —— Punctuation heuristics ——
# Common Urdu conjunctions after which a comma is inserted.
_URDU_CONJUNCTIONS = (
    "اور",
    "لیکن",
    "مگر",
    "یا",
    "پھر",
    "جبکہ",
    "کیونکہ",
    "تاہم",
)

# One precompiled alternation instead of one regex per conjunction per call.
# The negative lookahead skips conjunctions that are already followed by
# punctuation (optionally after spaces) or that end the line, so the function
# never emits sequences such as "،،" or "،۔".
_CONJUNCTION_RE = re.compile(
    r"\b("
    + "|".join(re.escape(word) for word in _URDU_CONJUNCTIONS)
    + r")\b(?![ \t]*(?:[،۔؟!]|$))",
    re.MULTILINE,
)

# A line that already ends with one of these is left without an added full stop.
_SENTENCE_ENDINGS = ("۔", "؟", "!")


def add_urdu_punctuation(text: str) -> str:
    """Apply simple heuristic Urdu punctuation to *text*.

    An Urdu comma (،) is inserted after each known conjunction unless the
    conjunction is already followed by punctuation or ends its line. Every
    non-empty line then gets an Urdu full stop (۔) unless it already ends with
    a sentence terminator. This is a heuristic and will not be right for
    every sentence.

    Args:
        text: Raw transcription; lines are separated by newlines.

    Returns:
        The punctuated lines joined with newlines. Blank lines are dropped,
        so an empty or whitespace-only input yields an empty string.
    """
    text = _CONJUNCTION_RE.sub(r"\1،", text)

    processed = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        if not line.endswith(_SENTENCE_ENDINGS):
            line += "۔"
        processed.append(line)
    return "\n".join(processed)


def transcribe(audio):
    """Transcribe one recording and return punctuated Urdu text.

    Args:
        audio: ``None`` or a ``(sampling_rate, samples)`` pair, where
            ``samples`` is a NumPy array. Arrays with more than one dimension
            are averaged over axis 1 (assumed to be the channel axis).

    Returns:
        The punctuated transcription, or a user-facing message when no audio
        was supplied or the audio is silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio
    y = np.asarray(y)

    # np.max raises ValueError on a zero-length array, so treat an empty
    # recording the same way as a missing one.
    if y.size == 0:
        return "No audio provided. Please record or upload an audio file."

    # mono & normalize
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    peak = np.max(np.abs(y))
    if not peak > 0:
        return "Audio appears to be silent. Please try again."
    y /= peak

    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})

    text = result.get("text", "")
    # Add Urdu punctuation
    return add_urdu_punctuation(text)


# —— Gradio UI ——
description = """
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
"""

examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]

article = """ """

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Record or Upload Audio (Urdu)",
    ),
    outputs=gr.Textbox(
        label="Transcribed Text (Urdu)",
        placeholder="Transcribed Urdu text will appear here...",
    ),
    title="Urdu Speech Recognition",
    description=description,
    examples=examples,
    article=article,
    allow_flagging="never",
    theme="JohnSmith9982/small_and_pretty",
)

if __name__ == "__main__":
    demo.launch()