# Source: Hugging Face Space "kingabzpro/Transcribed-Urdu" (status: Running).
# File size: 3,927 Bytes
# Revision hashes captured with the page (apparently its per-line history
# gutter): 7ab2876, 8fa375c, 9f507de, 68d0f03, 087adaa, ca9beed, 182bd23,
# c8949a6, 75f4da9, 9ef6b5c, 977590d, f3509ea, 47d93d8

import os
import re
import warnings

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

# Suppress every FutureWarning for the rest of the process.
warnings.simplefilter("ignore", FutureWarning)

# —— CPU performance tweaks ——
# Limit OpenMP, MKL and PyTorch intra-op parallelism to 4 threads each.
# NOTE(review): the environment variables are set after `import torch`;
# whether the native libraries still honour them at this point is not visible
# here — torch.set_num_threads(4) is presumably the setting that matters.
# Confirm against the deployment.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)

# `logging` is the transformers logging module imported above (not the stdlib
# one); restrict its output to errors.
logging.set_verbosity_error()

# —— Model & device setup ——
# Hub identifier used for both the model weights and the processor.
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

# Load the checkpoint in fp32 (from safetensors); it is quantized to int8
# further down, after being switched to eval mode.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    use_safetensors=True,
)
model.eval()
# Dynamic quantization: only torch.nn.Linear modules are converted to qint8.
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
# NOTE(review): torch.compile is applied on top of the dynamically quantized
# model — confirm this combination is supported by the deployed torch version.
model = torch.compile(model)
# The processor supplies the tokenizer and feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Build a CPU-based pipeline with chunking: audio is handled in 20 s chunks
# with a (5, 5) second stride (left, right — per the transformers ASR pipeline
# documentation).
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=20,
    stride_length_s=(5, 5),
)


# Common Urdu conjunctions that receive a trailing Urdu comma (،).
_URDU_CONJUNCTIONS = ("اور", "لیکن", "مگر", "یا", "پھر", "جبکہ", "کیونکہ", "تاہم")

# A whole-word conjunction, unless it is already followed (optionally after
# spaces on the same line) by punctuation or by the end of the line. Without
# that guard a trailing conjunction would become "اور،۔" and an existing
# "اور ،" would become "اور، ،".
_CONJUNCTION_RE = re.compile(
    r"\b(" + "|".join(map(re.escape, _URDU_CONJUNCTIONS))
    + r")\b(?![^\S\n]*(?:[،۔؟؛!?,.]|$))",
    re.MULTILINE,
)

# Marks that already terminate a sentence; no Urdu full stop is appended
# after any of these.
_SENTENCE_ENDINGS = ("۔", "؟", "!", "?", ".")


def add_urdu_punctuation(text):
    """Apply simple heuristic Urdu punctuation to transcribed text.

    Two passes are made:

    1. An Urdu comma (،) is inserted after each common conjunction that is
       followed by more text on the same line. Conjunctions that end a line
       or already precede punctuation are left alone.
    2. The text is split on newlines; each non-empty line is stripped and,
       unless it already ends with a sentence-final mark, gets an Urdu full
       stop (۔) appended. Blank lines are dropped.

    This is a heuristic and will not be perfect for every sentence.

    Args:
        text: Raw transcription, possibly spanning several lines.

    Returns:
        The punctuated lines joined with a single newline each; an empty
        string if the input contains no non-blank line.
    """
    text = _CONJUNCTION_RE.sub(r"\1،", text)
    stripped_lines = (line.strip() for line in text.split("\n"))
    return "\n".join(
        line if line.endswith(_SENTENCE_ENDINGS) else line + "۔"
        for line in stripped_lines
        if line
    )


def transcribe(audio):
    """Transcribe one audio clip to punctuated Urdu text.

    Args:
        audio: ``None`` or a ``(sampling_rate, samples)`` pair where
            ``samples`` is a NumPy array; arrays with more than one
            dimension are averaged over axis 1 to obtain mono audio.

    Returns:
        The transcription with heuristic Urdu punctuation applied, or a
        human-readable message when the clip is missing, empty or silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio
    # A zero-length recording has no peak: np.max would raise ValueError on
    # the empty array, so treat it the same as a missing clip.
    if y.size == 0:
        return "No audio provided. Please record or upload an audio file."

    # Down-mix to mono, then peak-normalize as float32.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if not peak > 0:
        # All-zero (or non-comparable, e.g. NaN) peak: nothing to transcribe.
        return "Audio appears to be silent. Please try again."
    y /= peak

    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})
    text = result.get("text", "")
    # Add Urdu punctuation
    return add_urdu_punctuation(text)


# —— Gradio UI ——
# HTML snippet passed to the interface as its `description`.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
</p>
"""

# Example rows: each row holds the single audio-file input for one example.
examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]

# HTML snippet passed to the interface as its `article` (project link).
article = """
<p style='text-align: center; color: #34C759;'>
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
🌿 Explore the project on GitHub 📚
</a>
</p>
"""

# Input widget: microphone recording or file upload, delivered as NumPy data.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Record or Upload Audio (Urdu)",
)

# Output widget: plain text box for the transcription.
_text_output = gr.Textbox(
    label="Transcribed Text (Urdu)",
    placeholder="Transcribed Urdu text will appear here...",
)

# Wire the transcription function to the widgets; flagging is disabled.
demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs=_text_output,
    title="Urdu Speech Recognition",
    description=description,
    examples=examples,
    article=article,
    allow_flagging="never",
    theme="JohnSmith9982/small_and_pretty",
)

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()