Spaces:
Running
Running
File size: 3,927 Bytes
7ab2876 8fa375c 9f507de 68d0f03 9f507de 087adaa 9f507de ca9beed 087adaa 7ab2876 182bd23 68d0f03 087adaa 68d0f03 087adaa 68d0f03 087adaa c8949a6 087adaa c8949a6 75f4da9 68d0f03 087adaa 68d0f03 087adaa 68d0f03 087adaa 9ef6b5c 68d0f03 087adaa 8fa375c 977590d 8fa375c 68d0f03 087adaa 68d0f03 8fa375c 087adaa 68d0f03 f3509ea 68d0f03 f3509ea 68d0f03 47d93d8 f3509ea 68d0f03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import os
import re
import warnings
import gradio as gr
import numpy as np
import torch
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoProcessor,
logging,
pipeline,
)
# Hide FutureWarning messages emitted while the libraries load and run.
warnings.simplefilter("ignore", FutureWarning)

# —— CPU performance tweaks ——
# NOTE(review): these variables are exported after `import torch` has already
# run — confirm the OpenMP/MKL runtimes still honour them. The explicit
# torch.set_num_threads call below sets PyTorch's own thread count regardless.
os.environ.update({"OMP_NUM_THREADS": "4", "MKL_NUM_THREADS": "4"})
torch.set_num_threads(4)

# Only show error-level messages from transformers.
logging.set_verbosity_error()

# —— Model & device setup ——
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

# Load the checkpoint in fp32 and put it in evaluation mode.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    torch_dtype=torch.float32,
).eval()

# Dynamically quantize the Linear layers to int8, then compile the result.
model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)
model = torch.compile(model)

# The processor supplies the tokenizer and feature extractor for the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# CPU pipeline; audio is processed in 20 s chunks with 5 s of stride on
# each side of a chunk.
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    chunk_length_s=20,
    stride_length_s=(5, 5),
    device=-1,  # CPU
)
# Common Urdu conjunctions that get an Urdu comma (،) inserted after them.
_URDU_CONJUNCTIONS = ("اور", "لیکن", "مگر", "یا", "پھر", "جبکہ", "کیونکہ", "تاہم")

# Marks that already terminate a sentence; no Urdu full stop is appended
# after any of them.
_SENTENCE_END_MARKS = ("۔", "؟", "!", "?", ".", "…")

# Matches a whole-word conjunction unless it is already followed by
# punctuation or is the last word of the line. Inserting a comma in those
# positions would yield sequences such as "،،" or "،۔".
_CONJUNCTION_RE = re.compile(
    r"\b(" + "|".join(map(re.escape, _URDU_CONJUNCTIONS)) + r")\b"
    r"(?!\s*(?:[،۔؟؛!?.,…]|$))"
)


def add_urdu_punctuation(text):
    """Add heuristic Urdu punctuation to transcribed text.

    Each non-empty line of ``text`` is treated as one sentence. An Urdu
    comma (،) is inserted after common conjunctions, and an Urdu full stop
    (۔) is appended to any line that does not already end with a
    sentence-ending mark. This is a simple heuristic and will not be
    correct for every sentence.

    Args:
        text: The text to punctuate. Lines are separated by newlines;
            blank lines are dropped. ``None`` and ``""`` are accepted.

    Returns:
        The punctuated lines joined with ``"\\n"``, or ``""`` when there is
        nothing to punctuate.
    """
    if not text:
        return ""

    processed = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        # Strip first so that a trailing conjunction is seen as line-final
        # by the pattern's end-of-line check.
        line = _CONJUNCTION_RE.sub(r"\1،", line)
        if not line.endswith(_SENTENCE_END_MARKS):
            line += "۔"
        processed.append(line)
    return "\n".join(processed)
def transcribe(audio):
    """Transcribe recorded or uploaded audio and return punctuated Urdu text.

    Args:
        audio: ``None`` when nothing was provided, otherwise a
            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array. Arrays with more than one dimension are averaged over
            axis 1 (presumably the channel axis — confirm against the
            audio component's output format).

    Returns:
        The transcription passed through ``add_urdu_punctuation``, or a
        user-facing message when the audio is missing, empty or silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio

    # Down-mix to mono and work on a float32 copy.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # np.max raises ValueError on an empty array, so a zero-length recording
    # is treated as having no signal instead of crashing.
    peak = np.max(np.abs(y)) if y.size else 0.0
    # Written as "not peak > 0" so that a NaN peak is rejected as well.
    if not peak > 0:
        return "Audio appears to be silent. Please try again."

    # Peak-normalize so the loudest sample has magnitude 1.
    y /= peak

    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})

    # Add Urdu punctuation
    return add_urdu_punctuation(result.get("text", ""))
# —— Gradio UI ——
# HTML blurb passed to the interface as its description.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
</p>
"""

# HTML link to the project repository, passed to the interface as its article.
article = """
<p style='text-align: center; color: #34C759;'>
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
🌿 Explore the project on GitHub 📚
</a>
</p>
"""

# One single-input example row per bundled sample clip.
examples = [[f"samples/audio{idx}.mp3"] for idx in (1, 2, 3)]

# Single-function interface: audio in (microphone or upload), text out.
demo = gr.Interface(
    title="Urdu Speech Recognition",
    description=description,
    article=article,
    fn=transcribe,
    inputs=gr.Audio(
        label="Record or Upload Audio (Urdu)",
        type="numpy",
        sources=["microphone", "upload"],
    ),
    outputs=gr.Textbox(
        placeholder="Transcribed Urdu text will appear here...",
        label="Transcribed Text (Urdu)",
    ),
    examples=examples,
    theme="JohnSmith9982/small_and_pretty",
    allow_flagging="never",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|