Abid Ali Awan committed on
Commit
68d0f03
1 Parent(s): 08054ac

Update README.md to include details about the whisper-large-v3-turbo-urdu model and its evaluation results.

Files changed (2)
  1. README.md +6 -1
  2. app.py +82 -0
README.md CHANGED
@@ -11,4 +11,9 @@ license: apache-2.0
  short_description: The most accurate Urdu speech recognition app.
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # whisper-large-v3-turbo-urdu
+
+ This model is a fine-tuned version of [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) on the common_voice_17_0 dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4630
+ - Wer: 0.3826
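
For reference, the fine-tuned checkpoint can also be used outside the Space. The sketch below is not part of this commit; it assumes the `transformers` and `torch` packages are installed and uses a placeholder audio path (`sample_urdu.wav`).

```python
# Minimal standalone usage sketch (not from the commit); the file path is a placeholder.
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="kingabzpro/whisper-large-v3-turbo-urdu",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

# Transcribe a local Urdu audio file (hypothetical filename).
print(asr("sample_urdu.wav")["text"])
```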
app.py ADDED
@@ -0,0 +1,82 @@
+ import gradio as gr
+ import spaces
+ import torch
+ import numpy as np
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ import warnings
+
+ # Suppress warnings
+ warnings.filterwarnings("ignore")
+
+ # Model configuration
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ model_id = "kingabzpro/whisper-large-v3-turbo-urdu"
+
+ # Initialize model and processor
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id,
+     torch_dtype=torch_dtype,
+     use_safetensors=True
+ ).to(device)
+ model.generation_config.forced_decoder_ids = None
+
+ processor = AutoProcessor.from_pretrained(model_id)
+
+ # Create pipeline
+ transcriber = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=torch_dtype,
+     device=device,
+ )
+
+ @spaces.GPU
+ def transcribe(audio):
+     if audio is None:
+         return "No audio provided. Please record or upload an audio file."
+
+     try:
+         sr, y = audio
+
+         # Convert to mono if stereo
+         if y.ndim > 1:
+             y = y.mean(axis=1)
+
+         # Convert to float32 and normalize
+         y = y.astype(np.float32)
+         if np.max(np.abs(y)) > 0:
+             y /= np.max(np.abs(y))
+         else:
+             return "Audio appears to be silent. Please try again."
+
+         # Transcribe using the pipeline
+         result = transcriber({"sampling_rate": sr, "raw": y})
+
+         return result["text"]
+
+     except Exception as e:
+         return f"Error during transcription: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=transcribe,
+     inputs=gr.Audio(
+         sources=["microphone", "upload"],
+         type="numpy",
+         label="Record or Upload Audio (Urdu)"
+     ),
+     outputs=gr.Textbox(
+         label="Transcribed Text (Urdu)",
+         placeholder="Transcribed Urdu text will appear here..."
+     ),
+     title="🎤 Urdu Speech Recognition",
+     description="Record or upload audio in Urdu and get the transcribed text using Whisper Large V3 Turbo Urdu model.",
+     examples=[],
+     allow_flagging="never"
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
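
The `transcribe` function can also be exercised without the UI, since `gr.Audio(type="numpy")` passes it a `(sampling_rate, waveform)` tuple. Below is a minimal local check, not part of the commit, meant to be run in the same session where app.py has been loaded; the synthetic tone only verifies the input handling, so its transcription output is meaningless.

```python
# Hypothetical sanity check of transcribe()'s expected input format
# (not part of app.py): gr.Audio(type="numpy") supplies a
# (sampling_rate, waveform) tuple, mimicked here with a synthetic signal.
import numpy as np

sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s, 440 Hz test tone

print(transcribe((sr, tone)))
```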