Abid Ali Awan committed on
Commit
68d0f03
1 Parent(s): 08054ac

Update README.md to include details about the whisper-large-v3-turbo-urdu model and its evaluation results.

Files changed (2)
  1. README.md +6 -1
  2. app.py +82 -0
README.md CHANGED
@@ -11,4 +11,9 @@ license: apache-2.0
  short_description: The most accurate Urdu speech recognition app.
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # whisper-large-v3-turbo-urdu
+
+ This model is a fine-tuned version of [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) on the common_voice_17_0 dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4630
+ - Wer: 0.3826
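
For reference, the fine-tuned checkpoint can also be used outside the Space. The sketch below is not part of this commit; it assumes the `transformers` and `torch` packages are installed and uses a placeholder audio path (`sample_urdu.wav`).

```python
# Minimal standalone usage sketch (not from the commit); the file path is a placeholder.
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="kingabzpro/whisper-large-v3-turbo-urdu",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

# Transcribe a local Urdu audio file (hypothetical filename).
print(asr("sample_urdu.wav")["text"])
```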
app.py ADDED
@@ -0,0 +1,82 @@
+ import gradio as gr
+ import spaces
+ import torch
+ import numpy as np
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ import warnings
+
+ # Suppress warnings
+ warnings.filterwarnings("ignore")
+
+ # Model configuration
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ model_id = "kingabzpro/whisper-large-v3-turbo-urdu"
+
+ # Initialize model and processor
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id,
+     torch_dtype=torch_dtype,
+     use_safetensors=True
+ ).to(device)
+ model.generation_config.forced_decoder_ids = None
+
+ processor = AutoProcessor.from_pretrained(model_id)
+
+ # Create pipeline
+ transcriber = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=torch_dtype,
+     device=device,
+ )
+
+ @spaces.GPU
+ def transcribe(audio):
+     if audio is None:
+         return "No audio provided. Please record or upload an audio file."
+
+     try:
+         sr, y = audio
+
+         # Convert to mono if stereo
+         if y.ndim > 1:
+             y = y.mean(axis=1)
+
+         # Convert to float32 and normalize
+         y = y.astype(np.float32)
+         if np.max(np.abs(y)) > 0:
+             y /= np.max(np.abs(y))
+         else:
+             return "Audio appears to be silent. Please try again."
+
+         # Transcribe using the pipeline
+         result = transcriber({"sampling_rate": sr, "raw": y})
+
+         return result["text"]
+
+     except Exception as e:
+         return f"Error during transcription: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=transcribe,
+     inputs=gr.Audio(
+         sources=["microphone", "upload"],
+         type="numpy",
+         label="Record or Upload Audio (Urdu)"
+     ),
+     outputs=gr.Textbox(
+         label="Transcribed Text (Urdu)",
+         placeholder="Transcribed Urdu text will appear here..."
+     ),
+     title="🎤 Urdu Speech Recognition",
+     description="Record or upload audio in Urdu and get the transcribed text using Whisper Large V3 Turbo Urdu model.",
+     examples=[],
+     allow_flagging="never"
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
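
The `transcribe` function can also be exercised without the UI, since `gr.Audio(type="numpy")` passes it a `(sampling_rate, waveform)` tuple. Below is a minimal local check, not part of the commit, meant to be run in the same session where app.py has been loaded; the synthetic tone only verifies the input handling, so its transcription output is meaningless.

```python
# Hypothetical sanity check of transcribe()'s expected input format
# (not part of app.py): gr.Audio(type="numpy") supplies a
# (sampling_rate, waveform) tuple, mimicked here with a synthetic signal.
import numpy as np

sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s, 440 Hz test tone

print(transcribe((sr, tone)))
```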