navyaparesh committed
Commit d7ba1b9 · verified · 1 Parent(s): 59f7f98

Upload 4 files

Files changed (4)
  1. README (1).md +14 -0
  2. app (2).py +233 -0
  3. gitattributes (1) +39 -0
  4. requirements (1).txt +26 -0
README (1).md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Indic ASR
+ emoji: 🏆
+ colorFrom: gray
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 5.20.1
+ app_file: app.py
+ pinned: false
+ license: cc-by-4.0
+ short_description: A speech recognition tool for Indic languages.
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app (2).py ADDED
@@ -0,0 +1,233 @@
+ from __future__ import annotations
+
+ import os
+
+ import gradio as gr
+
+ import torch
+ import torchaudio
+
+ import spaces
+
+ import nemo.collections.asr as nemo_asr
+
+ LANGUAGE_NAME_TO_CODE = {
+     "Assamese": "as",
+     "Bengali": "bn",
+     "Bodo": "br",
+     "Dogri": "doi",
+     "Gujarati": "gu",
+     "Hindi": "hi",
+     "Kannada": "kn",
+     "Kashmiri": "ks",
+     "Konkani": "kok",
+     "Maithili": "mai",
+     "Malayalam": "ml",
+     "Manipuri": "mni",
+     "Marathi": "mr",
+     "Nepali": "ne",
+     "Odia": "or",
+     "Punjabi": "pa",
+     "Sanskrit": "sa",
+     "Santali": "sat",
+     "Sindhi": "sd",
+     "Tamil": "ta",
+     "Telugu": "te",
+     "Urdu": "ur",
+ }
+
+
+ DESCRIPTION = """\
+ ### **IndicConformer: Speech Recognition for Indian Languages** 🎙️➡️📜
+
+ This Gradio demo showcases **IndicConformer**, a speech recognition model covering **22 Indian languages**. The model operates in two modes, **CTC (Connectionist Temporal Classification)** and **RNNT (Recurrent Neural Network Transducer)**, providing robust and accurate transcriptions across diverse linguistic and acoustic conditions.
+
+ #### **How to Use:**
+ 1. **Upload or record** an audio clip in any supported Indian language.
+ 2. Select the **language** of the audio from the dropdown.
+ 3. Pick the **mode** (CTC or RNNT) via the tabs and click **"Transcribe"**.
+ 4. View or copy the transcription for further use.
+
+ 🚀 Try it out and experience seamless speech recognition for Indian languages!
+ """
+
+ hf_token = os.getenv("HF_TOKEN")
+ device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+ torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32
+ model_name_or_path = "ai4bharat/IndicConformer"
+ model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name_or_path).to(device)
+ # model = nemo_asr.models.EncDecCTCModel.restore_from("indicconformer_stt_bn_hybrid_rnnt_large.nemo").to(device)
+ model.eval()
+
+ CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()
+
+ AUDIO_SAMPLE_RATE = 16000
+ MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+ DEFAULT_TARGET_LANGUAGE = "Bengali"
+
+ @spaces.GPU
+ def run_asr_ctc(input_audio: str, target_language: str) -> str:
+     lang_id = LANGUAGE_NAME_TO_CODE[target_language]
+
+     # Load the audio and downmix to mono if needed
+     audio_tensor, orig_freq = torchaudio.load(input_audio)
+     if audio_tensor.shape[0] > 1:
+         audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)
+
+     # Flatten [1 x T] to a 1-D waveform, which is what transcribe expects here
+     audio_tensor = audio_tensor.squeeze(0)
+
+     # Resample to the model's 16 kHz input rate
+     audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=orig_freq, new_freq=AUDIO_SAMPLE_RATE)
+
+     model.cur_decoder = "ctc"
+     ctc_text = model.transcribe([audio_tensor.numpy()], batch_size=1, logprobs=False, language_id=lang_id)[0]
+
+     return ctc_text[0]
+
+ @spaces.GPU
+ def run_asr_rnnt(input_audio: str, target_language: str) -> str:
+     lang_id = LANGUAGE_NAME_TO_CODE[target_language]
+
+     # Load the audio and downmix to mono if needed
+     audio_tensor, orig_freq = torchaudio.load(input_audio)
+     if audio_tensor.shape[0] > 1:
+         audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)
+
+     # Flatten [1 x T] to a 1-D waveform, which is what transcribe expects here
+     audio_tensor = audio_tensor.squeeze(0)
+
+     # Resample to the model's 16 kHz input rate
+     audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=orig_freq, new_freq=AUDIO_SAMPLE_RATE)
+
+     model.cur_decoder = "rnnt"
+     rnnt_text = model.transcribe([audio_tensor.numpy()], batch_size=1, logprobs=False, language_id=lang_id)[0]
+
+     return rnnt_text[0]
+
+
+ with gr.Blocks() as demo_asr_ctc:
+     with gr.Row():
+         with gr.Column():
+             with gr.Group():
+                 input_audio = gr.Audio(label="Input speech", type="filepath")
+                 target_language = gr.Dropdown(
+                     label="Target language",
+                     choices=list(LANGUAGE_NAME_TO_CODE.keys()),
+                     value=DEFAULT_TARGET_LANGUAGE,
+                 )
+                 btn = gr.Button("Transcribe")
+         with gr.Column():
+             output_text = gr.Textbox(label="Transcribed text")
+
+     gr.Examples(
+         examples=[
+             ["assets/Bengali.wav", "Bengali"],
+             ["assets/Gujarati.wav", "Gujarati"],
+             ["assets/Punjabi.wav", "Punjabi"],
+         ],
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         fn=run_asr_ctc,
+         cache_examples=CACHE_EXAMPLES,
+         api_name=False,
+     )
+
+     btn.click(
+         fn=run_asr_ctc,
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         api_name="asr_ctc",
+     )
+
+ with gr.Blocks() as demo_asr_rnnt:
+     with gr.Row():
+         with gr.Column():
+             with gr.Group():
+                 input_audio = gr.Audio(label="Input speech", type="filepath")
+                 target_language = gr.Dropdown(
+                     label="Target language",
+                     choices=list(LANGUAGE_NAME_TO_CODE.keys()),
+                     value=DEFAULT_TARGET_LANGUAGE,
+                 )
+                 btn = gr.Button("Transcribe")
+         with gr.Column():
+             output_text = gr.Textbox(label="Transcribed text")
+
+     gr.Examples(
+         examples=[
+             ["assets/Bengali.wav", "Bengali"],
+             ["assets/Gujarati.wav", "Gujarati"],
+             ["assets/Punjabi.wav", "Punjabi"],
+         ],
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         fn=run_asr_rnnt,
+         cache_examples=CACHE_EXAMPLES,
+         api_name=False,
+     )
+
+     btn.click(
+         fn=run_asr_rnnt,
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         api_name="asr_rnnt",
+     )
+
+
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(
+         value="Duplicate Space for private use",
+         elem_id="duplicate-button",
+         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+     )
+
+     with gr.Tabs():
+         with gr.Tab(label="CTC"):
+             demo_asr_ctc.render()
+         with gr.Tab(label="RNNT"):
+             demo_asr_rnnt.render()
+
+
+ if __name__ == "__main__":
+     demo.queue(max_size=50).launch()
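
Because both Transcribe buttons register named API endpoints, a running copy of this Space can also be called programmatically. A minimal sketch using gradio_client (the Space ID "navyaparesh/indic-asr" and the audio file name are placeholders; the endpoint name follows the api_name="asr_ctc" registration above):

    # Minimal client-side sketch; assumes the Space is running and public.
    # "navyaparesh/indic-asr" is a placeholder Space ID, and sample.wav is
    # any local audio clip in a supported language.
    from gradio_client import Client, handle_file

    client = Client("navyaparesh/indic-asr")  # placeholder: substitute the real Space ID
    text = client.predict(
        handle_file("sample.wav"),  # input_audio
        "Bengali",                  # target_language
        api_name="/asr_ctc",
    )
    print(text)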
gitattributes (1) ADDED
@@ -0,0 +1,39 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ indicconformer_stt_bn_hybrid_rnnt_large.nemo filter=lfs diff=lfs merge=lfs -text
+ Bengali.wav filter=lfs diff=lfs merge=lfs -text
+ Gujarati.wav filter=lfs diff=lfs merge=lfs -text
+ Punjabi.wav filter=lfs diff=lfs merge=lfs -text
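
The three example wavs at the end are tracked with Git LFS. A quick way to confirm they were actually fetched, rather than left as small LFS pointer stubs, is to try loading one (the assets/ path is assumed from app.py, which references the clips there):

    # Sanity-check sketch: torchaudio.load() fails on an un-fetched LFS pointer,
    # since the pointer is a short text file rather than real audio.
    import torchaudio

    wav, sr = torchaudio.load("assets/Bengali.wav")
    print(wav.shape, sr)  # app.py resamples to 16 kHz whatever sr is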
requirements (1).txt ADDED
@@ -0,0 +1,26 @@
+ git+https://github.com/AshwinSankar17/NeMo-ai4b@nemo-v2
+ torchaudio
+ pytorch-lightning==2.4.0
+ hydra-core==1.3.2
+ librosa==0.10.2.post1
+ sentencepiece==0.2.0
+ pandas==2.2.2
+ lhotse==1.27.0
+ editdistance==0.8.1
+ jiwer==3.0.4
+ pyannote.audio
+ webdataset==0.2.100
+ cython==0.29.37
+ pyyaml==6.0.2
+ argparse==1.4.0
+ onnxruntime==1.19.0
+ tqdm==4.66.5
+ transformers
+ huggingface_hub
+ tokenizers
+ datasets
+ inflect
+ IPython
+ soundfile
+ pydub
+ numpy<2.0
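
After `pip install -r requirements.txt`, a short import check can confirm the key pins took effect (a sketch only; the nemo package here is provided by the NeMo-ai4b fork pinned above):

    # Post-install sanity check for the pins above.
    import numpy
    import torchaudio
    import nemo.collections.asr as nemo_asr  # from the NeMo-ai4b fork

    assert int(numpy.__version__.split(".")[0]) < 2, "requirements pin numpy<2.0"
    print("torchaudio:", torchaudio.__version__)
    print("NeMo ASR collection imported OK")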