Athspi committed
Commit c9c7876 · verified · 1 Parent(s): 8b5c488

Update app.py

Files changed (1): app.py (+60 -47)
app.py CHANGED
@@ -1,9 +1,10 @@
 import gradio as gr
 import whisper
-import torch
 import os
 from pydub import AudioSegment
-from huggingsound import SpeechRecognitionModel
+from transformers import AutoProcessor, AutoModelForCTC
+import torchaudio
+import torch
 
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -14,12 +15,8 @@ MODELS = {
     "Large (Most Accurate)": "large"
 }
 
-# HuggingSound model for Arabic
-HUGGINGSOUND_MODEL = {
-    "Arabic": {
-        "model": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
-    }
-}
+# Fine-tuned Sinhala model (using Hugging Face Transformers)
+SINHALA_MODEL = "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala"
 
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
@@ -87,7 +84,7 @@ LANGUAGE_NAME_TO_CODE = {
     "Galician": "gl",
     "Marathi": "mr",
     "Punjabi": "pa",
-    "Sinhala": "si",  # Sinhala support
+    "Sinhala": "si",
     "Khmer": "km",
     "Shona": "sn",
     "Yoruba": "yo",
@@ -125,76 +122,92 @@ LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
 
-def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
-    """Transcribe the audio file."""
-    # Convert audio to 16kHz mono for better compatibility
+def transcribe_with_whisper(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+    """Transcribe using OpenAI's Whisper models."""
+    model = whisper.load_model(MODELS[model_size])
+
+    # Convert audio to 16kHz mono for compatibility with Whisper
     audio = AudioSegment.from_file(audio_file)
     audio = audio.set_frame_rate(16000).set_channels(1)
     processed_audio_path = "processed_audio.wav"
     audio.export(processed_audio_path, format="wav")
 
-    # Load the appropriate model
-    if language in HUGGINGSOUND_MODEL:
-        # Use the HuggingSound model for the selected language
-        model = SpeechRecognitionModel(HUGGINGSOUND_MODEL[language]["model"])
-        transcriptions = model.transcribe([processed_audio_path])
-        transcription = transcriptions[0]["transcription"]
-        detected_language = language
+    # Transcribe the audio
+    if language == "Auto Detect":
+        result = model.transcribe(processed_audio_path, fp16=False)
+        detected_language = result.get("language", "unknown")
     else:
-        # Use the selected Whisper model
-        model = whisper.load_model(MODELS[model_size])
-
-        # Transcribe the audio
-        if language == "Auto Detect":
-            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
-            detected_language = result.get("language", "unknown")
-        else:
-            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-            detected_language = language_code
-
-        transcription = result["text"]
-
+        language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+        result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+        detected_language = language_code
+
     # Clean up processed audio file
     os.remove(processed_audio_path)
 
     # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+    return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
+
+def transcribe_with_sinhala_model(audio_file):
+    """Transcribe using the fine-tuned Sinhala Wav2Vec2 model."""
+    processor = AutoProcessor.from_pretrained(SINHALA_MODEL)
+    model = AutoModelForCTC.from_pretrained(SINHALA_MODEL)
+
+    # Convert audio to 16kHz mono
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_frame_rate(16000).set_channels(1)
+    processed_audio_path = "processed_audio.wav"
+    audio.export(processed_audio_path, format="wav")
+
+    # Load and process audio
+    audio_input, _ = torchaudio.load(processed_audio_path)
+    input_values = processor(audio_input.squeeze(), return_tensors="pt", sampling_rate=16000).input_values
+    logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+
+    # Decode prediction
+    transcription = processor.batch_decode(predicted_ids)[0]
+
+    # Clean up processed audio file
+    os.remove(processed_audio_path)
+
+    return f"Transcription:\n{transcription}"
+
+def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+    """Wrapper to select the correct transcription method."""
+    if language == "Sinhala":
+        return transcribe_with_sinhala_model(audio_file)
+    else:
+        return transcribe_with_whisper(audio_file, language, model_size)
 
 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Transcription with HuggingSound and Whisper")
+    gr.Markdown("# Audio Transcription and Language Detection")
 
     with gr.Tab("Transcribe Audio"):
         gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
         transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
         language_dropdown = gr.Dropdown(
-            choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
+            choices=list(LANGUAGE_NAME_TO_CODE.keys()),
             label="Select Language",
             value="Auto Detect"
         )
        model_dropdown = gr.Dropdown(
-            choices=list(MODELS.keys()),  # Model options
+            choices=list(MODELS.keys()),
             label="Select Model",
-            value="Base (Faster)",  # Default to "Base" model
-            interactive=True  # Allow model selection by default
+            value="Base (Faster)"
        )
        transcribe_output = gr.Textbox(label="Transcription and Detected Language")
        transcribe_button = gr.Button("Transcribe Audio")
 
        # Update model dropdown based on language selection
        def update_model_dropdown(language):
-            if language in HUGGINGSOUND_MODEL:
-                # Add "HuggingSound Model" to the dropdown choices and disable it
-                return gr.Dropdown(choices=["HuggingSound Model"], value="HuggingSound Model", interactive=False)
+            if language == "Sinhala":
+                return gr.Dropdown(interactive=False, value="Fine-Tuned Sinhala Model")
            else:
-                # Reset the dropdown to standard Whisper models
-                return gr.Dropdown(choices=list(MODELS.keys()), value="Base (Faster)", interactive=True)
+                return gr.Dropdown(choices=list(MODELS.keys()), interactive=True, value="Base (Faster)")
 
        language_dropdown.change(update_model_dropdown, inputs=language_dropdown, outputs=model_dropdown)
-
-        # Link button to function
        transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
 
 # Launch the Gradio interface
-demo.launch()
+demo.launch()
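
For a quick sanity check outside the Gradio UI, the new Sinhala branch added by this commit can be exercised on its own. The sketch below is illustrative, not part of the commit: "sample.wav" is a placeholder path, and the mono downmix, resampling guard, and torch.no_grad() context are defensive additions beyond what app.py does (app.py instead normalizes to 16 kHz mono with pydub before inference).

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC

SINHALA_MODEL = "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala"

processor = AutoProcessor.from_pretrained(SINHALA_MODEL)
model = AutoModelForCTC.from_pretrained(SINHALA_MODEL)

waveform, sample_rate = torchaudio.load("sample.wav")  # placeholder input file
waveform = waveform.mean(dim=0)  # downmix to mono
if sample_rate != 16000:
    # The model expects 16 kHz input; app.py guarantees this via pydub instead
    waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

inputs = processor(waveform, return_tensors="pt", sampling_rate=16000)
with torch.no_grad():  # inference only; gradients are not needed
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids)[0])

One caveat on the committed UI code: update_model_dropdown returns gr.Dropdown(value="Fine-Tuned Sinhala Model") without adding that string to the dropdown's choices, and some Gradio versions warn or error when a value is not among the choices. A likely fix, mirroring the removed HuggingSound branch, would be gr.Dropdown(choices=["Fine-Tuned Sinhala Model"], value="Fine-Tuned Sinhala Model", interactive=False).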