Athspi committed
Commit ac8d452 · verified · 1 Parent(s): 7d07125

Update app.py

Files changed (1)
  1. app.py +22 -18
app.py CHANGED
@@ -3,7 +3,7 @@ import whisper
  import torch
  import os
  from pydub import AudioSegment
- from transformers import AutoProcessor, AutoModelForCTC
+ from transformers import pipeline
 
  # Mapping of model names to Whisper model sizes
  MODELS = {
@@ -14,13 +14,13 @@ MODELS = {
      "Large (Most Accurate)": "large"
  }
 
- # Fine-tuned Wav2Vec2 models for specific languages
- WAV2VEC2_MODELS = {
+ # Fine-tuned models for specific languages
+ FINE_TUNED_MODELS = {
      "Tamil": {
-         "processor": "Amrrs/wav2vec2-large-xlsr-53-tamil",
-         "model": "Amrrs/wav2vec2-large-xlsr-53-tamil"
+         "model": "vasista22/whisper-tamil-medium",
+         "language": "ta"
      },
-     # Add more Wav2Vec2 models for other languages here
+     # Add more fine-tuned models for other languages here
  }
 
  # Mapping of full language names to language codes
@@ -136,17 +136,21 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faste
      audio.export(processed_audio_path, format="wav")
 
      # Load the appropriate model
-     if language in WAV2VEC2_MODELS:
-         # Use the fine-tuned Wav2Vec2 model for the selected language
-         processor = AutoProcessor.from_pretrained(WAV2VEC2_MODELS[language]["processor"])
-         model = AutoModelForCTC.from_pretrained(WAV2VEC2_MODELS[language]["model"])
-
-         # Load audio and process
-         inputs = processor(AudioSegment.from_file(processed_audio_path).raw_data, sampling_rate=16000, return_tensors="pt")
-         with torch.no_grad():
-             logits = model(inputs.input_values).logits
-         predicted_ids = torch.argmax(logits, dim=-1)
-         transcription = processor.decode(predicted_ids[0])
+     if language in FINE_TUNED_MODELS:
+         # Use the fine-tuned Whisper model for the selected language
+         device = "cuda:0" if torch.cuda.is_available() else "cpu"
+         transcribe = pipeline(
+             task="automatic-speech-recognition",
+             model=FINE_TUNED_MODELS[language]["model"],
+             chunk_length_s=30,
+             device=device
+         )
+         transcribe.model.config.forced_decoder_ids = transcribe.tokenizer.get_decoder_prompt_ids(
+             language=FINE_TUNED_MODELS[language]["language"],
+             task="transcribe"
+         )
+         result = transcribe(processed_audio_path)
+         transcription = result["text"]
          detected_language = language
      else:
          # Use the selected Whisper model
@@ -192,7 +196,7 @@ with gr.Blocks() as demo:
 
      # Update model dropdown based on language selection
      def update_model_dropdown(language):
-         if language in WAV2VEC2_MODELS:
+         if language in FINE_TUNED_MODELS:
              return gr.Dropdown(interactive=False, value=f"Fine-Tuned {language} Model")
          else:
              return gr.Dropdown(choices=list(MODELS.keys()), interactive=True, value="Base (Faster)")
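
For reference, the new fine-tuned path added by this commit can be exercised outside the Gradio app roughly as follows. This is a minimal standalone sketch, not code from app.py: the audio path "tamil_sample.wav" is a placeholder, and it assumes ffmpeg is available so the transformers pipeline can decode the file directly.

import torch
from transformers import pipeline

# Fine-tuned Whisper checkpoint registered for Tamil in FINE_TUNED_MODELS.
MODEL_ID = "vasista22/whisper-tamil-medium"

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Build the ASR pipeline; chunk_length_s=30 splits long recordings into
# 30-second windows, matching the configuration in transcribe_audio().
transcribe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_ID,
    chunk_length_s=30,
    device=device,
)

# Force Tamil output with the transcription task (rather than translation),
# the same way the updated transcribe_audio() does.
transcribe.model.config.forced_decoder_ids = transcribe.tokenizer.get_decoder_prompt_ids(
    language="ta", task="transcribe"
)

# "tamil_sample.wav" is a placeholder path for a local recording.
result = transcribe("tamil_sample.wav")
print(result["text"])

Because the language is forced via forced_decoder_ids, Whisper's language detection is skipped on this path, and the app simply reports the selected language back as detected_language.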
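The update_model_dropdown change only swaps the lookup table from WAV2VEC2_MODELS to FINE_TUNED_MODELS. For context, the kind of Gradio wiring such a handler typically plugs into looks roughly like the sketch below; the component names language_dropdown and model_dropdown and the choice lists are illustrative assumptions, not taken from this diff.

import gradio as gr

# These two tables mirror app.py; the rest of the demo is illustrative.
FINE_TUNED_MODELS = {"Tamil": {"model": "vasista22/whisper-tamil-medium", "language": "ta"}}
MODELS = {"Base (Faster)": "base", "Large (Most Accurate)": "large"}

with gr.Blocks() as demo:
    # Hypothetical component names; app.py defines its own UI layout.
    language_dropdown = gr.Dropdown(choices=["Auto Detect", "Tamil"], value="Auto Detect", label="Language")
    model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), value="Base (Faster)", label="Model")

    def update_model_dropdown(language):
        # Languages with a dedicated fine-tuned model lock the model selection.
        if language in FINE_TUNED_MODELS:
            return gr.Dropdown(interactive=False, value=f"Fine-Tuned {language} Model")
        else:
            return gr.Dropdown(choices=list(MODELS.keys()), interactive=True, value="Base (Faster)")

    # Re-render the model dropdown whenever the language selection changes.
    language_dropdown.change(update_model_dropdown, inputs=language_dropdown, outputs=model_dropdown)

if __name__ == "__main__":
    demo.launch()

With this wiring, picking Tamil locks the model dropdown and labels it as the fine-tuned model; any other language restores the regular Whisper size choices.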