nareauow committed on
Commit
7b6df09
·
verified ·
1 Parent(s): b2a6006

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -14
app.py CHANGED
@@ -128,10 +128,8 @@ def recognize_speech(audio_path):
128
  return "Speech recognition model not available"
129
 
130
  try:
131
- # Read audio file
132
  audio_data, sr = sf.read(audio_path)
133
 
134
- # Resample to 16kHz if needed
135
  if sr != 16000:
136
  audio_data = np.interp(
137
  np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
@@ -140,26 +138,26 @@ def recognize_speech(audio_path):
140
  )
141
  sr = 16000
142
 
143
- # Process audio
144
- inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
145
- inputs = {k: v.to(device) for k, v in inputs.items()}
 
 
146
 
147
- # Generate transcription with specific parameters to prevent repetition
148
  generated_ids = speech_recognizer.generate(
149
- **inputs,
150
- max_length=100, # Limit output length
151
- num_beams=1, # Use greedy search instead of beam search
152
- no_repeat_ngram_size=2, # Prevent repeating n-grams
 
153
  )
154
 
155
- # Decode with skip special tokens
156
  transcription = speech_processor.batch_decode(
157
  generated_ids,
158
- skip_special_tokens=True,
159
- clean_up_tokenization_spaces=True
160
  )[0]
161
 
162
- return transcription.strip() # Remove any extra whitespace
163
 
164
  except Exception as e:
165
  return f"Speech recognition error: {str(e)}"
 
128
  return "Speech recognition model not available"
129
 
130
  try:
 
131
  audio_data, sr = sf.read(audio_path)
132
 
 
133
  if sr != 16000:
134
  audio_data = np.interp(
135
  np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
 
138
  )
139
  sr = 16000
140
 
141
+ inputs = speech_processor(
142
+ audio_data,
143
+ sampling_rate=sr,
144
+ return_tensors="pt"
145
+ ).to(device)
146
 
 
147
  generated_ids = speech_recognizer.generate(
148
+ input_features=inputs["input_features"],
149
+ max_length=100,
150
+ num_beams=5, # Changed from 1 to 5 for better results
151
+ early_stopping=True,
152
+ no_repeat_ngram_size=2
153
  )
154
 
 
155
  transcription = speech_processor.batch_decode(
156
  generated_ids,
157
+ skip_special_tokens=True
 
158
  )[0]
159
 
160
+ return transcription.strip()
161
 
162
  except Exception as e:
163
  return f"Speech recognition error: {str(e)}"