Spaces:

nareauow
/

speaker-recognition

Sleeping

nareauow committed on May 1

Commit

7b6df09

verified ·

1 Parent(s): b2a6006

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -128,10 +128,8 @@ def recognize_speech(audio_path):
         return "Speech recognition model not available"
     try:
-        # Read audio file
         audio_data, sr = sf.read(audio_path)
-        # Resample to 16kHz if needed
         if sr != 16000:
             audio_data = np.interp(
                 np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
@@ -140,26 +138,26 @@ def recognize_speech(audio_path):
             )
             sr = 16000
-        # Process audio
-        inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        # Generate transcription with specific parameters to prevent repetition
         generated_ids = speech_recognizer.generate(
-            **inputs,
-            max_length=100,  # Limit output length
-            num_beams=1,      # Use greedy search instead of beam search
-            no_repeat_ngram_size=2,  # Prevent repeating n-grams
         )
-        # Decode with skip special tokens
         transcription = speech_processor.batch_decode(
             generated_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=True
         )[0]
-        return transcription.strip()  # Remove any extra whitespace
     except Exception as e:
         return f"Speech recognition error: {str(e)}"

         return "Speech recognition model not available"
     try:
         audio_data, sr = sf.read(audio_path)
         if sr != 16000:
             audio_data = np.interp(
                 np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
             )
             sr = 16000
+        inputs = speech_processor(
+            audio_data,
+            sampling_rate=sr,
+            return_tensors="pt"
+        ).to(device)
         generated_ids = speech_recognizer.generate(
+            input_features=inputs["input_features"],
+            max_length=100,
+            num_beams=5,  # Changed from 1 to 5 for better results
+            early_stopping=True,
+            no_repeat_ngram_size=2
         )
         transcription = speech_processor.batch_decode(
             generated_ids,
+            skip_special_tokens=True
         )[0]
+        return transcription.strip()
     except Exception as e:
         return f"Speech recognition error: {str(e)}"