import collections

import numpy as np
import speech_recognition as sr

import config


class SpeechTranscriber:
    """Accumulate raw audio samples and transcribe them on demand.

    Samples are appended with :meth:`add_audio_chunk` into a bounded buffer.
    :meth:`get_transcript_chunk` sends the buffered audio to the Google Web
    Speech API once enough samples have been collected, then empties the
    buffer.

    Recognizer tuning and buffer sizing are read from the ``config`` module
    (``ENERGY_THRESHOLD``, ``DYNAMIC_ENERGY_THRESHOLD``, ``PAUSE_THRESHOLD``,
    ``SAMPLE_RATE``, ``BUFFER_DURATION``, ``MIN_PROCESSING_DURATION``).
    """

    # Bytes per sample declared to ``sr.AudioData``; the buffered samples are
    # converted to int16 so the byte stream actually matches this width.
    SAMPLE_WIDTH = 2

    def __init__(self):
        """Create the recognizer and an empty, bounded sample buffer."""
        self.recognizer = sr.Recognizer()
        self.recognizer.energy_threshold = config.ENERGY_THRESHOLD
        self.recognizer.dynamic_energy_threshold = config.DYNAMIC_ENERGY_THRESHOLD
        self.recognizer.pause_threshold = config.PAUSE_THRESHOLD

        # The buffer holds individual samples (add_audio_chunk extends it) and
        # get_transcript_chunk compares its length against a sample count, so
        # the capacity has to be expressed in samples as well. The previous
        # ``BUFFER_DURATION * 10`` capped the buffer far below the processing
        # threshold for ordinary settings, so nothing was ever transcribed.
        # NOTE(review): assumes BUFFER_DURATION is in seconds - confirm in config.
        buffer_samples = int(config.SAMPLE_RATE * config.BUFFER_DURATION)
        self.audio_buffer = collections.deque(maxlen=buffer_samples)
        self.last_processed = 0

    def add_audio_chunk(self, audio_chunk):
        """Append every sample in ``audio_chunk`` to the buffer.

        When the buffer is full the oldest samples are dropped (bounded
        ``deque`` behaviour).

        Args:
            audio_chunk: Iterable of audio samples.
        """
        self.audio_buffer.extend(audio_chunk)

    def _buffered_pcm_bytes(self):
        """Return the buffered samples encoded as 16-bit PCM bytes."""
        samples = np.asarray(self.audio_buffer)
        if np.issubdtype(samples.dtype, np.floating):
            # NOTE(review): float audio is presumed to be normalised to
            # [-1.0, 1.0] when its peak does not exceed 1.0; such data is
            # scaled to the int16 range. Confirm against the audio source.
            if samples.size and np.max(np.abs(samples)) <= 1.0:
                samples = samples * 32767.0
            samples = np.clip(np.rint(samples), -32768, 32767)
        # Force int16: an inferred dtype (int64 for Python ints, float32/64
        # for float audio) would not match the declared 2-byte sample width.
        return samples.astype(np.int16).tobytes()

    def get_transcript_chunk(self):
        """Transcribe the buffered audio if enough has been collected.

        Returns:
            The recognised text, or ``None`` when there is not yet enough
            audio, the recogniser raised ``sr.UnknownValueError``, or the
            request raised ``sr.RequestError`` (which is also printed).
        """
        # Only process once we have enough audio.
        min_samples = config.SAMPLE_RATE * config.MIN_PROCESSING_DURATION
        if len(self.audio_buffer) < min_samples:
            return None

        audio_data = sr.AudioData(
            self._buffered_pcm_bytes(),
            config.SAMPLE_RATE,
            self.SAMPLE_WIDTH,
        )

        try:
            return self.recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return None
        except sr.RequestError as e:
            print(f"Speech recognition error: {e}")
            return None
        finally:
            # The buffer is emptied after every recognition attempt,
            # whether it succeeded or not.
            self.audio_buffer.clear()