File size: 1,474 Bytes
4b0e845 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import speech_recognition as sr
import numpy as np
import collections
import config
class SpeechTranscriber:
    """Buffer raw audio samples and transcribe them with Google Web Speech.

    Samples are accumulated through ``add_audio_chunk`` and consumed by
    ``get_transcript_chunk`` once enough audio has been collected.
    """

    # Width in bytes of one PCM sample handed to ``sr.AudioData``.
    _SAMPLE_WIDTH = 2

    def __init__(self):
        """Configure the recognizer from ``config`` and create the buffer."""
        self.recognizer = sr.Recognizer()
        self.recognizer.energy_threshold = config.ENERGY_THRESHOLD
        self.recognizer.dynamic_energy_threshold = config.DYNAMIC_ENERGY_THRESHOLD
        self.recognizer.pause_threshold = config.PAUSE_THRESHOLD

        # The buffer stores individual samples: add_audio_chunk() extends it
        # and get_transcript_chunk() compares its length against
        # SAMPLE_RATE * MIN_PROCESSING_DURATION. Its capacity therefore has to
        # be expressed in samples as well, otherwise the processing threshold
        # can never be reached.
        # NOTE(review): assumes BUFFER_DURATION is in seconds - confirm in config.
        # int() because deque requires an integer maxlen.
        buffer_capacity = int(config.SAMPLE_RATE * config.BUFFER_DURATION)
        self.audio_buffer = collections.deque(maxlen=buffer_capacity)
        self.last_processed = 0

    def add_audio_chunk(self, audio_chunk) -> None:
        """Append every sample of ``audio_chunk`` to the buffer.

        Once the buffer is at capacity the oldest samples are dropped
        (standard bounded-deque behaviour).
        """
        self.audio_buffer.extend(audio_chunk)

    @staticmethod
    def _to_pcm16_bytes(samples) -> bytes:
        """Return ``samples`` encoded as little-endian 16-bit PCM bytes.

        The explicit conversion keeps the byte stream in step with the 2-byte
        sample width declared to ``sr.AudioData``, whatever dtype NumPy infers
        for the buffered values.
        """
        pcm = np.asarray(samples)
        if np.issubdtype(pcm.dtype, np.floating):
            # NOTE(review): float input is assumed to be normalised to
            # [-1.0, 1.0] - confirm against the audio capture code.
            pcm = np.clip(pcm, -1.0, 1.0) * 32767.0
        return pcm.astype("<i2").tobytes()

    def get_transcript_chunk(self):
        """Transcribe the buffered audio, if there is enough of it.

        Returns:
            The recognised text, or ``None`` when the buffer holds fewer than
            ``SAMPLE_RATE * MIN_PROCESSING_DURATION`` samples, when the speech
            was unintelligible, or when the recognition request failed.
        """
        # Only process if we have enough audio
        min_samples = config.SAMPLE_RATE * config.MIN_PROCESSING_DURATION
        if len(self.audio_buffer) < min_samples:
            return None

        # Take the samples out of the buffer *before* the blocking request so
        # that anything appended while the request is in flight is kept for
        # the next call instead of being wiped afterwards.
        samples = list(self.audio_buffer)
        self.audio_buffer.clear()

        # Convert to AudioData format
        audio_data = sr.AudioData(
            self._to_pcm16_bytes(samples),
            config.SAMPLE_RATE,
            self._SAMPLE_WIDTH,
        )

        try:
            # Use Google Web Speech API for best accuracy
            return self.recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            # Audio was not recognisable as speech.
            return None
        except sr.RequestError as e:
            print(f"Speech recognition error: {str(e)}")
            return None