File size: 1,474 Bytes
4b0e845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import speech_recognition as sr
import numpy as np
import collections
import config

class SpeechTranscriber:
    """Accumulate audio samples and transcribe them via Google Web Speech.

    Samples are collected one element at a time in a bounded deque
    (``audio_buffer``). Once enough audio has accumulated,
    ``get_transcript_chunk`` packs the buffered samples into an
    ``sr.AudioData`` object and sends it to the recognizer.

    NOTE(review): the buffer sizing assumes ``config.BUFFER_DURATION`` and
    ``config.MIN_PROCESSING_DURATION`` are expressed in seconds and
    ``config.SAMPLE_RATE`` in samples per second -- confirm against config.
    """

    # Bytes per sample declared to ``sr.AudioData``. Buffered samples are
    # packed as 16-bit integers so the byte stream matches this width.
    SAMPLE_WIDTH = 2

    def __init__(self):
        """Configure the recognizer from ``config`` and create the buffer."""
        self.recognizer = sr.Recognizer()
        self.recognizer.energy_threshold = config.ENERGY_THRESHOLD
        self.recognizer.dynamic_energy_threshold = config.DYNAMIC_ENERGY_THRESHOLD
        self.recognizer.pause_threshold = config.PAUSE_THRESHOLD

        # The buffer holds individual samples (see ``add_audio_chunk``), so
        # its capacity must be counted in samples as well. It is never made
        # smaller than the minimum processing window; otherwise the length
        # check in ``get_transcript_chunk`` could never be satisfied.
        window = max(config.BUFFER_DURATION, config.MIN_PROCESSING_DURATION)
        self.audio_buffer = collections.deque(
            maxlen=int(config.SAMPLE_RATE * window)
        )
        self.last_processed = 0

    @staticmethod
    def _to_pcm16(samples):
        """Pack a sequence of numeric samples into 16-bit integer bytes.

        Values outside the int16 range are saturated rather than wrapped.
        Fractional values are truncated by the integer cast.

        NOTE(review): samples are presumed to already be on the 16-bit PCM
        scale; normalised floats in [-1.0, 1.0] would need scaling first.
        """
        clipped = np.clip(np.asarray(samples), -32768, 32767)
        return clipped.astype(np.int16).tobytes()

    def add_audio_chunk(self, audio_chunk):
        """Append every element of ``audio_chunk`` to the sample buffer.

        When the buffer is full, the oldest samples are dropped (bounded
        deque behaviour).
        """
        self.audio_buffer.extend(audio_chunk)

    def get_transcript_chunk(self):
        """Transcribe the buffered audio and empty the buffer.

        Returns:
            The recognized text, or ``None`` when there is not yet enough
            buffered audio, when the recognizer raises
            ``sr.UnknownValueError``, or when it raises ``sr.RequestError``
            (the error is printed in that case).
        """
        # Only process if we have enough audio
        min_samples = config.SAMPLE_RATE * config.MIN_PROCESSING_DURATION
        if len(self.audio_buffer) < min_samples:
            return None

        # Snapshot and clear before the blocking recognition call, so that
        # samples appended while the request is in flight are kept for the
        # next call instead of being wiped afterwards.
        samples = list(self.audio_buffer)
        self.audio_buffer.clear()

        # Convert to AudioData format
        audio_data = sr.AudioData(
            self._to_pcm16(samples),
            config.SAMPLE_RATE,
            self.SAMPLE_WIDTH,
        )

        try:
            # Use Google Web Speech API for best accuracy
            return self.recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return None
        except sr.RequestError as e:
            print(f"Speech recognition error: {str(e)}")
            return None