File size: 2,904 Bytes
9556d07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8c786a
9556d07
 
 
 
c8c786a
9556d07
c8c786a
 
9556d07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324dee0
9556d07
 
 
 
324dee0
9556d07
 
 
 
 
 
 
324dee0
9556d07
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa  
import io
import soundfile
import sys




class SimpleASRProcessor:

    def __init__(self, asr, sampling_rate = 16000):
        """run this when starting or restarting processing"""
        self.audio_buffer = np.array([],dtype=np.float32)
        self.prompt_buffer = ""
        self.asr = asr
        self.sampling_rate = sampling_rate
        self.init_prompt = ''

    def ts_words(self, segments):
        result = ""
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue
            for word in segment.words:
                w = word.word
                t = (word.start, word.end, w)
                result +=w
        return result 

    def stream_process(self, vad_result):
        iter_in_phrase = 0
        for chunk, is_final in vad_result:
            iter_in_phrase += 1

            if chunk is not None:
                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
                out = []
                out.append(audio)
                a = np.concatenate(out)
                self.audio_buffer = np.append(self.audio_buffer, a)

            if is_final and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)
                
                self.init_prompt = self.init_prompt + tsw
                self.init_prompt  = self.init_prompt [-100:]
                self.audio_buffer.resize(0)
                iter_in_phrase =0
                
                yield True, tsw
            # show progress evry 50 chunks
            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                # use custom ts_words
                tsw = self.ts_words(res)
                yield False, tsw
            
        





SAMPLING_RATE = 16000

model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
use_vad = False
min_sample_length = 1 * SAMPLING_RATE



vac = VoiceActivityController(use_vad_result = use_vad)
asr = FasterWhisperASR(src_lan, "large-v2")  # loads and wraps Whisper model

tokenizer = create_tokenizer(tgt_lan)
online = SimpleASRProcessor(asr)


stream = MicrophoneStream()
stream = vac.detect_user_speech(stream, audio_in_int16 = False) 
stream = online.stream_process(stream)

for isFinal, text in stream:
    if isFinal:
        print( text,  end="\r\n")
    else:
        print( text,  end="\r")