Commit a4ee6eb
Parent(s): 02f90cf

sentence work again!

src/whisper_streaming/online_asr.py  (+125 -77)  CHANGED
@@ -17,8 +17,10 @@ class HypothesisBuffer:
         self.logfile = logfile
 
     def insert(self, new, offset):
-
-
+        """
+        compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content
+        The new tail is added to self.new
+        """
 
         new = [(a + offset, b + offset, t) for a, b, t in new]
         self.new = [(a, b, t) for a, b, t in new if a > self.last_commited_time - 0.1]
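The docstring added here spells out the contract of insert(): hypotheses are shifted by offset into absolute time, and only the tail that starts after (roughly) last_commited_time is kept in self.new. A minimal standalone sketch of that filtering step, using the same (beg, end, "word") tuples and 0.1 s tolerance as the lines above (the function name is illustrative, not part of the module):

# Standalone sketch of the filtering described in the new docstring.
# Words are (beg, end, "word") tuples, as in the diff above.
def filter_new_hypothesis(new, offset, last_commited_time, tolerance=0.1):
    # shift the hypothesis into absolute time, then keep only the tail
    # that starts after the last committed word (minus a small tolerance)
    shifted = [(a + offset, b + offset, t) for a, b, t in new]
    return [(a, b, t) for a, b, t in shifted if a > last_commited_time - tolerance]

# e.g. with last_commited_time = 2.0, words ending before ~1.9 s are dropped:
# filter_new_hypothesis([(0.5, 1.0, "hello"), (2.0, 2.4, "world")], 0.0, 2.0)
# -> [(2.0, 2.4, "world")]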
@@ -77,6 +79,9 @@ class HypothesisBuffer:
         return self.buffer
 
 
+
+
+
 class OnlineASRProcessor:
 
     SAMPLING_RATE = 16000
@@ -128,7 +133,9 @@ class OnlineASRProcessor:
         if offset is not None:
             self.buffer_time_offset = offset
         self.transcript_buffer.last_commited_time = self.buffer_time_offset
-        self.commited = []
+        self.final_transcript = []
+        self.commited_not_final = []
+
 
     def insert_audio_chunk(self, audio):
         self.audio_buffer = np.append(self.audio_buffer, audio)
@@ -136,23 +143,42 @@ class OnlineASRProcessor:
     def prompt(self):
         """Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
         "context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+        """
+
+        if len(self.final_transcript) == 0:
+            prompt=""
+
+        if len(self.final_transcript) == 1:
+            prompt = self.final_transcript[0][2][-200:]
+
+        else:
+            prompt = self.concatenate_tsw(self.final_transcript)[2][-200:]
+            # TODO: this is not ideal as we concatenate each time the whole transcript
+
+        # k = max(0, len(self.final_transcript) - 1)
+        # while k > 1 and self.final_transcript[k - 1][1] > self.buffer_time_offset:
+        #     k -= 1
+
+        # p = self.final_transcript[:k]
+
+
+        # p = [t for _, _, t in p]
+        # prompt = []
+        # l = 0
+        # while p and l < 200:  # 200 characters prompt size
+        #     x = p.pop(-1)
+        #     l += len(x) + 1
+        #     prompt.append(x)
+
+        non_prompt = self.concatenate_tsw(self.commited_not_final)[2]
+
+        logger.debug(f"PROMPT(previous): {prompt[:20]}...{prompt[-20:]} (length={len(prompt)}chars)")
+        logger.debug(f"CONTEXT: {non_prompt}")
+
+        return prompt, non_prompt
+
 
     def process_iter(self):
         """Runs on the current audio buffer.
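Per the docstring, the prompt is a 200-character suffix of text that is already final (scrolled out of the audio buffer), while the context is the committed-but-not-final text that will be transcribed again. A rough sketch of the same split on plain (beg, end, text) tuples; the helper name is illustrative, and a plain join stands in for the module's concatenate_tsw():

# Illustrative only: mimics the prompt/context split described above.
def split_prompt_and_context(final_transcript, commited_not_final, sep=" "):
    full_final = sep.join(t for _, _, t in final_transcript)
    prompt = full_final[-200:]                                 # 200-character suffix of final text
    context = sep.join(t for _, _, t in commited_not_final)    # part that is re-transcribed
    return prompt, context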
@@ -161,93 +187,111 @@ class OnlineASRProcessor:
         """
 
         prompt, non_prompt = self.prompt()
-
-        logger.debug(f"CONTEXT: {non_prompt}")
+
         logger.debug(
             f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
         )
-        res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
-
+        ## Transcribe and format the result to [(beg,end,"word1"), ...]
+        res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
         tsw = self.asr.ts_words(res)
-
+
+
+        # insert into HypothesisBuffer, and get back the commited words
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
-
-
-
-
-        logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
-        ## The rest is incomplete
-        the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
-        logger.debug(f"INCOMPLETE: {the_rest[2]}")
 
-        # there is a newly confirmed text
-
-        self.chunk_completed_sentence(self.commited)
-
-
-        # TODO: only complete sentences should go to completed
-
-
-
-                " Even if no sentence was found!"
-            )
-
-
-
-
-        # alternative: on any word
-        # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
-        # let's find commited word that is less
-        # k = len(self.commited)-1
-        # while k>0 and self.commited[k][1] > l:
-        #     k -= 1
-        # t = self.commited[k][1]
-        # self.chunk_at(t)
-
-        return completed
-
-        if self.commited == []:
-            return
-        raw_text = self.asr.sep.join([s[2] for s in self.commited])
-        logger.debug(f"COMPLETED SENTENCE: {raw_text}")
-        sents = self.words_to_sentences(self.commited)
-
-        if len(sents) < 2:
-            logger.debug(f"[Sentence-segmentation] no sentence segmented.")
-            return
-
-
-
-        # we will continue with audio processing at this timestamp
-        chunk_at = sents[-2][1]
-
-        self.chunk_at(chunk_at)
+        commited_tsw = self.transcript_buffer.flush()
+
+        if len(commited_tsw) == 0:
+            return (None, None, "")
+
+        self.commited_not_final.extend(commited_tsw)
+
+        # Define `completed` and `the_rest` based on the buffer_trimming_way
+        # completed will be returned at the end of the function.
+        # completed is a transcribed text with (beg,end,"sentence ...") format.
+
+        if self.buffer_trimming_way == "sentence":
+
+            sentences = self.words_to_sentences(self.commited_not_final)
+
+            if len(sentences) < 2:
+                logger.debug(f"[Sentence-segmentation] no full sentence segmented, do not commit anything.")
+                completed = []
+
+            else:
+                identified_sentence= "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sentences])
+                logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}")
+
+                # assume last sentence is incomplete, which is not always true
+
+                # we will continue with audio processing at this timestamp
+                chunk_at = sentences[-2][1]
+
+                self.chunk_at(chunk_at)
+                # TODO: here paragraph breaks can be added
+                self.commited_not_final = sentences[-1:]
+
+                completed= sentences[:-1]
+
+        else:
+
+            # break audio buffer anyway if it is too long
+
+            if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec :
+
+                if self.buffer_trimming_way == "sentence":
+                    logger.warning(f"Chunck segment after {self.buffer_trimming_sec} seconds!"
+                                   " Even if no sentence was found!"
+                                   )
+
+                completed = self.concatenate_tsw(self.commited_not_final)
+                self.commited_not_final = []
+                self.chunk_completed_segment(res)
+                # TODO: I don't know if res is the correct variable to pass here
+            else:
+                completed = []
+
+        if len(completed) == 0:
+            return (None, None, "")
+        else:
+            self.final_transcript.extend(completed)  # add whole time stamped sentences / or words to commited list
+
+        completed_text_segment= self.concatenate_tsw(completed)
+
+        the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
+        commited_but_not_final = self.concatenate_tsw(self.commited_not_final)
+        logger.debug(f"\n COMPLETE NOW: {completed_text_segment[2]}\n"
+                     f" COMMITTED (but not Final): {commited_but_not_final[2]}\n"
+                     f" INCOMPLETE: {the_rest[2]}"
+                     )
+
+        return completed_text_segment
 
     def chunk_completed_segment(self, res):
-        if self.commited == []:
+        if self.final_transcript == []:
             return
 
         ends = self.asr.segments_end_ts(res)
 
-        t = self.commited[-1][1]
+        t = self.final_transcript[-1][1]
 
         if len(ends) <= 1:
             logger.debug(f"--- not enough segments to chunk (<=1 words)")
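The rewritten process_iter() flushes the hypothesis buffer into commited_not_final and, in "sentence" trimming mode, finalises every fully segmented sentence while keeping the (possibly incomplete) last sentence pending, trimming the audio buffer at the end of the second-to-last sentence. A sketch of just that decision, assuming a words_to_sentences-style splitter that returns (beg, end, "sentence") tuples (the function and its return shape are illustrative):

# Sketch of the "sentence" trimming decision added to process_iter().
# words_to_sentences stands for any splitter returning (beg, end, "sentence") tuples.
def commit_full_sentences(commited_not_final, words_to_sentences):
    sentences = words_to_sentences(commited_not_final)
    if len(sentences) < 2:
        # nothing can be finalised yet; keep waiting for a complete sentence
        return [], commited_not_final, None
    # assume the last sentence may still be incomplete: keep it pending,
    # finalise the rest, and continue audio processing after the end
    # timestamp of the second-to-last sentence
    completed = sentences[:-1]
    pending = sentences[-1:]
    chunk_at = sentences[-2][1]
    return completed, pending, chunk_at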
@@ -287,9 +331,11 @@ class OnlineASRProcessor:
         Returns: [(beg,end,"sentence 1"),...]
         """
 
+
         cwords = [w for w in words]
         t = self.asr.sep.join(o[2] for o in cwords)
         logger.debug(f"[Sentence-segmentation] Raw Text: {t}")
+
         s = self.tokenize([t])
         out = []
         while s:
@@ -302,11 +348,13 @@ class OnlineASRProcessor:
                 w = w.strip()
                 if beg is None and sent.startswith(w):
                     beg = b
-
+                if end is None and sent == w:
                     end = e
+                if beg is not None and end is not None:
                     out.append((beg, end, fsent))
                     break
                 sent = sent[len(w) :].strip()
+
         return out
 
     def finish(self):
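words_to_sentences() walks the tokenized sentences and the timestamped words in parallel: beg comes from the first word of a sentence, end from the word that exhausts it, and the new guard only emits a sentence once both have been found. A self-contained sketch of that alignment idea (tokenize is any callable that maps [text] to a list of sentence strings; the names are illustrative):

# Standalone sketch of aligning sentence text back to word timestamps,
# following the beg/end logic in words_to_sentences above.
def align_sentences(words, tokenize, sep=" "):
    text = sep.join(w for _, _, w in words)
    out = []
    remaining = list(words)
    for sent in tokenize([text]):
        target, beg, end = sent.strip(), None, None
        while remaining:
            b, e, w = remaining.pop(0)
            w = w.strip()
            if beg is None and target.startswith(w):
                beg = b                      # first word of the sentence
            if end is None and target == w:
                end = e                      # word that exhausts the sentence
            if beg is not None and end is not None:
                out.append((beg, end, sent.strip()))
                break
            target = target[len(w):].strip()
    return out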
@@ -330,7 +378,7 @@ class OnlineASRProcessor:
         # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
         if sep is None:
             sep = self.asr.sep
-
+
         t = sep.join(s[2] for s in tsw)
         if len(tsw) == 0:
             b = None
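Taken together, the commit makes process_iter() return whole committed sentences (or (None, None, "")) instead of raw committed words in "sentence" trimming mode. A hedged usage sketch of how the processor is typically driven; the import path follows the file location, while the constructor arguments (asr backend, tokenizer, buffer_trimming) are assumptions not shown in this diff:

# Hedged usage sketch: constructor arguments are assumed, method names come
# from the file changed in this commit.
from whisper_streaming.online_asr import OnlineASRProcessor

def run_stream(asr, tokenize, chunks):
    online = OnlineASRProcessor(asr, tokenize, buffer_trimming=("sentence", 15))
    for chunk in chunks:                  # chunk: 16 kHz float32 audio as a numpy array
        online.insert_audio_chunk(chunk)
        beg, end, text = online.process_iter()
        if text:                          # only whole sentences are committed in "sentence" mode
            print(f"{beg:.2f}-{end:.2f} {text}")
    print(online.finish())                # flush whatever is still in the buffer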