# vidverse / app / captioning / caption_helper.py
# badal
# feat: initial commit
# 2f2406a
#
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
#
from datetime import date, datetime, time, timedelta
from typing import List, Optional, Tuple
import azure.cognitiveservices.speech as speechsdk # type: ignore
from . import helper
class Caption:
    """A single caption entry: its language tag, sequence number, time span and text."""

    def __init__(
        self,
        language: Optional[str],
        sequence: int,
        begin: time,
        end: time,
        text: str,
    ) -> None:
        """Store the caption fields unchanged on the instance.

        Args:
            language: Language tag associated with the caption, or None.
            sequence: Sequence number assigned by the caller.
            begin: Time at which the caption starts.
            end: Time at which the caption ends.
            text: Caption text; multiple lines are separated by newlines
                by the code that builds captions in this module.
        """
        # Plain value holder: no validation or conversion is performed.
        self.language = language
        self.sequence = sequence
        self.begin = begin
        self.end = end
        self.text = text
def get_captions(language: Optional[str], max_width: int, max_height: int, results: List[dict]) -> List[Caption]:
    """Build captions for a list of recognition results.

    Convenience wrapper that constructs a CaptionHelper with the given
    arguments and returns the result of its get_captions() method.

    Args:
        language: Language tag forwarded to CaptionHelper, or None.
        max_width: Maximum number of characters per caption line.
        max_height: Maximum number of lines per caption.
        results: Recognition results to convert into captions.
            NOTE(review): annotated as List[dict] here, but CaptionHelper
            annotates the same argument as speechsdk.RecognitionResult
            objects and reads attributes such as .offset and .reason from
            each item - confirm which annotation is intended.

    Returns:
        The list of Caption objects produced by CaptionHelper.
    """
    return CaptionHelper(language, max_width, max_height, results).get_captions()
class CaptionHelper:
    """Split final speech recognition results into line-wrapped, timed captions.

    Each result's text is broken into lines of at most ``max_width``
    characters, preferring to break after punctuation (first pass), then
    after a space or full stop (second pass), and finally at a hard cut.
    Up to ``max_height`` lines are grouped into one Caption. When a result
    is split over several captions, its time span is divided among them in
    proportion to character position.
    """

    def __init__(self, language: Optional[str], max_width: int, max_height: int, results: "List[speechsdk.RecognitionResult]"):
        """Store the configuration and pick terminators for the language.

        Args:
            language: Language tag (e.g. "en-US"), or None. Only the part
                before the first '-' is inspected, case-insensitively.
            max_width: Maximum characters per caption line.
            max_height: Maximum lines per caption.
            results: Recognition results to convert into captions.
        """
        self._language = language
        self._max_width = max_width
        self._max_height = max_height
        self._results = results
        self._first_pass_terminators = ["?", "!", ",", ";"]
        self._second_pass_terminators = [" ", "."]
        self._captions: List["Caption"] = []
        # consider adapting to use http://unicode.org/reports/tr29/#Sentence_Boundaries
        if self._language is not None:
            iso639 = self._language.split('-')[0]
            if "zh" == iso639.lower():
                # Chinese text uses fullwidth punctuation. The code points are
                # written as escapes so they survive Unicode normalisation,
                # which would otherwise fold them into the ASCII forms
                # listed afterwards.
                self._first_pass_terminators = [
                    "\uff0c",  # fullwidth comma
                    "\u3001",  # ideographic comma
                    "\uff1b",  # fullwidth semicolon
                    "\uff1f",  # fullwidth question mark
                    "\uff01",  # fullwidth exclamation mark
                    "?",
                    "!",
                    ",",
                    ";",
                ]
                self._second_pass_terminators = [
                    "\u3002",  # ideographic full stop
                    " ",
                ]
                # Only swap in the multi-byte default when the caller left the
                # width at the single-byte default.
                if helper.DEFAULT_MAX_LINE_LENGTH_SBCS == self._max_width:
                    self._max_width = helper.DEFAULT_MAX_LINE_LENGTH_MBCS

    def get_captions(self) -> List["Caption"]:
        """Return the captions, building them on first use."""
        self.ensure_captions()
        return self._captions

    def ensure_captions(self) -> None:
        """Build the captions if the caption list is currently empty."""
        if not self._captions:
            self.add_captions_for_all_results()

    def add_captions_for_all_results(self) -> None:
        """Append captions for every usable result.

        Results with a non-positive offset, results that are not final, and
        results without text are skipped.
        """
        for result in self._results:
            if result.offset <= 0 or not self.is_final_result(result):
                continue
            text = self.get_text_or_translation(result)
            if not text:
                continue
            self.add_captions_for_final_result(result, text)

    def get_text_or_translation(self, result: "speechsdk.RecognitionResult") -> Optional[str]:
        """Return the text to caption for ``result`` (currently ``result.text``)."""
        return result.text
        # 20220921 We do not use this for now because this sample
        # does not handle TranslationRecognitionResults.
        #if not self._language :
        #    return result.text
        #if type(result) is speechsdk.TranslationRecognitionResult and self._language in result.Translations :
        #    return result.Translations[self._language]
        #else :
        #    return None

    def add_captions_for_final_result(self, result: "speechsdk.RecognitionResult", text: str) -> None:
        """Wrap ``text`` into lines and append the resulting captions.

        Args:
            result: The recognition result the text came from; supplies the
                timing for the captions.
            text: The text to split into caption lines.
        """
        # Trailing spaces would otherwise be consumed by skip_skippable on a
        # final iteration and produce an empty last line (or an empty caption).
        text = text.rstrip(' ')

        caption_starts_at = 0
        caption_lines: List[str] = []
        index = 0
        while index < len(text):
            index = self.skip_skippable(text, index)
            line_length = self.get_best_width(text, index)
            caption_lines.append(text[index:index + line_length].strip())
            index += line_length

            is_last_caption = index >= len(text)
            max_caption_lines = len(caption_lines) >= self._max_height
            if not (is_last_caption or max_caption_lines):
                continue

            caption_text = '\n'.join(caption_lines)
            caption_lines.clear()
            caption_sequence = len(self._captions) + 1
            is_first_caption = 0 == caption_starts_at

            caption_begin_and_end: Tuple[time, time]
            if is_first_caption and is_last_caption:
                # The whole result fits in one caption: use its full time span.
                caption_begin_and_end = self.get_full_caption_result_timing(result)
            else:
                caption_begin_and_end = self.get_partial_result_caption_timing(
                    result, text, caption_text, caption_starts_at, index - caption_starts_at)

            self._captions.append(Caption(
                self._language, caption_sequence,
                caption_begin_and_end[0], caption_begin_and_end[1], caption_text))
            caption_starts_at = index

    def get_best_width(self, text: str, start_index: int) -> int:
        """Return how many characters of ``text`` the next line should take.

        If the remainder is shorter than the maximum width it is taken
        whole. Otherwise the line ends after the last first-pass terminator
        in the window, else after the last second-pass terminator, else at a
        hard cut of the maximum width.
        """
        remaining = len(text) - start_index
        if remaining < self._max_width:
            best_width = remaining
        else:
            best_width = self.find_best_width(self._first_pass_terminators, text, start_index)
        if best_width < 0:
            best_width = self.find_best_width(self._second_pass_terminators, text, start_index)
        if best_width < 0:
            # Hard cut. Always advance by at least one character so that a
            # non-positive max width cannot stall the callers' loops.
            best_width = max(1, self._max_width)
        return best_width

    def find_best_width(self, terminators: List[str], text: str, start_at: int) -> int:
        """Return the widest line that ends just after one of ``terminators``.

        Only the window of at most the maximum width starting at
        ``start_at`` is searched.

        Returns:
            The line width including the terminator, or -1 if no terminator
            occurs in the window.
        """
        remaining = len(text) - start_at
        window_end = start_at + min(remaining, self._max_width)
        best_width = -1
        for terminator in terminators:
            index = text.rfind(terminator, start_at, window_end)
            if index < 0:
                continue
            # Compare widths that both include the terminator; comparing the
            # bare offset against best_width would ignore a terminator that
            # directly follows the current best one (e.g. "?!").
            best_width = max(best_width, index - start_at + len(terminator))
        return best_width

    def skip_skippable(self, text: str, start_index: int) -> int:
        """Return the index of the first non-space character at or after ``start_index``."""
        index = start_index
        while index < len(text) and text[index] == ' ':
            index += 1
        return index

    def get_full_caption_result_timing(self, result: "speechsdk.RecognitionResult") -> Tuple[time, time]:
        """Return (begin, end) for the whole result.

        Both values are derived from ``result.offset`` and
        ``result.duration`` through ``helper.time_from_ticks``.
        """
        begin = helper.time_from_ticks(result.offset)
        end = helper.time_from_ticks(result.offset + result.duration)
        return (begin, end)

    def get_partial_result_caption_timing(self, result: "speechsdk.RecognitionResult", text: str, caption_text: str, caption_starts_at: int, caption_length: int) -> Tuple[time, time]:
        """Return (begin, end) for a caption covering part of a result.

        The result's duration is divided in proportion to character
        position within ``text``.

        Args:
            result: The recognition result being captioned.
            text: The full text of the result.
            caption_text: The caption's text (not used in the calculation).
            caption_starts_at: Index in ``text`` where the caption starts.
            caption_length: Number of characters of ``text`` it covers.
        """
        (result_begin, result_end) = self.get_full_caption_result_timing(result)
        text_length = len(text)
        if text_length == 0:
            # Nothing to apportion; avoid dividing by zero.
            return (result_begin, result_end)
        result_duration = helper.subtract_times(result_end, result_begin)
        partial_begin = helper.add_time_and_timedelta(
            result_begin, result_duration * caption_starts_at / text_length)
        partial_end = helper.add_time_and_timedelta(
            result_begin, result_duration * (caption_starts_at + caption_length) / text_length)
        return (partial_begin, partial_end)

    def is_final_result(self, result: "speechsdk.RecognitionResult") -> bool:
        """Return True if the result's reason is recognized speech, recognized intent or translated speech."""
        return result.reason in (
            speechsdk.ResultReason.RecognizedSpeech,
            speechsdk.ResultReason.RecognizedIntent,
            speechsdk.ResultReason.TranslatedSpeech,
        )

    def lines_from_text(self, text: str) -> List[str]:
        """Split ``text`` into caption lines using the same rules as the caption builder."""
        # Drop trailing spaces so they do not produce an empty last line.
        text = text.rstrip(' ')
        retval: List[str] = []
        index = 0
        while index < len(text):
            index = self.skip_skippable(text, index)
            line_length = self.get_best_width(text, index)
            retval.append(text[index:index + line_length].strip())
            index += line_length
        return retval