File size: 7,149 Bytes
2f2406a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
#

from datetime import date, datetime, time, timedelta
from typing import List, Optional, Tuple
import azure.cognitiveservices.speech as speechsdk # type: ignore
from . import helper

class Caption(object) :
    """A single caption entry: its language, sequence number, time span and text.

    The constructor stores its arguments unchanged as same-named attributes;
    no validation or conversion is performed.
    """

    def __init__(self, language : Optional[str], sequence : int, begin : time, end : time, text : str) :
        # Identification first, then the time span, then the payload.
        self.language, self.sequence = language, sequence
        self.begin, self.end = begin, end
        self.text = text

def get_captions(language : Optional[str], max_width : int, max_height : int, results : List[dict]) -> List[Caption] :
    """Return the captions generated from *results*.

    Convenience wrapper: builds a CaptionHelper from the given arguments and
    returns the list produced by its get_captions() method.
    """
    # NOTE(review): CaptionHelper annotates `results` as a list of
    # speechsdk.RecognitionResult rather than dict - confirm which one callers pass.
    return CaptionHelper(language, max_width, max_height, results).get_captions()

class CaptionHelper(object) :
    """Turn speech recognition results into line-wrapped, timed Caption objects.

    The text of each final result is split into lines of at most max_width
    characters. A line preferably ends right after a first-pass terminator,
    otherwise right after a second-pass terminator, and only as a last resort
    at a hard max_width cut. Up to max_height consecutive lines form one
    caption. When a result yields several captions, the result's time span is
    divided in proportion to the character range each caption covers.
    """

    # Break characters tried first (clause/sentence punctuation) and second
    # (word boundary / full stop) when wrapping a line.
    _DEFAULT_FIRST_PASS_TERMINATORS = ["?", "!", ",", ";"]
    _DEFAULT_SECOND_PASS_TERMINATORS = [" ", "."]

    # Chinese text uses full-width punctuation. The characters are written as
    # escapes so they survive tools that normalise them to their ASCII look-alikes.
    _ZH_FIRST_PASS_TERMINATORS = [
        "\uFF0C",  # fullwidth comma
        "\u3001",  # ideographic comma
        "\uFF1B",  # fullwidth semicolon
        "\uFF1F",  # fullwidth question mark
        "\uFF01",  # fullwidth exclamation mark
        "?", "!", ",", ";",
    ]
    _ZH_SECOND_PASS_TERMINATORS = [
        "\u3002",  # ideographic full stop
        " ",
    ]

    def __init__(self, language : Optional[str], max_width : int, max_height : int, results : List["speechsdk.RecognitionResult"]) :
        """Store the wrapping limits and the results to caption.

        language   -- locale such as "en-US", or None; copied onto every Caption
                      and used to select the terminator set.
        max_width  -- maximum number of characters per caption line.
        max_height -- maximum number of lines per caption.
        results    -- recognition results to convert.
        """
        self._language = language
        self._max_width = max_width
        self._max_height = max_height
        self._results = results

        # Per-instance copies so one helper cannot alter another's terminators.
        self._first_pass_terminators = list(self._DEFAULT_FIRST_PASS_TERMINATORS)
        self._second_pass_terminators = list(self._DEFAULT_SECOND_PASS_TERMINATORS)

        self._captions : List["Caption"] = []

        # consider adapting to use http://unicode.org/reports/tr29/#Sentence_Boundaries
        if self._language is not None :
            iso639 = self._language.split('-')[0]
            if "zh" == iso639.lower() :
                self._first_pass_terminators = list(self._ZH_FIRST_PASS_TERMINATORS)
                self._second_pass_terminators = list(self._ZH_SECOND_PASS_TERMINATORS)
                # A width still equal to the SBCS default is swapped for the
                # MBCS default; any other width is kept as given.
                if helper.DEFAULT_MAX_LINE_LENGTH_SBCS == self._max_width :
                    self._max_width = helper.DEFAULT_MAX_LINE_LENGTH_MBCS

    def get_captions(self) -> List["Caption"] :
        """Return the captions for all results, generating them on first use."""
        self.ensure_captions()
        return self._captions

    def ensure_captions(self) -> None :
        """Generate the captions if none have been produced yet."""
        # An empty list doubles as the "not generated yet" marker, so results
        # that yield no captions are simply scanned again on the next call.
        if not self._captions :
            self.add_captions_for_all_results()

    def add_captions_for_all_results(self) -> None :
        """Append captions for every final result that has a positive offset and text."""
        for result in self._results :
            if result.offset <= 0 or not self.is_final_result(result) :
                continue
            text = self.get_text_or_translation(result)
            if not text :
                continue
            self.add_captions_for_final_result(result, text)

    def get_text_or_translation(self, result : "speechsdk.RecognitionResult") -> Optional[str] :
        """Return the text to caption for *result* (currently always result.text)."""
        # NOTE (2022-09-21): translations are not used for now because this sample
        # does not handle TranslationRecognitionResults. The intended logic was:
        # with no language set return result.text; for a translation result that
        # contains self._language return that translation; otherwise return None.
        return result.text

    def add_captions_for_final_result(self, result : "speechsdk.RecognitionResult", text : str) -> None :
        """Wrap *text* into lines, group them into captions and append those to the list.

        Text that contains nothing but whitespace produces no caption.
        """
        segments = self._split_lines(text)
        # A non-positive max_height behaves like 1: every line is its own caption.
        lines_per_caption = max(1, self._max_height)

        caption_starts_at = 0
        for first in range(0, len(segments), lines_per_caption) :
            group = segments[first:first + lines_per_caption]
            is_first_caption = 0 == first
            is_last_caption = first + lines_per_caption >= len(segments)

            # The last caption runs to the end of the text (including trailing
            # whitespace) so that it ends exactly when the result ends.
            caption_ends_at = len(text) if is_last_caption else group[-1][1]
            caption_text = '\n'.join(line for line, _ in group)

            caption_begin_and_end : Tuple[time, time]
            if is_first_caption and is_last_caption :
                caption_begin_and_end = self.get_full_caption_result_timing(result)
            else :
                caption_begin_and_end = self.get_partial_result_caption_timing(result, text, caption_text, caption_starts_at, caption_ends_at - caption_starts_at)

            caption_sequence = len(self._captions) + 1
            self._captions.append(Caption(self._language, caption_sequence, caption_begin_and_end[0], caption_begin_and_end[1], caption_text))

            caption_starts_at = caption_ends_at

    def get_best_width(self, text : str, start_index : int) -> int :
        """Return how many characters of *text*, from *start_index*, go on the next line.

        The whole remainder is returned when it fits in max_width. Otherwise the
        line ends after the last first-pass terminator inside the window, else
        after the last second-pass terminator, else at max_width. Whenever the
        remainder does not fit, the result is at least 1 so callers always advance.
        """
        remaining = len(text) - start_index
        # "<=": a remainder of exactly max_width characters still fits on one line.
        if remaining <= self._max_width :
            return remaining

        best_width = self.find_best_width(self._first_pass_terminators, text, start_index)
        if best_width < 0 :
            best_width = self.find_best_width(self._second_pass_terminators, text, start_index)
        if best_width < 0 :
            best_width = self._max_width
        # Guards against a non-positive max_width, which would otherwise stall the
        # splitting loops.
        return max(best_width, 1)

    def find_best_width(self, terminators : List[str], text : str, start_at : int) -> int :
        """Return the widest line that ends right after one of *terminators*.

        Only the window of at most max_width characters starting at *start_at*
        is searched. The returned width includes the terminator itself.
        Returns -1 when no terminator occurs in the window.
        """
        remaining = len(text) - start_at
        check_chars = min(remaining, self._max_width)
        best_width = -1
        for terminator in terminators :
            index = text.rfind(terminator, start_at, start_at + check_chars)
            if index < 0 :
                continue
            # Compare widths that both include the terminator; otherwise adjacent
            # terminators such as "?!" would be split between the two characters.
            width = index - start_at + len(terminator)
            if width > best_width :
                best_width = width
        return best_width

    def skip_skippable(self, text : str, start_index : int) -> int :
        """Return the index of the first non-space character at or after *start_index*."""
        index = start_index
        while len(text) > index and ' ' == text[index] :
            index += 1
        return index

    def get_full_caption_result_timing(self, result : "speechsdk.RecognitionResult") -> Tuple[time, time] :
        """Return (begin, end) for the whole result.

        Both values come from helper.time_from_ticks, applied to result.offset
        and to result.offset + result.duration.
        """
        begin = helper.time_from_ticks(result.offset)
        end = helper.time_from_ticks(result.offset + result.duration)
        return (begin, end)

    def get_partial_result_caption_timing(self, result : "speechsdk.RecognitionResult", text : str, caption_text : str, caption_starts_at : int, caption_length : int) -> Tuple[time, time] :
        """Return (begin, end) for a caption covering only part of the result's text.

        The result's duration is shared out by character position: the caption
        begins at caption_starts_at / len(text) of the duration and ends at
        (caption_starts_at + caption_length) / len(text) of it.
        *caption_text* is accepted for interface compatibility and is not used.
        """
        (result_begin, result_end) = self.get_full_caption_result_timing(result)
        text_length = len(text)
        if 0 == text_length :
            # Nothing to apportion; avoid dividing by zero.
            return (result_begin, result_end)
        result_duration = helper.subtract_times(result_end, result_begin)
        partial_begin = helper.add_time_and_timedelta(result_begin, result_duration * caption_starts_at / text_length)
        partial_end = helper.add_time_and_timedelta(result_begin, result_duration * (caption_starts_at + caption_length) / text_length)
        return (partial_begin, partial_end)

    def is_final_result(self, result : "speechsdk.RecognitionResult") -> bool :
        """Return True when result.reason is RecognizedSpeech, RecognizedIntent or TranslatedSpeech."""
        return result.reason in (
            speechsdk.ResultReason.RecognizedSpeech,
            speechsdk.ResultReason.RecognizedIntent,
            speechsdk.ResultReason.TranslatedSpeech,
        )

    def lines_from_text(self, text : str) -> List[str] :
        """Return *text* wrapped into stripped, non-empty lines of at most max_width characters."""
        return [line for line, _ in self._split_lines(text)]

    def _split_lines(self, text : str) -> List[Tuple[str, int]] :
        """Wrap *text* and return (line, end_index) pairs.

        end_index is the position in *text* just past the characters consumed
        for that line. Leading spaces before a line are skipped, and lines that
        are empty after stripping are dropped.
        """
        segments : List[Tuple[str, int]] = []
        index = 0
        while index < len(text) :
            index = self.skip_skippable(text, index)
            if index >= len(text) :
                # Only trailing spaces were left; do not emit an empty line.
                break
            line_length = self.get_best_width(text, index)
            line = text[index:index + line_length].strip()
            index += line_length
            if line :
                segments.append((line, index))
        return segments