File size: 13,240 Bytes
5806e12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
import json
import os
import re

# Read transcription_cunit.json from session_data/<session_id>.
# In the text_token item, a repetition starts at <REPSTART> and ends at <REPEND>.
# Extract the repetition information and add it to the current JSON file; it includes:
# content: the content between <REPSTART> and <REPEND>
# words: word index of each word in content; the first word in the segment has index 0, and index calculation ignores special tokens
# mark_location: the index of the last word of content
# format example:
"""
"repetitions": [
        {
          "content": "now the now the",
          "words": [
            1,
            2,
            3,
            4
          ],
          "mark_location": 4
        }
      ],
"""
def annotate_repetition_for_mazewhisper(session_id):
    """Annotate <REPSTART>/<REPEND> repetition spans in a session transcript.

    Reads ``session_data/<session_id>/transcription_cunit.json``, scans each
    segment's ``text_token`` for spans delimited by ``<REPSTART>``/``<REPEND>``
    and records, per span:
      * content: the plain words inside the span, space-joined;
      * words: segment-level word indices of those words (0-based, special
        tokens excluded from the count);
      * mark_location: the index of the last word of the span.
    Segments with at least one span get a "repetitions" key; the file is
    rewritten in place.

    Args:
        session_id: directory name under ``session_data/``.
    """
    file_path = f"session_data/{session_id}/transcription_cunit.json"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for segment in data["segments"]:
        tokens = segment.get("text_token", "").split()

        repetitions_list = []
        inside = False
        rep_words = []
        rep_word_indices = []
        word_count = 0  # segment-level index of the next plain word

        for tok in tokens:
            if tok == "<REPSTART>":
                inside = True
                rep_words, rep_word_indices = [], []
            elif tok == "<REPEND>":
                inside = False
                if rep_words:
                    repetitions_list.append(
                        {
                            "content": " ".join(rep_words),
                            "words": rep_word_indices.copy(),
                            "mark_location": rep_word_indices[-1],
                        }
                    )
                # reset so a stray extra <REPEND> can't duplicate the entry
                rep_words, rep_word_indices = [], []
            elif tok.startswith("<") and tok.endswith(">"):
                # other special tokens never advance the word index
                pass
            else:
                if inside:
                    rep_words.append(tok)
                    rep_word_indices.append(word_count)
                word_count += 1

        if repetitions_list:
            segment["repetitions"] = repetitions_list

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

# Read transcription_cunit.json from session_data/<session_id>.
# In the text_token item, a revision starts at <REVSTART> and ends at <REVEND>.
# Extract the revision information and add it to the current JSON file; it includes:
# content: the content between <REVSTART> and <REVEND>
# words: word index of each word in content; the first word in the segment has index 0, and index calculation ignores special tokens
# mark_location: the index of the last word of content
# format example:
"""
"repetitions": [
        {
          "content": "now the now the",
          "words": [
            1,
            2,
            3,
            4
          ],
          "mark_location": 4
        }
      ],
"""
def annotate_revision_for_mazewhisper(session_id):
    """Annotate <REVSTART>/<REVEND> revision spans in a session transcript.

    Reads ``session_data/<session_id>/transcription_cunit.json``, scans each
    segment's ``text_token`` for spans delimited by ``<REVSTART>``/``<REVEND>``
    and records, per span:
      * content: the plain words inside the span, space-joined;
      * words: segment-level word indices of those words (0-based, special
        tokens excluded from the count);
      * mark_location: the index of the last word of the span.
    Segments with at least one span get a "revisions" key; the file is
    rewritten in place.

    Args:
        session_id: directory name under ``session_data/``.
    """
    file_path = f"session_data/{session_id}/transcription_cunit.json"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for segment in data["segments"]:
        tokens = segment.get("text_token", "").split()

        revisions_list = []
        inside = False
        rev_words = []
        rev_word_indices = []
        word_count = 0  # segment-level index of the next plain word

        for tok in tokens:
            if tok == "<REVSTART>":
                inside = True
                rev_words, rev_word_indices = [], []
            elif tok == "<REVEND>":
                inside = False
                if rev_words:
                    revisions_list.append(
                        {
                            "content": " ".join(rev_words),
                            "words": rev_word_indices.copy(),
                            "mark_location": rev_word_indices[-1],
                        }
                    )
                # reset so a stray extra <REVEND> can't duplicate the entry
                rev_words, rev_word_indices = [], []
            elif tok.startswith("<") and tok.endswith(">"):
                # other special tokens never advance the word index
                pass
            else:
                if inside:
                    rev_words.append(tok)
                    rev_word_indices.append(word_count)
                word_count += 1

        if revisions_list:
            segment["revisions"] = revisions_list

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)



# Read transcription_cunit.json from session_data/<session_id>.
# In the text_token item, a pause is represented as <PAUSE>.
# Extract the pause information and add it to the current JSON file; it includes:
# start: the end timestamp of the previous non-special token.
# If there is no previous non-special token, start is the end time of the previous segment; in that case the start time of the current segment is set to the end time of the previous segment.
# end: the start timestamp of the next non-special token.
# If there is no next non-special token, the <PAUSE> (and any special tokens after it) should be moved to the start of the next segment.
# duration: the pause duration calculated from start and end.
# format example: 
"""
"pauses": [
        {
          "start": 364.08,
          "end": 369.1,
          "duration": 5.02
        },
        {
          "start": 369.18,
          "end": 369.56,
          "duration": 0.38
        }
      ],

"""
def annotate_pause_for_mazewhisper(session_id):
    """Annotate <PAUSE> markers in a session transcript with timestamps.

    Reads ``session_data/<session_id>/transcription_cunit.json`` and, for every
    ``<PAUSE>`` token in a segment's ``text_token``, records:
      * start: end timestamp of the previous plain word in the segment,
        falling back to the previous segment's end time (in which case the
        current segment's start is snapped to that value);
      * end: start timestamp of the next plain word in the segment, falling
        back to the next segment's start time;
      * duration: end - start, rounded to 2 decimals.
    Only pauses with both endpoints resolved are stored (under the segment's
    "pauses" key); the file is rewritten in place.

    Args:
        session_id: directory name under ``session_data/``.
    """
    file_path = f"session_data/{session_id}/transcription_cunit.json"

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    segments = data['segments']

    def _is_special(tok):
        # special tokens look like <PAUSE>, <FILLER>, <REPSTART>, ...
        return tok.startswith('<') and tok.endswith('>')

    for i, segment in enumerate(segments):
        text_token = segment.get('text_token', '')
        if '<PAUSE>' not in text_token:
            continue

        words = segment.get('words', [])
        tokens = text_token.split()
        pauses_list = []

        for j, token in enumerate(tokens):
            if token != '<PAUSE>':
                continue

            # Number of plain words before this pause == index of the word
            # that follows it (if any) in the segment's `words` list.
            word_idx = sum(1 for t in tokens[:j] if not _is_special(t))

            # Start: end of the previous word, else end of previous segment.
            start_time = None
            if 0 < word_idx <= len(words):
                start_time = words[word_idx - 1].get('end')
            elif i > 0:
                start_time = segments[i - 1].get('end')
                # Pause opens the segment: align segment start with it.
                segment['start'] = start_time

            # End: start of the next word, else start of the next segment.
            end_time = None
            has_next_word = any(not _is_special(t) for t in tokens[j + 1:])
            if has_next_word and word_idx < len(words):
                end_time = words[word_idx].get('start')
            elif i < len(segments) - 1:
                # NOTE(review): the spec says a trailing <PAUSE> should be
                # moved into the next segment; approximated here by using the
                # next segment's start time.
                end_time = segments[i + 1].get('start')

            # Record the pause only when both endpoints resolved.
            if start_time is not None and end_time is not None:
                pauses_list.append({
                    "start": start_time,
                    "end": end_time,
                    "duration": round(end_time - start_time, 2),
                })

        if pauses_list:
            segment['pauses'] = pauses_list

    # Save the updated data back to file
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    return



# read transcription_cunit.json from session_data/session_id
# read text_token item, filler words is represent as <FILLER> 
# extract filler word information and add them to current json file, information include 
# start: set to empty for now
# end: set to empty for now
# duration: set to empty for now
# format example: 
"""
"fillerwords": [
        {
          "start": ,
          "end": ,
          "content": "",
          "duration": 
        }
      ],
"""
# then, insert "um" as the previous token of <FILLER> in text_token item.
# then, insert "um" to text item and word list (timestamp set to empty for now), the location is corresponding to the <FILLER> location in text_token
# (this is a preliminary implementation; next steps should use maze whisper to transcribe filler word content and align it directly)
def annotate_fillerword_for_mazewhisper(session_id):
    """Annotate <FILLER> markers in a session transcript (preliminary).

    Reads ``session_data/<session_id>/transcription_cunit.json`` and, for each
    segment containing ``<FILLER>`` tokens:
      * adds a "fillerwords" list with one placeholder record (empty content,
        null timestamps) per marker;
      * inserts "um" immediately before every ``<FILLER>`` in ``text_token``;
      * splices "um" into ``text`` and ``words`` at the matching word
        positions (timestamps left null for now).
    The file is rewritten in place.

    Args:
        session_id: directory name under ``session_data/``.
    """
    file_path = f"session_data/{session_id}/transcription_cunit.json"

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for segment in data['segments']:
        token_str = segment.get('text_token', '')
        if '<FILLER>' not in token_str:
            continue

        word_entries = segment.get('words', [])
        spoken = segment.get('text', '').split()

        # One placeholder record per <FILLER> occurrence (timestamps unknown).
        placeholder_records = [
            {"start": None, "end": None, "content": "", "duration": None}
            for _ in range(token_str.count('<FILLER>'))
        ]

        # Prepend "um" to every <FILLER> marker in the token stream.
        segment['text_token'] = token_str.replace('<FILLER>', 'um <FILLER>')

        # Locate each <FILLER> by how many plain words precede it.
        positions = []
        seen_words = 0
        for tok in token_str.split():
            if tok == '<FILLER>':
                positions.append(seen_words)
            elif not (tok.startswith('<') and tok.endswith('>')):
                seen_words += 1

        # Splice "um" back-to-front so earlier indices stay valid.
        for pos in reversed(positions):
            spoken.insert(pos, 'um')
            if pos <= len(word_entries):
                word_entries.insert(pos, {"word": "um", "start": None, "end": None})

        segment['text'] = ' '.join(spoken)
        segment['words'] = word_entries
        segment['fillerwords'] = placeholder_records

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    return





def annotate_maze_for_mazewhisper(session_id):
    """Run the full maze-annotation pipeline for one session.

    Applies filler-word, repetition, revision, and pause annotation (in that
    order) to ``session_data/<session_id>/transcription_cunit.json``.
    """
    pipeline = (
        annotate_fillerword_for_mazewhisper,
        annotate_repetition_for_mazewhisper,
        annotate_revision_for_mazewhisper,
        annotate_pause_for_mazewhisper,
    )
    for annotate in pipeline:
        annotate(session_id)

    print("Maze annotation completed!")