Spaces:
Build error
Build error
Update DiT_Extractor/sentence_extractor.py
Browse files
DiT_Extractor/sentence_extractor.py
CHANGED
|
@@ -95,6 +95,11 @@ def sentence_extract(document):
|
|
| 95 |
for sentence in sentences:
|
| 96 |
t += len(sentence)
|
| 97 |
if t <= max_tokens:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
word_section += sentence
|
| 99 |
else:
|
| 100 |
word_sections.append(word_section)
|
|
|
|
| 95 |
for sentence in sentences:
|
| 96 |
t += len(sentence)
|
| 97 |
if t <= max_tokens:
|
| 98 |
+
# update character indices when concatenating sentences
|
| 99 |
+
if len(word_section) > 0:
|
| 100 |
+
last_word_obj = word_section[-1]
|
| 101 |
+
_, (_, char_idx_offset), _ = last_word_obj
|
| 102 |
+
sentence = [(w, (sc+char_idx_offset+1, ec+char_idx_offset+1), bbox) for w, (sc, ec), bbox in sentence]
|
| 103 |
word_section += sentence
|
| 104 |
else:
|
| 105 |
word_sections.append(word_section)
|