Spaces:
Runtime error
Runtime error
Commit
·
5f21add
1
Parent(s):
189657b
Fix get_slides in text_extractor.py
Browse files
__pycache__/app.cpython-38.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-38.pyc and b/__pycache__/app.cpython-38.pyc differ
|
|
|
__pycache__/text_extractor.cpython-38.pyc
CHANGED
|
Binary files a/__pycache__/text_extractor.cpython-38.pyc and b/__pycache__/text_extractor.cpython-38.pyc differ
|
|
|
text_extractor.py
CHANGED
|
@@ -117,19 +117,16 @@ class TextExtractor:
|
|
| 117 |
# Remove tag and pipes from the text
|
| 118 |
section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
|
| 119 |
elif tag.startswith('p'):
|
| 120 |
-
text = re.split("((\|){2,})", text)
|
| 121 |
for paragraph in text:
|
| 122 |
-
paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip()
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
section[
|
| 128 |
-
elif paragraph:
|
| 129 |
-
paragraph = re.sub(' +', ' ', paragraph) # Replace any double space in the paragraph
|
| 130 |
-
section.append((tag, paragraph))
|
| 131 |
try:
|
| 132 |
-
if
|
| 133 |
slides[f"Page {page}"] = section
|
| 134 |
page += 1
|
| 135 |
except:
|
|
|
|
| 117 |
# Remove tag and pipes from the text
|
| 118 |
section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
|
| 119 |
elif tag.startswith('p'):
|
| 120 |
+
text = re.split("((\|){2,})", text) # If two or more consecutive pipes are encountered, split the text into separate paragraphs
|
| 121 |
for paragraph in text:
|
| 122 |
+
paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip() # Remove any tags and pipes, then trim surrounding whitespace
|
| 123 |
+
paragraph = re.sub(' +', ' ', paragraph) # Collapse any run of two or more spaces into a single space
|
| 124 |
+
if paragraph and paragraph[0].islower(): # If a paragraph from a different block starts with a lowercase character, concatenate it with the previous paragraph
|
| 125 |
+
section[-1][1] += f" {paragraph}"
|
| 126 |
+
elif paragraph:
|
| 127 |
+
section.append([tag, paragraph])
|
|
|
|
|
|
|
|
|
|
| 128 |
try:
|
| 129 |
+
if tag_match.group() == 'h1': # Create new page when current text is a type 1 header or title
|
| 130 |
slides[f"Page {page}"] = section
|
| 131 |
page += 1
|
| 132 |
except:
|