Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -80,7 +80,19 @@ def extract_and_clean_text(data):
|
|
80 |
|
81 |
metadata_dict[key.strip()] = value.strip() # Store cleaned key-value pair
|
82 |
|
83 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
def clean_text(text):
|
85 |
# Remove inline citations like [2][4]
|
86 |
text = re.sub(r'\[\d+\]', '', text)
|
|
|
80 |
|
81 |
metadata_dict[key.strip()] = value.strip() # Store cleaned key-value pair
|
82 |
|
83 |
+
# Step 2: Remove everything before the "Abstract" section
|
84 |
+
def remove_text_before_abstract(text: str) -> str:
    """Drop everything that precedes the first 'Abstract' in *text*.

    The search is case-insensitive and anchored on word boundaries, so
    'ABSTRACT' and 'abstract' both match while 'abstraction' does not.

    Args:
        text: The text to trim.

    Returns:
        The text starting at the first whole-word occurrence of
        'abstract', or *text* unchanged when no such word is present.
    """
    # (?i) makes the match case-insensitive; \b keeps it to whole words.
    match = re.search(r"(?i)\babstract\b", text)
    if match is None:
        # No "Abstract" heading found: return the full text untouched.
        return text
    # Keep the text from "Abstract" onwards, heading included.
    return text[match.start():]
|
92 |
+
|
93 |
+
data = remove_text_before_abstract(data)
|
94 |
+
|
95 |
+
# Step 3: Clean the extracted text
|
96 |
def clean_text(text):
|
97 |
# Remove inline citations like [2][4]
|
98 |
text = re.sub(r'\[\d+\]', '', text)
|