PuristanLabs1 committed on
Commit
32d9ffc
·
verified ·
1 Parent(s): be9f3b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -1
app.py CHANGED
@@ -80,7 +80,19 @@ def extract_and_clean_text(data):
80
 
81
  metadata_dict[key.strip()] = value.strip() # Store cleaned key-value pair
82
 
83
- # Step 2: Clean the extracted text
 
 
 
 
 
 
 
 
 
 
 
 
84
  def clean_text(text):
85
  # Remove inline citations like [2][4]
86
  text = re.sub(r'\[\d+\]', '', text)
 
80
 
81
  metadata_dict[key.strip()] = value.strip() # Store cleaned key-value pair
82
 
83
+ #Step 2: Remove everything before the "Abstract" section
84
def remove_text_before_abstract(text):
    """Return *text* starting at the first standalone word 'Abstract'.

    The search is case-insensitive and matches whole words only, so a
    word such as 'abstraction' is not treated as the section marker.

    Args:
        text: The string to trim. ``None`` or an empty string is
            returned unchanged.

    Returns:
        The substring of *text* from the first match onwards, or *text*
        itself when the word 'Abstract' does not occur.
    """
    if not text:
        # Nothing to search; this also avoids a TypeError from the regex
        # engine when the caller passes None.
        return text

    match = re.search(r"\babstract\b", text, flags=re.IGNORECASE)
    if match is None:
        # No 'Abstract' marker found: keep the full text.
        return text
    return text[match.start():]
92
+
93
+ data = remove_text_before_abstract(data)
94
+
95
+ # Step 3: Clean the extracted text
96
  def clean_text(text):
97
  # Remove inline citations like [2][4]
98
  text = re.sub(r'\[\d+\]', '', text)