Spaces:

PuristanLabs1
/

VocalWeb

Sleeping

PuristanLabs1 committed on Feb 28

Commit

32d9ffc

verified ·

1 Parent(s): be9f3b7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -80,7 +80,19 @@ def extract_and_clean_text(data):
                 metadata_dict[key.strip()] = value.strip()  # Store cleaned key-value pair
-    # Step 2: Clean the extracted text
     def clean_text(text):
         # Remove inline citations like [2][4]
         text = re.sub(r'\[\d+\]', '', text)

                 metadata_dict[key.strip()] = value.strip()  # Store cleaned key-value pair
+    #Step 2: Remove everything before the "Abstract" section
+    def remove_text_before_abstract(text):
+        """Removes all text before the first occurrence of 'Abstract'."""
+        abstract_pattern = re.compile(r"(?i)\babstract\b")  # Case-insensitive search
+        match = abstract_pattern.search(text)
+        if match:
+            return text[match.start():]  # Keep text from "Abstract" onwards
+        return text  # If "Abstract" is not found, return the full text
+    data = remove_text_before_abstract(data)
+    # Step 3: Clean the extracted text
     def clean_text(text):
         # Remove inline citations like [2][4]
         text = re.sub(r'\[\d+\]', '', text)