Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Sleeping

samyak152002 committed on Nov 30, 2024

Commit

f652e83

verified ·

1 Parent(s): 37896f7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -30,21 +30,31 @@ def extract_pdf_text_by_page(file) -> List[str]:
 def extract_pdf_text(file) -> str:
     """Extracts full text from a PDF file using PyMuPDF."""
-    print("me llamo samyak")
     try:
-        # Open the PDF file
-#         print("me llamo samyak")
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         full_text = ""
-#         print(doc)
         for page_num, page in enumerate(doc, start=1):
-            text = page.get_text("text")
-            full_text += text + "\n"
-            print(f"Extracted text from page {page_num}: {len(text)} characters.")
         doc.close()
         print(f"Total extracted text length: {len(full_text)} characters.")
-#         print(full_text)
         return full_text
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
         return ""

 def extract_pdf_text(file) -> str:
     """Extract the full text of a PDF with PyMuPDF, re-joining hyphenated line breaks.

     Args:
         file: Either a filesystem path (``str``) or an object whose
             ``read()`` returns the PDF bytes.

     Returns:
         The text of every block on every page, in iteration order, with a
         newline appended after each block. Returns an empty string if any
         step fails; the error is printed instead of being raised.
     """
     try:
         if isinstance(file, str):
             doc = fitz.open(file)
         else:
             doc = fitz.open(stream=file.read(), filetype="pdf")

         page_texts = []
         try:
             for page_num, page in enumerate(doc, start=1):
                 block_texts = []
                 # "blocks" mode yields one tuple per block; its text is at index 4.
                 for block in page.get_text("blocks"):
                     # Re-join a word split by a hyphen at a line break
                     # ("exam-\nple" -> "example"). Hyphens that are not
                     # followed by a line break (e.g. "state-of-the-art") do
                     # not match the pattern and are kept as they are.
                     joined = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', block[4])
                     block_texts.append(joined + "\n")
                 processed_text = "".join(block_texts)
                 page_texts.append(processed_text)
                 print(f"Extracted text from page {page_num}: {len(processed_text)} characters.")
         finally:
             # Release the document even when a page fails part-way through.
             doc.close()

         full_text = "".join(page_texts)
         print(f"Total extracted text length: {len(full_text)} characters.")
         return full_text
     except Exception as e:
         # Best-effort contract: callers receive "" rather than an exception.
         print(f"Error extracting text from PDF: {e}")
         return ""