Update app.py
Browse files
app.py
CHANGED
|
@@ -30,21 +30,31 @@ def extract_pdf_text_by_page(file) -> List[str]:
|
|
| 30 |
|
| 31 |
def extract_pdf_text(file) -> str:
|
| 32 |
"""Extracts full text from a PDF file using PyMuPDF."""
|
| 33 |
-
print("me llamo samyak")
|
| 34 |
try:
|
| 35 |
-
# Open the PDF file
|
| 36 |
-
# print("me llamo samyak")
|
| 37 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
| 38 |
full_text = ""
|
| 39 |
-
|
| 40 |
for page_num, page in enumerate(doc, start=1):
|
| 41 |
-
text
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
doc.close()
|
| 45 |
print(f"Total extracted text length: {len(full_text)} characters.")
|
| 46 |
-
# print(full_text)
|
| 47 |
return full_text
|
|
|
|
| 48 |
except Exception as e:
|
| 49 |
print(f"Error extracting text from PDF: {e}")
|
| 50 |
return ""
|
|
|
|
| 30 |
|
| 31 |
def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using PyMuPDF, repairing line-break hyphens.

    Each page is read block by block via ``page.get_text("blocks")``. Within a
    block, a word split across a line break with a trailing hyphen
    (``"exam-\\nple"``) is joined back together (``"example"``). Hyphens that
    are not followed by a line break (e.g. ``"state-of-the-art"``) are left
    untouched because the pattern requires a newline after the hyphen.

    Args:
        file: Either a filesystem path (``str``) or an object exposing
            ``read()`` that returns the raw PDF bytes.

    Returns:
        The concatenated text of all pages, with a newline appended after
        every block. Returns ``""`` if anything goes wrong (the error is
        printed, not raised).
    """
    try:
        # Compile once; the same pattern is applied to every block of every page.
        line_break_hyphen = re.compile(r'(\w+)-\s*\n\s*(\w+)')

        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            doc = fitz.open(stream=file.read(), filetype="pdf")

        page_texts = []
        try:
            for page_num, page in enumerate(doc, start=1):
                # Each block is a tuple; the text content is at index 4.
                blocks = page.get_text("blocks")
                processed_text = "".join(
                    line_break_hyphen.sub(r'\1\2', block[4]) + "\n"
                    for block in blocks
                )
                page_texts.append(processed_text)
                print(f"Extracted text from page {page_num}: {len(processed_text)} characters.")
        finally:
            # Always release the document, even if a page fails to extract.
            doc.close()

        full_text = "".join(page_texts)
        print(f"Total extracted text length: {len(full_text)} characters.")
        return full_text

    except Exception as e:
        # Best-effort contract: callers receive an empty string on any failure.
        print(f"Error extracting text from PDF: {e}")
        return ""
|