Update pdf_processor.py
Browse files
- pdf_processor.py +18 -7
pdf_processor.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# pdf_processor.py -
|
2 |
import re
|
3 |
import tempfile
|
4 |
import os
|
@@ -14,7 +14,7 @@ class PDFProcessor:
|
|
14 |
self.executor = ThreadPoolExecutor(max_workers=2)
|
15 |
|
16 |
def _extract_text_sync(self, pdf_bytes: bytes) -> str:
|
17 |
-
"""Synchronous PDF text extraction"""
|
18 |
try:
|
19 |
from PyPDF2 import PdfReader
|
20 |
|
@@ -22,11 +22,17 @@ class PDFProcessor:
|
|
22 |
pdf_stream = io.BytesIO(pdf_bytes)
|
23 |
pdf_reader = PdfReader(pdf_stream)
|
24 |
|
|
|
|
|
|
|
|
|
25 |
full_text = ""
|
|
|
|
|
26 |
for page_num, page in enumerate(pdf_reader.pages):
|
27 |
try:
|
28 |
text = page.extract_text()
|
29 |
-
if text:
|
30 |
# Process each page
|
31 |
lines = text.split('\n')
|
32 |
# Remove first line (often header/page number)
|
@@ -41,7 +47,9 @@ class PDFProcessor:
|
|
41 |
# Remove extra spaces
|
42 |
cleaned_text = ' '.join(cleaned_text.split())
|
43 |
|
44 |
-
|
|
|
|
|
45 |
|
46 |
except Exception as e:
|
47 |
print(f"⚠️ Error extracting text from page {page_num}: {e}")
|
@@ -50,11 +58,11 @@ class PDFProcessor:
|
|
50 |
# Final cleanup
|
51 |
full_text = full_text.strip()
|
52 |
|
53 |
-
#
|
54 |
if len(full_text) < 50:
|
55 |
-
raise Exception("Extracted text too short - possible extraction error")
|
56 |
|
57 |
-
print(f"✅ Successfully extracted {len(full_text)} characters from
|
58 |
return full_text
|
59 |
|
60 |
except Exception as e:
|
@@ -63,6 +71,9 @@ class PDFProcessor:
|
|
63 |
|
64 |
async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
    """Extract text from in-memory PDF bytes without blocking the event loop.

    The blocking ``self._extract_text_sync`` call is submitted to
    ``self.executor`` and its result is awaited.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The value returned by ``self._extract_text_sync(pdf_bytes)``.

    Raises:
        Exception: Whatever ``self._extract_text_sync`` raises is propagated
            to the awaiting caller.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # documented way to obtain it, unlike get_event_loop().
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)
|
68 |
|
|
|
1 |
+
# pdf_processor.py - Updated for better error handling
|
2 |
import re
|
3 |
import tempfile
|
4 |
import os
|
|
|
14 |
self.executor = ThreadPoolExecutor(max_workers=2)
|
15 |
|
16 |
def _extract_text_sync(self, pdf_bytes: bytes) -> str:
|
17 |
+
"""Synchronous PDF text extraction with enhanced error handling"""
|
18 |
try:
|
19 |
from PyPDF2 import PdfReader
|
20 |
|
|
|
22 |
pdf_stream = io.BytesIO(pdf_bytes)
|
23 |
pdf_reader = PdfReader(pdf_stream)
|
24 |
|
25 |
+
# Check if PDF has pages
|
26 |
+
if len(pdf_reader.pages) == 0:
|
27 |
+
raise Exception("PDF has no pages")
|
28 |
+
|
29 |
full_text = ""
|
30 |
+
pages_processed = 0
|
31 |
+
|
32 |
for page_num, page in enumerate(pdf_reader.pages):
|
33 |
try:
|
34 |
text = page.extract_text()
|
35 |
+
if text and len(text.strip()) > 0:
|
36 |
# Process each page
|
37 |
lines = text.split('\n')
|
38 |
# Remove first line (often header/page number)
|
|
|
47 |
# Remove extra spaces
|
48 |
cleaned_text = ' '.join(cleaned_text.split())
|
49 |
|
50 |
+
if cleaned_text:
|
51 |
+
full_text += cleaned_text + " "
|
52 |
+
pages_processed += 1
|
53 |
|
54 |
except Exception as e:
|
55 |
print(f"⚠️ Error extracting text from page {page_num}: {e}")
|
|
|
58 |
# Final cleanup
|
59 |
full_text = full_text.strip()
|
60 |
|
61 |
+
# Validate extraction
|
62 |
if len(full_text) < 50:
|
63 |
+
raise Exception(f"Extracted text too short ({len(full_text)} chars) - possible extraction error. Pages processed: {pages_processed}")
|
64 |
|
65 |
+
print(f"✅ Successfully extracted {len(full_text)} characters from {pages_processed} pages")
|
66 |
return full_text
|
67 |
|
68 |
except Exception as e:
|
|
|
71 |
|
72 |
async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
    """Extract text from in-memory PDF bytes without blocking the event loop.

    The input is sanity-checked first; the blocking
    ``self._extract_text_sync`` call is then submitted to ``self.executor``
    and its result is awaited.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The value returned by ``self._extract_text_sync(pdf_bytes)``.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/``None`` or shorter than
            100 bytes. ``ValueError`` is a subclass of ``Exception``, so
            callers that catch ``Exception`` are unaffected.
        Exception: Whatever ``self._extract_text_sync`` raises is propagated
            to the awaiting caller.
    """
    # Reject obviously unusable input before occupying an executor worker.
    if not pdf_bytes or len(pdf_bytes) < 100:
        raise ValueError("PDF bytes are empty or too small")

    # Inside a coroutine a loop is always running; get_running_loop() is the
    # documented way to obtain it, unlike get_event_loop().
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)
|
79 |
|