sagar008 committed on
Commit
5fcab8d
·
verified ·
1 Parent(s): 7a8b47f

Update pdf_processor.py

Browse files
Files changed (1) hide show
  1. pdf_processor.py +18 -7
pdf_processor.py CHANGED
@@ -1,4 +1,4 @@
1
- # pdf_processor.py - PDF processing functionality
2
  import re
3
  import tempfile
4
  import os
@@ -14,7 +14,7 @@ class PDFProcessor:
14
  self.executor = ThreadPoolExecutor(max_workers=2)
15
 
16
  def _extract_text_sync(self, pdf_bytes: bytes) -> str:
17
- """Synchronous PDF text extraction"""
18
  try:
19
  from PyPDF2 import PdfReader
20
 
@@ -22,11 +22,17 @@ class PDFProcessor:
22
  pdf_stream = io.BytesIO(pdf_bytes)
23
  pdf_reader = PdfReader(pdf_stream)
24
 
 
 
 
 
25
  full_text = ""
 
 
26
  for page_num, page in enumerate(pdf_reader.pages):
27
  try:
28
  text = page.extract_text()
29
- if text:
30
  # Process each page
31
  lines = text.split('\n')
32
  # Remove first line (often header/page number)
@@ -41,7 +47,9 @@ class PDFProcessor:
41
  # Remove extra spaces
42
  cleaned_text = ' '.join(cleaned_text.split())
43
 
44
- full_text += cleaned_text + " "
 
 
45
 
46
  except Exception as e:
47
  print(f"⚠️ Error extracting text from page {page_num}: {e}")
@@ -50,11 +58,11 @@ class PDFProcessor:
50
  # Final cleanup
51
  full_text = full_text.strip()
52
 
53
- # Remove very short extractions (likely errors)
54
  if len(full_text) < 50:
55
- raise Exception("Extracted text too short - possible extraction error")
56
 
57
- print(f"✅ Successfully extracted {len(full_text)} characters from PDF")
58
  return full_text
59
 
60
  except Exception as e:
@@ -63,6 +71,9 @@ class PDFProcessor:
63
 
64
  async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
65
  """Async wrapper for PDF text extraction"""
 
 
 
66
  loop = asyncio.get_event_loop()
67
  return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)
68
 
 
1
+ # pdf_processor.py - Updated for better error handling
2
  import re
3
  import tempfile
4
  import os
 
14
  self.executor = ThreadPoolExecutor(max_workers=2)
15
 
16
  def _extract_text_sync(self, pdf_bytes: bytes) -> str:
17
+ """Synchronous PDF text extraction with enhanced error handling"""
18
  try:
19
  from PyPDF2 import PdfReader
20
 
 
22
  pdf_stream = io.BytesIO(pdf_bytes)
23
  pdf_reader = PdfReader(pdf_stream)
24
 
25
+ # Check if PDF has pages
26
+ if len(pdf_reader.pages) == 0:
27
+ raise Exception("PDF has no pages")
28
+
29
  full_text = ""
30
+ pages_processed = 0
31
+
32
  for page_num, page in enumerate(pdf_reader.pages):
33
  try:
34
  text = page.extract_text()
35
+ if text and len(text.strip()) > 0:
36
  # Process each page
37
  lines = text.split('\n')
38
  # Remove first line (often header/page number)
 
47
  # Remove extra spaces
48
  cleaned_text = ' '.join(cleaned_text.split())
49
 
50
+ if cleaned_text:
51
+ full_text += cleaned_text + " "
52
+ pages_processed += 1
53
 
54
  except Exception as e:
55
  print(f"⚠️ Error extracting text from page {page_num}: {e}")
 
58
  # Final cleanup
59
  full_text = full_text.strip()
60
 
61
+ # Validate extraction
62
  if len(full_text) < 50:
63
+ raise Exception(f"Extracted text too short ({len(full_text)} chars) - possible extraction error. Pages processed: {pages_processed}")
64
 
65
+ print(f"✅ Successfully extracted {len(full_text)} characters from {pages_processed} pages")
66
  return full_text
67
 
68
  except Exception as e:
 
71
 
72
  async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
73
  """Async wrapper for PDF text extraction"""
74
+ if not pdf_bytes or len(pdf_bytes) < 100:
75
+ raise Exception("PDF bytes are empty or too small")
76
+
77
  loop = asyncio.get_event_loop()
78
  return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)
79