Spaces:

sagar008
/

unified-analysis-for-legal-docs

Sleeping

App Files Community

sagar008 committed on Aug 11

Commit

5fcab8d

verified ·

1 Parent(s): 7a8b47f

Update pdf_processor.py

Browse files

Files changed (1) hide show

pdf_processor.py +18 -7

pdf_processor.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# pdf_processor.py - PDF processing functionality
 import re
 import tempfile
 import os
@@ -14,7 +14,7 @@ class PDFProcessor:
         self.executor = ThreadPoolExecutor(max_workers=2)
     def _extract_text_sync(self, pdf_bytes: bytes) -> str:
-        """Synchronous PDF text extraction"""
         try:
             from PyPDF2 import PdfReader
@@ -22,11 +22,17 @@ class PDFProcessor:
             pdf_stream = io.BytesIO(pdf_bytes)
             pdf_reader = PdfReader(pdf_stream)
             full_text = ""
             for page_num, page in enumerate(pdf_reader.pages):
                 try:
                     text = page.extract_text()
-                    if text:
                         # Process each page
                         lines = text.split('\n')
                         # Remove first line (often header/page number)
@@ -41,7 +47,9 @@ class PDFProcessor:
                         # Remove extra spaces
                         cleaned_text = ' '.join(cleaned_text.split())
-                        full_text += cleaned_text + " "
                 except Exception as e:
                     print(f"⚠️ Error extracting text from page {page_num}: {e}")
@@ -50,11 +58,11 @@ class PDFProcessor:
             # Final cleanup
             full_text = full_text.strip()
-            # Remove very short extractions (likely errors)
             if len(full_text) < 50:
-                raise Exception("Extracted text too short - possible extraction error")
-            print(f"✅ Successfully extracted {len(full_text)} characters from PDF")
             return full_text
         except Exception as e:
@@ -63,6 +71,9 @@ class PDFProcessor:
     async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
         """Async wrapper for PDF text extraction"""
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)

+# pdf_processor.py - Updated for better error handling
 import re
 import tempfile
 import os
         self.executor = ThreadPoolExecutor(max_workers=2)
     def _extract_text_sync(self, pdf_bytes: bytes) -> str:
+        """Synchronous PDF text extraction with enhanced error handling"""
         try:
             from PyPDF2 import PdfReader
             pdf_stream = io.BytesIO(pdf_bytes)
             pdf_reader = PdfReader(pdf_stream)
+            # Check if PDF has pages
+            if len(pdf_reader.pages) == 0:
+                raise Exception("PDF has no pages")
             full_text = ""
+            pages_processed = 0
             for page_num, page in enumerate(pdf_reader.pages):
                 try:
                     text = page.extract_text()
+                    if text and len(text.strip()) > 0:
                         # Process each page
                         lines = text.split('\n')
                         # Remove first line (often header/page number)
                         # Remove extra spaces
                         cleaned_text = ' '.join(cleaned_text.split())
+                        if cleaned_text:
+                            full_text += cleaned_text + " "
+                            pages_processed += 1
                 except Exception as e:
                     print(f"⚠️ Error extracting text from page {page_num}: {e}")
             # Final cleanup
             full_text = full_text.strip()
+            # Validate extraction
             if len(full_text) < 50:
+                raise Exception(f"Extracted text too short ({len(full_text)} chars) - possible extraction error. Pages processed: {pages_processed}")
+            print(f"✅ Successfully extracted {len(full_text)} characters from {pages_processed} pages")
             return full_text
         except Exception as e:
     async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
         """Async wrapper for PDF text extraction"""
+        if not pdf_bytes or len(pdf_bytes) < 100:
+            raise Exception("PDF bytes are empty or too small")
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)