Spaces:

sagar008
/

unified-analysis-for-legal-docs

Sleeping

App Files Files Community

sagar008 committed on Aug 11

Commit

cfbc5c3

verified ·

1 Parent(s): 75f3dc7

Create pdf_processor.py

Browse files

Files changed (1) hide show

pdf_processor.py +72 -0

pdf_processor.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# pdf_processor.py - PDF processing functionality
+import re
+import tempfile
+import os
+from typing import Optional
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import io
class PDFExtractionError(Exception):
    """Raised when text cannot be extracted from a PDF.

    Subclasses ``Exception`` so existing ``except Exception`` handlers keep
    working unchanged.
    """


class PDFProcessor:
    """Handles PDF text extraction and preprocessing.

    Extraction is synchronous (PyPDF2) and is offloaded to a small private
    thread pool so it can be awaited from asyncio code without blocking the
    event loop.
    """

    # Compiled once at class creation instead of once per page.
    _URL_PATTERN = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

    def __init__(self, max_workers: int = 2, *, min_text_length: int = 50,
                 strip_first_line: bool = True):
        """Create the processor and its worker pool.

        Args:
            max_workers: Size of the thread pool used for extraction.
            min_text_length: Extractions shorter than this many characters
                are treated as failures.
            strip_first_line: Drop the first line of every multi-line page
                (often a header or page number).
        """
        self.min_text_length = min_text_length
        self.strip_first_line = strip_first_line
        self.executor = ThreadPoolExecutor(max_workers=max_workers)

    def _clean_page_text(self, text: str) -> str:
        """Return one page's text with header line, URLs and extra whitespace removed."""
        body = text
        if self.strip_first_line:
            # Only drop the first line when there is more than one line;
            # a single-line page is kept whole.
            _, newline, rest = text.partition('\n')
            if newline:
                body = rest
        body = self._URL_PATTERN.sub('', body)
        # split()/join collapses newlines and runs of whitespace to single spaces.
        return ' '.join(body.split())

    def _extract_text_sync(self, pdf_bytes: bytes) -> str:
        """Synchronous PDF text extraction.

        Args:
            pdf_bytes: Raw contents of a PDF file.

        Returns:
            The cleaned text of all readable pages joined by single spaces.

        Raises:
            PDFExtractionError: If PyPDF2 is unavailable, the PDF cannot be
                parsed, or the extracted text is shorter than
                ``min_text_length``.
        """
        try:
            from PyPDF2 import PdfReader

            pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
            cleaned_pages = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Best effort: a page that fails is reported and skipped so
                # the remaining pages are still processed.
                try:
                    text = page.extract_text()
                    if text:
                        cleaned_text = self._clean_page_text(text)
                        # Skip pages that clean down to nothing so they do
                        # not leave doubled spaces in the result.
                        if cleaned_text:
                            cleaned_pages.append(cleaned_text)
                except Exception as e:
                    print(f"⚠️ Error extracting text from page {page_num}: {e}")
                    continue

            full_text = " ".join(cleaned_pages)
            # Very short extractions are likely errors (e.g. scanned PDFs).
            if len(full_text) < self.min_text_length:
                raise PDFExtractionError("Extracted text too short - possible extraction error")
            print(f"✅ Successfully extracted {len(full_text)} characters from PDF")
            return full_text
        except Exception as e:
            print(f"❌ PDF extraction error: {e}")
            raise PDFExtractionError(f"Failed to extract text from PDF: {e}") from e

    async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
        """Async wrapper for PDF text extraction.

        Runs ``_extract_text_sync`` on the processor's thread pool and
        returns its result; raises whatever that method raises.
        """
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)

    def close(self) -> None:
        """Release the thread pool without waiting for in-flight work."""
        executor = getattr(self, 'executor', None)
        if executor is not None:
            executor.shutdown(wait=False)

    def __del__(self):
        """Cleanup thread pool."""
        # wait=False: a finalizer can run on any thread (including a pool
        # worker) and must not block or try to join the current thread.
        self.close()