sagar008 committed on
Commit
cfbc5c3
·
verified ·
1 Parent(s): 75f3dc7

Create pdf_processor.py

Browse files
Files changed (1) hide show
  1. pdf_processor.py +72 -0
pdf_processor.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pdf_processor.py - PDF processing functionality
2
+ import re
3
+ import tempfile
4
+ import os
5
+ from typing import Optional
6
+ import asyncio
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ import io
9
+
10
class PDFProcessor:
    """Extract and clean text from in-memory PDF documents.

    Extraction itself is synchronous (``_extract_text_sync``); the public
    coroutine ``extract_text_from_pdf_bytes`` runs it on this instance's
    thread pool so the calling event loop is not blocked.
    """

    # Matches http(s) URLs. Compiled once at class creation instead of once
    # per page inside the extraction loop.
    _URL_PATTERN = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

    def __init__(self, max_workers: int = 2, min_text_length: int = 50):
        """Create the processor and its worker pool.

        Args:
            max_workers: Size of the thread pool used for extraction.
            min_text_length: Extractions shorter than this many characters
                are rejected as probable extraction failures.
        """
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        self.min_text_length = min_text_length

    @classmethod
    def _clean_page_text(cls, text: str) -> str:
        """Return one page's text with its first line, URLs and extra whitespace removed."""
        # Drop the first line (often a header or page number), but only when
        # the page has more than one line; a single-line page is kept whole.
        _first_line, newline, remainder = text.partition('\n')
        body = remainder if newline else text

        # The URL pattern never matches whitespace, so stripping URLs before
        # collapsing newlines/spaces gives the same result as the reverse.
        without_urls = cls._URL_PATTERN.sub('', body)

        # split() with no argument collapses every whitespace run
        # (including newlines) and trims both ends.
        return ' '.join(without_urls.split())

    def _extract_text_sync(self, pdf_bytes: bytes) -> str:
        """Extract cleaned text from a PDF held in memory (blocking).

        Pages whose extraction fails are reported and skipped; the remaining
        pages are cleaned and joined with single spaces.

        Args:
            pdf_bytes: Raw bytes of the PDF document.

        Returns:
            The cleaned text of all readable pages.

        Raises:
            Exception: If the PDF cannot be read, or if the extracted text is
                shorter than ``min_text_length``. The original error is
                attached as ``__cause__``.
        """
        try:
            # Imported here, as in the original, so a missing PyPDF2 is
            # reported through the same error path as any other failure.
            from PyPDF2 import PdfReader

            pdf_reader = PdfReader(io.BytesIO(pdf_bytes))

            page_texts = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    text = page.extract_text()
                    if not text:
                        continue
                    cleaned_text = self._clean_page_text(text)
                    # Skip pages that clean down to nothing so they do not
                    # leave stray double spaces in the joined output.
                    if cleaned_text:
                        page_texts.append(cleaned_text)
                except Exception as e:
                    # Best effort: one unreadable page must not abort the
                    # whole document.
                    print(f"⚠️ Error extracting text from page {page_num}: {e}")
                    continue

            full_text = " ".join(page_texts)

            # Very short extractions are treated as extraction errors.
            if len(full_text) < self.min_text_length:
                raise Exception("Extracted text too short - possible extraction error")

            print(f"✅ Successfully extracted {len(full_text)} characters from PDF")
            return full_text

        except Exception as e:
            print(f"❌ PDF extraction error: {e}")
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise Exception(f"Failed to extract text from PDF: {str(e)}") from e

    async def extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
        """Extract cleaned text from PDF bytes without blocking the event loop.

        Args:
            pdf_bytes: Raw bytes of the PDF document.

        Returns:
            The cleaned text, as produced by ``_extract_text_sync``.

        Raises:
            Exception: Propagated from ``_extract_text_sync``.
        """
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the documented way to obtain it.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self.executor, self._extract_text_sync, pdf_bytes)

    def close(self) -> None:
        """Shut down the thread pool; safe to call more than once."""
        executor = getattr(self, 'executor', None)
        if executor is not None:
            # wait=False: this may run from __del__ on a worker thread (when a
            # running task held the last reference), where joining the current
            # thread would fail. Already-submitted work still completes.
            executor.shutdown(wait=False)

    def __del__(self):
        """Cleanup thread pool"""
        self.close()