# Generate-Knowledge-Graphs — src/document_processor.py
# Original upload: "First commit" by CultriX (blob e86199a, 4.37 kB).
import os
import json
from typing import List, Dict, Any
import pdfplumber
from docx import Document
from config.settings import Config
class DocumentProcessor:
    """Load, validate, and chunk documents (PDF, DOCX, TXT, JSON) for downstream processing.

    Size limits and chunking parameters come from ``Config``:
    ``MAX_FILE_SIZE_MB``, ``CHUNK_SIZE``, and ``CHUNK_OVERLAP``.
    """

    def __init__(self):
        self.config = Config()

    def validate_file_size(self, file_path: str) -> bool:
        """Return True if the file at *file_path* is within the configured size limit."""
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        return file_size_mb <= self.config.MAX_FILE_SIZE_MB

    def load_document(self, file_path: str) -> str:
        """Load a document's text content, dispatching on file extension.

        Args:
            file_path: Path to a ``.pdf``, ``.docx``, ``.txt``, or ``.json`` file.

        Returns:
            The extracted text (JSON is pretty-printed).

        Raises:
            ValueError: If the file exceeds the size limit or has an
                unsupported extension.
        """
        if not self.validate_file_size(file_path):
            raise ValueError(f"File size exceeds {self.config.MAX_FILE_SIZE_MB}MB limit")

        # Dispatch table instead of an if/elif chain; easy to extend.
        loaders = {
            '.pdf': self._load_pdf,
            '.docx': self._load_docx,
            '.txt': self._load_txt,
            '.json': self._load_json,
        }
        file_ext = os.path.splitext(file_path)[1].lower()
        loader = loaders.get(file_ext)
        if loader is None:
            raise ValueError(f"Unsupported file format: {file_ext}")
        return loader(file_path)

    def _load_pdf(self, file_path: str) -> str:
        """Extract text from every PDF page; pages with no extractable text are skipped."""
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # join() instead of quadratic `+=`; keep one trailing newline per page.
        return "".join(text + "\n" for text in page_texts if text)

    def _load_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file, one line per paragraph."""
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    def _load_txt(self, file_path: str) -> str:
        """Read a UTF-8 text file verbatim."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def _load_json(self, file_path: str) -> str:
        """Load JSON content and re-serialize it as pretty-printed text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return json.dumps(data, indent=2)

    def chunk_text(self, text: str) -> List[str]:
        """Split *text* into overlapping chunks, preferring sentence boundaries.

        Consecutive chunks overlap by ``CHUNK_OVERLAP`` characters. A chunk is
        cut at the latest sentence terminator ('.', '!', '?') inside the window,
        but only when that terminator lies past the window's midpoint, so chunks
        never become degenerately small. Chunks are stripped of surrounding
        whitespace; empty chunks are dropped.
        """
        chunk_size = self.config.CHUNK_SIZE
        overlap = self.config.CHUNK_OVERLAP

        if len(text) <= chunk_size:
            return [text]

        chunks: List[str] = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            if end < len(text):
                # Latest of '.', '!', '?' in the window (rfind returns -1 when
                # absent; max() picks the furthest real hit). The original code
                # early-outed on '.', so an early '.' beat a later '!'/'?'.
                sentence_end = max(
                    text.rfind('.', start, end),
                    text.rfind('!', start, end),
                    text.rfind('?', start, end),
                )
                if sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1  # include the terminator

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= len(text):
                # Done. Stopping here avoids a redundant trailing chunk that is
                # fully contained in the previous one.
                break

            # Step forward with overlap, but always make progress: when
            # CHUNK_OVERLAP >= the effective chunk length the original code
            # recomputed the same `start` forever (infinite loop).
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    def process_documents(self, file_paths: List[str], batch_mode: bool = False) -> List[Dict[str, Any]]:
        """Process documents and return one result dict per file attempted.

        Each result has keys ``file_path``, ``content``, ``chunks``, ``status``
        ('success' or 'error'), plus ``error`` (the message) on failure.
        When *batch_mode* is False, only the first file is processed — whether
        it succeeds or fails (the original only stopped on success).
        """
        results: List[Dict[str, Any]] = []
        for file_path in file_paths:
            try:
                content = self.load_document(file_path)
                results.append({
                    'file_path': file_path,
                    'content': content,
                    'chunks': self.chunk_text(content),
                    'status': 'success'
                })
            except Exception as e:
                # Best-effort batch processing: record the failure and move on.
                results.append({
                    'file_path': file_path,
                    'content': '',
                    'chunks': [],
                    'status': 'error',
                    'error': str(e)
                })
            if not batch_mode:
                break  # single-file mode: stop after the first file, success or not
        return results