# Generate-Knowledge-Graphs — src/document_processor.py
# Original upload: "First commit" by CultriX (blob e86199a, 4.37 kB).
import os
import json
from typing import List, Dict, Any
import pdfplumber
from docx import Document
from config.settings import Config
class DocumentProcessor:
    """Load, validate, and chunk documents (PDF, DOCX, TXT, JSON) for downstream processing.

    Size limits and chunking parameters come from ``Config``:
    ``MAX_FILE_SIZE_MB``, ``CHUNK_SIZE``, and ``CHUNK_OVERLAP``.
    """

    def __init__(self):
        self.config = Config()

    def validate_file_size(self, file_path: str) -> bool:
        """Return True if the file at *file_path* is within the configured size limit."""
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        return file_size_mb <= self.config.MAX_FILE_SIZE_MB

    def load_document(self, file_path: str) -> str:
        """Load a document's text content, dispatching on file extension.

        Args:
            file_path: Path to a ``.pdf``, ``.docx``, ``.txt``, or ``.json`` file.

        Returns:
            The extracted text (JSON is pretty-printed).

        Raises:
            ValueError: If the file exceeds the size limit or has an
                unsupported extension.
        """
        if not self.validate_file_size(file_path):
            raise ValueError(f"File size exceeds {self.config.MAX_FILE_SIZE_MB}MB limit")

        # Dispatch table instead of an if/elif chain; easy to extend.
        loaders = {
            '.pdf': self._load_pdf,
            '.docx': self._load_docx,
            '.txt': self._load_txt,
            '.json': self._load_json,
        }
        file_ext = os.path.splitext(file_path)[1].lower()
        loader = loaders.get(file_ext)
        if loader is None:
            raise ValueError(f"Unsupported file format: {file_ext}")
        return loader(file_path)

    def _load_pdf(self, file_path: str) -> str:
        """Extract text from every PDF page; pages with no extractable text are skipped."""
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # join() instead of quadratic `+=`; keep one trailing newline per page.
        return "".join(text + "\n" for text in page_texts if text)

    def _load_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file, one line per paragraph."""
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    def _load_txt(self, file_path: str) -> str:
        """Read a UTF-8 text file verbatim."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def _load_json(self, file_path: str) -> str:
        """Load JSON content and re-serialize it as pretty-printed text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return json.dumps(data, indent=2)

    def chunk_text(self, text: str) -> List[str]:
        """Split *text* into overlapping chunks, preferring sentence boundaries.

        Consecutive chunks overlap by ``CHUNK_OVERLAP`` characters. A chunk is
        cut at the latest sentence terminator ('.', '!', '?') inside the window,
        but only when that terminator lies past the window's midpoint, so chunks
        never become degenerately small. Chunks are stripped of surrounding
        whitespace; empty chunks are dropped.
        """
        chunk_size = self.config.CHUNK_SIZE
        overlap = self.config.CHUNK_OVERLAP

        if len(text) <= chunk_size:
            return [text]

        chunks: List[str] = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            if end < len(text):
                # Latest of '.', '!', '?' in the window (rfind returns -1 when
                # absent; max() picks the furthest real hit). The original code
                # early-outed on '.', so an early '.' beat a later '!'/'?'.
                sentence_end = max(
                    text.rfind('.', start, end),
                    text.rfind('!', start, end),
                    text.rfind('?', start, end),
                )
                if sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1  # include the terminator

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= len(text):
                # Done. Stopping here avoids a redundant trailing chunk that is
                # fully contained in the previous one.
                break

            # Step forward with overlap, but always make progress: when
            # CHUNK_OVERLAP >= the effective chunk length the original code
            # recomputed the same `start` forever (infinite loop).
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    def process_documents(self, file_paths: List[str], batch_mode: bool = False) -> List[Dict[str, Any]]:
        """Process documents and return one result dict per file attempted.

        Each result has keys ``file_path``, ``content``, ``chunks``, ``status``
        ('success' or 'error'), plus ``error`` (the message) on failure.
        When *batch_mode* is False, only the first file is processed — whether
        it succeeds or fails (the original only stopped on success).
        """
        results: List[Dict[str, Any]] = []
        for file_path in file_paths:
            try:
                content = self.load_document(file_path)
                results.append({
                    'file_path': file_path,
                    'content': content,
                    'chunks': self.chunk_text(content),
                    'status': 'success'
                })
            except Exception as e:
                # Best-effort batch processing: record the failure and move on.
                results.append({
                    'file_path': file_path,
                    'content': '',
                    'chunks': [],
                    'status': 'error',
                    'error': str(e)
                })
            if not batch_mode:
                break  # single-file mode: stop after the first file, success or not
        return results