File size: 4,369 Bytes
e86199a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import os
import json
from typing import List, Dict, Any
import pdfplumber
from docx import Document
from config.settings import Config
class DocumentProcessor:
    """Load PDF/DOCX/TXT/JSON documents, validate their size, and split
    their text into overlapping chunks for downstream processing."""

    def __init__(self):
        # Config supplies MAX_FILE_SIZE_MB, CHUNK_SIZE and CHUNK_OVERLAP.
        self.config = Config()

    def validate_file_size(self, file_path: str) -> bool:
        """Return True if the file is within the configured size limit (in MB)."""
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        return file_size_mb <= self.config.MAX_FILE_SIZE_MB

    def load_document(self, file_path: str) -> str:
        """Load a document's text content, dispatching on file extension.

        Supported extensions: .pdf, .docx, .txt, .json (case-insensitive).

        Raises:
            ValueError: if the file exceeds the size limit or has an
                unsupported extension.
        """
        if not self.validate_file_size(file_path):
            raise ValueError(f"File size exceeds {self.config.MAX_FILE_SIZE_MB}MB limit")
        file_ext = os.path.splitext(file_path)[1].lower()
        # Dispatch table instead of an if/elif chain.
        loaders = {
            '.pdf': self._load_pdf,
            '.docx': self._load_docx,
            '.txt': self._load_txt,
            '.json': self._load_json,
        }
        try:
            loader = loaders[file_ext]
        except KeyError:
            raise ValueError(f"Unsupported file format: {file_ext}") from None
        return loader(file_path)

    def _load_pdf(self, file_path: str) -> str:
        """Extract text from every page of a PDF, one newline per page."""
        with pdfplumber.open(file_path) as pdf:
            # join instead of += — avoids quadratic string concatenation;
            # pages with no extractable text are skipped, as before.
            return "".join(
                page_text + "\n"
                for page_text in (page.extract_text() for page in pdf.pages)
                if page_text
            )

    def _load_docx(self, file_path: str) -> str:
        """Extract paragraph text from a DOCX file, one newline per paragraph."""
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    def _load_txt(self, file_path: str) -> str:
        """Read a UTF-8 text file verbatim."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def _load_json(self, file_path: str) -> str:
        """Load JSON content and re-serialize it as pretty-printed text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return json.dumps(data, indent=2)

    def chunk_text(self, text: str) -> List[str]:
        """Split *text* into overlapping chunks of at most CHUNK_SIZE chars.

        Windows are cut back to the latest sentence-ending punctuation
        ('.', '!' or '?') when one appears past the window's midpoint;
        consecutive chunks overlap by CHUNK_OVERLAP characters.
        Chunks are stripped; all-whitespace chunks are dropped.
        """
        if len(text) <= self.config.CHUNK_SIZE:
            return [text]
        chunks: List[str] = []
        start = 0
        while start < len(text):
            end = min(start + self.config.CHUNK_SIZE, len(text))
            if end < len(text):
                # Bug fix: take the latest of '.', '!', '?' — the original
                # preferred an earlier '.' over a later '!' or '?' because it
                # only fell back when '.' was absent from the whole window.
                boundary = max(text.rfind(c, start, end) for c in '.!?')
                # rfind returns -1 when absent; the midpoint guard also
                # rejects -1, matching the original's explicit check.
                if boundary > start + self.config.CHUNK_SIZE // 2:
                    end = boundary + 1
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            if end >= len(text):
                # Bug fix: the original stepped back by CHUNK_OVERLAP even
                # after consuming the final characters, emitting a duplicate
                # tail chunk of the last CHUNK_OVERLAP characters.
                break
            # Bug fix: guarantee forward progress — with a large
            # CHUNK_OVERLAP (>= CHUNK_SIZE // 2), a sentence-boundary cut
            # could make end - CHUNK_OVERLAP <= start and loop forever.
            start = max(end - self.config.CHUNK_OVERLAP, start + 1)
        return chunks

    def process_documents(self, file_paths: List[str], batch_mode: bool = False) -> List[Dict[str, Any]]:
        """Load and chunk each file, collecting a result record per file.

        Each record has 'file_path', 'content', 'chunks' and 'status'
        ('success', or 'error' with an additional 'error' message).
        When batch_mode is False, processing stops after the first
        successful file; failures do not stop iteration.
        """
        results: List[Dict[str, Any]] = []
        for file_path in file_paths:
            try:
                # Keep the try body minimal: only the calls that can raise.
                content = self.load_document(file_path)
                chunks = self.chunk_text(content)
            except Exception as e:
                # Best-effort batch: record the failure and keep going.
                results.append({
                    'file_path': file_path,
                    'content': '',
                    'chunks': [],
                    'status': 'error',
                    'error': str(e),
                })
                continue
            results.append({
                'file_path': file_path,
                'content': content,
                'chunks': chunks,
                'status': 'success',
            })
            if not batch_mode:
                break  # Process only one file if not in batch mode
        return results
|