import re
import os
import sys
from datetime import datetime
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from langchain_core.documents import Document

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from RAG_BOT.logger import logger


class DocumentProcessor:
    """
    Base class for processing documents (PDF, HTM, etc.) to extract text,
    metadata, and split content into chunks.
    """

    def _devanagari_to_ascii_digits(self, devanagari_string: str) -> str:
        """Converts Devanagari numerals in a string to ASCII digits."""
        mapping = {
            '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
            '५': '5', '६': '6', '७': '7', '८': '8', '९': '9'
        }
        return "".join(mapping.get(char, char) for char in devanagari_string)

    def extract_date_from_text(self, text):
        """
        Attempts to extract a date from the given text and returns it in YYYY-MM-DD format.

        Args:
            text (str): The text to search for a date.

        Returns:
            str or None: The extracted date in YYYY-MM-DD format if found, otherwise None.
        """
        # Specific date patterns, ordered to avoid ambiguity
        date_patterns = [
            (r"(\d{4})-(\d{2})-(\d{2})", "%Y-%m-%d"),                 # YYYY-MM-DD
            (r"([०-९]{4})-([०-९]{2})-([०-९]{2})", "%Y-%m-%d"),         # YYYY-MM-DD (Devanagari)
            (r"(\d{2})/(\d{2})/(\d{4})", "%d/%m/%Y"),                 # DD/MM/YYYY
            (r"([०-९]{2})/([०-९]{2})/([०-९]{4})", "%d/%m/%Y"),         # DD/MM/YYYY (Devanagari)
            (r"(\d{2})\.(\d{2})\.(\d{4})", "%d.%m.%Y"),               # DD.MM.YYYY
            (r"([०-९]{2})\.([०-९]{2})\.([०-९]{4})", "%d.%m.%Y"),       # DD.MM.YYYY (Devanagari)
            (r"(\d{1,2})\.(\d{1,2})\.(\d{4})", "%d.%m.%Y"),           # D.M.YYYY, DD.M.YYYY, D.MM.YYYY
            (r"([०-९]{1,2})\.([०-९]{1,2})\.([०-९]{4})", "%d.%m.%Y"),   # D.M.YYYY (Devanagari)
            (r"(\d{1,2})/(\d{1,2})/(\d{4})", "%d/%m/%Y"),             # D/M/YYYY, DD/M/YYYY, D/MM/YYYY
            (r"([०-९]{1,2})/([०-९]{1,2})/([०-९]{4})", "%d/%m/%Y"),     # D/M/YYYY (Devanagari)
            (r"(\d{1,2})-(\d{1,2})-(\d{4})", "%d-%m-%Y"),             # D-M-YYYY, DD-M-YYYY, D-MM-YYYY
            (r"([०-९]{1,2})-([०-९]{1,2})-([०-९]{4})", "%d-%m-%Y"),     # D-M-YYYY (Devanagari)
            (r"(\d{2})\.(\d{2})\.(\d{2})", "%d.%m.%y"),               # DD.MM.YY
            (r"([०-९]{2})\.([०-९]{2})\.([०-९]{2})", "%d.%m.%y"),       # DD.MM.YY (Devanagari)
            (r"(\d{2})/(\d{2})/(\d{2})", "%d/%m/%y"),                 # DD/MM/YY
            (r"([०-९]{2})/([०-९]{2})/([०-९]{2})", "%d/%m/%y"),         # DD/MM/YY (Devanagari)
            (r"(\d{2})-(\d{2})-(\d{2})", "%d-%m-%y"),                 # DD-MM-YY
            (r"([०-९]{2})-([०-९]{2})-([०-९]{2})", "%d-%m-%y"),         # DD-MM-YY (Devanagari)
            (r"(\d{1,2})\.(\d{1,2})\.(\d{2})", "%d.%m.%y"),           # D.M.YY, DD.M.YY, D.MM.YY
            (r"([०-९]{1,2})\.([०-९]{1,2})\.([०-९]{2})", "%d.%m.%y"),   # D.M.YY (Devanagari)
            (r"(\d{1,2})/(\d{1,2})/(\d{2})", "%d/%m/%y"),             # D/M/YY, DD/M/YY, D/MM/YY
            (r"([०-९]{1,2})/([०-९]{1,2})/([०-९]{2})", "%d/%m/%y"),     # D/M/YY (Devanagari)
            (r"(\d{1,2})-(\d{1,2})-(\d{2})", "%d-%m-%y"),             # D-M-YY, DD-M-YY, D-MM-YY
            (r"([०-९]{1,2})-([०-९]{1,2})-([०-९]{2})", "%d-%m-%y"),     # D-M-YY (Devanagari)
            # Add other common formats if needed (e.g., "January 21, 1969")
        ]

        for pattern, date_format in date_patterns:
            match = re.search(pattern, text)
            if match:
                matched_date_str = match.group(0)
                ascii_date_str = self._devanagari_to_ascii_digits(matched_date_str)
                try:
                    # Attempt to parse the date using the specified format
                    date_obj = datetime.strptime(ascii_date_str, date_format)
                    return date_obj.strftime("%Y-%m-%d")
                except ValueError as e:
                    logger.warning(
                        f"Date format '{date_format}' matched for '{matched_date_str}' "
                        f"(converted to '{ascii_date_str}'), but couldn't parse. Error: {e}"
                    )
                    # Continue searching other patterns
                except Exception as e:
                    logger.error(
                        f"Unexpected error parsing date '{matched_date_str}' "
                        f"(converted to '{ascii_date_str}') with format '{date_format}': {e}"
                    )
                    # Continue searching other patterns

        logger.info(f"No date pattern matched in text: '{text[:100]}...'")
        return None  # Return None if no pattern matched or parsing failed

    def get_murli_type(self, text):
        """
        Determines if the text indicates an 'Avyakt' Murli.

        Args:
            text (str): The text to check.

        Returns:
            bool: True if 'avyakt' or 'अव्यक्त' is found, False otherwise.
        """
        # Check for both Roman script (case-insensitive) and Devanagari script
        if 'avyakt' in text.lower() or 'अव्यक्त' in text:
            return True
        return False

    def split_text(self, documents, chunk_size=1000, chunk_overlap=200):
        """Splits the documents into chunks using RecursiveCharacterTextSplitter."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.split_documents(documents)
        logger.info(f"Split documents into {len(texts)} chunks using RecursiveCharacterTextSplitter")
        return texts

    def semantic_chunking(self, documents, model_name="sentence-transformers/all-MiniLM-L6-v2",
                          chunk_size=1000, chunk_overlap=0):
        """
        Performs semantic chunking on the input documents using a sentence transformer model.

        Args:
            documents (list): A list of LangChain Document objects.
            model_name (str): The name of the sentence transformer model to use.
            chunk_size (int): The desired maximum size of each chunk in tokens.
            chunk_overlap (int): The number of overlapping tokens between consecutive chunks.

        Returns:
            list: A list of LangChain Document objects representing the semantically chunked text.
        """
        logger.info(f"Performing semantic chunking using model: {model_name} with chunk size: {chunk_size} tokens")
        try:
            # Initialize the sentence transformer text splitter
            splitter = SentenceTransformersTokenTextSplitter(
                model_name=model_name,
                chunk_overlap=chunk_overlap,
                tokens_per_chunk=chunk_size,
            )
            # Split the documents into semantically meaningful chunks
            chunks = splitter.split_documents(documents)
            logger.info(f"Split documents into {len(chunks)} chunks using semantic chunking")
            return chunks
        except Exception as e:
            logger.error(f"Error during semantic chunking: {e}")
            # Consider re-raising or returning an empty list based on desired behavior
            # raise  # Re-raise the exception
            return []  # Return empty list to indicate failure but allow continuation