Spaces:
Building
Building
import re
import os
import sys
from datetime import datetime

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_core.documents import Document

# Add the project root (the parent of this file's directory) to the Python
# path. This must run before the RAG_BOT import below so that the absolute
# package import resolves.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from RAG_BOT.logger import logger
class DocumentProcessor:
    """
    Base class for processing documents (PDF, HTM, etc.) to extract text,
    metadata, and split content into chunks.
    """

    # Translation table mapping the Devanagari digits ०-९ to ASCII 0-9.
    _DEVANAGARI_TO_ASCII = str.maketrans("०१२३४५६७८९", "0123456789")

    # (regex, strptime format) pairs in priority order: patterns are tried
    # top to bottom, so stricter/longer forms win over looser ones.
    # In a str pattern `\d` matches any Unicode decimal digit, so each entry
    # also matches dates written with Devanagari numerals.
    _DATE_PATTERN_SPECS = (
        (r"(\d{4})-(\d{2})-(\d{2})", "%Y-%m-%d"),          # YYYY-MM-DD
        (r"(\d{2})/(\d{2})/(\d{4})", "%d/%m/%Y"),          # DD/MM/YYYY
        (r"(\d{2})\.(\d{2})\.(\d{4})", "%d.%m.%Y"),        # DD.MM.YYYY
        (r"(\d{1,2})\.(\d{1,2})\.(\d{4})", "%d.%m.%Y"),    # D.M.YYYY, DD.M.YYYY, D.MM.YYYY
        (r"(\d{1,2})/(\d{1,2})/(\d{4})", "%d/%m/%Y"),      # D/M/YYYY, DD/M/YYYY, D/MM/YYYY
        (r"(\d{1,2})-(\d{1,2})-(\d{4})", "%d-%m-%Y"),      # D-M-YYYY, DD-M-YYYY, D-MM-YYYY
        (r"(\d{2})\.(\d{2})\.(\d{2})", "%d.%m.%y"),        # DD.MM.YY
        (r"(\d{2})/(\d{2})/(\d{2})", "%d/%m/%y"),          # DD/MM/YY
        (r"(\d{2})-(\d{2})-(\d{2})", "%d-%m-%y"),          # DD-MM-YY
        (r"(\d{1,2})\.(\d{1,2})\.(\d{2})", "%d.%m.%y"),    # D.M.YY, DD.M.YY, D.MM.YY
        (r"(\d{1,2})/(\d{1,2})/(\d{2})", "%d/%m/%y"),      # D/M/YY, DD/M/YY, D/MM/YY
        (r"(\d{1,2})-(\d{1,2})-(\d{2})", "%d-%m-%y"),      # D-M-YY, DD-M-YY, D-MM-YY
        # Add other common formats if needed (e.g., "January 21, 1969")
    )

    # Compiled once at class creation. The lookarounds require that a match is
    # not glued to further digits, so a "date" is never cut out of the middle
    # of a longer number (e.g. "2021-01-21" inside "12021-01-21").
    _DATE_PATTERNS = tuple(
        (re.compile(r"(?<!\d)" + pattern + r"(?!\d)"), date_format)
        for pattern, date_format in _DATE_PATTERN_SPECS
    )

    def _devanagari_to_ascii_digits(self, devanagari_string: str) -> str:
        """Converts Devanagari numerals in a string to ASCII digits.

        Characters other than the ten Devanagari digits are left unchanged.
        """
        return devanagari_string.translate(self._DEVANAGARI_TO_ASCII)

    def extract_date_from_text(self, text):
        """
        Attempts to extract a date from the given text and returns it in YYYY-MM-DD format.

        Patterns are tried in the priority order of ``_DATE_PATTERNS``. For each
        pattern every candidate in the text is tried from left to right, so an
        unparseable candidate (e.g. "99.99.2020") does not hide a valid date
        that appears later in the text.

        Two-digit years are interpreted by ``datetime.strptime`` ("%y"), which
        maps 69-99 to 1969-1999 and 00-68 to 2000-2068.

        Args:
            text (str): The text to search for a date. ``None`` is treated as
                empty text.
        Returns:
            str or None: The extracted date in YYYY-MM-DD format if found, otherwise None.
        """
        # Treat None like empty text instead of raising TypeError in the regex search.
        text = text or ""

        for pattern, date_format in self._DATE_PATTERNS:
            match = pattern.search(text)
            while match:
                matched_date_str = match.group(0)
                ascii_date_str = self._devanagari_to_ascii_digits(matched_date_str)
                try:
                    # Attempt to parse the date using the specified format
                    date_obj = datetime.strptime(ascii_date_str, date_format)
                    return date_obj.strftime("%Y-%m-%d")
                except ValueError as e:
                    logger.warning(f"Date format '{date_format}' matched for '{matched_date_str}' (converted to '{ascii_date_str}'), but couldn't parse. Error: {e}")
                except Exception as e:
                    logger.error(f"Unexpected error parsing date '{matched_date_str}' (converted to '{ascii_date_str}') with format '{date_format}': {e}")
                # Resume one character past the failed candidate's start so
                # that overlapping candidates are considered as well.
                match = pattern.search(text, match.start() + 1)

        logger.info(f"No date pattern matched in text: '{text[:100]}...'")
        return None  # No pattern matched, or every candidate failed to parse

    def get_murli_type(self, text):
        """
        Determines if the text indicates an 'Avyakt' Murli.

        Args:
            text (str): The text to check. ``None`` or empty text yields False.
        Returns:
            bool: True if 'avyakt' or 'अव्यक्त' is found, False otherwise.
        """
        if not text:
            return False
        # Roman script is matched case-insensitively; Devanagari has no case.
        return 'avyakt' in text.lower() or 'अव्यक्त' in text

    def split_text(self, documents, chunk_size=1000, chunk_overlap=200):
        """
        Splits the documents into chunks using RecursiveCharacterTextSplitter.

        Args:
            documents (list): A list of LangChain Document objects.
            chunk_size (int): Passed to the splitter as ``chunk_size``.
            chunk_overlap (int): Passed to the splitter as ``chunk_overlap``.
        Returns:
            list: The chunks returned by the splitter's ``split_documents``.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.split_documents(documents)
        logger.info(f"Split documents into {len(texts)} chunks using RecursiveCharacterTextSplitter")
        return texts

    def semantic_chunking(self, documents, model_name="sentence-transformers/all-MiniLM-L6-v2",
                          chunk_size=1000, chunk_overlap=0):
        """
        Performs semantic chunking on the input documents using a sentence transformer model.

        Args:
            documents (list): A list of LangChain Document objects.
            model_name (str): The name of the sentence transformer model to use.
            chunk_size (int): The desired maximum size of each chunk in tokens.
            chunk_overlap (int): Overlap between consecutive chunks, forwarded
                to the splitter (previously this argument was ignored and 0
                was always used).
        Returns:
            list: A list of LangChain Document objects representing the semantically chunked text.
                An empty list is returned if the splitter raises any exception.
        """
        logger.info(f"Performing semantic chunking using model: {model_name} with chunk size : {chunk_size} tokens")
        try:
            # NOTE(review): the splitter is understood to reject a
            # tokens_per_chunk larger than the model's maximum sequence length;
            # confirm the default chunk_size is valid for the default model.
            splitter = SentenceTransformersTokenTextSplitter(
                model_name=model_name,
                chunk_overlap=chunk_overlap,
                tokens_per_chunk=chunk_size,
            )
            chunks = splitter.split_documents(documents)
            logger.info(f"Split documents into {len(chunks)} chunks using semantic chunking")
            return chunks
        except Exception as e:
            logger.error(f"Error during semantic chunking: {e}")
            # Deliberate best-effort: signal failure with an empty list so the
            # caller can continue instead of crashing.
            return []