import re
import os
import sys
from datetime import datetime
from langchain_text_splitters import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain_core.documents import Document
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)
from RAG_BOT.logger import logger


class DocumentProcessor:
    """
    Base class for processing documents (PDF, HTM, etc.) to extract text,
    metadata, and split content into chunks.
    """

    def _devanagari_to_ascii_digits(self, devanagari_string: str) -> str:
        """Converts Devanagari numerals in a string to ASCII digits."""
        mapping = {
            '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
            '५': '5', '६': '6', '७': '7', '८': '8', '९': '9'
        }
        return "".join(mapping.get(char, char) for char in devanagari_string)

    def extract_date_from_text(self, text):
        """
        Attempts to extract a date from the given text and returns it in YYYY-MM-DD format.

        Args:
            text (str): The text to search for a date.

        Returns:
            str or None: The extracted date in YYYY-MM-DD format if found, otherwise None.
        """
        # Specific date patterns to avoid ambiguity
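        # Patterns are tried in order and the first match that also parses with its
        # strptime format wins, so the more specific forms (4-digit years) come first.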
        date_patterns = [
            (r"(\d{4})-(\d{2})-(\d{2})", "%Y-%m-%d"),  # YYYY-MM-DD
            (r"([०-९]{4})-([०-९]{2})-([०-९]{2})", "%Y-%m-%d"),  # YYYY-MM-DD (Devanagari)
            (r"(\d{2})/(\d{2})/(\d{4})", "%d/%m/%Y"),  # DD/MM/YYYY
            (r"([०-९]{2})/([०-९]{2})/([०-९]{4})", "%d/%m/%Y"),  # DD/MM/YYYY (Devanagari)
            (r"(\d{2})\.(\d{2})\.(\d{4})", "%d.%m.%Y"),  # DD.MM.YYYY
            (r"([०-९]{2})\.([०-९]{2})\.([०-९]{4})", "%d.%m.%Y"),  # DD.MM.YYYY (Devanagari)
            (r"(\d{1,2})\.(\d{1,2})\.(\d{4})", "%d.%m.%Y"),  # D.M.YYYY, DD.M.YYYY, D.MM.YYYY
            (r"([०-९]{1,2})\.([०-९]{1,2})\.([०-९]{4})", "%d.%m.%Y"),  # D.M.YYYY (Devanagari)
            (r"(\d{1,2})/(\d{1,2})/(\d{4})", "%d/%m/%Y"),  # D/M/YYYY, DD/M/YYYY, D/MM/YYYY
            (r"([०-९]{1,2})/([०-९]{1,2})/([०-९]{4})", "%d/%m/%Y"),  # D/M/YYYY (Devanagari)
            (r"(\d{1,2})-(\d{1,2})-(\d{4})", "%d-%m-%Y"),  # D-M-YYYY, DD-M-YYYY, D-MM-YYYY
            (r"([०-९]{1,2})-([०-९]{1,2})-([०-९]{4})", "%d-%m-%Y"),  # D-M-YYYY (Devanagari)
            (r"(\d{2})\.(\d{2})\.(\d{2})", "%d.%m.%y"),  # DD.MM.YY
            (r"([०-९]{2})\.([०-९]{2})\.([०-९]{2})", "%d.%m.%y"),  # DD.MM.YY (Devanagari)
            (r"(\d{2})/(\d{2})/(\d{2})", "%d/%m/%y"),  # DD/MM/YY
            (r"([०-९]{2})/([०-९]{2})/([०-९]{2})", "%d/%m/%y"),  # DD/MM/YY (Devanagari)
            (r"(\d{2})-(\d{2})-(\d{2})", "%d-%m-%y"),  # DD-MM-YY
            (r"([०-९]{2})-([०-९]{2})-([०-९]{2})", "%d-%m-%y"),  # DD-MM-YY (Devanagari)
            (r"(\d{1,2})\.(\d{1,2})\.(\d{2})", "%d.%m.%y"),  # D.M.YY, DD.M.YY, D.MM.YY
            (r"([०-९]{1,2})\.([०-९]{1,2})\.([०-९]{2})", "%d.%m.%y"),  # D.M.YY (Devanagari)
            (r"(\d{1,2})/(\d{1,2})/(\d{2})", "%d/%m/%y"),  # D/M/YY, DD/M/YY, D/MM/YY
            (r"([०-९]{1,2})/([०-९]{1,2})/([०-९]{2})", "%d/%m/%y"),  # D/M/YY (Devanagari)
            (r"(\d{1,2})-(\d{1,2})-(\d{2})", "%d-%m-%y"),  # D-M-YY, DD-M-YY, D-MM-YY
            (r"([०-९]{1,2})-([०-९]{1,2})-([०-९]{2})", "%d-%m-%y"),  # D-M-YY (Devanagari)
            # Add other common formats if needed (e.g., "January 21, 1969")
        ]
        for pattern, date_format in date_patterns:
            match = re.search(pattern, text)
            if match:
                matched_date_str = match.group(0)
                ascii_date_str = self._devanagari_to_ascii_digits(matched_date_str)
                try:
                    # Attempt to parse the date using the specified format
                    date_obj = datetime.strptime(ascii_date_str, date_format)
                    return date_obj.strftime("%Y-%m-%d")
                except ValueError as e:
                    logger.warning(f"Date format '{date_format}' matched for '{matched_date_str}' (converted to '{ascii_date_str}'), but it could not be parsed. Error: {e}")
                    # Continue searching other patterns
                except Exception as e:
                    logger.error(f"Unexpected error parsing date '{matched_date_str}' (converted to '{ascii_date_str}') with format '{date_format}': {e}")
                    # Continue searching other patterns

        logger.info(f"No date pattern matched in text: '{text[:100]}...'")
        return None  # Return None if no pattern matched or parsing failed

    def get_murli_type(self, text):
        """
        Determines if the text indicates an 'Avyakt' Murli.

        Args:
            text (str): The text to check.

        Returns:
            bool: True if 'avyakt' or 'अव्यक्त' is found, False otherwise.
        """
        # Check for both Roman script (case-insensitive) and Devanagari script
        if 'avyakt' in text.lower() or 'अव्यक्त' in text:
            return True
        return False

    def split_text(self, documents, chunk_size=1000, chunk_overlap=200):
        """Splits the documents into chunks using RecursiveCharacterTextSplitter."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.split_documents(documents)
        logger.info(f"Split documents into {len(texts)} chunks using RecursiveCharacterTextSplitter")
        return texts

    def semantic_chunking(self, documents, model_name="sentence-transformers/all-MiniLM-L6-v2",
                          chunk_size=1000, chunk_overlap=0):
        """
        Performs semantic chunking on the input documents using a sentence transformer model.

        Args:
            documents (list): A list of LangChain Document objects.
            model_name (str): The name of the sentence transformer model to use.
            chunk_size (int): The desired maximum size of each chunk in tokens.
            chunk_overlap (int): The number of overlapping tokens between consecutive chunks.

        Returns:
            list: A list of LangChain Document objects representing the semantically chunked text.
        """
        logger.info(f"Performing semantic chunking using model: {model_name} with chunk size: {chunk_size} tokens")
        # Initialize the sentence transformer text splitter
        try:
            splitter = SentenceTransformersTokenTextSplitter(model_name=model_name, chunk_overlap=chunk_overlap,
                                                             tokens_per_chunk=chunk_size)
            # Split the documents into semantically meaningful chunks
            chunks = splitter.split_documents(documents)
            logger.info(f"Split documents into {len(chunks)} chunks using semantic chunking")
            return chunks
        except Exception as e:
            logger.error(f"Error during semantic chunking: {e}")
            # Consider re-raising or returning an empty list depending on desired behavior
            # raise  # Re-raise the exception
            return []  # Return an empty list to indicate failure but allow continuation
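

if __name__ == "__main__":
    # Minimal usage sketch. The sample text and metadata below are illustrative
    # assumptions, not taken from any real document in the project.
    processor = DocumentProcessor()

    sample_text = "Avyakt Murli 21.01.1969 - sample content for demonstration."
    print(processor.extract_date_from_text(sample_text))  # -> 1969-01-21
    print(processor.get_murli_type(sample_text))          # -> True

    docs = [Document(page_content=sample_text * 50, metadata={"source": "sample.htm"})]
    chunks = processor.split_text(docs, chunk_size=500, chunk_overlap=50)
    print(f"Produced {len(chunks)} character-based chunks")

    # semantic_chunking is left commented out because it downloads the
    # sentence-transformers model on first use:
    # semantic_chunks = processor.semantic_chunking(docs, chunk_size=256)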