# RAG_BOT/htm_processor.py

import os
import re
import sys
from bs4 import BeautifulSoup
from langchain_core.documents import Document
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)
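# (This makes `python RAG_BOT/htm_processor.py` work when run directly,
# since the RAG_BOT package then resolves from the project root.)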
from RAG_BOT.logger import logger
from RAG_BOT.document_processor import DocumentProcessor # Import the base class


class HtmProcessor(DocumentProcessor):
"""
Processes HTM files to extract text and metadata, inheriting common
functionality from DocumentProcessor.
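
    Typical usage (a minimal sketch; the file path is hypothetical):

        processor = HtmProcessor()
        doc = processor.load_htm("murlis/sample_murli.htm")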
"""
    def load_htm(self, htm_path):
        """
        Loads an HTM file, extracts text content and metadata.

        Args:
            htm_path (str): Path to the HTM file.

        Returns:
            Document or None: A Document object with extracted content and
            metadata, or None if processing fails.
        """
        try:
            # Try reading with windows-1252 first, as it's a common encoding for
            # older HTM files. Note that most bytes decode successfully under
            # windows-1252, so the utf-8 fallback below only triggers for the few
            # code points this encoding leaves undefined.
            with open(htm_path, 'r', encoding='windows-1252') as f:
                html_content = f.read()
        except UnicodeDecodeError:
            # Fall back to utf-8 if windows-1252 fails
            try:
                with open(htm_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
            except Exception as e:
                logger.error(f"Failed to read HTM file with utf-8 fallback: {htm_path}. Error: {e}")
                return None
        except FileNotFoundError:
            logger.error(f"HTM file not found: {htm_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to read HTM file: {htm_path}. Error: {e}")
            return None
        try:
            soup = BeautifulSoup(html_content, 'lxml')  # Use lxml parser for robustness
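            # Note: the 'lxml' parser requires the third-party lxml package;
            # BeautifulSoup raises bs4.FeatureNotFound if it is missing. The
            # stdlib 'html.parser' is a slower, dependency-free alternative.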
            # Extract text from the body tag, handling cases where body might be missing
            body_tag = soup.find('body')
            body_text = body_tag.get_text(separator='\n', strip=True) if body_tag else ''
            # Clean the text: collapse runs of whitespace (including newlines) into a single space
            body_text = re.sub(r'\s+', ' ', body_text).strip()
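            # e.g. "Line one\n\n  Line two" becomes "Line one Line two"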
            # Extract metadata using base class methods.
            # Check a reasonable portion of the text for metadata clues.
            check_text = body_text[:500]
            date = self.extract_date_from_text(check_text)
            is_avyakt = self.get_murli_type(check_text)
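            # Both helpers are inherited from DocumentProcessor; extract_date_from_text
            # is assumed to return a date string (or None) and get_murli_type a boolean
            # (True for Avyakt murlis), matching how their results are used below.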
            # Extract filename from the path for the source metadata
            filename = os.path.basename(htm_path)
            metadata = {
                "source": filename,    # Use filename instead of full path
                "full_path": htm_path  # Optionally keep the full path if needed elsewhere
            }
            if date:
                metadata["date"] = date
            if is_avyakt is True:  # Explicit check for True
                metadata["is_avyakt"] = True
            # Content preview logging is kept at debug level and disabled by
            # default; uncomment if needed:
            # logger.debug(f"Extracted content from HTM {filename}: {body_text[:200]}...")
logger.info(f"Processed HTM: {filename}, Date: {date}, Is Avyakt: {is_avyakt}")
return Document(page_content=body_text, metadata=metadata)
except Exception as e:
# Catch potential errors during parsing or metadata extraction
logger.error(f"Failed to parse HTM or extract data from {htm_path}. Error: {e}")
return None
    def load_directory_htm(self, directory_path):
        """
        Loads all HTM files from a directory and processes them.

        Args:
            directory_path (str): Path to the directory containing HTM files.

        Returns:
            list[Document]: A list of Document objects from the processed HTM files.
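
        Note:
            Only the top level of the directory is scanned; subdirectories
            are not traversed.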
"""
all_documents = []
if not os.path.isdir(directory_path):
logger.error(f"Directory not found: {directory_path}")
return []
logger.info(f"Scanning directory for HTM files: {directory_path}")
file_count = 0
processed_count = 0
for filename in os.listdir(directory_path):
# Check for both .htm and .html extensions, case-insensitive
if filename.lower().endswith((".htm", ".html")):
file_count += 1
htm_path = os.path.join(directory_path, filename)
logger.debug(f"Attempting to load HTM file: {htm_path}")
document = self.load_htm(htm_path) # Use self.load_htm
if document:
all_documents.append(document)
processed_count += 1
else:
logger.warning(f"Skipped processing file: {filename}")
logger.info(f"Found {file_count} HTM/HTML files. Successfully processed and loaded {processed_count} documents from {directory_path}")
return all_documents
if __name__ == "__main__":
    # Example usage of the HtmProcessor class
    # Use a relative path for better portability
    script_dir = os.path.dirname(__file__)
    TEST_DATA_DIR = os.path.join(script_dir, 'tests', 'data', 'hindi')
    # Instantiate the processor
    htm_processor = HtmProcessor()
    # Load documents using the instance method
    print(f"Loading HTM documents from: {TEST_DATA_DIR}")
    htm_documents = htm_processor.load_directory_htm(TEST_DATA_DIR)
    if htm_documents:
        print(f"\nSuccessfully loaded {len(htm_documents)} HTM documents.")
        # Optional: Split the documents using inherited methods
        # print("Attempting to split documents...")
        # chunks = htm_processor.split_text(htm_documents)  # Example using basic split
        # chunks = htm_processor.semantic_chunking(htm_documents)  # Example using semantic
        # if chunks:
        #     print(f"Total chunks created: {len(chunks)}")
        # else:
        #     print("Splitting resulted in no chunks.")
        # Print metadata of first few docs to verify loading
        print("\n--- Sample Document Metadata and Content ---")
        for i in range(min(5, len(htm_documents))):
            print(f"\nDocument {i}:")
            print(f"  Metadata: {htm_documents[i].metadata}")
            # Limit content preview length
            content_preview = htm_documents[i].page_content[:300].replace('\n', ' ') + "..."
            print(f"  Content Preview: {content_preview}")
    else:
        print(f"No HTM documents were successfully processed from {TEST_DATA_DIR}")