import os
import re
import sys

from bs4 import BeautifulSoup
from langchain_core.documents import Document

# Add the project root to the Python path so RAG_BOT imports resolve
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from RAG_BOT.logger import logger
from RAG_BOT.document_processor import DocumentProcessor  # Base class with shared extraction helpers
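
# Note: DocumentProcessor is assumed here to provide extract_date_from_text()
# and get_murli_type(), used below for metadata extraction, plus the optional
# splitting helpers split_text() and semantic_chunking() referenced in the
# __main__ demo; this summary reflects only how they are called in this file.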


class HtmProcessor(DocumentProcessor):
    """
    Processes HTM files to extract text and metadata, inheriting common
    functionality from DocumentProcessor.
    """

    def load_htm(self, htm_path):
        """
        Loads an HTM file and extracts its text content and metadata.

        Args:
            htm_path (str): Path to the HTM file.

        Returns:
            Document or None: A Document object with the extracted content
            and metadata, or None if processing fails.
        """
        try:
            # Try windows-1252 first, as it is common for older HTM files
            with open(htm_path, 'r', encoding='windows-1252') as f:
                html_content = f.read()
        except UnicodeDecodeError:
            # Fall back to utf-8 if windows-1252 fails
            try:
                with open(htm_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
            except Exception as e:
                logger.error(f"Failed to read HTM file with utf-8 fallback: {htm_path}. Error: {e}")
                return None
        except FileNotFoundError:
            logger.error(f"HTM file not found: {htm_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to read HTM file: {htm_path}. Error: {e}")
            return None
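
        # Assumption: windows-1252 and utf-8 cover the corpus handled so far;
        # if other encodings turn up, an encoding-detection pass (e.g. via the
        # charset-normalizer package) could be added before these reads.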
        try:
            soup = BeautifulSoup(html_content, 'lxml')  # lxml parser for robustness
            # Extract text from the body tag, handling cases where it is missing
            body_tag = soup.find('body')
            body_text = body_tag.get_text(separator='\n', strip=True) if body_tag else ''
            # Collapse runs of whitespace (including newlines) into single spaces
            body_text = re.sub(r'\s+', ' ', body_text).strip()

            # Extract metadata using base class methods; the first 500 characters
            # are a reasonable window in which to find the metadata clues
            check_text = body_text[:500]
            date = self.extract_date_from_text(check_text)
            is_avyakt = self.get_murli_type(check_text)

            # Use the bare filename (not the full path) as the source
            filename = os.path.basename(htm_path)
            metadata = {
                "source": filename,
                "full_path": htm_path  # Keep the full path in case it is needed elsewhere
            }
            if date:
                metadata["date"] = date
            if is_avyakt is True:  # Explicit check for True
                metadata["is_avyakt"] = True

            # logger.debug(f"Extracted content from HTM {filename}: {body_text[:200]}...")  # Verbose preview
            logger.info(f"Processed HTM: {filename}, Date: {date}, Is Avyakt: {is_avyakt}")
            return Document(page_content=body_text, metadata=metadata)
        except Exception as e:
            # Catch potential errors during parsing or metadata extraction
            logger.error(f"Failed to parse HTM or extract data from {htm_path}. Error: {e}")
            return None
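
    # Illustrative single-file call (the path below is hypothetical):
    #   doc = HtmProcessor().load_htm("tests/data/hindi/sample.htm")
    #   if doc:
    #       print(doc.metadata, doc.page_content[:100])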

    def load_directory_htm(self, directory_path):
        """
        Loads and processes all HTM files from a directory (non-recursive).

        Args:
            directory_path (str): Path to the directory containing HTM files.

        Returns:
            list[Document]: A list of Document objects from the processed HTM files.
        """
        all_documents = []
        if not os.path.isdir(directory_path):
            logger.error(f"Directory not found: {directory_path}")
            return []

        logger.info(f"Scanning directory for HTM files: {directory_path}")
        file_count = 0
        processed_count = 0
        for filename in os.listdir(directory_path):
            # Match both .htm and .html extensions, case-insensitively
            if filename.lower().endswith((".htm", ".html")):
                file_count += 1
                htm_path = os.path.join(directory_path, filename)
                logger.debug(f"Attempting to load HTM file: {htm_path}")
                document = self.load_htm(htm_path)
                if document:
                    all_documents.append(document)
                    processed_count += 1
                else:
                    logger.warning(f"Skipped processing file: {filename}")
        logger.info(f"Found {file_count} HTM/HTML files; successfully loaded {processed_count} documents from {directory_path}")
        return all_documents
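

# The loader above scans a single directory level. For nested folders, a
# recursive variant along the following lines could be used; this helper is an
# illustrative addition (not part of the original interface) and assumes the
# same load_htm() contract.
def load_tree_htm(processor, root_path):
    """Recursively load all .htm/.html files under root_path (sketch)."""
    documents = []
    for dirpath, _dirnames, filenames in os.walk(root_path):
        for filename in filenames:
            if filename.lower().endswith((".htm", ".html")):
                document = processor.load_htm(os.path.join(dirpath, filename))
                if document:
                    documents.append(document)
    return documents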


if __name__ == "__main__":
    # Example usage: load the bundled test data, using a relative path for portability
    script_dir = os.path.dirname(__file__)
    TEST_DATA_DIR = os.path.join(script_dir, 'tests', 'data', 'hindi')

    htm_processor = HtmProcessor()
    print(f"Loading HTM documents from: {TEST_DATA_DIR}")
    htm_documents = htm_processor.load_directory_htm(TEST_DATA_DIR)

    if htm_documents:
        print(f"\nSuccessfully loaded {len(htm_documents)} HTM documents.")

        # Optional: split the documents using the inherited methods
        # chunks = htm_processor.split_text(htm_documents)          # Basic splitting
        # chunks = htm_processor.semantic_chunking(htm_documents)   # Semantic chunking
        # if chunks:
        #     print(f"Total chunks created: {len(chunks)}")
        # else:
        #     print("Splitting resulted in no chunks.")

        # Print metadata of the first few documents to verify loading
        print("\n--- Sample Document Metadata and Content ---")
        for i in range(min(5, len(htm_documents))):
            print(f"\nDocument {i}:")
            print(f"  Metadata: {htm_documents[i].metadata}")
            content_preview = htm_documents[i].page_content[:300].replace('\n', ' ') + "..."
            print(f"  Content Preview: {content_preview}")
    else:
        print(f"No HTM documents were successfully processed from {TEST_DATA_DIR}")