import os
import re
import sys

from bs4 import BeautifulSoup
from langchain_core.documents import Document

# Add the project root to the Python path so the RAG_BOT package resolves
# when this file is run directly as a script.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from RAG_BOT.logger import logger
from RAG_BOT.document_processor import DocumentProcessor  # Base class providing shared extraction helpers


class HtmProcessor(DocumentProcessor):
    """
    Processes HTM files to extract text and metadata, inheriting common
    functionality (date extraction, murli-type detection, splitting) from
    DocumentProcessor.
    """

    def load_htm(self, htm_path):
        """
        Loads an HTM file, extracts text content and metadata.

        Args:
            htm_path (str): Path to the HTM file.

        Returns:
            Document or None: A Document object with extracted content and
            metadata, or None if reading or parsing fails.
        """
        try:
            # Try windows-1252 first: it's the common encoding for older HTM files.
            with open(htm_path, 'r', encoding='windows-1252') as f:
                html_content = f.read()
        except UnicodeDecodeError:
            # Fall back to utf-8 if windows-1252 fails.
            try:
                with open(htm_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
            except Exception as e:
                logger.error(f"Failed to read HTM file with utf-8 fallback: {htm_path}. Error: {e}")
                return None
        except FileNotFoundError:
            logger.error(f"HTM file not found: {htm_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to read HTM file: {htm_path}. Error: {e}")
            return None

        try:
            # lxml parser chosen for robustness against malformed markup.
            soup = BeautifulSoup(html_content, 'lxml')

            # Extract text from the body tag, tolerating a missing <body>.
            body_tag = soup.find('body')
            body_text = body_tag.get_text(separator='\n', strip=True) if body_tag else ''

            # Collapse all runs of whitespace (including newlines) to single spaces.
            body_text = re.sub(r'\s+', ' ', body_text).strip()

            # Metadata clues (date, murli type) appear near the top of the text,
            # so only a 500-character prefix is scanned.
            check_text = body_text[:500]
            date = self.extract_date_from_text(check_text)
            is_avyakt = self.get_murli_type(check_text)

            # Store the bare filename as the source; keep the full path separately.
            filename = os.path.basename(htm_path)
            metadata = {
                "source": filename,
                "full_path": htm_path,
            }
            if date:
                metadata["date"] = date
            if is_avyakt is True:  # Explicit check: get_murli_type may return None
                metadata["is_avyakt"] = True

            # Verbose content preview kept available for debugging:
            # logger.debug(f"Extracted content from HTM {filename}: {body_text[:200]}...")
            logger.info(f"Processed HTM: {filename}, Date: {date}, Is Avyakt: {is_avyakt}")
            return Document(page_content=body_text, metadata=metadata)

        except Exception as e:
            # Catch potential errors during parsing or metadata extraction.
            logger.error(f"Failed to parse HTM or extract data from {htm_path}. Error: {e}")
            return None

    def load_directory_htm(self, directory_path):
        """
        Loads all HTM/HTML files from a directory and processes them.

        Args:
            directory_path (str): Path to the directory containing HTM files.

        Returns:
            list[Document]: Document objects for every file that processed
            successfully; empty list if the directory does not exist.
        """
        all_documents = []
        if not os.path.isdir(directory_path):
            logger.error(f"Directory not found: {directory_path}")
            return []

        logger.info(f"Scanning directory for HTM files: {directory_path}")
        file_count = 0
        processed_count = 0
        for filename in os.listdir(directory_path):
            # Match both .htm and .html extensions, case-insensitively.
            if filename.lower().endswith((".htm", ".html")):
                file_count += 1
                htm_path = os.path.join(directory_path, filename)
                logger.debug(f"Attempting to load HTM file: {htm_path}")
                document = self.load_htm(htm_path)
                if document:
                    all_documents.append(document)
                    processed_count += 1
                else:
                    logger.warning(f"Skipped processing file: {filename}")

        logger.info(
            f"Found {file_count} HTM/HTML files. Successfully processed and loaded "
            f"{processed_count} documents from {directory_path}"
        )
        return all_documents


if __name__ == "__main__":
    # Example usage with the class-based structure.
    # Relative path keeps the example portable across checkouts.
    script_dir = os.path.dirname(__file__)
    TEST_DATA_DIR = os.path.join(script_dir, 'tests', 'data', 'hindi')

    htm_processor = HtmProcessor()

    print(f"Loading HTM documents from: {TEST_DATA_DIR}")
    htm_documents = htm_processor.load_directory_htm(TEST_DATA_DIR)

    if htm_documents:
        print(f"\nSuccessfully loaded {len(htm_documents)} HTM documents.")

        # Optional: split documents using inherited methods.
        # print("Attempting to split documents...")
        # chunks = htm_processor.split_text(htm_documents)          # Example using basic split
        # chunks = htm_processor.semantic_chunking(htm_documents)   # Example using semantic
        # if chunks:
        #     print(f"Total chunks created: {len(chunks)}")
        # else:
        #     print("Splitting resulted in no chunks.")

        # Print metadata of the first few docs to verify loading.
        print("\n--- Sample Document Metadata and Content ---")
        for i in range(min(5, len(htm_documents))):
            print(f"\nDocument {i}:")
            print(f"  Metadata: {htm_documents[i].metadata}")
            # Limit content preview length
            content_preview = htm_documents[i].page_content[:300].replace('\n', ' ') + "..."
            print(f"  Content Preview: {content_preview}")
    else:
        print(f"No HTM documents were successfully processed from {TEST_DATA_DIR}")