import os
import re
import sys

from bs4 import BeautifulSoup
from langchain_core.documents import Document

# Add the project root to the Python path so RAG_BOT imports resolve
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from RAG_BOT.logger import logger
from RAG_BOT.document_processor import DocumentProcessor  # Base class with shared extraction helpers
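
# Note: DocumentProcessor is assumed here to provide extract_date_from_text()
# and get_murli_type(), used below for metadata extraction, plus the optional
# splitting helpers split_text() and semantic_chunking() referenced in the
# __main__ demo; this summary reflects only how they are called in this file.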


class HtmProcessor(DocumentProcessor):
    """
    Processes HTM files to extract text and metadata, inheriting common
    functionality from DocumentProcessor.
    """

    def load_htm(self, htm_path):
        """
        Loads an HTM file and extracts its text content and metadata.

        Args:
            htm_path (str): Path to the HTM file.

        Returns:
            Document or None: A Document object with the extracted content
            and metadata, or None if processing fails.
        """
        try:
            # Try windows-1252 first, as it is common for older HTM files
            with open(htm_path, 'r', encoding='windows-1252') as f:
                html_content = f.read()
        except UnicodeDecodeError:
            # Fall back to utf-8 if windows-1252 fails
            try:
                with open(htm_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
            except Exception as e:
                logger.error(f"Failed to read HTM file with utf-8 fallback: {htm_path}. Error: {e}")
                return None
        except FileNotFoundError:
            logger.error(f"HTM file not found: {htm_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to read HTM file: {htm_path}. Error: {e}")
            return None
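
        # Assumption: windows-1252 and utf-8 cover the corpus handled so far;
        # if other encodings turn up, an encoding-detection pass (e.g. via the
        # charset-normalizer package) could be added before these reads.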
        try:
            soup = BeautifulSoup(html_content, 'lxml')  # lxml parser for robustness
            # Extract text from the body tag, handling cases where it is missing
            body_tag = soup.find('body')
            body_text = body_tag.get_text(separator='\n', strip=True) if body_tag else ''
            # Collapse runs of whitespace (including newlines) into single spaces
            body_text = re.sub(r'\s+', ' ', body_text).strip()

            # Extract metadata using base class methods; the first 500 characters
            # are a reasonable window in which to find the metadata clues
            check_text = body_text[:500]
            date = self.extract_date_from_text(check_text)
            is_avyakt = self.get_murli_type(check_text)

            # Use the bare filename (not the full path) as the source
            filename = os.path.basename(htm_path)
            metadata = {
                "source": filename,
                "full_path": htm_path  # Keep the full path in case it is needed elsewhere
            }
            if date:
                metadata["date"] = date
            if is_avyakt is True:  # Explicit check for True
                metadata["is_avyakt"] = True

            # logger.debug(f"Extracted content from HTM {filename}: {body_text[:200]}...")  # Verbose preview
            logger.info(f"Processed HTM: {filename}, Date: {date}, Is Avyakt: {is_avyakt}")
            return Document(page_content=body_text, metadata=metadata)
        except Exception as e:
            # Catch potential errors during parsing or metadata extraction
            logger.error(f"Failed to parse HTM or extract data from {htm_path}. Error: {e}")
            return None
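
    # Illustrative single-file call (the path below is hypothetical):
    #   doc = HtmProcessor().load_htm("tests/data/hindi/sample.htm")
    #   if doc:
    #       print(doc.metadata, doc.page_content[:100])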

    def load_directory_htm(self, directory_path):
        """
        Loads and processes all HTM files from a directory (non-recursive).

        Args:
            directory_path (str): Path to the directory containing HTM files.

        Returns:
            list[Document]: A list of Document objects from the processed HTM files.
        """
        all_documents = []
        if not os.path.isdir(directory_path):
            logger.error(f"Directory not found: {directory_path}")
            return []

        logger.info(f"Scanning directory for HTM files: {directory_path}")
        file_count = 0
        processed_count = 0
        for filename in os.listdir(directory_path):
            # Match both .htm and .html extensions, case-insensitively
            if filename.lower().endswith((".htm", ".html")):
                file_count += 1
                htm_path = os.path.join(directory_path, filename)
                logger.debug(f"Attempting to load HTM file: {htm_path}")
                document = self.load_htm(htm_path)
                if document:
                    all_documents.append(document)
                    processed_count += 1
                else:
                    logger.warning(f"Skipped processing file: {filename}")
        logger.info(f"Found {file_count} HTM/HTML files; successfully loaded {processed_count} documents from {directory_path}")
        return all_documents
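

# The loader above scans a single directory level. For nested folders, a
# recursive variant along the following lines could be used; this helper is an
# illustrative addition (not part of the original interface) and assumes the
# same load_htm() contract.
def load_tree_htm(processor, root_path):
    """Recursively load all .htm/.html files under root_path (sketch)."""
    documents = []
    for dirpath, _dirnames, filenames in os.walk(root_path):
        for filename in filenames:
            if filename.lower().endswith((".htm", ".html")):
                document = processor.load_htm(os.path.join(dirpath, filename))
                if document:
                    documents.append(document)
    return documents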


if __name__ == "__main__":
    # Example usage: load the bundled test data, using a relative path for portability
    script_dir = os.path.dirname(__file__)
    TEST_DATA_DIR = os.path.join(script_dir, 'tests', 'data', 'hindi')

    htm_processor = HtmProcessor()
    print(f"Loading HTM documents from: {TEST_DATA_DIR}")
    htm_documents = htm_processor.load_directory_htm(TEST_DATA_DIR)

    if htm_documents:
        print(f"\nSuccessfully loaded {len(htm_documents)} HTM documents.")

        # Optional: split the documents using the inherited methods
        # chunks = htm_processor.split_text(htm_documents)          # Basic splitting
        # chunks = htm_processor.semantic_chunking(htm_documents)   # Semantic chunking
        # if chunks:
        #     print(f"Total chunks created: {len(chunks)}")
        # else:
        #     print("Splitting resulted in no chunks.")

        # Print metadata of the first few documents to verify loading
        print("\n--- Sample Document Metadata and Content ---")
        for i in range(min(5, len(htm_documents))):
            print(f"\nDocument {i}:")
            print(f"  Metadata: {htm_documents[i].metadata}")
            content_preview = htm_documents[i].page_content[:300].replace('\n', ' ') + "..."
            print(f"  Content Preview: {content_preview}")
    else:
        print(f"No HTM documents were successfully processed from {TEST_DATA_DIR}")