File size: 6,664 Bytes
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os
import re
import sys
from bs4 import BeautifulSoup
from langchain_core.documents import Document

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)
from RAG_BOT.logger import logger
# Remove the old import: from RAG_BOT.pdf_processor import extract_date_from_text, get_murli_type
from RAG_BOT.document_processor import DocumentProcessor # Import the base class


class HtmProcessor(DocumentProcessor):
    """
    Processes HTM files to extract text and metadata, inheriting common
    functionality (date extraction, murli-type detection) from DocumentProcessor.
    """

    def load_htm(self, htm_path):
        """
        Load an HTM file and extract its text content and metadata.

        Args:
            htm_path (str): Path to the HTM file.

        Returns:
            Document or None: A Document with the cleaned body text and
            metadata (source filename, full path, and — when detected —
            date and is_avyakt flag), or None if reading or parsing fails.
        """
        try:
            # Try windows-1252 first, as it is common for older HTM files.
            with open(htm_path, 'r', encoding='windows-1252') as f:
                html_content = f.read()
        except UnicodeDecodeError:
            # Fall back to utf-8 if windows-1252 cannot decode the file.
            try:
                with open(htm_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
            except Exception as e:
                logger.error(f"Failed to read HTM file with utf-8 fallback: {htm_path}. Error: {e}")
                return None
        except FileNotFoundError:
            logger.error(f"HTM file not found: {htm_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to read HTM file: {htm_path}. Error: {e}")
            return None

        try:
            soup = BeautifulSoup(html_content, 'lxml')  # lxml parser for robustness
            # Extract text from the body tag, tolerating a missing <body>.
            body_tag = soup.find('body')
            body_text = body_tag.get_text(separator='\n', strip=True) if body_tag else ''

            # Collapse every run of whitespace (including newlines) to one space.
            body_text = re.sub(r'\s+', ' ', body_text).strip()

            # Metadata clues (date, murli type) appear near the start of the text,
            # so only the first 500 characters need to be scanned.
            check_text = body_text[:500]
            date = self.extract_date_from_text(check_text)
            is_avyakt = self.get_murli_type(check_text)

            # Use the bare filename (not the full path) as the source identifier.
            filename = os.path.basename(htm_path)

            metadata = {
                "source": filename,
                "full_path": htm_path  # keep the full path in case it is needed elsewhere
            }
            if date:
                metadata["date"] = date
            if is_avyakt is True:  # explicit check: tag only when definitively avyakt
                metadata["is_avyakt"] = True

            # Fix: log the actual filename instead of the "(unknown)" placeholder.
            logger.info(f"Processed HTM: {filename}, Date: {date}, Is Avyakt: {is_avyakt}")

            return Document(page_content=body_text, metadata=metadata)

        except Exception as e:
            # Catch errors raised during parsing or metadata extraction.
            logger.error(f"Failed to parse HTM or extract data from {htm_path}. Error: {e}")
            return None

    def load_directory_htm(self, directory_path):
        """
        Load and process all HTM/HTML files in a directory.

        Args:
            directory_path (str): Path to the directory containing HTM files.

        Returns:
            list[Document]: Documents for every file that processed
            successfully; empty list if the directory does not exist.
        """
        all_documents = []
        if not os.path.isdir(directory_path):
            logger.error(f"Directory not found: {directory_path}")
            return []

        logger.info(f"Scanning directory for HTM files: {directory_path}")
        file_count = 0
        processed_count = 0
        for filename in os.listdir(directory_path):
            # Accept both .htm and .html extensions, case-insensitively.
            if filename.lower().endswith((".htm", ".html")):
                file_count += 1
                htm_path = os.path.join(directory_path, filename)
                logger.debug(f"Attempting to load HTM file: {htm_path}")
                document = self.load_htm(htm_path)
                if document:
                    all_documents.append(document)
                    processed_count += 1
                else:
                    # Fix: report which file was skipped instead of "(unknown)".
                    logger.warning(f"Skipped processing file: {filename}")

        logger.info(f"Found {file_count} HTM/HTML files. Successfully processed and loaded {processed_count} documents from {directory_path}")
        return all_documents


if __name__ == "__main__":
    # Demonstration: process the bundled Hindi test data with HtmProcessor.
    # The path is built relative to this script for portability.
    script_dir = os.path.dirname(__file__)
    test_data_dir = os.path.join(script_dir, 'tests', 'data', 'hindi')

    processor = HtmProcessor()

    print(f"Loading HTM documents from: {test_data_dir}")
    documents = processor.load_directory_htm(test_data_dir)

    if not documents:
        print(f"No HTM documents were successfully processed from {test_data_dir}")
    else:
        print(f"\nSuccessfully loaded {len(documents)} HTM documents.")

        # Show metadata and a short content preview for up to five documents
        # so the loading result can be verified at a glance.
        print("\n--- Sample Document Metadata and Content ---")
        for idx, doc in enumerate(documents[:5]):
            print(f"\nDocument {idx}:")
            print(f"  Metadata: {doc.metadata}")
            preview = doc.page_content[:300].replace('\n', ' ') + "..."
            print(f"  Content Preview: {preview}")