File size: 6,119 Bytes
3f61806
b9ccd0b
 
3f61806
b9ccd0b
 
 
 
 
3f61806
b9ccd0b
 
3f61806
b9ccd0b
 
3f61806
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7361b6f
b9ccd0b
 
 
7361b6f
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7361b6f
b9ccd0b
 
7361b6f
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
3f61806
 
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
import sys
from langchain_community.document_loaders import PyMuPDFLoader # Keep only PyMuPDFLoader if PyPDFLoader is unused
from langchain_core.documents import Document
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)
from RAG_BOT.logger import logger
from RAG_BOT.document_processor import DocumentProcessor # Import the base class


class PdfProcessor(DocumentProcessor):
    """
    Processes PDF files to extract page text and metadata (date and Murli type),
    inheriting common functionality from DocumentProcessor.

    A single PDF may contain several Murlis back to back. A date found near the
    top of a page is treated as the start of a new Murli, and that date (plus the
    detected Murli type) is carried forward onto every following page until the
    next dated header is seen.
    """

    def _load_pdf_multi_murli(self, pdf_path: str, header_check_chars: int = 300):
        """
        Loads a PDF, detects dates at the start of pages (indicating new Murlis),
        and applies the correct date metadata to all pages of that Murli.
        This is a private helper function.

        Date and type detection are delegated to ``self.extract_date_from_text``
        and ``self.get_murli_type``; neither is defined in this class, so they
        are presumably inherited from DocumentProcessor.

        Args:
            pdf_path (str): Path to the PDF file.
            header_check_chars (int): Number of characters at the start of a page to check for a date.

        Returns:
            list[Document]: One Document per loaded page, in page order, with
            corrected ``date`` / ``is_avyakt`` metadata. An empty list is
            returned when the loader fails (the error is logged, not raised).
        """
        loader = PyMuPDFLoader(pdf_path)
        try:
            # load() yields one Document per page; splitting is left to the caller.
            pages = loader.load()
        except Exception as e:
            # Deliberate best-effort boundary: a PDF that cannot be read is
            # reported and skipped rather than aborting the caller.
            logger.error(f"Failed to load PDF: {pdf_path}. Error: {e}")
            return []

        all_documents = []
        current_date = None
        current_is_avyakt = None  # Carried forward across pages, like the date

        logger.info(f"Processing {len(pages)} pages from {pdf_path}...")

        for i, page in enumerate(pages):
            page_text = page.page_content
            metadata = page.metadata.copy()  # Never mutate the loader's own dict
            # Prefer the loader's page number; fall back to the enumeration index.
            page_label = metadata.get('page', i)

            # Only the beginning of the page is inspected for a new date/type.
            header_text = page_text[:header_check_chars]
            header_len = len(header_text)
            header_preview = repr(header_text[:100])  # repr() makes whitespace/special chars visible
            logger.debug(f"Page {page_label}: Header Text Length={header_len}. Preview (first 100 chars): {header_preview}")
            potential_new_date = self.extract_date_from_text(header_text)
            potential_is_avyakt = self.get_murli_type(header_text)

            # A date in the header marks the start of a new Murli. The type is
            # refreshed whenever a dated header is seen, even if the date itself
            # did not change.
            if potential_new_date:
                if potential_new_date != current_date:
                    logger.debug(f"Found new date '{potential_new_date}' on page {page_label}.")
                    current_date = potential_new_date
                if potential_is_avyakt != current_is_avyakt:
                    logger.debug(f"Murli type set to Avyakt={potential_is_avyakt} starting page {page_label}.")
                    current_is_avyakt = potential_is_avyakt

            if current_date:
                metadata["date"] = current_date
            else:
                # No dated header seen yet: drop any date supplied by the loader,
                # since it does not come from the page header.
                metadata.pop("date", None)

            # The key is present only for Avyakt Murlis; it is removed otherwise
            # instead of being stored as False.
            if current_is_avyakt is True:
                metadata["is_avyakt"] = True
            else:
                metadata.pop("is_avyakt", None)

            # Build a fresh Document so the loader's page objects stay untouched.
            all_documents.append(Document(page_content=page_text, metadata=metadata))

        if current_date is None:
            logger.warning(f"No date could be extracted from the headers in {pdf_path}.")

        # Note: the date/type reported here are those of the LAST Murli seen.
        logger.info(f"Finished processing {len(all_documents)} documents metadata date: {current_date}, is_avyakt: {current_is_avyakt}.")
        return all_documents

    def load_pdf(self, pdf_path: str, header_check_chars: int = 300):
        """
        Loads a PDF and processes it to extract content and metadata,
        handling multiple Murlis within a single PDF.

        Args:
            pdf_path (str): Path to the PDF file.
            header_check_chars (int): Number of characters at the start of each
                page to scan for a date. Defaults to 300, the value this method
                previously hard-coded, so existing callers are unaffected.

        Returns:
            list[Document]: A list of Document objects with extracted content and metadata
            (empty if the PDF could not be loaded).
        """
        return self._load_pdf_multi_murli(pdf_path, header_check_chars=header_check_chars)


if __name__ == "__main__":
    # Manual smoke run: load one sample PDF, show the metadata of its first
    # pages, then chunk it with the inherited semantic chunker.
    sample_dir = os.path.join(os.path.dirname(__file__), 'tests', 'data', 'hindi')
    sample_path = os.path.join(sample_dir, "03. AV-H-07.01.1980.pdf")

    processor = PdfProcessor()
    loaded_docs = processor.load_pdf(sample_path)

    if not loaded_docs:
        # load_pdf returned nothing (e.g. the file could not be read).
        print(f"Could not process PDF: {sample_path}")
    else:
        # Show at most the first five pages' metadata for a quick visual check.
        for idx, doc in enumerate(loaded_docs[:5]):
            print(f"Doc {idx} Metadata: {doc.metadata}")

        # Split using a method that is not defined in this file's class
        # (presumably inherited from DocumentProcessor).
        chunks = processor.semantic_chunking(loaded_docs)
        print(f"\nTotal chunks created: {len(chunks)}")
        if chunks:
            print(f"First chunk metadata: {chunks[0].metadata}")
            print(f"Last chunk metadata: {chunks[-1].metadata}")