import os
import sys
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from RAG_BOT.logger import logger
from RAG_BOT.document_processor import DocumentProcessor  # Import the base class


class PdfProcessor(DocumentProcessor):
"""
Processes PDF files to extract text, metadata (including dates and type),
inheriting common functionality from DocumentProcessor.
"""
def _load_pdf_multi_murli(self, pdf_path, header_check_chars=300):
"""
Loads a PDF, detects dates at the start of pages (indicating new Murlis),
and applies the correct date metadata to all pages of that Murli.
This is a private helper function.
Args:
pdf_path (str): Path to the PDF file.
header_check_chars (int): Number of characters at the start of a page to check for a date.
Returns:
list[Document]: A list of Document objects with corrected date metadata.
"""
        loader = PyMuPDFLoader(pdf_path)  # PyMuPDFLoader is used instead of PyPDFLoader for better performance
        try:
            pages = loader.load()  # Use load() instead of load_and_split() initially
        except Exception as e:
            logger.error(f"Failed to load PDF: {pdf_path}. Error: {e}")
            return []

        all_documents = []
        current_date = None
        current_is_avyakt = None  # Track avyakt status similarly
        logger.info(f"Processing {len(pages)} pages from {pdf_path}...")

        for i, page in enumerate(pages):
            page_text = page.page_content
            metadata = page.metadata.copy()  # Work on a copy
            # Check the beginning of the page for a new date/type
            header_text = page_text[:header_check_chars]
            header_len = len(header_text)
            header_preview = repr(header_text[:100])  # Use repr() to show whitespace/special chars clearly
            logger.debug(f"Page {metadata.get('page', i)}: Header Text Length={header_len}. Preview (first 100 chars): {header_preview}")
            potential_new_date = self.extract_date_from_text(header_text)  # inherited from DocumentProcessor
            potential_is_avyakt = self.get_murli_type(header_text)  # inherited from DocumentProcessor
            # If a date is found in the header, assume it's the start of a new Murli
            if potential_new_date:
                if potential_new_date != current_date:
                    logger.debug(f"Found new date '{potential_new_date}' on page {metadata.get('page', i)}.")
                    current_date = potential_new_date
                # Update avyakt status whenever a new date is found
                if potential_is_avyakt != current_is_avyakt:
                    logger.debug(f"Murli type set to Avyakt={potential_is_avyakt} starting page {metadata.get('page', i)}.")
                    current_is_avyakt = potential_is_avyakt
            # Apply the current date and type (if found) to the page's metadata
            if current_date:
                metadata["date"] = current_date
            else:
                # If no date has been found yet (e.g., first few pages have no date)
                if "date" in metadata:
                    del metadata["date"]  # Remove potentially incorrect date from loader
            # Only add is_avyakt to metadata if it's an Avyakt Murli
            if current_is_avyakt is True:
                metadata["is_avyakt"] = True
            elif "is_avyakt" in metadata:
                del metadata["is_avyakt"]
            # Create a new Document object with updated metadata
            # Using page_text ensures we have the content; metadata carries the page number etc.
            processed_doc = Document(page_content=page_text, metadata=metadata)
            all_documents.append(processed_doc)

        if current_date is None:
            logger.warning(f"No date could be extracted from the headers in {pdf_path}.")
        logger.info(f"Finished processing {len(all_documents)} pages; last detected date: {current_date}, is_avyakt: {current_is_avyakt}.")
        return all_documents

    def load_pdf(self, pdf_path):
        """
        Loads a PDF and processes it to extract content and metadata,
        handling multiple Murlis within a single PDF.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            list[Document]: A list of Document objects with extracted content and metadata.
        """
        # Call the private helper with the default header_check_chars
        return self._load_pdf_multi_murli(pdf_path, header_check_chars=300)
if __name__ == "__main__":
# Example usage with the new class structure
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'tests', 'data', 'hindi')
pdf_name = "03. AV-H-07.01.1980.pdf"
pdf_path = os.path.join(TEST_DATA_DIR, pdf_name)
# Instantiate the processor
pdf_processor = PdfProcessor()
# Load documents using the instance method
documents_with_correct_dates = pdf_processor.load_pdf(pdf_path)
if documents_with_correct_dates:
# Optional: Print metadata of first few docs to verify
for i in range(min(5, len(documents_with_correct_dates))):
print(f"Doc {i} Metadata: {documents_with_correct_dates[i].metadata}")
# Proceed with splitting using inherited methods
# chunks = pdf_processor.split_text(documents_with_correct_dates)
chunks = pdf_processor.semantic_chunking(documents_with_correct_dates) # Example using semantic
# ... further processing (e.g., indexing chunks) ...
print(f"\nTotal chunks created: {len(chunks)}")
if chunks:
print(f"First chunk metadata: {chunks[0].metadata}")
print(f"Last chunk metadata: {chunks[-1].metadata}")
else:
print(f"Could not process PDF: {pdf_path}")