import os
import sys

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from RAG_BOT.logger import logger
from RAG_BOT.document_processor import DocumentProcessor  # Import the base class
class PdfProcessor(DocumentProcessor):
    """
    Processes PDF files to extract text and metadata (including date and Murli type),
    inheriting common functionality from DocumentProcessor.
    """

    def _load_pdf_multi_murli(self, pdf_path, header_check_chars=300):
        """
        Loads a PDF, detects dates at the start of pages (indicating new Murlis),
        and applies the correct date metadata to all pages of that Murli.
        This is a private helper method.

        Args:
            pdf_path (str): Path to the PDF file.
            header_check_chars (int): Number of characters at the start of a page to check for a date.

        Returns:
            list[Document]: A list of Document objects with corrected date metadata.
        """
        loader = PyMuPDFLoader(pdf_path)  # Use PyMuPDFLoader for better performance
        try:
            pages = loader.load()  # Use load() instead of load_and_split() initially
        except Exception as e:
            logger.error(f"Failed to load PDF: {pdf_path}. Error: {e}")
            return []

        all_documents = []
        current_date = None
        current_is_avyakt = None  # Track avyakt status alongside the date
        logger.info(f"Processing {len(pages)} pages from {pdf_path}...")

        for i, page in enumerate(pages):
            page_text = page.page_content
            metadata = page.metadata.copy()  # Work on a copy

            # Check the beginning of the page for a new date/type
            header_text = page_text[:header_check_chars]
            header_len = len(header_text)
            header_preview = repr(header_text[:100])  # repr() shows whitespace/special chars clearly
            logger.debug(f"Page {metadata.get('page', i)}: Header text length={header_len}. Preview (first 100 chars): {header_preview}")

            potential_new_date = self.extract_date_from_text(header_text)
            potential_is_avyakt = self.get_murli_type(header_text)

            # If a date is found in the header, assume it's the start of a new Murli
            if potential_new_date:
                if potential_new_date != current_date:
                    logger.debug(f"Found new date '{potential_new_date}' on page {metadata.get('page', i)}.")
                    current_date = potential_new_date
                # Update the avyakt status whenever a new date is found
                if potential_is_avyakt != current_is_avyakt:
                    logger.debug(f"Murli type set to Avyakt={potential_is_avyakt} starting page {metadata.get('page', i)}.")
                    current_is_avyakt = potential_is_avyakt

            # Apply the current date (if any) to the page's metadata
            if current_date:
                metadata["date"] = current_date
            elif "date" in metadata:
                # No date has been found yet (e.g., the first few pages carry no date);
                # remove any potentially incorrect date set by the loader
                del metadata["date"]

            # Only add is_avyakt to metadata if it's an Avyakt Murli
            if current_is_avyakt is True:
                metadata["is_avyakt"] = True
            elif "is_avyakt" in metadata:
                del metadata["is_avyakt"]

            # Create a new Document object with the updated metadata;
            # page_text carries the content, metadata keeps the page number etc.
            processed_doc = Document(page_content=page_text, metadata=metadata)
            all_documents.append(processed_doc)

        if current_date is None:
            logger.warning(f"No date could be extracted from the headers in {pdf_path}.")
        logger.info(f"Finished processing {len(all_documents)} documents. Last metadata date: {current_date}, is_avyakt: {current_is_avyakt}.")
        return all_documents
    def load_pdf(self, pdf_path):
        """
        Loads a PDF and processes it to extract content and metadata,
        handling multiple Murlis within a single PDF.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            list[Document]: A list of Document objects with extracted content and metadata.
        """
        # Delegate to the private helper with the default header_check_chars
        return self._load_pdf_multi_murli(pdf_path, header_check_chars=300)
if __name__ == "__main__":
    # Example usage of the PdfProcessor class
    TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'tests', 'data', 'hindi')
    pdf_name = "03. AV-H-07.01.1980.pdf"
    pdf_path = os.path.join(TEST_DATA_DIR, pdf_name)

    # Instantiate the processor
    pdf_processor = PdfProcessor()
    # Load documents using the instance method
    documents_with_correct_dates = pdf_processor.load_pdf(pdf_path)

    if documents_with_correct_dates:
        # Optional: print metadata of the first few docs to verify the dates
        for i in range(min(5, len(documents_with_correct_dates))):
            print(f"Doc {i} Metadata: {documents_with_correct_dates[i].metadata}")

        # Proceed with splitting using inherited methods
        # chunks = pdf_processor.split_text(documents_with_correct_dates)
        chunks = pdf_processor.semantic_chunking(documents_with_correct_dates)  # Example using semantic chunking
        # ... further processing (e.g., indexing chunks) ...
        print(f"\nTotal chunks created: {len(chunks)}")
        if chunks:
            print(f"First chunk metadata: {chunks[0].metadata}")
            print(f"Last chunk metadata: {chunks[-1].metadata}")
    else:
        print(f"Could not process PDF: {pdf_path}")