# SpiritualChatBot/RAG_BOT/tests/integration/test_pdf_processor_integration.py
# bk-anupam
# Refactor unit tests for PDF processor, Telegram bot, and vector store
# 4623a33
import os
import pytest
from langchain_core.documents import Document
from RAG_BOT.pdf_processor import load_pdf
# Directory containing the Hindi sample PDFs, resolved relative to this file.
# The layout assumed here is:
#   RAG_BOT/tests/integration/  <- this test module
#   RAG_BOT/tests/data/hindi/   <- the sample PDFs
# The path is built with a literal '..' segment and is not normalized.
_HERE = os.path.dirname(__file__)
TEST_DATA_DIR = os.path.join(_HERE, '..', 'data', 'hindi')
def check_pdf_exists(pdf_name):
    """
    Return the path of ``pdf_name`` inside TEST_DATA_DIR.

    If no regular file exists at that path, the calling test is skipped
    via ``pytest.skip`` instead of failing.
    """
    candidate = os.path.join(TEST_DATA_DIR, pdf_name)
    is_missing = not os.path.isfile(candidate)
    if is_missing:
        pytest.skip(f"Test PDF not found: {candidate}")
    return candidate
@pytest.fixture
def sakar_pdf_path():
    """Path to the Sakar Murli sample PDF; skips the test if it is missing."""
    pdf_name = "01.03.14-h.pdf"
    return check_pdf_exists(pdf_name)
@pytest.fixture
def avyakt_pdf_path():
    """Path to the Avyakt Murli sample PDF; skips the test if it is missing."""
    pdf_name = "03. AV-H-07.01.1980.pdf"
    return check_pdf_exists(pdf_name)
@pytest.fixture
def multi_date_header_pdf_path():
    """Path to the sample PDF with two header dates; skips the test if it is missing."""
    pdf_name = "FHM - 17-11-2013 (AM Revised - 31-12-1996).pdf"
    return check_pdf_exists(pdf_name)
def test_load_pdf_sakar_murli(sakar_pdf_path):
    """
    Load a standard Sakar Murli PDF and verify the metadata of each document.

    Every returned Document must carry the date "2014-03-01" and must not
    have an ``is_avyakt`` key in its metadata.
    """
    expected_date = "2014-03-01"
    documents = load_pdf(sakar_pdf_path)

    # The loader must produce a non-empty list made up solely of Documents.
    assert isinstance(documents, list)
    assert len(documents) > 0
    assert not [doc for doc in documents if not isinstance(doc, Document)]

    for doc in documents:
        metadata = doc.metadata
        assert "date" in metadata
        assert metadata["date"] == expected_date
        # A Sakar Murli must not be flagged as Avyakt at all.
        assert "is_avyakt" not in metadata
def test_load_pdf_avyakt_murli(avyakt_pdf_path):
    """
    Load a standard Avyakt Murli PDF and verify the metadata of each document.

    Every returned Document must carry the date "1980-01-07" and have
    ``is_avyakt`` set to exactly ``True`` in its metadata.
    """
    expected_date = "1980-01-07"
    documents = load_pdf(avyakt_pdf_path)

    # The loader must produce a non-empty list made up solely of Documents.
    assert isinstance(documents, list)
    assert len(documents) > 0
    assert not [doc for doc in documents if not isinstance(doc, Document)]

    for doc in documents:
        metadata = doc.metadata
        assert "date" in metadata
        assert metadata["date"] == expected_date
        # The flag must be present and be the boolean True, not merely truthy.
        assert "is_avyakt" in metadata
        assert metadata["is_avyakt"] is True
def test_load_pdf_multiple_header_dates(multi_date_header_pdf_path):
    """
    Load a PDF whose header holds two dates (original and revised).

    Checks that the *first* date found, "2013-11-17", is the one applied to
    every returned Document, and that no ``is_avyakt`` key is set.
    """
    # The current logic should pick the first date it finds in the header.
    expected_date = "2013-11-17"
    documents = load_pdf(multi_date_header_pdf_path)

    # The loader must produce a non-empty list made up solely of Documents.
    assert isinstance(documents, list)
    assert len(documents) > 0
    assert not [doc for doc in documents if not isinstance(doc, Document)]

    for doc in documents:
        metadata = doc.metadata
        assert "date" in metadata
        assert metadata["date"] == expected_date
        # Assumed to be a Sakar Murli based on the filename pattern, but the
        # code under test decides from the content. If the content check
        # finds 'avyakt', this assertion may need adjusting; for now the test
        # targets the primary date extraction logic.
        assert "is_avyakt" not in metadata