Spaces:
Building
Building
File size: 6,107 Bytes
3f61806 4623a33 3f61806 4623a33 3f61806 4623a33 3f61806 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import unittest
from unittest.mock import patch, MagicMock
from RAG_BOT.pdf_processor import PdfProcessor # Import the class
from langchain_core.documents import Document
from datetime import datetime
class TestPDFProcessor(unittest.TestCase):
    """Unit tests for ``PdfProcessor.load_pdf`` with ``PyMuPDFLoader`` mocked.

    Each test patches ``RAG_BOT.pdf_processor.PyMuPDFLoader`` so that no real
    PDF is read; the mocked loader hands back hand-built ``Document`` pages.
    The assertions then check the ``date`` and ``is_avyakt`` metadata found
    on the documents returned by ``load_pdf``.
    """

    def setUp(self):
        """Create a fresh PdfProcessor instance for each test."""
        self.processor = PdfProcessor()

    # ------------------------------------------------------------------
    # Helpers (names do not start with ``test`` so they are not collected)
    # ------------------------------------------------------------------

    @staticmethod
    def _make_pages(*contents, source="test.pdf"):
        """Build one mock page ``Document`` per content string.

        Pages are numbered from 0 in the order given, mirroring the
        ``page``/``source`` metadata the tests feed to the processor.
        """
        return [
            Document(
                page_content=text,
                metadata={"page": index, "source": source},
            )
            for index, text in enumerate(contents)
        ]

    def _load_with_pages(self, mock_pymupdfloader, *contents):
        """Make the mocked loader return pages for *contents*, then load.

        Returns whatever ``self.processor.load_pdf("test.pdf")`` returns.
        Passing no contents makes the mocked loader return an empty list.
        """
        pages = self._make_pages(*contents)
        mock_pymupdfloader.return_value.load.return_value = pages
        return self.processor.load_pdf("test.pdf")

    def _assert_page_metadata(self, documents, expected):
        """Check the ``date`` / ``is_avyakt`` metadata of every document.

        *expected* holds one ``(date, is_avyakt)`` pair per document:

        * ``date`` is the expected ``"date"`` value, or ``None`` when the
          ``"date"`` key must be absent;
        * ``is_avyakt`` truthy means ``"is_avyakt"`` must be truthy, falsy
          means the ``"is_avyakt"`` key must be absent.
        """
        self.assertEqual(len(documents), len(expected))
        pairs = zip(documents, expected)
        for index, (document, (date, is_avyakt)) in enumerate(pairs):
            # subTest reports every mismatching page, not only the first.
            with self.subTest(index=index):
                if date is None:
                    self.assertNotIn("date", document.metadata)
                else:
                    self.assertEqual(document.metadata.get("date"), date)
                if is_avyakt:
                    self.assertTrue(document.metadata.get("is_avyakt"))
                else:
                    self.assertNotIn("is_avyakt", document.metadata)

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_single_murli(self, mock_pymupdfloader):
        """A date on the first page is expected on all three pages."""
        documents = self._load_with_pages(
            mock_pymupdfloader,
            "Date: 01.01.2024\nContent of page 1.",
            "Content of page 2.",
            "Content of page 3.",
        )

        self._assert_page_metadata(
            documents,
            [("2024-01-01", False)] * 3,
        )

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_multiple_murlis(self, mock_pymupdfloader):
        """Each dated page starts a new Murli whose date covers its pages."""
        documents = self._load_with_pages(
            mock_pymupdfloader,
            "Date: 01.01.2024\nContent of Murli 1, page 1.",
            "Content of Murli 1, page 2.",
            "Date: 02.01.2024\nContent of Murli 2, page 1.",
            "Content of Murli 2, page 2.",
        )

        self._assert_page_metadata(
            documents,
            [
                ("2024-01-01", False),
                ("2024-01-01", False),
                ("2024-01-02", False),
                ("2024-01-02", False),
            ],
        )

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_with_avyakt(self, mock_pymupdfloader):
        """Pages from an Avyakt Murli are expected to carry ``is_avyakt``."""
        documents = self._load_with_pages(
            mock_pymupdfloader,
            "Date: 01.01.2024\nContent of Sakar Murli, page 1.",
            "Date: 02.01.2024 Avyakt Murli\nContent of Avyakt Murli, page 1.",
            "Content of Avyakt Murli, page 2.",
        )

        self._assert_page_metadata(
            documents,
            [
                ("2024-01-01", False),
                ("2024-01-02", True),
                ("2024-01-02", True),
            ],
        )

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_no_date_or_avyakt(self, mock_pymupdfloader):
        """Pages without a date or Avyakt marker get neither metadata key."""
        documents = self._load_with_pages(
            mock_pymupdfloader,
            "Content of page 1.",
            "Content of page 2.",
        )

        self._assert_page_metadata(documents, [(None, False)] * 2)

    # Decorators apply bottom-up, so the logger mock is the first injected
    # argument and the loader mock is the second.
    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    @patch('RAG_BOT.pdf_processor.logger')
    def test_load_pdf_failure(self, mock_logger, mock_pymupdfloader):
        """A loader exception yields no documents and one logged error."""
        mock_pymupdfloader.return_value.load.side_effect = Exception(
            "Failed to load"
        )

        documents = self.processor.load_pdf("test_fail.pdf")

        self.assertEqual(len(documents), 0)
        mock_logger.error.assert_called_with(
            "Failed to load PDF: test_fail.pdf. Error: Failed to load"
        )

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_empty_pdf(self, mock_pymupdfloader):
        """A loader that returns no pages results in no documents."""
        documents = self._load_with_pages(mock_pymupdfloader)

        self.assertEqual(len(documents), 0)
if __name__ == '__main__':
    # Allow running this test module directly as a script; when the module
    # is imported (e.g. by a test runner) nothing is executed here.
    unittest.main()
|