File size: 6,107 Bytes
3f61806
4623a33
 
 
3f61806
 
 
 
4623a33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f61806
4623a33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f61806
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import unittest
from unittest.mock import patch, MagicMock
from RAG_BOT.pdf_processor import PdfProcessor # Import the class
from langchain_core.documents import Document
from datetime import datetime

class TestPDFProcessor(unittest.TestCase):
    """Tests for ``PdfProcessor.load_pdf`` with ``PyMuPDFLoader`` patched out.

    Every test hands a fixed list of fake pages (or an error) to the patched
    loader and then checks the ``date`` / ``is_avyakt`` metadata on the
    documents that ``load_pdf`` returns.
    """

    @staticmethod
    def _fake_pages(*texts):
        """Return one ``Document`` per text, numbered from page 0, source test.pdf."""
        return [
            Document(page_content=text, metadata={"page": number, "source": "test.pdf"})
            for number, text in enumerate(texts)
        ]

    def setUp(self):
        """Give each test its own PdfProcessor instance."""
        self.processor = PdfProcessor()

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_single_murli(self, mock_pymupdfloader):
        """A date on the first page only is expected on all three pages."""
        mock_pymupdfloader.return_value.load.return_value = self._fake_pages(
            "Date: 01.01.2024\nContent of page 1.",
            "Content of page 2.",
            "Content of page 3.",
        )

        documents = self.processor.load_pdf("test.pdf")

        self.assertEqual(len(documents), 3)
        # Dates are checked for every page first, then the absence of the flag,
        # so the first reported failure matches the per-index original.
        for doc in documents:
            self.assertEqual(doc.metadata.get("date"), "2024-01-01")
        for doc in documents:
            self.assertNotIn("is_avyakt", doc.metadata)

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_multiple_murlis(self, mock_pymupdfloader):
        """Two dated sections: undated pages are expected to carry the preceding date."""
        mock_pymupdfloader.return_value.load.return_value = self._fake_pages(
            "Date: 01.01.2024\nContent of Murli 1, page 1.",
            "Content of Murli 1, page 2.",
            "Date: 02.01.2024\nContent of Murli 2, page 1.",
            "Content of Murli 2, page 2.",
        )

        documents = self.processor.load_pdf("test.pdf")

        self.assertEqual(len(documents), 4)
        expected_dates = ("2024-01-01", "2024-01-01", "2024-01-02", "2024-01-02")
        for doc, expected in zip(documents, expected_dates):
            self.assertEqual(doc.metadata.get("date"), expected)
        for doc in documents:
            self.assertNotIn("is_avyakt", doc.metadata)

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_with_avyakt(self, mock_pymupdfloader):
        """The page mentioning "Avyakt Murli" and the page after it are expected to be flagged."""
        mock_pymupdfloader.return_value.load.return_value = self._fake_pages(
            "Date: 01.01.2024\nContent of Sakar Murli, page 1.",
            "Date: 02.01.2024 Avyakt Murli\nContent of Avyakt Murli, page 1.",
            "Content of Avyakt Murli, page 2.",
        )

        documents = self.processor.load_pdf("test.pdf")

        self.assertEqual(len(documents), 3)
        # Safe to unpack: the length was asserted just above.
        sakar_doc, avyakt_first, avyakt_second = documents
        self.assertEqual(sakar_doc.metadata.get("date"), "2024-01-01")
        self.assertNotIn("is_avyakt", sakar_doc.metadata)
        self.assertEqual(avyakt_first.metadata.get("date"), "2024-01-02")
        self.assertTrue(avyakt_first.metadata.get("is_avyakt"))
        self.assertEqual(avyakt_second.metadata.get("date"), "2024-01-02")
        self.assertTrue(avyakt_second.metadata.get("is_avyakt"))

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_no_date_or_avyakt(self, mock_pymupdfloader):
        """Pages with neither a date nor an Avyakt marker should gain no such metadata."""
        mock_pymupdfloader.return_value.load.return_value = self._fake_pages(
            "Content of page 1.",
            "Content of page 2.",
        )

        documents = self.processor.load_pdf("test.pdf")

        self.assertEqual(len(documents), 2)
        for doc in documents:
            self.assertNotIn("date", doc.metadata)
            self.assertNotIn("is_avyakt", doc.metadata)

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    @patch('RAG_BOT.pdf_processor.logger')
    def test_load_pdf_failure(self, mock_logger, mock_pymupdfloader):
        """A loader exception should yield no documents and one specific error log call."""
        # Patches are injected bottom-up: the logger mock arrives first.
        loader_instance = mock_pymupdfloader.return_value
        loader_instance.load.side_effect = Exception("Failed to load")

        documents = self.processor.load_pdf("test_fail.pdf")

        self.assertEqual(len(documents), 0)
        expected_message = "Failed to load PDF: test_fail.pdf. Error: Failed to load"
        mock_logger.error.assert_called_with(expected_message)

    @patch('RAG_BOT.pdf_processor.PyMuPDFLoader')
    def test_load_pdf_multi_murli_empty_pdf(self, mock_pymupdfloader):
        """A loader that returns no pages should produce no documents."""
        mock_pymupdfloader.return_value.load.return_value = self._fake_pages()

        documents = self.processor.load_pdf("test.pdf")

        self.assertEqual(len(documents), 0)

# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()