File size: 1,453 Bytes
ad33df7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import json
from pathlib import Path

import pytest

from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader

from .conftest import skip_when_unstructured_pdf_not_installed

# Sample documents shipped in the "resources" directory next to this test module.
input_file = Path(__file__).parent.joinpath("resources", "table.pdf")
input_file_excel = Path(__file__).parent.joinpath("resources", "dummy.xlsx")


@pytest.fixture
def fullocr_output():
    """Return the parsed JSON stored in ``resources/fullocr_sample_output.json``.

    The file is read as UTF-8 text from the ``resources`` directory that sits
    next to this test module.
    """
    sample_path = Path(__file__).parent / "resources" / "fullocr_sample_output.json"
    return json.loads(sample_path.read_text(encoding="utf-8"))


@pytest.fixture
def mathpix_output():
    """Return the full text of ``resources/policy.md``, decoded as UTF-8.

    The file is looked up in the ``resources`` directory that sits next to
    this test module.
    """
    markdown_path = Path(__file__).parent / "resources" / "policy.md"
    return markdown_path.read_text(encoding="utf-8")


@skip_when_unstructured_pdf_not_installed
def test_ocr_reader(fullocr_output):
    """Check that ``OCRReader`` yields two table documents for the sample PDF.

    The reader is given the sample OCR payload through ``response_content``
    (presumably so no real OCR request is needed -- confirm against the
    reader implementation).
    """
    loaded = OCRReader().load_data(input_file, response_content=fullocr_output)
    # Count only the documents whose metadata marks them as tables.
    table_count = sum(
        1 for item in loaded if item.metadata.get("type", "") == "table"
    )
    assert table_count == 2


def test_mathpix_reader(mathpix_output):
    """Check that ``MathpixPDFReader`` yields four table documents.

    The reader is given the sample markdown through ``response_content``
    (presumably in place of a real Mathpix response -- confirm against the
    reader implementation).
    """
    loaded = MathpixPDFReader().load_data(
        input_file, response_content=mathpix_output
    )
    # Keep only the documents whose metadata marks them as tables.
    tables = []
    for item in loaded:
        if item.metadata.get("type", "") == "table":
            tables.append(item)
    assert len(tables) == 4


def test_excel_reader():
    """Check that ``PandasExcelReader`` returns exactly one document for the sample workbook."""
    loaded = PandasExcelReader().load_data(input_file_excel)
    assert len(loaded) == 1