Spaces:
Paused
Paused
File size: 1,453 Bytes
ad33df7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import json
from pathlib import Path
import pytest
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
from .conftest import skip_when_unstructured_pdf_not_installed
# PDF resource handed to the OCR and Mathpix reader tests in this module.
input_file = Path(__file__).parent.joinpath("resources", "table.pdf")
# Spreadsheet resource handed to the Excel reader test in this module.
input_file_excel = Path(__file__).parent.joinpath("resources", "dummy.xlsx")
@pytest.fixture
def fullocr_output():
    """Provide the parsed JSON from ``resources/fullocr_sample_output.json``.

    The file is read as UTF-8 text from the ``resources`` directory next to
    this module and decoded with :func:`json.loads`.
    """
    resources_dir = Path(__file__).parent / "resources"
    sample_path = resources_dir / "fullocr_sample_output.json"
    return json.loads(sample_path.read_text(encoding="utf-8"))
@pytest.fixture
def mathpix_output():
    """Provide the raw text of ``resources/policy.md``, read as UTF-8."""
    markdown_path = Path(__file__).parent / "resources" / "policy.md"
    return markdown_path.read_text(encoding="utf-8")
@skip_when_unstructured_pdf_not_installed
def test_ocr_reader(fullocr_output):
    """Check that OCRReader yields exactly two table documents.

    The ``fullocr_output`` fixture value is passed to ``load_data`` as
    ``response_content`` together with the module-level ``input_file``.
    """
    loaded = OCRReader().load_data(input_file, response_content=fullocr_output)

    # Keep only the documents whose metadata marks them as tables.
    tables = []
    for item in loaded:
        if item.metadata.get("type", "") == "table":
            tables.append(item)

    assert len(tables) == 2
def test_mathpix_reader(mathpix_output):
    """Check that MathpixPDFReader yields exactly four table documents.

    The ``mathpix_output`` fixture value is passed to ``load_data`` as
    ``response_content`` together with the module-level ``input_file``.
    """
    loaded = MathpixPDFReader().load_data(
        input_file,
        response_content=mathpix_output,
    )

    # Keep only the documents whose metadata marks them as tables.
    tables = []
    for item in loaded:
        if item.metadata.get("type", "") == "table":
            tables.append(item)

    assert len(tables) == 4
def test_excel_reader():
    """Check that PandasExcelReader returns one document for ``dummy.xlsx``."""
    excel_reader = PandasExcelReader()
    loaded = excel_reader.load_data(input_file_excel)
    assert len(loaded) == 1
|