Spaces:
Paused
Paused
import json | |
from pathlib import Path | |
import pytest | |
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader | |
from .conftest import skip_when_unstructured_pdf_not_installed | |
# Sample inputs that live in the ``resources`` directory next to this module.
input_file = Path(__file__).parent / "resources" / "table.pdf"
input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx"
@pytest.fixture
def fullocr_output():
    """Return the parsed JSON stored in ``resources/fullocr_sample_output.json``.

    Registered as a pytest fixture because ``test_ocr_reader`` requests it by
    parameter name; without the decorator pytest cannot resolve that argument.
    """
    sample_path = Path(__file__).parent / "resources" / "fullocr_sample_output.json"
    with open(sample_path, encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def mathpix_output() -> str:
    """Return the text of ``resources/policy.md``, decoded as UTF-8.

    Registered as a pytest fixture because ``test_mathpix_reader`` requests it
    by parameter name; without the decorator pytest cannot resolve that argument.
    """
    sample_path = Path(__file__).parent / "resources" / "policy.md"
    with open(sample_path, encoding="utf-8") as f:
        return f.read()
# NOTE(review): ``skip_when_unstructured_pdf_not_installed`` is imported by this
# module but is not applied to any test visible here. Confirm whether PDF-based
# tests such as this one are meant to be guarded by it.
def test_ocr_reader(fullocr_output) -> None:
    """``OCRReader`` should produce exactly two table documents for ``table.pdf``.

    The ``fullocr_output`` fixture value is handed to ``load_data`` as
    ``response_content`` (presumably so the reader does not need to perform the
    real OCR request; confirm against ``OCRReader``).
    """
    reader = OCRReader()
    documents = reader.load_data(input_file, response_content=fullocr_output)

    # Only documents whose metadata marks them as a table are counted.
    table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"]
    assert len(table_docs) == 2
def test_mathpix_reader(mathpix_output) -> None:
    """``MathpixPDFReader`` should produce exactly four table documents.

    The ``mathpix_output`` fixture value is handed to ``load_data`` as
    ``response_content`` (presumably so the reader does not need to call the
    remote service; confirm against ``MathpixPDFReader``).
    """
    reader = MathpixPDFReader()
    documents = reader.load_data(input_file, response_content=mathpix_output)

    # Only documents whose metadata marks them as a table are counted.
    table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"]
    assert len(table_docs) == 4
def test_excel_reader() -> None:
    """``PandasExcelReader`` should load ``dummy.xlsx`` as exactly one document."""
    reader = PandasExcelReader()
    documents = reader.load_data(
        input_file_excel,
    )
    assert len(documents) == 1