Spaces:
Configuration error
Configuration error
File size: 4,614 Bytes
22bc712 16d905e 22bc712 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import json
import os
import httpx
import pytest
import pytest_asyncio
from pytest_check import check
@pytest_asyncio.fixture
async def async_client():
async with httpx.AsyncClient(timeout=60.0) as client:
yield client
@pytest.mark.asyncio
async def test_convert_file(async_client):
"""Test convert single file to all outputs"""
url = "http://localhost:5001/v1alpha/convert/file"
options = {
"from_formats": [
"docx",
"pptx",
"html",
"image",
"pdf",
"asciidoc",
"md",
"xlsx",
],
"to_formats": ["md", "json", "html", "text", "doctags"],
"image_export_mode": "placeholder",
"ocr": True,
"force_ocr": False,
"ocr_engine": "easyocr",
"ocr_lang": ["en"],
"pdf_backend": "dlparse_v2",
"table_mode": "fast",
"abort_on_error": False,
"return_as_file": False,
}
current_dir = os.path.dirname(__file__)
file_path = os.path.join(current_dir, "2206.01062v1.pdf")
files = {
"files": ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf"),
}
response = await async_client.post(
url, files=files, data={"options": json.dumps(options)}
)
assert response.status_code == 200, "Response should be 200 OK"
data = response.json()
# Response content checks
# Helper function to safely slice strings
def safe_slice(value, length=100):
if isinstance(value, str):
return value[:length]
return str(value) # Convert non-string values to string for debug purposes
# Document check
check.is_in(
"document",
data,
msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
)
# MD check
check.is_in(
"md_content",
data.get("document", {}),
msg=f"Response should contain 'md_content' key. Received keys: {list(data.get('document', {}).keys())}",
)
if data.get("document", {}).get("md_content") is not None:
check.is_in(
"## DocLayNet: ",
data["document"]["md_content"],
msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(data['document']['md_content'])}",
)
# JSON check
check.is_in(
"json_content",
data.get("document", {}),
msg=f"Response should contain 'json_content' key. Received keys: {list(data.get('document', {}).keys())}",
)
if data.get("document", {}).get("json_content") is not None:
check.is_in(
'{"schema_name": "DoclingDocument"',
json.dumps(data["document"]["json_content"]),
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
)
# HTML check
check.is_in(
"html_content",
data.get("document", {}),
msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
)
if data.get("document", {}).get("html_content") is not None:
check.is_in(
'<!DOCTYPE html>\n<html lang="en">\n<head>',
data["document"]["html_content"],
msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
)
# Text check
check.is_in(
"text_content",
data.get("document", {}),
msg=f"Response should contain 'text_content' key. Received keys: {list(data.get('document', {}).keys())}",
)
if data.get("document", {}).get("text_content") is not None:
check.is_in(
"DocLayNet: A Large Human-Annotated Dataset",
data["document"]["text_content"],
msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(data['document']['text_content'])}",
)
# DocTags check
check.is_in(
"doctags_content",
data.get("document", {}),
msg=f"Response should contain 'doctags_content' key. Received keys: {list(data.get('document', {}).keys())}",
)
if data.get("document", {}).get("doctags_content") is not None:
check.is_in(
"<document>\n<section_header_level_1><location>",
data["document"]["doctags_content"],
msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
)
|