Spaces:
Configuration error
Configuration error
import json | |
import httpx | |
import pytest | |
import pytest_asyncio | |
from pytest_check import check | |
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60-second timeout.

    The client is opened with ``async with``, so it is closed once the
    requesting test has finished. The ``pytest_asyncio.fixture`` decorator
    is what registers this async generator as a fixture; without it pytest
    cannot inject ``async_client`` into test functions.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        yield client
@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Test convert URL to all outputs.

    Posts a conversion request for a PDF hosted on arXiv to the service at
    ``localhost:5001`` and verifies that the response carries a ``document``
    object with Markdown, JSON, HTML, text and DocTags renditions, each
    containing an expected marker substring.

    The HTTP status is a hard ``assert``; all content checks use
    ``pytest_check`` so that every failing rendition is reported in a single
    run instead of stopping at the first one.
    """
    url = "http://localhost:5001/v1alpha/convert/source"
    payload = {
        "options": {
            "from_formats": [
                "docx",
                "pptx",
                "html",
                "image",
                "pdf",
                "asciidoc",
                "md",
                "xlsx",
            ],
            "to_formats": ["md", "json", "html", "text", "doctags"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "force_ocr": False,
            "ocr_engine": "easyocr",
            "ocr_lang": ["en"],
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
    }
    # Echo the request so a failing run shows exactly what was sent.
    print(json.dumps(payload, indent=2))

    response = await async_client.post(url, json=payload)
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()

    def safe_slice(value, length=100):
        """Return at most *length* characters of *value* for failure messages.

        Non-string values are converted with ``str`` first and then
        truncated as well, so a large payload cannot flood the report.
        """
        text = value if isinstance(value, str) else str(value)
        return text[:length]

    # Document check
    check.is_in(
        "document",
        data,
        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
    )

    # Fall back to an empty mapping when the key is missing or null so the
    # per-format checks below report failures instead of raising.
    document = data.get("document") or {}
    received_keys = list(document.keys())

    # (response key, label used in messages, expected substring, renderer).
    # The renderer turns the raw content into the string that is searched;
    # None means the content is searched as-is.
    content_checks = [
        ("md_content", "Markdown", "## DocLayNet: ", None),
        ("json_content", "JSON", '{"schema_name": "DoclingDocument"', json.dumps),
        ("html_content", "HTML", '<!DOCTYPE html>\n<html lang="en">\n<head>', None),
        ("text_content", "Text", "DocLayNet: A Large Human-Annotated Dataset", None),
        (
            "doctags_content",
            "DocTags",
            "<document>\n<section_header_level_1><location>",
            None,
        ),
    ]

    for key, label, expected, render in content_checks:
        check.is_in(
            key,
            document,
            msg=f"Response should contain '{key}' key. Received keys: {received_keys}",
        )
        content = document.get(key)
        if content is None:
            # Absence was already reported by the key check above.
            continue
        haystack = content if render is None else render(content)
        check.is_in(
            expected,
            haystack,
            msg=(
                f"{label} document should contain {expected!r}. "
                f"Received: {safe_slice(haystack)}"
            ),
        )