Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -352,7 +352,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
352 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Outpost/discussions)")
|
353 |
gr.Markdown("> Camel-Doc-OCR-080125 is a specialized vision-language model, fine-tuned from Qwen2.5-VL-7B-Instruct, and excels at document retrieval, content extraction, and analysis recognition for both structured and unstructured digital documents. OCRFlux-3B is a 3B-parameter vision-language model optimized for high-quality OCR on PDFs and images, excelling in converting documents to clean Markdown text and supporting features like cross-page table/paragraph merging.")
|
354 |
gr.Markdown("> Both ViGoRL-MCTS-SFT-3b-Spatial and 7b-Spatial are vision-language models that use multi-turn visually grounded reinforcement learning for precise spatial reasoning and visual grounding, with the 3b and 7b variants differing mainly in their architectural size for fine-grained visual tasks.")
|
355 |
-
gr.Markdown(">
|
356 |
|
357 |
# Define the submit button actions
|
358 |
image_submit.click(fn=generate_image,
|
|
|
352 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Outpost/discussions)")
|
353 |
gr.Markdown("> Camel-Doc-OCR-080125 is a specialized vision-language model, fine-tuned from Qwen2.5-VL-7B-Instruct, and excels at document retrieval, content extraction, and analysis recognition for both structured and unstructured digital documents. OCRFlux-3B is a 3B-parameter vision-language model optimized for high-quality OCR on PDFs and images, excelling in converting documents to clean Markdown text and supporting features like cross-page table/paragraph merging.")
|
354 |
gr.Markdown("> Both ViGoRL-MCTS-SFT-3b-Spatial and 7b-Spatial are vision-language models that use multi-turn visually grounded reinforcement learning for precise spatial reasoning and visual grounding, with the 3b and 7b variants differing mainly in their architectural size for fine-grained visual tasks.")
|
355 |
+
gr.Markdown("> Behemoth-3B-070225-post0.1 is an advanced 3B parameter model tailored for extensive multimodal comprehension, document parsing, and possibly generalized OCR/vision-language tasks. MonkeyOCR-pro-1.2B is a lightweight OCR model focusing on high-accuracy text extraction from images and scanned documents, suitable for resource-constrained environments.")
|
356 |
|
357 |
# Define the submit button actions
|
358 |
image_submit.click(fn=generate_image,
|