Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -330,8 +330,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
330 |
model_choice = gr.Radio(choices=[
|
331 |
"Camel-Doc-OCR-080125(v2)", "OCRFlux-3B",
|
332 |
"ViGoRL-MCTS-SFT-3B", "Behemoth-3B-070225",
|
333 |
-
"MonkeyOCR-pro-1.2B"
|
334 |
-
],
|
335 |
label="Select Model",
|
336 |
value="Camel-Doc-OCR-080125(v2)")
|
337 |
|
@@ -339,7 +338,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
339 |
gr.Markdown("> Camel-Doc-OCR-080125 is a specialized vision-language model, fine-tuned from Qwen2.5-VL-7B-Instruct, and excels at document retrieval, content extraction, and analysis recognition for both structured and unstructured digital documents. OCRFlux-3B is a 3B-parameter vision-language model optimized for high-quality OCR on PDFs and images, excelling in converting documents to clean Markdown text and supporting features like cross-page table/paragraph merging.")
|
340 |
gr.Markdown("> Both ViGoRL-MCTS-SFT-3b-Spatial and 7b-Spatial are vision-language models that use multi-turn visually grounded reinforcement learning for precise spatial reasoning and visual grounding, with the 3b and 7b variants differing mainly in their architectural size for fine-grained visual tasks.")
|
341 |
gr.Markdown("> Behemoth-3B-070225-post0.1 is an advanced 3B parameter model tailored for extensive multimodal comprehension, document parsing, and possibly generalized OCR/vision-language tasks. MonkeyOCR-pro-1.2B is a lightweight OCR model focusing on high-accuracy text extraction from images and scanned documents, suitable for resource-constrained environments.")
|
342 |
-
|
|
|
343 |
# Define the submit button actions
|
344 |
image_submit.click(fn=generate_image,
|
345 |
inputs=[
|
|
|
330 |
model_choice = gr.Radio(choices=[
|
331 |
"Camel-Doc-OCR-080125(v2)", "OCRFlux-3B",
|
332 |
"ViGoRL-MCTS-SFT-3B", "Behemoth-3B-070225",
|
333 |
+
"MonkeyOCR-pro-1.2B"],
|
|
|
334 |
label="Select Model",
|
335 |
value="Camel-Doc-OCR-080125(v2)")
|
336 |
|
|
|
338 |
gr.Markdown("> Camel-Doc-OCR-080125 is a specialized vision-language model, fine-tuned from Qwen2.5-VL-7B-Instruct, and excels at document retrieval, content extraction, and analysis recognition for both structured and unstructured digital documents. OCRFlux-3B is a 3B-parameter vision-language model optimized for high-quality OCR on PDFs and images, excelling in converting documents to clean Markdown text and supporting features like cross-page table/paragraph merging.")
|
339 |
gr.Markdown("> Both ViGoRL-MCTS-SFT-3b-Spatial and 7b-Spatial are vision-language models that use multi-turn visually grounded reinforcement learning for precise spatial reasoning and visual grounding, with the 3b and 7b variants differing mainly in their architectural size for fine-grained visual tasks.")
|
340 |
gr.Markdown("> Behemoth-3B-070225-post0.1 is an advanced 3B parameter model tailored for extensive multimodal comprehension, document parsing, and possibly generalized OCR/vision-language tasks. MonkeyOCR-pro-1.2B is a lightweight OCR model focusing on high-accuracy text extraction from images and scanned documents, suitable for resource-constrained environments.")
|
341 |
+
gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
|
342 |
+
|
343 |
# Define the submit button actions
|
344 |
image_submit.click(fn=generate_image,
|
345 |
inputs=[
|