prithivMLmods committed on
Commit
a783606
·
verified ·
1 Parent(s): c8ef24f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -330,8 +330,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
330
  model_choice = gr.Radio(choices=[
331
  "Camel-Doc-OCR-080125(v2)", "OCRFlux-3B",
332
  "ViGoRL-MCTS-SFT-3B", "Behemoth-3B-070225",
333
- "MonkeyOCR-pro-1.2B"
334
- ],
335
  label="Select Model",
336
  value="Camel-Doc-OCR-080125(v2)")
337
 
@@ -339,7 +338,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
339
  gr.Markdown("> Camel-Doc-OCR-080125 is a specialized vision-language model, fine-tuned from Qwen2.5-VL-7B-Instruct, and excels at document retrieval, content extraction, and analysis recognition for both structured and unstructured digital documents. OCRFlux-3B is a 3B-parameter vision-language model optimized for high-quality OCR on PDFs and images, excelling in converting documents to clean Markdown text and supporting features like cross-page table/paragraph merging.")
340
  gr.Markdown("> Both ViGoRL-MCTS-SFT-3b-Spatial and 7b-Spatial are vision-language models that use multi-turn visually grounded reinforcement learning for precise spatial reasoning and visual grounding, with the 3b and 7b variants differing mainly in their architectural size for fine-grained visual tasks.")
341
  gr.Markdown("> Behemoth-3B-070225-post0.1 is an advanced 3B parameter model tailored for extensive multimodal comprehension, document parsing, and possibly generalized OCR/vision-language tasks. MonkeyOCR-pro-1.2B is a lightweight OCR model focusing on high-accuracy text extraction from images and scanned documents, suitable for resource-constrained environments.")
342
-
 
343
  # Define the submit button actions
344
  image_submit.click(fn=generate_image,
345
  inputs=[
 
330
  model_choice = gr.Radio(choices=[
331
  "Camel-Doc-OCR-080125(v2)", "OCRFlux-3B",
332
  "ViGoRL-MCTS-SFT-3B", "Behemoth-3B-070225",
333
+ "MonkeyOCR-pro-1.2B"],
 
334
  label="Select Model",
335
  value="Camel-Doc-OCR-080125(v2)")
336
 
 
338
  gr.Markdown("> Camel-Doc-OCR-080125 is a specialized vision-language model, fine-tuned from Qwen2.5-VL-7B-Instruct, and excels at document retrieval, content extraction, and analysis recognition for both structured and unstructured digital documents. OCRFlux-3B is a 3B-parameter vision-language model optimized for high-quality OCR on PDFs and images, excelling in converting documents to clean Markdown text and supporting features like cross-page table/paragraph merging.")
339
  gr.Markdown("> Both ViGoRL-MCTS-SFT-3b-Spatial and 7b-Spatial are vision-language models that use multi-turn visually grounded reinforcement learning for precise spatial reasoning and visual grounding, with the 3b and 7b variants differing mainly in their architectural size for fine-grained visual tasks.")
340
  gr.Markdown("> Behemoth-3B-070225-post0.1 is an advanced 3B parameter model tailored for extensive multimodal comprehension, document parsing, and possibly generalized OCR/vision-language tasks. MonkeyOCR-pro-1.2B is a lightweight OCR model focusing on high-accuracy text extraction from images and scanned documents, suitable for resource-constrained environments.")
341
+ gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
342
+
343
  # Define the submit button actions
344
  image_submit.click(fn=generate_image,
345
  inputs=[