prithivMLmods committed · verified
Commit 6fcc30e · 1 Parent(s): 3594349

Update app.py

Files changed (1):
  1. app.py +9 -7
app.py CHANGED

@@ -101,13 +101,13 @@ def generate_image(model_name: str,
     """
     Generates responses using the selected model for image input.
     """
-    if model_name == "Vision-Matters-7B-Math":
+    if model_name == "Vision-Matters-7B":
         processor = processor_m
         model = model_m
     elif model_name == "ViGaL-7B":
         processor = processor_x
         model = model_x
-    elif model_name == "Visionary-R1":
+    elif model_name == "Visionary-R1-3B":
         processor = processor_o
         model = model_o
     elif model_name == "R1-Onevision-7B":
@@ -168,13 +168,13 @@ def generate_video(model_name: str,
     """
     Generates responses using the selected model for video input.
     """
-    if model_name == "Vision-Matters-7B-Math":
+    if model_name == "Vision-Matters-7B":
         processor = processor_m
         model = model_m
     elif model_name == "ViGaL-7B":
         processor = processor_x
         model = model_x
-    elif model_name == "Visionary-R1":
+    elif model_name == "Visionary-R1-3B":
         processor = processor_o
         model = model_o
     elif model_name == "R1-Onevision-7B":
@@ -330,17 +330,19 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     #download_btn = gr.Button("Download Result.md")
 
     model_choice = gr.Radio(choices=[
-        "MonkeyOCR-1.2B-0709", "ViGaL-7B", "Visionary-R1", "Vision-Matters-7B-Math", "R1-Onevision-7B"
+        "Vision-Matters-7B", "MonkeyOCR-1.2B-0709",
+        "ViGaL-7B", "Visionary-R1-3B", "R1-Onevision-7B"
     ],
         label="Select Model",
-        value="Vision-Matters-7B-Math")
+        value="Vision-Matters-7B")
 
     gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLMs-5x/discussions)")
-    gr.Markdown("> [Vision Matters 7B Math](https://huggingface.co/Yuting6/Vision-Matters-7B): vision-matters is a simple visual perturbation framework that can be easily integrated into existing post-training pipelines including sft, dpo, and grpo. our findings highlight the critical role of visual perturbation: better reasoning begins with better seeing.")
     gr.Markdown("> [MonkeyOCR-1.2B-0709](https://huggingface.co/echo840/MonkeyOCR-1.2B-0709): MonkeyOCR adopts a structure-recognition-relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
+    gr.Markdown("> [Vision Matters 7B](https://huggingface.co/Yuting6/Vision-Matters-7B): vision-matters is a simple visual perturbation framework that can be easily integrated into existing post-training pipelines including sft, dpo, and grpo. our findings highlight the critical role of visual perturbation: better reasoning begins with better seeing.")
     gr.Markdown("> [ViGaL 7B](https://huggingface.co/yunfeixie/ViGaL-7B): vigal-7b shows that training a 7b mllm on simple games like snake using reinforcement learning boosts performance on benchmarks like mathvista and mmmu without needing worked solutions or diagrams indicating transferable reasoning skills.")
     gr.Markdown("> [Visionary-R1](https://huggingface.co/maifoundations/Visionary-R1): visionary-r1 is a novel framework for training visual language models (vlms) to perform robust visual reasoning using reinforcement learning (rl). unlike traditional approaches that rely heavily on (sft) or (cot) annotations, visionary-r1 leverages only visual question-answer pairs and rl, making the process more scalable and accessible.")
     gr.Markdown("> [R1-Onevision-7B](https://huggingface.co/Fancy-MLLM/R1-Onevision-7B): r1-onevision model enhances vision-language understanding and reasoning capabilities, making it suitable for various tasks such as visual reasoning and image understanding. with its robust ability to perform multimodal reasoning, r1-onevision emerges as a powerful ai assistant capable of addressing different domains.")
+
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     # Define the submit button actions
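Each model name appears in several places in app.py (the gr.Radio choices, its default value, and the if/elif dispatch duplicated in generate_image and generate_video), so a rename like this one has to be applied in every hunk at once or selection breaks silently. Below is a minimal sketch, not the Space's actual code, of how a single registry could keep those places in sync; it assumes the processor_*/model_* pairs that app.py loads elsewhere, stubbed as strings here so the snippet runs without downloading any weights.

```python
# Illustrative sketch: one registry that the Radio choices, the default
# value, and the per-request dispatch all read from.
processor_m, model_m = "processor_m_stub", "model_m_stub"  # Vision-Matters-7B
processor_x, model_x = "processor_x_stub", "model_x_stub"  # ViGaL-7B
processor_o, model_o = "processor_o_stub", "model_o_stub"  # Visionary-R1-3B

MODEL_REGISTRY = {
    "Vision-Matters-7B": (processor_m, model_m),
    "ViGaL-7B": (processor_x, model_x),
    "Visionary-R1-3B": (processor_o, model_o),
}

def select_model(model_name: str):
    """Replaces the duplicated if/elif chains in generate_image/generate_video."""
    try:
        return MODEL_REGISTRY[model_name]
    except KeyError:
        raise ValueError(f"Invalid model selected: {model_name}") from None

choices = list(MODEL_REGISTRY)   # would feed gr.Radio(choices=...)
default = choices[0]             # would feed gr.Radio(value=...)
processor, model = select_model(default)
print(choices, processor, model)
```

With this layout, a rename such as "Visionary-R1" to "Visionary-R1-3B" is a one-key change to the dictionary instead of separate edits in every hunk.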