prithivMLmods committed on
Commit b0228be · verified · 1 parent: 05336be

Update app.py

Files changed (1): app.py (+27 −13)
app.py CHANGED

@@ -77,6 +77,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the selected model for image input.
+    Yields raw text and Markdown-formatted text.
     """
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
@@ -85,11 +86,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_x
         model = model_x
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if image is None:
-        yield "Please upload an image."
+        yield "Please upload an image.", "Please upload an image."
         return
 
     messages = [{
@@ -116,7 +117,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
-        yield buffer
+        yield buffer, buffer
 
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
@@ -127,6 +128,7 @@ def generate_video(model_name: str, text: str, video_path: str,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the selected model for video input.
+    Yields raw text and Markdown-formatted text.
     """
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
@@ -135,11 +137,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_x
         model = model_x
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if video_path is None:
-        yield "Please upload a video."
+        yield "Please upload a video.", "Please upload a video."
         return
 
     frames = downsample_video(video_path)
@@ -177,7 +179,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
-        yield buffer
+        yield buffer, buffer
 
 # Define examples for image and video inference
 image_examples = [
@@ -199,11 +201,16 @@ css = """
 .submit-btn:hover {
     background-color: #3498db !important;
 }
+.canvas-output {
+    border: 2px solid #4682B4;
+    border-radius: 10px;
+    padding: 20px;
+}
 """
 
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **Qwen2.5-VL**")
+    gr.Markdown("# **[Qwen2.5-VL](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
@@ -223,33 +230,40 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
+
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
         with gr.Column():
-            output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
+            with gr.Column(elem_classes="canvas-output"):
+                gr.Markdown("## Result.Md")
+                output = gr.Textbox(label="Raw Output", interactive=False, lines=2, scale=2)
+
+                with gr.Accordion("Formatted Result", open=False):
+                    markdown_output = gr.Markdown()
+
             model_choice = gr.Radio(
                 choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct"],
                 label="Select Model",
                 value="Qwen2.5-VL-7B-Instruct"
             )
-
-            gr.Markdown("**Model Info**")
+            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")
             gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
             gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction")
-
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output
+        outputs=[output, markdown_output]
     )
     video_submit.click(
         fn=generate_video,
        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output
+        outputs=[output, markdown_output]
     )
 
 if __name__ == "__main__":
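
Taken together, the diff makes three coordinated changes: each generator now yields a (raw_text, markdown_text) pair instead of a single string, the result panel is framed by the new .canvas-output CSS class applied through elem_classes and gains a collapsible "Formatted Result" accordion, and both .click() handlers are rewired to outputs=[output, markdown_output]. In Gradio, a generator handler streams one UI update per yield, and when the event registers several outputs each yield must supply one value per component, in order. A minimal standalone sketch of that pattern (component names are illustrative, not copied from the Space's full app.py):

import time
import gradio as gr

# CSS class referenced by elem_classes below, as in the commit.
css = """
.canvas-output {
    border: 2px solid #4682B4;
    border-radius: 10px;
    padding: 20px;
}
"""

def stream_response(prompt: str):
    # Stand-in for the model loop; the real app accumulates tokens from a
    # transformers TextIteratorStreamer into buffer.
    buffer = ""
    for token in prompt.split():
        buffer += token + " "
        time.sleep(0.01)
        # Two outputs are registered, so yield a 2-tuple:
        # (value for the Textbox, value for the Markdown view).
        yield buffer, buffer

with gr.Blocks(css=css) as demo:
    query = gr.Textbox(label="Query")
    with gr.Column(elem_classes="canvas-output"):
        output = gr.Textbox(label="Raw Output", interactive=False)
        with gr.Accordion("Formatted Result", open=False):
            markdown_output = gr.Markdown()
    query.submit(fn=stream_response, inputs=query,
                 outputs=[output, markdown_output])

if __name__ == "__main__":
    demo.launch()

Because the same buffer is sent to both components, the Textbox shows the literal streamed text while the Markdown component renders any formatting the model emits; the commit applies this same dual wiring to both generate_image and generate_video.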