ajforbes20 committed
Commit 74d0e21 · verified · 1 Parent(s): b09c3a4

Update app.py

Files changed (1)
app.py  +110 -246
app.py CHANGED
@@ -1,7 +1,5 @@
import os
- import random
import uuid
- import json
import time
import asyncio
from threading import Thread
@@ -22,317 +20,183 @@ from transformers import (
)
from transformers.image_utils import load_image

- #theme:custom
- #custom_theme = gr.themes.Base(
- #    primary_hue="indigo",
- #    secondary_hue="violet",
- #    neutral_hue="gray"
- #).set(
- #    body_background_fill="#f7f5fa",
- #    body_text_color="#1f1f1f",
- #    input_background_fill="#ffffff",
- #    button_primary_background_fill="#8b5cf6",
- #    button_primary_text_color="#ffffff",
- #    button_secondary_background_fill="#e0d7f5",
- #    button_secondary_text_color="#1f1f1f",
- #    shadow_spread="sm"
- #)
-
- # Constants for text generation
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

- # Load Nanonets-OCR-s
MODEL_ID_V = "nanonets/Nanonets-OCR-s"
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_V,
    trust_remote_code=True,
-     torch_dtype=torch.float16
).to(device).eval()

- # Load Qwen2-VL-OCR-2B-Instruct
MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_X,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
).to(device).eval()

- # Load Aya-Vision-8b
- MODEL_ID_A = "CohereForAI/aya-vision-8b"
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
- model_a = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_A,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
).to(device).eval()

- # Load Lh41-1042-Magellanic-7B-0711
MODEL_ID_W = "prithivMLmods/Lh41-1042-Magellanic-7B-0711"
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_W,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- # Load RolmOCR
- MODEL_ID_M = "reducto/RolmOCR"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_M,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
).to(device).eval()

def downsample_video(video_path):
-     """
-     Downsamples the video to evenly spaced frames.
-     Each frame is returned as a PIL image along with its timestamp.
-     """
    vidcap = cv2.VideoCapture(video_path)
-     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
-     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-     for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-         success, image = vidcap.read()
-         if success:
-             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-             pil_image = Image.fromarray(image)
-             timestamp = round(i / fps, 2)
-             frames.append((pil_image, timestamp))
    vidcap.release()
    return frames

@spaces.GPU
- def generate_image(model_name: str, text: str, image: Image.Image,
-                    max_new_tokens: int = 1024,
-                    temperature: float = 0.6,
-                    top_p: float = 0.9,
-                    top_k: int = 50,
-                    repetition_penalty: float = 1.2):
-     """
-     Generates responses using the selected model for image input.
-     Yields raw text and Markdown-formatted text.
-     """
-     if model_name == "RolmOCR-7B":
-         processor = processor_m
-         model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
-         processor = processor_x
-         model = model_x
-     elif model_name == "Nanonets-OCR-s":
-         processor = processor_v
-         model = model_v
-     elif model_name == "Aya-Vision-8B":
-         processor = processor_a
-         model = model_a
-     elif model_name == "Lh41-1042-Magellanic-7B-0711":
-         processor = processor_w
-         model = model_w
-     else:
-         yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
-         yield "Please upload an image.", "Please upload an image."
        return

-     messages = [{
-         "role": "user",
-         "content": [
-             {"type": "image", "image": image},
-             {"type": "text", "text": text},
-         ]
-     }]
-     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = processor(
-         text=[prompt_full],
-         images=[image],
-         return_tensors="pt",
-         padding=True,
-         truncation=False,
-         max_length=MAX_INPUT_TOKEN_LENGTH
-     ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
-         yield buffer, buffer

@spaces.GPU
- def generate_video(model_name: str, text: str, video_path: str,
-                    max_new_tokens: int = 1024,
-                    temperature: float = 0.6,
-                    top_p: float = 0.9,
-                    top_k: int = 50,
-                    repetition_penalty: float = 1.2):
-     """
-     Generates responses using the selected model for video input.
-     Yields raw text and Markdown-formatted text.
-     """
-     if model_name == "RolmOCR-7B":
-         processor = processor_m
-         model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
-         processor = processor_x
-         model = model_x
-     elif model_name == "Nanonets-OCR-s":
-         processor = processor_v
-         model = model_v
-     elif model_name == "Aya-Vision-8B":
-         processor = processor_a
-         model = model_a
-     elif model_name == "Lh41-1042-Magellanic-7B-0711":
-         processor = processor_w
-         model = model_w
-     else:
-         yield "Invalid model selected.", "Invalid model selected."
        return

    if video_path is None:
-         yield "Please upload a video.", "Please upload a video."
        return

    frames = downsample_video(video_path)
-     messages = [
-         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-         {"role": "user", "content": [{"type": "text", "text": text}]}
-     ]
-     for frame in frames:
-         image, timestamp = frame
-         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-         messages[1]["content"].append({"type": "image", "image": image})
-     inputs = processor.apply_chat_template(
-         messages,
-         tokenize=True,
-         add_generation_prompt=True,
-         return_dict=True,
-         return_tensors="pt",
-         truncation=False,
-         max_length=MAX_INPUT_TOKEN_LENGTH
-     ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = {
-         **inputs,
-         "streamer": streamer,
-         "max_new_tokens": max_new_tokens,
-         "do_sample": True,
-         "temperature": temperature,
-         "top_p": top_p,
-         "top_k": top_k,
-         "repetition_penalty": repetition_penalty,
-     }
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
-         yield buffer, buffer

- # Define examples for image and video inference
image_examples = [
-     ["Extract the content", "images/4.png"],
-     ["Explain the scene", "images/3.jpg"],
-     ["Convert this page to doc [table] precisely for markdown.", "images/0.png"],
-     ["Perform OCR on the Image.", "images/1.jpg"],
-     ["Extract the table content", "images/2.png"]
]
-
video_examples = [
    ["Explain the Ad in Detail", "videos/1.mp4"],
-     ["Identify the main actions in the cartoon video", "videos/2.mp4"]
]

css = """
- .submit-btn {
-     background-color: #2980b9 !important;
-     color: white !important;
- }
- .submit-btn:hover {
-     background-color: #3498db !important;
- }
- .canvas-output {
-     border: 2px solid #4682B4;
-     border-radius: 10px;
-     padding: 20px;
- }
"""

- # Create the Gradio Interface
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-     gr.Markdown("# **[Multimodal OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
-                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                     image_upload = gr.Image(type="pil", label="Image")
-                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
-                     gr.Examples(
-                         examples=image_examples,
-                         inputs=[image_query, image_upload]
-                     )
                with gr.TabItem("Video Inference"):
-                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                     video_upload = gr.Video(label="Video")
-                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
-                     gr.Examples(
-                         examples=video_examples,
-                         inputs=[video_query, video_upload]
-                     )
-             with gr.Accordion("Advanced options", open=False):
-                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
-         with gr.Column():
-             with gr.Column(elem_classes="canvas-output"):
-                 gr.Markdown("## Output")
-                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
-                 #format[ft.md]
-                 with gr.Accordion("(Result.md)", open=False):
-                     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
-             model_choice = gr.Radio(
-                 choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B", "RolmOCR-7B",
-                          "Lh41-1042-Magellanic-7B-0711", "Aya-Vision-8B"],
-                 label="Select Model",
-                 value="Nanonets-OCR-s"
-             )
-             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
-             gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-             gr.Markdown("> [Lh41-1042-Magellanic-7B-0711](https://huggingface.co/prithivMLmods/Lh41-1042-Magellanic-7B-0711): lh41-1042-magellanic-7b-0711 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for image captioning, visual analysis, and image reasoning. built on top of the qwen2.5-vl, this experimental model enhances visual comprehension, focused training on 3,000k image pairs for superior image understanding")
-             gr.Markdown("> [Qwen2-VL-OCR-2B](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
-             gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
-             gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
-             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
-     image_submit.click(
-         fn=generate_image,
-         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[output, markdown_output]
-     )
-     video_submit.click(
-         fn=generate_video,
-         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[output, markdown_output]
-     )

if __name__ == "__main__":
-     demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
import os
import uuid
import time
import asyncio
from threading import Thread

)
from transformers.image_utils import load_image

+ # Constants
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+ # Load public OCR models
+
MODEL_ID_V = "nanonets/Nanonets-OCR-s"
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_V,
    trust_remote_code=True,
+     torch_dtype=torch.bfloat16
).to(device).eval()

MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_X, trust_remote_code=True, torch_dtype=torch.bfloat16
).to(device).eval()

+ MODEL_ID_M = "reducto/RolmOCR"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M, trust_remote_code=True, torch_dtype=torch.bfloat16
).to(device).eval()

MODEL_ID_W = "prithivMLmods/Lh41-1042-Magellanic-7B-0711"
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_W, trust_remote_code=True, torch_dtype=torch.bfloat16
).to(device).eval()
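All four checkpoints above are loaded in bfloat16 and moved to device, which resolves to the CPU when no GPU is visible. A minimal sketch of a dtype guard for that CPU fallback, reusing the file's own MODEL_ID_V and device; this is an editor's illustration, not a line from the commit:

# Sketch only: bf16 on GPU, fp32 on CPU, then load exactly as the commit does.
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_V, trust_remote_code=True, torch_dtype=dtype
).to(device).eval()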

def downsample_video(video_path):
    vidcap = cv2.VideoCapture(video_path)
+     total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
+     for i in np.linspace(0, total - 1, 10, dtype=int):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         ok, img = vidcap.read()
+         if ok:
+             img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+             frames.append((Image.fromarray(img), round(i / fps, 2)))
    vidcap.release()
    return frames
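For reference, the helper above returns ten evenly spaced (PIL.Image, timestamp-in-seconds) pairs. A usage sketch against the bundled example clip; the printed values are illustrative and depend on the video:

frames = downsample_video("videos/1.mp4")   # ten (frame, timestamp) pairs
for img, ts in frames:
    print(img.size, ts)                     # e.g. (1280, 720) 3.2 -- actual values vary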

@spaces.GPU
+ def generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+     mapping = {
+         "Nanonets-OCR-s": (processor_v, model_v),
+         "Qwen2-VL-OCR-2B": (processor_x, model_x),
+         "RolmOCR-7B": (processor_m, model_m),
+         "Lh41-1042-Magellanic-7B-0711": (processor_w, model_w),
+     }
+     if model_name not in mapping:
+         yield "Invalid model selected.", "Invalid model."
        return

+     processor, model = mapping[model_name]
    if image is None:
+         yield "Please upload an image.", ""
        return

+     msg = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
+     prompt = processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
+     inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     thread = Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens})
    thread.start()
+
+     out = ""
+     for token in streamer:
+         out += token.replace("<|im_end|>", "")
        time.sleep(0.01)
+         yield out, out
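generate_image (and generate_video below) stream output with the same idiom: model.generate runs on a background thread while TextIteratorStreamer yields decoded text as it arrives. A minimal, self-contained sketch of that idiom using a small text-only checkpoint; the model name is a placeholder, not one of the OCR models loaded above:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")                      # placeholder checkpoint
lm = AutoModelForCausalLM.from_pretrained("gpt2")
enc = tok("Transcribe the sign:", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
Thread(target=lm.generate, kwargs={**enc, "streamer": streamer, "max_new_tokens": 32}).start()
text = ""
for piece in streamer:                                           # pieces arrive while generation runs
    text += piece
print(text)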

@spaces.GPU
+ def generate_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+     mapping = {
+         "Nanonets-OCR-s": (processor_v, model_v),
+         "Qwen2-VL-OCR-2B": (processor_x, model_x),
+         "RolmOCR-7B": (processor_m, model_m),
+         "Lh41-1042-Magellanic-7B-0711": (processor_w, model_w),
+     }
+     if model_name not in mapping:
+         yield "Invalid model selected.", "Invalid model."
        return

+     processor, model = mapping[model_name]
    if video_path is None:
+         yield "Please upload a video.", ""
        return

    frames = downsample_video(video_path)
+     messages = [{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                 {"role": "user", "content": [{"type": "text", "text": text}]}]
+     for img, ts in frames:
+         messages[1]["content"].append({"type": "text", "text": f"Frame {ts}:"})
+         messages[1]["content"].append({"type": "image", "image": img})
+
+     inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True,
+                                            return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     thread = Thread(target=model.generate, kwargs={**inputs,
+                                                    "streamer": streamer,
+                                                    "max_new_tokens": max_new_tokens,
+                                                    "temperature": temperature,
+                                                    "top_p": top_p,
+                                                    "top_k": top_k,
+                                                    "repetition_penalty": repetition_penalty})
    thread.start()
+     out = ""
+     for token in streamer:
+         out += token.replace("<|im_end|>", "")
        time.sleep(0.01)
+         yield out, out
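For reference, with two sampled frames the messages list assembled above takes the shape below; the query text, timestamps, and frame images are illustrative stand-ins, not values from the commit:

frame_0 = frame_1 = Image.new("RGB", (640, 360))                 # stand-ins for real sampled frames
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "text", "text": "Explain the Ad in Detail"},
        {"type": "text", "text": "Frame 0.0:"}, {"type": "image", "image": frame_0},
        {"type": "text", "text": "Frame 3.2:"}, {"type": "image", "image": frame_1},
    ]},
]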

+ # Examples
image_examples = [
+     ["Extract the content", "images/4.png"],
+     ["Explain the scene", "images/3.jpg"],
+     ["Perform OCR on the image", "images/1.jpg"],
]
video_examples = [
    ["Explain the Ad in Detail", "videos/1.mp4"],
]

css = """
+ .submit-btn { background-color: #2980b9 !important; color: white !important; }
+ .submit-btn:hover { background-color: #3498db !important; }
+ .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
"""

with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+     gr.Markdown("# **Multimodal OCR**")
    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
+                     img_q = gr.Textbox(label="Query Input", placeholder="Enter prompt")
+                     img_up = gr.Image(type="pil", label="Upload Image")
+                     img_btn = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(examples=image_examples, inputs=[img_q, img_up])
                with gr.TabItem("Video Inference"):
+                     vid_q = gr.Textbox(label="Query Input")
+                     vid_up = gr.Video(label="Upload Video")
+                     vid_btn = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(examples=video_examples, inputs=[vid_q, vid_up])
+         with gr.Column(elem_classes="canvas-output"):
+             gr.Markdown("## Output")
+             out_raw = gr.Textbox(interactive=False, lines=2, show_copy_button=True)
+             with gr.Accordion("Formatted Output", open=False):
+                 out_md = gr.Markdown()
+
+     model_choice = gr.Radio(
+         choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B", "RolmOCR-7B", "Lh41-1042-Magellanic-7B-0711"],
+         label="Select Model",
+         value="Nanonets-OCR-s"
+     )
+
+     img_btn.click(generate_image, inputs=[model_choice, img_q, img_up,
+                   gr.Slider(1, MAX_MAX_NEW_TOKENS, value=DEFAULT_MAX_NEW_TOKENS),
+                   gr.Slider(0.1,4.0,value=0.6),
+                   gr.Slider(0.05,1.0,value=0.9),
+                   gr.Slider(1,1000,value=50),
+                   gr.Slider(1.0,2.0,value=1.2)],
+                   outputs=[out_raw, out_md])
+
+     vid_btn.click(generate_video, inputs=[model_choice, vid_q, vid_up,
+                   gr.Slider(1, MAX_MAX_NEW_TOKENS, value=DEFAULT_MAX_NEW_TOKENS),
+                   gr.Slider(0.1,4.0,value=0.6),
+                   gr.Slider(0.05,1.0,value=0.9),
+                   gr.Slider(1,1000,value=50),
+                   gr.Slider(1.0,2.0,value=1.2)],
+                   outputs=[out_raw, out_md])

if __name__ == "__main__":
+     demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)