Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
import os
|
2 |
-
import random
|
3 |
import uuid
|
4 |
-
import json
|
5 |
import time
|
6 |
import asyncio
|
7 |
from threading import Thread
|
@@ -22,317 +20,183 @@ from transformers import (
|
|
22 |
)
|
23 |
from transformers.image_utils import load_image
|
24 |
|
25 |
-
#
|
26 |
-
#custom_theme = gr.themes.Base(
|
27 |
-
# primary_hue="indigo",
|
28 |
-
# secondary_hue="violet",
|
29 |
-
# neutral_hue="gray"
|
30 |
-
#).set(
|
31 |
-
# body_background_fill="#f7f5fa",
|
32 |
-
# body_text_color="#1f1f1f",
|
33 |
-
# input_background_fill="#ffffff",
|
34 |
-
# button_primary_background_fill="#8b5cf6",
|
35 |
-
# button_primary_text_color="#ffffff",
|
36 |
-
# button_secondary_background_fill="#e0d7f5",
|
37 |
-
# button_secondary_text_color="#1f1f1f",
|
38 |
-
# shadow_spread="sm"
|
39 |
-
#)
|
40 |
-
|
41 |
-
# Constants for text generation
|
42 |
MAX_MAX_NEW_TOKENS = 2048
|
43 |
DEFAULT_MAX_NEW_TOKENS = 1024
|
44 |
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
|
45 |
-
|
46 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
47 |
|
48 |
-
# Load
|
|
|
49 |
MODEL_ID_V = "nanonets/Nanonets-OCR-s"
|
50 |
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
|
51 |
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
52 |
MODEL_ID_V,
|
53 |
trust_remote_code=True,
|
54 |
-
torch_dtype=torch.
|
55 |
).to(device).eval()
|
56 |
|
57 |
-
# Load Qwen2-VL-OCR-2B-Instruct
|
58 |
MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
59 |
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
|
60 |
model_x = Qwen2VLForConditionalGeneration.from_pretrained(
|
61 |
-
MODEL_ID_X,
|
62 |
-
trust_remote_code=True,
|
63 |
-
torch_dtype=torch.float16
|
64 |
).to(device).eval()
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
MODEL_ID_A,
|
71 |
-
trust_remote_code=True,
|
72 |
-
torch_dtype=torch.float16
|
73 |
).to(device).eval()
|
74 |
|
75 |
-
# Load Lh41-1042-Magellanic-7B-0711
|
76 |
MODEL_ID_W = "prithivMLmods/Lh41-1042-Magellanic-7B-0711"
|
77 |
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
|
78 |
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
79 |
-
MODEL_ID_W,
|
80 |
-
trust_remote_code=True,
|
81 |
-
torch_dtype=torch.float16
|
82 |
-
).to(device).eval()
|
83 |
-
|
84 |
-
# Load RolmOCR
|
85 |
-
MODEL_ID_M = "reducto/RolmOCR"
|
86 |
-
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
|
87 |
-
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
88 |
-
MODEL_ID_M,
|
89 |
-
trust_remote_code=True,
|
90 |
-
torch_dtype=torch.float16
|
91 |
).to(device).eval()
|
92 |
|
93 |
def downsample_video(video_path):
|
94 |
-
"""
|
95 |
-
Downsamples the video to evenly spaced frames.
|
96 |
-
Each frame is returned as a PIL image along with its timestamp.
|
97 |
-
"""
|
98 |
vidcap = cv2.VideoCapture(video_path)
|
99 |
-
|
100 |
fps = vidcap.get(cv2.CAP_PROP_FPS)
|
101 |
frames = []
|
102 |
-
|
103 |
-
for i in frame_indices:
|
104 |
vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
|
105 |
-
|
106 |
-
if
|
107 |
-
|
108 |
-
|
109 |
-
timestamp = round(i / fps, 2)
|
110 |
-
frames.append((pil_image, timestamp))
|
111 |
vidcap.release()
|
112 |
return frames
|
113 |
|
114 |
@spaces.GPU
|
115 |
-
def generate_image(model_name
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
"""
|
125 |
-
if model_name == "RolmOCR-7B":
|
126 |
-
processor = processor_m
|
127 |
-
model = model_m
|
128 |
-
elif model_name == "Qwen2-VL-OCR-2B":
|
129 |
-
processor = processor_x
|
130 |
-
model = model_x
|
131 |
-
elif model_name == "Nanonets-OCR-s":
|
132 |
-
processor = processor_v
|
133 |
-
model = model_v
|
134 |
-
elif model_name == "Aya-Vision-8B":
|
135 |
-
processor = processor_a
|
136 |
-
model = model_a
|
137 |
-
elif model_name == "Lh41-1042-Magellanic-7B-0711":
|
138 |
-
processor = processor_w
|
139 |
-
model = model_w
|
140 |
-
else:
|
141 |
-
yield "Invalid model selected.", "Invalid model selected."
|
142 |
return
|
143 |
|
|
|
144 |
if image is None:
|
145 |
-
yield "Please upload an image.", "
|
146 |
return
|
147 |
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
{"type": "image", "image": image},
|
152 |
-
{"type": "text", "text": text},
|
153 |
-
]
|
154 |
-
}]
|
155 |
-
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
156 |
-
inputs = processor(
|
157 |
-
text=[prompt_full],
|
158 |
-
images=[image],
|
159 |
-
return_tensors="pt",
|
160 |
-
padding=True,
|
161 |
-
truncation=False,
|
162 |
-
max_length=MAX_INPUT_TOKEN_LENGTH
|
163 |
-
).to(device)
|
164 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
165 |
-
|
166 |
-
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
167 |
thread.start()
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
time.sleep(0.01)
|
173 |
-
yield
|
174 |
|
175 |
@spaces.GPU
|
176 |
-
def generate_video(model_name
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
"""
|
186 |
-
if model_name == "RolmOCR-7B":
|
187 |
-
processor = processor_m
|
188 |
-
model = model_m
|
189 |
-
elif model_name == "Qwen2-VL-OCR-2B":
|
190 |
-
processor = processor_x
|
191 |
-
model = model_x
|
192 |
-
elif model_name == "Nanonets-OCR-s":
|
193 |
-
processor = processor_v
|
194 |
-
model = model_v
|
195 |
-
elif model_name == "Aya-Vision-8B":
|
196 |
-
processor = processor_a
|
197 |
-
model = model_a
|
198 |
-
elif model_name == "Lh41-1042-Magellanic-7B-0711":
|
199 |
-
processor = processor_w
|
200 |
-
model = model_w
|
201 |
-
else:
|
202 |
-
yield "Invalid model selected.", "Invalid model selected."
|
203 |
return
|
204 |
|
|
|
205 |
if video_path is None:
|
206 |
-
yield "Please upload a video.", "
|
207 |
return
|
208 |
|
209 |
frames = downsample_video(video_path)
|
210 |
-
messages = [
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
inputs = processor.apply_chat_template(
|
219 |
-
messages,
|
220 |
-
tokenize=True,
|
221 |
-
add_generation_prompt=True,
|
222 |
-
return_dict=True,
|
223 |
-
return_tensors="pt",
|
224 |
-
truncation=False,
|
225 |
-
max_length=MAX_INPUT_TOKEN_LENGTH
|
226 |
-
).to(device)
|
227 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
"top_k": top_k,
|
236 |
-
"repetition_penalty": repetition_penalty,
|
237 |
-
}
|
238 |
-
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
239 |
thread.start()
|
240 |
-
|
241 |
-
for
|
242 |
-
|
243 |
-
buffer = buffer.replace("<|im_end|>", "")
|
244 |
time.sleep(0.01)
|
245 |
-
yield
|
246 |
|
247 |
-
#
|
248 |
image_examples = [
|
249 |
-
["Extract the content", "images/4.png"],
|
250 |
-
["Explain the scene", "images/3.jpg"],
|
251 |
-
["
|
252 |
-
["Perform OCR on the Image.", "images/1.jpg"],
|
253 |
-
["Extract the table content", "images/2.png"]
|
254 |
]
|
255 |
-
|
256 |
video_examples = [
|
257 |
["Explain the Ad in Detail", "videos/1.mp4"],
|
258 |
-
["Identify the main actions in the cartoon video", "videos/2.mp4"]
|
259 |
]
|
260 |
|
261 |
css = """
|
262 |
-
.submit-btn {
|
263 |
-
|
264 |
-
|
265 |
-
}
|
266 |
-
.submit-btn:hover {
|
267 |
-
background-color: #3498db !important;
|
268 |
-
}
|
269 |
-
.canvas-output {
|
270 |
-
border: 2px solid #4682B4;
|
271 |
-
border-radius: 10px;
|
272 |
-
padding: 20px;
|
273 |
-
}
|
274 |
"""
|
275 |
|
276 |
-
# Create the Gradio Interface
|
277 |
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
278 |
-
gr.Markdown("# **
|
279 |
with gr.Row():
|
280 |
with gr.Column():
|
281 |
with gr.Tabs():
|
282 |
with gr.TabItem("Image Inference"):
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
gr.Examples(
|
287 |
-
examples=image_examples,
|
288 |
-
inputs=[image_query, image_upload]
|
289 |
-
)
|
290 |
with gr.TabItem("Video Inference"):
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
gr.Examples(
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
with gr.Accordion("
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
|
323 |
-
gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
|
324 |
-
gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
|
325 |
-
|
326 |
-
image_submit.click(
|
327 |
-
fn=generate_image,
|
328 |
-
inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
329 |
-
outputs=[output, markdown_output]
|
330 |
-
)
|
331 |
-
video_submit.click(
|
332 |
-
fn=generate_video,
|
333 |
-
inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
334 |
-
outputs=[output, markdown_output]
|
335 |
-
)
|
336 |
|
337 |
if __name__ == "__main__":
|
338 |
-
demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
|
|
|
1 |
import os
|
|
|
2 |
import uuid
|
|
|
3 |
import time
|
4 |
import asyncio
|
5 |
from threading import Thread
|
|
|
20 |
)
|
21 |
from transformers.image_utils import load_image
|
22 |
|
23 |
+
# Generation limits exposed to the UI and the device every model is moved to.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _load_ocr_model(model_cls, model_id):
    """Load the processor and model for *model_id*.

    The processor is loaded first, then the model is instantiated from
    *model_cls* in bfloat16, moved to ``device`` and switched to eval mode.

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = model_cls.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    return processor, model.to(device).eval()


# Load public OCR models (all are loaded eagerly at import time).
MODEL_ID_V = "nanonets/Nanonets-OCR-s"
processor_v, model_v = _load_ocr_model(Qwen2_5_VLForConditionalGeneration, MODEL_ID_V)

MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
processor_x, model_x = _load_ocr_model(Qwen2VLForConditionalGeneration, MODEL_ID_X)

MODEL_ID_M = "reducto/RolmOCR"
processor_m, model_m = _load_ocr_model(Qwen2_5_VLForConditionalGeneration, MODEL_ID_M)

MODEL_ID_W = "prithivMLmods/Lh41-1042-Magellanic-7B-0711"
processor_w, model_w = _load_ocr_model(Qwen2_5_VLForConditionalGeneration, MODEL_ID_W)
|
56 |
|
57 |
def downsample_video(video_path, num_frames=10):
    """Sample evenly spaced frames from a video file.

    Args:
        video_path: Path of the video to open with OpenCV.
        num_frames: How many evenly spaced frame positions to request
            (defaults to 10, the value previously hard-coded).

    Returns:
        A list of ``(PIL.Image, timestamp_seconds)`` tuples, one per frame
        that could actually be read. The list is empty when the video reports
        no frames. Timestamps are ``0.0`` when the container reports no FPS.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        if total <= 0:
            # Nothing to seek to (unreadable file or unknown frame count).
            return frames
        for index in np.linspace(0, total - 1, num_frames, dtype=int):
            index = int(index)  # plain int for OpenCV and for the timestamp
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
            ok, img = vidcap.read()
            if not ok:
                continue
            # OpenCV decodes to BGR; PIL expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # Avoid dividing by a zero/invalid FPS, which would give "inf".
            timestamp = round(index / fps, 2) if fps and fps > 0 else 0.0
            frames.append((Image.fromarray(img), timestamp))
    finally:
        # Always free the capture handle, even if decoding raised.
        vidcap.release()
    return frames
|
70 |
|
71 |
@spaces.GPU
def generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """Stream a model response for one image plus a text prompt.

    Args:
        model_name: Key selecting one of the loaded (processor, model) pairs.
        text: The user prompt.
        image: The image passed to the processor; ``None`` means nothing was
            uploaded.
        max_new_tokens: Upper bound on generated tokens.
        temperature, top_p, top_k, repetition_penalty: Sampling settings
            forwarded to ``model.generate``.

    Yields:
        ``(raw_text, markdown_text)`` tuples holding the text generated so
        far; both elements are the same string. A single error tuple is
        yielded for an unknown model name or a missing image.
    """
    mapping = {
        "Nanonets-OCR-s": (processor_v, model_v),
        "Qwen2-VL-OCR-2B": (processor_x, model_x),
        "RolmOCR-7B": (processor_m, model_m),
        "Lh41-1042-Magellanic-7B-0711": (processor_w, model_w),
    }
    if model_name not in mapping:
        yield "Invalid model selected.", "Invalid model."
        return

    processor, model = mapping[model_name]
    if image is None:
        yield "Please upload an image.", ""
        return

    msg = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
    prompt = processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    # Forward every sampling control (previously only max_new_tokens was
    # used, so the other arguments had no effect). Slider values may arrive
    # as floats, so the integer-valued ones are coerced.
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_new_tokens),
        "temperature": temperature,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": repetition_penalty,
    }
    # generate() blocks, so it runs in a worker while this generator streams.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    out = ""
    for token in streamer:
        # Strip the marker from the accumulated text so it is also removed
        # when it arrives split across two streamed chunks.
        out = (out + token).replace("<|im_end|>", "")
        time.sleep(0.01)
        yield out, out
    thread.join()
|
100 |
|
101 |
@spaces.GPU
def generate_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """Stream a model response for a video plus a text prompt.

    The video is reduced to sampled frames by ``downsample_video``; each
    frame is added to the user message preceded by a ``Frame <timestamp>:``
    text entry.

    Args:
        model_name: Key selecting one of the loaded (processor, model) pairs.
        text: The user prompt.
        video_path: Path of the uploaded video; ``None`` means nothing was
            uploaded.
        max_new_tokens: Upper bound on generated tokens.
        temperature, top_p, top_k, repetition_penalty: Sampling settings
            forwarded to ``model.generate``.

    Yields:
        ``(raw_text, markdown_text)`` tuples holding the text generated so
        far; both elements are the same string. A single error tuple is
        yielded for an unknown model name, a missing video, or a video from
        which no frame could be read.
    """
    mapping = {
        "Nanonets-OCR-s": (processor_v, model_v),
        "Qwen2-VL-OCR-2B": (processor_x, model_x),
        "RolmOCR-7B": (processor_m, model_m),
        "Lh41-1042-Magellanic-7B-0711": (processor_w, model_w),
    }
    if model_name not in mapping:
        yield "Invalid model selected.", "Invalid model."
        return

    processor, model = mapping[model_name]
    if video_path is None:
        yield "Please upload a video.", ""
        return

    frames = downsample_video(video_path)
    if not frames:
        # Without frames the model would only see the text prompt.
        yield "Could not read any frames from the video.", ""
        return

    messages = [{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
                {"role": "user", "content": [{"type": "text", "text": text}]}]
    for img, ts in frames:
        messages[1]["content"].append({"type": "text", "text": f"Frame {ts}:"})
        messages[1]["content"].append({"type": "image", "image": img})

    # return_dict=True is required: without it only the input_ids tensor is
    # returned, which cannot be unpacked with ** below and would drop the
    # image tensors.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    # Slider values may arrive as floats, so the integer-valued ones are
    # coerced before being handed to generate().
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_new_tokens),
        "temperature": temperature,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": repetition_penalty,
    }
    # generate() blocks, so it runs in a worker while this generator streams.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    out = ""
    for token in streamer:
        # Strip the marker from the accumulated text so it is also removed
        # when it arrives split across two streamed chunks.
        out = (out + token).replace("<|im_end|>", "")
        time.sleep(0.01)
        yield out, out
    thread.join()
|
141 |
|
142 |
+
# Example rows offered under each tab, as [query, media path].
image_examples = [
    ["Extract the content", "images/4.png"],
    ["Explain the scene", "images/3.jpg"],
    ["Perform OCR on the image", "images/1.jpg"],
]
video_examples = [
    ["Explain the Ad in Detail", "videos/1.mp4"],
]

# Styling for the submit buttons and the output panel.
css = """
.submit-btn { background-color: #2980b9 !important; color: white !important; }
.submit-btn:hover { background-color: #3498db !important; }
.canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
"""

# NOTE(review): the nesting below is reconstructed (inputs on the left,
# output and model picker on the right) — confirm against the intended layout.
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **Multimodal OCR**")
    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
                    img_q = gr.Textbox(label="Query Input", placeholder="Enter prompt")
                    img_up = gr.Image(type="pil", label="Upload Image")
                    img_btn = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(examples=image_examples, inputs=[img_q, img_up])
                with gr.TabItem("Video Inference"):
                    vid_q = gr.Textbox(label="Query Input")
                    vid_up = gr.Video(label="Upload Video")
                    vid_btn = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(examples=video_examples, inputs=[vid_q, vid_up])
            # One labelled set of generation controls shared by both tabs,
            # instead of an anonymous set created per click handler.
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=1,
                    maximum=MAX_MAX_NEW_TOKENS,
                    step=1,
                    value=DEFAULT_MAX_NEW_TOKENS,
                )
                temperature = gr.Slider(
                    label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9
                )
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(
                    label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2
                )
        with gr.Column():
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                out_raw = gr.Textbox(interactive=False, lines=2, show_copy_button=True)
                with gr.Accordion("Formatted Output", open=False):
                    out_md = gr.Markdown()

            model_choice = gr.Radio(
                choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B", "RolmOCR-7B", "Lh41-1042-Magellanic-7B-0711"],
                label="Select Model",
                value="Nanonets-OCR-s",
            )

    # Order matches the trailing parameters of generate_image/generate_video.
    generation_inputs = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]

    img_btn.click(
        generate_image,
        inputs=[model_choice, img_q, img_up, *generation_inputs],
        outputs=[out_raw, out_md],
    )
    vid_btn.click(
        generate_video,
        inputs=[model_choice, vid_q, vid_up, *generation_inputs],
        outputs=[out_raw, out_md],
    )

if __name__ == "__main__":
    demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
|