Update app.py
app.py CHANGED
@@ -42,7 +42,7 @@ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16).to(device).eval()
 
 # Load prithivMLmods/WR30a-Deep-7B-0711
-MODEL_ID_T = "
+MODEL_ID_T = "NCSOFT/VARCO-VISION-2.0-14B"
 processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
 model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_T, trust_remote_code=True,
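For context, the loading hunk above relies on imports and a device handle defined earlier in app.py, outside the lines shown here. A minimal sketch of that assumed setup (illustrative, not the Space's verbatim code):

# Assumed setup above line 42 of app.py (an assumption, not shown in this diff)
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"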
@@ -55,17 +55,6 @@ model_o = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_O, trust_remote_code=True,
     torch_dtype=torch.float16).to(device).eval()
 
-#-----------------------------subfolder-----------------------------#
-# Load MonkeyOCR-pro-1.2B
-MODEL_ID_W = "echo840/MonkeyOCR-pro-1.2B"
-SUBFOLDER = "Recognition"
-processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True, subfolder=SUBFOLDER)
-model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_W, trust_remote_code=True,
-    subfolder=SUBFOLDER,
-    torch_dtype=torch.float16).to(device).eval()
-#-----------------------------subfolder-----------------------------#
-
 # Function to downsample video frames
 def downsample_video(video_path):
     """
@@ -110,12 +99,9 @@ def generate_image(model_name: str,
     elif model_name == "Visionary-R1-3B":
         processor = processor_o
         model = model_o
-    elif model_name == "
+    elif model_name == "Varco-Vision-2.0-14B":
         processor = processor_t
         model = model_t
-    elif model_name == "MonkeyOCR-pro-1.2B":
-        processor = processor_w
-        model = model_w
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -177,12 +163,9 @@ def generate_video(model_name: str,
     elif model_name == "Visionary-R1-3B":
         processor = processor_o
         model = model_o
-    elif model_name == "
+    elif model_name == "Varco-Vision-2.0-14B":
         processor = processor_t
         model = model_t
-    elif model_name == "MonkeyOCR-pro-1.2B":
-        processor = processor_w
-        model = model_w
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
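The two dispatch hunks above only pick a (processor, model) pair; the generation code that consumes that pair sits outside the shown lines. As a rough illustration of the Qwen2.5-VL chat-template pattern these models follow (the names image and query stand for the user's upload and prompt; this is an assumption, not the Space's actual generate_image body):

# Illustrative only: how the selected (processor, model) pair is typically driven
# for a single image prompt.
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},  # PIL image from the UI (assumed)
        {"type": "text", "text": query},    # user prompt from the UI (assumed)
    ],
}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
with torch.inference_mode():
    output_ids = model.generate(**inputs, max_new_tokens=1024)
answer = processor.batch_decode(
    output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]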
@@ -331,11 +314,11 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         #download_btn = gr.Button("Download Result.md")
 
         model_choice = gr.Radio(choices=[
-            "Camel-Doc-OCR-080125(v2)", "
-            "ViGaL-7B", "
+            "Camel-Doc-OCR-080125(v2)", "Varco-Vision-2.0-14B",
+            "ViGaL-7B", "Visionary-R1-3B"
         ],
             label="Select Model",
-            value="
+            value="Camel-Doc-OCR-080125(v2)")
 
         gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLMs-5x/discussions)")
         gr.Markdown("> [Camel-Doc-OCR-080125(v2)](https://huggingface.co/prithivMLmods/WR30a-Deep-7B-0711): the camel-doc-ocr-080125 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
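The UI hunk only updates the Radio choices and its default value; the event wiring that feeds the selection into generate_image and generate_video lies outside the shown lines. A hypothetical hookup (component and button names below are placeholders, not the Space's real ones) would look roughly like:

# Hypothetical wiring sketch: the Radio value arrives as model_name, and the two
# strings yielded by generate_image fill two output components.
image_submit_btn.click(
    fn=generate_image,
    inputs=[model_choice, image_query, image_upload],
    outputs=[raw_output_box, formatted_output_box],
)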