Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Files Community

prithivMLmods committed 22 days ago

Commit

7a60f0d

verified ·

1 Parent(s): 65f3e50

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -50

app.py CHANGED Viewed

@@ -32,11 +32,14 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# --- Original Models ---
 # Load DREX-062225-exp
 MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
     trust_remote_code=True,
@@ -45,7 +48,7 @@ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 # Load typhoon-ocr-3b
 MODEL_ID_T = "scb10x/typhoon-ocr-3b"
-processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
 model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_T,
     trust_remote_code=True,
@@ -54,7 +57,7 @@ model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 # Load olmOCR-7B-0225-preview
 MODEL_ID_O = "allenai/olmOCR-7B-0225-preview"
-processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
 model_o = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_O,
     trust_remote_code=True,
@@ -64,7 +67,7 @@ model_o = Qwen2VLForConditionalGeneration.from_pretrained(
 # Load Lumian-VLR-7B-Thinking
 MODEL_ID_J = "prithivMLmods/Lumian-VLR-7B-Thinking"
 SUBFOLDER = "think-preview"
-processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True, subfolder=SUBFOLDER)
 model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_J,
     trust_remote_code=True,
@@ -72,7 +75,7 @@ model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# --- Load New Model: openbmb/MiniCPM-V-4 ---
 MODEL_ID_V4 = 'openbmb/MiniCPM-V-4'
 model_v4 = AutoModel.from_pretrained(
     MODEL_ID_V4,
@@ -80,7 +83,16 @@ model_v4 = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     attn_implementation='sdpa'
 ).eval().to(device)
-tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True)
 def downsample_video(video_path):
@@ -119,36 +131,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
-    # Handle the new model separately due to its different API
     if model_name == "openbmb/MiniCPM-V-4":
         msgs = [{'role': 'user', 'content': [image, text]}]
         try:
             answer = model_v4.chat(
-                image=image.convert('RGB'),
-                msgs=msgs,
-                tokenizer=tokenizer_v4,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                repetition_penalty=repetition_penalty,
             )
             yield answer, answer
         except Exception as e:
             yield f"Error: {e}", f"Error: {e}"
         return
-    # Original model selection logic
-    if model_name == "DREX-062225-7B-exp":
-        processor, model = processor_x, model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor, model = processor_o, model_o
-    elif model_name == "Typhoon-OCR":
-        processor, model = processor_t, model_t
-    elif model_name == "Lumian-VLR-7B-Thinking":
-        processor, model = processor_j, model_j
-    else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -185,46 +186,38 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Could not process video.", "Could not process video."
         return
-    # Handle the new model separately
     if model_name == "openbmb/MiniCPM-V-4":
         images = [frame for frame, ts in frames_with_ts]
         content = [text] + images
         msgs = [{'role': 'user', 'content': content}]
         try:
             answer = model_v4.chat(
-                image=images[0].convert('RGB'),
-                msgs=msgs,
-                tokenizer=tokenizer_v4,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                repetition_penalty=repetition_penalty,
             )
             yield answer, answer
         except Exception as e:
             yield f"Error: {e}", f"Error: {e}"
         return
-    # Original model selection logic
-    if model_name == "DREX-062225-7B-exp":
-        processor, model = processor_x, model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor, model = processor_o, model_o
-    elif model_name == "Typhoon-OCR":
-        processor, model = processor_t, model_t
-    elif model_name == "Lumian-VLR-7B-Thinking":
-        processor, model = processor_j, model_j
-    else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     # Prepare messages for Qwen-style models
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     for frame, timestamp in frames_with_ts:
         messages[0]["content"].append({"type": "image", "image": frame})
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    images_for_processor = [frame for frame, ts in frames_with_ts]
     inputs = processor(
         text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
@@ -260,7 +253,6 @@ video_examples = [
     ["Explain the ad in detail.", "videos/1.mp4"]
 ]
-# Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn { background-color: #2980b9 !important; color: white !important; }
 .submit-btn:hover { background-color: #3498db !important; }
@@ -298,14 +290,16 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")
             model_choice = gr.Radio(
-                choices=[ "openbmb/MiniCPM-V-4", "Lumian-VLR-7B-Thinking", "Typhoon-OCR", "DREX-062225-7B-exp", "olmOCR-7B-0225-preview"],
                 label="Select Model",
                 value="openbmb/MiniCPM-V-4"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
-            gr.Markdown("> MiniCPM-V 4.0 is the latest efficient model in the MiniCPM-V series. The model is built based on SigLIP2-400M and MiniCPM4-3B with a total of 4.1B parameters. It inherits the strong single-image, multi-image and video understanding performance of MiniCPM-V 2.6 with largely improved efficiency. Lumian-VLR-7B-Thinking is a high-fidelity vision-language reasoning model built on Qwen2.5-VL-7B-Instruct, designed for fine-grained multimodal understanding, video reasoning, and document comprehension through explicit grounded reasoning.")
-            gr.Markdown("> olmOCR-7B-0225-preview is a 7B parameter open large model designed for OCR tasks with robust text extraction, especially in complex document layouts. Typhoon-ocr-3b is a 3B parameter OCR model optimized for efficient and accurate optical character recognition in challenging conditions.")
-            gr.Markdown("> DREX-062225-exp is an experimental multimodal model emphasizing strong document reading and extraction capabilities combined with vision-language understanding to support detailed document parsing and reasoning tasks.")
             gr.Markdown("> ⚠️ Note: Video inference performance can vary significantly between models.")
     image_submit.click(
@@ -320,4 +314,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# --- Model Loading ---
+# To address the warnings, we add `use_fast=False` to ensure we use the
+# processor version the model was originally saved with.
 # Load DREX-062225-exp
 MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
 model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
     trust_remote_code=True,
 # Load typhoon-ocr-3b
 MODEL_ID_T = "scb10x/typhoon-ocr-3b"
+processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True, use_fast=False)
 model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_T,
     trust_remote_code=True,
 # Load olmOCR-7B-0225-preview
 MODEL_ID_O = "allenai/olmOCR-7B-0225-preview"
+processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True, use_fast=False)
 model_o = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_O,
     trust_remote_code=True,
 # Load Lumian-VLR-7B-Thinking
 MODEL_ID_J = "prithivMLmods/Lumian-VLR-7B-Thinking"
 SUBFOLDER = "think-preview"
+processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True, subfolder=SUBFOLDER, use_fast=False)
 model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_J,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load openbmb/MiniCPM-V-4
 MODEL_ID_V4 = 'openbmb/MiniCPM-V-4'
 model_v4 = AutoModel.from_pretrained(
     MODEL_ID_V4,
     torch_dtype=torch.bfloat16,
     attn_implementation='sdpa'
 ).eval().to(device)
+tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True, use_fast=False)
+# --- Refactored Model Dictionary ---
+# This simplifies model selection in the generation functions.
+MODELS = {
+    "DREX-062225-7B-exp": (processor_x, model_x),
+    "Typhoon-OCR-3B": (processor_t, model_t),
+    "olmOCR-7B-0225-preview": (processor_o, model_o),
+    "Lumian-VLR-7B-Thinking": (processor_j, model_j),
+}
 def downsample_video(video_path):
         yield "Please upload an image.", "Please upload an image."
         return
+    # Handle MiniCPM-V-4 separately due to its different API
     if model_name == "openbmb/MiniCPM-V-4":
         msgs = [{'role': 'user', 'content': [image, text]}]
         try:
             answer = model_v4.chat(
+                image=image.convert('RGB'), msgs=msgs, tokenizer=tokenizer_v4,
+                max_new_tokens=max_new_tokens, temperature=temperature,
+                top_p=top_p, repetition_penalty=repetition_penalty,
             )
             yield answer, answer
         except Exception as e:
             yield f"Error: {e}", f"Error: {e}"
         return
+    # Use the dictionary for other models
+    if model_name not in MODELS:
         yield "Invalid model selected.", "Invalid model selected."
         return
+    processor, model = MODELS[model_name]
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         yield "Could not process video.", "Could not process video."
         return
+    # Handle MiniCPM-V-4 separately
     if model_name == "openbmb/MiniCPM-V-4":
         images = [frame for frame, ts in frames_with_ts]
+        # For video, the prompt includes the text and then all the image frames
         content = [text] + images
         msgs = [{'role': 'user', 'content': content}]
         try:
+            # The .chat API still takes a single image argument, typically the first frame
             answer = model_v4.chat(
+                image=images[0].convert('RGB'), msgs=msgs, tokenizer=tokenizer_v4,
+                max_new_tokens=max_new_tokens, temperature=temperature,
+                top_p=top_p, repetition_penalty=repetition_penalty,
             )
             yield answer, answer
         except Exception as e:
             yield f"Error: {e}", f"Error: {e}"
         return
+    # Use the dictionary for other models
+    if model_name not in MODELS:
         yield "Invalid model selected.", "Invalid model selected."
         return
+    processor, model = MODELS[model_name]
     # Prepare messages for Qwen-style models
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+    images_for_processor = []
     for frame, timestamp in frames_with_ts:
         messages[0]["content"].append({"type": "image", "image": frame})
+        images_for_processor.append(frame)
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
     ["Explain the ad in detail.", "videos/1.mp4"]
 ]
 css = """
 .submit-btn { background-color: #2980b9 !important; color: white !important; }
 .submit-btn:hover { background-color: #3498db !important; }
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")
             model_choice = gr.Radio(
+                choices=["openbmb/MiniCPM-V-4", "Lumian-VLR-7B-Thinking", "Typhoon-OCR-3B", "DREX-062225-7B-exp", "olmOCR-7B-0225-preview"],
                 label="Select Model",
                 value="openbmb/MiniCPM-V-4"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
+            gr.Markdown("> **MiniCPM-V 4.0** is an efficient open-source multimodal model with strong performance in single/multi-image and video understanding, inheriting and improving upon the MiniCPM-V series.")
+            gr.Markdown("> **Lumian-VLR-7B-Thinking** is a high-fidelity vision-language reasoning model for fine-grained multimodal understanding, video reasoning, and document comprehension.")
+            gr.Markdown("> **olmOCR-7B-0225-preview** is a 7B parameter model designed for robust text extraction in complex OCR tasks.")
+            gr.Markdown("> **Typhoon-OCR-3B** is a 3B parameter OCR model optimized for efficient and accurate character recognition.")
+            gr.Markdown("> **DREX-062225-exp** is an experimental model emphasizing strong document reading, extraction, and vision-language understanding.")
             gr.Markdown("> ⚠️ Note: Video inference performance can vary significantly between models.")
     image_submit.click(
     )
 if __name__ == "__main__":
+    demo.queue(max_size=50).launch(share=True, show_error=True)