Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 11

Commit

b73b04e

verified ·

1 Parent(s): 5f5851f

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -129

app.py CHANGED Viewed

@@ -15,12 +15,9 @@ import cv2
 import requests
 from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoModel,
-    AutoTokenizer,
 )
 from transformers.image_utils import load_image
@@ -48,65 +45,15 @@ print("Using device:", device)
 # To address the warnings, we add `use_fast=False` to ensure we use the
 # processor version the model was originally saved with.
-# Load DREX-062225-exp
-MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
-model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load typhoon-ocr-3b
-MODEL_ID_T = "scb10x/typhoon-ocr-3b"
-processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True, use_fast=False)
-model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_T,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Load olmOCR-7B-0225-preview
-MODEL_ID_O = "allenai/olmOCR-7B-0225-preview"
-processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True, use_fast=False)
-model_o = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_O,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Load Lumian-VLR-7B-Thinking
-MODEL_ID_J = "prithivMLmods/Lumian-VLR-7B-Thinking"
-SUBFOLDER = "think-preview"
-processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True, subfolder=SUBFOLDER, use_fast=False)
-model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_J,
-    trust_remote_code=True,
-    subfolder=SUBFOLDER,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Load openbmb/MiniCPM-V-4
-MODEL_ID_V4 = 'openbmb/MiniCPM-V-4'
-model_v4 = AutoModel.from_pretrained(
-    MODEL_ID_V4,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-    # Using 'sdpa' can sometimes cause issues in certain environments,
-    # letting transformers choose the default is safer.
-    # attn_implementation='sdpa'
-).eval().to(device)
-tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True, use_fast=False)
-# --- Refactored Model Dictionary ---
-# This simplifies model selection in the generation functions.
-MODELS = {
-    "DREX-062225-7B-exp": (processor_x, model_x),
-    "Typhoon-OCR-3B": (processor_t, model_t),
-    "olmOCR-7B-0225-preview": (processor_o, model_o),
-    "Lumian-VLR-7B-Thinking": (processor_j, model_j),
-}
 def downsample_video(video_path):
     """
@@ -131,48 +78,28 @@ def downsample_video(video_path):
     return frames
 @spaces.GPU
-def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the selected model for image input.
     """
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return
-    # Handle MiniCPM-V-4 separately due to its different API
-    if model_name == "openbmb/MiniCPM-V-4":
-        msgs = [{'role': 'user', 'content': [image, text]}]
-        try:
-            answer = model_v4.chat(
-                image=image.convert('RGB'), msgs=msgs, tokenizer=tokenizer_v4,
-                max_new_tokens=max_new_tokens, temperature=temperature,
-                top_p=top_p, repetition_penalty=repetition_penalty,
-            )
-            yield answer, answer
-        except Exception as e:
-            yield f"Error: {e}", f"Error: {e}"
-        return
-    # Use the dictionary for other models
-    if model_name not in MODELS:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-    processor, model = MODELS[model_name]
-    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
         text=[prompt_full], images=[image], return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
@@ -181,14 +108,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield buffer, buffer
 @spaces.GPU
-def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the selected model for video input.
     """
     if video_path is None:
         yield "Please upload a video.", "Please upload a video."
@@ -199,49 +126,24 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Could not process video.", "Could not process video."
         return
-    # Handle MiniCPM-V-4 separately
-    if model_name == "openbmb/MiniCPM-V-4":
-        images = [frame for frame, ts in frames_with_ts]
-        # For video, the prompt includes the text and then all the image frames
-        content = [text] + images
-        msgs = [{'role': 'user', 'content': content}]
-        try:
-            # The .chat API still takes a single image argument, typically the first frame
-            answer = model_v4.chat(
-                image=images[0].convert('RGB'), msgs=msgs, tokenizer=tokenizer_v4,
-                max_new_tokens=max_new_tokens, temperature=temperature,
-                top_p=top_p, repetition_penalty=repetition_penalty,
-            )
-            yield answer, answer
-        except Exception as e:
-            yield f"Error: {e}", f"Error: {e}"
-        return
-    # Use the dictionary for other models
-    if model_name not in MODELS:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-    processor, model = MODELS[model_name]
-    # Prepare messages for Qwen-style models
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
     for frame, timestamp in frames_with_ts:
-        messages[0]["content"].append({"type": "image", "image": frame})
         images_for_processor.append(frame)
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
         text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
         "do_sample": True, "temperature": temperature, "top_p": top_p,
         "top_k": top_k, "repetition_penalty": repetition_penalty,
     }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
@@ -302,25 +204,18 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")
-            model_choice = gr.Radio(
-                choices=["Lumian-VLR-7B-Thinking", "openbmb/MiniCPM-V-4", "Typhoon-OCR-3B", "DREX-062225-7B-exp", "olmOCR-7B-0225-preview"],
-                label="Select Model",
-                value="Lumian-VLR-7B-Thinking"
-            )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
-            gr.Markdown("> [MiniCPM-V 4.0](https://huggingface.co/openbmb/MiniCPM-V-4) is the latest efficient model in the MiniCPM-V series. The model is built based on SigLIP2-400M and MiniCPM4-3B with a total of 4.1B parameters. It inherits the strong single-image, multi-image and video understanding performance of MiniCPM-V 2.6 with largely improved efficiency. [Lumian-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian-VLR-7B-Thinking) is a high-fidelity vision-language reasoning model built on Qwen2.5-VL-7B-Instruct, designed for fine-grained multimodal understanding, video reasoning, and document comprehension through explicit grounded reasoning.")
-            gr.Markdown("> [olmOCR-7B-0225-preview](https://huggingface.co/allenai/olmOCR-7B-0225-preview) is a 7B parameter open large model designed for OCR tasks with robust text extraction, especially in complex document layouts. [Typhoon-ocr-3b](https://huggingface.co/scb10x/typhoon-ocr-3b) is a 3B parameter OCR model optimized for efficient and accurate optical character recognition in challenging conditions.")
-            gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp) is an experimental multimodal model emphasizing strong document reading and extraction capabilities combined with vision-language understanding to support detailed document parsing and reasoning tasks.")
-            gr.Markdown("> ⚠️ Note: Video inference performance can vary significantly between models.")
     image_submit.click(
         fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[output, markdown_output]
     )
     video_submit.click(
         fn=generate_video,
-        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[output, markdown_output]
     )

 import requests
 from transformers import (
+    Qwen3VLMoeForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 # To address the warnings, we add `use_fast=False` to ensure we use the
 # processor version the model was originally saved with.
+# Load Qwen3VL
+MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
+model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+    MODEL_ID_Q3VL,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 def downsample_video(video_path):
     """
     return frames
 @spaces.GPU
+def generate_image(text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
+    Generates responses using the Qwen3-VL model for image input.
     """
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor_q3vl(
         text=[prompt_full], images=[image], return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+    streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         yield buffer, buffer
 @spaces.GPU
+def generate_video(text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
+    Generates responses using the Qwen3-VL model for video input.
     """
     if video_path is None:
         yield "Please upload a video.", "Please upload a video."
         yield "Could not process video.", "Could not process video."
         return
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
     for frame, timestamp in frames_with_ts:
+        messages[0]["content"].append({"type": "image"})
         images_for_processor.append(frame)
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor_q3vl(
         text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+    streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
         "do_sample": True, "temperature": temperature, "top_p": top_p,
         "top_k": top_k, "repetition_penalty": repetition_penalty,
     }
+    thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
+            gr.Markdown("> [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) is a powerful, versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
+            gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
     image_submit.click(
         fn=generate_image,
+        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[output, markdown_output]
     )
     video_submit.click(
         fn=generate_video,
+        inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[output, markdown_output]
     )