Spaces:

prithivMLmods
/

VisionScope-R2

Running on Zero

App Files Files Community

prithivMLmods committed on Jun 6

Commit

b40230a

verified ·

1 Parent(s): 45691d2

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -102

app.py CHANGED Viewed

@@ -37,6 +37,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 MODEL_ID_M = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
 processor_m = AutoImageProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 tokenizer_m = AutoTokenizer.from_pretrained(MODEL_ID_M)
 model_m = AutoModel.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
@@ -89,35 +90,65 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_m
         tokenizer = tokenizer_m
         model = model_m
-    elif model_name == "SpaceThinker-3B":
-        processor = processor_z
-        model = model_z
-    elif model_name == "coreOCR-7B-050325-preview":
-        processor = processor_k
-        model = model_k
-    else:
-        yield "Invalid model selected."
-        return
-    if image is None:
-        yield "Please upload an image."
-        return
-    # For Llama-3.1-Nemotron-Nano-VL-8B-V1, manually construct prompt and tokenize
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
-        # Construct a simple prompt since apply_chat_template is not available
-        prompt_full = f"<|image|>{text}<|endoftext|>"
-        inputs = tokenizer(
-            prompt_full,
-            return_tensors="pt",
-            padding=True,
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
-        # Process image separately
-        image_inputs = processor(image, return_tensors="pt").to(device)
-        inputs.update(image_inputs)
-    else:
         messages = [{
             "role": "user",
             "content": [
@@ -134,21 +165,19 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             truncation=False,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
-    streamer = TextIteratorStreamer(
-        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
@@ -161,39 +190,65 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_m
         tokenizer = tokenizer_m
         model = model_m
-    elif model_name == "SpaceThinker-3B":
-        processor = processor_z
-        model = model_z
-    elif model_name == "coreOCR-7B-050325-preview":
-        processor = processor_k
-        model = model_k
-    else:
-        yield "Invalid model selected."
-        return
-    if video_path is None:
-        yield "Please upload a video."
-        return
-    frames = downsample_video(video_path)
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
-        # Construct a simple prompt for Llama-3.1-Nemotron-Nano-VL-8B-V1
-        prompt_parts = ["<|startoftext|>You are a helpful assistant.<|endoftext|>", text]
-        for frame in frames:
-            image, timestamp = frame
-            prompt_parts.append(f"Frame {timestamp}: <|image|>")
-        prompt_full = " ".join(prompt_parts) + "<|endoftext|>"
-        inputs = tokenizer(
-            prompt_full,
-            return_tensors="pt",
-            padding=True,
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
         # Process all frames
-        image_inputs = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
-        inputs.update(image_inputs)
-    else:
         messages = [
             {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
             {"role": "user", "content": [{"type": "text", "text": text}]}
@@ -208,33 +263,33 @@ def generate_video(model_name: str, text: str, video_path: str,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
             truncation=False,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
-    streamer = TextIteratorStreamer(
-        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer
 # Define examples for image and video inference
 image_examples = [
@@ -293,11 +348,11 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             model_choice = gr.Radio(
                 choices=["Llama-3.1-Nemotron-Nano-VL-8B-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview"],
                 label="Select Model",
-                value="SkyCaptioner-V1"
             )
             gr.Markdown("**Model Info**")
-            gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1):  structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
             gr.Markdown("⤷ [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
             gr.Markdown("⤷ [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
             gr.Markdown("⤷ [Imgscope-OCR-2B-0527](https://huggingface.co/prithivMLmods/Imgscope-OCR-2B-0527): fine-tuned version of qwen2-vl-2b-instruct, specifically optimized for messy handwriting recognition, document ocr, realistic handwritten ocr, and math problem solving with latex formatting.")

 MODEL_ID_M = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
 processor_m = AutoImageProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 tokenizer_m = AutoTokenizer.from_pretrained(MODEL_ID_M)
+tokenizer_m.pad_token = tokenizer_m.eos_token  # Set pad_token to resolve ValueError
 model_m = AutoModel.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
         processor = processor_m
         tokenizer = tokenizer_m
         model = model_m
+        if image is None:
+            yield "Please upload an image."
+            return
+        # Construct message with <image> token as per reference
+        if "<image>" not in text:
+            message = f"<image>\n{text}"
+        else:
+            message = text
+        # Tokenize the message
+        inputs = tokenizer(message, return_tensors="pt").to(device)
+        # Process image
+        image_features = processor(image, return_tensors="pt").to(device)
+        # Combine inputs
+        generation_inputs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            **image_features,
+        }
+        # Create streamer
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        # Generation kwargs
+        generation_kwargs = {
+            **generation_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        # Start generation in a thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
+        if model_name == "SpaceThinker-3B":
+            processor = processor_z
+            model = model_z
+        else:
+            processor = processor_k
+            model = model_k
+        if image is None:
+            yield "Please upload an image."
+            return
         messages = [{
             "role": "user",
             "content": [
             truncation=False,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    else:
+        yield "Invalid model selected."
+        return
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_m
         tokenizer = tokenizer_m
         model = model_m
+        if video_path is None:
+            yield "Please upload a video."
+            return
+        frames = downsample_video(video_path)
+        # Construct message with multiple <image> tokens
+        prompt_parts = ["<image>"] * len(frames) + [text]
+        message = " ".join(prompt_parts)
+        # Tokenize
+        inputs = tokenizer(message, return_tensors="pt").to(device)
         # Process all frames
+        image_features = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
+        # Combine inputs
+        generation_inputs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            **image_features,
+        }
+        # Create streamer
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        # Generation kwargs
+        generation_kwargs = {
+            **generation_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        # Start generation in a thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
+        if model_name == "SpaceThinker-3B":
+            processor = processor_z
+            model = model_z
+        else:
+            processor = processor_k
+            model = model_k
+        if video_path is None:
+            yield "Please upload a video."
+            return
+        frames = downsample_video(video_path)
         messages = [
             {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
             {"role": "user", "content": [{"type": "text", "text": text}]}
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
+           ilibre
             truncation=False,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    else:
+        yield "Invalid model selected."
+        return
 # Define examples for image and video inference
 image_examples = [
             model_choice = gr.Radio(
                 choices=["Llama-3.1-Nemotron-Nano-VL-8B-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview"],
                 label="Select Model",
+                value="Llama-3.1-Nemotron-Nano-VL-8B-V1"  # Updated default value to a valid choice
             )
             gr.Markdown("**Model Info**")
+            gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
             gr.Markdown("⤷ [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
             gr.Markdown("⤷ [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
             gr.Markdown("⤷ [Imgscope-OCR-2B-0527](https://huggingface.co/prithivMLmods/Imgscope-OCR-2B-0527): fine-tuned version of qwen2-vl-2b-instruct, specifically optimized for messy handwriting recognition, document ocr, realistic handwritten ocr, and math problem solving with latex formatting.")