add magma arena
- app.py +51 -63
- app_1p.py +213 -0
- assets/images/magma_game_thin.png +0 -0
- requirements.txt +3 -1
- vlms/__pycache__/llavanext.cpython-310.pyc +0 -0
- vlms/__pycache__/llavaov.cpython-310.pyc +0 -0
- vlms/__pycache__/magma.cpython-310.pyc +0 -0
- vlms/__pycache__/qwen25vl.cpython-310.pyc +0 -0
- vlms/__pycache__/qwen2vl.cpython-310.pyc +0 -0
- vlms/llavanext.py +43 -0
- vlms/llavaov.py +44 -0
- vlms/magma.py +40 -0
- vlms/qwen2vl.py +59 -0
app.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-# add a command for installing flash-attn
 os.system('pip install flash-attn --no-build-isolation')
 os.system("pip install gradio==4.44.1")

@@ -12,12 +11,15 @@ from PIL import Image
 from transformers import AutoModelForCausalLM, AutoProcessor
 import re
 import random
+from vlms.magma import MagmaAgent
+from vlms.llavaov import LLaVAOVAgent
+from vlms.qwen2vl import Qwen2VLAgent

 pygame.mixer.quit()  # Disable sound

 # Constants
-WIDTH, HEIGHT =
-GRID_SIZE =
+WIDTH, HEIGHT = 800, 800
+GRID_SIZE = 80
 WHITE = (255, 255, 255)
 GREEN = (34, 139, 34)  # Forest green - more like an apple
 RED = (200, 50, 50)
@@ -34,29 +36,24 @@ STATIC = (0, 0)

 ACTIONS = ["up", "down", "left", "right", "static"]

-# Load AI Model
 dtype = torch.bfloat16
-
-
-magma_processor = AutoProcessor.from_pretrained(magma_model_id, trust_remote_code=True)
-magam_model.to("cuda")
+agent_1 = MagmaAgent("cuda:0", dtype)
+agent_2 = Qwen2VLAgent("cuda:0", dtype)

-# Load magma image
 magma_img = pygame.image.load("./assets/images/magma_game_thin.png")
 magma_img = pygame.transform.scale(magma_img, (GRID_SIZE, GRID_SIZE))

-target_img = pygame.image.load("./assets/images/apple.png")
-target_img = pygame.transform.scale(target_img, (GRID_SIZE, GRID_SIZE))
-
 class MagmaFindGPU:
     def __init__(self):
         self.reset()
-
+        self.step_count = 0
+
     def reset(self):
         self.snake = [(5, 5)]
         self.direction = RIGHT
         self.score = 0
         self.game_over = False
+        self.step_count = 0
         self.place_target()

     def place_target(self):
@@ -79,16 +76,18 @@ class MagmaFindGPU:
         elif action == "static":
             self.direction = STATIC

-        if self.game_over:
-
+        # if self.game_over:
+        #     self.reset()
+        #     return self.render(), self.score

         new_head = (self.snake[0][0] + self.direction[0], self.snake[0][1] + self.direction[1])
-
+
         if new_head[0] < 0 or new_head[1] < 0 or new_head[0] >= WIDTH // GRID_SIZE or new_head[1] >= HEIGHT // GRID_SIZE:
-            self.game_over = True
+            # self.game_over = True
             return self.render(), self.score

         self.snake = [new_head]  # Keep only the head (single block snake)
+        self.step_count += 1

         # Check if the target is covered by four surrounding squares
         head_x, head_y = self.snake[0]
@@ -99,7 +98,7 @@ class MagmaFindGPU:
             self.place_target()

         return self.render(), self.score
-
+
     def render(self):
         pygame.init()
         surface = pygame.Surface((WIDTH, HEIGHT))
@@ -109,10 +108,8 @@ class MagmaFindGPU:
         surface.blit(magma_img, (head_x * GRID_SIZE, head_y * GRID_SIZE))

         # pygame.draw.rect(surface, RED, (self.snake[0][0] * GRID_SIZE, self.snake[0][1] * GRID_SIZE, GRID_SIZE, GRID_SIZE))
-
-
-        surface.blit(target_img, (self.target[0] * GRID_SIZE, self.target[1] * GRID_SIZE))
-
+        pygame.draw.rect(surface, GREEN, (self.target[0] * GRID_SIZE, self.target[1] * GRID_SIZE, GRID_SIZE, GRID_SIZE))
+
         # Draw four surrounding squares with labels
         head_x, head_y = self.snake[0]
         neighbors = [(head_x, head_y - 1), (head_x, head_y + 1), (head_x - 1, head_y), (head_x + 1, head_y)]
@@ -135,41 +132,20 @@ class MagmaFindGPU:
     def get_state(self):
         return self.render()

-
+game_1 = MagmaFindGPU()
+game_2 = MagmaFindGPU()

-def play_game():
+def play_game(game, agent):
     state, state_som = game.get_state()
     pil_img = Image.fromarray(state_som)
-
-        {"role": "system", "content": "You are an agent that can see, talk, and act."},
-        {"role": "user", "content": "<image_start><image><image_end>\nWhich mark is closer to green apple? Answer with a single number."},
-    ]
-    prompt = magma_processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
-    inputs = magma_processor(images=[pil_img], texts=prompt, return_tensors="pt")
-    inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
-    inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
-    inputs = inputs.to("cuda").to(dtype)
-    generation_args = {
-        "max_new_tokens": 10,
-        "temperature": 0,
-        "do_sample": False,
-        "use_cache": True,
-        "num_beams": 1,
-    }
-    with torch.inference_mode():
-        generate_ids = magam_model.generate(**inputs, **generation_args)
-    generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
-    action = magma_processor.decode(generate_ids[0], skip_special_tokens=True).strip()
+    action = agent.generate_response(pil_img, "Which mark is closer to green block? Answer with a single number.")
     # extract mark id fro action use re
+    # print(agent.__class__.__name__, action)
     match = re.search(r'\d+', action)
     if match:
         action = match.group(0)
         if action.isdigit() and 1 <= int(action) <= 4:
-
-            if random.random() < 0.1:
-                action = random.choice(ACTIONS[:-1])
-            else:
-                action = ACTIONS[int(action) - 1]
+            action = ACTIONS[int(action) - 1]
         else:
             # random choose one from the pool
             action = random.choice(ACTIONS[:-1])
@@ -177,34 +153,46 @@ def play_game():
         action = random.choice(ACTIONS[:-1])

     img, score = game.step(action)
-
-
+    return img[0], f"Score: {score}"
+
+def play_game_1():
+    return play_game(game_1, agent_1)

-def
-
-    return game.render()[0], "Score: 0"
+def play_game_2():
+    return play_game(game_2, agent_2)

+def reset_games():
+    game_1.reset()
+    game_2.reset()
+    return game_1.render()[0], "Score: 0", game_2.render()[0], "Score: 0"
 MARKDOWN = """
 <div align="center">
 <h2>Magma: A Foundation Model for Multimodal AI Agents</h2>

 \[[arXiv Paper](https://www.arxiv.org/pdf/2502.13130)\] \[[Project Page](https://microsoft.github.io/Magma/)\] \[[Github Repo](https://github.com/microsoft/Magma)\] \[[Hugging Face Model](https://huggingface.co/microsoft/Magma-8B)\]

-
+<h3>Magma Arena: A battle between two agents to collect the green blocks by automatically moving up, down, left and right.</h3>

 This demo is powered by [Gradio](https://gradio.app/).
+
 </div>
 """

 with gr.Blocks() as interface:
     gr.Markdown(MARKDOWN)
     with gr.Row():
-
-
-
-
-
-
-
+        with gr.Column():
+            img_output_1 = gr.Image(label="{}".format(agent_1.__class__.__name__))
+            score_output_1 = gr.Text(label="Score 1")
+        with gr.Column():
+            img_output_2 = gr.Image(label="{}".format(agent_2.__class__.__name__))
+            score_output_2 = gr.Text(label="Score 2")
+
+    start_btn = gr.Button("Start/Reset Game")
+
+    interface.load(fn=play_game_1, every=1, inputs=[], outputs=[img_output_1, score_output_1])
+    interface.load(fn=play_game_2, every=1, inputs=[], outputs=[img_output_2, score_output_2])
+
+    start_btn.click(fn=reset_games, inputs=[], outputs=[img_output_1, score_output_1, img_output_2, score_output_2])

-interface.launch()
+interface.launch(server_port=7861)
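The arena in app.py runs two independent MagmaFindGPU boards, one per agent, and polls play_game_1/play_game_2 once per second via interface.load(..., every=1). LLaVAOVAgent is imported but not wired in; the hypothetical snippet below (not part of the commit) sketches how a different wrapper could take the second lane, since the loop only relies on generate_response(image, question):

# Hypothetical variant, not part of this commit: swap the second player for the
# already-imported LLaVAOVAgent. Any wrapper exposing generate_response(image, question)
# fits the same slot.
agent_2 = LLaVAOVAgent("cuda:0", dtype)   # instead of Qwen2VLAgent("cuda:0", dtype)
game_2 = MagmaFindGPU()

def play_game_2():
    # polled once per second by interface.load(fn=play_game_2, every=1, ...)
    return play_game(game_2, agent_2)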
app_1p.py
ADDED
@@ -0,0 +1,213 @@
+import os
+# add a command for installing flash-attn
+os.system('pip install flash-attn --no-build-isolation')
+os.system("pip install gradio==4.44.1")
+
+import pygame
+import numpy as np
+import gradio as gr
+import time
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+import re
+import random
+
+pygame.mixer.quit()  # Disable sound
+
+# Constants
+WIDTH, HEIGHT = 800, 800
+GRID_SIZE = 80
+WHITE = (255, 255, 255)
+GREEN = (34, 139, 34)  # Forest green - more like an apple
+RED = (200, 50, 50)
+BLACK = (0, 0, 0)
+GRAY = (128, 128, 128)
+YELLOW = (218, 165, 32)  # Golden yellow color
+
+# Directions
+UP = (0, -1)
+DOWN = (0, 1)
+LEFT = (-1, 0)
+RIGHT = (1, 0)
+STATIC = (0, 0)
+
+ACTIONS = ["up", "down", "left", "right", "static"]
+
+# Load AI Model
+dtype = torch.bfloat16
+magma_model_id = "microsoft/Magma-8B"
+magam_model = AutoModelForCausalLM.from_pretrained(magma_model_id, trust_remote_code=True, torch_dtype=dtype)
+magma_processor = AutoProcessor.from_pretrained(magma_model_id, trust_remote_code=True)
+magam_model.to("cuda")
+
+magma_img = pygame.image.load("./assets/images/magma_game_thin.png")
+magma_img = pygame.transform.scale(magma_img, (GRID_SIZE, GRID_SIZE))
+
+class MagmaFindGPU:
+    def __init__(self):
+        self.reset()
+        self.step_count = 0
+
+    def reset(self):
+        self.snake = [(5, 5)]
+        self.direction = RIGHT
+        self.score = 0
+        self.game_over = False
+        self.step_count = 0
+        self.place_target()
+
+    def place_target(self):
+        while True:
+            target_x = np.random.randint(1, WIDTH // GRID_SIZE - 1)
+            target_y = np.random.randint(1, HEIGHT // GRID_SIZE - 1)
+            if (target_x, target_y) not in self.snake:
+                self.target = (target_x, target_y)
+                break
+
+    def step(self, action):
+        if action == "up":
+            self.direction = UP
+        elif action == "down":
+            self.direction = DOWN
+        elif action == "left":
+            self.direction = LEFT
+        elif action == "right":
+            self.direction = RIGHT
+        elif action == "static":
+            self.direction = STATIC
+
+        if self.game_over:
+            self.reset()
+            return self.render(), self.score
+
+        new_head = (self.snake[0][0] + self.direction[0], self.snake[0][1] + self.direction[1])
+
+        if new_head[0] < 0 or new_head[1] < 0 or new_head[0] >= WIDTH // GRID_SIZE or new_head[1] >= HEIGHT // GRID_SIZE:
+            self.game_over = True
+            return self.render(), self.score
+
+        self.snake = [new_head]  # Keep only the head (single block snake)
+        self.step_count += 1
+
+        # Check if the target is covered by four surrounding squares
+        head_x, head_y = self.snake[0]
+        neighbors = set([(head_x, head_y - 1), (head_x, head_y + 1), (head_x - 1, head_y), (head_x + 1, head_y)])
+
+        if neighbors.issuperset(set([self.target])):
+            self.score += 1
+            self.place_target()
+
+        return self.render(), self.score
+
+    def render(self):
+        pygame.init()
+        surface = pygame.Surface((WIDTH, HEIGHT))
+        surface.fill(BLACK)
+
+        head_x, head_y = self.snake[0]
+        surface.blit(magma_img, (head_x * GRID_SIZE, head_y * GRID_SIZE))
+
+        # pygame.draw.rect(surface, RED, (self.snake[0][0] * GRID_SIZE, self.snake[0][1] * GRID_SIZE, GRID_SIZE, GRID_SIZE))
+        pygame.draw.rect(surface, GREEN, (self.target[0] * GRID_SIZE, self.target[1] * GRID_SIZE, GRID_SIZE, GRID_SIZE))
+
+        # Draw four surrounding squares with labels
+        head_x, head_y = self.snake[0]
+        neighbors = [(head_x, head_y - 1), (head_x, head_y + 1), (head_x - 1, head_y), (head_x + 1, head_y)]
+        labels = ["1", "2", "3", "4"]
+        font = pygame.font.Font(None, 48)
+
+        # clone surface
+        surface_nomark = surface.copy()
+        for i, (nx, ny) in enumerate(neighbors):
+            if 0 <= nx < WIDTH // GRID_SIZE and 0 <= ny < HEIGHT // GRID_SIZE:
+                pygame.draw.rect(surface, RED, (nx * GRID_SIZE, ny * GRID_SIZE, GRID_SIZE, GRID_SIZE), GRID_SIZE)
+                # pygame.draw.rect(surface_nomark, RED, (nx * GRID_SIZE, ny * GRID_SIZE, GRID_SIZE, GRID_SIZE), GRID_SIZE)
+
+                text = font.render(labels[i], True, WHITE)
+                text_rect = text.get_rect(center=(nx * GRID_SIZE + GRID_SIZE // 2, ny * GRID_SIZE + GRID_SIZE // 2))
+                surface.blit(text, text_rect)
+
+        return np.array(pygame.surfarray.array3d(surface_nomark)).swapaxes(0, 1), np.array(pygame.surfarray.array3d(surface)).swapaxes(0, 1)
+
+    def get_state(self):
+        return self.render()
+
+game = MagmaFindGPU()
+
+def play_game():
+    state, state_som = game.get_state()
+    pil_img = Image.fromarray(state_som)
+    convs = [
+        {"role": "system", "content": "You are an agent that can see, talk, and act. Avoid hitting the wall."},
+        {"role": "user", "content": "<image_start><image><image_end>\nWhich mark is closer to green block? Answer with a single number."},
+    ]
+    prompt = magma_processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
+    inputs = magma_processor(images=[pil_img], texts=prompt, return_tensors="pt")
+    inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
+    inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
+    inputs = inputs.to("cuda").to(dtype)
+    generation_args = {
+        "max_new_tokens": 10,
+        "temperature": 0.3,
+        "do_sample": True,
+        "use_cache": True,
+        "num_beams": 1,
+    }
+    with torch.inference_mode():
+        generate_ids = magam_model.generate(**inputs, **generation_args)
+    generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
+    action = magma_processor.decode(generate_ids[0], skip_special_tokens=True).strip()
+    # extract mark id fro action use re
+    match = re.search(r'\d+', action)
+    if match:
+        action = match.group(0)
+        if action.isdigit() and 1 <= int(action) <= 4:
+            action = ACTIONS[int(action) - 1]
+        else:
+            # random choose one from the pool
+            action = random.choice(ACTIONS[:-1])
+    else:
+        action = random.choice(ACTIONS[:-1])
+
+    img, score = game.step(action)
+    img = img[0]
+    return img, f"Score: {score}"
+
+def reset_game():
+    game.reset()
+    return game.render()[0], "Score: 0"
+
+MARKDOWN = """
+<div align="center">
+<img src="./assets/images/logo.png" alt="Magma Logo" style="margin-right: 5px; height: 80px;margin-top: -10px;">
+<h2>Magma: A Foundation Model for Multimodal AI Agents</h2>
+
+\[[arXiv Paper](https://www.arxiv.org/pdf/2502.13130)\] \[[Project Page](https://microsoft.github.io/Magma/)\] \[[Github Repo](https://github.com/microsoft/Magma)\] \[[Hugging Face Model](https://huggingface.co/microsoft/Magma-8B)\]
+
+This demo is powered by [Gradio](https://gradio.app/).
+
+<b>Goal: Collects the green blocks by automatically moving up, down, left and right.</b>
+
+</div>
+"""
+
+with gr.Blocks() as interface:
+    gr.Markdown(MARKDOWN)
+    with gr.Row():
+        image_output = gr.Image(label="Game Screen")
+        with gr.Column():
+            score_output = gr.Text(label="Score", elem_classes="large-text")
+            gr.HTML("""
+            <style>
+            .large-text textarea {
+                font-size: 24px !important;
+            }
+            </style>
+            """)
+    start_btn = gr.Button("Start/Reset Game")
+
+    interface.load(fn=play_game, every=1, inputs=[], outputs=[image_output, score_output])
+    start_btn.click(fn=reset_game, inputs=[], outputs=[image_output, score_output])
+
+interface.launch()
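app_1p.py keeps the original single-player demo: play_game() renders the marked board, asks the Magma model which numbered mark is closest to the green block, and steps the game with that move. A minimal headless sketch, assuming it replaces the Gradio wiring at the bottom of app_1p.py, would drive the same loop from the command line:

# Headless smoke test (assumption: this replaces the gr.Blocks section above,
# so no UI is launched). Each iteration makes one model call and one game step.
if __name__ == "__main__":
    frame, score_text = reset_game()
    for _ in range(20):
        frame, score_text = play_game()
        print(score_text)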
assets/images/magma_game_thin.png
CHANGED
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
 torch==2.3.1
 torchvision==0.18.1
 pytorch-lightning>=1.0.8
-transformers @ git+https://github.com/jwyang/transformers.git@dev/jwyang-v4.
+transformers @ git+https://github.com/jwyang/transformers.git@dev/jwyang-v4.48.2
 tokenizers>=0.15.0
 sentencepiece==0.1.99
 shortuuid
@@ -35,3 +35,5 @@ open_clip_torch
 supervision==0.18.0
 ultralytics==8.3.78
 pygame
+pyautogui
+qwen-vl-utils
vlms/__pycache__/llavanext.cpython-310.pyc
ADDED
Binary file (1.79 kB)
vlms/__pycache__/llavaov.cpython-310.pyc
ADDED
Binary file (1.81 kB)
vlms/__pycache__/magma.cpython-310.pyc
ADDED
Binary file (1.93 kB)
vlms/__pycache__/qwen25vl.cpython-310.pyc
ADDED
Binary file (2.05 kB)
vlms/__pycache__/qwen2vl.cpython-310.pyc
ADDED
Binary file (2.01 kB)
vlms/llavanext.py
ADDED
@@ -0,0 +1,43 @@
+from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+import torch
+import torch.nn as nn
+from PIL import Image
+import requests
+
+class LLaVANextAgent(nn.Module):
+    def __init__(self, device="cuda", dtype=torch.float16):
+        super().__init__()
+
+        self.processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+        self.model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=dtype, low_cpu_mem_usage=True)
+        self.dtype = dtype
+        self.device = device
+
+        self.model.to(device)
+
+        self.generation_args = {
+            "max_new_tokens": 10,
+            "temperature": 0.3,
+            "do_sample": True,
+            "use_cache": True,
+            "num_beams": 1,
+        }
+
+    def generate_response(self, image, question):
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": question},
+                    {"type": "image"},
+                ],
+            },
+        ]
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
+        # autoregressively complete prompt
+        self.model.generation_config.pad_token_id = self.processor.tokenizer.pad_token_id
+        with torch.inference_mode():
+            output = self.model.generate(**inputs, **self.generation_args)
+        output = output[:, inputs["input_ids"].shape[-1] :]
+        return self.processor.decode(output[0], skip_special_tokens=True).strip()
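All wrappers under vlms/ expose the same two-argument entry point, which is the only contract app.py depends on. A hypothetical Protocol (the name VLMAgent is not in the repo) makes that shared interface explicit:

# Hypothetical interface sketch; the repo does not define this Protocol, but
# MagmaAgent, LLaVANextAgent, LLaVAOVAgent and Qwen2VLAgent all satisfy it.
from typing import Protocol
from PIL import Image

class VLMAgent(Protocol):
    def generate_response(self, image: Image.Image, question: str) -> str:
        ...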
vlms/llavaov.py
ADDED
@@ -0,0 +1,44 @@
+from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
+import torch
+import torch.nn as nn
+from PIL import Image
+import requests
+
+class LLaVAOVAgent(nn.Module):
+    model_id = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+    def __init__(self, device="cuda", dtype=torch.float16):
+        super().__init__()
+
+        self.processor = AutoProcessor.from_pretrained(self.model_id)
+        self.model = LlavaOnevisionForConditionalGeneration.from_pretrained(self.model_id, torch_dtype=dtype, low_cpu_mem_usage=True)
+        self.dtype = dtype
+        self.device = device
+
+        self.model.to(device)
+
+        self.generation_args = {
+            "max_new_tokens": 10,
+            "temperature": 0.3,
+            "do_sample": True,
+            "use_cache": True,
+            "num_beams": 1,
+        }
+
+    def generate_response(self, image, question):
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": question},
+                    {"type": "image"},
+                ],
+            },
+        ]
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
+        # autoregressively complete prompt
+        self.model.generation_config.pad_token_id = self.processor.tokenizer.pad_token_id
+        with torch.inference_mode():
+            output = self.model.generate(**inputs, **self.generation_args)
+        output = output[:, inputs["input_ids"].shape[-1] :]
+        return self.processor.decode(output[0], skip_special_tokens=True).strip()
vlms/magma.py
ADDED
@@ -0,0 +1,40 @@
+from transformers import AutoModelForCausalLM, AutoProcessor
+import torch
+import torch.nn as nn
+from PIL import Image
+import requests
+
+model_id = "microsoft/Magma-8B"
+class MagmaAgent(nn.Module):
+    def __init__(self, device="cuda", dtype=torch.float16):
+        super().__init__()
+
+        self.model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=dtype, low_cpu_mem_usage=True)
+        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+        self.dtype = dtype
+        self.device = device
+        self.model.to(device)
+
+        self.generation_args = {
+            "max_new_tokens": 10,
+            "temperature": 0.3,
+            "do_sample": True,
+            "use_cache": True,
+            "num_beams": 1,
+        }
+
+    def generate_response(self, image, question):
+        convs = [
+            {"role": "system", "content": "You are an agent that can see, talk, and act."},
+            {"role": "user", "content": "<image_start><image><image_end>\n{}".format(question)},
+        ]
+        prompt = self.processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
+        inputs = self.processor(images=[image], texts=prompt, return_tensors="pt").to(self.dtype).to(self.device)
+        inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
+        inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
+
+        with torch.inference_mode():
+            generate_ids = self.model.generate(**inputs, **self.generation_args)
+        generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
+        action = self.processor.decode(generate_ids[0], skip_special_tokens=True).strip()
+        return action
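A minimal usage sketch for MagmaAgent, mirroring the arguments app.py passes (the sample image is just a file that ships with the Space; any PIL image works):

# Usage sketch; assumes a CUDA device and the repo assets are available.
import torch
from PIL import Image
from vlms.magma import MagmaAgent

agent = MagmaAgent("cuda:0", torch.bfloat16)  # same construction as agent_1 in app.py
frame = Image.open("./assets/images/magma_game_thin.png").convert("RGB")
print(agent.generate_response(frame, "Which mark is closer to green block? Answer with a single number."))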
vlms/qwen2vl.py
ADDED
@@ -0,0 +1,59 @@
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+
+import torch
+import torch.nn as nn
+from PIL import Image
+import requests
+
+class Qwen2VLAgent(nn.Module):
+    model_id = "Qwen/Qwen2-VL-7B-Instruct"
+    def __init__(self, device="cuda", dtype=torch.float16):
+        super().__init__()
+
+        self.processor = AutoProcessor.from_pretrained(self.model_id)
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.model_id, torch_dtype=dtype, low_cpu_mem_usage=True)
+        self.dtype = dtype
+        self.device = device
+
+        self.model.to(device)
+
+        self.generation_args = {
+            "max_new_tokens": 10,
+            "temperature": 0.3,
+            "do_sample": True,
+            "use_cache": True,
+            "num_beams": 1,
+        }
+
+    def generate_response(self, image, question):
+        image.save('qwen25vl.png')
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": question},
+                    {"type": "image", "image": "qwen25vl.png"},
+                ],
+            },
+        ]
+
+        # Preparation for inference
+        text = self.processor.apply_chat_template(
+            conversation, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(conversation)
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        ).to(self.device)
+
+        # autoregressively complete prompt
+        self.model.generation_config.pad_token_id = self.processor.tokenizer.pad_token_id
+        with torch.inference_mode():
+            output = self.model.generate(**inputs, **self.generation_args)
+        output = output[:, inputs["input_ids"].shape[-1] :]
+        return self.processor.decode(output[0], skip_special_tokens=True).strip()
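Qwen2VLAgent.generate_response saves each frame to qwen25vl.png so that process_vision_info can reload it by path, which means one disk write per polling tick. A hedged alternative sketch that keeps the frame in memory; it assumes the stock Hugging Face Qwen2-VL processor accepts PIL images directly and is untested here:

# Sketch of an in-memory variant (assumption: the Qwen2-VL processor accepts PIL
# images passed via images=[...], making the temporary PNG unnecessary).
def generate_response_in_memory(self, image, question):
    conversation = [
        {"role": "user", "content": [{"type": "text", "text": question}, {"type": "image"}]},
    ]
    text = self.processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt").to(self.device)
    with torch.inference_mode():
        output = self.model.generate(**inputs, **self.generation_args)
    output = output[:, inputs["input_ids"].shape[-1]:]
    return self.processor.decode(output[0], skip_special_tokens=True).strip()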