johnbridges committed
Commit 9f57ecf · 1 Parent(s): 8ecfcea
app.py ADDED
@@ -0,0 +1,180 @@
1
+ import base64, os
2
+ # import spaces
3
+ import json
4
+ import torch
5
+ import gradio as gr
6
+ from typing import Optional
7
+ from PIL import Image, ImageDraw
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ from qwen_vl_utils import process_vision_info
11
+ from datasets import load_dataset
12
+ from transformers import AutoProcessor
13
+ from gui_actor.constants import chat_template
14
+ from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
15
+ from gui_actor.inference import inference
16
+
17
+ MAX_PIXELS = 3200 * 1800
18
+
19
+ def resize_image(image, resize_to_pixels=MAX_PIXELS):
20
+ image_width, image_height = image.size
21
+ if (resize_to_pixels is not None) and ((image_width * image_height) != resize_to_pixels):
22
+ resize_ratio = (resize_to_pixels / (image_width * image_height)) ** 0.5
23
+ image_width_resized, image_height_resized = int(image_width * resize_ratio), int(image_height * resize_ratio)
24
+ image = image.resize((image_width_resized, image_height_resized))
25
+ return image
26
+
27
+ # @spaces.GPU
28
+ @torch.inference_mode()
29
+ def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
30
+ overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
31
+ overlay_draw = ImageDraw.Draw(overlay)
32
+ x, y = point
33
+ overlay_draw.ellipse(
34
+ [(x - radius, y - radius), (x + radius, y + radius)],
35
+ outline=color,
36
+ width=5 # Adjust thickness as needed
37
+ )
38
+ image = image.convert('RGBA')
39
+ combined = Image.alpha_composite(image, overlay)
40
+ combined = combined.convert('RGB')
41
+ return combined
42
+
43
+ # @spaces.GPU
44
+ @torch.inference_mode()
45
+ def get_attn_map(image, attn_scores, n_width, n_height):
46
+ w, h = image.size
47
+ scores = np.array(attn_scores[0]).reshape(n_height, n_width)
48
+
49
+ scores_norm = (scores - scores.min()) / (scores.max() - scores.min())
50
+ # Resize score map to match image size
51
+ score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST)  # Image.BILINEAR gives a smoother overlay
52
+ # Apply colormap
53
+ colormap = plt.get_cmap('jet')
54
+ colored_score_map = colormap(np.array(score_map) / 255.0) # returns RGBA
55
+ colored_score_map = (colored_score_map[:, :, :3] * 255).astype(np.uint8)
56
+ colored_overlay = Image.fromarray(colored_score_map)
57
+
58
+ # Blend with original image
59
+ blended = Image.blend(image, colored_overlay, alpha=0.3)
60
+ return blended
61
+
62
+ # load model
63
+ if torch.cuda.is_available():
64
+ # os.system('pip install flash-attn --no-build-isolation')
65
+ model_name_or_path = "microsoft/GUI-Actor-7B-Qwen2.5-VL"
66
+ data_processor = AutoProcessor.from_pretrained(model_name_or_path)
67
+ tokenizer = data_processor.tokenizer
68
+ model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
69
+ model_name_or_path,
70
+ torch_dtype=torch.bfloat16,
71
+ device_map="cuda:0",
72
+ attn_implementation="flash_attention_2"
73
+ ).eval()
74
+ else:
75
+ model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
76
+ data_processor = AutoProcessor.from_pretrained(model_name_or_path)
77
+ tokenizer = data_processor.tokenizer
78
+ model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
79
+ model_name_or_path,
80
+ torch_dtype=torch.bfloat16,
81
+ device_map="cpu"
82
+ ).eval()
83
+
84
+ title = "GUI-Actor"
85
+ header = """
86
+ <div align="center">
87
+ <h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
88
+ <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
89
+ Qianhui Wu*, Kanzhi Cheng*, Rui Yang*, Chaoyun Zhang, Jianwei Yang, Huiqiang Jiang, Jian Mu, Baolin Peng, Bo Qiao, Reuben Tan, Si Qin, Lars Liden<br>
90
+ Qingwei Lin, Huan Zhang, Tong Zhang, Jianbing Zhang, Dongmei Zhang, Jianfeng Gao<br/>
91
+ </div>
92
+ <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
93
+ <a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
94
+ </div>
95
+ </div>
96
+ """
97
+
98
+ theme = "soft"
99
+ css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
100
+ #anno-img .mask.active {opacity: 0.7}"""
101
+
102
+ # @spaces.GPU
103
+ @torch.inference_mode()
104
+ def process(image, instruction):
105
+ # resize image
106
+ w, h = image.size
107
+ if w * h > MAX_PIXELS:
108
+ image = resize_image(image)
+ w, h = image.size  # refresh dimensions so the predicted point maps onto the resized image
109
+
110
+ conversation = [
111
+ {
112
+ "role": "system",
113
+ "content": [
114
+ {
115
+ "type": "text",
116
+ "text": "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>).",
117
+ }
118
+ ]
119
+ },
120
+ {
121
+ "role": "user",
122
+ "content": [
123
+ {
124
+ "type": "image",
125
+ "image": image, # PIL.Image.Image or str to path
126
+ # "image_url": "https://xxxxx.png" or "https://xxxxx.jpg" or "file://xxxxx.png" or "data:image/png;base64,xxxxxxxx", will be split by "base64,"
127
+ },
128
+ {
129
+ "type": "text",
130
+ "text": instruction,
131
+ },
132
+ ],
133
+ },
134
+ ]
135
+
136
+ try:
137
+ pred = inference(conversation, model, tokenizer, data_processor, use_placeholder=True, topk=3)
138
+ except Exception as e:
139
+ print(e)
140
+ return image, f"Error: {e}", None
141
+
142
+ px, py = pred["topk_points"][0]
143
+ output_coord = f"({px:.4f}, {py:.4f})"
144
+ img_with_point = draw_point(image, (px * w, py * h))
145
+
146
+ n_width, n_height = pred["n_width"], pred["n_height"]
147
+ attn_scores = pred["attn_scores"]
148
+ att_map = get_attn_map(image, attn_scores, n_width, n_height)
149
+
150
+ return img_with_point, output_coord, att_map
151
+
152
+
153
+ with gr.Blocks(title=title, css=css) as demo:
154
+ gr.Markdown(header)
155
+ with gr.Row():
156
+ with gr.Column():
157
+ input_image = gr.Image(
158
+ type='pil', label='Upload image')
159
+ # text box
160
+ input_instruction = gr.Textbox(label='Instruction', placeholder='Type your (low-level) instruction here')
161
+ submit_button = gr.Button(
162
+ value='Submit', variant='primary')
163
+ with gr.Column():
164
+ image_with_point = gr.Image(type='pil', label='Image with Point (red circle)')
165
+ with gr.Accordion('Detailed prediction'):
166
+ pred_xy = gr.Textbox(label='Predicted Coordinates', placeholder='(x, y)')
167
+ att_map = gr.Image(type='pil', label='Attention Map')
168
+
169
+ submit_button.click(
170
+ fn=process,
171
+ inputs=[
172
+ input_image,
173
+ input_instruction
174
+ ],
175
+ outputs=[image_with_point, pred_xy, att_map]
176
+ )
177
+
178
+ # demo.launch(debug=False, show_error=True, share=True)
179
+ # demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
180
+ demo.queue().launch(share=False)
gui_actor/__init__.py ADDED
File without changes
gui_actor/constants.py ADDED
@@ -0,0 +1,40 @@
1
+ import json
2
+
3
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
4
+ WORKER_HEART_BEAT_INTERVAL = 15
5
+
6
+ LOGDIR = "."
7
+
8
+ # Model Constants
9
+ IGNORE_INDEX = -100
10
+ DEFAULT_IMAGE_TOKEN = "<image>"
11
+ DEFAULT_POINTER_START_TOKEN = "<|pointer_start|>"
12
+ DEFAULT_POINTER_END_TOKEN = "<|pointer_end|>"
13
+ DEFAULT_POINTER_PAD_TOKEN = "<|pointer_pad|>"
14
+
15
+ # UNMASK_TOKEN_IDS = [198, 151644, 151645]
16
+
17
+ # System Message
18
+ grounding_system_message = "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>)."
19
+
20
+ # Chat Template
21
+ chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
22
+
23
+ assistant_template = "{% for message in messages %}{{'<|im_start|>' + message['role']}}{% if 'recipient' in message %}<|recipient|>{{ message['recipient'] }}{% endif %}{{'\n' + message['content'][0]['text']}}{% if 'end_turn' in message and message['end_turn'] %}{{'<|diff_marker|>\n'}}{% else %}{{'<|im_end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|recipient|>' }}{% endif %}"
24
+
25
+ # Special Tokens
26
+ ADDITIONAL_SPECIAL_TOKENS = [
27
+ "<|recipient|>",
28
+ "<|diff_marker|>",
29
+ DEFAULT_POINTER_START_TOKEN,
30
+ DEFAULT_POINTER_END_TOKEN,
31
+ DEFAULT_POINTER_PAD_TOKEN,
32
+ ]
33
+
34
+ # Action Patterns to be replaced with special tokens
35
+ ACTION_PATTENS_XY = [
36
+ r"x=([0-9.]+), y=([0-9.]+)",
37
+ r"from_coord=\[([0-9.]+), ([0-9.]+)\], to_coord=\[([0-9.]+), ([0-9.]+)\]",
38
+ ]
39
+
40
+ until = ["<|diff_marker|>"]
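For reference, a minimal sketch of how the ACTION_PATTENS_XY patterns and the pointer tokens are meant to interact (the action string below is an illustrative example, not data from this repo):

import re
from gui_actor.constants import (
    ACTION_PATTENS_XY,
    DEFAULT_POINTER_START_TOKEN,
    DEFAULT_POINTER_PAD_TOKEN,
    DEFAULT_POINTER_END_TOKEN,
)

# A click action with literal normalized coordinates, as it might appear in training data.
action = "pyautogui.click(x=0.6312, y=0.2745)"
pointer = f"{DEFAULT_POINTER_START_TOKEN}{DEFAULT_POINTER_PAD_TOKEN}{DEFAULT_POINTER_END_TOKEN}"

# The first pattern matches the "x=..., y=..." form; substitution yields the coordinate-free target text.
rewritten = re.sub(ACTION_PATTENS_XY[0], pointer, action)
print(rewritten)  # pyautogui.click(<|pointer_start|><|pointer_pad|><|pointer_end|>)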
gui_actor/dataset.py ADDED
@@ -0,0 +1,533 @@
1
+ import copy
2
+ import json
3
+ import math
4
+ import os
5
+ import random
6
+ import re
7
+ import ast
8
+ from typing import Dict
9
+
10
+ import torch
11
+ import transformers
12
+ import yaml
13
+ from qwen_vl_utils import smart_resize, process_vision_info
14
+ from torch.utils.data import Dataset
15
+
16
+ from gui_actor.constants import (
17
+ IGNORE_INDEX,
18
+ DEFAULT_IMAGE_TOKEN,
19
+ DEFAULT_POINTER_START_TOKEN,
20
+ DEFAULT_POINTER_PAD_TOKEN,
21
+ DEFAULT_POINTER_END_TOKEN,
22
+ ACTION_PATTENS_XY,
23
+ ADDITIONAL_SPECIAL_TOKENS,
24
+ assistant_template,
25
+ chat_template,
26
+ grounding_system_message,
27
+ )
28
+ from gui_actor.trainer import rank0_print
29
+
30
+
31
+ def reformat_coordinates(text):
32
+ """
33
+ (1) Find all the coordinates in the text.
34
+ (2) Replace the coordinates with the special tokens.
35
+ (3) Return the new text and the coordinates as a list of (x, y), where x in [0, 1] and y in [0, 1].
36
+ """
37
+ epsilon = 0.001
38
+ def adjust_coord(c):
39
+ """
40
+ Adjust coordinate if it is too close to 0 or 1.
41
+ """
42
+ if abs(c) < epsilon:
43
+ return epsilon
44
+ elif abs(c - 1) < epsilon:
45
+ return 1 - epsilon
46
+ return c
47
+
48
+ all_matches = []
49
+ for pattern in ACTION_PATTENS_XY:
50
+ matches = list(re.finditer(pattern, text))
51
+ for match in matches:
52
+ all_matches.append((match.start(), match.groups()))
53
+ if pattern == ACTION_PATTENS_XY[0]:
54
+ target_text = f"{DEFAULT_POINTER_START_TOKEN}{DEFAULT_POINTER_PAD_TOKEN}{DEFAULT_POINTER_END_TOKEN}"
55
+ else:
56
+ target_text = f"{DEFAULT_POINTER_START_TOKEN}{DEFAULT_POINTER_PAD_TOKEN}{DEFAULT_POINTER_END_TOKEN}, {DEFAULT_POINTER_START_TOKEN}{DEFAULT_POINTER_PAD_TOKEN}{DEFAULT_POINTER_END_TOKEN}"
57
+ text = re.sub(
58
+ pattern,
59
+ target_text,
60
+ text
61
+ )
62
+
63
+ coordinates = []
64
+ all_matches.sort(key=lambda x: x[0])
65
+ # Extract coordinates in order
66
+ for _, groups in all_matches:
67
+ # When two coordinate values are found, parse them as one (x, y) pair.
68
+ if len(groups) == 2:
69
+ x_str, y_str = groups
70
+ x = adjust_coord(ast.literal_eval(x_str))
71
+ y = adjust_coord(ast.literal_eval(y_str))
72
+ coordinates.append((x, y))
73
+ # When four coordinate values are found, parse them as two pairs.
74
+ elif len(groups) == 4:
75
+ x1_str, y1_str, x2_str, y2_str = groups
76
+ x1 = adjust_coord(ast.literal_eval(x1_str))
77
+ y1 = adjust_coord(ast.literal_eval(y1_str))
78
+ x2 = adjust_coord(ast.literal_eval(x2_str))
79
+ y2 = adjust_coord(ast.literal_eval(y2_str))
80
+ coordinates.append((x1, y1))
81
+ coordinates.append((x2, y2))
82
+
83
+ return text, coordinates
84
+
85
+ def get_token_index(image_processor, image, point_x, point_y):
86
+ """
87
+ Get the index of the visual token that contains the point (x, y).
88
+ Args:
89
+ image_processor: the image processor
90
+ image: the image in PIL format
91
+ point_x: the x coordinate of the point, in [0, 1].
92
+ point_y: the y coordinate of the point, in [0, 1].
93
+ """
94
+ if len(image) != 1:
95
+ raise ValueError(f"Expected 1 image, got {len(image)}")
96
+
97
+ # get the original image size and the resized image size
98
+ image = image[0]
99
+ w, h = image.size
100
+ px, py = w * point_x, h * point_y
101
+ # rank0_print(f"px: {px}, py: {py}")
102
+ # get the token index
103
+ merge_patch_size = image_processor.patch_size * image_processor.merge_size
104
+ x_index = math.floor(px / merge_patch_size)
105
+ y_index = math.floor(py / merge_patch_size)
106
+
107
+ visual_token_index = y_index * (w // merge_patch_size) + x_index
108
+
109
+ # merge all above print into one line
110
+ return visual_token_index
111
+
112
+ def get_multi_patch_labels(image_processor, image, bbox_gt):
113
+ """
114
+ Get the multi-patch labels for the bounding box.
115
+ Args:
116
+ image_processor: the image processor
117
+ image: the image in PIL format
118
+ bbox_gt: the bounding box in the format of (x_min, y_min, x_max, y_max) [0,1]
119
+ """
120
+ if len(image) != 1:
121
+ raise ValueError(f"Expected 1 image, got {len(image)}")
122
+
123
+ # Get the original image size and the resized image size
124
+ image = image[0]
125
+ w, h = image.size
126
+
127
+ bbox_gt = [bbox_gt[0]*w, bbox_gt[1]*h, bbox_gt[2]*w, bbox_gt[3]*h]
128
+ # Extract bounding box coordinates
129
+ x_min, y_min, x_max, y_max = bbox_gt
130
+ x_min = max(0, x_min)
131
+ y_min = max(0, y_min)
132
+ x_max = min(w, x_max)
133
+ y_max = min(h, y_max)
134
+
135
+ merge_patch_size = image_processor.patch_size * image_processor.merge_size
136
+ assert w % merge_patch_size == 0 and h % merge_patch_size == 0, f"Image size {w}x{h} is not divisible by merge_patch_size {merge_patch_size}"
137
+ grid_h, grid_w = h // merge_patch_size, w // merge_patch_size
138
+
139
+ binary_mask = torch.zeros(grid_h * grid_w)
140
+ # Iterate through all patches, check if they overlap with the bounding box
141
+ for y_idx in range(grid_h):
142
+ for x_idx in range(grid_w):
143
+ # Calculate patch boundaries
144
+ patch_x_min = x_idx * merge_patch_size
145
+ patch_y_min = y_idx * merge_patch_size
146
+ patch_x_max = patch_x_min + merge_patch_size
147
+ patch_y_max = patch_y_min + merge_patch_size
148
+
149
+ # Check if patch overlaps with the bounding box
150
+ if not (patch_x_max <= x_min or patch_x_min >= x_max or
151
+ patch_y_max <= y_min or patch_y_min >= y_max):
152
+ # Calculate patch index in the flattened grid
153
+ patch_idx = y_idx * grid_w + x_idx
154
+ binary_mask[patch_idx] = 1
155
+
156
+ return binary_mask
157
+
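As a sanity check of the patch-overlap logic above, a small standalone example; the SimpleNamespace stands in for the Qwen2-VL image processor (patch_size=14 and merge_size=2 are the usual defaults, assumed here), and it presumes the gui_actor package and its dependencies import cleanly:

from types import SimpleNamespace
from PIL import Image
from gui_actor.dataset import get_multi_patch_labels

fake_processor = SimpleNamespace(patch_size=14, merge_size=2)   # 28x28-pixel merged patches
image = [Image.new("RGB", (112, 56))]                           # 4x2 grid of merged patches
bbox = (0.25, 0.0, 0.75, 1.0)                                   # normalized (x_min, y_min, x_max, y_max)

mask = get_multi_patch_labels(fake_processor, image, bbox)
print(mask.reshape(2, 4))
# tensor([[0., 1., 1., 0.],
#         [0., 1., 1., 0.]])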
158
+ def token_index_to_coordinates(image_processor, visual_token_index, image_width, image_height):
159
+ merge_patch_size = image_processor.patch_size * image_processor.merge_size
160
+ x_index = visual_token_index % (image_width // merge_patch_size)
161
+ y_index = visual_token_index // (image_width // merge_patch_size)
162
+ px = x_index * merge_patch_size + merge_patch_size / 2
163
+ py = y_index * merge_patch_size + merge_patch_size / 2
164
+ return px, py
165
+
166
+ class LazySupervisedDataset(Dataset):
167
+ def __init__(
168
+ self,
169
+ tokenizer: transformers.PreTrainedTokenizer,
170
+ processor: transformers.ProcessorMixin,
171
+ data_path: str,
172
+ data_args,
173
+ ):
174
+ super().__init__()
175
+ self.tokenizer = tokenizer
176
+ self.processor = processor
177
+ self.list_data_dict = []
178
+ self.list_image_path = []
179
+ self.pointer_pad_token_id = tokenizer.encode(DEFAULT_POINTER_PAD_TOKEN)[0]
180
+ self.pointer_start_token_id = tokenizer.encode(DEFAULT_POINTER_START_TOKEN)[0]
181
+ self.pointer_end_token_id = tokenizer.encode(DEFAULT_POINTER_END_TOKEN)[0]
182
+
183
+ # Handle multiple JSON files specified in the data_path
184
+ if "{" in data_path and "}" in data_path:
185
+ base_path, file_pattern = re.match(r"^(.*)\{(.*)\}\.json$", data_path).groups()
186
+ file_names = file_pattern.split(",")
187
+ rank0_print(f"Loading {file_names} from {base_path}")
188
+ data_args.dataset_paths = []
189
+ for file_name in file_names:
190
+ data_args.dataset_paths.append(f"{base_path}{file_name}.json")
191
+ full_path = f"{base_path}{file_name}.json"
192
+ rank0_print(f"Loading {full_path}")
193
+ with open(full_path) as file:
194
+ cur_data_dict = json.load(file)
195
+ rank0_print(f"Loaded {len(cur_data_dict)} samples from {full_path}")
196
+ self.list_data_dict.extend(cur_data_dict)
197
+ elif data_path.endswith(".yaml"):
198
+ with open(data_path) as file:
199
+ yaml_data = yaml.safe_load(file)
200
+ datasets = yaml_data.get("datasets")
201
+ # file should be in the format of:
202
+ # datasets:
203
+ # - json_path: xxxx1.json
204
+ # sampling_strategy: first:1000
205
+ # - json_path: xxxx2.json
206
+ # sampling_strategy: end:3000
207
+ # - json_path: xxxx3.json
208
+ # sampling_strategy: random:999
209
+ data_args.dataset_paths = [dataset.get("json_path") for dataset in datasets]
210
+ for dataset in datasets:
211
+ json_path = dataset.get("json_path")
212
+ sampling_strategy = dataset.get("sampling_strategy", "all")
213
+ images_folder = dataset.get("images_folder")
214
+ sampling_number = None
215
+
216
+ rank0_print(f"Loading {json_path} with {sampling_strategy} sampling strategy")
217
+
218
+ if json_path.endswith(".jsonl"):
219
+ cur_data_dict = []
220
+ with open(json_path) as json_file:
221
+ for line in json_file:
222
+ cur_data_dict.append(json.loads(line.strip()))
223
+ elif json_path.endswith(".json"):
224
+ # NOTE: we only use json_path with .json now
225
+ # Handle the images_folder in yaml
226
+ with open(json_path) as json_file:
227
+ cur_data_dict = json.load(json_file)
228
+ else:
229
+ raise ValueError(f"Unsupported file type: {json_path}")
230
+
231
+ if ":" in sampling_strategy:
232
+ sampling_strategy, sampling_number = sampling_strategy.split(":")
233
+ if "%" in sampling_number:
234
+ sampling_number = math.ceil(int(sampling_number.split("%")[0]) * len(cur_data_dict) / 100)
235
+ else:
236
+ sampling_number = int(sampling_number)
237
+
238
+ # Apply the sampling strategy
239
+ if sampling_strategy == "first" and sampling_number is not None:
240
+ cur_data_dict = cur_data_dict[:sampling_number]
241
+ elif sampling_strategy == "end" and sampling_number is not None:
242
+ cur_data_dict = cur_data_dict[-sampling_number:]
243
+ elif sampling_strategy == "random" and sampling_number is not None:
244
+ random.shuffle(cur_data_dict)
245
+ cur_data_dict = cur_data_dict[:sampling_number]
246
+
247
+ rank0_print(f"Loaded {len(cur_data_dict)} samples from {json_path}")
248
+ self.list_data_dict.extend(cur_data_dict)
249
+ self.list_image_path.extend([images_folder] * len(cur_data_dict))
250
+ else:
251
+ data_args.dataset_paths = [data_path]
252
+ rank0_print(f"Loading {data_path}")
253
+ with open(data_path) as file:
254
+ cur_data_dict = json.load(file)
255
+ rank0_print(f"Loaded {len(cur_data_dict)} samples from {data_path}")
256
+ self.list_data_dict.extend(cur_data_dict)
257
+ self.list_image_path.extend([""] * len(cur_data_dict)) # NOTE: the image subfolder is empty...
258
+
259
+ rank0_print(f"Loaded {len(self.list_data_dict)} samples from {data_path}")
260
+ rank0_print("Formatting inputs...Skip in lazy mode")
261
+ self.tokenizer = tokenizer
262
+ self.data_args = data_args
263
+
264
+ def __len__(self):
265
+ return len(self.list_data_dict)
266
+
267
+ @property
268
+ def lengths(self):
269
+ length_list = []
270
+ for sample in self.list_data_dict:
271
+ img_tokens = (
272
+ 1200 * len(sample["image"]) if isinstance(sample["image"], list) else 1200 if "image" in sample else 0
273
+ )
274
+ length_list.append(sum(len(conv["value"].split()) for conv in sample["conversations"]) + img_tokens)
275
+ return length_list
276
+
277
+ @property
278
+ def modality_lengths(self):
279
+ length_list = []
280
+ for sample in self.list_data_dict:
281
+ cur_len = sum(len(conv["value"].split()) for conv in sample["conversations"])
282
+ assert cur_len > 0, f"Conversation length is 0 for {sample}"
283
+
284
+ img_tokens = (
285
+ 1200 * len(sample["image"]) if isinstance(sample["image"], list) else 1200 if "image" in sample else 0
286
+ )
287
+
288
+ if "image" in sample or "video" in sample or self.data_args.early_mix_text:
289
+ length_list.append(cur_len + img_tokens)
290
+ else:
291
+ length_list.append(-cur_len)
292
+ return length_list
293
+
294
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
295
+ sample = self._get_item(i)
296
+ if sample is None:
297
+ new_index = random.randint(0, len(self.list_data_dict) - 1)
298
+ return self.__getitem__(new_index)
299
+ else:
300
+ return sample
311
+
312
+ def _get_item(self, i) -> Dict[str, torch.Tensor]:
313
+ sources = self.list_data_dict[i]
314
+ image_path = os.path.join(self.data_args.image_folder, self.list_image_path[i])
315
+
316
+ if "image" in sources:
317
+ image_file = self.list_data_dict[i]["image"]
318
+ if type(image_file) is list:
319
+ image_list = [os.path.join(image_path, image_file) for image_file in image_file]
320
+ else:
321
+ image_list = [os.path.join(image_path, image_file)]
322
+
323
+ sources = copy.deepcopy(sources["conversations"])
324
+ elif "video" in sources:
325
+ raise NotImplementedError("Video is not supported for Qwen2VL")
326
+ else:
327
+ sources = copy.deepcopy(sources["conversations"])
328
+
329
+ item_id = self.list_data_dict[i].get("id", i)
330
+
331
+ data_dict = self.preprocess_qwen2vl(sources, self.tokenizer, self.processor, image_list, id=item_id)
332
+ if isinstance(i, int):
333
+ data_dict = {
334
+ "input_ids": data_dict["input_ids"][0],
335
+ "labels": data_dict["labels"][0],
336
+ "coordinates": data_dict["coordinates"][0],
337
+ "visual_token_indices_of_coordinates": data_dict["visual_token_indices_of_coordinates"][0],
338
+ "pixel_values": data_dict["pixel_values"],
339
+ "image_grid_thw": data_dict["image_grid_thw"],
340
+ "multi_patch_labels": data_dict["multi_patch_labels"][0], # add multi_patch_labels
341
+ }
342
+
343
+ data_dict["id"] = item_id
344
+
345
+ # return None if the input_ids is longer than the model_max_length
346
+ n_image_tokens = (
347
+ data_dict["image_grid_thw"][0][0] *
348
+ data_dict["image_grid_thw"][0][1] *
349
+ data_dict["image_grid_thw"][0][2] /
350
+ self.processor.image_processor.merge_size /
351
+ self.processor.image_processor.merge_size
352
+ )
353
+ if (len(data_dict["input_ids"]) + n_image_tokens) > self.tokenizer.model_max_length:
354
+ rank0_print(f"=== Removed data_dict {i} because it is longer than the model_max_length: {len(data_dict['input_ids'])} + {n_image_tokens} > {self.tokenizer.model_max_length}")
355
+ return None
356
+
357
+ return data_dict
358
+
359
+ def preprocess_qwen2vl(
360
+ self,
361
+ source, # conversations
362
+ tokenizer: transformers.PreTrainedTokenizer,
363
+ processor: transformers.ProcessorMixin,
364
+ image: list,
365
+ system_message: str = grounding_system_message,
366
+ agent_mode: bool = True,
367
+ chat_template: str = chat_template,
368
+ assistant_template: str = assistant_template,
369
+ id: int = None,
370
+ ) -> Dict:
371
+ roles = {"human": "user", "gpt": "assistant", "system": "system"}
372
+ assistant_template = assistant_template if agent_mode else chat_template
373
+ processor.tokenizer = tokenizer
374
+ assert tokenizer.additional_special_tokens == ADDITIONAL_SPECIAL_TOKENS
375
+
376
+ # Apply prompt templates
377
+ pixel_values, image_grid_thw = None, None
378
+
379
+ input_id, target = [], []
380
+ coordinates = []
381
+ visual_token_indices_of_coordinates = []
382
+ multi_patch_labels = []
383
+
384
+ image_list = []
385
+ image_index = 0
386
+
387
+ ## prepare the system message
388
+ if roles[source[0]["from"]] == "system":
389
+ system_message = source[0]["value"]
390
+ source = source[1:self.data_args.max_conv_turns]
391
+ # else: use the constant system message
392
+ system_input_id = tokenizer.apply_chat_template(
393
+ conversation=[{"role": "system", "content": [{"type": "text", "text": system_message}]}],
394
+ chat_template=chat_template,
395
+ )
396
+ input_id += system_input_id
397
+ target += [IGNORE_INDEX] * len(system_input_id)
398
+
399
+ ## prepare user-assistant conversation
400
+ for conv in source:
401
+ # regularize the conversation format
402
+ try:
403
+ role = conv["role"]
404
+ content = conv["content"]
405
+ except Exception:
406
+ role = conv["from"]
407
+ content = conv["value"]
408
+ role = roles.get(role, role)
409
+
410
+ # Count the number of <image> tokens in the content
411
+ image_count = content.count(DEFAULT_IMAGE_TOKEN)
412
+ if image_count > 0:
413
+ assert role == "user", "Images are only supported for user messages"
414
+ # include image information regarding to current conversation turn
415
+ image_placeholders = []
416
+ for _ in range(image_count):
417
+ image_placeholders.append({
418
+ "type": "image",
419
+ "image": image[image_index],
420
+ "min_pixels": self.processor.image_processor.min_pixels,
421
+ "max_pixels": self.processor.image_processor.max_pixels,
422
+ })
423
+ image_index += 1
424
+
425
+ content = content.replace(DEFAULT_IMAGE_TOKEN, "")
426
+ conv = {"role": role, "content": image_placeholders + [{"type": "text", "text": content}]}
427
+
428
+ image_inputs, _ = process_vision_info([conv]) # list of PIL.Image.Image
429
+ image_list.extend(image_inputs)
430
+
431
+ templated_conv = tokenizer.apply_chat_template(
432
+ conversation=[conv], chat_template=chat_template, tokenize=False
433
+ )
434
+ inputs = processor(text=[templated_conv], images=image_inputs, return_tensors="pt")
435
+
436
+ if pixel_values is None and image_grid_thw is None:
437
+ pixel_values = inputs["pixel_values"]
438
+ image_grid_thw = inputs["image_grid_thw"]
439
+ else:
440
+ pixel_values = torch.concat([pixel_values, inputs["pixel_values"]], dim=0)
441
+ image_grid_thw = torch.concat([image_grid_thw, inputs["image_grid_thw"]], dim=0)
442
+ else:
443
+ if role in ["user", "system"]:
444
+ conv = {"role": role, "content": [{"type": "text", "text": content}]}
445
+ else: # assistant
446
+ conv = {
447
+ "role": role,
448
+ "content": [{"type": "text", "text": content}],
449
+ "recipient": conv.get("recipient", "os"),
450
+ "end_turn": conv.get("end_turn", True),
451
+ "bbox_gt": conv.get("bbox_gt", None),
452
+ }
453
+ if conv["recipient"] == "os":
454
+ if len(image_inputs) == 0:
455
+ raise ValueError("No image found for visual grounding")
456
+ # replace the coordinates with the special tokens
457
+ text, coord = reformat_coordinates(conv["content"][0]["text"])
458
+ conv["content"][0]["text"] = text
459
+ # rank0_print(f"coord: {coord}")
460
+
461
+ # get the visual token indices of the coordinates
462
+ coordinates.extend(coord)
463
+ for (point_x, point_y) in coord:
464
+ visual_token_index = get_token_index(
465
+ processor.image_processor,
466
+ image_list,
467
+ point_x,
468
+ point_y
469
+ )
470
+ # px, py = token_index_to_coordinates(
471
+ # processor.image_processor,
472
+ # visual_token_index,
473
+ # image_list[0].size[0], # make sure the size here is after qwen2vl processing
474
+ # image_list[0].size[1]
475
+ # )
476
+ # rank0_print(f"estimated px: {px}, py: {py}")
477
+ visual_token_indices_of_coordinates.append(visual_token_index)
478
+
479
+ if conv["bbox_gt"] is not None:
480
+ patch_mask = get_multi_patch_labels(
481
+ processor.image_processor,
482
+ image_list,
483
+ conv["bbox_gt"]
484
+ )
485
+ multi_patch_labels.append(patch_mask)
486
+
487
+ templated_conv = tokenizer.apply_chat_template(
488
+ conversation=[conv],
489
+ chat_template=assistant_template,
490
+ tokenize=False,
491
+ )
492
+ inputs = processor(text=[templated_conv], return_tensors="pt")
493
+
494
+ encode_id = inputs.input_ids[0].tolist()
495
+
496
+ input_id += encode_id
497
+ if role in ["user", "system"]:
498
+ target += [IGNORE_INDEX] * len(encode_id)
499
+ else:
500
+ target += encode_id
501
+
502
+ assert len(input_id) == len(target), f"{len(input_id)} != {len(target)}"
503
+
504
+ # make the labels of all pointer_end_token_id to be IGNORE_INDEX
505
+ target = [IGNORE_INDEX if token == self.pointer_end_token_id else token for token in target]
506
+
507
+ input_ids = torch.tensor([input_id], dtype=torch.long)
508
+ targets = torch.tensor([target], dtype=torch.long)
509
+ visual_token_indices_of_coordinates = torch.tensor([visual_token_indices_of_coordinates], dtype=torch.long) if len(visual_token_indices_of_coordinates) > 0 else [None]
510
+ coordinates = [coordinates] if len(coordinates) > 0 else [None]
511
+
512
+ # process multi_patch_labels
513
+ if len(multi_patch_labels) > 0:
514
+ multi_patch_labels = [torch.stack(multi_patch_labels)]
515
+ else:
516
+ multi_patch_labels = [None]
517
+
518
+ data_dict = {
519
+ "input_ids": input_ids, # tensor(bs x seq_len)
520
+ "labels": targets, # tensor(bs x seq_len)
521
+ }
522
+
523
+ if pixel_values is not None:
524
+ data_dict["pixel_values"] = pixel_values
525
+ data_dict["image_grid_thw"] = image_grid_thw
526
+
527
+ # if len(coordinates[0]) != len(visual_token_indices_of_coordinates[0]):
528
+ # raise ValueError(f"The number of coordinates ({len(coordinates[0])}) does not match the number of image token indices ({len(visual_token_indices_of_coordinates[0])})")
529
+ data_dict["coordinates"] = coordinates
530
+ data_dict["visual_token_indices_of_coordinates"] = visual_token_indices_of_coordinates
531
+ data_dict["multi_patch_labels"] = multi_patch_labels
532
+
533
+ return data_dict
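For intuition, the patch-index arithmetic used by get_token_index() above works out as follows (pure-Python sketch; patch_size=14 and merge_size=2 are the assumed Qwen2-VL defaults):

import math

merge_patch_size = 14 * 2                    # image_processor.patch_size * image_processor.merge_size
w, h = 1008, 672                             # image size after Qwen2-VL resizing (divisible by 28)
point_x, point_y = 0.53, 0.50                # normalized ground-truth click point

px, py = w * point_x, h * point_y            # 534.24, 336.0 in pixels
x_index = math.floor(px / merge_patch_size)  # column 19 of 36
y_index = math.floor(py / merge_patch_size)  # row 12 of 24
visual_token_index = y_index * (w // merge_patch_size) + x_index
print(visual_token_index)                    # 451, the patch the pointer head should select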
gui_actor/inference.py ADDED
@@ -0,0 +1,300 @@
1
+ import torch
2
+ import json
3
+ import re
4
+ import os
5
+ from qwen_vl_utils import process_vision_info
6
+ from transformers import (
7
+ Qwen2VLForConditionalGeneration,
8
+ LogitsProcessor,
9
+ LogitsProcessorList,
10
+ AutoModelForCausalLM,
11
+ AutoTokenizer
12
+ )
13
+ from gui_actor.constants import (
14
+ DEFAULT_POINTER_END_TOKEN,
15
+ DEFAULT_POINTER_PAD_TOKEN,
16
+ chat_template
17
+ )
18
+
19
+ class ForceFollowTokensLogitsProcessor(LogitsProcessor):
20
+ """
21
+ Forces tokens B (pointer_pad_token) and C (pointer_end_token) to follow token A (pointer_start_token).
22
+ Whenever token_a_id is generated, enqueue the forced_sequence (e.g. [B, C]).
23
+ As long as forced tokens remain in the queue, force them in the output.
24
+ """
25
+ def __init__(self, token_a_id, forced_sequence=[DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN]):
26
+ super().__init__()
27
+ self.token_a_id = token_a_id
28
+ self.forced_sequence = forced_sequence # list of token IDs, e.g. [B_id, C_id]
29
+ self.force_queue = [] # holds the tokens we still need to force
30
+
31
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
32
+ """
33
+ Called at each decoding step to modify `scores`.
34
+
35
+ Args:
36
+ input_ids: shape (batch_size, seq_len). The already-decoded tokens.
37
+ scores: shape (batch_size, vocab_size). Model logits for the next token.
38
+ """
39
+ batch_size = input_ids.shape[0]
40
+ if batch_size > 1:
41
+ raise NotImplementedError("Batch size must be 1 for this logits processor.")
42
+
43
+ # We assume batch_size=1 for simplicity; if you have multiple sequences,
44
+ # you'll need to adapt the logic to handle each item in the batch.
45
+ last_token_id = input_ids[0, -1].item()
46
+
47
+ # If the last token was A, enqueue B and C
48
+ if last_token_id == self.token_a_id:
49
+ self.force_queue.extend(self.forced_sequence)
50
+
51
+ # If we have forced tokens waiting in the queue, override the distribution
52
+ if len(self.force_queue) > 0:
53
+ forced_token = self.force_queue.pop(0) # next token to force
54
+ # Create a mask of -inf for all tokens except the forced one
55
+ new_scores = torch.full_like(scores, float('-inf'))
56
+ new_scores[0, forced_token] = 0.0 # log prob = 0 => prob = 1
57
+ return new_scores
58
+
59
+ # Otherwise, return scores unmodified
60
+ return scores
61
+
62
+
63
+ def get_prediction_region_point(attn_scores, n_width, n_height, top_n=30, activation_threshold=0.3, return_all_regions=True, rect_center=False):
64
+ """
65
+ 1. Select activated patches
66
+ 2. Divide connected patches into different regions
67
+ 3. Calculate the average activation value for each region
68
+ 4. Select the region with the highest average activation value
69
+ 5. Return the center point of that region as the final prediction point
70
+ """
71
+
72
+ # Get patches with activation values greater than a certain proportion of the maximum activation value as activated patches
73
+ # Get the highest activation value and threshold
74
+ max_score = attn_scores[0].max().item()
75
+ threshold = max_score * activation_threshold
76
+ # Select all patches above the threshold
77
+ mask = attn_scores[0] > threshold
78
+ valid_indices = torch.nonzero(mask).squeeze(-1)
79
+ topk_values = attn_scores[0][valid_indices]
80
+ topk_indices = valid_indices
81
+
82
+ # Convert indices to 2D coordinates
83
+ topk_coords = []
84
+ for idx in topk_indices.tolist():
85
+ y = idx // n_width
86
+ x = idx % n_width
87
+ topk_coords.append((y, x, idx))
88
+
89
+ # Divide into connected regions
90
+ regions = []
91
+ visited = set()
92
+ for i, (y, x, idx) in enumerate(topk_coords):
93
+ if idx in visited:
94
+ continue
95
+
96
+ # Start a new region
97
+ region = [(y, x, idx, topk_values[i].item())]
98
+ visited.add(idx)
99
+ queue = [(y, x, idx, topk_values[i].item())]
100
+
101
+ # BFS to find connected points
102
+ while queue:
103
+ cy, cx, c_idx, c_val = queue.pop(0)
104
+
105
+ # Check 4 adjacent directions
106
+ for dy, dx in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
107
+ ny, nx = cy + dy, cx + dx
108
+ n_idx = ny * n_width + nx
109
+
110
+ # Check if this adjacent point is in the topk list
111
+ for j, (ty, tx, t_idx) in enumerate(topk_coords):
112
+ if ty == ny and tx == nx and t_idx not in visited:
113
+ visited.add(t_idx)
114
+ region.append((ny, nx, t_idx, topk_values[j].item()))
115
+ queue.append((ny, nx, t_idx, topk_values[j].item()))
116
+
117
+ regions.append(region)
118
+
119
+ # Calculate the average activation value for each region
120
+ region_scores = []
121
+ region_centers = []
122
+ region_points = []
123
+
124
+ for region in regions:
125
+ # Calculate average score for the region
126
+ avg_score = sum(item[3] for item in region) / len(region)
127
+ region_scores.append(avg_score)
128
+
129
+ # Calculate normalized center coordinates for each patch, then take the average
130
+ normalized_centers = []
131
+ weights = []
132
+ y_coords = set()
133
+ x_coords = set()
134
+
135
+ for y, x, _, score in region:
136
+ # Normalized coordinates of the center point for each patch
137
+ center_y = (y + 0.5) / n_height
138
+ center_x = (x + 0.5) / n_width
139
+ normalized_centers.append((center_x, center_y))
140
+ weights.append(score)
141
+
142
+ y_coords.add(center_y)
143
+ x_coords.add(center_x)
144
+
145
+ region_points.append(normalized_centers)
146
+
147
+ # Calculate the average of normalized coordinates as the region center
148
+ if not rect_center:
149
+ # Weighted average
150
+ total_weight = sum(weights)
151
+ weighted_x = sum(nc[0] * w for nc, w in zip(normalized_centers, weights)) / total_weight
152
+ weighted_y = sum(nc[1] * w for nc, w in zip(normalized_centers, weights)) / total_weight
153
+ avg_center_x, avg_center_y = weighted_x, weighted_y
154
+ # # Simple average
155
+ # avg_center_x = sum(nc[0] for nc in normalized_centers) / len(normalized_centers)
156
+ # avg_center_y = sum(nc[1] for nc in normalized_centers) / len(normalized_centers)
157
+ else:
158
+ avg_center_x = sum(x_coords) / len(x_coords)
159
+ avg_center_y = sum(y_coords) / len(y_coords)
160
+ region_centers.append((avg_center_x, avg_center_y))
161
+
162
+ # Select the region with the highest average activation value
163
+ sorted_indices = sorted(range(len(region_scores)), key=lambda i: region_scores[i], reverse=True)
164
+ sorted_scores = [region_scores[i] for i in sorted_indices]
165
+ sorted_centers = [region_centers[i] for i in sorted_indices]
166
+ sorted_points = [region_points[i] for i in sorted_indices]
167
+ best_point = sorted_centers[0]
168
+
169
+ if return_all_regions:
170
+ # Outputs:
171
+ # 1. best_point: the center point of the region with the highest average activation value
172
+ # 2. sorted_centers: the center points of all regions, sorted by the average activation value in descending order
173
+ # 3. sorted_scores: the average activation values of all regions, sorted in descending order
174
+ # 4. sorted_points: the normalized center coordinates of all patches, sorted by the average activation value in descending order
175
+ return best_point, sorted_centers, sorted_scores, sorted_points
176
+ else:
177
+ return best_point
178
+
179
+
180
+ def inference(conversation, model, tokenizer, data_processor, logits_processor=None, use_placeholder=False, topk=5):
181
+ """
182
+ conversation = [
183
+ {
184
+ "role": "system",
185
+ "content": [
186
+ {
187
+ "type": "text",
188
+ "text": grounding_system_message,
189
+ }
190
+ ]
191
+ },
192
+ {
193
+ "role": "user",
194
+ "content": [
195
+ {
196
+ "type": "image",
197
+ "image": example["image"], # PIL.Image.Image or str to path
198
+ # "image_url": "https://xxxxx.png" or "https://xxxxx.jpg" or "file://xxxxx.png" or "data:image/png;base64,xxxxxxxx", will be split by "base64,"
199
+ },
200
+ {
201
+ "type": "text",
202
+ "text": example["instruction"]
203
+ },
204
+ ],
205
+ },
206
+ ]
207
+ """
208
+ if logits_processor is None:
209
+ logits_processor = ForceFollowTokensLogitsProcessor(
210
+ token_a_id=tokenizer.encode(DEFAULT_POINTER_PAD_TOKEN)[0],
211
+ forced_sequence=[
212
+ tokenizer.encode(DEFAULT_POINTER_END_TOKEN)[0]
213
+ ]
214
+ )
215
+
216
+ assistant_starter = "" if not use_placeholder else "<|im_start|>assistant<|recipient|>os\npyautogui.click(<|pointer_start|><|pointer_pad|><|pointer_end|>)"
217
+
218
+ pred = {
219
+ "output_text": None, # generated text
220
+ "n_width": None, # number of patch_tokens in width dimension
221
+ "n_height": None, # number of patch_tokens in height dimension
222
+ "attn_scores": None, # attention scores over the image patches
223
+ "topk_points": None, # topk points
224
+ "topk_values": None, # topk values
225
+ "topk_points_all": None, # all points
226
+ }
227
+
228
+ # prepare text
229
+ text = data_processor.apply_chat_template(conversation,
230
+ tokenize=False,
231
+ add_generation_prompt=False,
232
+ chat_template=chat_template
233
+ )
234
+ text += assistant_starter
235
+
236
+ # prepare inputs
237
+ image_inputs, video_inputs = process_vision_info(conversation)
238
+ inputs = data_processor(text=[text],
239
+ images=image_inputs,
240
+ videos=video_inputs,
241
+ padding=True,
242
+ return_tensors="pt"
243
+ )
244
+ inputs = inputs.to(model.device)
245
+
246
+ # generate
247
+ results = model.generate(**inputs,
248
+ max_new_tokens=2048 if not use_placeholder else 1,
249
+ logits_processor=LogitsProcessorList([logits_processor]),
250
+ return_dict_in_generate=True,
251
+ output_hidden_states=True
252
+ )
253
+
254
+
255
+ # decode the generated ids
256
+ input_ids = inputs["input_ids"][0]
257
+ generated_ids = results.sequences[0][len(input_ids):]
258
+ output_text = tokenizer.decode(generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
259
+ pred["output_text"] = output_text
260
+
261
+ # check whether any pointer pad tokens are present in the input_ids or generated_ids
262
+ if use_placeholder:
263
+ pointer_pad_mask = (inputs["input_ids"][0] == model.config.pointer_pad_token_id) # n_all_input_tokens
264
+ else:
265
+ pointer_pad_mask = (generated_ids[:-1] == model.config.pointer_pad_token_id) # seq_len_generated_ids-1
266
+
267
+ # if no pointer pad tokens were found, return the prediction as-is
268
+ if not pointer_pad_mask.any():
269
+ return pred
270
+
271
+ # otherwise, get the coordinate from the action head
272
+ if use_placeholder:
273
+ decoder_hidden_states = results.hidden_states[0][-1][0] # n_all_input_tokens, hidden_size
274
+ else:
275
+ decoder_hidden_states = [step_hidden_states[-1][0] for step_hidden_states in results.hidden_states[1:]]
276
+ decoder_hidden_states = torch.cat(decoder_hidden_states, dim=0) # seq_len_generated_ids-1, hidden_size
277
+ decoder_hidden_states = decoder_hidden_states[pointer_pad_mask] # n_pointer_pad_tokens, hidden_size
278
+
279
+ # get the image embeddings as encoder vectors
280
+ # image_embeds = model.visual(inputs["pixel_values"], grid_thw=inputs["image_grid_thw"]) # n_image_tokens, hidden_size
281
+ image_mask = (inputs["input_ids"][0] == tokenizer.encode("<|image_pad|>")[0])
282
+ image_embeds = results.hidden_states[0][0][0][image_mask] # n_image_tokens, hidden_size
283
+
284
+ attn_scores, _ = model.multi_patch_pointer_head(image_embeds, decoder_hidden_states)
285
+ pred["attn_scores"] = attn_scores.tolist()
286
+
287
+ _, n_height, n_width = (inputs["image_grid_thw"][0] // model.visual.spatial_merge_size).tolist()
288
+ pred["n_width"] = n_width
289
+ pred["n_height"] = n_height
290
+
291
+ # get the topk points according to the attention scores
292
+ best_point, region_points, region_scores, region_points_all = get_prediction_region_point(attn_scores, n_width, n_height, return_all_regions=True, rect_center=False)
293
+ topk_points = region_points[:topk] if len(region_points) > topk else region_points
294
+ topk_values = region_scores[:topk] if len(region_scores) > topk else region_scores
295
+ topk_points_all = region_points_all[:topk] if len(region_points_all) > topk else region_points_all
296
+ pred["topk_points"] = topk_points
297
+ pred["topk_values"] = topk_values
298
+ pred["topk_points_all"] = topk_points_all
299
+
300
+ return pred
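Putting it together, a hedged sketch of calling inference() outside the Gradio demo, mirroring app.py (model download, a local screenshot.png, and the gui_actor.modeling_qwen25vl module imported by app.py are all assumed):

import torch
from PIL import Image
from transformers import AutoProcessor
from gui_actor.constants import grounding_system_message
from gui_actor.inference import inference
from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer

model_name = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
data_processor = AutoProcessor.from_pretrained(model_name)
model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
).eval()

conversation = [
    {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
    {"role": "user", "content": [
        {"type": "image", "image": Image.open("screenshot.png").convert("RGB")},
        {"type": "text", "text": "Click the Submit button"},
    ]},
]

pred = inference(conversation, model, data_processor.tokenizer, data_processor,
                 use_placeholder=True, topk=3)
print(pred["topk_points"][0])  # normalized (x, y) of the highest-scoring region center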
gui_actor/modeling.py ADDED
@@ -0,0 +1,361 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast, Qwen2VLForConditionalGeneration
6
+ from gui_actor.constants import IGNORE_INDEX
7
+ from typing import List, Tuple, Union, Optional
8
+ from gui_actor.trainer import rank0_print
9
+
10
+ class QwenVLwithVisionHeadOutputWithPast(Qwen2VLCausalLMOutputWithPast):
11
+ """
12
+ Output class for Qwen2VL with pointer head, extending the base output class.
13
+
14
+ Args:
15
+ lm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
16
+ Language modeling loss.
17
+ pointer_loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
18
+ Vision pointer network loss.
19
+ pointer_scores (`List[torch.FloatTensor]`, *optional*):
20
+ Attention scores from the pointer network, one tensor per batch item.
21
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
22
+ Combined loss (weighted sum of lm_loss and pointer_loss).
23
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
24
+ Prediction scores from the language modeling head.
25
+ past_key_values, hidden_states, attentions, rope_deltas:
26
+ Same as parent class.
27
+ """
28
+ def __init__(self, lm_loss=None, pointer_loss=None, pointer_scores=None, *args, **kwargs):
29
+ super().__init__(*args, **kwargs)
30
+ self.lm_loss = lm_loss
31
+ self.pointer_loss = pointer_loss
32
+ self.pointer_scores = pointer_scores
33
+
34
+
35
+ class VisionHead_MultiPatch(nn.Module):
36
+ def __init__(self, d_model, projection_dim, num_attention_heads=8, dropout_rate=0.1):
37
+ super().__init__()
38
+ self.d_model = d_model
39
+
40
+ # Note: We omit additional normalization here because Qwen2VL
41
+ # already normalizes hidden states using RMSNorm.
42
+ self.projection_enc = nn.Sequential(
43
+ nn.Linear(d_model, projection_dim),
44
+ nn.GELU(),
45
+ nn.Linear(projection_dim, d_model)
46
+ )
47
+ self.projection_dec = nn.Sequential(
48
+ nn.Linear(d_model, projection_dim),
49
+ nn.GELU(),
50
+ nn.Linear(projection_dim, d_model)
51
+ )
52
+
53
+ # Add self-attention layer for visual features
54
+ self.self_attention = nn.MultiheadAttention(
55
+ embed_dim=d_model,
56
+ num_heads=num_attention_heads,
57
+ dropout=dropout_rate,
58
+ batch_first=True
59
+ )
60
+
61
+ # Layer normalization and residual connection
62
+ self.layer_norm = nn.LayerNorm(d_model)
63
+ self.dropout = nn.Dropout(dropout_rate)
64
+
65
+ def forward(self,
66
+ hidden_state_enc, # shape: [n_enc, d_model] where n_enc can vary with image size
67
+ hidden_state_dec, # shape: [n_dec, d_model] there can be multiple query in one sample
68
+ labels: Optional[torch.Tensor] = None, # shape: [n_dec, n_enc], binary mask of patches in bbox
69
+ do_single_patch: bool = False,
70
+ ):
71
+
72
+ enc_input = hidden_state_enc.unsqueeze(0)
73
+ attn_output, _ = self.self_attention(
74
+ query=enc_input,
75
+ key=enc_input,
76
+ value=enc_input,
77
+ # attn_mask=attention_mask,
78
+ need_weights=False
79
+ )
80
+ # Residual connection and layer normalization
81
+ hidden_state_enc_ctx = self.layer_norm(enc_input + self.dropout(attn_output))
82
+ # Remove batch dimension
83
+ hidden_state_enc_ctx = hidden_state_enc_ctx.squeeze(0) # [n_enc, d_model]
84
+
85
+ # Apply the projection networks.
86
+ proj_enc = self.projection_enc(hidden_state_enc_ctx) # [n_enc, d_model]
87
+ proj_dec = self.projection_dec(hidden_state_dec) # [n_dec, d_model]
88
+
89
+ # Compute scaled dot-product attention scores.
90
+ # Scaling by sqrt(d_model) is critical regardless of variable n_enc.
91
+ scaling = self.d_model ** 0.5
92
+ patch_logits = torch.matmul(proj_dec, proj_enc.transpose(0, 1)) / scaling # [n_dec, n_enc]
93
+
94
+ # Softmax normalization is applied along the encoder dimension.
95
+ attn_weights = F.softmax(patch_logits, dim=-1)
96
+
97
+ loss = None
98
+ if (labels is not None) and (not do_single_patch):
99
+ epsilon = 1e-8
100
+ labels_float = labels.float()
101
+ # Normalize each row to get target probability distribution
102
+ target_dist = labels_float / (labels_float.sum(dim=-1, keepdim=True) + epsilon)
103
+
104
+ # Apply log_softmax to logits
105
+ pred_log_probs = F.log_softmax(patch_logits, dim=-1)
106
+ # Use KL divergence as loss
107
+ loss = F.kl_div(pred_log_probs, target_dist, reduction='batchmean')
108
+
109
+ if do_single_patch and (labels is not None):
110
+ loss = F.cross_entropy(patch_logits, labels)
111
+
112
+ return attn_weights, loss
113
+
114
+
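A quick shape check of the pointer head in isolation (random tensors; d_model=8 is arbitrary, and the import assumes the rest of the gui_actor package, e.g. gui_actor.trainer, is available):

import torch
from gui_actor.modeling import VisionHead_MultiPatch

head = VisionHead_MultiPatch(d_model=8, projection_dim=16).eval()
enc = torch.randn(36, 8)   # 36 visual patch embeddings (encoder side)
dec = torch.randn(2, 8)    # 2 <|pointer_pad|> hidden states (decoder side)

attn_weights, loss = head(enc, dec)   # no labels, so loss is None
print(attn_weights.shape)             # torch.Size([2, 36])
print(attn_weights.sum(dim=-1))       # each row sums to 1: a distribution over patches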
115
+ class Qwen2VLForConditionalGenerationWithPointer(Qwen2VLForConditionalGeneration):
116
+ def __init__(self, *args, **kwargs):
117
+ super().__init__(*args, **kwargs)
118
+ self.multi_patch_pointer_head = VisionHead_MultiPatch(self.config.hidden_size, self.config.hidden_size)
119
+ self.pointer_loss_weight = kwargs.get("pointer_loss_weight", 1.0)
120
+ self.lm_loss_weight = kwargs.get("lm_loss_weight", 1.0)
121
+ self.post_init()
122
+
123
+ def reset_loss_weights(self, pointer_loss_weight, lm_loss_weight):
124
+ self.pointer_loss_weight = pointer_loss_weight
125
+ self.lm_loss_weight = lm_loss_weight
126
+
127
+ def forward(self,
128
+ input_ids: torch.LongTensor = None, # (batch_size, seq_len)
129
+ attention_mask: Optional[torch.Tensor] = None,
130
+ position_ids: Optional[torch.LongTensor] = None,
131
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
132
+ inputs_embeds: Optional[torch.FloatTensor] = None,
133
+ labels: Optional[torch.LongTensor] = None,
134
+ use_cache: Optional[bool] = None,
135
+ output_attentions: Optional[bool] = None,
136
+ output_hidden_states: Optional[bool] = None,
137
+ return_dict: Optional[bool] = None,
138
+ pixel_values: Optional[torch.Tensor] = None,
139
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
140
+ image_grid_thw: Optional[torch.LongTensor] = None,
141
+ video_grid_thw: Optional[torch.LongTensor] = None,
142
+ rope_deltas: Optional[torch.LongTensor] = None,
143
+ cache_position: Optional[torch.LongTensor] = None,
144
+ # Grounding
145
+ visual_token_indices_of_coordinates: Optional[torch.Tensor] = None, # shape: (batch_size, n_target); each element is the ground-truth index of the visual token that should be attended to for the corresponding target token
146
+ multi_patch_labels: Optional[torch.Tensor] = None, # shape: list [(n_target, n_visual), ...]; binary mask of patches in bbox
147
+ if_multi_patch: bool = True,
148
+ coordinates: Optional[List[Tuple[float, float]]] = None,
149
+ verbose: bool = False) -> Union[Tuple, QwenVLwithVisionHeadOutputWithPast]:
150
+
151
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
152
+ output_hidden_states = (
153
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
154
+ )
155
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
156
+
157
+ if verbose:
158
+ rank0_print(f"input_ids: {input_ids.shape}, {input_ids[0][:5]}...")
159
+ rank0_print(f"labels: {labels.shape}, {labels[0][:5]}...")
160
+ rank0_print(f"pixel_values: {pixel_values.shape}")
161
+ rank0_print(f"image_grid_thw: {image_grid_thw.shape}, {image_grid_thw}")
162
+ rank0_print(f"coordinates: {coordinates}")
163
+ rank0_print(f"visual_token_indices_of_coordinates: {visual_token_indices_of_coordinates}")
164
+ rank0_print(f"return_dict: {return_dict}")
165
+
166
+ if inputs_embeds is None:
167
+ inputs_embeds = self.model.embed_tokens(input_ids) # shape: (batch_size, seq_len, d_model)
168
+ if pixel_values is not None:
169
+ pixel_values = pixel_values.type(self.visual.dtype)
170
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
171
+ n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
172
+ n_image_features = image_embeds.shape[0]
173
+ if n_image_tokens != n_image_features:
174
+ raise ValueError(
175
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
176
+ )
177
+ image_mask = (
178
+ (input_ids == self.config.image_token_id)
179
+ .unsqueeze(-1)
180
+ .expand_as(inputs_embeds)
181
+ .to(inputs_embeds.device)
182
+ )
183
+ image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
184
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
185
+
186
+ if pixel_values_videos is not None:
187
+ pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
188
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
189
+ n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
190
+ n_video_features = video_embeds.shape[0]
191
+ if n_video_tokens != n_video_features:
192
+ raise ValueError(
193
+ f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
194
+ )
195
+ video_mask = (
196
+ (input_ids == self.config.video_token_id)
197
+ .unsqueeze(-1)
198
+ .expand_as(inputs_embeds)
199
+ .to(inputs_embeds.device)
200
+ )
201
+ video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
202
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
203
+
204
+ if attention_mask is not None:
205
+ attention_mask = attention_mask.to(inputs_embeds.device)
206
+
207
+ # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
208
+ if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
209
+ # calculate RoPE index once per generation in the pre-fill stage only
210
+ if (
211
+ (cache_position is not None and cache_position[0] == 0)
212
+ or self.rope_deltas is None
213
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
214
+ ):
215
+ position_ids, rope_deltas = self.get_rope_index(
216
+ input_ids, image_grid_thw, video_grid_thw, attention_mask
217
+ )
218
+ self.rope_deltas = rope_deltas
219
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
220
+ else:
221
+ batch_size, seq_length, _ = inputs_embeds.shape
222
+ delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
223
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
224
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
225
+ if cache_position is not None: # otherwise `deltas` is an int `0`
226
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
227
+ delta = delta.to(position_ids.device)
228
+ position_ids = position_ids.add(delta)
229
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
230
+
231
+ outputs = self.model(
232
+ input_ids=None,
233
+ position_ids=position_ids,
234
+ attention_mask=attention_mask,
235
+ past_key_values=past_key_values,
236
+ inputs_embeds=inputs_embeds,
237
+ use_cache=use_cache,
238
+ output_attentions=output_attentions,
239
+ output_hidden_states=output_hidden_states,
240
+ return_dict=return_dict,
241
+ cache_position=cache_position,
242
+ )
243
+
244
+ hidden_states = outputs[0] # shape: (batch_size, seq_len, d_model)
245
+ logits = self.lm_head(hidden_states)
246
+
247
+ lm_loss = None
248
+ if labels is not None and self.lm_loss_weight > 0:
249
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
250
+ logits = logits.float()
251
+ # Shift so that tokens < n predict n
252
+ shift_logits = logits[..., :-1, :].contiguous()
253
+ shift_labels = labels[..., 1:].contiguous()
254
+ # Flatten the tokens
255
+ loss_fct = nn.CrossEntropyLoss()
256
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
257
+ shift_labels = shift_labels.view(-1)
258
+ # Enable model parallelism
259
+ shift_labels = shift_labels.to(shift_logits.device)
260
+ lm_loss = loss_fct(shift_logits, shift_labels)
261
+
262
+
263
+ # If vision supervision is requested, process the action head.
264
+ pointer_loss = None
265
+ pointer_scores = []
266
+ if visual_token_indices_of_coordinates is not None:
267
+ batch_size = input_ids.shape[0]
268
+ pointer_losses = []
269
+
270
+ # Process each sample individually because the number of visual and target tokens may vary.
271
+ for i in range(batch_size):
272
+ dummy_target = False
273
+
274
+ # Get the token ids and corresponding hidden states for sample i.
275
+ token_ids = input_ids[i] # shape: (seq_length,)
276
+ hs = hidden_states[i] # shape: (seq_length, d_model)
277
+
278
+ # Identify visual tokens indices.
279
+ visual_mask = (token_ids == self.config.image_token_id)
280
+ visual_indices = torch.nonzero(visual_mask, as_tuple=False).squeeze(-1) # shape: (n_visual,)
281
+
282
+ # Identify target tokens (the ones that should attend to visual features).
283
+ target_mask = (token_ids == self.config.pointer_pad_token_id)
284
+ target_indices = torch.nonzero(target_mask, as_tuple=False).squeeze(-1)
285
+
286
+ # A sample without visual tokens cannot be grounded; missing target tokens fall back to a dummy target below.
287
+ if visual_indices.numel() == 0:
288
+ raise ValueError(f"No visual tokens found for sample {i}.")
289
+ if target_indices.numel() == 0:
290
+ target_indices = torch.tensor([hs.shape[0] - 1], device=hs.device) # take the last token as the dummy target token
291
+ gt = torch.tensor([0]).to(hs.device) # take the first visual token as the dummy ground truth
292
+ if if_multi_patch: # take the first 4 visual tokens as the dummy ground truth
293
+ sample_labels = torch.zeros_like(visual_indices).unsqueeze(0)
294
+ sample_labels[0][:4] = 1
295
+ dummy_target = True
296
+ else:
297
+ # For supervision, we assume that visual_token_indices_of_coordinates[i] is a tensor of shape (n_target,)
298
+ # where each element is an integer in the range [0, n_visual-1] indicating the ground-truth visual token.
299
+ gt = visual_token_indices_of_coordinates[i].to(hs.device) # shape: (n_target,)
300
+ if if_multi_patch:
301
+ sample_labels = multi_patch_labels[i]
302
+
303
+ # Gather the corresponding hidden state representations.
304
+ # visual_hidden = hs[visual_indices] # shape: (n_visual, d_model)
305
+ visual_embeds = inputs_embeds[i][visual_indices]
306
+ target_hidden = hs[target_indices] # shape: (n_target, d_model)
307
+
308
+ # Calculate loss for multi-patch mode
309
+ if if_multi_patch:
310
+ # Ensure the number of targets matches between sample and labels
311
+ if sample_labels.shape[0] != target_indices.shape[0]:
312
+ raise ValueError(f"Sample {i} has mismatched target counts: {sample_labels.shape[0]} labels but found {target_indices.shape[0]} target tokens")
313
+
314
+ # Process using VisionHead_MultiPatch
315
+ attn_scores, loss_v = self.multi_patch_pointer_head(
316
+ visual_embeds,
317
+ target_hidden,
318
+ labels=sample_labels
319
+ )
320
+
321
+ else:
322
+ # Deprecated branch - single patch mode is no longer used
323
+ # Run the action head to compute the attention (from target tokens to visual tokens) and its loss.
324
+ attn_scores, loss_v = self.pointer_head(visual_embeds, target_hidden, labels=gt)
325
+
326
+ pointer_scores.append(attn_scores.detach().cpu())
327
+
328
+ pointer_losses.append(loss_v * 0.0 if dummy_target else loss_v)
329
+
330
+ pointer_loss = torch.stack(pointer_losses).mean()
331
+
332
+ # Combine the LM loss and vision loss using the provided loss weights.
333
+
334
+ if lm_loss is None:
335
+ total_loss = pointer_loss
336
+ elif pointer_loss is None:
337
+ total_loss = lm_loss
338
+ else:
339
+ total_loss = self.lm_loss_weight * lm_loss + self.pointer_loss_weight * pointer_loss
340
+
341
+ if return_dict:
342
+ return QwenVLwithVisionHeadOutputWithPast(
343
+ lm_loss=lm_loss,
344
+ pointer_loss=pointer_loss,
345
+ pointer_scores=pointer_scores,
346
+ loss=total_loss,
347
+ logits=logits,
348
+ past_key_values=outputs.past_key_values,
349
+ hidden_states=outputs.hidden_states,
350
+ attentions=outputs.attentions,
351
+ rope_deltas=self.rope_deltas,
352
+ )
353
+ else:
354
+ # When labels are provided, parent's forward returns a tuple with loss as the first element.
355
+ if labels is not None:
356
+ # Replace the LM loss with the combined loss.
357
+ output = (lm_loss, pointer_loss, logits, pointer_scores,) + outputs[1:]
358
+ return (total_loss,) + output if total_loss is not None else output
360
+ else:
361
+ return outputs
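
Note: the multi-patch pointer supervision above consumes `multi_patch_labels`, a per-sample binary mask over the visual patches that fall inside the ground-truth bounding box. The commit itself does not show how that mask is produced; the sketch below is a hypothetical illustration only (the helper name `make_multi_patch_labels`, the pixel/grid conventions, and the example numbers are assumptions, not code from this repository).

```python
# Hypothetical helper (not part of this commit): build the binary patch mask that
# multi_patch_labels / VisionHead_MultiPatch expect, marking every visual patch
# whose center falls inside a ground-truth bounding box.
import torch

def make_multi_patch_labels(bbox, image_size, grid_size):
    """bbox = (x1, y1, x2, y2) in pixels; image_size = (W, H); grid_size = (n_width, n_height)."""
    x1, y1, x2, y2 = bbox
    W, H = image_size
    n_w, n_h = grid_size
    patch_w, patch_h = W / n_w, H / n_h

    labels = torch.zeros(n_h, n_w)
    for row in range(n_h):
        for col in range(n_w):
            cx = (col + 0.5) * patch_w   # patch center x
            cy = (row + 0.5) * patch_h   # patch center y
            if x1 <= cx <= x2 and y1 <= cy <= y2:
                labels[row, col] = 1.0
    return labels.flatten().unsqueeze(0)  # (1, n_visual): one target token, one row

# Example: a 100x40 px button on a 1000x600 screenshot with a 36x21 patch grid.
labels = make_multi_patch_labels((200, 300, 300, 340), (1000, 600), (36, 21))
print(labels.shape, int(labels.sum()))
```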
gui_actor/modeling_qwen25vl.py ADDED
@@ -0,0 +1,337 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from typing import List, Tuple, Union, Optional
6
+
7
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
8
+ Qwen2_5_VLCausalLMOutputWithPast,
9
+ Qwen2_5_VLForConditionalGeneration,
10
+ )
11
+ from gui_actor.constants import IGNORE_INDEX
12
+ from gui_actor.trainer import rank0_print
13
+
14
+
15
+ def _get_token_embedding_layer(hf_model: nn.Module) -> nn.Module:
16
+ """
17
+ Robustly locate the token embedding layer across HF versions.
18
+ """
19
+ if hasattr(hf_model, "get_input_embeddings") and callable(hf_model.get_input_embeddings):
20
+ return hf_model.get_input_embeddings()
21
+ # Fallbacks (shouldn't be needed on recent transformers, but safe to keep)
22
+ lm = getattr(hf_model, "language_model", None)
23
+ if lm is not None and hasattr(lm, "embed_tokens"):
24
+ return lm.embed_tokens
25
+ raise AttributeError("Could not locate token embedding layer on model (no get_input_embeddings/embed_tokens).")
26
+
27
+
28
+ class QwenVLwithVisionHeadOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):
29
+ """
30
+ Output class for Qwen2_5_VL with pointer head, extending the base output class.
31
+
32
+ Args:
33
+ lm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
34
+ Language modeling loss.
35
+ pointer_loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
36
+ Vision pointer network loss.
37
+ pointer_scores (`List[torch.FloatTensor]`, *optional*):
38
+ Attention scores from the pointer network, one tensor per batch item.
39
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
40
+ Combined loss (weighted sum of lm_loss and pointer_loss).
41
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
42
+ Prediction scores from the language modeling head.
43
+ past_key_values, hidden_states, attentions, rope_deltas:
44
+ Same as parent class.
45
+ """
46
+ def __init__(self, lm_loss=None, pointer_loss=None, pointer_scores=None, *args, **kwargs):
47
+ super().__init__(*args, **kwargs)
48
+ self.lm_loss = lm_loss
49
+ self.pointer_loss = pointer_loss
50
+ self.pointer_scores = pointer_scores
51
+
52
+
53
+ class VisionHead_MultiPatch(nn.Module):
54
+ def __init__(self, d_model, projection_dim, num_attention_heads=8, dropout_rate=0.1):
55
+ super().__init__()
56
+ self.d_model = d_model
57
+
58
+ self.projection_enc = nn.Sequential(
59
+ nn.Linear(d_model, projection_dim),
60
+ nn.GELU(),
61
+ nn.Linear(projection_dim, d_model),
62
+ )
63
+ self.projection_dec = nn.Sequential(
64
+ nn.Linear(d_model, projection_dim),
65
+ nn.GELU(),
66
+ nn.Linear(projection_dim, d_model),
67
+ )
68
+
69
+ self.self_attention = nn.MultiheadAttention(
70
+ embed_dim=d_model, num_heads=num_attention_heads, dropout=dropout_rate, batch_first=True
71
+ )
72
+
73
+ self.layer_norm = nn.LayerNorm(d_model)
74
+ self.dropout = nn.Dropout(dropout_rate)
75
+
76
+ def forward(
77
+ self,
78
+ hidden_state_enc, # [n_enc, d_model]
79
+ hidden_state_dec, # [n_dec, d_model]
80
+ labels: Optional[torch.Tensor] = None, # [n_dec, n_enc] binary mask of patches in bbox
81
+ do_single_patch: bool = False,
82
+ ):
83
+ enc_input = hidden_state_enc.unsqueeze(0)
84
+ attn_output, _ = self.self_attention(query=enc_input, key=enc_input, value=enc_input, need_weights=False)
85
+ hidden_state_enc_ctx = self.layer_norm(enc_input + self.dropout(attn_output)).squeeze(0) # [n_enc, d_model]
86
+
87
+ proj_enc = self.projection_enc(hidden_state_enc_ctx) # [n_enc, d_model]
88
+ proj_dec = self.projection_dec(hidden_state_dec) # [n_dec, d_model]
89
+
90
+ scaling = self.d_model ** 0.5
91
+ patch_logits = torch.matmul(proj_dec, proj_enc.transpose(0, 1)) / scaling # [n_dec, n_enc]
92
+
93
+ attn_weights = F.softmax(patch_logits, dim=-1)
94
+
95
+ loss = None
96
+ if (labels is not None) and (not do_single_patch):
97
+ epsilon = 1e-8
98
+ labels_float = labels.float()
99
+ target_dist = labels_float / (labels_float.sum(dim=-1, keepdim=True) + epsilon)
100
+ pred_log_probs = F.log_softmax(patch_logits, dim=-1)
101
+ loss = F.kl_div(pred_log_probs, target_dist, reduction='batchmean')
102
+
103
+ if do_single_patch and (labels is not None):
104
+ # NOTE: if you ever enable this branch, use patch_logits for CE
105
+ loss = F.cross_entropy(patch_logits, labels)
106
+
107
+ return attn_weights, loss
108
+
109
+
110
+ class Qwen2_5_VLForConditionalGenerationWithPointer(Qwen2_5_VLForConditionalGeneration):
111
+ def __init__(self, *args, **kwargs):
112
+ super().__init__(*args, **kwargs)
113
+ self.multi_patch_pointer_head = VisionHead_MultiPatch(self.config.hidden_size, self.config.hidden_size)
114
+ self.pointer_loss_weight = kwargs.get("pointer_loss_weight", 1.0)
115
+ self.lm_loss_weight = kwargs.get("lm_loss_weight", 1.0)
116
+ self.post_init()
117
+
118
+ # init rope cache slot (used in return_dict path)
119
+ self.rope_deltas = None
120
+
121
+ def reset_loss_weights(self, pointer_loss_weight, lm_loss_weight):
122
+ self.pointer_loss_weight = pointer_loss_weight
123
+ self.lm_loss_weight = lm_loss_weight
124
+
125
+ def forward(
126
+ self,
127
+ input_ids: torch.LongTensor = None, # (batch_size, seq_len)
128
+ attention_mask: Optional[torch.Tensor] = None,
129
+ position_ids: Optional[torch.LongTensor] = None,
130
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
131
+ inputs_embeds: Optional[torch.FloatTensor] = None,
132
+ labels: Optional[torch.LongTensor] = None,
133
+ use_cache: Optional[bool] = None,
134
+ output_attentions: Optional[bool] = None,
135
+ output_hidden_states: Optional[bool] = None,
136
+ return_dict: Optional[bool] = None,
137
+ pixel_values: Optional[torch.Tensor] = None,
138
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
139
+ image_grid_thw: Optional[torch.LongTensor] = None,
140
+ video_grid_thw: Optional[torch.LongTensor] = None,
141
+ rope_deltas: Optional[torch.LongTensor] = None,
142
+ cache_position: Optional[torch.LongTensor] = None,
143
+ second_per_grid_ts: Optional[torch.Tensor] = None,
144
+ # Grounding
145
+ visual_token_indices_of_coordinates: Optional[torch.Tensor] = None, # (batch_size, n_target)
146
+ multi_patch_labels: Optional[torch.Tensor] = None, # list/packed: [(n_target, n_visual), ...]
147
+ if_multi_patch: bool = True,
148
+ coordinates: Optional[List[Tuple[float, float]]] = None,
149
+ verbose: bool = False,
150
+ ) -> Union[Tuple, QwenVLwithVisionHeadOutputWithPast]:
151
+
152
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
153
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
154
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
155
+
156
+ if verbose:
157
+ rank0_print(f"input_ids: {None if input_ids is None else (input_ids.shape, input_ids[0][:5])}")
158
+ rank0_print(f"labels: {None if labels is None else (labels.shape, labels[0][:5])}")
159
+ rank0_print(f"pixel_values: {None if pixel_values is None else pixel_values.shape}")
160
+ rank0_print(f"image_grid_thw: {None if image_grid_thw is None else image_grid_thw.shape}")
161
+ rank0_print(f"coordinates: {coordinates}")
162
+ rank0_print(f"visual_token_indices_of_coordinates: {visual_token_indices_of_coordinates}")
163
+ rank0_print(f"return_dict: {return_dict}")
164
+
165
+ if inputs_embeds is None:
166
+ if input_ids is None:
167
+ raise ValueError("Either inputs_embeds or input_ids must be provided.")
168
+
169
+ # FIX: use embedding accessor instead of .embed_tokens
170
+ token_embedding = _get_token_embedding_layer(self.model)
171
+ inputs_embeds = token_embedding(input_ids) # (batch, seq_len, d_model)
172
+
173
+ if pixel_values is not None:
174
+ pixel_values = pixel_values.type(self.visual.dtype)
175
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
176
+ n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
177
+ n_image_features = image_embeds.shape[0]
178
+ if n_image_tokens != n_image_features:
179
+ raise ValueError(
180
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features: {n_image_features}"
181
+ )
182
+ image_mask = (
183
+ (input_ids == self.config.image_token_id)
184
+ .unsqueeze(-1)
185
+ .expand_as(inputs_embeds)
186
+ .to(inputs_embeds.device)
187
+ )
188
+ image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
189
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
190
+
191
+ if pixel_values_videos is not None:
192
+ pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
193
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
194
+ n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
195
+ n_video_features = video_embeds.shape[0]
196
+ if n_video_tokens != n_video_features:
197
+ raise ValueError(
198
+ f"Video features and video tokens do not match: tokens: {n_video_tokens}, features: {n_video_features}"
199
+ )
200
+ video_mask = (
201
+ (input_ids == self.config.video_token_id)
202
+ .unsqueeze(-1)
203
+ .expand_as(inputs_embeds)
204
+ .to(inputs_embeds.device)
205
+ )
206
+ video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
207
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
208
+
209
+ if attention_mask is not None:
210
+ attention_mask = attention_mask.to(inputs_embeds.device)
211
+
212
+ # RoPE positions / deltas
213
+ if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
214
+ if (
215
+ (cache_position is not None and cache_position[0] == 0)
216
+ or self.rope_deltas is None
217
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
218
+ ):
219
+ position_ids, rope_deltas = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)
220
+ self.rope_deltas = rope_deltas
221
+ else:
222
+ batch_size, seq_length, _ = inputs_embeds.shape
223
+ delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
224
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
225
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
226
+ if cache_position is not None:
227
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0).to(position_ids.device)
228
+ position_ids = position_ids.add(delta)
229
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
230
+
231
+ outputs = self.model(
232
+ input_ids=None,
233
+ position_ids=position_ids,
234
+ attention_mask=attention_mask,
235
+ past_key_values=past_key_values,
236
+ inputs_embeds=inputs_embeds,
237
+ use_cache=use_cache,
238
+ output_attentions=output_attentions,
239
+ output_hidden_states=output_hidden_states,
240
+ return_dict=return_dict,
241
+ cache_position=cache_position,
242
+ )
243
+
244
+ hidden_states = outputs[0] # (batch, seq_len, d_model)
245
+ logits = self.lm_head(hidden_states)
246
+
247
+ lm_loss = None
248
+ if labels is not None and self.lm_loss_weight > 0:
249
+ logits = logits.float()
250
+ shift_logits = logits[..., :-1, :].contiguous()
251
+ shift_labels = labels[..., 1:].contiguous()
252
+ loss_fct = nn.CrossEntropyLoss()
253
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
254
+ shift_labels = shift_labels.view(-1).to(shift_logits.device)
255
+ lm_loss = loss_fct(shift_logits, shift_labels)
256
+
257
+ pointer_loss = None
258
+ pointer_scores = []
259
+ if visual_token_indices_of_coordinates is not None:
260
+ batch_size = input_ids.shape[0]
261
+ pointer_losses = []
262
+
263
+ for i in range(batch_size):
264
+ dummy_target = False
265
+
266
+ token_ids = input_ids[i] # (seq_len,)
267
+ hs = hidden_states[i] # (seq_len, d_model)
268
+
269
+ visual_mask = (token_ids == self.config.image_token_id)
270
+ visual_indices = torch.nonzero(visual_mask, as_tuple=False).squeeze(-1) # (n_visual,)
271
+
272
+ target_mask = (token_ids == self.config.pointer_pad_token_id)
273
+ target_indices = torch.nonzero(target_mask, as_tuple=False).squeeze(-1)
274
+
275
+ if visual_indices.numel() == 0:
276
+ raise ValueError(f"No visual tokens found for sample {i}.")
277
+
278
+ if target_indices.numel() == 0:
279
+ target_indices = torch.tensor([hs.shape[0] - 1], device=hs.device)
280
+ gt = torch.tensor([0], device=hs.device) # not used in multi-patch
281
+ if if_multi_patch:
282
+ sample_labels = torch.zeros_like(visual_indices).unsqueeze(0)
283
+ sample_labels[0][:4] = 1
284
+ dummy_target = True
285
+ else:
286
+ gt = visual_token_indices_of_coordinates[i].to(hs.device) # (n_target,)
287
+ if if_multi_patch:
288
+ sample_labels = multi_patch_labels[i]
289
+
290
+ # Use input embeddings for visual tokens (image tokens got replaced earlier)
291
+ visual_embeds = inputs_embeds[i][visual_indices] # (n_visual, d_model)
292
+ target_hidden = hs[target_indices] # (n_target, d_model)
293
+
294
+ if if_multi_patch:
295
+ if sample_labels.shape[0] != target_indices.shape[0]:
296
+ raise ValueError(
297
+ f"Sample {i} mismatched targets: {sample_labels.shape[0]} labels vs {target_indices.shape[0]} targets"
298
+ )
299
+ attn_scores, loss_v = self.multi_patch_pointer_head(
300
+ visual_embeds,
301
+ target_hidden,
302
+ labels=sample_labels,
303
+ )
304
+ else:
305
+ # Deprecated: single-patch branch
306
+ attn_scores, loss_v = self.pointer_head(visual_embeds, target_hidden, labels=gt)
307
+
308
+ pointer_scores.append(attn_scores.detach().cpu())
309
+ pointer_losses.append(loss_v * 0.0 if dummy_target else loss_v)
310
+
311
+ pointer_loss = torch.stack(pointer_losses).mean()
312
+
313
+ if lm_loss is None:
314
+ total_loss = pointer_loss
315
+ elif pointer_loss is None:
316
+ total_loss = lm_loss
317
+ else:
318
+ total_loss = self.lm_loss_weight * lm_loss + self.pointer_loss_weight * pointer_loss
319
+
320
+ if return_dict:
321
+ return QwenVLwithVisionHeadOutputWithPast(
322
+ lm_loss=lm_loss,
323
+ pointer_loss=pointer_loss,
324
+ pointer_scores=pointer_scores,
325
+ loss=total_loss,
326
+ logits=logits,
327
+ past_key_values=outputs.past_key_values,
328
+ hidden_states=outputs.hidden_states,
329
+ attentions=outputs.attentions,
330
+ rope_deltas=self.rope_deltas,
331
+ )
332
+ else:
333
+ if labels is not None:
334
+ output = (lm_loss, pointer_loss, logits, pointer_scores,) + outputs[1:]
335
+ return (total_loss,) + output if total_loss is not None else output
336
+ else:
337
+ return outputs
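
For readers skimming the diff, the core of `VisionHead_MultiPatch` is a KL divergence between the predicted patch distribution and the normalized binary patch mask. The standalone sketch below reproduces just that computation with toy tensors; the shapes and label positions are illustrative, not values taken from the model.

```python
# Standalone sketch of the multi-patch supervision used by VisionHead_MultiPatch:
# the binary patch mask is normalized into a target distribution and compared to
# the predicted patch distribution with KL divergence.
import torch
import torch.nn.functional as F

n_dec, n_enc, d_model = 2, 16, 8            # 2 target tokens, 16 visual patches
proj_dec = torch.randn(n_dec, d_model)      # projected decoder (target) states
proj_enc = torch.randn(n_enc, d_model)      # projected encoder (visual) states

patch_logits = proj_dec @ proj_enc.T / d_model ** 0.5      # (n_dec, n_enc)

labels = torch.zeros(n_dec, n_enc)
labels[0, :4] = 1.0                          # first target: 4 patches inside the bbox
labels[1, 10:12] = 1.0                       # second target: 2 patches inside the bbox

target_dist = labels / (labels.sum(dim=-1, keepdim=True) + 1e-8)
pred_log_probs = F.log_softmax(patch_logits, dim=-1)
loss = F.kl_div(pred_log_probs, target_dist, reduction="batchmean")

attn_weights = F.softmax(patch_logits, dim=-1)             # pointer scores per target
print(loss.item(), attn_weights.shape)
```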
gui_actor/trainer.py ADDED
@@ -0,0 +1,313 @@
1
+ from datetime import timedelta
2
+ from functools import wraps
3
+ from typing import Optional
4
+
5
+ import torch
6
+ import torch.distributed as dist
7
+ import transformers
8
+ from accelerate import Accelerator, DataLoaderConfiguration
9
+ from accelerate.utils import GradientAccumulationPlugin, InitProcessGroupKwargs
10
+ from torch.utils.data import DataLoader, RandomSampler
11
+ from transformers import Trainer
12
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
13
+ from transformers.trainer_pt_utils import get_parameter_names
14
+ from transformers.trainer_utils import has_length
15
+ from transformers.utils import (
16
+ is_accelerate_available,
17
+ is_datasets_available,
18
+ is_sagemaker_mp_enabled,
19
+ )
20
+ from transformers.trainer_pt_utils import LengthGroupedSampler as HFLengthGroupedSampler
21
+ from transformers.trainer_utils import seed_worker
22
+ from transformers.utils import logging
23
+
24
+ if is_datasets_available():
25
+ import datasets
26
+
27
+
28
+ def rank0_print(*args):
29
+ if dist.is_initialized():
30
+ if dist.get_rank() == 0:
31
+ print(f"Rank {dist.get_rank()}: ", *args)
32
+ else:
33
+ print(*args)
34
+
35
+
36
+ def maybe_zero_3(param, ignore_status=False, name=None):
37
+ from deepspeed import zero
38
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
39
+
40
+ if hasattr(param, "ds_id"):
41
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
42
+ logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
43
+ with zero.GatheredParameters([param]):
44
+ param = param.data.detach().cpu().clone()
45
+ else:
46
+ param = param.detach().cpu().clone()
47
+ return param
48
+
49
+
50
+ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
51
+ to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
52
+ to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
53
+ return to_return
54
+
55
+
56
+ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
57
+ """Collects the state dict and dump to disk."""
58
+ trainer.accelerator.wait_for_everyone()
59
+ torch.cuda.synchronize()
60
+
61
+ if trainer.deepspeed:
62
+ trainer.save_model(output_dir)
63
+ return
64
+
65
+ state_dict = trainer.model.state_dict()
66
+ if trainer.args.should_save:
67
+ cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
68
+ del state_dict
69
+ trainer._save(output_dir, state_dict=cpu_state_dict)
70
+
71
+
72
+ class AGUVISTrainer(Trainer):
73
+
74
+ def __init__(self, *args, **kwargs):
75
+ super().__init__(*args, **kwargs)
76
+
77
+ original_save = self._save
78
+ original_save_model = self.save_model
79
+
80
+ def modify_eos_token(func):
81
+ @wraps(func)
82
+ def wrapper(*args, **kwargs):
83
+ tokenizer = self.processing_class.tokenizer
84
+ old_config_id = self.model.config.eos_token_id
85
+ old_eos_token = tokenizer.eos_token
86
+ old_generation_config_eos_token_id = (
87
+ self.model.generation_config.eos_token_id if hasattr(self.model, "generation_config") else None
88
+ )
89
+
90
+ try:
91
+ new_eos_token_id = tokenizer.convert_tokens_to_ids("<|diff_marker|>")
92
+ self.model.config.eos_token_id = [new_eos_token_id]
93
+ tokenizer.eos_token = "<|diff_marker|>"
94
+ if hasattr(self.model, "generation_config"):
95
+ self.model.generation_config.eos_token_id = [new_eos_token_id]
96
+
97
+ print("Set eos token id to", new_eos_token_id)
98
+ print("Set eos token to", "<|diff_marker|>")
99
+ print("Set generation config eos token id to", [new_eos_token_id])
100
+
101
+ result = func(*args, **kwargs)
102
+ return result
103
+ finally:
104
+ self.model.config.eos_token_id = old_config_id
105
+ tokenizer.eos_token = old_eos_token
106
+ if hasattr(self.model, "generation_config") and old_generation_config_eos_token_id is not None:
107
+ self.model.generation_config.eos_token_id = old_generation_config_eos_token_id
108
+
109
+ print("Set eos token id back to", old_config_id)
110
+ print("Set eos token back to", old_eos_token)
111
+ if old_generation_config_eos_token_id is not None:
112
+ print("Set generation config eos token id back to", old_generation_config_eos_token_id)
113
+
114
+ return wrapper
115
+
116
+ self._save = modify_eos_token(original_save)
117
+ self.save_model = modify_eos_token(original_save_model)
118
+
119
+ def create_accelerator_and_postprocess(self):
120
+ grad_acc_kwargs = {"num_steps": self.args.gradient_accumulation_steps}
121
+ grad_acc_kwargs["sync_with_dataloader"] = False
122
+ gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)
123
+
124
+ accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
125
+
126
+ # create accelerator object
127
+ dispatch_batches = getattr(self.args, "dispatch_batches", None)
128
+ split_batches = getattr(self.args, "split_batches", None)
129
+ self.dataloader_config = DataLoaderConfiguration(
130
+ dispatch_batches=dispatch_batches,
131
+ split_batches=split_batches,
132
+ )
133
+ self.accelerator = Accelerator(
134
+ dataloader_config=self.dataloader_config,
135
+ deepspeed_plugin=self.args.deepspeed_plugin,
136
+ gradient_accumulation_plugin=gradient_accumulation_plugin,
137
+ kwargs_handlers=[accelerator_kwargs],
138
+ )
139
+ # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag
140
+ self.gather_function = self.accelerator.gather_for_metrics
141
+
142
+ # deepspeed and accelerate flags covering both trainer args and accelerate launcher
143
+ self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
144
+ self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
145
+
146
+ # post accelerator creation setup
147
+ if self.is_fsdp_enabled:
148
+ fsdp_plugin = self.accelerator.state.fsdp_plugin
149
+ fsdp_plugin.limit_all_gathers = self.args.fsdp_config.get(
150
+ "limit_all_gathers", fsdp_plugin.limit_all_gathers
151
+ )
152
+ if is_accelerate_available("0.23.0"):
153
+ fsdp_plugin.activation_checkpointing = self.args.fsdp_config.get(
154
+ "activation_checkpointing", fsdp_plugin.activation_checkpointing
155
+ )
156
+ if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing:
157
+ raise ValueError(
158
+ "The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg "
159
+ "can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic "
160
+ "when using FSDP."
161
+ )
162
+
163
+ if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None:
164
+ self.propagate_args_to_deepspeed()
165
+
166
+ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
167
+ if self.train_dataset is None or not has_length(self.train_dataset):
168
+ return None
169
+
170
+ if self.args.group_by_length:
171
+ lengths = self.train_dataset.lengths
172
+ return HFLengthGroupedSampler(
173
+ self.args.train_batch_size * self.args.gradient_accumulation_steps,
174
+ dataset=self.train_dataset,
175
+ lengths=lengths,
176
+ )
177
+ elif self.args.group_by_modality_length:
178
+ lengths = self.train_dataset.modality_lengths
179
+ return HFLengthGroupedSampler(
180
+ self.args.train_batch_size * self.args.gradient_accumulation_steps,
181
+ dataset=self.train_dataset,
182
+ lengths=lengths,
183
+ )
184
+ else:
185
+ return RandomSampler(self.train_dataset)
186
+
187
+ def get_train_dataloader(self) -> DataLoader:
188
+ """
189
+ Returns the training [`~torch.utils.data.DataLoader`].
190
+
191
+ Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
192
+ training if necessary) otherwise.
193
+
194
+ Subclass and override this method if you want to inject some custom behavior.
195
+ """
196
+ if self.train_dataset is None:
197
+ raise ValueError("Trainer: training requires a train_dataset.")
198
+
199
+ train_dataset = self.train_dataset
200
+ data_collator = self.data_collator
201
+ if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
202
+ train_dataset = self._remove_unused_columns(train_dataset, description="training")
203
+ else:
204
+ data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
205
+
206
+ dataloader_params = {
207
+ "batch_size": self._train_batch_size,
208
+ "collate_fn": data_collator,
209
+ "num_workers": self.args.dataloader_num_workers,
210
+ "pin_memory": self.args.dataloader_pin_memory,
211
+ "persistent_workers": self.args.dataloader_persistent_workers,
212
+ }
213
+
214
+ if not isinstance(train_dataset, torch.utils.data.IterableDataset):
215
+ dataloader_params["sampler"] = self._get_train_sampler()
216
+ dataloader_params["drop_last"] = self.args.dataloader_drop_last
217
+ dataloader_params["worker_init_fn"] = seed_worker
218
+ dataloader_params["prefetch_factor"] = (
219
+ self.args.dataloader_num_workers * 2 if self.args.dataloader_num_workers != 0 else None
220
+ )
221
+
222
+ dataloader = self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
223
+
224
+ return dataloader
225
+
226
+ def create_optimizer(self):
227
+ """
228
+ Setup the optimizer.
229
+
230
+ We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
231
+ Trainer's init through `optimizers`, or subclass and override this method in a subclass.
232
+ """
233
+ if is_sagemaker_mp_enabled():
234
+ return super().create_optimizer()
235
+
236
+ opt_model = self.model
237
+
238
+ if self.optimizer is None:
239
+ decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
240
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
241
+ optimizer_grouped_parameters = [
242
+ {
243
+ "params": [
244
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
245
+ ],
246
+ "weight_decay": self.args.weight_decay,
247
+ },
248
+ {
249
+ "params": [
250
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
251
+ ],
252
+ "weight_decay": 0.0,
253
+ },
254
+ ]
255
+
256
+ optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
257
+
258
+ self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
259
+
260
+ return self.optimizer
261
+
262
+ def create_optimizer_with_different_learning_rates(self):
263
+ """
264
+ Setup the optimizer.
265
+
266
+ We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
267
+ Trainer's init through `optimizers`, or subclass and override this method in a subclass.
268
+ """
269
+ if is_sagemaker_mp_enabled():
270
+ raise NotImplementedError("Sagemaker MP is not supported for separate learning rate yet")
271
+
272
+
273
+ opt_model = self.model
274
+
275
+ if self.optimizer is None:
276
+ decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
277
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
278
+
279
+ new_parameters = []
280
+ for name, param in opt_model.named_parameters():
281
+ if ("pointer_head" in name) or ("embed_tokens" in name):
282
+ new_parameters.append(name)
283
+ rank0_print(f"new_parameters: {len(new_parameters)}")
284
+
285
+ optimizer_grouped_parameters = [
286
+ {
287
+ "params": [p for n, p in opt_model.named_parameters() if ((n in decay_parameters) and (n not in new_parameters) and p.requires_grad)],
288
+ "weight_decay": self.args.weight_decay,
289
+ "lr": self.args.learning_rate,
290
+ },
291
+ {
292
+ "params": [p for n, p in opt_model.named_parameters() if ((n not in decay_parameters) and (n not in new_parameters) and p.requires_grad)],
293
+ "weight_decay": 0.0,
294
+ "lr": self.args.learning_rate,
295
+ },
296
+ {
297
+ "params": [p for n, p in opt_model.named_parameters() if ((n in decay_parameters) and (n in new_parameters) and p.requires_grad)],
298
+ "weight_decay": self.args.weight_decay,
299
+ "lr": self.args.learning_rate_new_params,
300
+ },
301
+ {
302
+ "params": [p for n, p in opt_model.named_parameters() if ((n not in decay_parameters) and (n in new_parameters) and p.requires_grad)],
303
+ "weight_decay": 0.0,
304
+ "lr": self.args.learning_rate_new_params,
305
+ },
306
+ ]
307
+
308
+ optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) # {'lr': 0.0001, 'betas': (0.9, 0.999), 'eps': 1e-08}
309
+ optimizer_kwargs.pop("lr")
310
+
311
+ self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
312
+
313
+ return self.optimizer
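
The four optimizer groups in `create_optimizer_with_different_learning_rates` split parameters along two axes: weight decay (LayerNorm and bias parameters get none) and whether the parameter belongs to the newly introduced modules (`pointer_head`, `embed_tokens`), which receive their own learning rate. A toy illustration of the same grouping on a dummy module is sketched below; the module layout and learning-rate values are placeholders, not the training configuration used here.

```python
# Toy illustration of the parameter grouping above: LayerNorm/bias parameters get no
# weight decay, and parameters whose names contain "pointer_head" or "embed_tokens"
# get their own (typically larger) learning rate. Values are placeholders.
import torch
import torch.nn as nn
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names

model = nn.ModuleDict({
    "embed_tokens": nn.Embedding(100, 32),
    "backbone": nn.Sequential(nn.Linear(32, 32), nn.LayerNorm(32)),
    "pointer_head": nn.Linear(32, 32),
})

decay = [n for n in get_parameter_names(model, ALL_LAYERNORM_LAYERS) if "bias" not in n]
new = [n for n, _ in model.named_parameters() if "pointer_head" in n or "embed_tokens" in n]

groups = [
    {"params": [p for n, p in model.named_parameters() if n in decay and n not in new],
     "weight_decay": 0.01, "lr": 1e-5},
    {"params": [p for n, p in model.named_parameters() if n not in decay and n not in new],
     "weight_decay": 0.0, "lr": 1e-5},
    {"params": [p for n, p in model.named_parameters() if n in decay and n in new],
     "weight_decay": 0.01, "lr": 1e-4},
    {"params": [p for n, p in model.named_parameters() if n not in decay and n in new],
     "weight_decay": 0.0, "lr": 1e-4},
]
optimizer = torch.optim.AdamW(groups, betas=(0.9, 0.999), eps=1e-8)
print([len(g["params"]) for g in groups])
```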
gui_actor/utils.py ADDED
@@ -0,0 +1,90 @@
1
+ from PIL import Image, ImageDraw, ImageColor
2
+ import json
3
+ import os
4
+
5
+ def dump_args_to_json(model_config, data_processor, model_args, data_args, training_args, output_dir):
6
+ def is_json_serializable(v):
7
+ try:
8
+ json.dumps(v)
9
+ return True
10
+ except (TypeError, ValueError): # json.dumps raises these for non-serializable or circular values
11
+ return False
12
+
13
+ save_path = f"{output_dir}/args.json"
14
+ if not os.path.exists(save_path):
15
+ with open(save_path, "w") as f:
16
+ json.dump({
17
+ "model_config": {k: v for k, v in model_config.__dict__.items() if is_json_serializable(v)},
18
+ "data_processor_config": {k: v for k, v in data_processor.__dict__.items() if is_json_serializable(v)},
19
+ "image_processor_config": {k: v for k, v in data_processor.image_processor.__dict__.items() if is_json_serializable(v)},
20
+ "model_args": {k: v for k, v in model_args.__dict__.items() if is_json_serializable(v)},
21
+ "data_args": {k: v for k, v in data_args.__dict__.items() if is_json_serializable(v)},
22
+ "training_args": {k: v for k, v in training_args.__dict__.items() if is_json_serializable(v)},
23
+ }, f, indent=4)
24
+
25
+ def draw_point(image: Image.Image, point: list, color=None):
26
+ if isinstance(color, str):
27
+ try:
28
+ color = ImageColor.getrgb(color)
29
+ color = color + (128,)
30
+ except ValueError:
31
+ color = (255, 0, 0, 128)
32
+ else:
33
+ color = (255, 0, 0, 128)
34
+
35
+ overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
36
+ overlay_draw = ImageDraw.Draw(overlay)
37
+ radius = 14
38
+ x, y = point
39
+
40
+ overlay_draw.rectangle(
41
+ [x - radius, y - radius, x + radius, y + radius],
42
+ fill=color
43
+ )
44
+
45
+ center_radius = radius * 0.1
46
+ overlay_draw.ellipse(
47
+ [(x - center_radius, y - center_radius),
48
+ (x + center_radius, y + center_radius)],
49
+ fill=(0, 255, 0, 255)
50
+ )
51
+
52
+ image = image.convert('RGBA')
53
+ combined = Image.alpha_composite(image, overlay)
54
+
55
+ return combined.convert('RGB')
56
+
57
+ def draw_bbox(image: Image.Image, bbox: list, color=None):
58
+ """bbox is in the format of [x1, y1, x2, y2]"""
59
+ if isinstance(color, str):
60
+ try:
61
+ color = ImageColor.getrgb(color)
62
+ color = color + (128,)
63
+ except ValueError:
64
+ color = (255, 0, 0, 128)
65
+ else:
66
+ color = (255, 0, 0, 128)
67
+
68
+ overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
69
+ overlay_draw = ImageDraw.Draw(overlay)
70
+ overlay_draw.rectangle(bbox, fill=color)
71
+ return Image.alpha_composite(image, overlay).convert('RGB')
72
+
73
+ def do_boxes_overlap(box1, box2):
74
+ """
75
+ Check if two boxes overlap.
76
+
77
+ Each box is represented as a tuple: (x1, y1, x2, y2)
78
+ Where (x1, y1) is the top-left and (x2, y2) is the bottom-right corner.
79
+ """
80
+ # Unpack the coordinates
81
+ x1_min, y1_min, x1_max, y1_max = box1
82
+ x2_min, y2_min, x2_max, y2_max = box2
83
+
84
+ # Check for no overlap
85
+ if x1_max < x2_min or x2_max < x1_min:
86
+ return False
87
+ if y1_max < y2_min or y2_max < y1_min:
88
+ return False
89
+
90
+ return True
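
A quick usage sketch for the drawing helpers above, run on a synthetic image; the Space itself feeds real screenshots and model-predicted points, so the coordinates and output file name here are made up.

```python
# Minimal usage of the gui_actor.utils helpers added in this commit,
# assuming the package is importable from the repo root.
from PIL import Image
from gui_actor.utils import draw_point, draw_bbox, do_boxes_overlap

img = Image.new("RGB", (400, 300), (240, 240, 240))

marked = draw_point(img, [120, 80], color="blue")          # translucent square + green center dot
boxed = draw_bbox(marked, [100, 60, 180, 120], color="red")

print(do_boxes_overlap((100, 60, 180, 120), (150, 100, 300, 200)))  # True
print(do_boxes_overlap((0, 0, 50, 50), (60, 60, 100, 100)))         # False

boxed.save("utils_demo.png")
```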
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ transformers
2
+ accelerate
3
+ torch
4
+ Pillow
5
+ requests
6
+ torchvision
7
+ torchaudio
8
+ gradio
9
+ gradio_client
10
+ spaces
11
+ opencv-python-headless
12
+ datasets
13
+ qwen-vl-utils
14
+ pre-commit
15
+ matplotlib
16
+ #flash-attn