import base64, os, json
from typing import Optional

import torch
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw

# ---- Hugging Face Spaces GPU decorator (safe fallback when not on Spaces) ----
try:
    import spaces
    GPU_DECORATOR = spaces.GPU
except Exception:
    def GPU_DECORATOR(fn):  # no-op locally
        return fn

from qwen_vl_utils import process_vision_info  # noqa: F401 (kept for parity if used elsewhere)
from datasets import load_dataset              # noqa: F401
from transformers import AutoProcessor
from gui_actor.constants import chat_template  # noqa: F401
from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

MAX_PIXELS = 3200 * 1800

def resize_image(image, resize_to_pixels=MAX_PIXELS):
    image_width, image_height = image.size
    if (resize_to_pixels is not None) and ((image_width * image_height) != resize_to_pixels):
        resize_ratio = (resize_to_pixels / (image_width * image_height)) ** 0.5
        image_width_resized, image_height_resized = int(image_width * resize_ratio), int(image_height * resize_ratio)
        image = image.resize((image_width_resized, image_height_resized))
    return image

@torch.inference_mode()
def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    x, y = point
    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        outline=color,
        width=5
    )
    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    combined = combined.convert('RGB')
    return combined

@torch.inference_mode()
def get_attn_map(image, attn_scores, n_width, n_height):
    w, h = image.size
    scores = np.array(attn_scores[0]).reshape(n_height, n_width)
    scores_norm = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
    score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST)
    colormap = plt.get_cmap('jet')
    colored_score_map = colormap(np.array(score_map) / 255.0)[:, :, :3]
    colored_overlay = Image.fromarray((colored_score_map * 255).astype(np.uint8))
    blended = Image.blend(image, colored_overlay, alpha=0.3)
    return blended

# ----------------------------
# Model/device init for Spaces
# ----------------------------
def _pick_gpu_dtype() -> torch.dtype:
    if not torch.cuda.is_available():
        return torch.float32
    major, minor = torch.cuda.get_device_capability()
    # Ampere (8.x) / Hopper (9.x) support bf16 well
    return torch.bfloat16 if major >= 8 else torch.float16

# Global holders initialized in load_model()
model = None
tokenizer = None
data_processor = None

@GPU_DECORATOR   # <-- This is what Spaces looks for at startup
def load_model():
    """
    Allocates the GPU on Spaces and loads the model on the right device/dtype.
    Runs once at startup.
    """
    global model, tokenizer, data_processor

    model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dtype = _pick_gpu_dtype()

    # Enable some healthy defaults on GPU
    if device.startswith("cuda"):
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.set_grad_enabled(False)

    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
    tokenizer = data_processor.tokenizer

    # Use SDPA attention to avoid flash-attn dependency
    attn_impl = "sdpa"

    model_local = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
        model_name_or_path,
        torch_dtype=dtype,
        attn_implementation=attn_impl,
    ).eval()

    # Move to device explicitly (avoid accelerate unless you need sharding)
    model_local.to(device)

    model = model_local
    return f"Loaded {model_name_or_path} on {device} with dtype={dtype} (attn={attn_impl})"

# Trigger model loading on import so Spaces allocates GPU immediately
_ = load_model()

@GPU_DECORATOR
@torch.inference_mode()
def process(image, instruction):
    # Safety: ensure model is loaded
    if model is None:
        _ = load_model()

    # Resize if needed
    w, h = image.size
    if w * h > MAX_PIXELS:
        image = resize_image(image)
        w, h = image.size

    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
                        "your task is to locate the screen element that corresponds to the instruction. "
                        "Output a PyAutoGUI action with a special token that points to the correct location."
                    ),
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": instruction},
            ],
        },
    ]

    device = next(model.parameters()).device

    try:
        pred = inference(
            conversation,
            model,
            tokenizer,
            data_processor,
            use_placeholder=True,
            topk=3
        )
    except Exception as e:
        print("inference error:", e)
        return image, f"Error: {e}", None

    px, py = pred["topk_points"][0]
    output_coord = f"({px:.4f}, {py:.4f})"
    img_with_point = draw_point(image, (px * w, py * h))

    n_width, n_height = pred["n_width"], pred["n_height"]
    attn_scores = pred["attn_scores"]
    att_map = get_attn_map(image, attn_scores, n_width, n_height)

    return img_with_point, output_coord, att_map


# ----------------------------
# Gradio UI
# ----------------------------
title = "GUI-Actor"
header = """
<div align="center">
    <h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
    <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
        <a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
    </div>
</div>
"""
theme = "soft"
css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
#anno-img .mask.active {opacity: 0.7}"""

with gr.Blocks(title=title, css=css, theme=theme) as demo:
    gr.Markdown(header)
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type='pil', label='Upload image')
            input_instruction = gr.Textbox(label='Instruction', placeholder='Type your (low-level) instruction here')
            submit_button = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_with_point = gr.Image(type='pil', label='Image with Point (red circle)')
            with gr.Accordion('Detailed prediction'):
                pred_xy = gr.Textbox(label='Predicted Coordinates', placeholder='(x, y)')
                att_map = gr.Image(type='pil', label='Attention Map')

    submit_button.click(
        fn=process,
        inputs=[input_image, input_instruction],
        outputs=[image_with_point, pred_xy, att_map],
        queue=True,
        api_name="predict",
    )

# Version-agnostic Gradio startup (works across 3.x/4.x/5.x)
# Try newer/older signatures, fall back gracefully.

# Queue (GPU scheduling needed on Spaces)
try:
    demo.queue(concurrency_count=1, max_size=4)
except TypeError:
    try:
        demo.queue(max_size=4)
    except TypeError:
        demo.queue()

# Launch
try:
    demo.launch(share=False, max_threads=1, max_queue_size=4)
except TypeError:
    try:
        demo.launch(share=False, max_queue_size=4)
    except TypeError:
        demo.launch(share=False)