import os
import json

from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
import torch
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    CLIPTokenizer,
)

# ----------------------------
# 🔐 Load API Keys & Setup
# ----------------------------
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------------
# 📸 Load BLIP Captioning Model
# ----------------------------
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# ----------------------------
# 🧠 Load CLIP Tokenizer (for token check)
# ----------------------------
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

# ----------------------------
# 📸 Generate Caption from Product Image
# ----------------------------
def generate_blip_caption(image: Image.Image) -> str:
    try:
        inputs = processor(images=image, return_tensors="pt").to(device)
        out = blip_model.generate(**inputs, max_length=50)
        caption = processor.decode(out[0], skip_special_tokens=True)
        # Drop repeated words (BLIP occasionally stutters) while keeping first-occurrence order
        caption = " ".join(dict.fromkeys(caption.split()))
        print(f"🖼️ BLIP Caption: {caption}")
        return caption
    except Exception as e:
        print("❌ BLIP Captioning Error:", e)
        return "a product image"

# ----------------------------
# 🧠 GPT Scene Planning with Caption + Visual Style
# ----------------------------
SCENE_SYSTEM_INSTRUCTIONS = """
You are a scene planning assistant for an AI image generation system.
Your job is to take a caption from a product image, a visual style hint, and a user prompt,
then return a structured JSON with:
- scene (environment, setting)
- subject (main_actor)
- objects (main_product or items)
- layout (foreground/background elements and their placement)
- rules (validation rules to ensure visual correctness)
Respond ONLY in raw JSON format. Do NOT include explanations.
"""

def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
    try:
        caption = generate_blip_caption(image)
        visual_hint = (
            caption
            if "shoe" in caption or "product" in caption
            else "low-top product photo on white background"
        )
        merged_prompt = (
            f"Image Caption: {caption}\n"
            f"Image Visual Style: {visual_hint}\n"
            f"User Prompt: {prompt}"
        )
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": SCENE_SYSTEM_INSTRUCTIONS},
                {"role": "user", "content": merged_prompt},
            ],
            temperature=0.3,
            max_tokens=500,
        )
        content = response.choices[0].message.content
        print("🧠 Scene Plan (Raw):", content)

        # Log every scene plan for later inspection
        os.makedirs("logs", exist_ok=True)
        with open("logs/scene_plans.jsonl", "a") as f:
            f.write(json.dumps({
                "caption": caption,
                "visual_hint": visual_hint,
                "prompt": prompt,
                "scene_plan": content,
            }) + "\n")

        return json.loads(content)
    except Exception as e:
        print("❌ extract_scene_plan() Error:", e)
        # Fallback plan so downstream steps still receive a usable structure
        return {
            "scene": {"environment": "studio", "setting": "plain white background"},
            "subject": {"main_actor": "a product"},
            "objects": {"main_product": "product"},
            "layout": {},
            "rules": {},
        }
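# ----------------------------
# ✂️ CLIP Token Budget Helper (illustrative sketch)
# ----------------------------
# The enriched-prompt step below only *logs* the CLIP token count; the 77-token limit
# is requested in the system instructions but not enforced in code. This is a minimal,
# hedged sketch of a truncation helper — the name `truncate_to_clip_limit` and the
# default budget of 77 are assumptions, and callers would need to wire it in themselves.
def truncate_to_clip_limit(text: str, max_tokens: int = 77) -> str:
    """Trim `text` so its CLIP tokenization fits within `max_tokens` tokens."""
    ids = tokenizer(text)["input_ids"]
    if len(ids) <= max_tokens:
        return text
    # Decode only the first `max_tokens` ids back to a (slightly lossy) string
    return tokenizer.decode(ids[:max_tokens], skip_special_tokens=True).strip()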
# ----------------------------
# ✨ Enriched Prompt Generation (GPT, 77-token safe)
# ----------------------------
ENRICHED_PROMPT_INSTRUCTIONS = """
You are a prompt engineer for an AI image generation model.
Given a structured scene plan and a user prompt, generate a single natural-language enriched prompt that:
1. Describes the subject, product, setting, and layout clearly
2. Uses natural, photo-realistic language
3. Stays strictly under 77 tokens (CLIP token limit)
Return ONLY the enriched prompt string. No explanations.
"""

def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
    prompts = []
    for _ in range(n):
        try:
            user_input = (
                f"Scene Plan:\n{json.dumps(scene_plan)}\n\n"
                f"User Prompt:\n{base_prompt}"
            )
            response = client.chat.completions.create(
                model="gpt-4o-mini-2024-07-18",
                messages=[
                    {"role": "system", "content": ENRICHED_PROMPT_INSTRUCTIONS},
                    {"role": "user", "content": user_input},
                ],
                temperature=0.4,
                max_tokens=100,
            )
            enriched = response.choices[0].message.content.strip()
            # Token count is logged for visibility; the limit itself is requested
            # in the system instructions rather than enforced here
            token_count = len(tokenizer(enriched)["input_ids"])
            print(f"📝 Enriched Prompt ({token_count} tokens): {enriched}")
            prompts.append(enriched)
        except Exception as e:
            print("⚠️ Prompt fallback:", e)
            prompts.append(base_prompt)
    return prompts

# ----------------------------
# ❌ Negative Prompt Generator
# ----------------------------
NEGATIVE_SYSTEM_PROMPT = """
You are a prompt engineer. Given a structured scene plan, generate a short negative prompt
to suppress unwanted visual elements such as: distortion, blurriness, poor anatomy,
logo errors, background noise, or low realism.
Return a single comma-separated list. No intro text.
"""

def generate_negative_prompt_from_scene(scene_plan: dict) -> str:
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": NEGATIVE_SYSTEM_PROMPT},
                {"role": "user", "content": json.dumps(scene_plan)},
            ],
            temperature=0.2,
            max_tokens=100,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("❌ Negative Prompt Error:", e)
        return "blurry, distorted, low quality, deformed, watermark"
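# ----------------------------
# 🚀 Example Usage (illustrative sketch)
# ----------------------------
# A minimal, hedged example of how the functions above chain together. The image path
# "assets/product.jpg" and the sample user prompt are placeholders, not values required
# by the pipeline — substitute your own inputs.
if __name__ == "__main__":
    product_image = Image.open("assets/product.jpg").convert("RGB")  # hypothetical path
    user_prompt = "Show the sneaker on a rainy city street at dusk"  # hypothetical prompt

    scene_plan = extract_scene_plan(user_prompt, product_image)
    enriched_prompts = generate_prompt_variations_from_scene(scene_plan, user_prompt, n=3)
    negative_prompt = generate_negative_prompt_from_scene(scene_plan)

    print("Enriched prompts:", enriched_prompts)
    print("Negative prompt:", negative_prompt)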