from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import requests
from dotenv import load_dotenv
from image_utils import UrlTest
import os

# Shared image-loading helper, instantiated once at import time.
# ImageCaptioning.combo_model calls `img.load_image(...)` on it to turn its
# argument into whatever the BLIP processor consumes.
img = UrlTest()

class ImageCaptioning:
    """Caption an image with BLIP, then expand the caption into topic sentences.

    Two pretrained models are loaded in ``__init__``:

    * ``Salesforce/blip-image-captioning-base`` produces the caption.
    * ``google/flan-t5-large`` turns the caption into topic sentences.

    Both models are switched to evaluation mode right after loading.
    """

    def __init__(self):
        # Initialize Model and Tokenizer
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')
        self.topic_generator_processor = AutoTokenizer.from_pretrained("google/flan-t5-large")
        self.topic_generator_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
        # Inference only: put both models in evaluation mode.
        self.blip_model.eval()
        self.topic_generator_model.eval()

    def generate_caption(self, image):
        """Generate caption token ids for ``image`` with the BLIP model.

        Args:
            image: Object handed straight to the BLIP processor. ``combo_model``
                passes the result of ``img.load_image(...)`` here.

        Returns:
            The raw output of ``blip_model.generate`` (token-id sequences, not
            text). Decode an entry with ``self.blip_processor.decode`` to get
            the caption string, as ``combo_model`` does.

        Note:
            Sampling is enabled (``do_sample=True``), so repeated calls on the
            same image may return different sequences.
        """
        inputs = self.blip_processor(image, return_tensors="pt")
        # The token ids are returned undecoded on purpose: existing callers
        # decode them themselves, so decoding here would only be wasted work.
        return self.blip_model.generate(
            pixel_values=inputs["pixel_values"],
            max_new_tokens=128,
            do_sample=True,
            temperature=0.9,
            top_k=50,
            top_p=0.95,
        )

    def generate_topics(self, user_input, num_topics=3):
        """Generate topic sentences for ``user_input`` with the FLAN-T5 model.

        Args:
            user_input: Text (typically an image caption) appended to the
                few-shot prompt.
            num_topics: Number of sequences to request from the model.

        Returns:
            A list of ``num_topics`` decoded strings with special tokens
            removed. Entries may be empty or duplicated; no filtering is done
            here.
        """
        query = f"""Generate a topic sentence idea based on the user input. 
            The generated topics should portray the context or idea behind the given sentences or phrase.
            For Instance,
                - "Grocery Shopping" OR "Grocery List" OR "Shopping List": "I'm going grocery shopping tomorrow, 
                and I would like to get the following things on my grocery list: Milk, Soybeans, Cowpeas, 
                Saturated Water, Onions, Tomatoes, etc."
                - "Studying For Exams" OR "Exams Studies": "Exams aare coming up and I have to prepare for the core 
                courses. I'll be studying for Control Systems, Software Engineering and Circuit Theory."
                - "Healthy Breakfast": "To prepare a healthy breakfast, I need the appropriate combination of balanced 
                diet. I'll need oats, yogurt, fresh berries, honey and smoothies."
                -  "Fitness Routine": "Starting a fitness routine involves workout clothes, running shoes, 
                a water bottles, and a gym membership. With this, I can start a proper fitness plan."
                - "Summer Vacation": "Packing swimsuits and enjoy the view of the ocean."
                - "Coffee Break": "Sipping Coffee at the table."
                - "Relaxation": "Sitting at the table enjoying."
                
            This is what I'm expecting the model to do. Here is the input: {user_input}
                       """

        # The prompt is truncated to 512 tokens; the user input sits at the
        # end of the prompt, so a very long input is what gets cut off.
        caption_input = self.topic_generator_processor(
            query, return_tensors="pt", padding=True, truncation=True, max_length=512
        )

        # Beam-based decoding cannot return more sequences than it has beams,
        # so widen the beam when the caller asks for more than the default 5.
        num_beams = max(5, num_topics)
        generated = self.topic_generator_model.generate(
            **caption_input,
            temperature=0.1,
            num_return_sequences=num_topics,
            do_sample=True,
            max_length=50,
            top_k=50,
            top_p=0.95,
            num_beams=num_beams,
        )
        return [
            self.topic_generator_processor.decode(sequence, skip_special_tokens=True)
            for sequence in generated
        ]

    def combo_model(self, image):
        """Caption ``image`` and derive topic sentences from that caption.

        Args:
            image: Passed to ``img.load_image`` (the module-level ``UrlTest``
                helper) before captioning.

        Returns:
            A dict with two keys: ``"caption"`` (the decoded first generated
            caption) and ``"topics"`` (the non-empty topic strings generated
            from that caption).
        """
        image = img.load_image(image)
        caption_ids = self.generate_caption(image)
        # Only the first generated sequence is used as the caption.
        caption = self.blip_processor.decode(caption_ids[0], skip_special_tokens=True)
        # Drop empty generations.
        topics = [topic for topic in self.generate_topics(caption) if topic]
        return {"caption": caption,
                "topics": topics}

    
if __name__ == "__main__":
    # Manual smoke test: load both models, run the full caption -> topics
    # pipeline on a local sample image and print the resulting dict.
    captioner = ImageCaptioning()
    sample_image = "1071642.jpg"
    print(captioner.combo_model(sample_image))