"""Image captioning (BLIP) followed by topic-sentence generation (FLAN-T5)."""

import os

import requests
import torch
from dotenv import load_dotenv
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
)

from image_utils import UrlTest

# Shared image loader used by ImageCaptioning.combo_model.
img = UrlTest()

# Checkpoints loaded by ImageCaptioning.__init__.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
_TOPIC_CHECKPOINT = "google/flan-t5-large"

# Beam count used for topic generation unless more topics than this are
# requested (see ImageCaptioning.generate_topics).
_DEFAULT_NUM_BEAMS = 5

# Few-shot prompt for the topic generator, kept character-for-character as the
# model has always received it. Each physical line below ends with a space
# followed by a line-continuation backslash, so fragments are joined by exactly
# one space; the explicit "\n" escape is the only newline in the prompt.
# NOTE(review): the example text contains typos (e.g. "aare"). They are left
# untouched because editing the prompt changes what the model is conditioned on.
_TOPIC_PROMPT_TEMPLATE = """\
Generate a topic sentence idea based on the user input. \
The generated topics should portray the context or idea behind the given \
sentences or phrase. \
For Instance, \
- "Grocery Shopping" OR "Grocery List" OR "Shopping List": "I'm going grocery \
shopping tomorrow, and I would like to get the following things on my grocery \
list: Milk, Soybeans, Cowpeas, Saturated Water, Onions, Tomatoes, etc." \
- "Studying For Exams" OR "Exams Studies": "Exams aare coming up and I have to \
prepare for the core courses. I'll be studying for Control Systems, Software \
Engineering and Circuit Theory." \
- "Healthy Breakfast": "To prepare a healthy breakfast, I need the appropriate \
combination of balanced diet. I'll need oats, yogurt, fresh berries, honey and \
smoothies." \n\
- "Fitness Routine": "Starting a fitness routine involves workout clothes, \
running shoes, a water bottles, and a gym membership. With this, I can start a \
proper fitness plan." \
- "Summer Vacation": "Packing swimsuits and enjoy the view of the ocean." \
- "Coffee Break": "Sipping Coffee at the table." \
- "Relaxation": "Sitting at the table enjoying." \
This is what I'm expecting the model to do. \
Here is the input: {user_input} """


class ImageCaptioning:
    """Caption an image with BLIP and expand the caption into topic sentences.

    Attributes:
        blip_processor: Processor for the BLIP captioning checkpoint.
        blip_model: BLIP conditional-generation model, in eval mode.
        topic_generator_processor: Tokenizer for the FLAN-T5 checkpoint.
        topic_generator_model: FLAN-T5 seq2seq model, in eval mode.
    """

    def __init__(self):
        """Load both pretrained models and switch them to evaluation mode."""
        self.blip_processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
        self.blip_model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

        self.topic_generator_processor = AutoTokenizer.from_pretrained(_TOPIC_CHECKPOINT)
        self.topic_generator_model = AutoModelForSeq2SeqLM.from_pretrained(_TOPIC_CHECKPOINT)

        self.blip_model.eval()
        self.topic_generator_model.eval()

    def generate_caption(self, image):
        """Generate caption token ids for ``image`` by sampling from BLIP.

        Args:
            image: Image input accepted by the BLIP processor (presumably a
                PIL image as produced by ``img.load_image`` -- confirm
                against ``image_utils``).

        Returns:
            The raw output of ``blip_model.generate`` (token-id sequences).
            Callers decode it themselves, e.g.
            ``blip_processor.decode(result[0], skip_special_tokens=True)``.
        """
        inputs = self.blip_processor(image, return_tensors="pt")
        # The token ids are returned undecoded on purpose: combo_model decodes
        # the first sequence itself, so decoding here would be discarded work.
        return self.blip_model.generate(
            pixel_values=inputs["pixel_values"],
            max_new_tokens=128,
            do_sample=True,
            temperature=0.9,
            top_k=50,
            top_p=0.95,
        )

    def generate_topics(self, user_input, num_topics=3):
        """Generate ``num_topics`` topic sentences for ``user_input``.

        Args:
            user_input: Text (typically an image caption) appended to the
                few-shot prompt.
            num_topics: Number of sequences to request from the model.

        Returns:
            A list of decoded strings, one per returned sequence. Entries may
            be empty or repeated; no filtering is done here.
        """
        prompt = _TOPIC_PROMPT_TEMPLATE.format(user_input=user_input)
        encoded = self.topic_generator_processor(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        generated = self.topic_generator_model.generate(
            **encoded,
            temperature=0.1,
            num_return_sequences=num_topics,
            do_sample=True,
            max_length=50,
            top_k=50,
            top_p=0.95,
            # Beam-based generation cannot return more sequences than it has
            # beams, so widen the beam when more topics are requested than
            # the default beam count. Unchanged for num_topics <= 5.
            num_beams=max(_DEFAULT_NUM_BEAMS, num_topics),
        )
        return [
            self.topic_generator_processor.decode(sequence, skip_special_tokens=True)
            for sequence in generated
        ]

    def combo_model(self, image):
        """Caption ``image`` and derive topic sentences from that caption.

        Args:
            image: Image reference handed to ``img.load_image`` (presumably a
                file path or URL -- confirm against ``image_utils.UrlTest``).

        Returns:
            A dict with ``"caption"`` (the decoded first BLIP sequence) and
            ``"topics"`` (the non-empty topic sentences generated from it).
        """
        loaded_image = img.load_image(image)
        caption_ids = self.generate_caption(loaded_image)
        caption = self.blip_processor.decode(caption_ids[0], skip_special_tokens=True)
        # Drop empty generations; everything else is passed through as-is.
        topics = [topic for topic in self.generate_topics(caption) if topic]
        return {"caption": caption, "topics": topics}


if __name__ == "__main__":
    # Initialize Model
    model = ImageCaptioning()
    # Test Image
    image = "1071642.jpg"
    # Generate Caption and Topics
    outputs = model.combo_model(image)
    print(outputs)