Rediones-AI / utils /caption_utils.py
Testys's picture
Pushing First version before making full changes
67d6f5b
raw
history blame
3.94 kB
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import requests
from dotenv import load_dotenv
from image_utils import UrlTest
import os
# Module-level image loader shared by ImageCaptioning.combo_model, which calls
# img.load_image(...) on its argument before captioning.
# NOTE(review): UrlTest is defined in image_utils and not visible here; what
# inputs load_image accepts (URL, path, ...) should be confirmed there.
img = UrlTest()
class ImageCaptioning:
    """Caption an image with BLIP, then expand the caption into topic sentences with FLAN-T5.

    The constructor loads two pretrained checkpoints
    ("Salesforce/blip-image-captioning-base" and "google/flan-t5-large") and
    switches both models to eval mode.
    """

    def __init__(self):
        """Load the captioning and topic-generation models with their processors."""
        # Initialize Model and Tokenizer
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')
        self.topic_generator_processor = AutoTokenizer.from_pretrained("google/flan-t5-large")
        self.topic_generator_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
        # Inference only: put both models in eval mode.
        self.blip_model.eval()
        self.topic_generator_model.eval()

    def generate_caption(self, image):
        """Generate caption token ids for ``image`` using sampled decoding.

        Args:
            image: An already-loaded image in whatever form ``blip_processor``
                accepts (``combo_model`` passes the result of
                ``img.load_image``).

        Returns:
            The raw value returned by ``blip_model.generate`` (generated token
            ids), NOT decoded text. Callers decode it themselves, as
            ``combo_model`` does with ``blip_processor.decode``.
        """
        model_inputs = self.blip_processor(image, return_tensors="pt")
        # The original also decoded every sequence here and then discarded the
        # result; that dead work is dropped, the return value is unchanged.
        return self.blip_model.generate(
            pixel_values=model_inputs["pixel_values"],
            max_new_tokens=128,
            do_sample=True,
            temperature=0.9,
            top_k=50,
            top_p=0.95,
        )

    def generate_topics(self, user_input, num_topics=3):
        """Generate ``num_topics`` topic sentences for ``user_input``.

        Args:
            user_input: Text (e.g. an image caption) inserted at the end of a
                few-shot prompt.
            num_topics: Number of sequences requested from the model.

        Returns:
            A list of decoded strings, one per generated sequence, with special
            tokens removed. Entries may be empty strings.
        """
        # The prompt text is sent to the model verbatim, so its continuation
        # lines are deliberately kept flush-left rather than indented with the code.
        query = f"""Generate a topic sentence idea based on the user input.
The generated topics should portray the context or idea behind the given sentences or phrase.
For Instance,
- "Grocery Shopping" OR "Grocery List" OR "Shopping List": "I'm going grocery shopping tomorrow,
and I would like to get the following things on my grocery list: Milk, Soybeans, Cowpeas,
Saturated Water, Onions, Tomatoes, etc."
- "Studying For Exams" OR "Exams Studies": "Exams aare coming up and I have to prepare for the core
courses. I'll be studying for Control Systems, Software Engineering and Circuit Theory."
- "Healthy Breakfast": "To prepare a healthy breakfast, I need the appropriate combination of balanced
diet. I'll need oats, yogurt, fresh berries, honey and smoothies."
- "Fitness Routine": "Starting a fitness routine involves workout clothes, running shoes,
a water bottles, and a gym membership. With this, I can start a proper fitness plan."
- "Summer Vacation": "Packing swimsuits and enjoy the view of the ocean."
- "Coffee Break": "Sipping Coffee at the table."
- "Relaxation": "Sitting at the table enjoying."
This is what I'm expecting the model to do. Here is the input: {user_input}
"""
        encoded = self.topic_generator_processor(
            query, return_tensors="pt", padding=True, truncation=True, max_length=512
        )
        generated = self.topic_generator_model.generate(
            **encoded,
            temperature=0.1,
            num_return_sequences=num_topics,
            do_sample=True,
            max_length=50,
            top_k=50,
            top_p=0.95,
            # Beam-based generation cannot return more sequences than it has
            # beams; the fixed value of 5 made num_topics > 5 fail. Keep 5 as
            # the floor so existing calls behave as before.
            num_beams=max(5, num_topics),
        )
        return [
            self.topic_generator_processor.decode(sequence, skip_special_tokens=True)
            for sequence in generated
        ]

    def combo_model(self, image):
        """Caption ``image`` and derive topic sentences from that caption.

        Args:
            image: Passed to the module-level ``img.load_image``; the accepted
                forms depend on that helper (the ``__main__`` demo passes a
                file name).

        Returns:
            A dict with keys ``"caption"`` (the decoded first generated
            caption) and ``"topics"`` (the non-empty generated topic strings).
        """
        image = img.load_image(image)
        caption_ids = self.generate_caption(image)
        # Only the first generated sequence is used as the caption.
        caption = self.blip_processor.decode(caption_ids[0], skip_special_tokens=True)
        topics = [topic for topic in self.generate_topics(caption) if topic]
        return {"caption": caption,
                "topics": topics}
if __name__ == "__main__":
    # Manual smoke test: caption one sample image and print the resulting
    # {"caption": ..., "topics": ...} dict. Constructing the class loads both
    # pretrained models.
    captioner = ImageCaptioning()
    sample_image = "1071642.jpg"
    print(captioner.combo_model(sample_image))