# NOTE(review): the three lines below were Hugging Face Space page-header text
# ("Spaces: / Sleeping / Sleeping") captured by the scrape; kept as a comment
# so the module parses.
import os

import requests
import torch
from dotenv import load_dotenv
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM

from image_utils import UrlTest

# NOTE(review): os, requests, and load_dotenv are imported but never used in
# this file — presumably needed by other modules or leftovers; retained to
# avoid breaking anything outside this view.

# Shared helper that resolves an image path/URL into a loadable image
# (see image_utils.UrlTest — assumed to return a PIL image; confirm).
img = UrlTest()
class ImageCaptioning:
    """Caption an image with BLIP, then derive topic ideas with FLAN-T5."""

    def __init__(self):
        # Initialize models and tokenizers (downloads weights on first run).
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')
        self.topic_generator_processor = AutoTokenizer.from_pretrained("google/flan-t5-large")
        self.topic_generator_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
        # Inference only: switch off dropout / training-mode layers.
        self.blip_model.eval()
        self.topic_generator_model.eval()

    def generate_caption(self, image):
        """Return a list of decoded caption strings for *image*.

        *image* is whatever BlipProcessor accepts (e.g. a PIL image).
        Sampling is enabled, so output varies between calls.
        """
        inputs = self.blip_processor(image, return_tensors="pt")
        # no_grad: generation is inference-only; skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.blip_model.generate(
                pixel_values=inputs["pixel_values"],
                max_new_tokens=128,
                do_sample=True,
                temperature=0.9,
                top_k=50,
                top_p=0.95,
            )
        # BUG FIX: the original decoded the captions into caption_output and
        # then returned the raw token tensors, forcing callers to decode again.
        # Return the decoded strings directly.
        return [self.blip_processor.decode(output, skip_special_tokens=True) for output in outputs]

    def generate_topics(self, user_input, num_topics=3):
        """Generate *num_topics* short topic ideas for *user_input*.

        Uses a few-shot prompt (topic title -> example sentence pairs) and
        returns a list of decoded strings; empties/duplicates are possible.
        """
        query = f"""Generate a topic sentence idea based on the user input.
        The generated topics should portray the context or idea behind the given sentences or phrase.
        For Instance,
        - "Grocery Shopping" OR "Grocery List" OR "Shopping List": "I'm going grocery shopping tomorrow,
        and I would like to get the following things on my grocery list: Milk, Soybeans, Cowpeas,
        Saturated Water, Onions, Tomatoes, etc."
        - "Studying For Exams" OR "Exams Studies": "Exams aare coming up and I have to prepare for the core
        courses. I'll be studying for Control Systems, Software Engineering and Circuit Theory."
        - "Healthy Breakfast": "To prepare a healthy breakfast, I need the appropriate combination of balanced
        diet. I'll need oats, yogurt, fresh berries, honey and smoothies."
        - "Fitness Routine": "Starting a fitness routine involves workout clothes, running shoes,
        a water bottles, and a gym membership. With this, I can start a proper fitness plan."
        - "Summer Vacation": "Packing swimsuits and enjoy the view of the ocean."
        - "Coffee Break": "Sipping Coffee at the table."
        - "Relaxation": "Sitting at the table enjoying."
        This is what I'm expecting the model to do. Here is the input: {user_input}
        """
        caption_input = self.topic_generator_processor(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # NOTE(review): do_sample with temperature=0.1 AND num_beams=5 is an
        # unusual mix (sampled beam search); kept as-is to preserve behavior.
        with torch.no_grad():
            caption_output = self.topic_generator_model.generate(
                **caption_input,
                temperature=0.1,
                num_return_sequences=num_topics,
                do_sample=True,
                max_length=50,
                top_k=50,
                top_p=0.95,
                num_beams=5,
            )
        return [self.topic_generator_processor.decode(output, skip_special_tokens=True) for output in caption_output]

    def combo_model(self, image):
        """Load *image* (path or URL), caption it, and derive topic ideas.

        Returns {"caption": str, "topics": list[str]} with empty topics dropped.
        """
        image = img.load_image(image)
        # generate_caption now returns decoded strings; take the first one.
        caption = self.generate_caption(image)[0]
        topics = [topic for topic in self.generate_topics(caption) if topic]
        return {"caption": caption, "topics": topics}
if __name__ == "__main__":
    # Initialize model (downloads/loads BLIP and FLAN-T5 weights).
    model = ImageCaptioning()
    # Test image — expected to exist in the working directory.
    image = "1071642.jpg"
    # Generate caption and topics, then print the combined dict.
    outputs = model.combo_model(image)
    print(outputs)