# IMGCaption / app.py
# jaimin's picture
# Update app.py
# aa4d252
# raw
# history blame
# 2.09 kB
from PIL import Image
import requests
import gradio as gr
from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, VisionEncoderDecoderModel
import torch
from label import predict_environment,recursion_change_bn,load_labels,hook_feature,returnCAM,returnTF,load_model
# Load both captioning checkpoints once at import time so that every request
# served by the UI reuses the same processor/model objects.
git_processor = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps")
blip_processor = AutoProcessor.from_pretrained("jaimin/Imagecap")
blip_model = BlipForConditionalGeneration.from_pretrained("jaimin/Imagecap")

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move the models that were actually loaded above. The names used here before
# (git_model_large_textcaps / blip_model_large) were never defined, so the
# script raised NameError before the UI could start.
git_model.to(device)
blip_model.to(device)
def generate_caption(processor, model, image, use_float_16=False):
    """Generate one caption for ``image`` with the given processor/model pair.

    Args:
        processor: Callable that turns ``images=...`` into model inputs and
            also provides ``batch_decode`` for the generated ids.
        model: Object exposing ``generate(pixel_values=..., max_length=...)``.
        image: The image to caption, passed straight to ``processor``.
        use_float_16: When true, the processed inputs are cast to
            ``torch.float16`` before generation.

    Returns:
        The first decoded caption, with special tokens skipped.
    """
    # The processed inputs are placed on the module-level ``device``.
    encoded = processor(images=image, return_tensors="pt").to(device)
    if use_float_16:
        encoded = encoded.to(torch.float16)

    # Generation is capped at 50 tokens.
    token_ids = model.generate(pixel_values=encoded.pixel_values, max_length=50)
    captions = processor.batch_decode(token_ids, skip_special_tokens=True)
    return captions[0]
def generate_captions(image):
    """Caption an image with both models and run the scene predictor on it.

    Args:
        image: Either a PIL image (what the Gradio input configured with
            ``type="pil"`` delivers) or anything ``PIL.Image.open`` accepts,
            such as a file path or a binary file object.

    Returns:
        A 4-tuple ``(env, scene, caption_git, caption_blip)``: the two values
        returned by ``predict_environment`` followed by the GIT caption and
        the BLIP caption. The order matches the output text boxes of the UI.
    """
    # An already-decoded PIL image must not go through Image.open, which only
    # understands paths and file objects; open only when given one of those.
    img = image if isinstance(image, Image.Image) else Image.open(image)

    caption_git = generate_caption(git_processor, git_model, img)
    caption_blip = generate_caption(blip_processor, blip_model, img)

    # NOTE(review): the meaning of env/scene is defined in label.py; the UI
    # labels them "Environment" and "Objects detected" - confirm against it.
    env, scene = predict_environment(img)

    # Return the locals computed above (the previous names
    # caption_git_large_textcaps / caption_blip_large did not exist).
    return env, scene, caption_git, caption_blip
# One text box per value returned by generate_captions, in the same order.
# gr.Textbox / gr.Image are the current component classes; the gr.outputs.* and
# gr.inputs.* namespaces are deprecated in Gradio 3.x and removed in 4.x.
outputs = [
    gr.Textbox(label="Environment"),
    gr.Textbox(label="Objects detected"),
    gr.Textbox(label="Caption generated by GIT"),
    gr.Textbox(label="Caption generated by BLIP"),
]
title = "Image Cap with Scene"
description = " Image caption with scene"

interface = gr.Interface(
    fn=generate_captions,
    # type="pil" makes Gradio hand the callback a PIL image.
    inputs=gr.Image(type="pil"),
    outputs=outputs,
    title=title,
    description=description,
)

# Request queuing was previously requested through the deprecated
# enable_queue=True constructor argument; queue() is the supported replacement.
interface.queue()
interface.launch(debug=True)