# coca-captioning/app.py: Gradio demo that captions images with OpenCLIP CoCa
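# Requires open_clip_torch, gradio, and Pillow (install names inferred from the imports below)
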
import open_clip
from PIL import Image
import gradio as gr
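
# Load CoCa (ViT-L/14) and its image preprocessing transform; the pretrained tag
# below selects a LAION-2B checkpoint from the open_clip pretrained weights.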
model, _, transform = open_clip.create_model_and_transforms(
model_name="coca_ViT-L-14",
pretrained="laion2B-s13B-b90k"
)

def resize_background(img):
    # Fit the image onto a white 512x512 square: non-square inputs are scaled to fit
    # while preserving aspect ratio and centered on a white canvas; square inputs are
    # resized directly.
    width, height = img.size
    if width != height:
        scale = 512 / max(width, height)
        new_size = (max(1, round(width * scale)), max(1, round(height * scale)))
        resized = img.resize(new_size, Image.LANCZOS)
        canvas = Image.new("RGB", (512, 512), color=(255, 255, 255))
        canvas.paste(resized, ((512 - new_size[0]) // 2, (512 - new_size[1]) // 2))
        return canvas
    return img.resize((512, 512), Image.LANCZOS)


def generate_caption(image):
    # Pad/scale the input image to a 512x512 square
    im = resize_background(image)
    # Apply the model's preprocessing transform and add a batch dimension
    im = transform(im).unsqueeze(0)
    # Autoregressively generate caption token ids, then decode them to text
    generated = model.generate(im).detach()
    caption = open_clip.decode(generated[0])
    # Strip the special tokens surrounding the caption
    return caption.split("<start_of_text>")[1].split("<end_of_text>")[0]


with gr.Blocks() as demo:
    gr.Markdown("## Captioning with OpenCLIP CoCa")
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Image to Caption", type="pil")
            out = gr.Textbox(label="Caption")
            btn = gr.Button("Generate caption")
    # Wire the button to the captioning function
    btn.click(fn=generate_caption, inputs=image, outputs=out)

demo.launch()