import gradio as gr
from transformers import pipeline
import numpy as np
from PIL import Image
# One zero-shot image classification pipeline per selectable CLIP backbone.
pipes = {
    "ViT/B-16": pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch16"),
    "ViT/L-14": pipeline("zero-shot-image-classification", model="openai/clip-vit-large-patch14"),
}
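# A direct pipeline call (a minimal sketch; assumes "festival.jpg" from the
# examples below is available in the working directory) looks like:
#   pipes["ViT/B-16"](images="festival.jpg",
#                     candidate_labels=["lantern", "firecracker", "couplet"],
#                     hypothesis_template="a photo of a {}")
# It returns a list of {"label": ..., "score": ...} dicts sorted by score.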
inputs = [
    gr.Image(type='pil', label="Image"),
    gr.Radio(choices=[
        "ViT/B-16",
        "ViT/L-14",
    ], type="value", label="Model"),
    gr.Textbox(lines=1,
               label="Prompt Template",
               placeholder="Optional prompt template, e.g. a photo of a {}",
               value=""),
    # gr.Textbox(lines=1,
    #            label="Prior Domains", placeholder="Add a domain label, one by one"),
    gr.Textbox(lines=1,
               label="Candidate Labels",
               placeholder="Comma-separated class labels, e.g. lantern, firecracker, couplet"),
]
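# Gradio passes the component values to the callback positionally, in the same
# order as `inputs`: image, model name, prompt template, candidate labels.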
images="festival.jpg" | |
def shot(image, labels_text, model_name, hypothesis_template): | |
labels = [label.strip(" ") for label in labels_text.strip(" ").split(",")] | |
res = pipes[model_name](images=image, | |
candidate_labels=labels, | |
hypothesis_template=hypothesis_template) | |
return {dic["label"]: dic["score"] for dic in res} | |
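# Example direct call (a sketch; assumes "festival.jpg" from the examples below
# is present in the working directory):
#   shot(Image.open("festival.jpg"), "ViT/B-16", "a photo of a {}",
#        "lantern, firecracker, couplet")
# -> a {label: score} dict suitable for gr.Label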
iface = gr.Interface(shot,
                     inputs,
                     "label",
                     examples=[["festival.jpg", "ViT/B-16", "a photo of a {}", "lantern, firecracker, couplet"]],
                     description="""<p>Chinese CLIP is a contrastive-learning-based vision-language foundation model pretrained on large-scale Chinese data. For more information, please refer to the paper and the official GitHub repository. Chinese CLIP has also been merged into Hugging Face Transformers! <br><br>
Paper: <a href='https://arxiv.org/pdf/2403.02714'>https://arxiv.org/pdf/2403.02714</a> <br>
To try the demo, provide a picture (either upload one or select from the given examples) and add comma-separated class labels. Optionally, you can also supply a prompt template in which {} is replaced by each class label. <br>""",
                     title="Cross-Domain Recognition")
iface.launch()
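# Pass share=True to launch() to expose the demo through a temporary public URL:
# iface.launch(share=True)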