import gradio as gr
from transformers import pipeline

# One zero-shot image classification pipeline per selectable CLIP backbone.
# Note: the original code loaded the base-patch16 checkpoint for both entries;
# the "ViT/L-14" entry presumably should point to the large-patch14 checkpoint.
pipes = {
    "ViT/B-16": pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch16"),
    "ViT/L-14": pipeline("zero-shot-image-classification", model="openai/clip-vit-large-patch14"),
}

inputs = [
    gr.Image(type="pil", label="Image"),
    gr.Textbox(lines=1, label="Candidate Labels"),
    gr.Radio(choices=["ViT/B-16", "ViT/L-14"], type="value", label="Model"),
    gr.Textbox(lines=1, label="Prompt Template", value="a photo of a {}"),
]


def shot(image, labels_text, model_name, hypothesis_template):
    # Parse the comma-separated label string into a clean list of candidates.
    labels = [label.strip() for label in labels_text.strip().split(",")]
    res = pipes[model_name](
        images=image,
        candidate_labels=labels,
        hypothesis_template=hypothesis_template,
    )
    # Return a {label: score} mapping for the "label" output component.
    return {dic["label"]: dic["score"] for dic in res}


iface = gr.Interface(
    shot,
    inputs,
    "label",
    examples=[["festival.jpg", "lantern, firecracker, couplet", "ViT/B-16", "a photo of a {}"]],
    description="""

Chinese CLIP is a contrastive-learning-based vision-language foundation model pretrained on large-scale Chinese data. For more information, please refer to the paper and the official GitHub repository. Chinese CLIP has also been merged into Hugging Face Transformers!

Paper: https://arxiv.org/pdf/2403.02714
To try the demo, provide a picture (either upload one manually or select one of the given examples) and enter the candidate labels, separated by commas. Optionally, you can also adjust the prompt template that is applied to each class label.
""", title="Cross-Domain Recognition") iface.launch()