Spaces:

qihoo360
/

FG-CLIP-Retrieval-demo

Running

App Files Files Community

qingshan777 committed on Jul 8

Commit

703c654

verified ·

1 Parent(s): 403d070

Upload 4 files

Browse files

Files changed (5) hide show

.gitattributes +1 -0
app.py +153 -0
cat.jpg +3 -0
cat_dfclor.jpg +0 -0
requirements.txt +9 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cat.jpg filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import gradio as gr
+import torch
+import io
+from PIL import Image
+from transformers import (
+    AutoImageProcessor,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+)
+import numpy as np
+# Hugging Face Hub id shared by the model, the tokenizer and the image processor.
+model_root = "qihoo360/fg-clip-base"
+# trust_remote_code=True allows transformers to execute modeling code shipped in the
+# checkpoint repo; presumably that is where get_image_features, get_text_features and
+# get_image_dense_features come from -- TODO confirm against the model card.
+model = AutoModelForCausalLM.from_pretrained(model_root,trust_remote_code=True)
+# No explicit device placement happens here: inputs are moved to wherever the model was loaded.
+device = model.device
+tokenizer = AutoTokenizer.from_pretrained(model_root)
+image_processor = AutoImageProcessor.from_pretrained(model_root)
+import math
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+def postprocess_result(probs, labels):
+    """Pair every label with the probability stored at the same index.
+
+    Returns a dict mapping label -> probability. A label that occurs more than
+    once keeps the value of its last occurrence; extra entries in ``probs``
+    beyond ``len(labels)`` are ignored.
+    """
+    return {name: probs[position] for position, name in enumerate(labels)}
+def Retrieval(image, candidate_labels):
+    """
+    Score one image against a set of candidate captions.
+
+    Args:
+        image: PIL image; it is converted to RGB and resized to 224x224 before preprocessing.
+        candidate_labels: captions handed straight to the tokenizer (the caller in this
+            file passes a list of strings, already split on commas).
+
+    Returns:
+        A list of floats: the softmax over the captions for the single input image.
+    """
+    side = 224
+    rgb = image.convert("RGB").resize((side, side))
+    pixel_values = image_processor.preprocess(rgb, return_tensors='pt')['pixel_values'].to(device)
+    # Every caption is padded/truncated to 77 tokens so the ids form a rectangular tensor.
+    encoded = tokenizer(candidate_labels, max_length=77, padding="max_length", truncation=True)
+    token_ids = torch.tensor(encoded.input_ids, dtype=torch.long, device=device)
+    with torch.no_grad():
+        image_emb = model.get_image_features(pixel_values)
+        text_emb = model.get_text_features(token_ids, walk_short_pos=True)
+        # L2-normalise both sides so the dot product below is a cosine similarity.
+        image_emb = image_emb / image_emb.norm(p=2, dim=-1, keepdim=True)
+        text_emb = text_emb / text_emb.norm(p=2, dim=-1, keepdim=True)
+        scaled_logits = model.logit_scale.exp() * (image_emb @ text_emb.T)
+        probs = scaled_logits.softmax(dim=1)
+    # Only one image is scored, so return the first (and only) row.
+    return probs[0].tolist()
+def Get_Densefeature(image, candidate_labels):
+    """
+    Render the patch-level similarity between an image and each label as a heatmap.
+
+    Args:
+        image: PIL image from the Gradio input (None when nothing was uploaded).
+        candidate_labels: comma-separated string of labels.
+
+    Returns:
+        A PIL image of the rendered figure, with one heatmap per label laid out
+        side by side (a single label yields a single 6x6 inch plot).
+
+    Raises:
+        gr.Error: if no image or no non-blank label was provided.
+    """
+    if image is None:
+        raise gr.Error("Please provide an image.")
+    # Strip both ends first and filter afterwards, so a trailing "," or ", "
+    # cannot sneak an empty caption into the batch.
+    labels = [label.strip() for label in (candidate_labels or "").split(",")]
+    labels = [label for label in labels if label]
+    if not labels:
+        raise gr.Error("Please provide at least one label.")
+    image_size = 224
+    image = image.convert("RGB").resize((image_size, image_size))
+    image_input = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].to(device)
+    caption_input = torch.tensor(tokenizer(labels, max_length=77, padding="max_length", truncation=True).input_ids, dtype=torch.long, device=device)
+    with torch.no_grad():
+        dense_image_feature = model.get_image_dense_features(image_input)
+        text_feature = model.get_text_features(caption_input, walk_short_pos=True)
+        text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
+        dense_image_feature = dense_image_feature / dense_image_feature.norm(p=2, dim=-1, keepdim=True)
+        # Keep the text side explicitly 2-D (labels, dim) so the product is always
+        # (patches, labels), also when only one label is given.
+        text_feature = text_feature.reshape(len(labels), -1)
+        similarity = dense_image_feature.squeeze() @ text_feature.T
+    similarity = similarity.cpu().numpy()
+    # The patch axis is assumed to form a square grid, as in the single-label case.
+    patch_size = int(math.sqrt(similarity.shape[0]))
+    fig, axes = plt.subplots(1, len(labels), figsize=(6 * len(labels), 6), squeeze=False)
+    try:
+        for ax, label, scores in zip(axes[0], labels, similarity.T):
+            ax.imshow(scores.reshape(patch_size, patch_size))
+            ax.set_title('similarity Visualization' if len(labels) == 1 else label)
+            ax.axis('off')
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png')
+    finally:
+        # Always release the figure, even if rendering fails.
+        plt.close(fig)
+    buf.seek(0)
+    # The buffer is deliberately left open: PIL decodes the PNG lazily from it.
+    return Image.open(buf)
+def infer(image, candidate_labels):
+    """
+    Gradio callback: score an image against a comma-separated list of labels.
+
+    Args:
+        image: PIL image from the Gradio input (None when nothing was uploaded).
+        candidate_labels: comma-separated string of labels.
+
+    Returns:
+        A dict mapping each label to its probability.
+
+    Raises:
+        gr.Error: if no image or no non-blank label was provided.
+    """
+    if image is None:
+        raise gr.Error("Please provide an image.")
+    # Strip both ends first and filter afterwards, so whitespace-only entries
+    # (e.g. from a trailing ", ") are dropped instead of becoming empty captions.
+    labels = [label.strip() for label in (candidate_labels or "").split(",")]
+    labels = [label for label in labels if label]
+    if not labels:
+        raise gr.Error("Please provide at least one label.")
+    fg_probs = Retrieval(image, labels)
+    return postprocess_result(fg_probs, labels)
+# Build the demo page: inputs and buttons in the left column, outputs in the right column.
+with gr.Blocks() as demo:
+    gr.Markdown("# FG-CLIP Retrieval")
+    gr.Markdown(
+        "This app uses the FG-CLIP model (qihoo360/fg-clip-base) for retrieval on CPU :"
+    )
+    with gr.Row():
+        with gr.Column():
+            # type="pil" makes Gradio hand the callbacks a PIL image.
+            image_input = gr.Image(type="pil")
+            text_input = gr.Textbox(label="Input a list of labels (comma seperated)")
+            run_button = gr.Button("Run Retrieval", visible=True)
+            dfs_button = gr.Button("Run Densefeature", visible=True)
+        with gr.Column():
+            fg_output = gr.Label(label="FG-CLIP Output", num_top_classes=11)
+            dfs_output = gr.Image(label="Similarity Visualization", type="pil")
+    # Each example is [image path, comma-separated labels].
+    # NOTE(review): confirm that every referenced image file exists in the repo.
+    examples = [
+        # ["./baklava.jpg", "dessert on a plate, a serving of baklava, a plate and spoon"],
+        # ["./dog.jpg", "A light brown wood stool, A bucket with a body made of dark brown plastic, A black velvet back cover for a cellular telephone, A green ball with a perforated pattern, A light blue plastic helmet made of plastic, A grey slipper made of wool, A newspaper with white and black perforated printed on a paper texture, A blue dog with a white colored head, A yellow sponge with a dark green rough surface, A book with white, dark orange and brown pages made of paper, A black ceramic scarf with a body made of fabric."],
+        ["./Landscape.jpg", "red grass, yellow grass, green grass"],
+        ["./cat.jpg", "two sleeping cats, two cats playing, three cats laying down"],
+        ["./cat_dfclor.jpg", "white cat,"],
+    ]
+    # Only `inputs` is passed (fn/outputs are commented out), so no callback is attached
+    # to the examples; the buttons below are what trigger inference.
+    gr.Examples(
+        examples=examples,
+        inputs=[image_input, text_input],
+        # outputs=fg_output,
+        # fn=infer,
+    )
+    # Both buttons read the same image and label inputs but feed different outputs.
+    run_button.click(fn=infer, inputs=[image_input, text_input], outputs=fg_output)
+    dfs_button.click(fn=Get_Densefeature, inputs=[image_input, text_input], outputs=dfs_output)
+# Launched unconditionally at module level (there is no __main__ guard).
+demo.launch()

cat.jpg ADDED Viewed

Git LFS Details

SHA256: dea9e7ef97386345f7cff32f9055da4982da5471c48d575146c796ab4563b04e
Pointer size: 131 Bytes
Size of remote file: 173 kB

cat_dfclor.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+torch
+gradio
+accelerate
+transformers==4.41.0
+pillow
+einops
+torchvision
+matplotlib
+numpy