Update app.py
app.py
CHANGED
@@ -58,9 +58,9 @@ def shot(image, labels_text, model_name, hypothesis_template_prefix, hypothesis_
     if not domains_text == '':
         domains = [domain.strip(" ") for domain in domains_text.strip(" ").split(",")]
     else:
-        img = Image.open(image)
+        #img = Image.open(image)
         input_text = "Please describe the image from six dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (unoccluded, lightly-occluded, partially-occluded, moderately-occluded, heavily-occluded), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."# Please also output a probability of your inference."# If there is no information in a certain dimension, you can directly output no information.
-        domains = gemini_response_vision(input_texts=input_text, image=
+        domains = gemini_response_vision(input_texts=input_text, image=image)
     print(domains)

     hypothesis_template = hypothesis_template_prefix + ' ' + hypothesis_template_suffix.format(*domains)
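This hunk completes a call that was previously cut off mid-argument (`image=`) and comments out the now-unneeded `Image.open` line, passing the raw `image` input straight through. `gemini_response_vision` itself is defined elsewhere in app.py and is not part of this diff; for the fixed call to work, it has to take the prompt plus the image and return the dimension values as a sequence that `hypothesis_template_suffix.format(*domains)` can unpack. A minimal sketch of such a helper, assuming the google-generativeai SDK (the model name, key handling, and comma parsing below are assumptions, not the repo's code):

import google.generativeai as genai
from PIL import Image

genai.configure(api_key="YOUR_API_KEY")  # assumption: the real app loads its key from config/env

def gemini_response_vision(input_texts, image):
    # Sketch: ask Gemini to describe the image and return the dimension values as a list.
    model = genai.GenerativeModel("gemini-pro-vision")  # assumed model name
    img = Image.open(image) if isinstance(image, str) else image  # accept a file path or a PIL image
    response = model.generate_content([input_texts, img])
    # The prompt asks for short, comma-separable dimension values, so split on commas;
    # the caller unpacks the result via hypothesis_template_suffix.format(*domains).
    return [part.strip() for part in response.text.split(",")]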
@@ -76,7 +76,8 @@ def shot(image, labels_text, model_name, hypothesis_template_prefix, hypothesis_
 iface = gr.Interface(shot,
                      inputs,
                      "label",
-                     examples=[
+                     examples=[
+                     #["festival.jpg", "lantern, firecracker, couplet", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", "clear, autumn, day, side, light occlusion"],
                      ["car.png", "car, bike, truck", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", ""]],
                      description="""<p>Chinese CLIP is a contrastive-learning-based vision-language foundation model pretrained on large-scale Chinese data. For more information, please refer to the paper and official github. Also, Chinese CLIP has already been merged into Huggingface Transformers! <br><br>
                      Paper: <a href='https://arxiv.org/pdf/2403.02714'>https://arxiv.org/pdf/2403.02714</a> <br>
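The newly added (commented-out) festival row shows what a hand-written domains_text looks like. Tracing it through the parsing branch and the template line from the first hunk (a worked illustration, not code from the repo):

hypothesis_template_prefix = "a photo of a {}"
hypothesis_template_suffix = "in {} {} {} from {} with {}."
domains_text = "clear, autumn, day, side, light occlusion"

# Same parsing as the `if` branch in the first hunk.
domains = [domain.strip(" ") for domain in domains_text.strip(" ").split(",")]

# .format(*domains) fills the suffix's five placeholders; the prefix's `{}`
# is left intact and is substituted with each class label later on.
hypothesis_template = hypothesis_template_prefix + ' ' + hypothesis_template_suffix.format(*domains)
print(hypothesis_template)
# a photo of a {} in clear autumn day from side with light occlusion.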
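For context on the examples rows: gr.Interface matches each example row to the input components by position, so every row must carry six values. The `inputs` list is defined outside this diff; a plausible shape, inferred from shot's signature in the hunk header (the component choices are guesses, not the repo's code):

import gradio as gr

# Hypothetical reconstruction: each six-element example row maps onto these in order.
inputs = [
    gr.Image(type="filepath", label="image"),        # "car.png"
    gr.Textbox(label="labels_text"),                 # "car, bike, truck"
    gr.Dropdown(["ViT/B-16"], label="model_name"),   # "ViT/B-16"
    gr.Textbox(label="hypothesis_template_prefix"),  # "a photo of a {}"
    gr.Textbox(label="hypothesis_template_suffix"),  # "in {} {} {} from {} with {}."
    gr.Textbox(label="domains_text"),                # "" leaves the field empty
]

Under this reading, the empty sixth field in the car.png row is what routes the request through the Gemini branch in the first hunk, while the commented-out festival row supplies its domains by hand.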