FengHou97 committed
Commit 8df8184 · verified · 1 Parent(s): bbdbb0c

Update app.py

Files changed (1): app.py +4 -3
app.py CHANGED
@@ -58,9 +58,9 @@ def shot(image, labels_text, model_name, hypothesis_template_prefix, hypothesis_
     if not domains_text == '':
         domains = [domain.strip(" ") for domain in domains_text.strip(" ").split(",")]
     else:
-        img = Image.open(image)
+        #img = Image.open(image)
         input_text = "Please describe the image from six dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (unoccluded, lightly-occluded, partially-occluded, moderately-occluded, heavily-occluded), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."# Please also output a probability of your inference."# If there is no information in a certain dimension, you can directly output no information.
-        domains = gemini_response_vision(input_texts=input_text, image=img)
+        domains = gemini_response_vision(input_texts=input_text, image=image)
     print(domains)
 
     hypothesis_template = hypothesis_template_prefix + ' ' + hypothesis_template_suffix.format(*domains)
@@ -76,7 +76,8 @@ def shot(image, labels_text, model_name, hypothesis_template_prefix, hypothesis_
 iface = gr.Interface(shot,
                      inputs,
                      "label",
-                     examples=[["festival.jpg", "lantern, firecracker, couplet", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", "clear, autumn, day, side, light occlusion"],
+                     examples=[
+                         #["festival.jpg", "lantern, firecracker, couplet", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", "clear, autumn, day, side, light occlusion"],
                                ["car.png", "car, bike, truck", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", ""]],
                      description="""<p>Chinese CLIP is a contrastive-learning-based vision-language foundation model pretrained on large-scale Chinese data. For more information, please refer to the paper and official github. Also, Chinese CLIP has already been merged into Huggingface Transformers! <br><br>
 Paper: <a href='https://arxiv.org/pdf/2403.02714'>https://arxiv.org/pdf/2403.02714</a> <br>
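
Reviewer note: the body of `gemini_response_vision` sits outside these hunks, so the diff only fixes its `input_texts` and `image` keywords. The sketch below is a hypothetical reconstruction using the google-generativeai client, not the app's actual code; the model name, the GOOGLE_API_KEY environment variable, and the comma-split return format are all assumptions. It illustrates why the commit can pass the raw Gradio input straight through: the helper, rather than the caller in `shot`, can normalize a file path to a PIL image.

# Hypothetical sketch of the helper this commit re-targets; assumptions
# noted above (client library, env var, model name, return format).
import os

import google.generativeai as genai
from PIL import Image

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])  # assumed key source

def gemini_response_vision(input_texts, image):
    # After this commit the caller forwards the raw Gradio value, so
    # accept either a file path or an already-open PIL image.
    if not isinstance(image, Image.Image):
        image = Image.open(image)  # what the pre-commit caller did itself
    model = genai.GenerativeModel("gemini-1.5-flash")  # assumed model name
    response = model.generate_content([input_texts, image])
    # The prompt asks for short comma-separated descriptors; split them into
    # the list that hypothesis_template_suffix.format(*domains) expects.
    return [part.strip() for part in response.text.split(",")]

Under that reading, commenting out `Image.open` in `shot` just moves the path-to-image conversion into the helper. Note that only the remaining `car.png` example ships an empty `domains_text`, so it is the one that exercises this Gemini branch.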