FengHou97 committed on
Commit
1c2df8b
·
verified ·
1 Parent(s): 6c919b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -68,11 +68,12 @@ def shot(image, labels_text, model_name, hypothesis_template_prefix, hypothesis_
68
  domains = [domain.strip(" ") for domain in domains_text.strip(" ").split(",")]
69
  else:
70
  #img = Image.open(image)
71
- input_text = "Please describe the image from six dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (unoccluded, lightly-occluded, partially-occluded, moderately-occluded, heavily-occluded), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."# Please also output a probability of your inference."# If there is no information in a certain dimension, you can directly output no information.
 
72
  domains = gemini_response_vision(input_texts=input_text, image=image)
73
  #IMAGE_PATH = './reasoning_xy.jpg'
74
  # base64_image = encode_image('car.png')
75
- # prompt = "Please describe the image from six dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (unoccluded, lightly-occluded, partially-occluded, moderately-occluded, heavily-occluded), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."# Please also output a probability of your inference."# If there is no information in a certain dimension, you can directly output no information."
76
 
77
  # response = client.chat.completions.create(
78
  # model="gpt-4o",
@@ -104,7 +105,7 @@ iface = gr.Interface(shot,
104
  inputs,
105
  "label",
106
  examples=[
107
- #["festival.jpg", "lantern, firecracker, couplet", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", "clear, autumn, day, side, light occlusion"],
108
  ["car.png", "car, bike, truck", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", ""]],
109
  description="""<p>Chinese CLIP is a contrastive-learning-based vision-language foundation model pretrained on large-scale Chinese data. For more information, please refer to the paper and official github. Also, Chinese CLIP has already been merged into Huggingface Transformers! <br><br>
110
  Paper: <a href='https://arxiv.org/pdf/2403.02714'>https://arxiv.org/pdf/2403.02714</a> <br>
 
68
  domains = [domain.strip(" ") for domain in domains_text.strip(" ").split(",")]
69
  else:
70
  #img = Image.open(image)
71
+ #input_text = "Please describe the image from six dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (no occlusion, light occlusion, partial occlusion, moderate occlusion, heavy occlusion), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."# Please also output a probability of your inference."# If there is no information in a certain dimension, you can directly output no information.
72
+ input_text = "You are an expert for domain knowledge analysis. Please describe the image from six domain shifts, including Weather (clear, sandstorm, foggy, rainy, snowy), Season (spring-summer, autumn, winter), Time (daytime, night), Angle (front, side, top) and Occlusion (no occlusion, light occlusion, partial occlusion, moderate occlusion, heavy occlusion). You are supposed to recognize each domain from the above domain shifts based on the image. Finally, output a list of domains like ['clear', 'autumn', 'night', 'front', 'light occlusion']"
73
  domains = gemini_response_vision(input_texts=input_text, image=image)
74
  #IMAGE_PATH = './reasoning_xy.jpg'
75
  # base64_image = encode_image('car.png')
76
+ # prompt = "Please describe the image from six dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (no occlusion, light occlusion, partial occlusion, moderate occlusion, heavy occlusion), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."# Please also output a probability of your inference."# If there is no information in a certain dimension, you can directly output no information."
77
 
78
  # response = client.chat.completions.create(
79
  # model="gpt-4o",
 
105
  inputs,
106
  "label",
107
  examples=[
108
+ #["festival.jpg", "lantern, firecracker, couplet", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", "clear, autumn, day, side, partial occlusion"],
109
  ["car.png", "car, bike, truck", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", ""]],
110
  description="""<p>Chinese CLIP is a contrastive-learning-based vision-language foundation model pretrained on large-scale Chinese data. For more information, please refer to the paper and official github. Also, Chinese CLIP has already been merged into Huggingface Transformers! <br><br>
111
  Paper: <a href='https://arxiv.org/pdf/2403.02714'>https://arxiv.org/pdf/2403.02714</a> <br>