import os

import gradio as gr
import numpy as np
from PIL import Image
from dotenv import load_dotenv
from transformers import pipeline
import google.generativeai as genai
# from openai import OpenAI
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # read the key from the environment rather than hard-coding it
# import base64
# # Open the image file and encode it as a base64 string
# def encode_image(image_path):
#     with open(image_path, "rb") as image_file:
#         return base64.b64encode(image_file.read()).decode("utf-8")
# Load the Gemini API key from a .env file / the environment instead of hard-coding it
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model_vision = genai.GenerativeModel('gemini-pro-vision')
def gemini_response_vision(input_texts, image):
    """Send the image (optionally preceded by a text prompt) to Gemini and return its text response."""
    if input_texts != "":
        response2 = model_vision.generate_content([input_texts, image])
    else:
        response2 = model_vision.generate_content(image)
    return response2.text
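# Illustrative usage (a sketch, not executed here; assumes a local file "car.png" exists):
#   print(gemini_response_vision("Describe the weather in this scene.", Image.open("car.png")))
# Gemini returns free-form text, e.g. "clear, front, daytime, unoccluded, winter".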
# Zero-shot image classification pipelines for two CLIP backbones
pipes = {
    "ViT/B-16": pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch16"),
    "ViT/L-14": pipeline("zero-shot-image-classification", model="openai/clip-vit-large-patch14"),
}
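# Illustrative usage (a sketch, not executed here): each pipeline returns a list of
# {"label": ..., "score": ...} dicts sorted by score, e.g.
#   pipes["ViT/B-16"](images=Image.open("car.png"),
#                     candidate_labels=["car", "bike"],
#                     hypothesis_template="a photo of a {}.")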
inputs = [
    gr.Image(type='pil',
             label="Image"),
    gr.Textbox(lines=1,
               label="Candidate Labels", placeholder="Add a class label, one by one"),
    gr.Radio(choices=[
        "ViT/B-16",
        "ViT/L-14",
    ], type="value", label="Model"),
    gr.Textbox(lines=1,
               label="Prompt Template Prefix",
               placeholder="Optional prompt template as prefix",
               value="a photo of a {}"),
    gr.Textbox(lines=1,
               label="Prompt Template Suffix",
               placeholder="Optional prompt template as suffix",
               value="in {} {} {} from {} with {}."),
    gr.Textbox(lines=1,
               label="Prior Domains", placeholder="Add a domain label, one by one"),
]
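# The prefix and suffix templates are joined into a single CLIP hypothesis template, e.g.
#   "a photo of a {}" + " " + "in clear spring-summer daytime from front with unoccluded."
# where the suffix placeholders are filled with the five domain attributes.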
images="festival.jpg" | |
def shot(image, labels_text, model_name, hypothesis_template_prefix, hypothesis_template_suffix, domains_text):
    # Parse the comma-separated candidate class labels
    labels = [label.strip(" ") for label in labels_text.strip(" ").split(",")]
    if domains_text != '':
        domains = [domain.strip(" ") for domain in domains_text.strip(" ").split(",")]
    else:
        # No prior domains given: ask Gemini to infer them from the image.
        # img = Image.open(image)
        input_text = "Please describe the image from five dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (unoccluded, lightly-occluded, partially-occluded, moderately-occluded, heavily-occluded), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."
        # Possible additions: "Please also output a probability of your inference." / "If there is no information in a certain dimension, you can directly output no information."
        # Gemini returns free text; assume it is comma- or newline-separated, split it into a list,
        # and pad/truncate to the five slots expected by the suffix template.
        raw = gemini_response_vision(input_texts=input_text, image=image)
        domains = [d.strip() for d in raw.replace("\n", ",").split(",") if d.strip()]
        domains = (domains + ["N/A"] * 5)[:5]
        # IMAGE_PATH = './reasoning_xy.jpg'
        # base64_image = encode_image('car.png')
        # prompt = "Please describe the image from six dimensions, including weather (clear, sandstorm, foggy, rainy, snowy), angle (front, left, top), time (daytime, night), occlusion (unoccluded, lightly-occluded, partially-occluded, moderately-occluded, heavily-occluded), season (spring-summer, autumn, winter). Each dimension should be described in no more than 4 words and should match the image content. Please try to output from the options in the previous brackets. If there is no suitable result, output N/A."
        # response = client.chat.completions.create(
        #     model="gpt-4o",
        #     messages=[
        #         # {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
        #         {"role": "user", "content": [
        #             {"type": "text", "text": prompt},
        #             {"type": "image_url", "image_url": {
        #                 "url": f"data:image/png;base64,{base64_image}"}
        #             }
        #         ]},
        #     ],
        #     temperature=0.0,
        # )
        # domains = response.choices[0].message.content
    print(domains)
    # Combine the prefix and the domain-filled suffix into the final hypothesis template
    hypothesis_template = hypothesis_template_prefix + ' ' + hypothesis_template_suffix.format(*domains)
    print(hypothesis_template)
    res = pipes[model_name](images=image,
                            candidate_labels=labels,
                            hypothesis_template=hypothesis_template)
    # Map each predicted label to its score for the gr.Label output
    return {dic["label"]: dic["score"] for dic in res}

# Example prior domains: clear, winter, day, front, moderate occlusion
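# Illustrative direct call (a sketch, not executed here; bypasses the Gradio UI and assumes "car.png" exists):
#   scores = shot(Image.open("car.png"), "car, bike, truck", "ViT/B-16",
#                 "a photo of a {}", "in {} {} {} from {} with {}.",
#                 "clear, winter, day, front, moderate occlusion")
#   print(scores)  # e.g. {"car": 0.97, "bike": 0.02, "truck": 0.01}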
iface = gr.Interface(shot,
                     inputs,
                     "label",
                     examples=[
                         # ["festival.jpg", "lantern, firecracker, couplet", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", "clear, autumn, day, side, light occlusion"],
                         ["car.png", "car, bike, truck", "ViT/B-16", "a photo of a {}", "in {} {} {} from {} with {}.", ""]],
                     description="""<p>This demo performs cross-domain zero-shot image recognition with CLIP. If no prior domains are provided, Gemini infers the domain attributes (weather, angle, time, occlusion, season) from the image, and they are filled into the suffix prompt template before classification. <br><br>
                     Paper: <a href='https://arxiv.org/pdf/2403.02714'>https://arxiv.org/pdf/2403.02714</a> <br>
                     To begin, provide a picture (upload one manually or select one of the examples) and add class labels one by one. Optionally, you can also add a prompt template as a prefix to the class labels. <br>""",
                     title="Cross-Domain Recognition")
iface.launch()