Spaces:

MykolaL
/

evp

Running on L4

evp / app.py

Upload app.py

44fbca1 verified about 1 month ago

6.73 kB

	import os
	import sys

	# Directory that contains this script; the bundled sub-projects live below it.
	_APP_ROOT = os.path.dirname(__file__)

	# Append each bundled sub-project directory to the module search path,
	# in the same order as before.
	for _subdir in (
	    'depth',
	    'refer',
	    'stable-diffusion',
	    'src/taming-transformers',
	    'src/clip',
	):
	    sys.path.append(os.path.abspath(os.path.join(_APP_ROOT, _subdir)))

	# Switch the working directory to depth/ so that relative paths used later
	# in this script are resolved from there.
	os.chdir(os.path.abspath(os.path.join(_APP_ROOT, 'depth')))


	import cv2
	import numpy as np
	import torch
	from depth.models_depth.model import EVPDepth
	from models_refer.model import EVPRefer
	from depth.configs.train_options import TrainOptions
	from depth.configs.test_options import TestOptions
	import glob
	import utils
	import torchvision.transforms as transforms
	from utils_depth.misc import colorize
	from PIL import Image
	import torch.nn.functional as F
	import gradio as gr
	import tempfile
	from transformers import CLIPTokenizer, AutoModel


	# Custom CSS that caps the height of the elements whose ids are
	# img-display-container, img-display-input and img-display-output.
	# The latter two ids are assigned to the gr.Image components through
	# their elem_id argument in the demo builders below.
	css = """
	#img-display-container {
	max-height: 50vh;
	}
	#img-display-input {
	max-height: 40vh;
	}
	#img-display-output {
	max-height: 40vh;
	}

	"""

	def create_depth_demo(model, device):
	    """Add the "Depth Prediction" widgets and their handler to the active Blocks.

	    Must be called inside a ``gr.Blocks`` (or tab) context, because the
	    components are created as a side effect of the call.

	    Args:
	        model: Callable depth network. It receives a ``(1, 3, 480, 480)`` float
	            tensor (the image resized to 440x480 and padded with 40 rows at the
	            top). The handler accepts a NumPy array, a tensor, or a mapping
	            with a ``'pred_d'`` entry as the result.
	        device: Torch device the input batch and the prediction are moved to.

	    Returns:
	        None.
	    """
	    gr.Markdown("### Depth Prediction demo")
	    with gr.Row():
	        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
	        depth_image = gr.Image(label="Depth Map", elem_id='img-display-output')
	    raw_file = gr.File(label="16-bit raw depth, multiplier:256")
	    submit = gr.Button("Submit")

	    to_tensor = transforms.ToTensor()
	    net_size = (440, 480)  # resize target before padding
	    pad_top = 40           # rows added above the image, cropped from the output
	    depth_scale = 256      # multiplier advertised in the raw-file label
	    uint16_max = np.iinfo(np.uint16).max

	    def on_submit(image):
	        """Run the depth model on *image*; return [colored map, raw PNG path]."""
	        if image is None:
	            # Submit pressed without an upload: show a message, not a traceback.
	            raise gr.Error("Please upload an image first.")

	        batch = to_tensor(image.convert('RGB')).unsqueeze(0).to(device)
	        orig_hw = batch.shape[2:]
	        batch = F.interpolate(batch, net_size, mode='bilinear', align_corners=True)
	        batch = F.pad(batch, (0, 0, pad_top, 0))
	        with torch.no_grad():
	            pred = model(batch)

	        # Accept every output form: mapping (older 'pred_d' layout), ndarray, tensor.
	        if isinstance(pred, dict):
	            pred = pred['pred_d']
	        if isinstance(pred, np.ndarray):
	            pred = torch.from_numpy(pred)
	        pred = pred.detach().to(device).float()

	        # Bring H x W or C x H x W predictions to N x C x H x W for interpolate().
	        while pred.dim() < 4:
	            pred = pred.unsqueeze(0)

	        pred = pred[:, :, pad_top:, :]  # drop the rows that correspond to the padding
	        pred = F.interpolate(pred, orig_hw, mode='bilinear', align_corners=True)
	        depth = pred[0, 0].cpu().numpy()
	        colored_depth, _, _ = colorize(depth, cmap='gray_r')

	        # Clip before the cast so out-of-range values saturate instead of wrapping.
	        raw = np.clip(depth * depth_scale, 0, uint16_max).astype('uint16')

	        # Only the unique name is needed; close the handle right away so it is
	        # not leaked and the path can be reopened for writing on any platform.
	        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
	            raw_path = tmp.name
	        Image.fromarray(raw).save(raw_path)
	        return [colored_depth, raw_path]

	    submit.click(on_submit, inputs=[input_image], outputs=[depth_image, raw_file])
	    gr.Examples(
	        examples=[
	            "imgs/test_img1.jpg",
	            "imgs/test_img2.jpg",
	            "imgs/test_img3.jpg",
	            "imgs/test_img4.jpg",
	            "imgs/test_img5.jpg",
	        ],
	        inputs=[input_image],
	    )


	def create_refseg_demo(model, tokenizer, device):
	    """Add the "Referring Segmentation" widgets and handler to the active Blocks.

	    Must be called inside a ``gr.Blocks`` (or tab) context, because the
	    components are created as a side effect of the call.

	    Args:
	        model: Callable invoked as ``model(image_tensor, text)`` where
	            ``image_tensor`` is a ``(1, 3, H, W)`` float tensor and ``text`` is
	            the raw prompt string. A tensor or array-like mask is accepted.
	        tokenizer: Not used by this function; kept so existing callers that
	            pass it keep working.
	        device: Torch device the input tensor is moved to.

	    Returns:
	        None.
	    """
	    gr.Markdown("### Referring Segmentation demo")
	    with gr.Row():
	        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
	        refseg_image = gr.Image(label="Output Mask", elem_id='img-display-output')
	    input_text = gr.Textbox(label='Prompt', placeholder='Please upload your image first', lines=2)
	    submit = gr.Button("Submit")

	    to_tensor = transforms.ToTensor()
	    dim_factor = 0.65  # brightness multiplier applied outside the mask

	    def on_submit(image, text):
	        """Segment the object described by *text* and return the overlay image."""
	        if image is None:
	            raise gr.Error("Please upload an image first.")
	        if not text or not text.strip():
	            raise gr.Error("Please enter a prompt describing the object to segment.")

	        # Force three channels so the RGB contour colour below is always valid.
	        image = image.convert('RGB')
	        image_t = to_tensor(image).unsqueeze(0).to(device)

	        with torch.no_grad():
	            out = model(image_t, text)

	        # Bring the output to a NumPy array and drop singleton dimensions for
	        # both tensor and array outputs, so a (1, H, W) array is not argmax-ed.
	        if isinstance(out, torch.Tensor):
	            out = out.detach().cpu().numpy()
	        mask = np.squeeze(np.asarray(out))

	        # If the model returns multiple channels, collapse them with argmax.
	        if mask.ndim > 2:
	            mask = np.argmax(mask, axis=0)

	        # NOTE(review): the cast truncates fractional values; this assumes the
	        # model emits integer/boolean labels - confirm against the model.
	        mask = np.ascontiguousarray(mask.astype(np.uint8))

	        image_np = np.array(image).copy()
	        height, width = image_np.shape[:2]
	        if mask.shape != (height, width):
	            # Nearest-neighbour keeps label values intact while matching the
	            # image resolution, which the boolean indexing below requires.
	            mask = cv2.resize(mask, (width, height), interpolation=cv2.INTER_NEAREST)

	        # Darken everything outside the mask.
	        background = mask == 0
	        image_np[background] = (image_np[background] * dim_factor).astype(np.uint8)

	        # [-2] selects the contour list on both OpenCV 3 (3 return values)
	        # and OpenCV 4 (2 return values).
	        contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
	        cv2.drawContours(image_np, contours, -1, (0, 255, 0), 2)

	        return Image.fromarray(image_np)

	    submit.click(on_submit, inputs=[input_image, input_text], outputs=refseg_image)
	    gr.Examples(
	        examples=[
	            ["imgs/test_img2.jpg", "green plant"],
	            ["imgs/test_img3.jpg", "chair"],
	            ["imgs/test_img4.jpg", "left green plant"],
	            ["imgs/test_img5.jpg", "man walking on foot"],
	            ["imgs/test_img5.jpg", "the rightest camel"],
	        ],
	        inputs=[input_image, input_text],
	    )



	def main():
	    """Load both EVP models, build the two-tab Gradio UI and launch it."""
	    upload_2_models = True

	    # The parsed options are not used below; the call is kept so command-line
	    # handling behaves as before (presumably an argparse parser - TODO confirm).
	    opt = TestOptions().initialize()
	    opt.parse_args()

	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    if upload_2_models:
	        # NOTE(review): trust_remote_code=True executes Python code shipped with
	        # the model repositories; keep it only for repositories you control.
	        model = AutoModel.from_pretrained("MykolaL/evp_depth", trust_remote_code=True).to(device).eval()

	        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
	        model_refseg = AutoModel.from_pretrained("MykolaL/evp_refer", trust_remote_code=True).to(device).eval()
	        print('Models uploaded successfully')

	    title = "# EVP"
	    description = """Official demo for **EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature
	Refinement and Regularized Image-Text Alignment**.
	EVP is a deep learning model for metric depth estimation from a single image as well as referring segmentation.
	Please refer to our [project page](https://lavreniuk.github.io/EVP) or [paper](https://arxiv.org/abs/2312.08548) or [github](https://github.com/Lavreniuk/EVP) for more details."""

	    # Pass the module-level stylesheet so the rules for the img-display-* ids
	    # actually take effect; it was previously defined but never applied.
	    # NOTE(review): if the pinned Gradio release expects `css` on launch()
	    # rather than on Blocks, move the argument there - confirm the version.
	    with gr.Blocks(css=css) as demo:
	        gr.Markdown(title)
	        gr.Markdown(description)
	        if upload_2_models:
	            with gr.Tab("Depth Prediction"):
	                create_depth_demo(model, device)
	            with gr.Tab("Referring Segmentation"):
	                create_refseg_demo(model_refseg, tokenizer, device)
	        gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/MykolaL/evp?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
	<p><img src="https://visitor-badge.glitch.me/badge?page_id=MykolaL/evp" alt="visitors"></p></center>''')

	    demo.queue().launch(share=True)


	# Start the demo only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()