from unittest.mock import patch
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
from transformers.dynamic_module_utils import get_imports
import torch
from PIL import Image
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import io
# Define colormap
colormap = ['red', 'green', 'blue', 'yellow', 'orange', 'purple', 'cyan']
# Workaround to fix import issues for Florence-2 model
def workaround_fixed_get_imports(filename):
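    """Return get_imports(filename) with 'flash_attn' stripped for Florence-2's remote modeling file, so the model loads on machines without flash-attention installed."""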
if not str(filename).endswith("/modeling_florence2.py"):
return get_imports(filename)
imports = get_imports(filename)
if "flash_attn" in imports:
imports.remove("flash_attn") # Remove 'flash_attn' if it's causing issues
return imports
def initialize_model():
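    """Load the Florence-2 model and processor onto GPU if available, else CPU; returns (model, processor, device), or (None, None, device) on failure."""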
# Check if CUDA (GPU) is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Patch the get_imports function and load the model and processor
with patch("transformers.dynamic_module_utils.get_imports", workaround_fixed_get_imports):
try:
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large-ft", trust_remote_code=True).to(device).eval()
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large-ft", trust_remote_code=True)
print("Model and processor loaded successfully.")
return model, processor, device
except Exception as e:
print(f"An error occurred while loading the model or processor: {e}")
return None, None, device
# Initialize the model and processor
model, processor, device = initialize_model()
def run_example(task_prompt, image, text_input=None):
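    """Run a Florence-2 task prompt (optionally suffixed with text_input) on a PIL image and return the parsed, post-processed answer."""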
if text_input is None:
prompt = task_prompt
else:
prompt = task_prompt + text_input
    # Move inputs to the model's device so generation works on GPU as well as CPU
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    with torch.inference_mode():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = processor.post_process_generation(
generated_text,
task=task_prompt,
image_size=(image.width, image.height)
)
return parsed_answer
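# Sketch of the expected result shape (inferred from how plot_bbox_img and
# draw_polygons consume it below; exact keys depend on the task token):
#   run_example("<OD>", pil_image)
#   -> {'<OD>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': [...]}}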
def fig_to_pil(fig):
    """Serialize a matplotlib figure to a PIL image and close the figure."""
    buf = io.BytesIO()
    fig.savefig(buf, format='png', dpi=300, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close(fig)  # Close the figure so repeated calls don't accumulate open figures
    return img
def plot_bbox_img(image, data):
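    """Render the image with labeled bounding boxes from a Florence-2 result dict ({'bboxes': [...], 'labels': [...]}) and return it as a PIL image."""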
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(image)
if 'bboxes' in data and 'labels' in data:
bboxes, labels = data['bboxes'], data['labels']
else:
return fig_to_pil(fig)
for bbox, label in zip(bboxes, labels):
x1, y1, x2, y2 = bbox
rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=2, edgecolor='indigo', facecolor='none')
ax.add_patch(rect)
plt.text(x1, y1, label, color='white', fontsize=10, bbox=dict(facecolor='indigo', alpha=0.8))
ax.axis('off')
return fig_to_pil(fig)
def draw_polygons(image, prediction, fill_mask=False):
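    """Overlay labeled polygons from a Florence-2 segmentation result on the image; flat [x1, y1, x2, y2, ...] coordinate lists are paired into (x, y) points."""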
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(image)
for polygons, label in zip(prediction.get('polygons', []), prediction.get('labels', [])):
color = random.choice(colormap)
for polygon in polygons:
if isinstance(polygon[0], (int, float)):
polygon = [(polygon[i], polygon[i+1]) for i in range(0, len(polygon), 2)]
poly = patches.Polygon(polygon, edgecolor=color, facecolor=color if fill_mask else 'none', alpha=0.5 if fill_mask else 1, linewidth=2)
ax.add_patch(poly)
if polygon:
plt.text(polygon[0][0], polygon[0][1], label, color='white', fontsize=10, bbox=dict(facecolor=color, alpha=0.8))
ax.axis('off')
return fig_to_pil(fig)
def process_image(image, task, text):
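    """Map the UI task name to its Florence-2 prompt, run inference, and return a (result text, output image) pair for display in Gradio."""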
task_mapping = {
"Caption": ("<CAPTION>", lambda result: (result['<CAPTION>'], image)),
"Detailed Caption": ("<DETAILED_CAPTION>", lambda result: (result['<DETAILED_CAPTION>'], image)),
"More Detailed Caption": ("<MORE_DETAILED_CAPTION>", lambda result: (result.get('<MORE_DETAILED_CAPTION>', 'Failed to generate detailed caption'), image)),
"Caption to Phrase Grounding": ("<CAPTION_TO_PHRASE_GROUNDING>", lambda result: (str(result['<CAPTION_TO_PHRASE_GROUNDING>']), plot_bbox_img(image, result['<CAPTION_TO_PHRASE_GROUNDING>']))),
"Object Detection": ("<OD>", lambda result: (str(result['<OD>']), plot_bbox_img(image, result['<OD>']))),
"Referring Expression Segmentation": ("<REFERRING_EXPRESSION_SEGMENTATION>", lambda result: (str(result['<REFERRING_EXPRESSION_SEGMENTATION>']), draw_polygons(image, result['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True))),
"Region to Segmentation": ("<REGION_TO_SEGMENTATION>", lambda result: (str(result['<REGION_TO_SEGMENTATION>']), draw_polygons(image, result['<REGION_TO_SEGMENTATION>'], fill_mask=True))),
"OCR": ("<OCR>", lambda result: (result['<OCR>'], image)),
}
if task in task_mapping:
prompt, process_func = task_mapping[task]
print(f"Task: {task}, Prompt: {prompt}") # Debugging statement
result = run_example(prompt, image, text)
print(f"Result: {result}") # Debugging statement
return process_func(result)
else:
return "", image
image_path_1 = "Fiat-500-9-scaled.jpg"
image_path_2 = "OCR_2.png"
with gr.Blocks() as demo:
gr.HTML("<h1><center>Florence-2 Vision</center></h1>")
with gr.Tab(label="Image"):
with gr.Row():
with gr.Column():
input_img = gr.Image(label="Input Picture", type="pil")
task_dropdown = gr.Dropdown(
choices=["Caption", "Detailed Caption", "More Detailed Caption", "Object Detection", "Caption to Phrase Grounding", "Referring Expression Segmentation", "Region to Segmentation", "OCR"],
label="Task", value="Caption"
)
text_input = gr.Textbox(label="Text Input (Optional)", visible=False)
gr.Examples(
examples=[
[image_path_1, "Detailed Caption", ""],
[image_path_1, "Object Detection", ""],
[image_path_1, "More Detailed Caption", ""],
[image_path_1, "Caption to Phrase Grounding", "A white car parked on the street."],
[image_path_1, "Region to Segmentation", ""],
[image_path_2, "OCR", ""]
],
inputs=[input_img, task_dropdown, text_input],
                    cache_examples=False  # Don't precompute example outputs
)
submit_btn = gr.Button(value="Submit")
with gr.Column():
output_text = gr.Textbox(label="Results")
output_image = gr.Image(label="Image", type="pil")
    def update_text_input(task):
        # Show the text box only for tasks that accept extra text input
        return gr.update(visible=task in ["Caption to Phrase Grounding", "Referring Expression Segmentation", "Region to Segmentation"])
task_dropdown.change(fn=update_text_input, inputs=task_dropdown, outputs=text_input)
submit_btn.click(fn=process_image, inputs=[input_img, task_dropdown, text_input], outputs=[output_text, output_image])
demo.launch()