import argparse
import os
import random
from collections import defaultdict
import cv2
import re
import numpy as np
from PIL import Image
import torch
import html
import gradio as gr
import torchvision.transforms as T
import torch.backends.cudnn as cudnn
from minigpt4.common.config import Config
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import Conversation, SeparatorStyle, Chat
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
import socket
import os
import spaces
def find_free_port(start_port, end_port):
    """Return the first port in ``[start_port, end_port]`` with no local listener.

    Each candidate is probed by attempting a TCP connection to ``localhost``.
    A non-zero ``connect_ex`` result is taken to mean that nothing is
    listening there, and that port is returned immediately.

    Args:
        start_port: first port to probe (inclusive).
        end_port: last port to probe (inclusive).

    Returns:
        The lowest port in the range whose connection attempt failed.

    Raises:
        OSError: if every probed port accepted the connection, or the range
            is empty.
    """

    def _is_unused(candidate):
        # connect_ex returns 0 only when the connection succeeded, i.e. a
        # server is already accepting on that port.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            return probe.connect_ex(('localhost', candidate)) != 0

    for candidate in range(start_port, end_port + 1):
        if _is_unused(candidate):
            return candidate

    raise OSError(f"Cannot find empty port in range: {start_port}-{end_port}")
def set_gradio_server_port(start_port=7870, end_port=9999):
    """Pick a free local port and export it as ``GRADIO_SERVER_PORT``.

    The port range used to be hard-coded; it is now configurable while the
    defaults keep the previous behaviour for existing zero-argument callers.

    Args:
        start_port: first port to try (inclusive). Defaults to 7870.
        end_port: last port to try (inclusive). Defaults to 9999.

    Returns:
        The selected port number.

    Raises:
        OSError: propagated from ``find_free_port`` when no port in the
            range is free.
    """
    free_port = find_free_port(start_port, end_port)
    # os.environ only accepts strings.
    os.environ["GRADIO_SERVER_PORT"] = str(free_port)
    print(f"Set GRADIO_SERVER_PORT to {free_port}")
    return free_port
# Runs at module load: pick a free port and export it through the
# GRADIO_SERVER_PORT environment variable (see set_gradio_server_port).
set_gradio_server_port()
def parse_args():
    """Parse the demo's command-line arguments from ``sys.argv``.

    Recognised flags:
        --cfg-path: path to the configuration file
            (defaults to ``'eval_configs/demo.yaml'``).
        --options: one or more override strings; ``None`` when omitted.

    Returns:
        argparse.Namespace exposing ``cfg_path`` and ``options``.
    """
    options_help = (
        "override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead."
    )

    cli = argparse.ArgumentParser(description="Demo")
    cli.add_argument(
        "--cfg-path",
        default='eval_configs/demo.yaml',
        help="path to configuration file.",
    )
    cli.add_argument("--options", nargs="+", help=options_help)
    return cli.parse_args()
# Seed the Python, NumPy and PyTorch RNGs with a fixed value so repeated runs
# draw the same random numbers.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
# Disable cuDNN benchmark mode and request deterministic algorithms.
cudnn.benchmark = False
cudnn.deterministic = True
print('Initializing Chat')
# Everything below runs at module load: CLI flags are parsed and the model is
# built as soon as this file is executed or imported.
args = parse_args()
cfg = Config(args)
# NOTE(review): the device is hard-coded, so the model is moved to CUDA during
# module load.
device = 'cuda'
model_config = cfg.model_cfg
print("model_config:", model_config)
# Look up the model class registered under the configured architecture name,
# build it from the config and move it to `device`.
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(device)
# NOTE(review): presumably the side length of the coordinate grid used for
# bounding-box tokens (mask2bbox also resizes masks to 100x100) — confirm.
bounding_box_size = 100
# The visual processor is built from the *train* settings of the
# `feature_face_caption` dataset entry in the config.
vis_processor_cfg = cfg.datasets_cfg.feature_face_caption.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
# Switch the model to evaluation mode for inference.
model = model.eval()
# Empty conversation template: no system prompt, no messages, and an empty
# separator. gradio_ask copies it (CONV_VISION.copy()) to start or reset a chat.
# The two roles are the "[INST] " / " [/INST]" markers.
# NOTE(review): the meaning of offset=2 is defined by Conversation, which is
# not visible in this file — confirm before changing.
CONV_VISION = Conversation(
system="",
roles=(r"[INST] ", r" [/INST]"),
messages=[],
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="",
)
def extract_substrings(string):
# first check if there is no-finished bracket
index = string.rfind('}')
if index != -1:
string = string[:index + 1]
pattern = r'
(.*?)\}(?!<)' matches = re.findall(pattern, string) substrings = [match for match in matches] return substrings def is_overlapping(rect1, rect2): x1, y1, x2, y2 = rect1 x3, y3, x4, y4 = rect2 return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4) def computeIoU(bbox1, bbox2): x1, y1, x2, y2 = bbox1 x3, y3, x4, y4 = bbox2 intersection_x1 = max(x1, x3) intersection_y1 = max(y1, y3) intersection_x2 = min(x2, x4) intersection_y2 = min(y2, y4) intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(0, intersection_y2 - intersection_y1 + 1) bbox1_area = (x2 - x1 + 1) * (y2 - y1 + 1) bbox2_area = (x4 - x3 + 1) * (y4 - y3 + 1) union_area = bbox1_area + bbox2_area - intersection_area iou = intersection_area / union_area return iou def save_tmp_img(visual_img): file_name = "".join([str(random.randint(0, 9)) for _ in range(5)]) + ".jpg" file_path = "/tmp/gradio" + file_name visual_img.save(file_path) return file_path def mask2bbox(mask): if mask is None: return '' mask = mask.resize([100, 100], resample=Image.NEAREST) mask = np.array(mask)[:, :, 0] rows = np.any(mask, axis=1) cols = np.any(mask, axis=0) if rows.sum(): # Get the top, bottom, left, and right boundaries rmin, rmax = np.where(rows)[0][[0, -1]] cmin, cmax = np.where(cols)[0][[0, -1]] bbox = '{{<{}><{}><{}><{}>}}'.format(cmin, rmin, cmax, rmax) else: bbox = '' return bbox def escape_markdown(text): # List of Markdown special characters that need to be escaped md_chars = ['<', '>'] # Escape each special character for char in md_chars: text = text.replace(char, '\\' + char) return text def reverse_escape(text): # Add safety check for None values if text is None: return "" md_chars = ['\\<', '\\>'] for char in md_chars: text = text.replace(char, char[1:]) return text colors = [ (255, 0, 0), (0, 255, 0), (0, 0, 255), (210, 210, 0), (255, 0, 255), (0, 255, 255), (114, 128, 250), (0, 165, 255), (0, 128, 0), (144, 238, 144), (238, 238, 175), (255, 191, 0), (0, 128, 0), (226, 43, 138), (255, 0, 255), (0, 
215, 255), ] color_map = { f"{color_id}": f"#{hex(color[2])[2:].zfill(2)}{hex(color[1])[2:].zfill(2)}{hex(color[0])[2:].zfill(2)}" for color_id, color in enumerate(colors) } used_colors = colors def get_first_frame(video_path): cap = cv2.VideoCapture(video_path) if not cap.isOpened(): print("Error: Cannot open video.") return None ret, frame = cap.read() cap.release() if ret: return frame else: print("Error: Cannot read frame from video.") return None def visualize_all_bbox_together(image, generation): if image is None: return None, '' if isinstance(image, str): # is a image path raw_image = get_first_frame(image) if raw_image is None: return None, '' frame_rgb = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) image = Image.fromarray(frame_rgb) generation = html.unescape(generation) image_width, image_height = image.size image = image.resize([500, int(500 / image_width * image_height)]) image_width, image_height = image.size string_list = extract_substrings(generation) if string_list: # it is grounding or detection mode = 'all' entities = defaultdict(list) i = 0 j = 0 for string in string_list: try: obj, string = string.split('
') except ValueError: print('wrong string: ', string) continue bbox_list = string.split('(.*?)
', colored_phrases, generation) else: generation_colored = '' pil_image = Image.fromarray(new_image) return pil_image, generation_colored def gradio_reset(chat_state, img_list): if chat_state is not None: chat_state.messages = [] if img_list is not None: img_list = [] return None, gr.update(value=None, interactive=True), gr.update(placeholder='Upload your image and chat', interactive=True), chat_state, img_list def image_upload_trigger(gr_img, upload_flag, replace_flag, img_list): # set the upload flag to true when receive a new image. # if there is an old image (and old conversation), set the replace flag to true to reset the conv later. print(f"Image upload triggered: {gr_img}") upload_flag = 1 if img_list: replace_flag = 1 return upload_flag, replace_flag def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag, replace_flag): print("+++gradio_ask+++") print(f"gr_img: {gr_img}, type: {type(gr_img)}") print(f"upload_flag: {upload_flag}, replace_flag: {replace_flag}") if len(user_message) == 0: text_box_show = 'Input should not be empty!' else: text_box_show = '' print('user_message:', user_message) print('chatbot:', chatbot) print('chat_state:', chat_state) if isinstance(gr_img, dict): gr_img, mask = gr_img['image'], gr_img['mask'] else: mask = None if '[identify]' in user_message: # check if user provide bbox in the text input integers = re.findall(r'-?\d+', user_message) if len(integers) != 4: # no bbox in text bbox = mask2bbox(mask) user_message = user_message + bbox if chat_state is None: chat_state = CONV_VISION.copy() # Always process the image if it exists and upload_flag is set or img_list is empty if gr_img is not None and (upload_flag or len(img_list) == 0): if replace_flag: chat_state = CONV_VISION.copy() # new image, reset everything replace_flag = 0 chatbot = [] img_list = [] try: llm_message = chat.upload_img(gr_img, chat_state, img_list) print(f"Image uploaded successfully. 
img_list length: {len(img_list)}") except Exception as e: print(f"Error uploading image: {e}") return "Error uploading image. Please try again.", chatbot, chat_state, img_list, 0, replace_flag upload_flag = 0 elif gr_img is None: return "Please upload a video first.", chatbot, chat_state, img_list, upload_flag, replace_flag chat.ask(user_message, chat_state) print('user_message: ', user_message) print('chat_state: ', chat_state) chatbot = chatbot + [[user_message, None]] if '[identify]' in user_message: visual_img, _ = visualize_all_bbox_together(gr_img, user_message) if visual_img is not None: file_path = save_tmp_img(visual_img) chatbot = chatbot + [[(file_path,), None]] return text_box_show, chatbot, chat_state, img_list, upload_flag, replace_flag def gradio_answer(chatbot, chat_state, img_list, temperature): print("--gradio_answer--") # print('img_list: ', img_list) llm_message = chat.answer(conv=chat_state, img_list=img_list, temperature=temperature, max_new_tokens=500, max_length=2000)[0] chatbot[-1][1] = llm_message print('gradio_answer: ', llm_message) return chatbot, chat_state def process_english_text(text): if len(text) < 2: return text text = text[0].upper() + text[1:] sentences = text.split('. ') corrected_sentences = [s.capitalize() for s in sentences] text = '. '.join(corrected_sentences) if text.endswith(','): text = text[:-1] if not text.endswith('.'): text += '.' return text @spaces.GPU def gradio_stream_answer(chatbot, chat_state, img_list, temperature): print('---gradio_stream_answer---') print(f"img_list length: {len(img_list)}") # Check if img_list is empty if len(img_list) == 0: error_msg = "No image/video uploaded. Please upload a video first." 
print(error_msg) if len(chatbot) > 0: chatbot[-1][1] = error_msg yield chatbot, chat_state return if len(img_list) > 0: if not isinstance(img_list[0], torch.Tensor): chat.encode_img(img_list) print(chat) try: streamer = chat.stream_answer(conv=chat_state, img_list=img_list, temperature=temperature, max_new_tokens=500, max_length=2000) output = '' print('streamer:', streamer) for new_output in streamer: escapped = escape_markdown(new_output) output += escapped chatbot[-1][1] = output chatbot[-1][1] = process_english_text(chatbot[-1][1]) yield chatbot, chat_state chat_state.messages[-1][1] = '' print('output:', output) except Exception as e: error_msg = f"Error generating response: {str(e)}" print(error_msg) if len(chatbot) > 0: chatbot[-1][1] = error_msg yield chatbot, chat_state return chatbot, chat_state def gradio_visualize(chatbot, gr_img): # Safety check for empty chatbot or None response if len(chatbot) == 0 or chatbot[-1][1] is None: return chatbot if isinstance(gr_img, dict): gr_img, mask = gr_img['image'], gr_img['mask'] unescaped = reverse_escape(chatbot[-1][1]) visual_img, generation_color = visualize_all_bbox_together(gr_img, unescaped) if visual_img is not None: if len(generation_color): chatbot[-1][1] = generation_color file_path = save_tmp_img(visual_img) chatbot = chatbot + [[None, (file_path,)]] return chatbot def gradio_taskselect(idx): prompt_list = [ '', '[reason] ', '[emotion] ', '[visual] ', '[audio] ' ] instruct_list = [ '**Hint:** Type in whatever you want', '**Hint:** Send the command to multimodal emotion reasoning', '**Hint:** Send the command to multimodal emotion recognition', '**Hint:** Send the command to generate visual description', '**Hint:** Send the command to generate audio description' ] return prompt_list[idx], instruct_list[idx] chat = Chat(model, vis_processor, device=device) title = """