import argparse import os import random from collections import defaultdict import cv2 import re import numpy as np from PIL import Image import torch import html import gradio as gr import torchvision.transforms as T import torch.backends.cudnn as cudnn from minigpt4.common.config import Config from minigpt4.common.registry import registry from minigpt4.conversation.conversation import Conversation, SeparatorStyle, Chat # imports modules for registration from minigpt4.datasets.builders import * from minigpt4.models import * from minigpt4.processors import * from minigpt4.runners import * from minigpt4.tasks import * import socket import os import spaces def find_free_port(start_port, end_port): for port in range(start_port, end_port + 1): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: if sock.connect_ex(('localhost', port)) != 0: # Port is not open return port raise OSError(f"Cannot find empty port in range: {start_port}-{end_port}") def set_gradio_server_port(): start_port = 7870 end_port = 9999 free_port = find_free_port(start_port, end_port) os.environ["GRADIO_SERVER_PORT"] = str(free_port) print(f"Set GRADIO_SERVER_PORT to {free_port}") # Set GRADIO_SERVER_PORT set_gradio_server_port() def parse_args(): parser = argparse.ArgumentParser(description="Demo") parser.add_argument("--cfg-path", default='eval_configs/demo.yaml', help="path to configuration file.") parser.add_argument( "--options", nargs="+", help="override some settings in the used config, the key-value pair " "in xxx=yyy format will be merged into config file (deprecate), " "change to --cfg-options instead.", ) args = parser.parse_args() return args random.seed(42) np.random.seed(42) torch.manual_seed(42) cudnn.benchmark = False cudnn.deterministic = True print('Initializing Chat') args = parse_args() cfg = Config(args) device = 'cuda' model_config = cfg.model_cfg print("model_config:", model_config) model_cls = registry.get_model_class(model_config.arch) model = model_cls.from_config(model_config).to(device) bounding_box_size = 100 vis_processor_cfg = cfg.datasets_cfg.feature_face_caption.vis_processor.train vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) model = model.eval() CONV_VISION = Conversation( system="", roles=(r"[INST] ", r" [/INST]"), messages=[], offset=2, sep_style=SeparatorStyle.SINGLE, sep="", ) def extract_substrings(string): # first check if there is no-finished bracket index = string.rfind('}') if index != -1: string = string[:index + 1] pattern = r'
(.*?)\}(?!<)' matches = re.findall(pattern, string) substrings = [match for match in matches] return substrings def is_overlapping(rect1, rect2): x1, y1, x2, y2 = rect1 x3, y3, x4, y4 = rect2 return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4) def computeIoU(bbox1, bbox2): x1, y1, x2, y2 = bbox1 x3, y3, x4, y4 = bbox2 intersection_x1 = max(x1, x3) intersection_y1 = max(y1, y3) intersection_x2 = min(x2, x4) intersection_y2 = min(y2, y4) intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(0, intersection_y2 - intersection_y1 + 1) bbox1_area = (x2 - x1 + 1) * (y2 - y1 + 1) bbox2_area = (x4 - x3 + 1) * (y4 - y3 + 1) union_area = bbox1_area + bbox2_area - intersection_area iou = intersection_area / union_area return iou def save_tmp_img(visual_img): file_name = "".join([str(random.randint(0, 9)) for _ in range(5)]) + ".jpg" file_path = "/tmp/gradio" + file_name visual_img.save(file_path) return file_path def mask2bbox(mask): if mask is None: return '' mask = mask.resize([100, 100], resample=Image.NEAREST) mask = np.array(mask)[:, :, 0] rows = np.any(mask, axis=1) cols = np.any(mask, axis=0) if rows.sum(): # Get the top, bottom, left, and right boundaries rmin, rmax = np.where(rows)[0][[0, -1]] cmin, cmax = np.where(cols)[0][[0, -1]] bbox = '{{<{}><{}><{}><{}>}}'.format(cmin, rmin, cmax, rmax) else: bbox = '' return bbox def escape_markdown(text): # List of Markdown special characters that need to be escaped md_chars = ['<', '>'] # Escape each special character for char in md_chars: text = text.replace(char, '\\' + char) return text def reverse_escape(text): # Add safety check for None values if text is None: return "" md_chars = ['\\<', '\\>'] for char in md_chars: text = text.replace(char, char[1:]) return text colors = [ (255, 0, 0), (0, 255, 0), (0, 0, 255), (210, 210, 0), (255, 0, 255), (0, 255, 255), (114, 128, 250), (0, 165, 255), (0, 128, 0), (144, 238, 144), (238, 238, 175), (255, 191, 0), (0, 128, 0), (226, 43, 138), (255, 0, 255), (0, 215, 255), ] color_map = { f"{color_id}": f"#{hex(color[2])[2:].zfill(2)}{hex(color[1])[2:].zfill(2)}{hex(color[0])[2:].zfill(2)}" for color_id, color in enumerate(colors) } used_colors = colors def get_first_frame(video_path): cap = cv2.VideoCapture(video_path) if not cap.isOpened(): print("Error: Cannot open video.") return None ret, frame = cap.read() cap.release() if ret: return frame else: print("Error: Cannot read frame from video.") return None def visualize_all_bbox_together(image, generation): if image is None: return None, '' if isinstance(image, str): # is a image path raw_image = get_first_frame(image) if raw_image is None: return None, '' frame_rgb = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) image = Image.fromarray(frame_rgb) generation = html.unescape(generation) image_width, image_height = image.size image = image.resize([500, int(500 / image_width * image_height)]) image_width, image_height = image.size string_list = extract_substrings(generation) if string_list: # it is grounding or detection mode = 'all' entities = defaultdict(list) i = 0 j = 0 for string in string_list: try: obj, string = string.split('
') except ValueError: print('wrong string: ', string) continue bbox_list = string.split('') flag = False for bbox_string in bbox_list: integers = re.findall(r'-?\d+', bbox_string) if len(integers) == 4: x0, y0, x1, y1 = int(integers[0]), int(integers[1]), int(integers[2]), int(integers[3]) left = x0 / bounding_box_size * image_width bottom = y0 / bounding_box_size * image_height right = x1 / bounding_box_size * image_width top = y1 / bounding_box_size * image_height entities[obj].append([left, bottom, right, top]) j += 1 flag = True if flag: i += 1 else: integers = re.findall(r'-?\d+', generation) if len(integers) == 4: # it is refer mode = 'single' entities = list() x0, y0, x1, y1 = int(integers[0]), int(integers[1]), int(integers[2]), int(integers[3]) left = x0 / bounding_box_size * image_width bottom = y0 / bounding_box_size * image_height right = x1 / bounding_box_size * image_width top = y1 / bounding_box_size * image_height entities.append([left, bottom, right, top]) else: # don't detect any valid bbox to visualize return None, '' if len(entities) == 0: return None, '' if isinstance(image, Image.Image): image_h = image.height image_w = image.width image = np.array(image) elif isinstance(image, str): if os.path.exists(image): pil_img = Image.open(image).convert("RGB") image = np.array(pil_img)[:, :, [2, 1, 0]] image_h = pil_img.height image_w = pil_img.width else: raise ValueError(f"invaild image path, {image}") elif isinstance(image, torch.Tensor): image_tensor = image.cpu() reverse_norm_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:, None, None] reverse_norm_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:, None, None] image_tensor = image_tensor * reverse_norm_std + reverse_norm_mean pil_img = T.ToPILImage()(image_tensor) image_h = pil_img.height image_w = pil_img.width image = np.array(pil_img)[:, :, [2, 1, 0]] else: raise ValueError(f"invaild image format, {type(image)} for {image}") indices = list(range(len(entities))) new_image = image.copy() previous_bboxes = [] # size of text text_size = 0.5 # thickness of text text_line = 1 # int(max(1 * min(image_h, image_w) / 512, 1)) box_line = 2 (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line) base_height = int(text_height * 0.675) text_offset_original = text_height - base_height text_spaces = 2 # num_bboxes = sum(len(x[-1]) for x in entities) used_colors = colors # random.sample(colors, k=num_bboxes) color_id = -1 for entity_idx, entity_name in enumerate(entities): if mode == 'single' or mode == 'identify': bboxes = entity_name bboxes = [bboxes] else: bboxes = entities[entity_name] color_id += 1 for bbox_id, (x1_norm, y1_norm, x2_norm, y2_norm) in enumerate(bboxes): skip_flag = False orig_x1, orig_y1, orig_x2, orig_y2 = int(x1_norm), int(y1_norm), int(x2_norm), int(y2_norm) color = used_colors[entity_idx % len(used_colors)] # tuple(np.random.randint(0, 255, size=3).tolist()) new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line) if mode == 'all': l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1 x1 = orig_x1 - l_o y1 = orig_y1 - l_o if y1 < text_height + text_offset_original + 2 * text_spaces: y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces x1 = orig_x1 + r_o # add text background (text_width, text_height), _ = cv2.getTextSize(f" {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line) text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - ( text_height + text_offset_original + 2 * text_spaces), x1 + text_width, y1 for prev_bbox in previous_bboxes: if computeIoU((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox['bbox']) > 0.95 and \ prev_bbox['phrase'] == entity_name: skip_flag = True break while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox['bbox']): text_bg_y1 += (text_height + text_offset_original + 2 * text_spaces) text_bg_y2 += (text_height + text_offset_original + 2 * text_spaces) y1 += (text_height + text_offset_original + 2 * text_spaces) if text_bg_y2 >= image_h: text_bg_y1 = max(0, image_h - (text_height + text_offset_original + 2 * text_spaces)) text_bg_y2 = image_h y1 = image_h break if not skip_flag: alpha = 0.5 for i in range(text_bg_y1, text_bg_y2): for j in range(text_bg_x1, text_bg_x2): if i < image_h and j < image_w: if j < text_bg_x1 + 1.35 * c_width: # original color bg_color = color else: # white bg_color = [255, 255, 255] new_image[i, j] = (alpha * new_image[i, j] + (1 - alpha) * np.array(bg_color)).astype( np.uint8) cv2.putText( new_image, f" {entity_name}", (x1, y1 - text_offset_original - 1 * text_spaces), cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA ) previous_bboxes.append( {'bbox': (text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), 'phrase': entity_name}) if mode == 'all': def color_iterator(colors): while True: for color in colors: yield color color_gen = color_iterator(colors) # Add colors to phrases and remove
def colored_phrases(match): phrase = match.group(1) color = next(color_gen) return f'{phrase}' generation = re.sub(r'{<\d+><\d+><\d+><\d+>}|', '', generation) generation_colored = re.sub(r'
(.*?)
', colored_phrases, generation) else: generation_colored = '' pil_image = Image.fromarray(new_image) return pil_image, generation_colored def gradio_reset(chat_state, img_list): if chat_state is not None: chat_state.messages = [] if img_list is not None: img_list = [] return None, gr.update(value=None, interactive=True), gr.update(placeholder='Upload your image and chat', interactive=True), chat_state, img_list def image_upload_trigger(gr_img, upload_flag, replace_flag, img_list): # set the upload flag to true when receive a new image. # if there is an old image (and old conversation), set the replace flag to true to reset the conv later. print(f"Image upload triggered: {gr_img}") upload_flag = 1 if img_list: replace_flag = 1 return upload_flag, replace_flag def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag, replace_flag): print("+++gradio_ask+++") print(f"gr_img: {gr_img}, type: {type(gr_img)}") print(f"upload_flag: {upload_flag}, replace_flag: {replace_flag}") if len(user_message) == 0: text_box_show = 'Input should not be empty!' else: text_box_show = '' print('user_message:', user_message) print('chatbot:', chatbot) print('chat_state:', chat_state) if isinstance(gr_img, dict): gr_img, mask = gr_img['image'], gr_img['mask'] else: mask = None if '[identify]' in user_message: # check if user provide bbox in the text input integers = re.findall(r'-?\d+', user_message) if len(integers) != 4: # no bbox in text bbox = mask2bbox(mask) user_message = user_message + bbox if chat_state is None: chat_state = CONV_VISION.copy() # Always process the image if it exists and upload_flag is set or img_list is empty if gr_img is not None and (upload_flag or len(img_list) == 0): if replace_flag: chat_state = CONV_VISION.copy() # new image, reset everything replace_flag = 0 chatbot = [] img_list = [] try: llm_message = chat.upload_img(gr_img, chat_state, img_list) print(f"Image uploaded successfully. img_list length: {len(img_list)}") except Exception as e: print(f"Error uploading image: {e}") return "Error uploading image. Please try again.", chatbot, chat_state, img_list, 0, replace_flag upload_flag = 0 elif gr_img is None: return "Please upload a video first.", chatbot, chat_state, img_list, upload_flag, replace_flag chat.ask(user_message, chat_state) print('user_message: ', user_message) print('chat_state: ', chat_state) chatbot = chatbot + [[user_message, None]] if '[identify]' in user_message: visual_img, _ = visualize_all_bbox_together(gr_img, user_message) if visual_img is not None: file_path = save_tmp_img(visual_img) chatbot = chatbot + [[(file_path,), None]] return text_box_show, chatbot, chat_state, img_list, upload_flag, replace_flag def gradio_answer(chatbot, chat_state, img_list, temperature): print("--gradio_answer--") # print('img_list: ', img_list) llm_message = chat.answer(conv=chat_state, img_list=img_list, temperature=temperature, max_new_tokens=500, max_length=2000)[0] chatbot[-1][1] = llm_message print('gradio_answer: ', llm_message) return chatbot, chat_state def process_english_text(text): if len(text) < 2: return text text = text[0].upper() + text[1:] sentences = text.split('. ') corrected_sentences = [s.capitalize() for s in sentences] text = '. '.join(corrected_sentences) if text.endswith(','): text = text[:-1] if not text.endswith('.'): text += '.' return text @spaces.GPU def gradio_stream_answer(chatbot, chat_state, img_list, temperature): print('---gradio_stream_answer---') print(f"img_list length: {len(img_list)}") # Check if img_list is empty if len(img_list) == 0: error_msg = "No image/video uploaded. Please upload a video first." print(error_msg) if len(chatbot) > 0: chatbot[-1][1] = error_msg yield chatbot, chat_state return if len(img_list) > 0: if not isinstance(img_list[0], torch.Tensor): chat.encode_img(img_list) print(chat) try: streamer = chat.stream_answer(conv=chat_state, img_list=img_list, temperature=temperature, max_new_tokens=500, max_length=2000) output = '' print('streamer:', streamer) for new_output in streamer: escapped = escape_markdown(new_output) output += escapped chatbot[-1][1] = output chatbot[-1][1] = process_english_text(chatbot[-1][1]) yield chatbot, chat_state chat_state.messages[-1][1] = '' print('output:', output) except Exception as e: error_msg = f"Error generating response: {str(e)}" print(error_msg) if len(chatbot) > 0: chatbot[-1][1] = error_msg yield chatbot, chat_state return chatbot, chat_state def gradio_visualize(chatbot, gr_img): # Safety check for empty chatbot or None response if len(chatbot) == 0 or chatbot[-1][1] is None: return chatbot if isinstance(gr_img, dict): gr_img, mask = gr_img['image'], gr_img['mask'] unescaped = reverse_escape(chatbot[-1][1]) visual_img, generation_color = visualize_all_bbox_together(gr_img, unescaped) if visual_img is not None: if len(generation_color): chatbot[-1][1] = generation_color file_path = save_tmp_img(visual_img) chatbot = chatbot + [[None, (file_path,)]] return chatbot def gradio_taskselect(idx): prompt_list = [ '', '[reason] ', '[emotion] ', '[visual] ', '[audio] ' ] instruct_list = [ '**Hint:** Type in whatever you want', '**Hint:** Send the command to multimodal emotion reasoning', '**Hint:** Send the command to multimodal emotion recognition', '**Hint:** Send the command to generate visual description', '**Hint:** Send the command to generate audio description' ] return prompt_list[idx], instruct_list[idx] chat = Chat(model, vis_processor, device=device) title = """

Emotion-LLaMA Demo

""" description = 'Welcome to Our Emotion-LLaMA Chatbot Demo!' article = """

""" introduction = ''' For Abilities Involging Multimodal Emotion Understanding: 1. Reason: Click **Send** to generate a multimodal emotion description. 2. Emotion: Click **Send** to generate an emotion label. 3. Visual: Click **Send** to generate a visual description. 4. Audio: Click **Send** to generate an audio description. 5. No Tag: Input whatever you want and click **Send** without any tagging. You can also simply chat in free form! ''' text_input = gr.Textbox(placeholder='Upload your image and chat', interactive=True, show_label=False, container=False, scale=8) with gr.Blocks() as demo: gr.Markdown(title) # gr.Markdown(description) gr.Markdown(article) with gr.Row(): with gr.Column(scale=0.5): # image = gr.Image(type="pil", tool='sketch', brush_radius=20) image = gr.Video(sources=["upload", "webcam"]) temperature = gr.Slider( minimum=0.1, maximum=1.5, value=0.2, step=0.1, interactive=True, label="Temperature", ) clear = gr.Button("Restart") gr.Markdown(introduction) with gr.Column(): chat_state = gr.State(value=None) img_list = gr.State(value=[]) chatbot = gr.Chatbot(label='Emotion-LLaMA') # Updated Dataset component for Gradio 5 dataset = gr.Dataset( components=[gr.Textbox(visible=False)], samples=[['No Tag'], ['reason'], ['emotion'], ['visual'], ['audio']], type="index", label='Task Shortcuts', ) task_inst = gr.Markdown('**Hint:** Upload your video and chat') with gr.Row(): text_input.render() send = gr.Button("Send", variant='primary', size='sm', scale=1) upload_flag = gr.State(value=0) replace_flag = gr.State(value=0) # Updated upload trigger for Gradio 5 - fixed parameter order image.upload(image_upload_trigger, [image, upload_flag, replace_flag, img_list], [upload_flag, replace_flag]) # Updated Examples component for Gradio 5 - this is the key fix! with gr.Row(): with gr.Column(): examples1 = gr.Examples( examples=[ ["examples/samplenew_00004251.mp4", "[detection] face"], ["examples/sample_00000338.mp4", "The person in video says: Oh no, my phone and wallet are all in my bag. [emotion] Please determine which emotion label in the video represents: happy, sad, neutral, angry, worried, surprise."], ["examples/sample_00000669.mp4", "The person in video says: Why are you looking at me like this? It's just a woman, so you have to have something to do with me. [emotion] Determine the emotional state shown in the video, choosing from happy, sad, neutral, angry, worried, or surprise."], ["examples/sample_00003462.mp4", "The person in video says: Do you believe that you push me around? [emotion] Assess and label the emotion evident in the video: could it be happy, sad, neutral, angry, worried, surprise?"], ["examples/sample_00000727.mp4", "The person in video says: No, this, I have to get up! You, I'm sorry, everyone. I'm sorry, it's from the German side. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, worried, or surprise?"], ["examples/samplenew_00061200.mp4", "The person in video says: Me: I'm not going in anymore, scared. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, fear, contempt, doubt, worried, or surprise?"], ], inputs=[image, text_input], # Remove fn and outputs - let Examples handle this automatically in Gradio 5 ) with gr.Column(): examples2 = gr.Examples( examples=[ ["examples/samplenew_00051251.mp4", "In what state is the person in the video, say the following: \"Do you really think so?\""], ["examples/sample_00004735.mp4", "[visual] What are the emotions of the woman in the video?"], ["examples/sample_00002422.mp4", "[audio] Analyze the speaker's voice in the video."], ["examples/sample_00001073.mp4", "The person in video says: Make him different from before. I like the way you are now. [reason] Please analyze all the clues in the video and reason out the emotional label of the person in the video."], ["examples/sample_00004671.mp4", "The person in video says: Won't you? Impossible! Fan Xiaomei is not such a person. [reason] What are the facial expressions and vocal tone used in the video? What is the intended meaning behind his words? Which emotion does this reflect?"], ["examples/sample_00005854.mp4", "The person in video says: Bastard! Boss, you don't choose, you prefer. [reason] Please integrate information from various modalities to infer the emotional category of the person in the video."], ], inputs=[image, text_input], # Remove fn and outputs - let Examples handle this automatically in Gradio 5 ) dataset.click( gradio_taskselect, inputs=[dataset], outputs=[text_input, task_inst], show_progress="hidden", queue=False, ) text_input.submit( gradio_ask, [text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag], [text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False ).then( gradio_stream_answer, [chatbot, chat_state, img_list, temperature], [chatbot, chat_state] ).then( gradio_visualize, [chatbot, image], [chatbot], queue=False, ) send.click( gradio_ask, [text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag], [text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False ).then( gradio_stream_answer, [chatbot, chat_state, img_list, temperature], [chatbot, chat_state] ).then( gradio_visualize, [chatbot, image], [chatbot], queue=False, ) clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, chat_state, img_list], queue=False) demo.queue() demo.launch(share=True)