ZebangCheng committed (verified)
Commit 640c1d6 · Parent: f41c1e4

gradio 5 (#4)

- support gradio 5 and ZeroGPU (add017d61b33ddb6d8aee6d02df9d452e7dcd044)
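Editor's note: the ZeroGPU half of this change appears in the diff below as the new "import spaces" plus a @spaces.GPU decorator on gradio_stream_answer. As a hedged illustration of that pattern (not code from this repo; the function name is a placeholder, and the spaces package is only available inside a Hugging Face Space), a ZeroGPU handler typically looks like:

import spaces

@spaces.GPU  # a GPU is attached only for the duration of each call
def run_inference(prompt: str) -> str:
    # heavy CUDA work belongs here; outside this call no GPU is allocated
    return prompt.upper()  # placeholder for the real model forward pass

On ZeroGPU hardware the process starts on CPU; Spaces allocates a GPU when a decorated function is entered and releases it when the call returns.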

Files changed (2):
  1. app.py            +96 -67
  2. requirements.txt   +4 -3
app.py CHANGED

@@ -1,28 +1,19 @@
 import argparse
 import os
-os.system("pip uninstall -y gradio")
-os.system("pip install gradio==3.47.1")
-
 import random
 from collections import defaultdict
-
 import cv2
 import re
-
 import numpy as np
 from PIL import Image
 import torch
 import html
 import gradio as gr
-
 import torchvision.transforms as T
 import torch.backends.cudnn as cudnn
-
 from minigpt4.common.config import Config
-
 from minigpt4.common.registry import registry
 from minigpt4.conversation.conversation import Conversation, SeparatorStyle, Chat
-
 # imports modules for registration
 from minigpt4.datasets.builders import *
 from minigpt4.models import *
@@ -32,6 +23,7 @@ from minigpt4.tasks import *
 
 import socket
 import os
+import spaces
 
 def find_free_port(start_port, end_port):
     for port in range(start_port, end_port + 1):
@@ -173,6 +165,10 @@ def escape_markdown(text):
 
 
 def reverse_escape(text):
+    # Add safety check for None values
+    if text is None:
+        return ""
+
     md_chars = ['\\<', '\\>']
 
     for char in md_chars:
@@ -229,6 +225,8 @@ def visualize_all_bbox_together(image, generation):
 
     if isinstance(image, str):  # is a image path
         raw_image = get_first_frame(image)
+        if raw_image is None:
+            return None, ''
         frame_rgb = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB)
         image = Image.fromarray(frame_rgb)
 
@@ -431,27 +429,20 @@ def gradio_reset(chat_state, img_list):
         interactive=True), chat_state, img_list
 
 
-def image_upload_trigger(upload_flag, replace_flag, img_list):
+def image_upload_trigger(gr_img, upload_flag, replace_flag, img_list):
     # set the upload flag to true when receive a new image.
     # if there is an old image (and old conversation), set the replace flag to true to reset the conv later.
+    print(f"Image upload triggered: {gr_img}")
     upload_flag = 1
     if img_list:
         replace_flag = 1
     return upload_flag, replace_flag
 
 
-def example_trigger(text_input, image, upload_flag, replace_flag, img_list):
-    # set the upload flag to true when receive a new image.
-    # if there is an old image (and old conversation), set the replace flag to true to reset the conv later.
-    upload_flag = 1
-    if img_list or replace_flag == 1:
-        replace_flag = 1
-
-    return upload_flag, replace_flag
-
-
 def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag, replace_flag):
     print("+++gradio_ask+++")
+    print(f"gr_img: {gr_img}, type: {type(gr_img)}")
+    print(f"upload_flag: {upload_flag}, replace_flag: {replace_flag}")
 
     if len(user_message) == 0:
         text_box_show = 'Input should not be empty!'
@@ -462,7 +453,6 @@ def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag,
     print('chatbot:', chatbot)
     print('chat_state:', chat_state)
 
-
     if isinstance(gr_img, dict):
         gr_img, mask = gr_img['image'], gr_img['mask']
     else:
@@ -478,14 +468,22 @@ def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag,
     if chat_state is None:
         chat_state = CONV_VISION.copy()
 
-    if upload_flag:
+    # Always process the image if it exists and upload_flag is set or img_list is empty
+    if gr_img is not None and (upload_flag or len(img_list) == 0):
         if replace_flag:
             chat_state = CONV_VISION.copy()  # new image, reset everything
             replace_flag = 0
             chatbot = []
             img_list = []
-        llm_message = chat.upload_img(gr_img, chat_state, img_list)
+        try:
+            llm_message = chat.upload_img(gr_img, chat_state, img_list)
+            print(f"Image uploaded successfully. img_list length: {len(img_list)}")
+        except Exception as e:
+            print(f"Error uploading image: {e}")
+            return "Error uploading image. Please try again.", chatbot, chat_state, img_list, 0, replace_flag
         upload_flag = 0
+    elif gr_img is None:
+        return "Please upload a video first.", chatbot, chat_state, img_list, upload_flag, replace_flag
 
     chat.ask(user_message, chat_state)
     print('user_message: ', user_message)
@@ -531,32 +529,56 @@ def process_english_text(text):
 
     return text
 
-
+@spaces.GPU
 def gradio_stream_answer(chatbot, chat_state, img_list, temperature):
     print('---gradio_stream_answer---')
+    print(f"img_list length: {len(img_list)}")
+
+    # Check if img_list is empty
+    if len(img_list) == 0:
+        error_msg = "No image/video uploaded. Please upload a video first."
+        print(error_msg)
+        if len(chatbot) > 0:
+            chatbot[-1][1] = error_msg
+        yield chatbot, chat_state
+        return
+
     if len(img_list) > 0:
         if not isinstance(img_list[0], torch.Tensor):
             chat.encode_img(img_list)
     print(chat)
-    streamer = chat.stream_answer(conv=chat_state,
-                                  img_list=img_list,
-                                  temperature=temperature,
-                                  max_new_tokens=500,
-                                  max_length=2000)
-    output = ''
-    print('streamer:', streamer)
-    for new_output in streamer:
-        escapped = escape_markdown(new_output)
-        output += escapped
-        chatbot[-1][1] = output
-        chatbot[-1][1] = process_english_text(chatbot[-1][1])
+
+    try:
+        streamer = chat.stream_answer(conv=chat_state,
+                                      img_list=img_list,
+                                      temperature=temperature,
+                                      max_new_tokens=500,
+                                      max_length=2000)
+        output = ''
+        print('streamer:', streamer)
+        for new_output in streamer:
+            escapped = escape_markdown(new_output)
+            output += escapped
+            chatbot[-1][1] = output
+            chatbot[-1][1] = process_english_text(chatbot[-1][1])
+            yield chatbot, chat_state
+        chat_state.messages[-1][1] = '</s>'
+        print('output:', output)
+    except Exception as e:
+        error_msg = f"Error generating response: {str(e)}"
+        print(error_msg)
+        if len(chatbot) > 0:
+            chatbot[-1][1] = error_msg
         yield chatbot, chat_state
-    chat_state.messages[-1][1] = '</s>'
-    print('output:', output)
+
     return chatbot, chat_state
 
 
 def gradio_visualize(chatbot, gr_img):
+    # Safety check for empty chatbot or None response
+    if len(chatbot) == 0 or chatbot[-1][1] is None:
+        return chatbot
+
     if isinstance(gr_img, dict):
         gr_img, mask = gr_img['image'], gr_img['mask']
 
@@ -589,8 +611,6 @@ def gradio_taskselect(idx):
     return prompt_list[idx], instruct_list[idx]
 
 
-
-
 chat = Chat(model, vis_processor, device=device)
 
 title = """<h1 align="center">Emotion-LLaMA Demo</h1>"""
@@ -604,11 +624,11 @@ For Abilities Involging Multimodal Emotion Understanding:
 3. Visual: Click **Send** to generate a visual description.
 4. Audio: Click **Send** to generate an audio description.
 5. No Tag: Input whatever you want and click **Send** without any tagging.
-
 You can also simply chat in free form!
 '''
 
 text_input = gr.Textbox(placeholder='Upload your image and chat', interactive=True, show_label=False, container=False, scale=8)
+
 with gr.Blocks() as demo:
     gr.Markdown(title)
     # gr.Markdown(description)
@@ -637,6 +657,7 @@ with gr.Blocks() as demo:
             img_list = gr.State(value=[])
             chatbot = gr.Chatbot(label='Emotion-LLaMA')
 
+            # Updated Dataset component for Gradio 5
            dataset = gr.Dataset(
                 components=[gr.Textbox(visible=False)],
                 samples=[['No Tag'], ['reason'], ['emotion'], ['visual'], ['audio']],
@@ -650,36 +671,44 @@ with gr.Blocks() as demo:
 
     upload_flag = gr.State(value=0)
     replace_flag = gr.State(value=0)
-    image.upload(image_upload_trigger, [upload_flag, replace_flag, img_list], [upload_flag, replace_flag])
+
+    # Updated upload trigger for Gradio 5 - fixed parameter order
+    image.upload(image_upload_trigger, [image, upload_flag, replace_flag, img_list], [upload_flag, replace_flag])
 
+    # Updated Examples component for Gradio 5 - this is the key fix!
     with gr.Row():
         with gr.Column():
-            gr.Examples(examples=[
-                ["examples/samplenew_00004251.mp4", "[detection] face", upload_flag, replace_flag, img_list],
-                ["examples/sample_00000338.mp4", "The person in video says: Oh no, my phone and wallet are all in my bag. [emotion] Please determine which emotion label in the video represents: happy, sad, neutral, angry, worried, surprise.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00000669.mp4", "The person in video says: Why are you looking at me like this? It's just a woman, so you have to have something to do with me. [emotion] Determine the emotional state shown in the video, choosing from happy, sad, neutral, angry, worried, or surprise.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00003462.mp4", "The person in video says: Do you believe that you push me around? [emotion] Assess and label the emotion evident in the video: could it be happy, sad, neutral, angry, worried, surprise?", upload_flag, replace_flag, img_list],
-                ["examples/sample_00000727.mp4", "The person in video says: No, this, I have to get up! You, I'm sorry, everyone. I'm sorry, it's from the German side. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, worried, or surprise?", upload_flag, replace_flag, img_list],
-                ["examples/samplenew_00061200.mp4", "The person in video says: Me: I'm not going in anymore, scared. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, fear, contempt, doubt, worried, or surprise?", upload_flag, replace_flag, img_list],
-            ], inputs=[image, text_input, upload_flag, replace_flag, img_list], fn=example_trigger,
-                outputs=[upload_flag, replace_flag])
+            examples1 = gr.Examples(
+                examples=[
+                    ["examples/samplenew_00004251.mp4", "[detection] face"],
+                    ["examples/sample_00000338.mp4", "The person in video says: Oh no, my phone and wallet are all in my bag. [emotion] Please determine which emotion label in the video represents: happy, sad, neutral, angry, worried, surprise."],
+                    ["examples/sample_00000669.mp4", "The person in video says: Why are you looking at me like this? It's just a woman, so you have to have something to do with me. [emotion] Determine the emotional state shown in the video, choosing from happy, sad, neutral, angry, worried, or surprise."],
+                    ["examples/sample_00003462.mp4", "The person in video says: Do you believe that you push me around? [emotion] Assess and label the emotion evident in the video: could it be happy, sad, neutral, angry, worried, surprise?"],
+                    ["examples/sample_00000727.mp4", "The person in video says: No, this, I have to get up! You, I'm sorry, everyone. I'm sorry, it's from the German side. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, worried, or surprise?"],
+                    ["examples/samplenew_00061200.mp4", "The person in video says: Me: I'm not going in anymore, scared. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, fear, contempt, doubt, worried, or surprise?"],
+                ],
+                inputs=[image, text_input],
+                # Remove fn and outputs - let Examples handle this automatically in Gradio 5
+            )
         with gr.Column():
-            gr.Examples(examples=[
-                ["examples/samplenew_00051251.mp4", "In what state is the person in the video, say the following: \"Do you really think so?\"", upload_flag, replace_flag, img_list],
-                ["examples/sample_00004735.mp4", "[visual] What are the emotions of the woman in the video?", upload_flag, replace_flag, img_list],
-                ["examples/sample_00002422.mp4", "[audio] Analyze the speaker's voice in the video.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00001073.mp4", "The person in video says: Make him different from before. I like the way you are now. [reason] Please analyze all the clues in the video and reason out the emotional label of the person in the video.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00004671.mp4", "The person in video says: Won't you? Impossible! Fan Xiaomei is not such a person. [reason] What are the facial expressions and vocal tone used in the video? What is the intended meaning behind his words? Which emotion does this reflect?", upload_flag, replace_flag, img_list],
-                ["examples/sample_00005854.mp4", "The person in video says: Bastard! Boss, you don't choose, you prefer. [reason] Please integrate information from various modalities to infer the emotional category of the person in the video.", upload_flag, replace_flag, img_list],
-            ], inputs=[image, text_input, upload_flag, replace_flag, img_list], fn=example_trigger,
-                outputs=[upload_flag, replace_flag])
+            examples2 = gr.Examples(
+                examples=[
+                    ["examples/samplenew_00051251.mp4", "In what state is the person in the video, say the following: \"Do you really think so?\""],
+                    ["examples/sample_00004735.mp4", "[visual] What are the emotions of the woman in the video?"],
+                    ["examples/sample_00002422.mp4", "[audio] Analyze the speaker's voice in the video."],
+                    ["examples/sample_00001073.mp4", "The person in video says: Make him different from before. I like the way you are now. [reason] Please analyze all the clues in the video and reason out the emotional label of the person in the video."],
+                    ["examples/sample_00004671.mp4", "The person in video says: Won't you? Impossible! Fan Xiaomei is not such a person. [reason] What are the facial expressions and vocal tone used in the video? What is the intended meaning behind his words? Which emotion does this reflect?"],
+                    ["examples/sample_00005854.mp4", "The person in video says: Bastard! Boss, you don't choose, you prefer. [reason] Please integrate information from various modalities to infer the emotional category of the person in the video."],
+                ],
+                inputs=[image, text_input],
+                # Remove fn and outputs - let Examples handle this automatically in Gradio 5
+            )
 
     dataset.click(
         gradio_taskselect,
         inputs=[dataset],
         outputs=[text_input, task_inst],
         show_progress="hidden",
-        postprocess=False,
        queue=False,
    )
 
@@ -687,11 +716,11 @@ with gr.Blocks() as demo:
         gradio_ask,
         [text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag],
         [text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False
-    ).success(
+    ).then(
         gradio_stream_answer,
         [chatbot, chat_state, img_list, temperature],
         [chatbot, chat_state]
-    ).success(
+    ).then(
         gradio_visualize,
         [chatbot, image],
         [chatbot],
@@ -702,11 +731,11 @@ with gr.Blocks() as demo:
         gradio_ask,
         [text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag],
         [text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False
-    ).success(
+    ).then(
         gradio_stream_answer,
         [chatbot, chat_state, img_list, temperature],
         [chatbot, chat_state]
-    ).success(
+    ).then(
         gradio_visualize,
         [chatbot, image],
         [chatbot],
@@ -715,5 +744,5 @@ with gr.Blocks() as demo:
 
     clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, chat_state, img_list], queue=False)
 
-demo.launch(share=True, enable_queue=True)
-# demo.launch(share=True)
+demo.queue()
+demo.launch(share=True)
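Editor's note on the gr.Examples change above: the old code smuggled upload_flag, replace_flag and img_list into each example row and ran example_trigger through fn/outputs; the commit drops fn and outputs, so in Gradio 5 clicking an example row only fills the listed input components, and the flags are now set by the image.upload listener (which also receives the image component itself). A minimal hedged sketch of that fill-only pattern, standalone with placeholder components and a placeholder handler, not the demo itself:

import gradio as gr

def echo(text):
    # placeholder handler; the real app routes input through gradio_ask
    return f"received: {text}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Textbox(label="Result")
    send = gr.Button("Send")
    # With no fn/outputs, a click only populates the inputs; nothing runs.
    gr.Examples(
        examples=[["[emotion] happy or sad?"], ["[reason] explain the clues"]],
        inputs=[prompt],
    )
    send.click(echo, [prompt], [result])

demo.launch()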
 
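Two of the Gradio 5 changes above are behavioral, not just renames. First, the .success() chains became .then(): .then() schedules the next step regardless of whether the previous one raised, while .success() fires only on success, so the error messages yielded by gradio_stream_answer still reach gradio_visualize. Second, launch(enable_queue=True) no longer exists in Gradio 4+, so queuing is enabled with an explicit demo.queue() call before launch. A hedged, self-contained sketch of the chained-event pattern with placeholder handlers (it mirrors the app's tuple-style chat history):

import gradio as gr

def ask(message, history):
    # clear the textbox and append the user turn, as gradio_ask does
    return "", history + [[message, None]]

def answer(history):
    # fill in the assistant turn; .then() runs this even if ask() raised
    history[-1][1] = "echo: " + history[-1][0]
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    box = gr.Textbox()
    box.submit(ask, [box, chatbot], [box, chatbot], queue=False).then(
        answer, [chatbot], [chatbot]
    )

demo.queue()   # replaces the removed launch(..., enable_queue=True)
demo.launch()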
requirements.txt CHANGED

@@ -1,5 +1,3 @@
-gradio==3.47.1
-gradio_client==0.6.0
 decorator==4.4.2
 moviepy==1.0.3
 decord==0.6.0
@@ -15,6 +13,9 @@ bitsandbytes==0.41.0
 scipy
 huggingface_hub
 torch==2.1.2
-torchvision==0.15.1
+torchvision
 timm==0.6.13
 transformers==4.30.0
+gradio
+gradio_client
+numpy<2.0
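On the dependency side: gradio and gradio_client are left unpinned because, on Hugging Face Spaces, the Gradio version is normally taken from the sdk_version field in the Space's README metadata; torchvision is unpinned so pip can resolve a build compatible with torch==2.1.2; and numpy<2.0 is pinned because torch 2.1.x predates the NumPy 2.0 ABI break. As a hedged aside, a startup guard enforcing that last constraint could look like this (optional, not part of this commit):

import numpy as np

# torch==2.1.2 wheels were compiled against NumPy 1.x, so fail fast on 2.x
major = int(np.__version__.split(".")[0])
assert major < 2, f"NumPy {np.__version__} is too new for torch 2.1.2"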