ZebangCheng committed (verified)
Commit 640c1d6 · Parent: f41c1e4

gradio 5 (#4)

- support gradio 5 and ZeroGPU (add017d61b33ddb6d8aee6d02df9d452e7dcd044)
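Editor's note: the ZeroGPU half of this change appears in the diff below as the new "import spaces" plus a @spaces.GPU decorator on gradio_stream_answer. As a hedged illustration of that pattern (not code from this repo; the function name is a placeholder, and the spaces package is only available inside a Hugging Face Space), a ZeroGPU handler typically looks like:

import spaces

@spaces.GPU  # a GPU is attached only for the duration of each call
def run_inference(prompt: str) -> str:
    # heavy CUDA work belongs here; outside this call no GPU is allocated
    return prompt.upper()  # placeholder for the real model forward pass

On ZeroGPU hardware the process starts on CPU; Spaces allocates a GPU when a decorated function is entered and releases it when the call returns.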

Files changed (2):
  1. app.py            +96 -67
  2. requirements.txt   +4 -3
app.py CHANGED

@@ -1,28 +1,19 @@
 import argparse
 import os
-os.system("pip uninstall -y gradio")
-os.system("pip install gradio==3.47.1")
-
 import random
 from collections import defaultdict
-
 import cv2
 import re
-
 import numpy as np
 from PIL import Image
 import torch
 import html
 import gradio as gr
-
 import torchvision.transforms as T
 import torch.backends.cudnn as cudnn
-
 from minigpt4.common.config import Config
-
 from minigpt4.common.registry import registry
 from minigpt4.conversation.conversation import Conversation, SeparatorStyle, Chat
-
 # imports modules for registration
 from minigpt4.datasets.builders import *
 from minigpt4.models import *
@@ -32,6 +23,7 @@ from minigpt4.tasks import *
 
 import socket
 import os
+import spaces
 
 def find_free_port(start_port, end_port):
     for port in range(start_port, end_port + 1):
@@ -173,6 +165,10 @@ def escape_markdown(text):
 
 
 def reverse_escape(text):
+    # Add safety check for None values
+    if text is None:
+        return ""
+
     md_chars = ['\\<', '\\>']
 
     for char in md_chars:
@@ -229,6 +225,8 @@ def visualize_all_bbox_together(image, generation):
 
     if isinstance(image, str):  # is a image path
         raw_image = get_first_frame(image)
+        if raw_image is None:
+            return None, ''
         frame_rgb = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB)
         image = Image.fromarray(frame_rgb)
 
@@ -431,27 +429,20 @@ def gradio_reset(chat_state, img_list):
         interactive=True), chat_state, img_list
 
 
-def image_upload_trigger(upload_flag, replace_flag, img_list):
+def image_upload_trigger(gr_img, upload_flag, replace_flag, img_list):
     # set the upload flag to true when receive a new image.
     # if there is an old image (and old conversation), set the replace flag to true to reset the conv later.
+    print(f"Image upload triggered: {gr_img}")
     upload_flag = 1
     if img_list:
         replace_flag = 1
     return upload_flag, replace_flag
 
 
-def example_trigger(text_input, image, upload_flag, replace_flag, img_list):
-    # set the upload flag to true when receive a new image.
-    # if there is an old image (and old conversation), set the replace flag to true to reset the conv later.
-    upload_flag = 1
-    if img_list or replace_flag == 1:
-        replace_flag = 1
-
-    return upload_flag, replace_flag
-
-
 def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag, replace_flag):
     print("+++gradio_ask+++")
+    print(f"gr_img: {gr_img}, type: {type(gr_img)}")
+    print(f"upload_flag: {upload_flag}, replace_flag: {replace_flag}")
 
     if len(user_message) == 0:
         text_box_show = 'Input should not be empty!'
@@ -462,7 +453,6 @@ def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag,
     print('chatbot:', chatbot)
     print('chat_state:', chat_state)
 
-
     if isinstance(gr_img, dict):
         gr_img, mask = gr_img['image'], gr_img['mask']
     else:
@@ -478,14 +468,22 @@ def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag,
     if chat_state is None:
         chat_state = CONV_VISION.copy()
 
-    if upload_flag:
+    # Always process the image if it exists and upload_flag is set or img_list is empty
+    if gr_img is not None and (upload_flag or len(img_list) == 0):
         if replace_flag:
             chat_state = CONV_VISION.copy()  # new image, reset everything
             replace_flag = 0
             chatbot = []
             img_list = []
-        llm_message = chat.upload_img(gr_img, chat_state, img_list)
+        try:
+            llm_message = chat.upload_img(gr_img, chat_state, img_list)
+            print(f"Image uploaded successfully. img_list length: {len(img_list)}")
+        except Exception as e:
+            print(f"Error uploading image: {e}")
+            return "Error uploading image. Please try again.", chatbot, chat_state, img_list, 0, replace_flag
         upload_flag = 0
+    elif gr_img is None:
+        return "Please upload a video first.", chatbot, chat_state, img_list, upload_flag, replace_flag
 
     chat.ask(user_message, chat_state)
     print('user_message: ', user_message)
@@ -531,32 +529,56 @@ def process_english_text(text):
 
     return text
 
-
+@spaces.GPU
 def gradio_stream_answer(chatbot, chat_state, img_list, temperature):
     print('---gradio_stream_answer---')
+    print(f"img_list length: {len(img_list)}")
+
+    # Check if img_list is empty
+    if len(img_list) == 0:
+        error_msg = "No image/video uploaded. Please upload a video first."
+        print(error_msg)
+        if len(chatbot) > 0:
+            chatbot[-1][1] = error_msg
+        yield chatbot, chat_state
+        return
+
     if len(img_list) > 0:
         if not isinstance(img_list[0], torch.Tensor):
             chat.encode_img(img_list)
     print(chat)
-    streamer = chat.stream_answer(conv=chat_state,
-                                  img_list=img_list,
-                                  temperature=temperature,
-                                  max_new_tokens=500,
-                                  max_length=2000)
-    output = ''
-    print('streamer:', streamer)
-    for new_output in streamer:
-        escapped = escape_markdown(new_output)
-        output += escapped
-        chatbot[-1][1] = output
-        chatbot[-1][1] = process_english_text(chatbot[-1][1])
+
+    try:
+        streamer = chat.stream_answer(conv=chat_state,
+                                      img_list=img_list,
+                                      temperature=temperature,
+                                      max_new_tokens=500,
+                                      max_length=2000)
+        output = ''
+        print('streamer:', streamer)
+        for new_output in streamer:
+            escapped = escape_markdown(new_output)
+            output += escapped
+            chatbot[-1][1] = output
+            chatbot[-1][1] = process_english_text(chatbot[-1][1])
+            yield chatbot, chat_state
+        chat_state.messages[-1][1] = '</s>'
+        print('output:', output)
+    except Exception as e:
+        error_msg = f"Error generating response: {str(e)}"
+        print(error_msg)
+        if len(chatbot) > 0:
+            chatbot[-1][1] = error_msg
         yield chatbot, chat_state
-    chat_state.messages[-1][1] = '</s>'
-    print('output:', output)
+
     return chatbot, chat_state
 
 
 def gradio_visualize(chatbot, gr_img):
+    # Safety check for empty chatbot or None response
+    if len(chatbot) == 0 or chatbot[-1][1] is None:
+        return chatbot
+
     if isinstance(gr_img, dict):
         gr_img, mask = gr_img['image'], gr_img['mask']
 
@@ -589,8 +611,6 @@ def gradio_taskselect(idx):
     return prompt_list[idx], instruct_list[idx]
 
 
-
-
 chat = Chat(model, vis_processor, device=device)
 
 title = """<h1 align="center">Emotion-LLaMA Demo</h1>"""
@@ -604,11 +624,11 @@ For Abilities Involging Multimodal Emotion Understanding:
 3. Visual: Click **Send** to generate a visual description.
 4. Audio: Click **Send** to generate an audio description.
 5. No Tag: Input whatever you want and click **Send** without any tagging.
-
 You can also simply chat in free form!
 '''
 
 text_input = gr.Textbox(placeholder='Upload your image and chat', interactive=True, show_label=False, container=False, scale=8)
+
 with gr.Blocks() as demo:
     gr.Markdown(title)
     # gr.Markdown(description)
@@ -637,6 +657,7 @@ with gr.Blocks() as demo:
             img_list = gr.State(value=[])
             chatbot = gr.Chatbot(label='Emotion-LLaMA')
 
+            # Updated Dataset component for Gradio 5
            dataset = gr.Dataset(
                 components=[gr.Textbox(visible=False)],
                 samples=[['No Tag'], ['reason'], ['emotion'], ['visual'], ['audio']],
@@ -650,36 +671,44 @@ with gr.Blocks() as demo:
 
     upload_flag = gr.State(value=0)
     replace_flag = gr.State(value=0)
-    image.upload(image_upload_trigger, [upload_flag, replace_flag, img_list], [upload_flag, replace_flag])
+
+    # Updated upload trigger for Gradio 5 - fixed parameter order
+    image.upload(image_upload_trigger, [image, upload_flag, replace_flag, img_list], [upload_flag, replace_flag])
 
+    # Updated Examples component for Gradio 5 - this is the key fix!
     with gr.Row():
         with gr.Column():
-            gr.Examples(examples=[
-                ["examples/samplenew_00004251.mp4", "[detection] face", upload_flag, replace_flag, img_list],
-                ["examples/sample_00000338.mp4", "The person in video says: Oh no, my phone and wallet are all in my bag. [emotion] Please determine which emotion label in the video represents: happy, sad, neutral, angry, worried, surprise.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00000669.mp4", "The person in video says: Why are you looking at me like this? It's just a woman, so you have to have something to do with me. [emotion] Determine the emotional state shown in the video, choosing from happy, sad, neutral, angry, worried, or surprise.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00003462.mp4", "The person in video says: Do you believe that you push me around? [emotion] Assess and label the emotion evident in the video: could it be happy, sad, neutral, angry, worried, surprise?", upload_flag, replace_flag, img_list],
-                ["examples/sample_00000727.mp4", "The person in video says: No, this, I have to get up! You, I'm sorry, everyone. I'm sorry, it's from the German side. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, worried, or surprise?", upload_flag, replace_flag, img_list],
-                ["examples/samplenew_00061200.mp4", "The person in video says: Me: I'm not going in anymore, scared. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, fear, contempt, doubt, worried, or surprise?", upload_flag, replace_flag, img_list],
-            ], inputs=[image, text_input, upload_flag, replace_flag, img_list], fn=example_trigger,
-                outputs=[upload_flag, replace_flag])
+            examples1 = gr.Examples(
+                examples=[
+                    ["examples/samplenew_00004251.mp4", "[detection] face"],
+                    ["examples/sample_00000338.mp4", "The person in video says: Oh no, my phone and wallet are all in my bag. [emotion] Please determine which emotion label in the video represents: happy, sad, neutral, angry, worried, surprise."],
+                    ["examples/sample_00000669.mp4", "The person in video says: Why are you looking at me like this? It's just a woman, so you have to have something to do with me. [emotion] Determine the emotional state shown in the video, choosing from happy, sad, neutral, angry, worried, or surprise."],
+                    ["examples/sample_00003462.mp4", "The person in video says: Do you believe that you push me around? [emotion] Assess and label the emotion evident in the video: could it be happy, sad, neutral, angry, worried, surprise?"],
+                    ["examples/sample_00000727.mp4", "The person in video says: No, this, I have to get up! You, I'm sorry, everyone. I'm sorry, it's from the German side. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, worried, or surprise?"],
+                    ["examples/samplenew_00061200.mp4", "The person in video says: Me: I'm not going in anymore, scared. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, fear, contempt, doubt, worried, or surprise?"],
+                ],
+                inputs=[image, text_input],
+                # Remove fn and outputs - let Examples handle this automatically in Gradio 5
+            )
         with gr.Column():
-            gr.Examples(examples=[
-                ["examples/samplenew_00051251.mp4", "In what state is the person in the video, say the following: \"Do you really think so?\"", upload_flag, replace_flag, img_list],
-                ["examples/sample_00004735.mp4", "[visual] What are the emotions of the woman in the video?", upload_flag, replace_flag, img_list],
-                ["examples/sample_00002422.mp4", "[audio] Analyze the speaker's voice in the video.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00001073.mp4", "The person in video says: Make him different from before. I like the way you are now. [reason] Please analyze all the clues in the video and reason out the emotional label of the person in the video.", upload_flag, replace_flag, img_list],
-                ["examples/sample_00004671.mp4", "The person in video says: Won't you? Impossible! Fan Xiaomei is not such a person. [reason] What are the facial expressions and vocal tone used in the video? What is the intended meaning behind his words? Which emotion does this reflect?", upload_flag, replace_flag, img_list],
-                ["examples/sample_00005854.mp4", "The person in video says: Bastard! Boss, you don't choose, you prefer. [reason] Please integrate information from various modalities to infer the emotional category of the person in the video.", upload_flag, replace_flag, img_list],
-            ], inputs=[image, text_input, upload_flag, replace_flag, img_list], fn=example_trigger,
-                outputs=[upload_flag, replace_flag])
+            examples2 = gr.Examples(
+                examples=[
+                    ["examples/samplenew_00051251.mp4", "In what state is the person in the video, say the following: \"Do you really think so?\""],
+                    ["examples/sample_00004735.mp4", "[visual] What are the emotions of the woman in the video?"],
+                    ["examples/sample_00002422.mp4", "[audio] Analyze the speaker's voice in the video."],
+                    ["examples/sample_00001073.mp4", "The person in video says: Make him different from before. I like the way you are now. [reason] Please analyze all the clues in the video and reason out the emotional label of the person in the video."],
+                    ["examples/sample_00004671.mp4", "The person in video says: Won't you? Impossible! Fan Xiaomei is not such a person. [reason] What are the facial expressions and vocal tone used in the video? What is the intended meaning behind his words? Which emotion does this reflect?"],
+                    ["examples/sample_00005854.mp4", "The person in video says: Bastard! Boss, you don't choose, you prefer. [reason] Please integrate information from various modalities to infer the emotional category of the person in the video."],
+                ],
+                inputs=[image, text_input],
+                # Remove fn and outputs - let Examples handle this automatically in Gradio 5
+            )
 
     dataset.click(
         gradio_taskselect,
         inputs=[dataset],
         outputs=[text_input, task_inst],
         show_progress="hidden",
-        postprocess=False,
        queue=False,
    )
 
@@ -687,11 +716,11 @@ with gr.Blocks() as demo:
         gradio_ask,
         [text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag],
         [text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False
-    ).success(
+    ).then(
         gradio_stream_answer,
         [chatbot, chat_state, img_list, temperature],
         [chatbot, chat_state]
-    ).success(
+    ).then(
         gradio_visualize,
         [chatbot, image],
         [chatbot],
@@ -702,11 +731,11 @@ with gr.Blocks() as demo:
         gradio_ask,
         [text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag],
         [text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False
-    ).success(
+    ).then(
         gradio_stream_answer,
         [chatbot, chat_state, img_list, temperature],
         [chatbot, chat_state]
-    ).success(
+    ).then(
         gradio_visualize,
         [chatbot, image],
         [chatbot],
@@ -715,5 +744,5 @@ with gr.Blocks() as demo:
 
     clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, chat_state, img_list], queue=False)
 
-demo.launch(share=True, enable_queue=True)
-# demo.launch(share=True)
+demo.queue()
+demo.launch(share=True)
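Editor's note on the gr.Examples change above: the old code smuggled upload_flag, replace_flag and img_list into each example row and ran example_trigger through fn/outputs; the commit drops fn and outputs, so in Gradio 5 clicking an example row only fills the listed input components, and the flags are now set by the image.upload listener (which also receives the image component itself). A minimal hedged sketch of that fill-only pattern, standalone with placeholder components and a placeholder handler, not the demo itself:

import gradio as gr

def echo(text):
    # placeholder handler; the real app routes input through gradio_ask
    return f"received: {text}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Textbox(label="Result")
    send = gr.Button("Send")
    # With no fn/outputs, a click only populates the inputs; nothing runs.
    gr.Examples(
        examples=[["[emotion] happy or sad?"], ["[reason] explain the clues"]],
        inputs=[prompt],
    )
    send.click(echo, [prompt], [result])

demo.launch()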
 
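Two of the Gradio 5 changes above are behavioral, not just renames. First, the .success() chains became .then(): .then() schedules the next step regardless of whether the previous one raised, while .success() fires only on success, so the error messages yielded by gradio_stream_answer still reach gradio_visualize. Second, launch(enable_queue=True) no longer exists in Gradio 4+, so queuing is enabled with an explicit demo.queue() call before launch. A hedged, self-contained sketch of the chained-event pattern with placeholder handlers (it mirrors the app's tuple-style chat history):

import gradio as gr

def ask(message, history):
    # clear the textbox and append the user turn, as gradio_ask does
    return "", history + [[message, None]]

def answer(history):
    # fill in the assistant turn; .then() runs this even if ask() raised
    history[-1][1] = "echo: " + history[-1][0]
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    box = gr.Textbox()
    box.submit(ask, [box, chatbot], [box, chatbot], queue=False).then(
        answer, [chatbot], [chatbot]
    )

demo.queue()   # replaces the removed launch(..., enable_queue=True)
demo.launch()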
requirements.txt CHANGED

@@ -1,5 +1,3 @@
-gradio==3.47.1
-gradio_client==0.6.0
 decorator==4.4.2
 moviepy==1.0.3
 decord==0.6.0
@@ -15,6 +13,9 @@ bitsandbytes==0.41.0
 scipy
 huggingface_hub
 torch==2.1.2
-torchvision==0.15.1
+torchvision
 timm==0.6.13
 transformers==4.30.0
+gradio
+gradio_client
+numpy<2.0
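On the dependency side: gradio and gradio_client are left unpinned because, on Hugging Face Spaces, the Gradio version is normally taken from the sdk_version field in the Space's README metadata; torchvision is unpinned so pip can resolve a build compatible with torch==2.1.2; and numpy<2.0 is pinned because torch 2.1.x predates the NumPy 2.0 ABI break. As a hedged aside, a startup guard enforcing that last constraint could look like this (optional, not part of this commit):

import numpy as np

# torch==2.1.2 wheels were compiled against NumPy 1.x, so fail fast on 2.x
major = int(np.__version__.split(".")[0])
assert major < 2, f"NumPy {np.__version__} is too new for torch 2.1.2"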