skallewag committed
Commit 687eada · verified · 1 Parent(s): dd429be

Update app.py

Files changed (1)
  1. app.py +113 -45
app.py CHANGED
@@ -6,13 +6,22 @@
 # Written by Xueyan Zou ([email protected]), Jianwei Yang ([email protected])
 # --------------------------------------------------------
 
-# Install dependencies and patch files before any imports
+# Setup paths and install dependencies before any imports
 import os
 import sys
 import subprocess
 
 print("Setting up SEEM environment...")
 
+# Install detectron2 first
+print("Installing detectron2...")
+try:
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/MaureenZOU/detectron2-xyz.git"])
+    print("Detectron2 installation complete!")
+except Exception as e:
+    print(f"Error installing detectron2: {e}")
+    sys.exit(1)
+
 # Create a custom distributed.py file that doesn't need mpi4py
 os.makedirs('utils', exist_ok=True)
 with open('utils/distributed.py', 'w') as f:
@@ -58,16 +67,47 @@ def all_gather(data):
 def reduce_dict(input_dict, average=True):
     return input_dict
 """)
-print("Created custom distributed.py")
 
-# Install detectron2
-print("Installing detectron2...")
-try:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/MaureenZOU/detectron2-xyz.git"])
-    print("Detectron2 installation complete!")
-except Exception as e:
-    print(f"Error installing detectron2: {e}")
-    sys.exit(1)
+# Create a simple visualizer if it doesn't exist
+if not os.path.exists('utils/visualizer.py'):
+    with open('utils/visualizer.py', 'w') as f:
+        f.write("""# Simple visualizer class
+import numpy as np
+import cv2
+
+class Visualizer:
+    def __init__(self, img_rgb, metadata=None, scale=1.0):
+        self.img = img_rgb
+        self.metadata = metadata
+        self.scale = scale
+
+    def draw_binary_mask(self, mask, color=None, text=None):
+        if color is None:
+            color = [0, 255, 0]  # Default to green
+
+        mask_img = np.zeros_like(self.img, dtype=np.float32)
+        color_mask = np.array(color) * 255
+
+        for c in range(3):
+            mask_img[:, :, c] = color_mask[c]
+
+        mask_img = mask_img * mask[:, :, None] * 0.5
+        self.img = self.img * (1 - mask[:, :, None] * 0.5) + mask_img
+
+        if text:
+            # Simplified text placement
+            x, y = np.where(mask)[0][0], np.where(mask)[1][0] if np.any(mask) else (10, 10)
+            cv2.putText(self.img, text, (y, x), cv2.FONT_HERSHEY_SIMPLEX, 0.5, tuple(map(int, color_mask)), 1)
+
+        return self
+
+    def draw_panoptic_seg(self, panoptic_seg, segments_info):
+        # Simplified panoptic visualization - just a placeholder
+        return self
+
+    def get_image(self):
+        return self.img.astype(np.uint8)
+""")
 
 # Set Python path to include the repository root
 os.environ["PYTHONPATH"] = os.getcwd()
@@ -92,7 +132,8 @@ from utils.distributed import init_distributed
 from utils.arguments import load_opt_from_config_files
 from utils.constants import COCO_PANOPTIC_CLASSES
 
-from demo.seem.tasks import *
+# Import the interactive functions from the existing implementation
+from demo.seem.tasks.interactive import interactive_infer_image, interactive_infer_video
 
 def parse_option():
     parser = argparse.ArgumentParser('SEEM Demo', add_help=False)
@@ -125,29 +166,59 @@ build model
 '''
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Using device: {device}")
-model = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth).eval().to(device)
-with torch.no_grad():
-    model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)
+
+try:
+    model = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth).eval().to(device)
+    with torch.no_grad():
+        model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)
+    print("Model loaded successfully")
+    model_loaded = True
+except Exception as e:
+    print(f"Error loading model: {e}")
+    print("Continuing with simplified interface")
+    model = None
+    model_loaded = False
 
 '''
 audio
 '''
-audio = whisper.load_model("base")
+try:
+    audio = whisper.load_model("base")
+    audio_loaded = True
+except Exception as e:
+    print(f"Error loading audio model: {e}")
+    audio = None
+    audio_loaded = False
 
 @torch.no_grad()
 def inference(image, task, *args, **kwargs):
-    if torch.cuda.is_available():
-        with torch.autocast(device_type='cuda', dtype=torch.float16):
-            if 'Video' in task:
-                return interactive_infer_video(model, audio, image, task, *args, **kwargs)
-            else:
-                return interactive_infer_image(model, audio, image, task, *args, **kwargs)
+    if not model_loaded:
+        # Return a placeholder image if model failed to load
+        warning_img = Image.new('RGB', (600, 400), color=(240, 240, 240))
+        d = ImageDraw.Draw(warning_img)
+        d.text((50, 150), "Model could not be loaded.", fill=(255, 0, 0))
+        d.text((50, 200), "Please check logs for details.", fill=(255, 0, 0))
+        return warning_img, None
+
+    # Prepare input parameters for the interactive functions
+    image_input = {"image": image, "mask": kwargs.get("mask", None)}
+    referring_image = kwargs.get("referring_image", None)
+
+    # If referring image is provided, prepare it in the expected format
+    refimg = None
+    if referring_image is not None:
+        refimg = {"image": referring_image, "mask": kwargs.get("referring_mask", None)}
+
+    # Get text and audio parameters
+    reftxt = kwargs.get("referring_text", "")
+    audio_pth = kwargs.get("referring_audio", None)
+    video_pth = kwargs.get("video", None)
+
+    # Call the appropriate interactive function
+    if 'Video' in task:
+        return interactive_infer_video(model, audio, image_input, task, refimg, reftxt, audio_pth, video_pth)
     else:
-        # Run without autocast on CPU
-        if 'Video' in task:
-            return interactive_infer_video(model, audio, image, task, *args, **kwargs)
-        else:
-            return interactive_infer_image(model, audio, image, task, *args, **kwargs)
+        return interactive_infer_image(model, audio, image_input, task, refimg, reftxt, audio_pth, video_pth)
 
 class ImageMask(gr.components.Image):
     """
@@ -180,7 +251,14 @@ class Video(gr.components.Video):
 launch app
 '''
 title = "SEEM: Segment Everything Everywhere All At Once"
-description = """
+
+# Update description based on model loading status
+if model_loaded:
+    model_status = f"<span style=\"color:green;\">✓ Model loaded successfully</span> (SEEM {cur_model})"
+else:
+    model_status = "<span style=\"color:red;\">✗ Model failed to load</span> (see logs for details)"
+
+description = f"""
 <div style="text-align: center; font-weight: bold;">
 <span style="font-size: 18px" id="paper-info">
 [<a href="https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once" target="_blank">GitHub</a>]
@@ -189,22 +267,12 @@ description = """
 </div>
 <div style="text-align: left; font-weight: bold;">
 <br>
-&#x1F32A Note: The current model is run on <span style="color:blue;">SEEM {}</span>, for <span style="color:blue;">best performance</span> refer to <a href="https://huggingface.co/spaces/xdecoder/SEEM" target="_blank"><span style="color:red;">our demo</span></a>.
+&#x1F32A Status: {model_status}
 </p>
 </div>
-""".format(cur_model)
-
-'''Usage
-Instructions:
-&#x1F388 Try our default examples first (Sketch is not automatically drawed on input and example image);
-&#x1F388 For video demo, it takes about 30-60s to process, please refresh if you meet an error on uploading;
-&#x1F388 Upload an image/video (If you want to use referred region of another image please check "Example" and upload another image in referring image panel);
-&#x1F388 Select at least one type of prompt of your choice (If you want to use referred region of another image please check "Example");
-&#x1F388 Remember to provide the actual prompt for each promt type you select, otherwise you will meet an error (e.g., rember to draw on the referring image);
-&#x1F388 Our model by default support the vocabulary of COCO 133 categories, others will be classified to 'others' or misclassifed.
-'''
+"""
 
-article = "The Demo is Run on SEEM-Tiny."
+article = "The Demo is Run on SEEM"
 inputs = [ImageMask(label="[Stroke] Draw on Image",type="pil"), gr.inputs.CheckboxGroup(choices=["Stroke", "Example", "Text", "Audio", "Video", "Panoptic"], type="value", label="Interative Mode"), ImageMask(label="[Example] Draw on Referring Image",type="pil"), gr.Textbox(label="[Text] Referring Text"), gr.Audio(label="[Audio] Referring Audio", source="microphone", type="filepath"), gr.Video(label="[Video] Referring Video Segmentation",format="mp4",interactive=True)]
 gr.Interface(
     fn=inference,
@@ -218,11 +286,11 @@ gr.Interface(
         ),
     ],
    examples=[
-        ["examples/corgi1.webp", ["Text"], "examples/corgi2.jpg", "The corgi.", None, None],
-        ["examples/river1.png", ["Text", "Audio"], "examples/river2.png", "The green trees.", "examples/river1.wav", None],
-        ["examples/zebras1.jpg", ["Example"], "examples/zebras2.jpg", "", None, None],
-        ["examples/fries1.png", ["Example"], "examples/fries2.png", "", None, None],
-        ["examples/placeholder.png", ["Video"], "examples/ref_vase.JPG", "", None, "examples/vasedeck.mp4"],
+        ["demo/seem/examples/corgi1.webp", ["Text"], "demo/seem/examples/corgi2.jpg", "The corgi.", None, None],
+        ["demo/seem/examples/river1.png", ["Text", "Audio"], "demo/seem/examples/river2.png", "The green trees.", "demo/seem/examples/river1.wav", None],
+        ["demo/seem/examples/zebras1.jpg", ["Example"], "demo/seem/examples/zebras2.jpg", "", None, None],
+        ["demo/seem/examples/fries1.png", ["Example"], "demo/seem/examples/fries2.png", "", None, None],
+        ["demo/seem/examples/placeholder.png", ["Video"], "demo/seem/examples/ref_vase.JPG", "", None, "demo/seem/examples/vasedeck.mp4"],
     ],
     title=title,
     description=description,