Sa2VA-simple-demo

Runtime error

App Files Community

aiqcamp committed on Jan 11

Commit

f289e91

verified ·

1 Parent(s): c6df80d

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -18

app.py CHANGED Viewed

@@ -78,6 +78,9 @@ def visualize(pred_mask, image_path, work_dir):
 @spaces.GPU
 def image_vision(image_input_path, prompt):
     image_path = image_input_path
     text_prompts = f"<image>{prompt}"
     image = Image.open(image_path).convert('RGB')
@@ -92,9 +95,16 @@ def image_vision(image_input_path, prompt):
     print(return_dict)
     answer = return_dict["prediction"] # the text format answer
     seg_image = return_dict["prediction_masks"]
-    if '[SEG]' in answer and Visualizer is not None:
         pred_masks = seg_image[0]
         temp_dir = tempfile.mkdtemp()
         pred_mask = pred_masks
@@ -106,19 +116,16 @@ def image_vision(image_input_path, prompt):
 @spaces.GPU(duration=80)
 def video_vision(video_input_path, prompt, video_interval):
     # Open the original video
     cap = cv2.VideoCapture(video_input_path)
-    # Get original video properties
     original_fps = cap.get(cv2.CAP_PROP_FPS)
     frame_skip_factor = video_interval
-    # Calculate new FPS
     new_fps = original_fps / frame_skip_factor
     vid_frames, image_paths = read_video(video_input_path, video_interval)
-    # create a question (<image> is a placeholder for the video frames)
     question = f"<image>{prompt}"
     result = model.predict_forward(
         video=vid_frames,
@@ -128,7 +135,13 @@ def video_vision(video_input_path, prompt, video_interval):
     prediction = result['prediction']
     print(prediction)
-    if '[SEG]' in prediction and Visualizer is not None:
         _seg_idx = 0
         pred_masks = result['prediction_masks'][_seg_idx]
         seg_frames = []
@@ -140,29 +153,22 @@ def video_vision(video_input_path, prompt, video_interval):
             seg_frames.append(seg_frame)
         output_video = "output_video.mp4"
-        # Read the first image to get the size (resolution)
         frame = cv2.imread(seg_frames[0])
         height, width, layers = frame.shape
-        # Define the video codec and create VideoWriter object
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
         video = cv2.VideoWriter(output_video, fourcc, new_fps, (width, height))
-        # Iterate over the image paths and write to the video
         for img_path in seg_frames:
             frame = cv2.imread(img_path)
             video.write(frame)
-        # Release the video writer
         video.release()
         print(f"Video created successfully at {output_video}")
-        return result['prediction'], output_video
     else:
-        return result['prediction'], None

 @spaces.GPU
 def image_vision(image_input_path, prompt):
+    # 입력된 프롬프트가 한글인지 확인
+    is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
     image_path = image_input_path
     text_prompts = f"<image>{prompt}"
     image = Image.open(image_path).convert('RGB')
     print(return_dict)
     answer = return_dict["prediction"] # the text format answer
+    # 한글 프롬프트인 경우 응답을 한글로 변환
+    if is_korean:
+        # 기본 응답 패턴을 한글로 변환
+        answer = answer.replace("Yes", "네")
+        answer = answer.replace("No", "아니오")
+        answer = answer.replace("[SEG]", "[분할]")
     seg_image = return_dict["prediction_masks"]
+    if ('[SEG]' in answer or '[분할]' in answer) and Visualizer is not None:
         pred_masks = seg_image[0]
         temp_dir = tempfile.mkdtemp()
         pred_mask = pred_masks
 @spaces.GPU(duration=80)
 def video_vision(video_input_path, prompt, video_interval):
+    # 입력된 프롬프트가 한글인지 확인
+    is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
     # Open the original video
     cap = cv2.VideoCapture(video_input_path)
     original_fps = cap.get(cv2.CAP_PROP_FPS)
     frame_skip_factor = video_interval
     new_fps = original_fps / frame_skip_factor
     vid_frames, image_paths = read_video(video_input_path, video_interval)
     question = f"<image>{prompt}"
     result = model.predict_forward(
         video=vid_frames,
     prediction = result['prediction']
     print(prediction)
+    # 한글 프롬프트인 경우 응답을 한글로 변환
+    if is_korean:
+        prediction = prediction.replace("Yes", "네")
+        prediction = prediction.replace("No", "아니오")
+        prediction = prediction.replace("[SEG]", "[분할]")
+    if ('[SEG]' in prediction or '[분할]' in prediction) and Visualizer is not None:
         _seg_idx = 0
         pred_masks = result['prediction_masks'][_seg_idx]
         seg_frames = []
             seg_frames.append(seg_frame)
         output_video = "output_video.mp4"
         frame = cv2.imread(seg_frames[0])
         height, width, layers = frame.shape
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
         video = cv2.VideoWriter(output_video, fourcc, new_fps, (width, height))
         for img_path in seg_frames:
             frame = cv2.imread(img_path)
             video.write(frame)
         video.release()
         print(f"Video created successfully at {output_video}")
+        return prediction, output_video
     else:
+        return prediction, None