pooyanrg committed on
Commit ffbbba2 · 1 Parent(s): 26cf243
Files changed (1)
  1. app.py +90 -90
  1. app.py +90 -90
app.py CHANGED
@@ -1,10 +1,10 @@
 import gradio as gr
 import numpy as np
 from PIL import Image, ImageDraw
-# import torch
-# from torchvision.transforms import Compose, Resize, ToTensor, Normalize
-# from utils.model import init_model
-# from utils.tokenization_clip import SimpleTokenizer as ClipTokenizer
+import torch
+from torchvision.transforms import Compose, Resize, ToTensor, Normalize
+from utils.model import init_model
+from utils.tokenization_clip import SimpleTokenizer as ClipTokenizer
 
 from fastapi.staticfiles import StaticFiles
 from fileservice import app
@@ -16,22 +16,22 @@ html_text = """
 </div>
 """
 
-# def image_to_tensor(image_path):
-#     image = Image.open(image_path).convert('RGB')
+def image_to_tensor(image_path):
+    image = Image.open(image_path).convert('RGB')
 
-#     preprocess = Compose([
-#         Resize([224, 224], interpolation=Image.BICUBIC),
-#         lambda image: image.convert("RGB"),
-#         ToTensor(),
-#         Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-#     ])
-#     image_data = preprocess(image)
+    preprocess = Compose([
+        Resize([224, 224], interpolation=Image.BICUBIC),
+        lambda image: image.convert("RGB"),
+        ToTensor(),
+        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # CLIP mean/std
+    ])
+    image_data = preprocess(image)
 
-#     return {'image': image_data}
+    return {'image': image_data}
 
-# def get_image_data(image_path):
-#     image_input = image_to_tensor(image_path)
-#     return image_input
+def get_image_data(image_path):
+    image_input = image_to_tensor(image_path)
+    return image_input
 
 def get_intervention_vector(selected_cells_bef, selected_cells_aft):
     left = np.reshape(np.zeros((1, 14 * 14)), (14, 14))
@@ -59,83 +59,83 @@ def get_intervention_vector(selected_cells_bef, selected_cells_aft):
 
     return left_map, right_map
 
-# def _get_rawimage(image_path):
-#     # Pair x L x T x 3 x H x W
-#     image = np.zeros((1, 3, 224,
-#                       224), dtype=np.float)
+def _get_rawimage(image_path):
+    # Pair x L x T x 3 x H x W
+    image = np.zeros((1, 3, 224,
+                      224), dtype=np.float64)  # np.float was removed from NumPy; use an explicit dtype
 
-#     for i in range(1):
+    for i in range(1):
 
-#         raw_image_data = get_image_data(image_path)
-#         raw_image_data = raw_image_data['image']
+        raw_image_data = get_image_data(image_path)
+        raw_image_data = raw_image_data['image']
 
-#         image[i] = raw_image_data
+        image[i] = raw_image_data
 
-#     return image
+    return image
 
 
-# def greedy_decode(model, tokenizer, video, video_mask, gt_left_map, gt_right_map):
-#     visual_output, left_map, right_map = model.get_sequence_visual_output(video, video_mask,
-#                                                                           gt_left_map[:, 0, :].squeeze(), gt_right_map[:, 0, :].squeeze())
+def greedy_decode(model, tokenizer, video, video_mask, gt_left_map, gt_right_map):
+    visual_output, left_map, right_map = model.get_sequence_visual_output(video, video_mask,
+                                                                          gt_left_map[:, 0, :].squeeze(), gt_right_map[:, 0, :].squeeze())
 
-#     video_mask = torch.ones(visual_output.shape[0], visual_output.shape[1], device=visual_output.device).long()
-#     input_caption_ids = torch.zeros(visual_output.shape[0], device=visual_output.device).data.fill_(tokenizer.vocab["<|startoftext|>"])
-#     input_caption_ids = input_caption_ids.long().unsqueeze(1)
-#     decoder_mask = torch.ones_like(input_caption_ids)
-#     for i in range(32):
-#         decoder_scores = model.decoder_caption(visual_output, video_mask, input_caption_ids, decoder_mask, get_logits=True)
-#         next_words = decoder_scores[:, -1].max(1)[1].unsqueeze(1)
-#         input_caption_ids = torch.cat([input_caption_ids, next_words], 1)
-#         next_mask = torch.ones_like(next_words)
-#         decoder_mask = torch.cat([decoder_mask, next_mask], 1)
-
-#     return input_caption_ids[:, 1:].tolist(), left_map, right_map
+    video_mask = torch.ones(visual_output.shape[0], visual_output.shape[1], device=visual_output.device).long()
+    input_caption_ids = torch.zeros(visual_output.shape[0], device=visual_output.device).data.fill_(tokenizer.vocab["<|startoftext|>"])
+    input_caption_ids = input_caption_ids.long().unsqueeze(1)
+    decoder_mask = torch.ones_like(input_caption_ids)
+    for i in range(32):  # greedy decoding, capped at 32 tokens
+        decoder_scores = model.decoder_caption(visual_output, video_mask, input_caption_ids, decoder_mask, get_logits=True)
+        next_words = decoder_scores[:, -1].max(1)[1].unsqueeze(1)
+        input_caption_ids = torch.cat([input_caption_ids, next_words], 1)
+        next_mask = torch.ones_like(next_words)
+        decoder_mask = torch.cat([decoder_mask, next_mask], 1)
+
+    return input_caption_ids[:, 1:].tolist(), left_map, right_map
 
 # Dummy prediction function
-# def predict_image(image_bef, image_aft, selected_cells_bef, selected_cells_aft):
-#     if image_bef is None:
-#         return "No image provided", "", ""
-#     if image_aft is None:
-#         return "No image provided", "", ""
+def predict_image(image_bef, image_aft, selected_cells_bef, selected_cells_aft):
+    if image_bef is None:
+        return "No image provided", "", ""
+    if image_aft is None:
+        return "No image provided", "", ""
 
 
-#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-#     model = init_model('data/pytorch_model.pt', device)
+    model = init_model('data/pytorch_model.pt', device)
 
-#     tokenizer = ClipTokenizer()
+    tokenizer = ClipTokenizer()
 
-#     left_map, right_map = get_intervention_vector(selected_cells_bef, selected_cells_aft)
+    left_map, right_map = get_intervention_vector(selected_cells_bef, selected_cells_aft)
 
-#     left_map, right_map = torch.from_numpy(left_map).unsqueeze(0), torch.from_numpy(right_map).unsqueeze(0)
+    left_map, right_map = torch.from_numpy(left_map).unsqueeze(0), torch.from_numpy(right_map).unsqueeze(0)
 
-#     bef_image = torch.from_numpy(_get_rawimage(image_bef)).unsqueeze(1)
-#     aft_image = torch.from_numpy(_get_rawimage(image_aft)).unsqueeze(1)
+    bef_image = torch.from_numpy(_get_rawimage(image_bef)).unsqueeze(1)
+    aft_image = torch.from_numpy(_get_rawimage(image_aft)).unsqueeze(1)
 
-#     image_pair = torch.cat([bef_image, aft_image], 1)
+    image_pair = torch.cat([bef_image, aft_image], 1)
 
-#     image_mask = torch.from_numpy(np.ones(2, dtype=np.long)).unsqueeze(0)
+    image_mask = torch.from_numpy(np.ones(2, dtype=np.int64)).unsqueeze(0)  # np.long no longer exists; int64 is equivalent
 
-#     result_list, left_map, right_map = greedy_decode(model, tokenizer, image_pair, image_mask, left_map, right_map)
+    result_list, left_map, right_map = greedy_decode(model, tokenizer, image_pair, image_mask, left_map, right_map)
 
 
-#     decode_text_list = tokenizer.convert_ids_to_tokens(result_list[0])
-#     if "<|endoftext|>" in decode_text_list:
-#         SEP_index = decode_text_list.index("<|endoftext|>")
-#         decode_text_list = decode_text_list[:SEP_index]
-#     if "!" in decode_text_list:
-#         PAD_index = decode_text_list.index("!")
-#         decode_text_list = decode_text_list[:PAD_index]
-#     decode_text = decode_text_list.strip()
-
-#     # Generate dummy predictions
-#     pred = f"{decode_text}"
+    decode_text_list = tokenizer.convert_ids_to_tokens(result_list[0])
+    if "<|endoftext|>" in decode_text_list:
+        SEP_index = decode_text_list.index("<|endoftext|>")
+        decode_text_list = decode_text_list[:SEP_index]
+    if "!" in decode_text_list:
+        PAD_index = decode_text_list.index("!")
+        decode_text_list = decode_text_list[:PAD_index]
+    decode_text = " ".join(decode_text_list).strip()  # a list has no .strip(); join the tokens first
+
+    # Final caption from greedy decoding
+    pred = f"{decode_text}"
 
-#     # Include information about selected cells
-#     selected_info_bef = f"{selected_cells_bef}" if selected_cells_bef else "No image patch was selected"
-#     selected_info_aft = f"{selected_cells_aft}" if selected_cells_aft else "No image patch was selected"
+    # Include information about selected cells
+    selected_info_bef = f"{selected_cells_bef}" if selected_cells_bef else "No image patch was selected"
+    selected_info_aft = f"{selected_cells_aft}" if selected_cells_aft else "No image patch was selected"
 
-#     return pred, selected_info_bef, selected_info_aft
+    return pred, selected_info_bef, selected_info_aft
 
 # Add grid to the image
 def add_grid_to_image(image_path, grid_size=14):
@@ -297,25 +297,25 @@ with gr.Blocks() as demo:
     html = gr.HTML(html_text)
 
     # Connect the predict button to the prediction function
-    # predict_btn.click(
-    #     fn=predict_image,
-    #     inputs=[image_bef, image_aft, selected_cells_bef, selected_cells_aft],
-    #     outputs=[prediction, selected_info_bef, selected_info_aft]
-    # )
-
-    # image_bef.change(
-    #     fn=None,
-    #     inputs=[image_bef],
-    #     outputs=[],
-    #     js="(image) => { initializeEditor(); importBackground(image); return []; }",
-    # )
-
-    # image_aft.change(
-    #     fn=None,
-    #     inputs=[image_aft],
-    #     outputs=[],
-    #     js="(image) => { initializeEditor(); importBackground(image); return []; }",
-    # )
+    predict_btn.click(
+        fn=predict_image,
+        inputs=[image_bef, image_aft, selected_cells_bef, selected_cells_aft],
+        outputs=[prediction, selected_info_bef, selected_info_aft]
+    )
+
+    image_bef.change(
+        fn=None,
+        inputs=[image_bef],
+        outputs=[],
+        js="(image) => { initializeEditor(); importBackground(image); return []; }",
+    )
+
+    image_aft.change(
+        fn=None,
+        inputs=[image_aft],
+        outputs=[],
+        js="(image) => { initializeEditor(); importBackground(image); return []; }",
+    )
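For orientation, the intervention maps that get_intervention_vector feeds into the model are built on a 14 x 14 patch grid (196 cells per image), initialized exactly as in app.py with np.reshape(np.zeros((1, 14 * 14)), (14, 14)). A minimal sketch of the masking idea follows; the cells_to_mask helper is illustrative only, not part of app.py, and assumes selections arrive as (row, col) pairs:

import numpy as np

GRID = 14  # app.py works on a 14x14 grid of image patches (196 cells)

def cells_to_mask(cells):
    # Mark each selected (row, col) patch with 1.0; all other patches stay 0.0.
    mask = np.reshape(np.zeros((1, GRID * GRID)), (GRID, GRID))
    for row, col in cells:
        mask[row, col] = 1.0
    return mask

print(cells_to_mask([(0, 0), (13, 13)]).sum())  # 2.0 -> two patches flagged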
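Similarly, a hedged smoke test of the restored inference path outside the Gradio UI might look like the sketch below; it assumes the checkpoint at data/pytorch_model.pt and the utils package are in place, and the two image paths are placeholders:

# Hypothetical local check -- "before.png" and "after.png" are placeholder files.
# Importing app builds (but does not launch) the Gradio Blocks interface.
from app import predict_image

caption, info_bef, info_aft = predict_image("before.png", "after.png", [], [])
print("caption:", caption)
print(info_bef)  # "No image patch was selected" when no cells are passed
print(info_aft)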