Spaces:
Runtime error
Runtime error
update app.py
Browse files
- app.py +0 -115
- multimodal/setup.py +1 -0
app.py
CHANGED
|
@@ -70,121 +70,6 @@ def get_outputs(
|
|
| 70 |
return outputs
|
| 71 |
|
| 72 |
|
| 73 |
-
def evaluate_refcoco(
|
| 74 |
-
model,
|
| 75 |
-
tokenizer,
|
| 76 |
-
image_processor,
|
| 77 |
-
batch_size,
|
| 78 |
-
tsvfile,
|
| 79 |
-
max_generation_length=20,
|
| 80 |
-
num_beams=3,
|
| 81 |
-
length_penalty=-2.0,
|
| 82 |
-
device=-1,
|
| 83 |
-
vis_embed_size=None,
|
| 84 |
-
rank=0,
|
| 85 |
-
world_size=1,
|
| 86 |
-
id=0,
|
| 87 |
-
):
|
| 88 |
-
model.eval().cuda()
|
| 89 |
-
loc_token_ids = []
|
| 90 |
-
for i in range(1000):
|
| 91 |
-
loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
|
| 92 |
-
media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
|
| 93 |
-
endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
|
| 94 |
-
pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
|
| 95 |
-
bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
|
| 96 |
-
prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
|
| 97 |
-
# all_ids = set(range(model.lang_encoder.lm_head.out_features))
|
| 98 |
-
# bad_words_ids = list(all_ids - set(loc_token_ids))
|
| 99 |
-
# bad_words_ids = [[b] for b in bad_words_ids]
|
| 100 |
-
# min_loc_token_id = min(loc_token_ids)
|
| 101 |
-
# max_loc_token_id = max(loc_token_ids)
|
| 102 |
-
total = 0
|
| 103 |
-
correct = 0
|
| 104 |
-
ious = []
|
| 105 |
-
if "refcocog" in tsvfile:
|
| 106 |
-
dataset_name = "refcocog"
|
| 107 |
-
elif "refcocoplus" in tsvfile:
|
| 108 |
-
dataset_name = "refcocoplus"
|
| 109 |
-
else:
|
| 110 |
-
dataset_name = "refcoco"
|
| 111 |
-
with open(tsvfile, "r") as f:
|
| 112 |
-
lines = f.readlines()
|
| 113 |
-
pbar = tqdm(lines, disable=(rank != 0))
|
| 114 |
-
for ii, line in enumerate(pbar):
|
| 115 |
-
if ii % world_size != rank:
|
| 116 |
-
continue
|
| 117 |
-
total += 1
|
| 118 |
-
line = line.rstrip()
|
| 119 |
-
uniq_id, image_id, text, region_coord, image = line.split("\t")
|
| 120 |
-
|
| 121 |
-
image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
|
| 122 |
-
# image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
|
| 123 |
-
# image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
|
| 124 |
-
# image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/262148000.png")
|
| 125 |
-
|
| 126 |
-
gt_box = np.array(list(map(float, region_coord.split(","))))
|
| 127 |
-
width = image.width
|
| 128 |
-
height = image.height
|
| 129 |
-
image = image.resize((224, 224))
|
| 130 |
-
gt_box = gt_box / np.array([width, height, width, height]) * 224
|
| 131 |
-
batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
|
| 132 |
-
prompt = [
|
| 133 |
-
f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|><|#object#|>{text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
|
| 134 |
-
# prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>the cat<|#visual#|>"]
|
| 135 |
-
# prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
|
| 136 |
-
# prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
|
| 137 |
-
|
| 138 |
-
encodings = tokenizer(
|
| 139 |
-
prompt,
|
| 140 |
-
padding="longest",
|
| 141 |
-
truncation=True,
|
| 142 |
-
return_tensors="pt",
|
| 143 |
-
max_length=2000,
|
| 144 |
-
)
|
| 145 |
-
input_ids = encodings["input_ids"]
|
| 146 |
-
attention_mask = encodings["attention_mask"]
|
| 147 |
-
# attention_mask[input_ids == prebox_token_id] = 0
|
| 148 |
-
image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
|
| 149 |
-
image_start_index_list = [[x] for x in image_start_index_list]
|
| 150 |
-
image_nums = [1] * len(input_ids)
|
| 151 |
-
vision_x = batch_images.cuda()
|
| 152 |
-
lang_x = input_ids.cuda()
|
| 153 |
-
attention_mask = attention_mask.cuda()
|
| 154 |
-
|
| 155 |
-
model.debug_id = 0
|
| 156 |
-
with torch.inference_mode() and torch.cuda.amp.autocast(dtype=torch.float16):
|
| 157 |
-
outputs = model(
|
| 158 |
-
vision_x=vision_x,
|
| 159 |
-
lang_x=lang_x,
|
| 160 |
-
attention_mask=attention_mask,
|
| 161 |
-
labels=None,
|
| 162 |
-
image_nums=image_nums,
|
| 163 |
-
image_start_index_list=image_start_index_list,
|
| 164 |
-
added_bbox_list=None,
|
| 165 |
-
add_box=False,
|
| 166 |
-
)
|
| 167 |
-
boxes = outputs["boxes"]
|
| 168 |
-
scores = outputs["scores"]
|
| 169 |
-
if len(scores) > 0:
|
| 170 |
-
box = boxes[scores.argmax()]
|
| 171 |
-
iou = get_iou(box, gt_box)
|
| 172 |
-
else:
|
| 173 |
-
iou = 0.0
|
| 174 |
-
# tqdm.write(f"output: {tokenizer.batch_decode(outputs)}")
|
| 175 |
-
tqdm.write(f"no output for: {uniq_id}, {image_id}, {text}")
|
| 176 |
-
if iou >= 0.5:
|
| 177 |
-
correct += 1
|
| 178 |
-
pbar.set_description(f"iou: {iou:.2f} score: {correct / total:.4f}")
|
| 179 |
-
# open_cv_image = np.array(image)
|
| 180 |
-
# # Convert RGB to BGR
|
| 181 |
-
# open_cv_image = open_cv_image[:, :, ::-1].copy()
|
| 182 |
-
# for box, score in zip(boxes, scores):
|
| 183 |
-
# open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
|
| 184 |
-
# cv2.imwrite("output.jpg", open_cv_image)
|
| 185 |
-
# print(boxes)
|
| 186 |
-
# print(scores)
|
| 187 |
-
# exit()
|
| 188 |
|
| 189 |
|
| 190 |
def generate(
|
|
|
|
| 70 |
return outputs
|
| 71 |
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
def generate(
|
multimodal/setup.py
CHANGED
|
@@ -33,6 +33,7 @@ if __name__ == "__main__":
|
|
| 33 |
"inflection",
|
| 34 |
"sentencepiece",
|
| 35 |
"open_clip_torch",
|
|
|
|
| 36 |
]
|
| 37 |
|
| 38 |
setup(
|
|
|
|
| 33 |
"inflection",
|
| 34 |
"sentencepiece",
|
| 35 |
"open_clip_torch",
|
| 36 |
+
"opencv-python"
|
| 37 |
]
|
| 38 |
|
| 39 |
setup(
|