init project
- app.py +10 -25
- modules/pe3r/models.py +3 -3
app.py
CHANGED
@@ -37,11 +37,9 @@ from modules.mobilesamv2.utils.transforms import ResizeLongestSide
 from modules.pe3r.models import Models
 import torchvision.transforms as tvf
 
-from transformers import AutoTokenizer, AutoModel, AutoProcessor, SamModel
-
 silent = False
-
-pe3r = Models(
+device = 'cpu' #'cuda' if torch.cuda.is_available() else 'cpu'
+pe3r = Models(device) #
 
 def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
                                  cam_color=None, as_pointcloud=False,
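Note: this hunk pins device selection to CPU at module level and constructs all models once at import time instead of inside each handler. A minimal sketch of how runtime device selection could be restored if GPU support is re-enabled, assuming (as new line 42 suggests) that Models takes the device string as its constructor argument:

import torch
from modules.pe3r.models import Models

# pick the best available device at startup instead of hard-coding 'cpu'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
pe3r = Models(device)  # every handler then reuses pe3r.<model> on that device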
@@ -246,9 +244,7 @@ def slerp_multiple(vectors, t_values):
 @torch.no_grad
 def get_mask_from_img_sam1(sam1_image, yolov8_image, original_size, input_size, transform):
 
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    pe3r.yolov8.to(device)
-    pe3r.mobilesamv2.to(device)
+    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
     sam_mask=[]
     img_area = original_size[0] * original_size[1]
@@ -304,14 +300,7 @@ def get_mask_from_img_sam1(sam1_image, yolov8_image, original_size, input_size,
 @torch.no_grad
 def get_cog_feats(images):
 
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    pe3r.sam2.to(device)
-
-    siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
-    siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256", device_map=device)
-
-    # pe3r.siglip_processor.to(device)
-    # pe3r.siglip.to(device)
+    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
     cog_seg_maps = []
     rev_cog_seg_maps = []
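Note: the deleted lines re-instantiated SigLIP inside a per-request function, and from_pretrained() re-reads the full checkpoint each time it runs. The commit hoists the load into the shared Models instance (see the models.py hunk below). A hedged sketch of the load-once pattern; embed_images is a hypothetical helper, not code from the repo:

from transformers import AutoModel

# load once at startup, not inside a request handler
siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256")

def embed_images(inputs):
    # every call reuses the already-resident weights
    return siglip.get_image_features(**inputs)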
@@ -405,10 +394,10 @@ def get_cog_feats(images):
     seg_imgs = np.stack(seg_img_list, axis=0) # b,H,W,3
     seg_imgs = torch.from_numpy(seg_imgs).permute(0,3,1,2) # / 255.0
 
-    inputs = siglip_processor(images=seg_imgs, return_tensors="pt")
+    inputs = pe3r.siglip_processor(images=seg_imgs, return_tensors="pt")
     inputs = {key: value.to(device) for key, value in inputs.items()}
 
-    image_features = siglip.get_image_features(**inputs)
+    image_features = pe3r.siglip.get_image_features(**inputs)
     image_features = image_features / image_features.norm(dim=-1, keepdim=True)
     image_features = image_features.detach().cpu()
 
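For reference, a self-contained sketch of the image-embedding pattern this hunk switches to (model name taken from the diff; the blank test image is a placeholder):

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("google/siglip-large-patch16-256")
processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")

image = Image.new("RGB", (256, 256))  # placeholder input
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    feats = model.get_image_features(**inputs)
feats = feats / feats.norm(dim=-1, keepdim=True)  # unit-normalize, as in the diff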
@@ -457,8 +446,7 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
     then run get_3D_model_from_scene
     """
 
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    pe3r.mast3r.to(device)
+    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
     if len(filelist) < 2:
         raise gradio.Error("Please input at least 2 images.")
@@ -523,16 +511,13 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
 def get_3D_object_from_scene(outdir, text, threshold, scene, min_conf_thr, as_pointcloud,
                              mask_sky, clean_depth, transparent_cams, cam_size):
 
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    # pe3r.siglip_tokenizer.to(device)
-    siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
-    siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
     texts = [text]
-    inputs = siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
+    inputs = pe3r.siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
     inputs = {key: value.to(device) for key, value in inputs.items()}
     with torch.no_grad():
-        text_feats =siglip.get_text_features(**inputs)
+        text_feats =pe3r.siglip.get_text_features(**inputs)
     text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
     scene.render_image(text_feats, threshold)
     scene.ori_imgs = scene.rendered_imgs
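The text path mirrors the image path: tokenize with padding="max_length" (SigLIP was trained with fixed-length padding, so this setting matters), embed, and unit-normalize so that a dot product against the image embeddings gives cosine similarity. A hedged sketch; image_feats is a stand-in for the normalized features computed in get_cog_feats:

import torch
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("google/siglip-large-patch16-256")
tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")

inputs = tokenizer(text=["a photo of a chair"], padding="max_length", return_tensors="pt")
with torch.no_grad():
    text_feats = model.get_text_features(**inputs)
text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
# similarity = image_feats @ text_feats.T  # cosine similarity of unit vectors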
modules/pe3r/models.py
CHANGED
@@ -47,6 +47,6 @@ class Models:
         self.yolov8 = ObjectAwareModel(YOLO8_CKP)
 
         # -- siglip --
-
-
-
+        self.siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+        self.siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+        self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256", device_map=device)