init project
- app.py +25 -14
- modules/pe3r/models.py +3 -3

app.py
@@ -37,10 +37,12 @@ from modules.mobilesamv2.utils.transforms import ResizeLongestSide
 from modules.pe3r.models import Models
 import torchvision.transforms as tvf
 
+from transformers import AutoTokenizer, AutoModel, AutoProcessor
+
 silent = False
-device = 'cuda' if torch.cuda.is_available() else 'cpu' #'cpu' #
-pe3r = Models(device)
-print(device)
+# device = 'cuda' if torch.cuda.is_available() else 'cpu' #'cpu' #
+pe3r = Models('cpu') #
+# print(device)
 
 def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
                                  cam_color=None, as_pointcloud=False,
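Note: this hunk switches the app to the standard ZeroGPU pattern. On Hugging Face ZeroGPU Spaces no CUDA device exists at import time, so everything is built on CPU (`Models('cpu')`) and only moved to the GPU inside `@spaces.GPU`-decorated handlers. A minimal sketch of the pattern, with a stand-in model in place of `Models`:

    import torch
    import spaces  # ZeroGPU helper available on Hugging Face Spaces

    model = torch.nn.Linear(4, 2)  # built on CPU at import time, like Models('cpu')

    @spaces.GPU(duration=60)  # a GPU is attached only while this handler runs
    def predict(x: torch.Tensor) -> torch.Tensor:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model.to(device)  # move the weights lazily, per call
        with torch.no_grad():
            return model(x.to(device)).cpu()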
@@ -245,7 +247,9 @@ def slerp_multiple(vectors, t_values):
 @torch.no_grad
 def get_mask_from_img_sam1(sam1_image, yolov8_image, original_size, input_size, transform):
 
-
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    pe3r.yolov8.to(device)
+    pe3r.mobilesamv2.to(device)
 
     sam_mask=[]
     img_area = original_size[0] * original_size[1]
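Note: `nn.Module.to(device)` moves the parameters in place and returns the module, so these per-call moves are idempotent: after the first GPU invocation the weights already live on the device and later calls copy nothing. Illustration:

    import torch

    net = torch.nn.Linear(8, 8)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net.to(device)  # moves parameters in place (and returns the same module)
    net.to(device)  # already on the device: effectively a no-op
    y = net(torch.randn(1, 8, device=device))  # inputs must be on the same device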
@@ -301,7 +305,10 @@ def get_mask_from_img_sam1(sam1_image, yolov8_image, original_size, input_size, transform):
 @torch.no_grad
 def get_cog_feats(images):
 
-
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    pe3r.sam2.to(device)
+    siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+    siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")
 
     cog_seg_maps = []
     rev_cog_seg_maps = []
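Note: loading SigLIP inside the handler re-instantiates it on every call. `from_pretrained` reads from the local Hugging Face cache after the first download, but still pays model construction each time; if that ever matters, a memoized loader is one alternative (a sketch, not what this commit does):

    from functools import lru_cache
    from transformers import AutoModel, AutoProcessor

    @lru_cache(maxsize=1)
    def load_siglip(device: str):
        # Constructed once per process; later calls reuse the same objects.
        model = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
        processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")
        return model, processor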
@@ -395,10 +402,10 @@ def get_cog_feats(images):
     seg_imgs = np.stack(seg_img_list, axis=0) # b,H,W,3
     seg_imgs = torch.from_numpy(seg_imgs).permute(0,3,1,2) # / 255.0
 
-    inputs = pe3r.siglip_processor(images=seg_imgs, return_tensors="pt")
+    inputs = siglip_processor(images=seg_imgs, return_tensors="pt")
     inputs = {key: value.to(device) for key, value in inputs.items()}
 
-    image_features = pe3r.siglip.get_image_features(**inputs)
+    image_features = siglip.get_image_features(**inputs)
     image_features = image_features / image_features.norm(dim=-1, keepdim=True)
     image_features = image_features.detach().cpu()
 
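Note: dividing by `.norm(dim=-1, keepdim=True)` makes each feature row unit-length, so a plain dot product between rows is a cosine similarity; that is presumably how these per-segment features are compared against text features later. For example:

    import torch

    feats = torch.randn(5, 1024)                      # one row per segment; width illustrative
    feats = feats / feats.norm(dim=-1, keepdim=True)  # unit-length rows
    sim = feats @ feats.T                             # cosine similarities in [-1, 1]
    print(sim.diag())                                 # ~1.0 on the diagonal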
@@ -438,7 +445,7 @@ def get_cog_feats(images):
     return cog_seg_maps, rev_cog_seg_maps, multi_view_clip_feats
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=60)
 def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
                             as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
                             scenegraph_type, winsize, refid):
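Note: on ZeroGPU, `duration=60` requests up to 60 seconds of GPU time for each call to `get_reconstructed_scene`; a run that exceeds the requested duration is aborted, so the optimization settings (`niter`, number of input images) have to fit that budget, or the value has to be raised.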
@@ -447,7 +454,9 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
     then run get_3D_model_from_scene
     """
 
-
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    pe3r.mast3r.to(device)
 
     if len(filelist) < 2:
         raise gradio.Error("Please input at least 2 images.")
@@ -505,22 +514,24 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
     outfile = get_3D_model_from_scene(outdir, scene, min_conf_thr, as_pointcloud, mask_sky,
                                       clean_depth, transparent_cams, cam_size)
 
-    scene.to('cpu')
+    # scene.to('cpu')
     torch.cuda.empty_cache()
 
     return scene, outfile
 
-@spaces.GPU(duration=
+# @spaces.GPU(duration=60)
 def get_3D_object_from_scene(outdir, text, threshold, scene, min_conf_thr, as_pointcloud,
                              mask_sky, clean_depth, transparent_cams, cam_size):
 
-
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
+    siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
 
     texts = [text]
-    inputs = pe3r.siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
+    inputs = siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
     inputs = {key: value.to(device) for key, value in inputs.items()}
     with torch.no_grad():
-        text_feats = pe3r.siglip.get_text_features(**inputs)
+        text_feats = siglip.get_text_features(**inputs)
         text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
     scene.render_image(text_feats, threshold)
     scene.ori_imgs = scene.rendered_imgs
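Note: `padding="max_length"` matters for SigLIP: the model was trained on text padded to a fixed length, and the Transformers docs recommend tokenizing the same way at inference. With both text and image features unit-normalized, `scene.render_image(text_feats, threshold)` presumably keeps the parts of the scene whose cosine similarity to the query exceeds `threshold`; the core matching step looks like this (a sketch with hypothetical names, not the actual `scene` internals):

    import torch
    import torch.nn.functional as F

    def match_by_text(image_feats: torch.Tensor, text_feat: torch.Tensor, threshold: float) -> torch.Tensor:
        # image_feats: (N, D) unit rows; text_feat: (1, D) unit row.
        sim = image_feats @ text_feat.T     # (N, 1) cosine similarities
        return sim.squeeze(-1) > threshold  # boolean mask over the N items

    feats = F.normalize(torch.randn(10, 1024), dim=-1)
    query = F.normalize(torch.randn(1, 1024), dim=-1)
    mask = match_by_text(feats, query, threshold=0.2)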
modules/pe3r/models.py
@@ -47,6 +47,6 @@ class Models:
         self.yolov8 = ObjectAwareModel(YOLO8_CKP)
 
         # -- siglip --
-        self.siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
-        self.siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
-        self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")
+        # self.siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+        # self.siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
+        # self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")