Spaces:

flax-community
/

DietNerf-Demo

Build error

App Files Files Community

alexlau commited on Jul 20, 2021

Commit

4ef918b

1 Parent(s): b87426b

fix pose rendering

Browse files

Files changed (1) hide show

jaxnerf/nerf/clip_utils.py +43 -59

jaxnerf/nerf/clip_utils.py CHANGED Viewed

@@ -9,38 +9,53 @@ import jax.numpy as jnp
 import numpy as np
 from transformers import FlaxCLIPModel
 FLAGS = flags.FLAGS
-# import jmp
-# my_policy = jmp.Policy(compute_dtype=np.float16,
-#                        param_dtype=np.float16,
-#                        output_dtype=np.float16)
 @partial(jax.jit, static_argnums=[0, 1])
-def update_semantic_loss(model, clip_model, rng, state, batch, lr):
     # the batch is without shard
     random_rays = batch["random_rays"]
-    #rng, key_0, key_1 = rng
     rng, key_0, key_1 = random.split(rng,3)
     def semantic_loss(variables):
-        # TODO @Alex: (alt) sample less along a ray/ sample on a strided grid (make change on model call)
-        # TODO @Alex: (alt) apply mixed precision
-        src_ret = model.apply(variables, key_0, key_1, random_rays, False)
-        src_image, _, _ = src_ret[-1]
         # reshape flat pixel to an image (assume 3 channels & square shape)
         w = int(math.sqrt(src_image.shape[0]))
-        src_image = src_image.reshape([-1, w, w, 3]).transpose(0, 3, 1, 2)
-        src_image = preprocess_for_CLIP(src_image)
-        src_embedding = clip_model.get_image_features(pixel_values=src_image)
         src_embedding /= jnp.linalg.norm(src_embedding, axis=-1, keepdims=True)
         src_embedding = jnp.array(src_embedding)
         target_embedding = batch["embedding"]
-        sc_loss = 0.5 * FLAGS.sc_loss_mult * jnp.sum((src_embedding - target_embedding) ** 2) / src_embedding.shape[0]
-        return sc_loss * 1e-2
-    sc_loss, grad = jax.value_and_grad(semantic_loss)(jax.device_get(jax.tree_map(lambda x:x[0], state)).optimizer.target)
-    return sc_loss, grad
 def trans_t(t):
     return jnp.array([
@@ -49,39 +64,34 @@ def trans_t(t):
         [0, 0, 1, t],
         [0, 0, 0, 1]], dtype=jnp.float32)
 def rot_phi(phi):
     return jnp.array([
         [1, 0, 0, 0],
-        [0, jnp.cos(phi), -np.sin(phi), 0],
-        [0, jnp.sin(phi), jnp.cos(phi), 0],
         [0, 0, 0, 1]], dtype=jnp.float32)
 def rot_theta(th):
     return jnp.array([
-        [np.cos(th), 0, -np.sin(th), 0],
         [0, 1, 0, 0],
-        [np.sin(th), 0, jnp.cos(th), 0],
         [0, 0, 0, 1]], dtype=jnp.float32)
-def pose_spherical(theta, phi, radius):
     c2w = trans_t(radius)
-    c2w = rot_phi(phi / 180. * jnp.pi) @ c2w
-    c2w = rot_theta(theta / 180. * jnp.pi) @ c2w
     c2w = jnp.array([[-1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) @ c2w
     return c2w
 def random_pose(rng, bds):
     rng, *rng_inputs = jax.random.split(rng, 3)
     radius = random.uniform(rng_inputs[1], minval=bds[0], maxval=bds[1])
-    theta = random.uniform(rng_inputs[1], minval=0, maxval=2 * jnp.pi)
-    phi = random.uniform(rng_inputs[1], minval=0, maxval=np.pi / 2)
     return pose_spherical(radius, theta, phi)
 def preprocess_for_CLIP(image):
     """
     jax-based preprocessing for CLIP
@@ -89,14 +99,12 @@ def preprocess_for_CLIP(image):
     return [B, 3, 224, 224]: pre-processed image for CLIP
     """
     B, D, H, W = image.shape
-    image = jax.image.resize(image, (B, D, 224, 224), 'bicubic')  # assume that images have rectangle shape.
     mean = jnp.array([0.48145466, 0.4578275, 0.40821073]).reshape(1, 3, 1, 1)
     std = jnp.array([0.26862954, 0.26130258, 0.27577711]).reshape(1, 3, 1, 1)
     image = (image - mean.astype(image.dtype)) / std.astype(image.dtype)
     return image
-# TODO @Alex: VisionModel v.s. original CLIP? (differ by a projection matrix)
 def init_CLIP(dtype: str, model_name: Optional[str]) -> FlaxCLIPModel:
     if dtype == 'float16':
         dtype = jnp.float16
@@ -108,27 +116,3 @@ def init_CLIP(dtype: str, model_name: Optional[str]) -> FlaxCLIPModel:
     if model_name is None:
         model_name = 'openai/clip-vit-base-patch32'
     return FlaxCLIPModel.from_pretrained(model_name, dtype=dtype)
-# def SC_loss(rng_inputs, model, params, bds, rays, N_samples, target_emb, CLIP_model, l):
-#     """
-#     target_emb [1, D]: pre-computed target embedding vector \phi(I)
-#     source_img [1, 3, H, W]: source image \hat{I}
-#     l: loss weight lambda
-#     return: SC_loss
-#     """
-#     # _,H,W,D = rays.shape
-#     rng_inputs, model, params, bds, rays, N_samples, target_emb, CLIP_model, l = my_policy.cast_to_compute(
-#         (rng_inputs, model, params, bds, rays, N_samples, target_emb, CLIP_model, l))
-#     _, H, W, _ = rays.shape
-#     source_img = jnp.clip(render_fn(rng_inputs, model, params, None,
-#                                    np.reshape(rays, (2, -1, 3)),
-#                                    bds[0], bds[1], 1, rand=False),
-#                          0, 1)
-#     # source_img = np.clip(render_rays(rng_inputs, model, params, None, np.reshape(rays, (2, -1, 3)), bds[0], bds[1], 1, rand=False), 0, 1)
-#     source_img = np.reshape(source_img, [1, H, W, 3]).transpose(0, 3, 1, 2)
-#     source_img = preprocess_for_CLIP(source_img)
-#     source_emb = CLIP_model.get_image_features(pixel_values=source_img)
-#     source_emb /= np.linalg.norm(source_emb, axis=-1, keepdims=True)
-#     return l/2 * (np.sum((source_emb - target_emb) ** 2) / source_emb.shape[0])

 import numpy as np
 from transformers import FlaxCLIPModel
+from nerf import utils
 FLAGS = flags.FLAGS
+@partial(jax.jit, static_argnums=[0])
+def semantic_loss(clip_model, src_image, target_embedding):
+    src_image = utils.unshard(src_image)
+    w = int(math.sqrt(src_image.size//3))
+    src_image = src_image.reshape([w, w, 3])
+    src_embedding = clip_model.get_image_features(pixel_values=preprocess_for_CLIP(jnp.expand_dims(src_image,0).transpose(0, 3, 1, 2)))
+    src_embedding /= jnp.linalg.norm(src_embedding, axis=-1, keepdims=True)
+    src_embedding = jnp.array(src_embedding)
+    sc_loss = 0.5 * jnp.sum((src_embedding - target_embedding) ** 2) / src_embedding.shape[0]
+    return sc_loss, src_image
+def semantic_step_multi(render_pfn, clip_model, rng, state, batch, lr):
+    random_rays = jax.tree_map(lambda x: utils.shard(x).astype(jnp.float16), batch["random_rays"])
+    target_embedding = batch["embedding"].astype(jnp.float16)
+    rng, key_0, key_1 = random.split(rng,3)
+    def loss_fn(variables):
+        src_image = render_pfn(variables, key_0, key_1, random_rays)
+        sc_loss, src_image = semantic_loss(clip_model, src_image, target_embedding)
+        return sc_loss * FLAGS.sc_loss_mult, src_image
+    (sc_loss, src_image), grad = jax.value_and_grad(loss_fn, has_aux = True)(jax.device_get(jax.tree_map(lambda x:x[0].astype(jnp.float16), state)).optimizer.target)
+    return sc_loss, grad, src_image
 @partial(jax.jit, static_argnums=[0, 1])
+def semantic_step_single(model, clip_model, rng, state, batch, lr):
     # the batch is without shard
     random_rays = batch["random_rays"]
     rng, key_0, key_1 = random.split(rng,3)
     def semantic_loss(variables):
+        src_image = model.apply(variables, key_0, key_1, random_rays, False, rgb_only = True)
         # reshape flat pixel to an image (assume 3 channels & square shape)
         w = int(math.sqrt(src_image.shape[0]))
+        src_image = src_image.reshape([w, w, 3])
+        src_embedding = clip_model.get_image_features(pixel_values=preprocess_for_CLIP(jnp.expand_dims(src_image,0).transpose(0, 3, 1, 2)))
         src_embedding /= jnp.linalg.norm(src_embedding, axis=-1, keepdims=True)
         src_embedding = jnp.array(src_embedding)
         target_embedding = batch["embedding"]
+        sc_loss = 0.5 * jnp.sum((src_embedding - target_embedding) ** 2) / src_embedding.shape[0]
+        return sc_loss * FLAGS.sc_loss_mult, src_image
+    (sc_loss, src_image), grad = jax.value_and_grad(semantic_loss, has_aux = True)(jax.device_get(jax.tree_map(lambda x:x[0], state)).optimizer.target)
+    return sc_loss, grad, src_image
 def trans_t(t):
     return jnp.array([
         [0, 0, 1, t],
         [0, 0, 0, 1]], dtype=jnp.float32)
 def rot_phi(phi):
     return jnp.array([
         [1, 0, 0, 0],
+        [0, jnp.cos(phi), jnp.sin(phi), 0],
+        [0,-jnp.sin(phi), jnp.cos(phi), 0],
         [0, 0, 0, 1]], dtype=jnp.float32)
 def rot_theta(th):
     return jnp.array([
+        [jnp.cos(th), 0,-jnp.sin(th), 0],
         [0, 1, 0, 0],
+        [jnp.sin(th), 0, jnp.cos(th), 0],
         [0, 0, 0, 1]], dtype=jnp.float32)
+def pose_spherical(radius, theta, phi):
     c2w = trans_t(radius)
+    c2w = rot_phi(phi) @ c2w
+    c2w = rot_theta(theta) @ c2w
     c2w = jnp.array([[-1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) @ c2w
     return c2w
 def random_pose(rng, bds):
     rng, *rng_inputs = jax.random.split(rng, 3)
     radius = random.uniform(rng_inputs[1], minval=bds[0], maxval=bds[1])
+    theta = random.uniform(rng_inputs[1], minval=-jnp.pi, maxval=jnp.pi)
+    phi = random.uniform(rng_inputs[1], minval=0, maxval=jnp.pi/2)
     return pose_spherical(radius, theta, phi)
 def preprocess_for_CLIP(image):
     """
     jax-based preprocessing for CLIP
     return [B, 3, 224, 224]: pre-processed image for CLIP
     """
     B, D, H, W = image.shape
     mean = jnp.array([0.48145466, 0.4578275, 0.40821073]).reshape(1, 3, 1, 1)
     std = jnp.array([0.26862954, 0.26130258, 0.27577711]).reshape(1, 3, 1, 1)
+    image = jax.image.resize(image, (B, D, 224, 224), 'bicubic')  # assume that images have rectangle shape.
     image = (image - mean.astype(image.dtype)) / std.astype(image.dtype)
     return image
 def init_CLIP(dtype: str, model_name: Optional[str]) -> FlaxCLIPModel:
     if dtype == 'float16':
         dtype = jnp.float16
     if model_name is None:
         model_name = 'openai/clip-vit-base-patch32'
     return FlaxCLIPModel.from_pretrained(model_name, dtype=dtype)