xizaoqu committed on
Commit
cef86dc
·
1 Parent(s): db555c7
Files changed (1) hide show
  1. app.py +90 -57
app.py CHANGED
@@ -71,10 +71,10 @@ KEY_TO_ACTION = {
71
  }
72
 
73
  example_images = [
74
- ["1", "assets/ice_plains.png", "turn right+go backward+look up+turn left+look down+turn right+go forward+turn left", 20, 3, 8],
75
- ["2", "assets/place.png", "put item+go backward+put item+go backward+go around", 20, 3, 8],
76
- ["3", "assets/rain_sunflower_plains.png", "turn right+look up+turn right+look down+turn left+go backward+turn left", 20, 3, 8],
77
- ["4", "assets/desert.png", "turn 360 degree+turn right+go forward+turn left", 20, 3, 8],
78
  ]
79
 
80
  def load_custom_checkpoint(algo, checkpoint_path):
@@ -264,10 +264,18 @@ def generate(keys, input_history, memory_frames, self_frames, self_actions, self
264
 
265
  memory_frames = np.concatenate([memory_frames, new_frame[:,0]])
266
 
267
- out_video = memory_frames.transpose(0,2,3,1)
 
268
  out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
269
  out_video = (out_video * 255).astype(np.uint8)
270
 
 
 
 
 
 
 
 
271
  temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
272
  save_video(out_video, temporal_video_path)
273
  input_history += keys
@@ -289,7 +297,7 @@ def generate(keys, input_history, memory_frames, self_frames, self_actions, self
289
 
290
  # np.savez(os.path.join(folder_path, "data_bundle.npz"), **data_dict)
291
 
292
- return out_video[-1], temporal_video_path, input_history, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx
293
 
294
  def reset(selected_image):
295
  self_frames = None
@@ -381,6 +389,24 @@ with gr.Blocks(css=css) as demo:
381
  """
382
  )
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  # <div style="text-align: center;">
385
  # <!-- Public Website -->
386
  # <a style="display:inline-block" href="https://nirvanalan.github.io/projects/GA/">
@@ -403,25 +429,50 @@ with gr.Blocks(css=css) as demo:
403
  # </a>
404
  # </div>
405
 
406
- example_actions = {"turn left + turn right": "AAAAAAAAAAAADDDDDDDDDDDD",
407
  "turn 360 degree": "AAAAAAAAAAAAAAAAAAAAAAAA",
408
- "turn right+go backward+look up+turn left+look down": "DDDDDDDDEEEEEEEEEESSSAAAAAAAAWWW",
409
- "turn right+go forward+turn right": "DDDDDDDDDDDDQQQQQQQQQQQQQQQDDDDDDDDDDDD",
410
- "turn right+look up+turn right+look down": "DDDDWWWDDDDDDDDDDDDDDDDDDDDSSS",
411
- "put item+go backward+put item+go backward":"SSUNNWWEEEEEEEEEAAASSUNNWWEEEEEEEEE"}
412
 
413
  selected_image = gr.State(ICE_PLAINS_IMAGE)
414
 
415
  with gr.Row(variant="panel"):
416
- video_display = gr.Video(autoplay=True, loop=True)
417
- image_display = gr.Image(value=selected_image.value, interactive=False, label="Current Frame")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
 
420
  with gr.Row(variant="panel"):
421
  with gr.Column(scale=2):
422
- input_box = gr.Textbox(label="Action Sequence", placeholder="Enter action sequence here...", lines=1, max_lines=1)
423
- log_output = gr.Textbox(label="History Log", interactive=False)
424
- gr.Markdown("### Action sequence examples.")
 
 
 
 
 
 
 
 
 
 
 
425
  with gr.Row():
426
  buttons = []
427
  for action_key in list(example_actions.keys())[:2]:
@@ -437,11 +488,28 @@ with gr.Blocks(css=css) as demo:
437
  buttons.append(gr.Button(action_key))
438
 
439
  with gr.Column(scale=1):
440
- slider_denoising_step = gr.Slider(minimum=10, maximum=50, value=worldmem.sampling_timesteps, step=1, label="Denoising Steps")
441
- slider_context_length = gr.Slider(minimum=2, maximum=10, value=worldmem.n_tokens, step=1, label="Context Length")
442
- slider_memory_length = gr.Slider(minimum=4, maximum=16, value=worldmem.condition_similar_length, step=1, label="Memory Length")
443
- submit_button = gr.Button("Generate")
444
- reset_btn = gr.Button("Reset")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
 
446
  sampling_timesteps_state = gr.State(worldmem.sampling_timesteps)
447
  sampling_context_length_state = gr.State(worldmem.n_tokens)
@@ -457,24 +525,12 @@ with gr.Blocks(css=css) as demo:
457
  def set_action(action):
458
  return action
459
 
460
- # gr.Markdown("### Action sequence examples.")
461
 
462
 
463
  for button, action_key in zip(buttons, list(example_actions.keys())):
464
  button.click(set_action, inputs=[gr.State(value=example_actions[action_key])], outputs=input_box)
465
 
466
-
467
- gr.Markdown("### Click on the images below to reset the sequence and generate from the new image.")
468
-
469
- with gr.Row():
470
- image_display_1 = gr.Image(value=SUNFLOWERS_IMAGE, interactive=False, label="Sunflower Plains")
471
- image_display_2 = gr.Image(value=DESERT_IMAGE, interactive=False, label="Desert")
472
- image_display_3 = gr.Image(value=SAVANNA_IMAGE, interactive=False, label="Savanna")
473
- image_display_4 = gr.Image(value=ICE_PLAINS_IMAGE, interactive=False, label="Ice Plains")
474
- image_display_5 = gr.Image(value=SUNFLOWERS_RAIN_IMAGE, interactive=False, label="Rainy Sunflower Plains")
475
- image_display_6 = gr.Image(value=PLACE_IMAGE, interactive=False, label="Place")
476
-
477
- gr.Markdown("### Click the examples below for a quick review, and continue generating based on them.")
478
 
479
  example_case = gr.Textbox(label="Case", visible=False)
480
  image_output = gr.Image(visible=False)
@@ -499,29 +555,6 @@ with gr.Blocks(css=css) as demo:
499
  )
500
 
501
 
502
-
503
- gr.Markdown(
504
- """
505
- ## Instructions & Notes:
506
-
507
- 1. Enter an action sequence in the **"Action Sequence"** text box and click **"Generate"** to begin.
508
- 2. You can continue generation by clicking **"Generation"** again and again. Previous sequences are logged in the history panel.
509
- 3. Click **"Reset"** to clear the current sequence and start fresh.
510
- 4. Action sequences can be composed using the following keys:
511
- - W: turn up
512
- - S: turn down
513
- - A: turn left
514
- - D: turn right
515
- - Q: move forward
516
- - E: move backward
517
- - N: no-op (do nothing)
518
- - U: use item
519
- 5. Higher denoising steps produce more detailed results but take longer. 20 steps is a good balance between quality and speed. The same applies to context and memory length.
520
- 6. For faster performance, we recommend running the demo locally (~1s/frame on H100 vs ~5s on Spaces).
521
- 7. If you find this project interesting or useful, please consider giving it a ⭐️ on [GitHub]()!
522
- 8. For feedback or suggestions, feel free to open a GitHub issue or contact me directly at **[email protected]**.
523
- """
524
- )
525
  # input_box.submit(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
526
  submit_button.click(generate, inputs=[input_box, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx], outputs=[image_display, video_display, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
527
  reset_btn.click(reset, inputs=[selected_image], outputs=[log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
 
71
  }
72
 
73
  example_images = [
74
+ ["1", "assets/ice_plains.png", "turn rightgo backwardโ†’look upโ†’turn leftโ†’look downโ†’turn rightโ†’go forwardโ†’turn left", 20, 3, 8],
75
+ ["2", "assets/place.png", "put itemโ†’go backwardโ†’put itemโ†’go backwardโ†’go around", 20, 3, 8],
76
+ ["3", "assets/rain_sunflower_plains.png", "turn rightโ†’look upโ†’turn rightโ†’look downโ†’turn leftโ†’go backwardโ†’turn left", 20, 3, 8],
77
+ ["4", "assets/desert.png", "turn 360 degreeโ†’turn rightโ†’go forwardโ†’turn left", 20, 3, 8],
78
  ]
79
 
80
  def load_custom_checkpoint(algo, checkpoint_path):
 
264
 
265
  memory_frames = np.concatenate([memory_frames, new_frame[:,0]])
266
 
267
+
268
+ out_video = memory_frames.transpose(0,2,3,1).copy()
269
  out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
270
  out_video = (out_video * 255).astype(np.uint8)
271
 
272
+ last_frame = out_video[-1].copy()
273
+ border_thickness = 2
274
+ out_video[-len(new_frame):, :border_thickness, :, :] = [255, 0, 0]
275
+ out_video[-len(new_frame):, -border_thickness:, :, :] = [255, 0, 0]
276
+ out_video[-len(new_frame):, :, :border_thickness, :] = [255, 0, 0]
277
+ out_video[-len(new_frame):, :, -border_thickness:, :] = [255, 0, 0]
278
+
279
  temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
280
  save_video(out_video, temporal_video_path)
281
  input_history += keys
 
297
 
298
  # np.savez(os.path.join(folder_path, "data_bundle.npz"), **data_dict)
299
 
300
+ return last_frame, temporal_video_path, input_history, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx
301
 
302
  def reset(selected_image):
303
  self_frames = None
 
389
  """
390
  )
391
 
392
+ gr.Markdown(
393
+ """
394
+ ## 🚀 How to Explore WorldMem
395
+
396
+ Follow these simple steps to get started:
397
+
398
+ 1. **Choose a scene**.
399
+ 2. **Input your action sequence**.
400
+ 3. **Click "Generate"**.
401
+
402
+ - You can continuously click **"Generate"** to **extend the video** and observe how well the world maintains consistency over time.
403
+ - For best performance, we recommend **running locally** (1s/frame on H100) instead of Spaces (5s/frame).
404
+ - โญ๏ธ If you like this project, please [give it a star on GitHub]()!
405
+ - 💬 For questions or feedback, feel free to open an issue or email me at **[email protected]**.
406
+
407
+ Happy exploring! 🌍
408
+ """
409
+ )
410
  # <div style="text-align: center;">
411
  # <!-- Public Website -->
412
  # <a style="display:inline-block" href="https://nirvanalan.github.io/projects/GA/">
 
429
  # </a>
430
  # </div>
431
 
432
+ example_actions = {"turn leftโ†’turn right": "AAAAAAAAAAAADDDDDDDDDDDD",
433
  "turn 360 degree": "AAAAAAAAAAAAAAAAAAAAAAAA",
434
+ "turn rightโ†’go backwardโ†’look upโ†’turn leftโ†’look down": "DDDDDDDDEEEEEEEEEESSSAAAAAAAAWWW",
435
+ "turn rightโ†’go forwardโ†’turn right": "DDDDDDDDDDDDQQQQQQQQQQQQQQQDDDDDDDDDDDD",
436
+ "turn rightโ†’look upโ†’turn rightโ†’look down": "DDDDWWWDDDDDDDDDDDDDDDDDDDDSSS",
437
+ "put itemโ†’go backwardโ†’put itemโ†’go backward":"SSUNNWWEEEEEEEEEAAASSUNNWWEEEEEEEEE"}
438
 
439
  selected_image = gr.State(ICE_PLAINS_IMAGE)
440
 
441
  with gr.Row(variant="panel"):
442
+ with gr.Column():
443
+ gr.Markdown("๐Ÿ–ผ๏ธ Start from this frame.")
444
+ image_display = gr.Image(value=selected_image.value, interactive=False, label="Current Frame")
445
+ with gr.Column():
446
+ gr.Markdown("๐ŸŽž๏ธ Generated videos. New contents are marked in red box.")
447
+ video_display = gr.Video(autoplay=True, loop=True)
448
+
449
+ gr.Markdown("### ๐Ÿž๏ธ Choose a scene and start generation.")
450
+
451
+ with gr.Row():
452
+ image_display_1 = gr.Image(value=SUNFLOWERS_IMAGE, interactive=False, label="Sunflower Plains")
453
+ image_display_2 = gr.Image(value=DESERT_IMAGE, interactive=False, label="Desert")
454
+ image_display_3 = gr.Image(value=SAVANNA_IMAGE, interactive=False, label="Savanna")
455
+ image_display_4 = gr.Image(value=ICE_PLAINS_IMAGE, interactive=False, label="Ice Plains")
456
+ image_display_5 = gr.Image(value=SUNFLOWERS_RAIN_IMAGE, interactive=False, label="Rainy Sunflower Plains")
457
+ image_display_6 = gr.Image(value=PLACE_IMAGE, interactive=False, label="Place")
458
 
459
 
460
  with gr.Row(variant="panel"):
461
  with gr.Column(scale=2):
462
+ gr.Markdown("### ๐Ÿ•น๏ธ Input action sequences for interaction.")
463
+ input_box = gr.Textbox(label="Action Sequences", placeholder="Enter action sequences here, e.g. (AAAAAAAAAAAADDDDDDDDDDDD)", lines=1, max_lines=1)
464
+ log_output = gr.Textbox(label="History Sequences", interactive=False)
465
+ gr.Markdown(
466
+ """
467
+ ### 💡 Action Key Guide
468
+
469
+ <pre style="font-family: monospace; font-size: 14px; line-height: 1.6;">
470
+ W: Turn up S: Turn down A: Turn left D: Turn right
471
+ Q: Go forward E: Go backward N: No-op U: Use item
472
+ </pre>
473
+ """
474
+ )
475
+ gr.Markdown("### ๐Ÿ‘‡ Click to quickly set action sequence examples.")
476
  with gr.Row():
477
  buttons = []
478
  for action_key in list(example_actions.keys())[:2]:
 
488
  buttons.append(gr.Button(action_key))
489
 
490
  with gr.Column(scale=1):
491
+ submit_button = gr.Button("๐ŸŽฌ Generate!", variant="primary")
492
+ reset_btn = gr.Button("๐Ÿ”„ Reset")
493
+ gr.Markdown("<div style='flex-grow:1; height: 100px'></div>")
494
+
495
+ gr.Markdown("### โš™๏ธ Advanced Settings")
496
+
497
+ slider_denoising_step = gr.Slider(
498
+ minimum=10, maximum=50, value=worldmem.sampling_timesteps, step=1,
499
+ label="Denoising Steps",
500
+ info="Higher values yield better quality but slower speed"
501
+ )
502
+ slider_context_length = gr.Slider(
503
+ minimum=2, maximum=10, value=worldmem.n_tokens, step=1,
504
+ label="Context Length",
505
+ info="How many previous frames in temporal context window."
506
+ )
507
+ slider_memory_length = gr.Slider(
508
+ minimum=4, maximum=16, value=worldmem.condition_similar_length, step=1,
509
+ label="Memory Length",
510
+ info="How many previous frames in memory window."
511
+ )
512
+
513
 
514
  sampling_timesteps_state = gr.State(worldmem.sampling_timesteps)
515
  sampling_context_length_state = gr.State(worldmem.n_tokens)
 
525
  def set_action(action):
526
  return action
527
 
 
528
 
529
 
530
  for button, action_key in zip(buttons, list(example_actions.keys())):
531
  button.click(set_action, inputs=[gr.State(value=example_actions[action_key])], outputs=input_box)
532
 
533
+ gr.Markdown("### ๐Ÿ‘‡ Click to review generated examples, and continue generation based on them.")
 
 
 
 
 
 
 
 
 
 
 
534
 
535
  example_case = gr.Textbox(label="Case", visible=False)
536
  image_output = gr.Image(visible=False)
 
555
  )
556
 
557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  # input_box.submit(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
559
  submit_button.click(generate, inputs=[input_box, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx], outputs=[image_display, video_display, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
560
  reset_btn.click(reset, inputs=[selected_image], outputs=[log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])