import spaces
import gradio as gr
import torch
import os
import json
from transformers import AutoProcessor, AutoModelForVision2Seq
from qwen_vl_utils import process_vision_info
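# process_vision_info pulls the image/video entries out of the chat messages;
# with return_video_kwargs=True it also returns per-video processor kwargs
# such as the sampling fps.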

# Model configuration
MODEL_PATH = "nvidia/Cosmos-Reason1-7B"

# Role configurations
ROLES = {
    "General Assistant": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.",
    "Video Analyzer": """You are a helpful video analyzer. The goal is to identify artifacts and anomalies in the video. Watch carefully and focus on the following details:

* Physical accuracy (gravity, collision, object interaction, fluid dynamics, object permanence, etc.)
* Common sense
* Cause-and-effect
* Temporal consistency
* Spatial consistency
* Human motion
* Material and texture realism

Here are some examples of commonly found artifacts and anomalies:

* If objects penetrate each other, this indicates a failure in collision detection, object interaction, and physical accuracy.
* If hands penetrate each other, or hands pass through objects, this indicates a failure in collision detection, object interaction, and physical accuracy.
* If an object moves in an unexpected way or moves without any apparent reason, this suggests a failure in causality, object interaction, and physical accuracy.
* If an object suddenly flips or changes direction, this suggests a failure in temporal consistency.
* If an object suddenly appears or disappears, or the count of objects in the video suddenly changes, this suggests a failure in temporal consistency.
* If an object transforms or deforms halfway through the video, this suggests a failure in temporal consistency.
* If an object is used in a way that defies its intended purpose or normal function, this indicates a violation of common sense.
* If the liquid flows through a solid object, such as water flowing through a pan, this suggests a failure in physical accuracy and fluid dynamics.
* If a person's legs or arms suddenly switch positions in an impossible way, such as the left leg appearing where the right leg was just a moment ago, this suggests a failure in human motion and temporal consistency.
* If a person's body suddenly morphs or changes shape, this suggests a failure in human motion and temporal consistency.
* If an object's texture, material or surface is unnaturally smooth, this suggests a failure in object surface reconstruction.

Here are some examples of non-artifacts you should not include in your analysis:

* An animated style, such as a cartoon, is not by itself an artifact.
* Avoid ungrounded and over-general explanations such as overall impression, artistic style, or background elements.
* The video has no sound. Avoid explanations based on sound.
* Do not mention lighting, shadows, blurring, or camera effects in your analysis.

Answer the question in English with provided options in the following format:
<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.""",
    "Custom Role": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>."
}

# Default configuration
default_config = {
    "attention_mode": "sdpa",
    "torch_dtype": "float16",
    "device_map": "auto",
    "trust_remote_code": True
}
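# Serialized with json.dump(..., indent=4), the file contents look like:
# {
#     "attention_mode": "sdpa",
#     "torch_dtype": "float16",
#     "device_map": "auto",
#     "trust_remote_code": true
# }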

# Load or create config file
config_file = "cosmos_reason1_config.json"
try:
    if not os.path.exists(config_file):
        with open(config_file, "w") as f:
            json.dump(default_config, f, indent=4)
        config = default_config
    else:
        with open(config_file, "r") as f:
            config = json.load(f)
except Exception as e:
    print(f"Warning: Could not load config file: {e}")
    print("Using default configuration")
    config = default_config

# Initialize the model with the saved configuration
try:
    model = AutoModelForVision2Seq.from_pretrained(
        MODEL_PATH,
        torch_dtype=getattr(torch, config["torch_dtype"]),
        attn_implementation=config["attention_mode"],  # e.g. "sdpa" or "flash_attention_2"
        device_map=config["device_map"],
        trust_remote_code=config["trust_remote_code"]
    )
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# Default generation parameters (initial values for the UI sliders below)
generation_config = {
    "temperature": 0.6,
    "top_p": 0.95,
    "repetition_penalty": 1.05,
    "max_new_tokens": 4096,
}
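# Note: generate() defaults to greedy decoding, so temperature and top_p only
# take effect when do_sample=True, which process_input() sets at call time.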

# Initialize the processor
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
except Exception as e:
    print(f"Error loading processor: {e}")
    raise

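# On Hugging Face Spaces, @spaces.GPU requests a (Zero)GPU worker for the
# duration of each call to the decorated function.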
@spaces.GPU
def process_input(image, video, text_prompt, temperature, top_p, repetition_penalty, max_tokens, role, custom_role_text):
    """Process the input and generate a response."""
    try:
        # Use custom role text if role is "Custom Role"
        role_prompt = custom_role_text if role == "Custom Role" else ROLES[role]
        
        messages = [
            {"role": "system", "content": role_prompt},
            {"role": "user", "content": []}
        ]
        
        # Add text prompt
        if text_prompt:
            messages[1]["content"].append({"type": "text", "text": text_prompt})
        
        # Add image if provided
        if image is not None:
            messages[1]["content"].append({"type": "image", "image": image})
        
        # Add video if provided
        if video is not None:
            messages[1]["content"].append({
                "type": "video",
                "video": video,
                "fps": 4,
            })
        
        # Process the prompt
        prompt = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        
        # Extract image/video tensors and per-video kwargs (e.g. fps) from the messages
        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
        
        # Prepare model inputs, forwarding the video kwargs so the requested fps is honored
        inputs = processor(
            text=prompt,
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
            **video_kwargs,
        ).to(model.device)
        
        # Build the generation config from the user-supplied parameters
        current_generation_config = {
            "temperature": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_tokens,
            "do_sample": True,  # temperature/top_p are ignored under greedy decoding
        }
        
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                **current_generation_config
            )
        
        # Decode only the newly generated tokens, dropping the echoed prompt
        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
        generated_text = processor.decode(generated_ids, skip_special_tokens=True)
        
        return generated_text, "✅ Generation completed successfully!"
    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()
        return f"Error processing input: {str(e)}", f"❌ Error occurred:\n{error_trace}"

def apply_config_changes(attention_mode, torch_dtype, device_map):
    """Apply configuration changes and save to file."""
    try:
        config = {
            "attention_mode": attention_mode,
            "torch_dtype": torch_dtype,
            "device_map": device_map,
            "trust_remote_code": True
        }
        
        with open(config_file, "w") as f:
            json.dump(config, f, indent=4)
        
        return "Configuration updated. Please restart the application for changes to take effect."
    except Exception as e:
        return f"Error updating configuration: {str(e)}"
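# The model and processor are loaded once at import time, so configuration
# changes saved above only take effect after the process restarts.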

# Create the Gradio interface
with gr.Blocks(title="Cosmos-Reason1", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Cosmos-Reason1")
    gr.Markdown("Upload an image or video and ask a question about it.")
    gr.Markdown(
        """
        [[Model]](https://huggingface.co/nvidia/Cosmos-Reason1-7B) | [[Code]](https://github.com/nvidia-cosmos/cosmos-reason1)
        """
    )
    
    # with gr.Accordion("Model Configuration", open=False):
    #     attention_mode = gr.Dropdown(
    #         choices=["sdpa", "xformers", "flash_attention_2"],
    #         value=config["attention_mode"],
    #         label="Attention Mode"
    #     )
    #     torch_dtype = gr.Dropdown(
    #         choices=["float16", "bfloat16", "float32"],
    #         value=config["torch_dtype"],
    #         label="Torch Data Type"
    #     )
    #     device_map = gr.Dropdown(
    #         choices=["auto", "cuda", "cpu"],
    #         value=config["device_map"],
    #         label="Device Map"
    #     )
    #     config_btn = gr.Button("Apply Configuration")
    #     config_msg = gr.Markdown()
        
    #     config_btn.click(
    #         fn=apply_config_changes,
    #         inputs=[attention_mode, torch_dtype, device_map],
    #         outputs=config_msg
    #     )
    
    
    with gr.Row():
        with gr.Column():
            role_selector = gr.Dropdown(
                choices=list(ROLES.keys()),
                value="General Assistant",
                label="Select Role"
            )
            
            custom_role_panel = gr.Group(visible=False)
            with custom_role_panel:
                custom_role_text = gr.Textbox(
                    label="Custom Role Instructions",
                    placeholder="Enter custom role instructions here...",
                    lines=10,
                    value=ROLES["Custom Role"]
                )
                apply_custom_role = gr.Button("Apply Custom Role")
                custom_role_status = gr.Markdown()
                
                def update_custom_role(text):
                    ROLES["Custom Role"] = text
                    return "Custom role updated successfully!"
                
                apply_custom_role.click(
                    fn=update_custom_role,
                    inputs=[custom_role_text],
                    outputs=[custom_role_status]
                )
            
            def toggle_custom_role(role):
                return gr.update(visible=(role == "Custom Role"))
            
            role_selector.change(
                fn=toggle_custom_role,
                inputs=[role_selector],
                outputs=[custom_role_panel]
            )
            
            image_input = gr.Image(label="Image Input", type="filepath")
            video_input = gr.Video(label="Video Input")
            text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image or video...")
            
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(0.1, 2.0, value=generation_config["temperature"], step=0.1, label="Temperature")
                top_p = gr.Slider(0.1, 1.0, value=generation_config["top_p"], step=0.05, label="Top P")
                repetition_penalty = gr.Slider(1.0, 2.0, value=generation_config["repetition_penalty"], step=0.05, label="Repetition Penalty")
                max_tokens = gr.Slider(64, 4096, value=generation_config["max_new_tokens"], step=64, label="Max Tokens")
            
            submit_btn = gr.Button("Submit")
        
        with gr.Column():
            output = gr.Textbox(label="Model Response", lines=10)
            status = gr.Markdown(label="Status")
    
    submit_btn.click(
        fn=process_input,
        inputs=[
            image_input,
            video_input,
            text_input,
            temperature,
            top_p,
            repetition_penalty,
            max_tokens,
            role_selector,
            custom_role_text
        ],
        outputs=[output, status]
    )

    # Example for image
    image_examples = [
        [
            "group_in_park.jpg",
            "What is happening in this image?"
        ]
    ]
    
    # Example for video
    video_examples = [
        [
            "car_curb_video.mp4",
            "What is wrong in this video?"
        ]
    ]
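    # The example media files are assumed to live next to app.py; Gradio
    # resolves these relative paths when the demo launches.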
    
    # Image example block
    gr.Examples(
        examples=image_examples,
        inputs=[image_input, text_input],
        label="Image Example: click to load then hit Submit"
    )
    
    # Video example block
    gr.Examples(
        examples=video_examples,
        inputs=[video_input, text_input],
        label="Video Example: click to load then hit Submit"
    )

if __name__ == "__main__":
    demo.launch()