import spaces
import gradio as gr
import torch
import os
import json
from transformers import AutoProcessor, AutoModelForVision2Seq
from qwen_vl_utils import process_vision_info
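# `spaces` supplies the Hugging Face ZeroGPU decorator: functions wrapped with
# @spaces.GPU are allocated a GPU only for the duration of each call.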
# Model configuration
MODEL_PATH = "nvidia/Cosmos-Reason1-7B"
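# Cosmos-Reason1-7B is NVIDIA's physical-AI reasoning VLM, built on the
# Qwen2.5-VL architecture (hence qwen_vl_utils for its vision preprocessing).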
# Role configurations
ROLES = {
"General Assistant": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.",
"Video Analyzer": """You are a helpful video analyzer. The goal is to identify artifacts and anomalies in the video. Watch carefully and focus on the following details:
* Physical accuracy (gravity, collision, object interaction, fluid dynamics, object permanence, etc.)
* Common sense
* Cause-and-effect
* Temporal consistency
* Spatial consistency
* Human motion
* Material and texture realism
Here are some examples of commonly found artifacts and anomalies:
* If objects penetrate each other, this indicates a failure in collision detection, object interaction, and physical accuracy.
* If hands penetrate each other, or hands pass through objects, this indicates a failure in collision detection, object interaction, and physical accuracy.
* If an object moves in an unexpected way or moves without any apparent reason, this suggests a failure in causality, object interaction, and physical accuracy.
* If an object suddenly flips or changes direction, this suggests a failure in temporal consistency.
* If an object suddenly appears or disappears, or the count of objects in the video suddenly changes, this suggests a failure in temporal consistency.
* If an object transforms or deforms halfway through the video, this suggests a failure in temporal consistency.
* If an object is used in a way that defies its intended purpose or normal function, this indicates a violation of common sense.
* If the liquid flows through a solid object, such as water flowing through a pan, this suggests a failure in physical accuracy and fluid dynamics.
* If a person's legs or arms suddenly switch positions in an impossible way, such as the left leg appearing where the right leg was just a moment ago, this suggests a failure in human motion and temporal consistency.
* If a person's body suddenly morphs or changes shape, this suggests a failure in human motion and temporal consistency.
* If an object's texture, material or surface is unnaturally smooth, this suggests a failure in object surface reconstruction.
Here are some examples of non-artifacts you should not include in your analysis:
* A video being animated, such as a cartoon, does not automatically make it an artifact.
* Avoid ungrounded and over-general explanations such as overall impression, artistic style, or background elements.
* The video has no sound. Avoid explanations based on sound.
* Do not mention lighting, shadows, blurring, or camera effects in your analysis.
Answer the question in English with the provided options in the following format:
<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.""",
"Custom Role": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>."
}
# Default configuration
default_config = {
"attention_mode": "sdpa",
"torch_dtype": "float16",
"device_map": "auto",
"trust_remote_code": True
}
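# Note: attention_mode is forwarded to transformers as attn_implementation when
# the model is loaded; transformers accepts "eager", "sdpa", and
# "flash_attention_2" ("xformers" is not a valid value there).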
# Load or create config file
config_file = "cosmos_reason1_config.json"
try:
if not os.path.exists(config_file):
with open(config_file, "w") as f:
json.dump(default_config, f, indent=4)
config = default_config
else:
with open(config_file, "r") as f:
config = json.load(f)
except Exception as e:
print(f"Warning: Could not load config file: {e}")
print("Using default configuration")
config = default_config
# Initialize the model with configuration
try:
    model = AutoModelForVision2Seq.from_pretrained(
        MODEL_PATH,
        torch_dtype=getattr(torch, config["torch_dtype"]),
        attn_implementation=config["attention_mode"],  # apply the configured attention backend
        device_map=config["device_map"],
        trust_remote_code=config["trust_remote_code"]
    )
except Exception as e:
print(f"Error loading model: {e}")
raise
# Default sampling parameters (do_sample=True is required for temperature and
# top_p to take effect; without it generate() falls back to greedy decoding)
generation_config = {
    "do_sample": True,
    "temperature": 0.6,
    "top_p": 0.95,
    "repetition_penalty": 1.05,
    "max_new_tokens": 4096,
}
# Initialize the processor
try:
processor = AutoProcessor.from_pretrained(MODEL_PATH)
except Exception as e:
print(f"Error loading processor: {e}")
raise
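# The processor bundles the tokenizer with the image/video preprocessor and
# provides the model's chat template used in process_input below.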
@spaces.GPU
def process_input(image, video, text_prompt, temperature, top_p, repetition_penalty, max_tokens, role, custom_role_text):
"""Process the input and generate a response."""
try:
# Use custom role text if role is "Custom Role"
role_prompt = custom_role_text if role == "Custom Role" else ROLES[role]
messages = [
{"role": "system", "content": role_prompt},
{"role": "user", "content": []}
]
# Add text prompt
if text_prompt:
messages[1]["content"].append({"type": "text", "text": text_prompt})
# Add image if provided
if image is not None:
messages[1]["content"].append({"type": "image", "image": image})
# Add video if provided
if video is not None:
messages[1]["content"].append({
"type": "video",
"video": video,
"fps": 4,
})
# Process the prompt
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
        # Extract image/video tensors; return_video_kwargs also returns
        # video metadata (e.g. fps) that the processor needs
        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
        # Prepare model inputs (the processor accepts None for absent modalities)
        inputs = processor(
            text=prompt,
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
            **video_kwargs
        ).to(model.device)
        # Override the defaults with the user-selected parameters
        current_generation_config = {
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_tokens,
        }
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                **current_generation_config
            )
        # Decode only the newly generated tokens, not the echoed prompt
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        generated_text = processor.decode(new_tokens, skip_special_tokens=True)
        return generated_text, "✅ Generation completed successfully!"
except Exception as e:
import traceback
error_trace = traceback.format_exc()
return f"Error processing input: {str(e)}", f"❌ Error occurred:\n{error_trace}"
def apply_config_changes(attention_mode, torch_dtype, device_map):
"""Apply configuration changes and save to file."""
try:
config = {
"attention_mode": attention_mode,
"torch_dtype": torch_dtype,
"device_map": device_map,
"trust_remote_code": True
}
with open(config_file, "w") as f:
json.dump(config, f, indent=4)
return "Configuration updated. Please restart the application for changes to take effect."
except Exception as e:
return f"Error updating configuration: {str(e)}"
# Create the Gradio interface
with gr.Blocks(title="Cosmos-Reason1", theme=gr.themes.Soft()) as demo:
gr.Markdown("# Cosmos-Reason1")
gr.Markdown("Upload an image or video and ask a question about it.")
gr.Markdown(
"""
[[Model]](https://huggingface.co/nvidia/Cosmos-Reason1-7B) | [[Code]](https://github.com/nvidia-cosmos/cosmos-reason1)
"""
)
# with gr.Accordion("Model Configuration", open=False):
# attention_mode = gr.Dropdown(
# choices=["sdpa", "xformers", "flash_attention_2"],
# value=config["attention_mode"],
# label="Attention Mode"
# )
# torch_dtype = gr.Dropdown(
# choices=["float16", "bfloat16", "float32"],
# value=config["torch_dtype"],
# label="Torch Data Type"
# )
# device_map = gr.Dropdown(
# choices=["auto", "cuda", "cpu"],
# value=config["device_map"],
# label="Device Map"
# )
# config_btn = gr.Button("Apply Configuration")
# config_msg = gr.Markdown()
# config_btn.click(
# fn=apply_config_changes,
# inputs=[attention_mode, torch_dtype, device_map],
# outputs=config_msg
# )
with gr.Row():
with gr.Column():
role_selector = gr.Dropdown(
choices=list(ROLES.keys()),
value="General Assistant",
label="Select Role"
)
custom_role_panel = gr.Group(visible=False)
with custom_role_panel:
custom_role_text = gr.Textbox(
label="Custom Role Instructions",
placeholder="Enter custom role instructions here...",
lines=10,
value=ROLES["Custom Role"]
)
apply_custom_role = gr.Button("Apply Custom Role")
custom_role_status = gr.Markdown()
def update_custom_role(text):
ROLES["Custom Role"] = text
return "Custom role updated successfully!"
apply_custom_role.click(
fn=update_custom_role,
inputs=[custom_role_text],
outputs=[custom_role_status]
)
def toggle_custom_role(role):
return gr.update(visible=(role == "Custom Role"))
role_selector.change(
fn=toggle_custom_role,
inputs=[role_selector],
outputs=[custom_role_panel]
)
image_input = gr.Image(label="Image Input", type="filepath")
video_input = gr.Video(label="Video Input")
text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image or video...")
with gr.Accordion("Generation Parameters", open=False):
temperature = gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="Temperature")
top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
repetition_penalty = gr.Slider(1.0, 2.0, value=1.05, step=0.05, label="Repetition Penalty")
                max_tokens = gr.Slider(64, 4096, value=4096, step=64, label="Max New Tokens")
submit_btn = gr.Button("Submit")
with gr.Column():
output = gr.Textbox(label="Model Response", lines=10)
status = gr.Markdown(label="Status")
submit_btn.click(
fn=process_input,
inputs=[
image_input,
video_input,
text_input,
temperature,
top_p,
repetition_penalty,
max_tokens,
role_selector,
custom_role_text
],
outputs=[output, status]
)
# Example for image
image_examples = [
[
"group_in_park.jpg",
"What is happening in this image?"
]
]
# Example for video
video_examples = [
[
"car_curb_video.mp4",
"What is wrong in this video?"
]
]
# Image example block
gr.Examples(
examples=image_examples,
inputs=[image_input, text_input],
label="Image Example: click to load then hit Submit"
)
# Video example block
gr.Examples(
examples=video_examples,
inputs=[video_input, text_input],
label="Video Example: click to load then hit Submit"
)
if __name__ == "__main__":
demo.launch() |