import spaces
import gradio as gr
import torch
import os
import json
from transformers import AutoProcessor, AutoModelForVision2Seq
from qwen_vl_utils import process_vision_info
# Model configuration
MODEL_PATH = "nvidia/Cosmos-Reason1-7B"
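# Cosmos-Reason1-7B is NVIDIA's 7B vision-language model tuned for
# physical common-sense and embodied reasoning (see the links in the UI below).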
# Role configurations
ROLES = {
"General Assistant": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.",
"Video Analyzer": """You are a helpful video analyzer. The goal is to identify artifacts and anomalies in the video. Watch carefully and focus on the following details:
* Physical accuracy (gravity, collision, object interaction, fluid dynamics, object permanence, etc.)
* Common sense
* Cause-and-effect
* Temporal consistency
* Spatial consistency
* Human motion
* Material and texture realism
Here are some examples of commonly found artifacts and anomalies:
* If objects penetrate each other, this indicates a failure in collision detection, object interaction, and physical accuracy.
* If hands penetrate each other, or hands pass through objects, this indicates a failure in collision detection, object interaction, and physical accuracy.
* If an object moves in an unexpected way or moves without any apparent reason, this suggests a failure in causality, object interaction, and physical accuracy.
* If an object suddenly flips or changes direction, this suggests a failure in temporal consistency.
* If an object suddenly appears or disappears, or the count of objects in the video suddenly changes, this suggests a failure in temporal consistency.
* If an object transforms or deforms halfway through the video, this suggests a failure in temporal consistency.
* If an object is used in a way that defies its intended purpose or normal function, this indicates a violation of common sense.
* If the liquid flows through a solid object, such as water flowing through a pan, this suggests a failure in physical accuracy and fluid dynamics.
* If a person's legs or arms suddenly switch positions in an impossible way, such as the left leg appearing where the right leg was just a moment ago, this suggests a failure in human motion and temporal consistency.
* If a person's body suddenly morphs or changes shape, this suggests a failure in human motion and temporal consistency.
* If an object's texture, material, or surface is unnaturally smooth, this suggests a failure in object surface reconstruction.
Here are some examples of non-artifacts you should not include in your analysis:
* An animated style, such as a cartoon, is not by itself an artifact.
* Avoid ungrounded and over-general explanations such as overall impression, artistic style, or background elements.
* The video has no sound. Avoid explanations based on sound.
* Do not mention lighting, shadows, blurring, or camera effects in your analysis.
Answer the question in English with provided options in the following format:
<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.""",
"Custom Role": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>."
}
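# Every role prompt requests the same two-part response shape; a hypothetical reply:
#
#   <think>
#   The cup passes through the table, so collision handling fails.
#   </think>
#
#   <answer>
#   The video contains a physical-accuracy artifact.
#   </answer>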
# Default configuration
default_config = {
"attention_mode": "sdpa",
"torch_dtype": "float16",
"device_map": "auto",
"trust_remote_code": True
}
# Load or create config file
config_file = "cosmos_reason1_config.json"
try:
if not os.path.exists(config_file):
with open(config_file, "w") as f:
json.dump(default_config, f, indent=4)
config = default_config
else:
with open(config_file, "r") as f:
config = json.load(f)
except Exception as e:
print(f"Warning: Could not load config file: {e}")
print("Using default configuration")
config = default_config
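# On first run, the file written above contains the defaults:
# {
#     "attention_mode": "sdpa",
#     "torch_dtype": "float16",
#     "device_map": "auto",
#     "trust_remote_code": true
# }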
# Initialize the model with configuration
try:
    model = AutoModelForVision2Seq.from_pretrained(
        MODEL_PATH,
        torch_dtype=getattr(torch, config["torch_dtype"]),
        # Apply the configured attention backend ("sdpa", "eager", or "flash_attention_2")
        attn_implementation=config["attention_mode"],
        device_map=config["device_map"],
        trust_remote_code=config["trust_remote_code"]
    )
except Exception as e:
print(f"Error loading model: {e}")
raise
# Default sampling parameters (mirrored by the UI sliders below)
generation_config = {
    "temperature": 0.6,
    "top_p": 0.95,
    "repetition_penalty": 1.05,
    "max_new_tokens": 4096,
    "do_sample": True,  # required for temperature/top_p to take effect
}
# Initialize the processor
try:
processor = AutoProcessor.from_pretrained(MODEL_PATH)
except Exception as e:
print(f"Error loading processor: {e}")
raise
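# Optional text-only smoke test (hypothetical prompt, handy when debugging locally):
# msgs = [{"role": "system", "content": ROLES["General Assistant"]},
#         {"role": "user", "content": [{"type": "text", "text": "What is 2 + 2?"}]}]
# print(processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))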
@spaces.GPU
def process_input(image, video, text_prompt, temperature, top_p, repetition_penalty, max_tokens, role, custom_role_text):
"""Process the input and generate a response."""
try:
# Use custom role text if role is "Custom Role"
role_prompt = custom_role_text if role == "Custom Role" else ROLES[role]
messages = [
{"role": "system", "content": role_prompt},
{"role": "user", "content": []}
]
# Add text prompt
if text_prompt:
messages[1]["content"].append({"type": "text", "text": text_prompt})
# Add image if provided
if image is not None:
messages[1]["content"].append({"type": "image", "image": image})
# Add video if provided
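        # The "fps" field tells qwen_vl_utils how many frames per second to
        # sample from the clip; it is echoed back through video_kwargs below.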
if video is not None:
messages[1]["content"].append({
"type": "video",
"video": video,
"fps": 4,
})
# Process the prompt
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
# Process vision information
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
        # Prepare model inputs; pass through the video kwargs (e.g. fps) from above
        inputs = processor(
            text=prompt,
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
            **video_kwargs,
        ).to(model.device)
        # Build the generation config from the user-supplied parameters
        current_generation_config = {
            "temperature": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_tokens,
            "do_sample": True,  # sampling must be enabled for temperature/top_p to apply
        }
# Generate response
with torch.no_grad():
outputs = model.generate(
**inputs,
**current_generation_config
)
        # Decode only the newly generated tokens, skipping the echoed prompt
        generated_text = processor.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )
return generated_text, "✅ Generation completed successfully!"
except Exception as e:
import traceback
error_trace = traceback.format_exc()
return f"Error processing input: {str(e)}", f"❌ Error occurred:\n{error_trace}"
def apply_config_changes(attention_mode, torch_dtype, device_map):
"""Apply configuration changes and save to file."""
try:
config = {
"attention_mode": attention_mode,
"torch_dtype": torch_dtype,
"device_map": device_map,
"trust_remote_code": True
}
with open(config_file, "w") as f:
json.dump(config, f, indent=4)
return "Configuration updated. Please restart the application for changes to take effect."
except Exception as e:
return f"Error updating configuration: {str(e)}"
# Create the Gradio interface
with gr.Blocks(title="Cosmos-Reason1", theme=gr.themes.Soft()) as demo:
gr.Markdown("# Cosmos-Reason1")
gr.Markdown("Upload an image or video and ask a question about it.")
gr.Markdown(
"""
[[Model]](https://huggingface.co/nvidia/Cosmos-Reason1-7B) | [[Code]](https://github.com/nvidia-cosmos/cosmos-reason1)
"""
)
# with gr.Accordion("Model Configuration", open=False):
# attention_mode = gr.Dropdown(
    #     choices=["sdpa", "eager", "flash_attention_2"],
# value=config["attention_mode"],
# label="Attention Mode"
# )
# torch_dtype = gr.Dropdown(
# choices=["float16", "bfloat16", "float32"],
# value=config["torch_dtype"],
# label="Torch Data Type"
# )
# device_map = gr.Dropdown(
# choices=["auto", "cuda", "cpu"],
# value=config["device_map"],
# label="Device Map"
# )
# config_btn = gr.Button("Apply Configuration")
# config_msg = gr.Markdown()
# config_btn.click(
# fn=apply_config_changes,
# inputs=[attention_mode, torch_dtype, device_map],
# outputs=config_msg
# )
with gr.Row():
with gr.Column():
role_selector = gr.Dropdown(
choices=list(ROLES.keys()),
value="General Assistant",
label="Select Role"
)
custom_role_panel = gr.Group(visible=False)
with custom_role_panel:
custom_role_text = gr.Textbox(
label="Custom Role Instructions",
placeholder="Enter custom role instructions here...",
lines=10,
value=ROLES["Custom Role"]
)
apply_custom_role = gr.Button("Apply Custom Role")
custom_role_status = gr.Markdown()
            def update_custom_role(text):
                # Note: this mutates the module-level ROLES dict, so the edit
                # is shared across sessions in a multi-user Space.
                ROLES["Custom Role"] = text
                return "Custom role updated successfully!"
apply_custom_role.click(
fn=update_custom_role,
inputs=[custom_role_text],
outputs=[custom_role_status]
)
def toggle_custom_role(role):
return gr.update(visible=(role == "Custom Role"))
role_selector.change(
fn=toggle_custom_role,
inputs=[role_selector],
outputs=[custom_role_panel]
)
image_input = gr.Image(label="Image Input", type="filepath")
video_input = gr.Video(label="Video Input")
text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image or video...")
with gr.Accordion("Generation Parameters", open=False):
temperature = gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="Temperature")
top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
repetition_penalty = gr.Slider(1.0, 2.0, value=1.05, step=0.05, label="Repetition Penalty")
max_tokens = gr.Slider(64, 4096, value=4096, step=64, label="Max Tokens")
submit_btn = gr.Button("Submit")
with gr.Column():
output = gr.Textbox(label="Model Response", lines=10)
status = gr.Markdown(label="Status")
submit_btn.click(
fn=process_input,
inputs=[
image_input,
video_input,
text_input,
temperature,
top_p,
repetition_penalty,
max_tokens,
role_selector,
custom_role_text
],
outputs=[output, status]
)
# Example for image
image_examples = [
[
"group_in_park.jpg",
"What is happening in this image?"
]
]
# Example for video
video_examples = [
[
"car_curb_video.mp4",
"What is wrong in this video?"
]
]
# Image example block
gr.Examples(
examples=image_examples,
inputs=[image_input, text_input],
label="Image Example: click to load then hit Submit"
)
# Video example block
gr.Examples(
examples=video_examples,
inputs=[video_input, text_input],
label="Video Example: click to load then hit Submit"
)
if __name__ == "__main__":
demo.launch()