"""Gradio demo for NeuS-V: neuro-symbolic evaluation of text-to-video models."""

import pickle
import warnings
from pathlib import Path

import gradio as gr

from neus_v.puls.prompts import Mode
from neus_v.puls.puls import PULS
from neus_v.smooth_scoring import smooth_confidence_scores
from neus_v.utils import clear_gpu_memory
from neus_v.veval.eval import evaluate_video_with_sequence_of_images
from neus_v.veval.parse import parse_proposition_set, parse_tl_specification
from neus_v.vlm.vllm_client import VLLMClient

# Suppress specific warnings
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    message="Conversion of an array with ndim > 0 to a scalar is deprecated",
)

# Paths and parameters
WEIGHT_PATH = Path("./assets/")
pickle_path = WEIGHT_PATH / "distributions.pkl"
num_of_frame_in_sequence = 3
model = "InternVL2-8B"

# Load the vision-language model
vision_language_model = VLLMClient(api_base="http://localhost:8000/v1", model="OpenGVLab/InternVL2_5-8B")

# Load the precomputed score distributions used for smoothing
print(f"Loading distributions from {pickle_path}")
with open(pickle_path, "rb") as f:
    distributions = pickle.load(f)
all_dimension_data = distributions.get(model).get("all_dimension")

# TODO: Make paths better for public release


def process_video(video_path, propositions, tl):
    """Process the video and compute the score_on_all."""
    proposition_set = parse_proposition_set(propositions.split(","))
    tl_spec = parse_tl_specification(tl)
    threshold = 0.349  # confidence threshold passed to the evaluator
    try:
        result = evaluate_video_with_sequence_of_images(
            vision_language_model=vision_language_model,
            confidence_as_token_probability=True,
            video_path=video_path,
            proposition_set=proposition_set,
            tl_spec=tl_spec,
            parallel_inference=False,
            num_of_frame_in_sequence=num_of_frame_in_sequence,
            threshold=threshold,
        )
        probability = result.get("probability")
        # Smooth the raw probability against the precomputed prior distribution
        score_on_all = float(
            smooth_confidence_scores(
                target_data=[probability],
                prior_distribution=all_dimension_data,
            )
        )
        clear_gpu_memory()
        return score_on_all
    except Exception as e:
        clear_gpu_memory()
        return f"Error: {e}"


def generate_from_puls(prompt, mode_choice):
    """Generate propositions and a TL spec from a natural-language prompt using PULS."""
    # Reject a blank prompt early
    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter a text prompt first", duration=5)

    mode_map = {
        "Object-Action Alignment": Mode.OBJECT_ACTION_ALIGNMENT,
        "Overall Consistency (default)": Mode.OVERALL_CONSISTENCY,
        "Object Existence": Mode.OBJECT_EXISTENCE,
        "Spatial Relationship": Mode.SPATIAL_RELATIONSHIP,
    }
    selected_mode = mode_map[mode_choice]
    result = PULS(prompt, [selected_mode])

    # Extract the relevant propositions and spec based on the selected mode.
    # Note that the result key for SPATIAL_RELATIONSHIP is pluralized.
    mode_key_map = {
        Mode.OBJECT_ACTION_ALIGNMENT: "object_action_alignment",
        Mode.OVERALL_CONSISTENCY: "overall_consistency",
        Mode.OBJECT_EXISTENCE: "object_existence",
        Mode.SPATIAL_RELATIONSHIP: "spatial_relationships",
    }
    mode_key = mode_key_map[selected_mode]

    propositions = result.get(mode_key, [])
    spec = result.get(f"{mode_key}_spec", "")
    return ", ".join(propositions), spec


# Gradio interface
def demo_interface(video, propositions, tl):
    """Wrapper for the Gradio interface."""
    return process_video(video, propositions, tl)


# Example data from the original script
example_video_path_1 = (
    "assets/A_storm_bursts_in_with_intermittent_lightning_and_causes_flooding_and_large_waves_crash_in.mp4"
)
example_video_path_2 = (
    "assets/The ocean waves gently lapping at the shore, until a storm bursts in, "
    "and then lightning flashes across the sky.mp4"
)
example_propositions = "waves lapping,ocean shore,storm bursts in,lightning on the sky"
example_tl = '("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")'
example_prompt = (
    "The ocean waves gently lapping at the shore, until a storm bursts in, and then lightning flashes across the sky"
)

with gr.Blocks(title="NeuS-V: Neuro-Symbolic Evaluation of Text-to-Video Models") as demo:
    gr.HTML(
        """