File size: 3,429 Bytes
291adda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from enum import Enum

class Mode(Enum):
    """Extraction modes for propositions derived from a video prompt.

    Members serve as the keys of the per-mode template tables in this
    module (``mode_prompts`` and ``mode_outputs``).
    """

    # Actions together with the objects that take part in them.
    OBJECT_ACTION_ALIGNMENT = 1
    # Tangible objects mentioned in the prompt.
    OBJECT_EXISTENCE = 2
    # Event propositions combining objects, actions and spatial relations.
    OVERALL_CONSISTENCY = 3
    # Spatial relationships between tangible objects.
    SPATIAL_RELATIONSHIP = 4

# Instruction snippet for each extraction mode.  Every entry has the same
# four-line layout -- the JSON key to emit, the extraction rule, the literal
# "Example:" marker, and a sample key/value pair -- joined with "\n" and
# carrying no trailing newline.
mode_prompts = {
    Mode.OBJECT_ACTION_ALIGNMENT: "\n".join((
        '"object_action_alignment":',
        "Extract actions and their participating objects. Each proposition must describe an action and its related objects.",
        "Example:",
        '"object_action_alignment": ["person holds hotdog", "person walks"]',
    )),
    Mode.OBJECT_EXISTENCE: "\n".join((
        '"object_existence":',
        "Extract only the tangible objects mentioned in the prompt.",
        "Example:",
        '"object_existence": ["person", "hotdog", "car", "truck"]',
    )),
    Mode.OVERALL_CONSISTENCY: "\n".join((
        '"overall_consistency":',
        (
            # One logical line, split only to keep the source readable.
            "Extract all meaningful event propositions that describe the combined semantics of objects, actions, and spatial relationships — "
            "but avoid TL keywords such as 'and', 'or', 'not', 'until', 'eventually'."
        ),
        "Example:",
        '"overall_consistency": ["person holds hotdog", "person walks", "car next to truck"]',
    )),
    Mode.SPATIAL_RELATIONSHIP: "\n".join((
        # The emitted JSON key is plural although the enum member is singular.
        '"spatial_relationships":',
        'Extract only spatial relationships between tangible objects (e.g., "object A next to object B"). Do not infer or hallucinate spatial relationships.',
        "Example:",
        '"spatial_relationships": ["car next to truck"]',
    )),
}

# Two-line JSON skeleton for each mode: the proposition list under the mode's
# key, then the matching "<key>_spec" string field.  Presumably these are
# placed between expected_output_header and expected_output_footer -- confirm
# against the caller.
#
# Rows are (mode, JSON key, trailing separator).  Every entry ends with a
# comma except SPATIAL_RELATIONSHIP.
# NOTE(review): if a caller emits a subset of modes that does not end with
# SPATIAL_RELATIONSHIP, the assembled example would finish with a dangling
# comma -- verify how the pieces are combined.
mode_outputs = {
    mode: f'  "{key}": [...],\n  "{key}_spec": "..."{separator}'
    for mode, key, separator in (
        (Mode.OBJECT_ACTION_ALIGNMENT, "object_action_alignment", ","),
        (Mode.OBJECT_EXISTENCE, "object_existence", ","),
        (Mode.OVERALL_CONSISTENCY, "overall_consistency", ","),
        (Mode.SPATIAL_RELATIONSHIP, "spatial_relationships", ""),
    )
}

# Opening role description; it names the two stages of the task.
header = (
    "You are an intelligent agent designed to extract structured "
    "representations from video description prompts. "
    "You will operate in two stages: "
    "(1) proposition extraction and (2) TL specification generation."
    "\n\n"
)

# Title and task statement for Stage 1.
# NOTE(review): the mode count ("four") is fixed in this text, whereas
# spec_gen_intro takes its count through the {n} placeholder -- confirm the
# two stay in agreement when fewer modes are requested.
stage1_intro = (
    "Stage 1: Proposition Extraction"
    "\n\n"
    "Given an input prompt summarizing a video, "
    "extract atomic propositions in the following four modes. "
    "Return all outputs in JSON format."
    "\n\n"
)

# Title for Stage 2.
stage2_intro = "Stage 2: TL Specification Generation" + "\n\n"

# Stage 2 task statement; contains the named placeholder {n} (number of
# proposition lists), to be filled with str.format(n=...).
spec_gen_intro = (
    "For each of the {n} list(s) of propositions extracted in Stage 1, "
    "generate a separate Temporal Logic (TL) specification "
    "describing the structure or sequence of events in that list."
    "\n\n"
)

# Bullet list of constraints on the generated TL formula, one rule per line,
# terminated by a blank line.
tl_instructions = "\n".join((
    "Rules for TL specification:",
    "- The input is a single list of propositions from one of the extraction modes.",
    (
        "- The output is a single TL formula using **only** the propositions "
        "from that list and the allowed TL symbols: "
        "['AND', 'OR', 'NOT', 'UNTIL', 'ALWAYS', 'EVENTUALLY']"
    ),
    "- Do not introduce any new propositions.",
    (
        "- Each formula should reflect the temporal or logical relationships "
        "between the propositions in a way that makes semantic sense."
    ),
)) + "\n\n"

# str.format template: the doubled braces render as literal JSON braces and
# the single positional {} receives the prompt text.
input_template = 'Input:\n{{\n  "prompt": "{}"\n}}\n\n'

# Plain strings (single braces, so not meant for str.format) that open and
# close the expected-output JSON object.
expected_output_header = "Expected Output:\n" "{\n"
expected_output_footer = "\n" "}"