File size: 2,889 Bytes
52e4f53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import logging

# Module-level logger named after this module (via __name__); the token
# values selected in this file are reported through it at import time.
logger = logging.getLogger(__name__)


# Special-token strings used to mark image / video / patch / audio content and
# quad / ref / box spans.  Two spellings exist: the legacy angle-bracket set
# and the current "<|...|>" set.  A single flag selects exactly one of them,
# so the two sets can never be enabled (or disabled) at the same time, and
# both branches define the same names.
_USE_LEGACY_TOKENS = False

if _USE_LEGACY_TOKENS:
    IMG_TAG_TOKEN = "<image>"
    IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
    IMG_START_TOKEN = "<img>"
    IMG_END_TOKEN = "</img>"

    VID_TAG_TOKEN = "<video>"
    VID_CONTEXT_TOKEN = "<VID_CONTEXT>"
    VID_START_TOKEN = "<vid>"
    VID_END_TOKEN = "</vid>"

    PATCH_CONTEXT_TOKEN = "<PATCH_CONTEXT>"
    PATCH_START_TOKEN = "<patch>"
    PATCH_END_TOKEN = "</patch>"

    AUD_TAG_TOKEN = "<audio>"
    # NOTE(review): the legacy set never defined an audio context token, so
    # any module-level read of AUD_CONTEXT_TOKEN would raise NameError when
    # this branch is selected.  The current spelling is reused here because
    # the legacy audio start/end tokens already use the "<|...|>" form --
    # confirm this is the intended legacy value.
    AUD_CONTEXT_TOKEN = "<|context_of_audio|>"
    AUD_START_TOKEN = "<|begin_of_audio|>"
    AUD_END_TOKEN = "<|end_of_audio|>"

    QUAD_START_TOKEN = "<quad>"
    QUAD_END_TOKEN = "</quad>"
    REF_START_TOKEN = "<ref>"
    REF_END_TOKEN = "</ref>"
    BOX_START_TOKEN = "<box>"
    BOX_END_TOKEN = "</box>"
else:
    IMG_TAG_TOKEN = "<|image|>"
    IMG_CONTEXT_TOKEN = "<|context_of_image|>"
    IMG_START_TOKEN = "<|begin_of_image|>"
    IMG_END_TOKEN = "<|end_of_image|>"

    VID_TAG_TOKEN = "<|video|>"
    VID_CONTEXT_TOKEN = "<|context_of_video|>"
    VID_START_TOKEN = "<|begin_of_video|>"
    VID_END_TOKEN = "<|end_of_video|>"

    PATCH_CONTEXT_TOKEN = "<|context_of_patch|>"
    PATCH_START_TOKEN = "<|begin_of_patch|>"
    PATCH_END_TOKEN = "<|end_of_patch|>"

    AUD_TAG_TOKEN = "<|audio|>"
    AUD_CONTEXT_TOKEN = "<|context_of_audio|>"
    AUD_START_TOKEN = "<|begin_of_audio|>"
    AUD_END_TOKEN = "<|end_of_audio|>"

    QUAD_START_TOKEN = "<|begin_of_quad|>"
    QUAD_END_TOKEN = "<|end_of_quad|>"
    REF_START_TOKEN = "<|begin_of_ref|>"
    REF_END_TOKEN = "<|end_of_ref|>"
    BOX_START_TOKEN = "<|begin_of_box|>"
    BOX_END_TOKEN = "<|end_of_box|>"


# Report the active image / video / patch / audio token strings at INFO level
# when the module is imported.  One record is emitted per token, formatted as
# NAME=repr(value), in the order listed here.
for _token_name in (
    "IMG_TAG_TOKEN",
    "IMG_CONTEXT_TOKEN",
    "IMG_START_TOKEN",
    "IMG_END_TOKEN",
    "VID_TAG_TOKEN",
    "VID_CONTEXT_TOKEN",
    "VID_START_TOKEN",
    "VID_END_TOKEN",
    "PATCH_CONTEXT_TOKEN",
    "PATCH_START_TOKEN",
    "PATCH_END_TOKEN",
    "AUD_TAG_TOKEN",
    "AUD_CONTEXT_TOKEN",
    "AUD_START_TOKEN",
    "AUD_END_TOKEN",
):
    logger.info(f"{_token_name}={globals()[_token_name]!r}")
# Drop the loop variable so the module namespace gains no extra name.
del _token_name

# IMAGENET_MEAN = (0.485, 0.456, 0.406)
# IMAGENET_STD = (0.229, 0.224, 0.225)

# CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073)
# CLIP_STD = (0.2686295, 0.2613025, 0.2757711)

# SIGLIP_MEAN = (0.5, 0.5, 0.5)
# SIGLIP_STD = (0.5, 0.5, 0.5)


# Normalization statistics, stored as (mean, std) pairs of three-element
# float lists.  Every constant is a separate list object, so mutating one
# never affects another.
# NOTE(review): presumably one value per RGB channel -- confirm against the
# image preprocessing code that consumes these.
IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD = (
    [0.485, 0.456, 0.406],
    [0.229, 0.224, 0.225],
)
# The "standard" variant uses 0.5 for every entry of both mean and std.
IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD = (
    [0.5] * 3,
    [0.5] * 3,
)
OPENAI_CLIP_MEAN, OPENAI_CLIP_STD = (
    [0.48145466, 0.4578275, 0.40821073],
    [0.26862954, 0.26130258, 0.27577711],
)


# Model constants.
# Two negative integer sentinels.
# NOTE(review): presumably a label value to ignore and a placeholder id for
# image positions -- confirm against the training / tokenization code.
IGNORE_INDEX, IMAGE_TOKEN_INDEX = -100, -200

# DEFAULT_* names are plain aliases of the token strings selected earlier in
# this module; each holds the same str value as the name it is bound from.
DEFAULT_IM_START_TOKEN = IMG_START_TOKEN
DEFAULT_IM_END_TOKEN = IMG_END_TOKEN
DEFAULT_IMAGE_TOKEN = IMG_CONTEXT_TOKEN
DEFAULT_IMAGE_PATCH_TOKEN = PATCH_CONTEXT_TOKEN