WishArdently committed on
Commit
5d7c651
·
verified ·
1 Parent(s): c9b2dd2

Upload InternVideo2Stage2VideoEncoder

Browse files
Files changed (2) hide show
  1. config.py +160 -34
  2. model.safetensors +1 -1
config.py CHANGED
@@ -1,23 +1,149 @@
1
  from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
2
 
3
- class DotDict(dict):
4
- """Dictionary subclass whose keys can also be read, set, and deleted as attributes."""
5
-
6
- def __getattr__(self, key):
7
- if key in self:
8
- return self[key]
9
- else:
10
- raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{key}'")
11
-
12
- def __setattr__(self, key, value):
13
- self[key] = value
14
-
15
- def __delattr__(self, key):
16
- if key in self:
17
- del self[key]
18
- else:
19
- raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{key}'")
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  class InternVideo2Config(PretrainedConfig):
23
  model_type = "internvideo2"
@@ -72,7 +198,7 @@ class InternVideo2Config(PretrainedConfig):
72
 
73
  # Data configuration
74
  self.train_file = train_file or "available_corpus[\"pretrain_example_data_1B\"]"
75
- self.test_file = DotDict(test_file or {
76
  "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]",
77
  "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]"
78
  })
@@ -86,25 +212,25 @@ class InternVideo2Config(PretrainedConfig):
86
  self.batch_size = batch_size
87
  self.batch_size_test = batch_size_test
88
  self.max_txt_l = max_txt_l
89
- self.inputs = DotDict(inputs or {
90
  "image_res": 224,
91
- "video_input": DotDict({
92
  "num_frames": num_frames,
93
  "sample_type": "rand",
94
  "num_frames_test": num_frames_test,
95
  "sample_type_test": "middle",
96
  "random_aug": False
97
  }),
98
- "max_txt_l": DotDict({"image": max_txt_l, "video": max_txt_l}),
99
- "batch_size": DotDict({"image": batch_size, "video": batch_size}),
100
- "batch_size_test": DotDict({"image": batch_size_test, "video": batch_size_test})
101
  })
102
 
103
  # Model configuration
104
  self.text_enc = text_enc
105
- self.model = DotDict(model or {
106
  "model_cls": "InternVideo2_Stage2",
107
- "vision_encoder": DotDict({
108
  "name": "pretrain_internvideo2_1b_patch14_224",
109
  "img_size": 224,
110
  "num_frames": num_frames,
@@ -135,15 +261,15 @@ class InternVideo2Config(PretrainedConfig):
135
  "only_mask": True
136
  }),
137
  "text_encoder": text_enc,
138
- "multimodal": DotDict({"enable": True}),
139
  "embed_dim": 512,
140
  "temp": 0.07,
141
  "find_unused_parameters": False
142
  })
143
 
144
  # Criterion configuration
145
- self.criterion = DotDict(criterion or {
146
- "loss_weight": DotDict({
147
  "vtc": 1.0,
148
  "mlm": 1.0,
149
  "vtm": 1.0,
@@ -157,17 +283,17 @@ class InternVideo2Config(PretrainedConfig):
157
  })
158
 
159
  # Optimizer configuration
160
- self.optimizer = DotDict(optimizer or {
161
  "opt": "adamW",
162
  "lr": 5e-5,
163
  "opt_betas": [0.9, 0.98],
164
  "weight_decay": 0.05,
165
  "max_grad_norm": 3.0,
166
- "different_lr": DotDict({"enable": False, "module_names": [], "lr": 1e-3})
167
  })
168
 
169
  # Scheduler configuration
170
- self.scheduler = DotDict(scheduler or {
171
  "sched": "cosine",
172
  "epochs": 10,
173
  "min_lr_multi": 0.01,
@@ -177,7 +303,7 @@ class InternVideo2Config(PretrainedConfig):
177
  # Evaluation configuration
178
  self.evaluate = evaluate
179
  self.deep_fusion = deep_fusion
180
- self.evaluation = DotDict(evaluation or {
181
  "eval_frame_ensemble": "concat",
182
  "eval_x_only": False,
183
  "k_test": 128,
@@ -192,7 +318,7 @@ class InternVideo2Config(PretrainedConfig):
192
  self.use_mem_efficient_sdp = use_mem_efficient_sdp
193
  self.compile_model = compile_model
194
 
195
- self.wandb = DotDict(wandb or {
196
  "enable": False,
197
  "entity": "opengvlab",
198
  "project": "InternVideo2-Stage2"
@@ -214,7 +340,7 @@ class InternVideo2Config(PretrainedConfig):
214
  self.save_ckpt_iter = save_ckpt_iter
215
  self.delete_ds_optim_states = delete_ds_optim_states
216
 
217
- self.deepspeed = DotDict(deepspeed or {
218
  "enable": True,
219
  "stage": 1
220
  })
 
1
  from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
2
 
3
class EasyDict(dict):
    """A ``dict`` whose items are also reachable as attributes, recursively.

    Every value stored through attribute or item assignment is normalised:
    nested plain ``dict`` objects become instances of this class, and lists
    or tuples are replaced by a new ``list`` whose ``dict`` elements are
    converted the same way (so tuples come back as lists).

    Get attributes

    >>> d = EasyDict({'foo': 3})
    >>> d['foo']
    3
    >>> d.foo
    3
    >>> d.bar
    Traceback (most recent call last):
    ...
    AttributeError: 'EasyDict' object has no attribute 'bar'

    Works recursively

    >>> d = EasyDict({'foo': 3, 'bar': {'x': 1, 'y': 2}})
    >>> isinstance(d.bar, dict)
    True
    >>> d.bar.x
    1

    Bullet-proof

    >>> EasyDict({})
    {}
    >>> EasyDict(d={})
    {}
    >>> EasyDict(None)
    {}
    >>> d = {'a': 1}
    >>> EasyDict(**d)
    {'a': 1}

    The source mapping is never modified

    >>> src = {'a': 1}
    >>> EasyDict(src, b=2)
    {'a': 1, 'b': 2}
    >>> src
    {'a': 1}

    Set attributes

    >>> d = EasyDict()
    >>> d.foo = 3
    >>> d.foo
    3
    >>> d.bar = {'prop': 'value'}
    >>> d.bar.prop
    'value'
    >>> d
    {'foo': 3, 'bar': {'prop': 'value'}}
    >>> d.bar.prop = 'newer'
    >>> d.bar.prop
    'newer'

    Values extraction

    >>> d = EasyDict({'foo': 0, 'bar': [{'x': 1, 'y': 2}, {'x': 3, 'y': 4}]})
    >>> isinstance(d.bar, list)
    True
    >>> from operator import attrgetter
    >>> list(map(attrgetter('x'), d.bar))
    [1, 3]
    >>> list(map(attrgetter('y'), d.bar))
    [2, 4]
    >>> d = EasyDict()
    >>> list(d.keys())
    []
    >>> d = EasyDict(foo=3, bar=dict(x=1, y=2))
    >>> d.foo
    3
    >>> d.bar.x
    1

    Still like a dict though

    >>> o = EasyDict({'clean': True})
    >>> list(o.items())
    [('clean', True)]

    And like a class

    >>> class Flower(EasyDict):
    ...     power = 1
    ...
    >>> f = Flower()
    >>> f.power
    1
    >>> f = Flower({'height': 12})
    >>> f.height
    12
    >>> f['power']
    1
    >>> sorted(f.keys())
    ['height', 'power']

    Update and pop items

    >>> d = EasyDict(a=1, b='2')
    >>> e = EasyDict(c=3.0, a=9.0)
    >>> d.update(e)
    >>> d.c
    3.0
    >>> d['c']
    3.0
    >>> d.get('c')
    3.0
    >>> d.update(a=4, b=4)
    >>> d.b
    4
    >>> d.pop('a')
    4
    >>> d.a
    Traceback (most recent call last):
    ...
    AttributeError: 'EasyDict' object has no attribute 'a'
    """

    def __init__(self, d=None, **kwargs):
        """Populate the instance from ``d`` and/or keyword arguments.

        Args:
            d: Optional mapping (or iterable of key/value pairs) holding the
                initial items. It is copied, never modified.
            **kwargs: Extra items; they override same-named keys from ``d``.
        """
        # Merge into a private copy so the caller's mapping is left untouched.
        merged = {} if d is None else dict(d)
        merged.update(kwargs)
        for key, value in merged.items():
            setattr(self, key, value)
        # Expose non-dunder class attributes (e.g. defaults declared on a
        # subclass) as items too; "update" and "pop" are the methods defined
        # below and must not be turned into items.
        for key in vars(self.__class__):
            is_dunder = key.startswith("__") and key.endswith("__")
            if not is_dunder and key not in ("update", "pop"):
                setattr(self, key, getattr(self, key))

    def __setattr__(self, name, value):
        """Store ``value`` both as attribute ``name`` and as item ``name``.

        Lists and tuples are replaced by a new list whose ``dict`` elements
        are wrapped in this class; a ``dict`` that is not already an instance
        of this class is wrapped as well.
        """
        if isinstance(value, (list, tuple)):
            value = [self.__class__(x) if isinstance(x, dict) else x for x in value]
        elif isinstance(value, dict) and not isinstance(value, self.__class__):
            value = self.__class__(value)
        # Keep the instance attribute and the dict entry in sync.
        super().__setattr__(name, value)
        super().__setitem__(name, value)

    # Item assignment goes through the same conversion as attribute assignment.
    __setitem__ = __setattr__

    def update(self, e=None, **f):
        """Set every item from ``e`` and then from the keyword arguments.

        Args:
            e: Optional mapping or iterable of key/value pairs. It is copied,
                never modified.
            **f: Extra items; they override same-named keys from ``e``.
        """
        # Copy first: merging the keywords into ``e`` itself would leak them
        # into the caller's object.
        merged = dict(e) if e else {}
        merged.update(f)
        for key, value in merged.items():
            setattr(self, key, value)

    def pop(self, k, d=None):
        """Remove key ``k`` and return its value, or ``d`` if it is absent.

        Unlike ``dict.pop``, a missing key never raises ``KeyError``; the
        default ``d`` (``None`` unless given) is returned instead.
        """
        # Drop only the instance-level attribute. Going through
        # hasattr/delattr would fail for keys that merely name a class
        # attribute or method (e.g. "keys") and for non-string keys.
        self.__dict__.pop(k, None)
        return super().pop(k, d)
147
 
148
  class InternVideo2Config(PretrainedConfig):
149
  model_type = "internvideo2"
 
198
 
199
  # Data configuration
200
  self.train_file = train_file or "available_corpus[\"pretrain_example_data_1B\"]"
201
+ self.test_file = EasyDict(test_file or {
202
  "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]",
203
  "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]"
204
  })
 
212
  self.batch_size = batch_size
213
  self.batch_size_test = batch_size_test
214
  self.max_txt_l = max_txt_l
215
+ self.inputs = EasyDict(inputs or {
216
  "image_res": 224,
217
+ "video_input": EasyDict({
218
  "num_frames": num_frames,
219
  "sample_type": "rand",
220
  "num_frames_test": num_frames_test,
221
  "sample_type_test": "middle",
222
  "random_aug": False
223
  }),
224
+ "max_txt_l": EasyDict({"image": max_txt_l, "video": max_txt_l}),
225
+ "batch_size": EasyDict({"image": batch_size, "video": batch_size}),
226
+ "batch_size_test": EasyDict({"image": batch_size_test, "video": batch_size_test})
227
  })
228
 
229
  # Model configuration
230
  self.text_enc = text_enc
231
+ self.model = EasyDict(model or {
232
  "model_cls": "InternVideo2_Stage2",
233
+ "vision_encoder": EasyDict({
234
  "name": "pretrain_internvideo2_1b_patch14_224",
235
  "img_size": 224,
236
  "num_frames": num_frames,
 
261
  "only_mask": True
262
  }),
263
  "text_encoder": text_enc,
264
+ "multimodal": EasyDict({"enable": True}),
265
  "embed_dim": 512,
266
  "temp": 0.07,
267
  "find_unused_parameters": False
268
  })
269
 
270
  # Criterion configuration
271
+ self.criterion = EasyDict(criterion or {
272
+ "loss_weight": EasyDict({
273
  "vtc": 1.0,
274
  "mlm": 1.0,
275
  "vtm": 1.0,
 
283
  })
284
 
285
  # Optimizer configuration
286
+ self.optimizer = EasyDict(optimizer or {
287
  "opt": "adamW",
288
  "lr": 5e-5,
289
  "opt_betas": [0.9, 0.98],
290
  "weight_decay": 0.05,
291
  "max_grad_norm": 3.0,
292
+ "different_lr": EasyDict({"enable": False, "module_names": [], "lr": 1e-3})
293
  })
294
 
295
  # Scheduler configuration
296
+ self.scheduler = EasyDict(scheduler or {
297
  "sched": "cosine",
298
  "epochs": 10,
299
  "min_lr_multi": 0.01,
 
303
  # Evaluation configuration
304
  self.evaluate = evaluate
305
  self.deep_fusion = deep_fusion
306
+ self.evaluation = EasyDict(evaluation or {
307
  "eval_frame_ensemble": "concat",
308
  "eval_x_only": False,
309
  "k_test": 128,
 
318
  self.use_mem_efficient_sdp = use_mem_efficient_sdp
319
  self.compile_model = compile_model
320
 
321
+ self.wandb = EasyDict(wandb or {
322
  "enable": False,
323
  "entity": "opengvlab",
324
  "project": "InternVideo2-Stage2"
 
340
  self.save_ckpt_iter = save_ckpt_iter
341
  self.delete_ds_optim_states = delete_ds_optim_states
342
 
343
+ self.deepspeed = EasyDict(deepspeed or {
344
  "enable": True,
345
  "stage": 1
346
  })
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1217b84f65506792a1fe141d636ffb05cfe151005a8bc9ec46006b343af02ee
3
  size 2104856154
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0e5845f86e194d4043bb2d0cfb78fadaae0481882163350973df077cb22256a
3
  size 2104856154