Yiming-M committed on 2025-08-01 10:49 🚀
Commit c628976 · 1 Parent(s): df60639

app.py CHANGED
@@ -791,7 +791,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="ZIP Crowd Counting") as d
  gr.Markdown("""
  ### Step-by-step Guide:

- 1. **🎛️ Select Model**: Choose your preferred model variant, pre-trained dataset, and evaluation metric from the dropdown
+ 1. **🎛️ Select Model**: Choose your preferred model variant, pre-training dataset, and pre-training evaluation metric from the dropdown
  2. **📸 Upload Image**: Click the image area to upload your crowd photo or use clipboard
  3. **🚀 Analyze**: Click the "Analyze Crowd" button to start processing
  4. **📊 View Results**: Examine the density maps and crowd count in the output panels
@@ -821,20 +821,20 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="ZIP Crowd Counting") as d
  - **ZIP-N**: Nano model for mobile applications
  - **ZIP-P**: Pico model for edge devices

- ### Datasets:
+ ### Pre-training Datasets:
  - **ShanghaiTech A**: Dense, low-resolution crowd scenes
  - **ShanghaiTech B**: Sparse, high-resolution crowd scenes
  - **UCF-QNRF**: Dense, ultra high-resolution crowd images
  - **NWPU-Crowd**: Largest ultra high-resolution crowd counting dataset

- ### Metrics:
- - **MAE**: Mean Absolute Error - average counting error
+ ### Pre-training Evaluation Metrics:
+ - **MAE**: Mean Absolute Error - average counting error.
  - **NAE**: Normalized Absolute Error - relative counting error
  """)

  demo.launch(
  server_name="0.0.0.0",
- server_port=7860,
+ server_port=7861,
  show_api=False,
  share=False
  )
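The only functional change in app.py is the hard-coded server port (7860 → 7861). A minimal sketch of an alternative, assuming you would rather not hard-code the value at all: read the port from an environment variable and fall back to the new default. The variable name ZIP_SERVER_PORT is an illustrative assumption, not something defined by this repository.

import os

# Hypothetical override: lets the port be changed without editing app.py.
# Falls back to 7861, the value set by this commit.
server_port = int(os.environ.get("ZIP_SERVER_PORT", "7861"))

demo.launch(
    server_name="0.0.0.0",
    server_port=server_port,
    show_api=False,
    share=False
)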
models/__init__.py CHANGED
@@ -17,12 +17,6 @@ def get_model(
  num_vpt: Optional[int] = None,
  vpt_drop: Optional[float] = None,
  input_size: Optional[int] = None,
- adapter: bool = False,
- adapter_reduction: Optional[int] = None,
- lora: bool = False,
- lora_rank: Optional[int] = None,
- lora_alpha: Optional[int] = None,
- lora_dropout: Optional[float] = None,
  norm: str = "none",
  act: str = "none",
  text_prompts: Optional[List[str]] = None
@@ -41,15 +35,6 @@ def get_model(
  num_vpt = model_info["config"].get("num_vpt", None)
  vpt_drop = model_info["config"].get("vpt_drop", None)

-
- adapter = model_info["config"].get("adapter", False)
- adapter_reduction = model_info["config"].get("adapter_reduction", None)
-
- lora = model_info["config"].get("lora", False)
- lora_rank = model_info["config"].get("lora_rank", None)
- lora_alpha = model_info["config"].get("lora_alpha", None)
- lora_dropout = model_info["config"].get("lora_dropout", None)
-
  input_size = model_info["config"].get("input_size", None)
  text_prompts = model_info["config"].get("text_prompts", None)

@@ -81,12 +66,6 @@ def get_model(
  num_vpt=num_vpt,
  vpt_drop=vpt_drop,
  input_size=input_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
- lora=lora,
- lora_rank=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
  text_prompts=text_prompts,
  norm=norm,
  act=act
@@ -101,20 +80,12 @@ def get_model(
  "num_vpt": num_vpt,
  "vpt_drop": vpt_drop,
  "input_size": input_size,
- "adapter": adapter,
- "adapter_reduction": adapter_reduction,
- "lora": lora,
- "lora_rank": lora_rank,
- "lora_alpha": lora_alpha,
- "lora_dropout": lora_dropout,
  "text_prompts": model.text_prompts,
  "norm": norm,
  "act": act
  }

  else:
- assert not adapter, "adapter for non-CLIP models is not implemented yet"
- assert not lora, "lora for non-CLIP models is not implemented yet"
  model = _ebc(
  model_name=model_name,
  block_size=block_size,
models/clip_ebc/convnext.py CHANGED
@@ -1,8 +1,7 @@
  from torch import nn, Tensor
  import open_clip
- from peft import get_peft_model, LoraConfig

- from ..utils import ConvRefine, ConvAdapter
+ from ..utils import ConvRefine
  from ..utils import ConvUpsample, _get_norm_layer, _get_activation


@@ -41,8 +40,6 @@ class ConvNeXt(nn.Module):
  model_name: str,
  weight_name: str,
  block_size: int = 16,
- adapter: bool = False,
- adapter_reduction: int = 4,
  norm: str = "none",
  act: str = "none"
  ) -> None:
@@ -55,22 +52,11 @@ class ConvNeXt(nn.Module):

  # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
  model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
-
- self.adapter = adapter
- if adapter:
- self.adapter_reduction = adapter_reduction
- for param in model.parameters():
- param.requires_grad = False

  self.stem = model.trunk.stem
  self.depth = len(model.trunk.stages)
  for idx, stage in enumerate(model.trunk.stages):
  setattr(self, f"stage{idx}", stage)
- if adapter:
- setattr(self, f"adapter{idx}", ConvAdapter(
- in_channels=stage.blocks[-1].mlp.fc2.out_features,
- bottleneck_channels=stage.blocks[-1].mlp.fc2.out_features // adapter_reduction,
- ) if idx < self.depth - 1 else nn.Identity()) # No adapter for the last stage

  if self.model_name in ["convnext_base", "convnext_base_w", "convnext_base_w_320", "convnext_xxlarge"]:
  self.in_features, self.out_features = model.head.proj.in_features, model.head.proj.out_features
@@ -125,30 +111,12 @@ class ConvNeXt(nn.Module):
  ),
  )

- def train(self, mode: bool = True):
- if self.adapter and mode:
- # training:
- self.stem.eval()
-
- for idx in range(self.depth):
- getattr(self, f"stage{idx}").eval()
- getattr(self, f"adapter{idx}").train()
-
- self.refiner.train()
-
- else:
- # evaluation:
- for module in self.children():
- module.train(mode)
-
  def forward(self, x: Tensor) -> Tensor:
  x = self.stem(x)

  for idx in range(self.depth):
  x = getattr(self, f"stage{idx}")(x)
- if self.adapter:
- x = getattr(self, f"adapter{idx}")(x)
-
+
  x = self.refiner(x)
  return x

@@ -157,44 +125,14 @@
  model_name: str,
  weight_name: str,
  block_size: int = 16,
- adapter: bool = False,
- adapter_reduction: int = 4,
- lora: bool = False,
- lora_rank: int = 16,
- lora_alpha: float = 32.0,
- lora_dropout: float = 0.1,
  norm: str = "none",
  act: str = "none"
  ) -> ConvNeXt:
- assert not (lora and adapter), "Lora and adapter cannot be used together."
  model = ConvNeXt(
  model_name=model_name,
  weight_name=weight_name,
  block_size=block_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
  norm=norm,
  act=act
  )
-
- if lora:
- target_modules = []
- for name, module in model.named_modules():
- if isinstance(module, (nn.Linear, nn.Conv2d)) and "refiner" not in name:
- target_modules.append(name)
-
- lora_config = LoraConfig(
- r=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
- bias="none",
- target_modules=target_modules,
- )
- model = get_peft_model(model, lora_config)
-
- # Unfreeze refiner
- for name, module in model.named_modules():
- if "refiner" in name:
- module.requires_grad_(True)
-
  return model
models/clip_ebc/mobileclip.py CHANGED
@@ -1,8 +1,7 @@
  from torch import nn, Tensor
  import open_clip
- from peft import get_peft_model, LoraConfig

- from ..utils import ConvRefine, ConvUpsample, ConvAdapter
+ from ..utils import ConvRefine, ConvUpsample
  from ..utils import _get_norm_layer, _get_activation


@@ -29,8 +28,6 @@ class MobileCLIP(nn.Module):
  model_name: str,
  weight_name: str,
  block_size: int = 16,
- adapter: bool = False,
- adapter_reduction: int = 4,
  norm: str = "none",
  act: str = "none"
  ) -> None:
@@ -44,21 +41,10 @@ class MobileCLIP(nn.Module):
  # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
  model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual

- self.adapter = adapter
- if adapter:
- for param in model.parameters():
- param.requires_grad = False
-
  self.stem = model.trunk.stem
  self.stages = model.trunk.stages

  self.depth = len(model.trunk.stages)
- for idx, stage in enumerate(model.trunk.stages):
- if adapter:
- setattr(self, f"adapter{idx}", ConvAdapter(
- in_channels=stage.blocks[-1].mlp.fc2.out_channels,
- bottleneck_channels=stage.blocks[-1].mlp.fc2.out_channels // adapter_reduction,
- ))

  self.final_conv = model.trunk.final_conv

@@ -114,31 +100,12 @@ class MobileCLIP(nn.Module):
  groups=refiner_groups[self.model_name],
  ),
  )
-
- def train(self, mode: bool = True):
- if self.adapter and mode:
- # training:
- self.stem.eval()
-
- for idx in range(self.depth):
- getattr(self, f"stage{idx}").eval()
- getattr(self, f"adapter{idx}").train()
-
- self.final_conv.eval()
- self.refiner.train()
-
- else:
- # evaluation:
- for module in self.children():
- module.train(mode)

  def forward(self, x: Tensor) -> Tensor:
  x = self.stem(x)

  for idx in range(self.depth):
  x = self.stages[idx](x)
- if self.adapter:
- x = getattr(self, f"adapter{idx}")(x)

  x = self.final_conv(x)

@@ -150,49 +117,14 @@
  model_name: str,
  weight_name: str,
  block_size: int = 16,
- adapter: bool = False,
- adapter_reduction: int = 4,
- lora: bool = False,
- lora_rank: int = 16,
- lora_alpha: float = 32.0,
- lora_dropout: float = 0.1,
  norm: str = "none",
  act: str = "none"
  ) -> MobileCLIP:
- assert not (lora and adapter), "Lora and adapter cannot be used together."
  model = MobileCLIP(
  model_name=model_name,
  weight_name=weight_name,
  block_size=block_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
  norm=norm,
  act=act
  )
-
- if lora:
- target_modules = []
- for name, module in model.named_modules():
- if isinstance(module, (nn.Linear, nn.Conv2d)):
- target_modules.append(name)
-
- lora_config = LoraConfig(
- r=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
- bias="none",
- target_modules=target_modules,
- )
- model = get_peft_model(model, lora_config)
-
- # Unfreeze the BN layers
- for name, module in model.named_modules() and "refiner" not in name:
- if isinstance(module, nn.BatchNorm2d):
- module.requires_grad_(True)
-
- # Unfreeze refiner
- for name, module in model.named_modules():
- if "refiner" in name:
- module.requires_grad_(True)
-
  return model
models/clip_ebc/model.py CHANGED
@@ -31,12 +31,6 @@ class CLIP_EBC(nn.Module):
  num_vpt: Optional[int] = None,
  vpt_drop: Optional[float] = None,
  input_size: Optional[int] = None,
- adapter: Optional[bool] = False,
- adapter_reduction: Optional[int] = None,
- lora: Optional[bool] = False,
- lora_rank: Optional[int] = None,
- lora_alpha: Optional[float] = None,
- lora_dropout: Optional[float] = None,
  text_prompts: Optional[Dict[str, List[str]]] = None,
  norm: Optional[str] = "none",
  act: Optional[str] = "none",
@@ -70,12 +64,6 @@ class CLIP_EBC(nn.Module):
  num_vpt=num_vpt,
  vpt_drop=vpt_drop,
  block_size=block_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
- lora=lora,
- lora_rank=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
  input_size=(input_size, input_size),
  norm=norm,
  act=act
@@ -85,12 +73,6 @@ class CLIP_EBC(nn.Module):
  model_name=model_name,
  weight_name=weight_name,
  block_size=block_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
- lora=lora,
- lora_rank=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
  norm=norm,
  act=act
  )
@@ -99,12 +81,6 @@ class CLIP_EBC(nn.Module):
  model_name=model_name,
  weight_name=weight_name,
  block_size=block_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
- lora=lora,
- lora_rank=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
  norm=norm,
  act=act
  )
@@ -113,12 +89,6 @@ class CLIP_EBC(nn.Module):
  model_name=model_name,
  weight_name=weight_name,
  block_size=block_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
- lora=lora,
- lora_rank=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
  norm=norm,
  act=act
  )
@@ -240,12 +210,6 @@ def _clip_ebc(
  num_vpt: Optional[int] = None,
  vpt_drop: Optional[float] = None,
  input_size: Optional[int] = None,
- adapter: Optional[bool] = False,
- adapter_reduction: Optional[int] = None,
- lora: Optional[bool] = False,
- lora_rank: Optional[int] = None,
- lora_alpha: Optional[float] = None,
- lora_dropout: Optional[float] = None,
  text_prompts: Optional[List[str]] = None,
  norm: Optional[str] = "none",
  act: Optional[str] = "none",
@@ -260,12 +224,6 @@ def _clip_ebc(
  num_vpt=num_vpt,
  vpt_drop=vpt_drop,
  input_size=input_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
- lora=lora,
- lora_rank=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
  text_prompts=text_prompts,
  norm=norm,
  act=act,
models/clip_ebc/resnet.py CHANGED
@@ -1,8 +1,7 @@
  from torch import nn, Tensor
  import open_clip
- from peft import get_peft_model, LoraConfig

- from ..utils import ConvRefine, ConvUpsample, ConvAdapter
+ from ..utils import ConvRefine, ConvUpsample
  from ..utils import _get_norm_layer, _get_activation


@@ -37,8 +36,6 @@ class ResNet(nn.Module):
  model_name: str,
  weight_name: str,
  block_size: int = 16,
- adapter: bool = False,
- adapter_reduction: int = 4,
  norm: str = "none",
  act: str = "none"
  ) -> None:
@@ -52,11 +49,6 @@ class ResNet(nn.Module):
  # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
  model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual

- self.adapter = adapter
- if adapter:
- for param in model.parameters():
- param.requires_grad = False
-
  # Stem
  self.conv1 = model.conv1
  self.bn1 = model.bn1
@@ -73,12 +65,7 @@ class ResNet(nn.Module):
  # Layers
  for idx in range(1, 5):
  setattr(self, f"layer{idx}", getattr(model, f"layer{idx}"))
- if adapter:
- setattr(self, f"adapter{idx}", ConvAdapter(
- in_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels,
- bottleneck_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels // adapter_reduction,
- ) if idx < 4 else nn.Identity()) # No adapter for the last layer
-
+
  self.in_features = model.attnpool.c_proj.weight.shape[1]
  self.out_features = model.attnpool.c_proj.weight.shape[0]

@@ -129,31 +116,6 @@ class ResNet(nn.Module):
  groups=refiner_groups[self.model_name],
  ),
  )
-
- def train(self, mode: bool = True):
- if self.adapter and mode:
- # training:
- self.conv1.eval()
- self.bn1.eval()
- self.act1.eval()
- self.conv2.eval()
- self.bn2.eval()
- self.act2.eval()
- self.conv3.eval()
- self.bn3.eval()
- self.act3.eval()
- self.avgpool.eval()
-
- for idx in range(1, 5):
- getattr(self, f"layer{idx}").eval()
- getattr(self, f"adapter{idx}").train()
-
- self.refiner.train()
-
- else:
- # evaluation:
- for module in self.children():
- module.train(mode)

  def stem(self, x: Tensor) -> Tensor:
  x = self.act1(self.bn1(self.conv1(x)))
@@ -166,21 +128,9 @@ class ResNet(nn.Module):
  x = self.stem(x)

  x = self.layer1(x)
- if self.adapter:
- x = self.adapter1(x)
-
  x = self.layer2(x)
- if self.adapter:
- x = self.adapter2(x)
-
  x = self.layer3(x)
- if self.adapter:
- x = self.adapter3(x)
-
  x = self.layer4(x)
- if self.adapter:
- x = self.adapter4(x)
-
  x = self.refiner(x)
  return x

@@ -189,49 +139,14 @@
  model_name: str,
  weight_name: str,
  block_size: int = 16,
- adapter: bool = False,
- adapter_reduction: int = 4,
- lora: bool = False,
- lora_rank: int = 16,
- lora_alpha: float = 32.0,
- lora_dropout: float = 0.1,
  norm: str = "none",
  act: str = "none"
  ) -> ResNet:
- assert not (lora and adapter), "Lora and adapter cannot be used together."
  model = ResNet(
  model_name=model_name,
  weight_name=weight_name,
  block_size=block_size,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
  norm=norm,
  act=act
  )
-
- if lora:
- target_modules = []
- for name, module in model.named_modules():
- if isinstance(module, (nn.Linear, nn.Conv2d)):
- target_modules.append(name)
-
- lora_config = LoraConfig(
- r=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
- bias="none",
- target_modules=target_modules,
- )
- model = get_peft_model(model, lora_config)
-
- # Unfreeze BN layers
- for name, module in model.named_modules():
- if isinstance(module, nn.BatchNorm2d) and "refiner" not in name:
- module.requires_grad_(True)
-
- # Unfreeze refiner
- for name, module in model.named_modules():
- if "refiner" in name:
- module.requires_grad_(True)
-
  return model
models/clip_ebc/vit.py CHANGED
@@ -3,10 +3,9 @@ from torch import nn, Tensor
  import math
  from einops import rearrange
  import open_clip
- from peft import get_peft_model, LoraConfig
  from typing import Optional, Tuple

- from ..utils import interpolate_pos_embed, ViTAdapter
+ from ..utils import interpolate_pos_embed
  # from ..utils import TransformerRefine, TransformerDownsample, TransformerUpsample
  from ..utils import ConvRefine, ConvDownsample, ConvUpsample
  from ..utils import _get_norm_layer, _get_activation
@@ -73,8 +72,6 @@ class ViT(nn.Module):
  block_size: int = 16,
  num_vpt: int = 32,
  vpt_drop: float = 0.0,
- adapter: bool = False,
- adapter_reduction: int = 4,
  input_size: Optional[Tuple[int, int]] = None,
  norm: str = "none",
  act: str = "none"
@@ -82,18 +79,14 @@ class ViT(nn.Module):
  super(ViT, self).__init__()
  assert model_name in vit_names_and_weights, f"Model name should be one of {list(vit_names_and_weights.keys())}, but got {model_name}."
  assert weight_name in vit_names_and_weights[model_name], f"Pretrained should be one of {vit_names_and_weights[model_name]}, but got {weight_name}."
- if adapter:
- assert num_vpt is None or num_vpt == 0, "num_vpt should be None or 0 when using adapter."
- assert vpt_drop is None or vpt_drop == 0.0, "vpt_drop should be None or 0.0 when using adapter."
- else:
- assert num_vpt > 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
- assert 0.0 <= vpt_drop < 1.0, f"VPT dropout should be in [0.0, 1.0), but got {vpt_drop}."
+
+ assert num_vpt > 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
+ assert 0.0 <= vpt_drop < 1.0, f"VPT dropout should be in [0.0, 1.0), but got {vpt_drop}."

  self.model_name, self.weight_name = model_name, weight_name
  self.block_size = block_size
  self.num_vpt = num_vpt
  self.vpt_drop = vpt_drop
- self.adapter = adapter

  # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
  model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
@@ -119,15 +112,9 @@ class ViT(nn.Module):
  # Setup VPT tokens
  val = math.sqrt(6. / float(3 * self.patch_size[0] + self.embed_dim))
  for idx in range(self.num_layers):
- if self.adapter:
- setattr(self, f"adapter{idx}", ViTAdapter(
- in_channels=self.embed_dim,
- bottleneck_channels=self.embed_dim // adapter_reduction,
- ))
- else:
- setattr(self, f"vpt_{idx}", nn.Parameter(torch.empty(self.num_vpt, self.embed_dim)))
- nn.init.uniform_(getattr(self, f"vpt_{idx}"), -val, val)
- setattr(self, f"vpt_drop_{idx}", nn.Dropout(self.vpt_drop))
+ setattr(self, f"vpt_{idx}", nn.Parameter(torch.empty(self.num_vpt, self.embed_dim)))
+ nn.init.uniform_(getattr(self, f"vpt_{idx}"), -val, val)
+ setattr(self, f"vpt_drop_{idx}", nn.Dropout(self.vpt_drop))

  # Adjust the positional embedding to match the new input size
  self._adjust_pos_embed()
@@ -299,13 +286,10 @@ class ViT(nn.Module):

  return x

- def _forward_adapter(self, x: Tensor, idx: int) -> Tensor:
- return getattr(self, f"adapter{idx}")(x)
-
  def forward_encoder(self, x: Tensor) -> Tensor:
  x = self._forward_patch_embed(x)
  for idx in range(self.num_layers):
- x = self._forward_adapter(x, idx) if self.adapter else self._forward_vpt(x, idx)
+ x = self._forward_vpt(x, idx)
  x = self.ln_post(x)
  return x

@@ -326,48 +310,19 @@
  block_size: int = 16,
  num_vpt: int = 32,
  vpt_drop: float = 0.1,
- adapter: bool = False,
- adapter_reduction: int = 4,
- lora: bool = False,
- lora_rank: int = 16,
- lora_alpha: float = 32.0,
- lora_dropout: float = 0.1,
  input_size: Optional[Tuple[int, int]] = None,
  norm: str = "none",
  act: str = "none"
  ) -> ViT:
- assert not (lora and adapter), "LoRA and adapter cannot be used together."
  model = ViT(
  model_name=model_name,
  weight_name=weight_name,
  block_size=block_size,
  num_vpt=num_vpt,
  vpt_drop=vpt_drop,
- adapter=adapter,
- adapter_reduction=adapter_reduction,
  input_size=input_size,
  norm=norm,
  act=act
  )

- if lora:
- target_modules = []
- for name, module in model.named_modules():
- if isinstance(module, (nn.Linear, nn.Conv2d, nn.MultiheadAttention)) and "refiner" not in name:
- target_modules.append(name)
-
- lora_config = LoraConfig(
- r=lora_rank,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
- bias="none",
- target_modules=target_modules,
- )
- model = get_peft_model(model, lora_config)
-
- # Unfreeze refiner
- for name, module in model.named_modules():
- if "refiner" in name:
- module.requires_grad_(True)
-
  return model
requirements.txt CHANGED
@@ -3,7 +3,6 @@ gradio==5.23.1
3
  huggingface_hub==0.29.3
4
  matplotlib==3.10.1
5
  numpy==2.2.4
6
- peft==0.7.0
7
  Pillow==11.3.0
8
  spaces==0.39.0
9
  timm==1.0.19
 
3
  huggingface_hub==0.29.3
4
  matplotlib==3.10.1
5
  numpy==2.2.4
 
6
  Pillow==11.3.0
7
  spaces==0.39.0
8
  timm==1.0.19
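With peft dropped from requirements.txt and the in-tree adapter/LoRA options removed from the model builders, the repository no longer wires up parameter-efficient fine-tuning itself. A minimal sketch of how the removed behaviour could still be reproduced externally, assuming peft is installed separately; wrap_backbone_with_lora is a hypothetical helper that mirrors the deleted code in _convnext and is not part of this codebase.

from peft import LoraConfig, get_peft_model
from torch import nn


def wrap_backbone_with_lora(model: nn.Module, rank: int = 16, alpha: float = 32.0, dropout: float = 0.1) -> nn.Module:
    # Target every Linear/Conv2d outside the refiner, as the removed code did.
    target_modules = [
        name for name, module in model.named_modules()
        if isinstance(module, (nn.Linear, nn.Conv2d)) and "refiner" not in name
    ]
    config = LoraConfig(
        r=rank,
        lora_alpha=alpha,
        lora_dropout=dropout,
        bias="none",
        target_modules=target_modules,
    )
    model = get_peft_model(model, config)
    # Keep the refiner fully trainable, matching the deleted "unfreeze refiner" loop.
    for name, module in model.named_modules():
        if "refiner" in name:
            module.requires_grad_(True)
    return model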