davidberenstein1957 committed on
Commit bdec2d8 · verified · 1 Parent(s): c1c495c

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +43 -0
  2. config.json +18 -0
  3. pytorch_model.bin +3 -0
  4. tiny_vqa_model.py +81 -0
README.md ADDED
@@ -0,0 +1,43 @@
+ ---
+ language:
+ - en
+ tags:
+ - vision-language
+ - vqa
+ - text-to-image-evaluation
+ license: mit
+ ---
+
+ # Tiny Random VQAScore Model
+
+ This is a tiny, randomly initialized version of the VQAScore architecture for educational and testing purposes.
+
+ ## Model Architecture
+
+ - **Vision Encoder**: Tiny CNN + Transformer (64 hidden size)
+ - **Language Model**: Tiny Transformer (256 hidden size)
+ - **Multimodal Projector**: MLP with 256 → 128 → 64 → 1
+
+ ## Usage
+
+ ```python
+ from tiny_vqa_model import TinyVQAScore
+
+ # Load the model
+ model = TinyVQAScore(device="cpu")
+
+ # Score an image
+ from PIL import Image
+ image = Image.open("your_image.jpg")
+ score = model.score(image, "What is shown in this image?")
+ print(f"VQA Score: {score}")
+ ```
+
+ ## Model Size
+
+ - **Parameters**: ~8.9M, dominated by the 32k-token embedding (vs ~11B for the original XXL model)
+ - **Memory**: ~35 MB in float32 (vs ~22GB for the original XXL model)
+
+ ## Disclaimer
+
+ This is a randomly initialized model for testing and educational purposes. It is not trained and will not produce meaningful VQA results.
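
Even with random weights, the scoring API can be exercised end to end. A minimal sketch of the text-to-image-evaluation use case named in the tags, ranking several candidate images for one prompt; the file names and the prompt are hypothetical placeholders:

```python
from PIL import Image
from tiny_vqa_model import TinyVQAScore

model = TinyVQAScore(device="cpu")

# Hypothetical outputs of a text-to-image model for the same prompt.
prompt = "a red cube on top of a blue sphere"
candidates = ["candidate_0.png", "candidate_1.png", "candidate_2.png"]

# Score each candidate and sort best-first (the numbers are meaningless for the random model).
scored = [(path, model.score(Image.open(path).convert("RGB"), prompt)) for path in candidates]
scored.sort(key=lambda item: item[1], reverse=True)

for path, score in scored:
    print(f"{path}: {score:.3f}")
```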
config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "model_type": "tiny_clip_t5",
+   "vision_config": {
+     "hidden_size": 64,
+     "num_hidden_layers": 1,
+     "num_attention_heads": 8,
+     "image_size": 224,
+     "patch_size": 16
+   },
+   "text_config": {
+     "vocab_size": 32128,
+     "hidden_size": 256,
+     "num_hidden_layers": 1,
+     "num_attention_heads": 8
+   },
+   "mm_hidden_size": 256,
+   "torch_dtype": "float32"
+ }
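
The model_type "tiny_clip_t5" is not a registered transformers class, so the config is only documentation of the tiny dimensions. A small sketch of reading it with the standard json module, assuming the file has been downloaded next to the script:

```python
import json

with open("config.json") as f:
    config = json.load(f)

vision_cfg = config["vision_config"]
text_cfg = config["text_config"]

# 224 / 16 = 14 patches per side for the 16x16 convolutional patch embedding.
patches_per_side = vision_cfg["image_size"] // vision_cfg["patch_size"]

print(config["model_type"])       # tiny_clip_t5
print(vision_cfg["hidden_size"])  # 64
print(text_cfg["hidden_size"])    # 256
print(patches_per_side)           # 14
```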
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf86183c665a0f6aa4eead61f4fa0cac99611ff446e1a0fd88ff14ef6e484c8f
+ size 35447186
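
The entry above is a Git LFS pointer; the actual ~35 MB checkpoint has to be fetched before it can be loaded into the module defined in tiny_vqa_model.py. A sketch using huggingface_hub, where the repo_id is an assumed placeholder and the state-dict keys are assumed to match the inner TinyCLIPT5 module:

```python
import torch
from huggingface_hub import hf_hub_download
from tiny_vqa_model import TinyVQAScore

# repo_id is a placeholder; substitute the repository this commit belongs to.
checkpoint_path = hf_hub_download(
    repo_id="davidberenstein1957/tiny-random-vqa-model",
    filename="pytorch_model.bin",
)

wrapper = TinyVQAScore(device="cpu")
state_dict = torch.load(checkpoint_path, map_location="cpu")

# strict=False so any key mismatches are reported instead of raising.
result = wrapper.model.load_state_dict(state_dict, strict=False)
print("missing keys:", result.missing_keys)
print("unexpected keys:", result.unexpected_keys)
```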
tiny_vqa_model.py ADDED
@@ -0,0 +1,81 @@
+ #!/usr/bin/env python3
+ """
+ Tiny VQAScore Model Wrapper
+ """
+
+ import torch
+ import torch.nn as nn
+ from PIL import Image
+ import numpy as np
+
+ class TinyVQAScore:
+     """A tiny random version of the VQAScore model."""
+
+     def __init__(self, model="tiny-random", device="cpu"):
+         self.device = torch.device(device)
+         self.model = self._create_tiny_model()
+         self.model.to(self.device)
+         self.model.eval()
+
+     def _create_tiny_model(self):
+         class TinyCLIPT5(nn.Module):
+             def __init__(self):
+                 super().__init__()
+                 self.vision_encoder = nn.Sequential(
+                     nn.Conv2d(3, 64, kernel_size=16, stride=16),  # 16x16 patch embedding
+                     nn.AdaptiveAvgPool2d((1, 1)),  # Global average pooling
+                     nn.Flatten(),
+                     nn.Linear(64, 256)
+                 )
+                 self.text_encoder = nn.Sequential(
+                     nn.Embedding(32128, 256),
+                     nn.LayerNorm(256),
+                     nn.TransformerEncoderLayer(d_model=256, nhead=8, dim_feedforward=512, dropout=0.1, batch_first=True)
+                 )
+                 self.multimodal_projector = nn.Sequential(
+                     nn.Linear(256, 128), nn.GELU(),
+                     nn.Linear(128, 64), nn.GELU(),
+                     nn.Linear(64, 1)
+                 )
+                 self._init_weights()
+
+             def _init_weights(self):
+                 for module in self.modules():
+                     if isinstance(module, (nn.Linear, nn.Conv2d)):
+                         nn.init.xavier_uniform_(module.weight, gain=0.1)
+                         if module.bias is not None:
+                             nn.init.uniform_(module.bias, -0.1, 0.1)
+                     elif isinstance(module, nn.Embedding):
+                         nn.init.uniform_(module.weight, -0.1, 0.1)
+
+             def forward(self, pixel_values, input_ids):
+                 vision_features = self.vision_encoder(pixel_values)  # (B, 256)
+                 text_features = self.text_encoder(input_ids)  # (B, L, 256)
+                 text_features = text_features.mean(dim=1)  # pool over tokens -> (B, 256)
+                 combined_features = vision_features + text_features
+                 score = self.multimodal_projector(combined_features)
+                 return score.squeeze(-1)  # (B,) raw, unbounded score
+
+         return TinyCLIPT5()
+
+     def score(self, image, question):
+         if isinstance(image, Image.Image):
+             image = image.resize((224, 224))
+             image_tensor = torch.from_numpy(np.array(image)).float()
+             image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0) / 255.0
+         else:
+             image_tensor = image
+
+         input_ids = torch.randint(0, 32128, (1, 10)).to(self.device)  # question text is ignored; random ids stand in for a tokenizer
+
+         with torch.no_grad():
+             score = self.model(image_tensor.to(self.device), input_ids)
+
+         return torch.sigmoid(score).item()  # squash the raw score to (0, 1)
+
+ if __name__ == "__main__":
+     # Test the model
+     model = TinyVQAScore(device="cpu")
+     test_image = Image.new('RGB', (224, 224), color='red')
+     score = model.score(test_image, "What color is this image?")
+     print(f"Test score: {score}")