Upload folder using huggingface_hub
- README.md +43 -0
- config.json +18 -0
- pytorch_model.bin +3 -0
- tiny_vqa_model.py +81 -0
README.md
ADDED
@@ -0,0 +1,43 @@
---
language:
- en
tags:
- vision-language
- vqa
- text-to-image-evaluation
license: mit
---

# Tiny Random VQAScore Model

This is a tiny, randomly initialized version of the VQAScore architecture for educational and testing purposes.

## Model Architecture

- **Vision Encoder**: Tiny CNN + Transformer (64 hidden size)
- **Language Model**: Tiny Transformer (256 hidden size)
- **Multimodal Projector**: MLP with 256 → 128 → 64 → 1

## Usage

```python
from tiny_vqa_model import TinyVQAScore

# Load the model
model = TinyVQAScore(device="cpu")

# Score an image
from PIL import Image

image = Image.open("your_image.jpg")
score = model.score(image, "What is shown in this image?")
print(f"VQA Score: {score}")
```

## Model Size

- **Parameters**: ~9M (vs ~11B for the original XXL model)
- **Memory**: ~35 MB (vs ~22 GB for the original XXL model)

## Disclaimer

This is a randomly initialized model for testing and educational purposes. It is not trained and will not produce meaningful VQA results.
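The usage snippet above assumes `tiny_vqa_model.py` is already on the local path. A minimal sketch of first pulling the uploaded files from the Hub with `huggingface_hub` and importing the wrapper from the downloaded folder; the repo id below is a placeholder, not the actual repository name:

```python
import importlib.util

from huggingface_hub import snapshot_download

# Download README.md, config.json, pytorch_model.bin and tiny_vqa_model.py.
# "your-namespace/tiny-random-vqascore" is a placeholder; substitute the real repo id.
local_dir = snapshot_download(repo_id="your-namespace/tiny-random-vqascore")

# Import the wrapper class directly from the downloaded module file.
spec = importlib.util.spec_from_file_location("tiny_vqa_model", f"{local_dir}/tiny_vqa_model.py")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

model = module.TinyVQAScore(device="cpu")
```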
config.json
ADDED
@@ -0,0 +1,18 @@
{
  "model_type": "tiny_clip_t5",
  "vision_config": {
    "hidden_size": 64,
    "num_hidden_layers": 1,
    "num_attention_heads": 8,
    "image_size": 224,
    "patch_size": 16
  },
  "text_config": {
    "vocab_size": 32128,
    "hidden_size": 256,
    "num_hidden_layers": 1,
    "num_attention_heads": 8
  },
  "mm_hidden_size": 256,
  "torch_dtype": "float32"
}
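`config.json` records the same dimensions that `tiny_vqa_model.py` hard-codes: 64-channel vision features, a 256-dim text encoder over a 32128-token vocabulary, and a 256-dim multimodal input. A small illustrative check, assuming the file sits in the current directory:

```python
import json

# Read the config shipped alongside the checkpoint.
with open("config.json") as f:
    config = json.load(f)

# These values are what the wrapper in tiny_vqa_model.py hard-codes;
# the asserts just make the assumed correspondence explicit.
assert config["vision_config"]["hidden_size"] == 64
assert config["text_config"]["hidden_size"] == 256
assert config["text_config"]["vocab_size"] == 32128
print(config["model_type"])  # "tiny_clip_t5"
```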
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cf86183c665a0f6aa4eead61f4fa0cac99611ff446e1a0fd88ff14ef6e484c8f
size 35447186
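The file above is only the Git LFS pointer; the actual ~35 MB checkpoint is what gets downloaded. A hedged sketch of loading it into the wrapper's inner module, assuming the saved keys follow the attribute names in `TinyCLIPT5` (`strict=False` reports any mismatch instead of failing):

```python
import torch

from tiny_vqa_model import TinyVQAScore

wrapper = TinyVQAScore(device="cpu")

# Load the checkpoint onto CPU; weights_only=True (recent PyTorch) avoids unpickling arbitrary code.
state_dict = torch.load("pytorch_model.bin", map_location="cpu", weights_only=True)

# Assumption: keys follow the attribute names in TinyCLIPT5
# (vision_encoder.*, text_encoder.*, multimodal_projector.*).
missing, unexpected = wrapper.model.load_state_dict(state_dict, strict=False)
print(f"missing: {len(missing)}, unexpected: {len(unexpected)}")
```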
tiny_vqa_model.py
ADDED
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""
Tiny VQAScore Model Wrapper
"""

import numpy as np
import torch
import torch.nn as nn
from PIL import Image


class TinyVQAScore:
    """A tiny random version of the VQAScore model."""

    def __init__(self, model="tiny-random", device="cpu"):
        self.device = torch.device(device)
        self.model = self._create_tiny_model()
        self.model.to(self.device)
        self.model.eval()

    def _create_tiny_model(self):
        class TinyCLIPT5(nn.Module):
            def __init__(self):
                super().__init__()
                # Patchify with a strided conv, pool globally, project to the 256-dim shared space.
                self.vision_encoder = nn.Sequential(
                    nn.Conv2d(3, 64, kernel_size=16, stride=16),
                    nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
                    nn.Flatten(),
                    nn.Linear(64, 256),
                )
                # Token embedding followed by a single Transformer encoder layer.
                self.text_encoder = nn.Sequential(
                    nn.Embedding(32128, 256),
                    nn.LayerNorm(256),
                    nn.TransformerEncoderLayer(
                        d_model=256, nhead=8, dim_feedforward=512, dropout=0.1, batch_first=True
                    ),
                )
                # Fuse the two 256-dim features down to a single scalar score.
                self.multimodal_projector = nn.Sequential(
                    nn.Linear(256, 128), nn.GELU(),
                    nn.Linear(128, 64), nn.GELU(),
                    nn.Linear(64, 1),
                )
                self._init_weights()

            def _init_weights(self):
                for module in self.modules():
                    if isinstance(module, (nn.Linear, nn.Conv2d)):
                        nn.init.xavier_uniform_(module.weight, gain=0.1)
                        if module.bias is not None:
                            nn.init.uniform_(module.bias, -0.1, 0.1)
                    elif isinstance(module, nn.Embedding):
                        nn.init.uniform_(module.weight, -0.1, 0.1)

            def forward(self, pixel_values, input_ids):
                vision_features = self.vision_encoder(pixel_values)
                text_features = self.text_encoder(input_ids)
                text_features = text_features.mean(dim=1)  # mean-pool over tokens
                combined_features = vision_features + text_features
                score = self.multimodal_projector(combined_features)
                return score.squeeze(-1)

        return TinyCLIPT5()

    def score(self, image, question):
        if isinstance(image, Image.Image):
            # Force 3 channels so grayscale/RGBA inputs don't break the conv stem.
            image = image.convert("RGB").resize((224, 224))
            image_tensor = torch.from_numpy(np.array(image)).float()
            image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0) / 255.0
        else:
            image_tensor = image

        # No tokenizer is bundled: the question is ignored and random token ids are used.
        input_ids = torch.randint(0, 32128, (1, 10)).to(self.device)

        with torch.no_grad():
            score = self.model(image_tensor.to(self.device), input_ids)

        return torch.sigmoid(score).item()


if __name__ == "__main__":
    # Smoke test: a solid-colour image and an arbitrary question.
    model = TinyVQAScore(device="cpu")
    test_image = Image.new("RGB", (224, 224), color="red")
    score = model.score(test_image, "What color is this image?")
    print(f"Test score: {score}")
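Because `score` returns one sigmoid-squashed scalar per image-question pair, the typical text-to-image-evaluation use is ranking candidate images for a prompt. A purely illustrative sketch (the checkpoint is random and, as noted above, the question is currently ignored):

```python
from PIL import Image

from tiny_vqa_model import TinyVQAScore

model = TinyVQAScore(device="cpu")
prompt = "Is there a red object in this image?"

# Rank a few synthetic candidates by their (random) VQA score.
candidates = {
    "red": Image.new("RGB", (224, 224), color="red"),
    "green": Image.new("RGB", (224, 224), color="green"),
    "blue": Image.new("RGB", (224, 224), color="blue"),
}
ranked = sorted(candidates, key=lambda name: model.score(candidates[name], prompt), reverse=True)
print(ranked)
```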