davidberenstein1957 committed on
Commit bdec2d8 · verified · 1 Parent(s): c1c495c

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +43 -0
  2. config.json +18 -0
  3. pytorch_model.bin +3 -0
  4. tiny_vqa_model.py +81 -0
README.md ADDED
@@ -0,0 +1,43 @@
+ ---
+ language:
+ - en
+ tags:
+ - vision-language
+ - vqa
+ - text-to-image-evaluation
+ license: mit
+ ---
+
+ # Tiny Random VQAScore Model
+
+ This is a tiny, randomly initialized version of the VQAScore architecture for educational and testing purposes.
+
+ ## Model Architecture
+
+ - **Vision Encoder**: Tiny CNN + Transformer (64 hidden size)
+ - **Language Model**: Tiny Transformer (256 hidden size)
+ - **Multimodal Projector**: MLP with 256 → 128 → 64 → 1
+
+ ## Usage
+
+ ```python
+ from tiny_vqa_model import TinyVQAScore
+
+ # Load the model
+ model = TinyVQAScore(device="cpu")
+
+ # Score an image
+ from PIL import Image
+ image = Image.open("your_image.jpg")
+ score = model.score(image, "What is shown in this image?")
+ print(f"VQA Score: {score}")
+ ```
+
+ ## Model Size
+
+ - **Parameters**: ~8.9M, dominated by the 32k-token embedding (vs ~11B for the original XXL model)
+ - **Memory**: ~35 MB in float32 (vs ~22GB for the original XXL model)
+
+ ## Disclaimer
+
+ This is a randomly initialized model for testing and educational purposes. It is not trained and will not produce meaningful VQA results.
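
Even with random weights, the scoring API can be exercised end to end. A minimal sketch of the text-to-image-evaluation use case named in the tags, ranking several candidate images for one prompt; the file names and the prompt are hypothetical placeholders:

```python
from PIL import Image
from tiny_vqa_model import TinyVQAScore

model = TinyVQAScore(device="cpu")

# Hypothetical outputs of a text-to-image model for the same prompt.
prompt = "a red cube on top of a blue sphere"
candidates = ["candidate_0.png", "candidate_1.png", "candidate_2.png"]

# Score each candidate and sort best-first (the numbers are meaningless for the random model).
scored = [(path, model.score(Image.open(path).convert("RGB"), prompt)) for path in candidates]
scored.sort(key=lambda item: item[1], reverse=True)

for path, score in scored:
    print(f"{path}: {score:.3f}")
```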
config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "model_type": "tiny_clip_t5",
+   "vision_config": {
+     "hidden_size": 64,
+     "num_hidden_layers": 1,
+     "num_attention_heads": 8,
+     "image_size": 224,
+     "patch_size": 16
+   },
+   "text_config": {
+     "vocab_size": 32128,
+     "hidden_size": 256,
+     "num_hidden_layers": 1,
+     "num_attention_heads": 8
+   },
+   "mm_hidden_size": 256,
+   "torch_dtype": "float32"
+ }
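
The model_type "tiny_clip_t5" is not a registered transformers class, so the config is only documentation of the tiny dimensions. A small sketch of reading it with the standard json module, assuming the file has been downloaded next to the script:

```python
import json

with open("config.json") as f:
    config = json.load(f)

vision_cfg = config["vision_config"]
text_cfg = config["text_config"]

# 224 / 16 = 14 patches per side for the 16x16 convolutional patch embedding.
patches_per_side = vision_cfg["image_size"] // vision_cfg["patch_size"]

print(config["model_type"])       # tiny_clip_t5
print(vision_cfg["hidden_size"])  # 64
print(text_cfg["hidden_size"])    # 256
print(patches_per_side)           # 14
```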
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf86183c665a0f6aa4eead61f4fa0cac99611ff446e1a0fd88ff14ef6e484c8f
+ size 35447186
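
The entry above is a Git LFS pointer; the actual ~35 MB checkpoint has to be fetched before it can be loaded into the module defined in tiny_vqa_model.py. A sketch using huggingface_hub, where the repo_id is an assumed placeholder and the state-dict keys are assumed to match the inner TinyCLIPT5 module:

```python
import torch
from huggingface_hub import hf_hub_download
from tiny_vqa_model import TinyVQAScore

# repo_id is a placeholder; substitute the repository this commit belongs to.
checkpoint_path = hf_hub_download(
    repo_id="davidberenstein1957/tiny-random-vqa-model",
    filename="pytorch_model.bin",
)

wrapper = TinyVQAScore(device="cpu")
state_dict = torch.load(checkpoint_path, map_location="cpu")

# strict=False so any key mismatches are reported instead of raising.
result = wrapper.model.load_state_dict(state_dict, strict=False)
print("missing keys:", result.missing_keys)
print("unexpected keys:", result.unexpected_keys)
```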
tiny_vqa_model.py ADDED
@@ -0,0 +1,81 @@
+ #!/usr/bin/env python3
+ """
+ Tiny VQAScore Model Wrapper
+ """
+
+ import torch
+ import torch.nn as nn
+ from PIL import Image
+ import numpy as np
+
+ class TinyVQAScore:
+     """A tiny random version of the VQAScore model."""
+
+     def __init__(self, model="tiny-random", device="cpu"):
+         self.device = torch.device(device)
+         self.model = self._create_tiny_model()
+         self.model.to(self.device)
+         self.model.eval()
+
+     def _create_tiny_model(self):
+         class TinyCLIPT5(nn.Module):
+             def __init__(self):
+                 super().__init__()
+                 self.vision_encoder = nn.Sequential(
+                     nn.Conv2d(3, 64, kernel_size=16, stride=16),  # 16x16 patch embedding
+                     nn.AdaptiveAvgPool2d((1, 1)),  # Global average pooling
+                     nn.Flatten(),
+                     nn.Linear(64, 256)
+                 )
+                 self.text_encoder = nn.Sequential(
+                     nn.Embedding(32128, 256),
+                     nn.LayerNorm(256),
+                     nn.TransformerEncoderLayer(d_model=256, nhead=8, dim_feedforward=512, dropout=0.1, batch_first=True)
+                 )
+                 self.multimodal_projector = nn.Sequential(
+                     nn.Linear(256, 128), nn.GELU(),
+                     nn.Linear(128, 64), nn.GELU(),
+                     nn.Linear(64, 1)
+                 )
+                 self._init_weights()
+
+             def _init_weights(self):
+                 for module in self.modules():
+                     if isinstance(module, (nn.Linear, nn.Conv2d)):
+                         nn.init.xavier_uniform_(module.weight, gain=0.1)
+                         if module.bias is not None:
+                             nn.init.uniform_(module.bias, -0.1, 0.1)
+                     elif isinstance(module, nn.Embedding):
+                         nn.init.uniform_(module.weight, -0.1, 0.1)
+
+             def forward(self, pixel_values, input_ids):
+                 vision_features = self.vision_encoder(pixel_values)  # (B, 256)
+                 text_features = self.text_encoder(input_ids)  # (B, L, 256)
+                 text_features = text_features.mean(dim=1)  # pool over tokens -> (B, 256)
+                 combined_features = vision_features + text_features
+                 score = self.multimodal_projector(combined_features)
+                 return score.squeeze(-1)  # (B,) raw, unbounded score
+
+         return TinyCLIPT5()
+
+     def score(self, image, question):
+         if isinstance(image, Image.Image):
+             image = image.resize((224, 224))
+             image_tensor = torch.from_numpy(np.array(image)).float()
+             image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0) / 255.0
+         else:
+             image_tensor = image
+
+         input_ids = torch.randint(0, 32128, (1, 10)).to(self.device)  # question text is ignored; random ids stand in for a tokenizer
+
+         with torch.no_grad():
+             score = self.model(image_tensor.to(self.device), input_ids)
+
+         return torch.sigmoid(score).item()  # squash the raw score to (0, 1)
+
+ if __name__ == "__main__":
+     # Test the model
+     model = TinyVQAScore(device="cpu")
+     test_image = Image.new('RGB', (224, 224), color='red')
+     score = model.score(test_image, "What color is this image?")
+     print(f"Test score: {score}")