Update app.py
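Replaces hard-coded hyperparameter defaults with shared module-level variables (patch_size, hidden_size, num_layers, num_heads), adds a top-level extract_patches helper, and completes the torch.load call with weights_only=False.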
app.py
CHANGED
@@ -21,7 +21,8 @@ import torchvision.transforms as transforms
 from transformers import AutoTokenizer
 device = torch.device(0 if torch.cuda.is_available() else 'cpu')
 
-
+
+def extract_patches(image_tensor, patch_size=patch_size):
     # Get the dimensions of the image tensor
     bs, c, h, w = image_tensor.size()
 
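Note: because Python evaluates default argument values at definition time, the module-level names these new defaults reference must already be defined before this def (and the class definitions below) run. The hunk shows only the signature and the first body line; a minimal sketch of such a patch-extraction helper, assuming non-overlapping square patches, with illustrative values for the assumed hyperparameter block:

import torch

# Assumed module-level hyperparameters: the names come from the diff,
# the concrete values here are illustrative only.
patch_size = 16
hidden_size = 128
num_layers = 3
num_heads = 4

def extract_patches(image_tensor, patch_size=patch_size):
    # Get the dimensions of the image tensor
    bs, c, h, w = image_tensor.size()

    # Cut the image into non-overlapping patch_size x patch_size patches:
    # output shape (bs, c * patch_size * patch_size, num_patches)
    unfold = torch.nn.Unfold(kernel_size=patch_size, stride=patch_size)
    patches = unfold(image_tensor)

    # Rearrange to (bs, num_patches, patch_size * patch_size * c)
    return patches.transpose(1, 2)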
@@ -55,7 +56,7 @@ class SinusoidalPosEmb(nn.Module):
 
 # Define a module for attention blocks
 class AttentionBlock(nn.Module):
-    def __init__(self, hidden_size=
+    def __init__(self, hidden_size=hidden_size, num_heads=num_heads, masking=True):
         super(AttentionBlock, self).__init__()
         self.masking = masking
 
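Note: only the __init__ signature appears in this hunk. A minimal sketch of what the masking flag typically gates, built on nn.MultiheadAttention; the layer choices below are assumptions, not the Space's actual code:

import torch
import torch.nn as nn

class AttentionBlock(nn.Module):
    def __init__(self, hidden_size=128, num_heads=4, masking=True):
        super(AttentionBlock, self).__init__()
        self.masking = masking
        self.mha = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

    def forward(self, x):
        mask = None
        if self.masking:
            # Causal mask: position i may only attend to positions <= i
            t = x.size(1)
            mask = torch.triu(torch.ones(t, t, dtype=torch.bool, device=x.device),
                              diagonal=1)
        out, _ = self.mha(x, x, x, attn_mask=mask)
        return out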
@@ -81,7 +82,7 @@ class AttentionBlock(nn.Module):
 # Define a module for a transformer block with self-attention
 # and optional causal masking
 class TransformerBlock(nn.Module):
-    def __init__(self, hidden_size=
+    def __init__(self, hidden_size=hidden_size, num_heads=num_heads, decoder=False, masking=True):
         super(TransformerBlock, self).__init__()
         self.decoder = decoder
 
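Note: a sketch of the usual pre-norm residual structure for such a block, reusing the AttentionBlock sketch above (same imports); a real decoder block would also gate a cross-attention sublayer over the encoder output behind self.decoder, omitted here for brevity:

class TransformerBlock(nn.Module):
    def __init__(self, hidden_size=128, num_heads=4, decoder=False, masking=True):
        super(TransformerBlock, self).__init__()
        self.decoder = decoder
        self.attn = AttentionBlock(hidden_size, num_heads, masking=masking)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.ff = nn.Sequential(nn.Linear(hidden_size, 4 * hidden_size),
                                nn.GELU(),
                                nn.Linear(4 * hidden_size, hidden_size))

    def forward(self, x):
        x = x + self.attn(self.norm1(x))  # residual self-attention
        x = x + self.ff(self.norm2(x))    # residual feed-forward
        return x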
@@ -122,7 +123,7 @@ class TransformerBlock(nn.Module):
 
 # Define a decoder module for the Transformer architecture
 class Decoder(nn.Module):
-    def __init__(self, num_emb, hidden_size=
+    def __init__(self, num_emb, hidden_size=hidden_size, num_layers=num_layers, num_heads=num_heads):
         super(Decoder, self).__init__()
 
         # Create an embedding layer for tokens
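Note: consistent with the token-embedding comment in the hunk, a skeleton of the assumed decoder wiring, reusing the TransformerBlock sketch above: embed tokens, run causally-masked blocks, project to vocabulary logits:

class Decoder(nn.Module):
    def __init__(self, num_emb, hidden_size=128, num_layers=3, num_heads=4):
        super(Decoder, self).__init__()
        # Create an embedding layer for tokens
        self.embedding = nn.Embedding(num_emb, hidden_size)
        # Causally-masked transformer blocks
        self.blocks = nn.ModuleList(
            TransformerBlock(hidden_size, num_heads, decoder=True, masking=True)
            for _ in range(num_layers))
        # Project back to vocabulary logits
        self.fc_out = nn.Linear(hidden_size, num_emb)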
@@ -165,8 +166,8 @@ class Decoder(nn.Module):
 
 # Define a Vision Encoder module for the Transformer architecture
 class VisionEncoder(nn.Module):
-    def __init__(self, image_size, channels_in, patch_size=
-                 num_layers=3, num_heads=
+    def __init__(self, image_size, channels_in, patch_size=patch_size, hidden_size=hidden_size,
+                 num_layers=3, num_heads=num_heads):
         super(VisionEncoder, self).__init__()
 
         self.patch_size = patch_size
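Note: the sequence length the encoder sees follows from the two size arguments: an image_size x image_size input yields (image_size // patch_size) ** 2 patch tokens. A sketch of the assumed front end, reusing extract_patches and TransformerBlock from the sketches above:

class VisionEncoder(nn.Module):
    def __init__(self, image_size, channels_in, patch_size=16, hidden_size=128,
                 num_layers=3, num_heads=4):
        super(VisionEncoder, self).__init__()
        self.patch_size = patch_size
        num_patches = (image_size // patch_size) ** 2
        # Linearly embed each flattened patch and add a learned position code
        self.patch_embed = nn.Linear(channels_in * patch_size * patch_size, hidden_size)
        self.pos_emb = nn.Parameter(torch.zeros(1, num_patches, hidden_size))
        self.blocks = nn.ModuleList(
            TransformerBlock(hidden_size, num_heads, masking=False)
            for _ in range(num_layers))

    def forward(self, image):
        x = self.patch_embed(extract_patches(image, self.patch_size)) + self.pos_emb
        for block in self.blocks:
            x = block(x)
        return x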
@@ -200,8 +201,8 @@ class VisionEncoder(nn.Module):
 
 # Define a Vision Encoder-Decoder module for the Transformer architecture
 class VisionEncoderDecoder(nn.Module):
-    def __init__(self, image_size, channels_in, num_emb, patch_size=
-                 hidden_size=
+    def __init__(self, image_size, channels_in, num_emb, patch_size=patch_size,
+                 hidden_size=hidden_size, num_layers=num_layers, num_heads=num_heads):
         super(VisionEncoderDecoder, self).__init__()
 
         # Create an encoder and decoder with specified parameters
@@ -225,7 +226,7 @@ class VisionEncoderDecoder(nn.Module):
                                    input_padding_mask=bool_padding_mask)
         return decoded_seq
 
-model = torch.load("caption_model.pth", weights_only=False
+model = torch.load("caption_model.pth", weights_only=False)
 model.eval()
 tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased")
 
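Note: weights_only=False matters here because PyTorch 2.6 changed torch.load's default to weights_only=True, which refuses to unpickle a full model object saved with torch.save(model, ...). A usage sketch for the loaded model and tokenizer, with an assumed forward signature and greedy decoding for brevity:

from PIL import Image
import torchvision.transforms as transforms

transform = transforms.Compose([transforms.Resize((128, 128)),  # assumed input size
                                transforms.ToTensor()])
image = transform(Image.open("example.jpg").convert("RGB")).unsqueeze(0).to(device)

tokens = torch.tensor([[tokenizer.cls_token_id]], device=device)
with torch.no_grad():
    for _ in range(32):                        # cap the caption length
        logits = model(image, tokens)          # assumed forward signature
        next_id = logits[:, -1].argmax(dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_id], dim=1)
        if next_id.item() == tokenizer.sep_token_id:
            break
print(tokenizer.decode(tokens[0], skip_special_tokens=True))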