raun12345678 committed (verified)
Commit edb3f9b · 1 Parent(s): 0769851

Update app.py

Files changed (1):
  1. app.py +10 -9
app.py CHANGED
@@ -21,7 +21,8 @@ import torchvision.transforms as transforms
 from transformers import AutoTokenizer
 device = torch.device(0 if torch.cuda.is_available() else 'cpu')
 
-def extract_patches(image_tensor, patch_size=16):
+
+def extract_patches(image_tensor, patch_size=patch_size):
     # Get the dimensions of the image tensor
     bs, c, h, w = image_tensor.size()
 
@@ -55,7 +56,7 @@ class SinusoidalPosEmb(nn.Module):
 
 # Define a module for attention blocks
 class AttentionBlock(nn.Module):
-    def __init__(self, hidden_size=128, num_heads=4, masking=True):
+    def __init__(self, hidden_size=hidden_size, num_heads=num_heads, masking=True):
         super(AttentionBlock, self).__init__()
         self.masking = masking
 
@@ -81,7 +82,7 @@ class AttentionBlock(nn.Module):
 # Define a module for a transformer block with self-attention
 # and optional causal masking
 class TransformerBlock(nn.Module):
-    def __init__(self, hidden_size=128, num_heads=4, decoder=False, masking=True):
+    def __init__(self, hidden_size=hidden_size, num_heads=num_heads, decoder=False, masking=True):
         super(TransformerBlock, self).__init__()
         self.decoder = decoder
 
@@ -122,7 +123,7 @@ class TransformerBlock(nn.Module):
 
 # Define a decoder module for the Transformer architecture
 class Decoder(nn.Module):
-    def __init__(self, num_emb, hidden_size=128, num_layers=3, num_heads=4):
+    def __init__(self, num_emb, hidden_size=hidden_size, num_layers=num_layers, num_heads=num_heads):
         super(Decoder, self).__init__()
 
         # Create an embedding layer for tokens
@@ -165,8 +166,8 @@ class Decoder(nn.Module):
 
 # Define an Vision Encoder module for the Transformer architecture
 class VisionEncoder(nn.Module):
-    def __init__(self, image_size, channels_in, patch_size=16, hidden_size=128,
-                 num_layers=3, num_heads=4):
+    def __init__(self, image_size, channels_in, patch_size=patch_size, hidden_size=hidden_size,
+                 num_layers=3, num_heads=num_heads):
         super(VisionEncoder, self).__init__()
 
         self.patch_size = patch_size
@@ -200,8 +201,8 @@ class VisionEncoder(nn.Module):
 
 # Define an Vision Encoder-Decoder module for the Transformer architecture
 class VisionEncoderDecoder(nn.Module):
-    def __init__(self, image_size, channels_in, num_emb, patch_size=16,
-                 hidden_size=128, num_layers=(3, 3), num_heads=4):
+    def __init__(self, image_size, channels_in, num_emb, patch_size=patch_size,
+                 hidden_size=hidden_size, num_layers=num_layers, num_heads=num_heads):
         super(VisionEncoderDecoder, self).__init__()
 
         # Create an encoder and decoder with specified parameters
@@ -225,7 +226,7 @@ class VisionEncoderDecoder(nn.Module):
                                  input_padding_mask=bool_padding_mask)
         return decoded_seq
 
-model = torch.load("caption_model.pth", weights_only=False,map_location=torch.device('cpu'))
+model = torch.load("caption_model.pth", weights_only=False)
 model.eval()
 tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased")
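
With this change the signatures take their defaults from module-level names (patch_size, hidden_size, num_layers, num_heads) rather than hard-coded literals, so app.py presumably defines those hyperparameters before these functions and classes; Python evaluates default values once, at definition time. A minimal sketch of such a hyperparameter block, using the previous hard-coded defaults purely as illustration (the actual values in app.py may differ):

# Hypothetical hyperparameter block near the top of app.py.
# Values mirror the old hard-coded defaults and are illustrative only.
patch_size = 16
hidden_size = 128
num_heads = 4
num_layers = (3, 3)  # VisionEncoderDecoder previously defaulted to (3, 3); Decoder used 3

# With these names in scope, the updated defaults such as
# `patch_size=patch_size` resolve when the def statements run.

Two side effects worth noting: Decoder and VisionEncoderDecoder now share the same num_layers default even though their previous defaults differed (3 vs (3, 3)), and dropping map_location from torch.load means the checkpoint tensors are restored to whatever device they were saved from rather than being forced onto the CPU.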