Update app.py
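Replaces hard-coded hyperparameter defaults with shared module-level variables (patch_size, hidden_size, num_layers, num_heads), adds a top-level extract_patches helper, and completes the torch.load call with weights_only=False.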
app.py
CHANGED
@@ -21,7 +21,8 @@ import torchvision.transforms as transforms
 from transformers import AutoTokenizer
 device = torch.device(0 if torch.cuda.is_available() else 'cpu')
 
-
+
+def extract_patches(image_tensor, patch_size=patch_size):
     # Get the dimensions of the image tensor
     bs, c, h, w = image_tensor.size()
 
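Note: because Python evaluates default argument values at definition time, the module-level names these new defaults reference must already be defined before this def (and the class definitions below) run. The hunk shows only the signature and the first body line; a minimal sketch of such a patch-extraction helper, assuming non-overlapping square patches, with illustrative values for the assumed hyperparameter block:

import torch

# Assumed module-level hyperparameters: the names come from the diff,
# the concrete values here are illustrative only.
patch_size = 16
hidden_size = 128
num_layers = 3
num_heads = 4

def extract_patches(image_tensor, patch_size=patch_size):
    # Get the dimensions of the image tensor
    bs, c, h, w = image_tensor.size()

    # Cut the image into non-overlapping patch_size x patch_size patches:
    # output shape (bs, c * patch_size * patch_size, num_patches)
    unfold = torch.nn.Unfold(kernel_size=patch_size, stride=patch_size)
    patches = unfold(image_tensor)

    # Rearrange to (bs, num_patches, patch_size * patch_size * c)
    return patches.transpose(1, 2)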
@@ -55,7 +56,7 @@ class SinusoidalPosEmb(nn.Module):
 
 # Define a module for attention blocks
 class AttentionBlock(nn.Module):
-    def __init__(self, hidden_size=
+    def __init__(self, hidden_size=hidden_size, num_heads=num_heads, masking=True):
         super(AttentionBlock, self).__init__()
         self.masking = masking
 
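Note: only the __init__ signature appears in this hunk. A minimal sketch of what the masking flag typically gates, built on nn.MultiheadAttention; the layer choices below are assumptions, not the Space's actual code:

import torch
import torch.nn as nn

class AttentionBlock(nn.Module):
    def __init__(self, hidden_size=128, num_heads=4, masking=True):
        super(AttentionBlock, self).__init__()
        self.masking = masking
        self.mha = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

    def forward(self, x):
        mask = None
        if self.masking:
            # Causal mask: position i may only attend to positions <= i
            t = x.size(1)
            mask = torch.triu(torch.ones(t, t, dtype=torch.bool, device=x.device),
                              diagonal=1)
        out, _ = self.mha(x, x, x, attn_mask=mask)
        return out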
@@ -81,7 +82,7 @@ class AttentionBlock(nn.Module):
 # Define a module for a transformer block with self-attention
 # and optional causal masking
 class TransformerBlock(nn.Module):
-    def __init__(self, hidden_size=
+    def __init__(self, hidden_size=hidden_size, num_heads=num_heads, decoder=False, masking=True):
         super(TransformerBlock, self).__init__()
         self.decoder = decoder
 
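Note: a sketch of the usual pre-norm residual structure for such a block, reusing the AttentionBlock sketch above (same imports); a real decoder block would also gate a cross-attention sublayer over the encoder output behind self.decoder, omitted here for brevity:

class TransformerBlock(nn.Module):
    def __init__(self, hidden_size=128, num_heads=4, decoder=False, masking=True):
        super(TransformerBlock, self).__init__()
        self.decoder = decoder
        self.attn = AttentionBlock(hidden_size, num_heads, masking=masking)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.ff = nn.Sequential(nn.Linear(hidden_size, 4 * hidden_size),
                                nn.GELU(),
                                nn.Linear(4 * hidden_size, hidden_size))

    def forward(self, x):
        x = x + self.attn(self.norm1(x))  # residual self-attention
        x = x + self.ff(self.norm2(x))    # residual feed-forward
        return x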
@@ -122,7 +123,7 @@ class TransformerBlock(nn.Module):
 
 # Define a decoder module for the Transformer architecture
 class Decoder(nn.Module):
-    def __init__(self, num_emb, hidden_size=
+    def __init__(self, num_emb, hidden_size=hidden_size, num_layers=num_layers, num_heads=num_heads):
         super(Decoder, self).__init__()
 
         # Create an embedding layer for tokens
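Note: consistent with the token-embedding comment in the hunk, a skeleton of the assumed decoder wiring, reusing the TransformerBlock sketch above: embed tokens, run causally-masked blocks, project to vocabulary logits:

class Decoder(nn.Module):
    def __init__(self, num_emb, hidden_size=128, num_layers=3, num_heads=4):
        super(Decoder, self).__init__()
        # Create an embedding layer for tokens
        self.embedding = nn.Embedding(num_emb, hidden_size)
        # Causally-masked transformer blocks
        self.blocks = nn.ModuleList(
            TransformerBlock(hidden_size, num_heads, decoder=True, masking=True)
            for _ in range(num_layers))
        # Project back to vocabulary logits
        self.fc_out = nn.Linear(hidden_size, num_emb)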
@@ -165,8 +166,8 @@ class Decoder(nn.Module):
 
 # Define a Vision Encoder module for the Transformer architecture
 class VisionEncoder(nn.Module):
-    def __init__(self, image_size, channels_in, patch_size=
-                 num_layers=3, num_heads=
+    def __init__(self, image_size, channels_in, patch_size=patch_size, hidden_size=hidden_size,
+                 num_layers=3, num_heads=num_heads):
         super(VisionEncoder, self).__init__()
 
         self.patch_size = patch_size
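Note: the sequence length the encoder sees follows from the two size arguments: an image_size x image_size input yields (image_size // patch_size) ** 2 patch tokens. A sketch of the assumed front end, reusing extract_patches and TransformerBlock from the sketches above:

class VisionEncoder(nn.Module):
    def __init__(self, image_size, channels_in, patch_size=16, hidden_size=128,
                 num_layers=3, num_heads=4):
        super(VisionEncoder, self).__init__()
        self.patch_size = patch_size
        num_patches = (image_size // patch_size) ** 2
        # Linearly embed each flattened patch and add a learned position code
        self.patch_embed = nn.Linear(channels_in * patch_size * patch_size, hidden_size)
        self.pos_emb = nn.Parameter(torch.zeros(1, num_patches, hidden_size))
        self.blocks = nn.ModuleList(
            TransformerBlock(hidden_size, num_heads, masking=False)
            for _ in range(num_layers))

    def forward(self, image):
        x = self.patch_embed(extract_patches(image, self.patch_size)) + self.pos_emb
        for block in self.blocks:
            x = block(x)
        return x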
@@ -200,8 +201,8 @@ class VisionEncoder(nn.Module):
 
 # Define a Vision Encoder-Decoder module for the Transformer architecture
 class VisionEncoderDecoder(nn.Module):
-    def __init__(self, image_size, channels_in, num_emb, patch_size=
-                 hidden_size=
+    def __init__(self, image_size, channels_in, num_emb, patch_size=patch_size,
+                 hidden_size=hidden_size, num_layers=num_layers, num_heads=num_heads):
         super(VisionEncoderDecoder, self).__init__()
 
         # Create an encoder and decoder with specified parameters
@@ -225,7 +226,7 @@ class VisionEncoderDecoder(nn.Module):
                                    input_padding_mask=bool_padding_mask)
         return decoded_seq
 
-model = torch.load("caption_model.pth", weights_only=False
+model = torch.load("caption_model.pth", weights_only=False)
 model.eval()
 tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased")
 
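Note: weights_only=False matters here because PyTorch 2.6 changed torch.load's default to weights_only=True, which refuses to unpickle a full model object saved with torch.save(model, ...). A usage sketch for the loaded model and tokenizer, with an assumed forward signature and greedy decoding for brevity:

from PIL import Image
import torchvision.transforms as transforms

transform = transforms.Compose([transforms.Resize((128, 128)),  # assumed input size
                                transforms.ToTensor()])
image = transform(Image.open("example.jpg").convert("RGB")).unsqueeze(0).to(device)

tokens = torch.tensor([[tokenizer.cls_token_id]], device=device)
with torch.no_grad():
    for _ in range(32):                        # cap the caption length
        logits = model(image, tokens)          # assumed forward signature
        next_id = logits[:, -1].argmax(dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_id], dim=1)
        if next_id.item() == tokenizer.sep_token_id:
            break
print(tokenizer.decode(tokens[0], skip_special_tokens=True))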