Update src/app.py
src/app.py · CHANGED (+14 −26)
```diff
@@ -38,9 +38,9 @@ st.set_page_config(
 MODEL_CONFIGS = {
     "BLIP": {
         "name": "BLIP",
-        "icon": "⭐",
+        "icon": "⭐",
         "description": "BLIP (Bootstrapping Language-Image Pre-training) is designed to learn vision-language representation from noisy web data. It excels at generating detailed and accurate image descriptions.",
-        "generate_params": {"max_length": 50, "num_beams": 5, "min_length": 10, "top_p": 0.9, "repetition_penalty": 1.5}
+        "generate_params": {"max_length": 50, "num_beams": 5, "min_length": 10, "do_sample": True, "top_p": 0.9, "repetition_penalty": 1.5}  # Added do_sample=True
     },
     "ViT-GPT2": {
         "name": "ViT-GPT2",
@@ -64,9 +64,8 @@ MODEL_CONFIGS = {
 # ......................... LOADING FUNCTIONS .....................................
 @st.cache_resource
 def load_blip_model():
-    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-
-    if torch.cuda.is_available(): model = model.to("cuda")
+    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")  # Changed to base model
+    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
     return model, processor
 
 @st.cache_resource
@@ -74,21 +73,18 @@ def load_vit_gpt2_model():
     model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
     feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
     tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-    if torch.cuda.is_available(): model = model.to("cuda")
     return model, feature_extractor, tokenizer
 
 @st.cache_resource
 def load_git_model():
     processor = AutoProcessor.from_pretrained("microsoft/git-base")
     model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
-    if torch.cuda.is_available(): model = model.to("cuda")
     return model, processor
 
 @st.cache_resource
 def load_clip_model():
-    processor = CLIPProcessor.from_pretrained("openai/clip-vit-
-    model = CLIPModel.from_pretrained("openai/clip-vit-
-    if torch.cuda.is_available(): model = model.to("cuda")
+    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")  # Changed to smaller model
+    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
     return model, processor
 
 # ......................... IMAGE PROCESSING ...............................
@@ -132,9 +128,7 @@ def generate_caption(image, model_name, models_data):
 
 def get_blip_caption(image, model, processor):
     try:
-        inputs = processor(image, return_tensors="pt")
-        if torch.cuda.is_available():
-            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        inputs = processor(images=image, return_tensors="pt", padding=True, truncation=True)
         output = model.generate(**inputs, **MODEL_CONFIGS["BLIP"]["generate_params"])
         caption = processor.decode(output[0], skip_special_tokens=True)
         return caption
@@ -143,10 +137,12 @@ def get_blip_caption(image, model, processor):
 
 def get_vit_gpt2_caption(image, model, feature_extractor, tokenizer):
     try:
-        inputs = feature_extractor(images=image, return_tensors="pt")
-
-        inputs
-
+        inputs = feature_extractor(images=image, return_tensors="pt", padding=True)
+        output = model.generate(
+            pixel_values=inputs.pixel_values,
+            **MODEL_CONFIGS["ViT-GPT2"]["generate_params"],
+            attention_mask=inputs.attention_mask if hasattr(inputs, "attention_mask") else None
+        )
         caption = tokenizer.decode(output[0], skip_special_tokens=True)
         return caption
     except Exception as e:
@@ -154,9 +150,7 @@ def get_vit_gpt2_caption(image, model, feature_extractor, tokenizer):
 
 def get_git_caption(image, model, processor):
     try:
-        inputs = processor(images=image, return_tensors="pt")
-        if torch.cuda.is_available():
-            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        inputs = processor(images=image, return_tensors="pt", padding=True)
         output = model.generate(**inputs, **MODEL_CONFIGS["GIT"]["generate_params"])
         caption = processor.decode(output[0], skip_special_tokens=True)
         return caption
@@ -190,22 +184,16 @@ STYLE_ATTRIBUTES = [
 def get_clip_caption(image, model, processor):
     try:
         content_inputs = processor(text=CONTENT_CATEGORIES, images=image, return_tensors="pt", padding=True)
-        if torch.cuda.is_available():
-            content_inputs = {k: v.to("cuda") for k, v in content_inputs.items() if torch.is_tensor(v)}
         content_outputs = model(**content_inputs)
         content_probs = content_outputs.logits_per_image.softmax(dim=1)[0]
         top_content_probs, top_content_indices = torch.topk(content_probs, 2)
 
         scene_inputs = processor(text=SCENE_ATTRIBUTES, images=image, return_tensors="pt", padding=True)
-        if torch.cuda.is_available():
-            scene_inputs = {k: v.to("cuda") for k, v in scene_inputs.items() if torch.is_tensor(v)}
         scene_outputs = model(**scene_inputs)
         scene_probs = scene_outputs.logits_per_image.softmax(dim=1)[0]
         top_scene_probs, top_scene_indices = torch.topk(scene_probs, 2)
 
         style_inputs = processor(text=STYLE_ATTRIBUTES, images=image, return_tensors="pt", padding=True)
-        if torch.cuda.is_available():
-            style_inputs = {k: v.to("cuda") for k, v in style_inputs.items() if torch.is_tensor(v)}
         style_outputs = model(**style_inputs)
         style_probs = style_outputs.logits_per_image.softmax(dim=1)[0]
         top_style_probs, top_style_indices = torch.topk(style_probs, 1)
```
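Taken together, the commit switches to smaller checkpoints, drops the manual `.to("cuda")` placement (inference now runs on the default device), and adds `do_sample=True` to the BLIP generation parameters. The sketch below is a minimal, self-contained illustration of the updated BLIP path only, under those assumptions; the helper names `load_blip` and `caption_image` and the `example.jpg` path are placeholders for illustration and are not part of this commit.

```python
# Minimal sketch of the updated BLIP captioning path (illustrative, not the app itself).
# Assumes transformers, torch, and Pillow are installed; "example.jpg" is a placeholder path.
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# Mirrors the new MODEL_CONFIGS["BLIP"]["generate_params"], including do_sample=True.
BLIP_GENERATE_PARAMS = {
    "max_length": 50,
    "num_beams": 5,
    "min_length": 10,
    "do_sample": True,
    "top_p": 0.9,
    "repetition_penalty": 1.5,
}

def load_blip():
    # Base checkpoint, no explicit CUDA placement, matching the updated load_blip_model().
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    return model, processor

def caption_image(image_path: str) -> str:
    model, processor = load_blip()
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    output = model.generate(**inputs, **BLIP_GENERATE_PARAMS)
    return processor.decode(output[0], skip_special_tokens=True)

if __name__ == "__main__":
    print(caption_image("example.jpg"))
```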