Upload 7 files
- README.md +12 -0
- app.py +153 -0
- generate_caption.py +151 -0
- image_adapter.py +111 -0
- initializer.py +13 -0
- model_initial.py +41 -0
- requirements.txt +19 -3
README.md
ADDED
@@ -0,0 +1,12 @@
---
title: Image Prompting-and-Captioning
emoji: 🖼️
colorFrom: red
colorTo: blue
sdk: streamlit
sdk_version: 1.45.1
app_file: app.py
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,153 @@
# Create a virtual environment (a GPU-enabled environment is required)
# python -m venv .venv
# Activate the virtual environment
# Windows: .venv\Scripts\activate
# Linux/Mac: source .venv/bin/activate

# Install packages
# pip install -r requirements.txt

import streamlit as st
from generate_caption import generate_caption
from PIL import Image
import io

# Set page config
st.set_page_config(
    page_title="AI Image Caption & Prompt Generator",
    page_icon="🖼️",
    layout="wide"
)

# Title and description
st.title("🖼️ AI Image Caption & Prompt Generator")

# Create two columns for layout
col1, col2 = st.columns([1, 1])

with col1:
    st.header("📤 Upload & Configure")

    # File uploader
    uploaded_file = st.file_uploader(
        "Choose an image file",
        type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'webp'],
        help="Supported formats: PNG, JPG, JPEG, GIF, BMP, TIFF, WebP"
    )

    # Caption options
    st.subheader("Caption Options")

    caption_type = st.selectbox(
        "Caption Type",
        options=["MidJourney", "Descriptive", "Training Prompt"],
        index=0,
        help="Choose the style of caption you want to generate"
    )

    caption_length = st.selectbox(
        "Caption Length",
        options=["short", "any", "long"],
        index=0,
        help="Select the desired length of the caption"
    )

    # Generate button
    generate_btn = st.button("🎯 Generate Caption", type="primary", use_container_width=True)

with col2:
    st.header("Preview & Results")

    if uploaded_file is not None:
        # Display uploaded image
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_container_width=True)

        # Generate caption when the button is clicked
        if generate_btn:
            with st.spinner("Generating caption... This may take a moment."):
                try:
                    # Generate caption
                    prompt_used, caption = generate_caption(
                        image,
                        caption_type=caption_type,
                        caption_length=caption_length,
                        extra_options=None
                    )

                    # Display results
                    st.success("Caption generated successfully!")

                    # Caption result
                    st.subheader("📝 Generated Caption")
                    st.write(f"{caption}")

                    # Code block doubles as a copy-to-clipboard widget
                    st.code(caption, language=None)

                    # Additional info
                    with st.expander("ℹ️ Generation Details"):
                        st.write(f"**Caption Type:** {caption_type}")
                        st.write(f"**Caption Length:** {caption_length}")
                        if prompt_used:
                            st.write(f"**Prompt Used:** {prompt_used}")

                except Exception as e:
                    st.error(f"Error generating caption: {str(e)}")
                    st.info("Please make sure you have installed all required dependencies and your GPU is properly configured.")

    else:
        st.markdown("""
### How to use:
1. Upload an image using the file uploader
2. Select your preferred caption type and length
3. Click 'Generate Caption' to create your AI caption

### Caption Types:
- **MidJourney**: Optimized for AI art generation prompts
- **Descriptive**: Detailed description of the image content
- **Training Prompt**: Formatted for AI model training
""")

# Sidebar with additional information
with st.sidebar:
    st.header("🔧 System Requirements")
    st.markdown("""
**Required Setup:**
- GPU-enabled environment
- Virtual environment activated
- All dependencies installed via `requirements.txt`

**Supported Image Formats:**
- PNG, JPG, JPEG
- GIF, BMP, TIFF, WebP
""")

    st.header("💡 Tips")
    st.markdown("""
- Higher-quality images produce better captions
- Different caption types serve different purposes
- Short captions are more focused, long ones more detailed
""")

    st.header("⚙️ Setup Instructions")
    with st.expander("Click to view setup commands"):
        st.code("""
# Create virtual environment
python -m venv .venv

# Activate virtual environment
# Windows:
.venv\\Scripts\\activate
# Linux/Mac:
source .venv/bin/activate

# Install dependencies
pip install -r requirements.txt

# Run Streamlit app
streamlit run app.py
""", language="bash")

# Footer
st.markdown("Built by [Aditya Singh]")
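The button handler above is the only place the model pipeline is invoked, so it can be exercised without the UI for quick debugging. A minimal sketch reproducing the handler's call from a plain Python session; `sample.jpg` is a placeholder path and a configured `GOOGLE_API_KEY` is assumed:

```python
# Reproduce the button handler's generate_caption call without Streamlit.
from PIL import Image
from generate_caption import generate_caption

image = Image.open("sample.jpg")  # placeholder test image
prompt_used, caption = generate_caption(
    image,
    caption_type="MidJourney",   # same choices as the selectbox: MidJourney, Descriptive, Training Prompt
    caption_length="short",      # same choices as the selectbox: short, any, long
    extra_options=None,
)
print("Prompt:", prompt_used)
print("Caption:", caption)
```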
generate_caption.py
ADDED
@@ -0,0 +1,151 @@
import torch
from PIL import Image
import torchvision.transforms.functional as TVF
import google.generativeai as genai
import os
from dotenv import load_dotenv

# load_dotenv()
# GEMINI_API_KEY = os.getenv('GOOGLE_API_KEY')
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")

if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel('gemini-1.5-flash')
else:
    print("Warning: GOOGLE_API_KEY not found in environment variables")
    gemini_model = None

CAPTION_TYPE_MAP = {
    "Descriptive": [
        "Write a descriptive caption for this image in a formal tone.",
        "Write a descriptive caption for this image in a formal tone within {word_count} words.",
        "Write a {length} descriptive caption for this image in a formal tone.",
    ],
    "Training Prompt": [
        "Write a stable diffusion prompt for this image.",
        "Write a stable diffusion prompt for this image within {word_count} words.",
        "Write a {length} stable diffusion prompt for this image.",
    ],
    "MidJourney": [
        "Write a MidJourney prompt for this image.",
        "Write a MidJourney prompt for this image within {word_count} words.",
        "Write a {length} MidJourney prompt for this image.",
    ],
}


def get_image_features(input_image: Image.Image, clip_model, image_adapter=None):
    """Extract features from an image using the CLIP vision model."""
    image = input_image.resize((384, 384), Image.LANCZOS)
    pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])

    with torch.no_grad():
        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)

    if image_adapter is not None:
        embedded_images = image_adapter(vision_outputs.hidden_states)
        return embedded_images
    else:
        return vision_outputs.last_hidden_state


def generate_caption(input_image: Image.Image,
                     caption_type: str = "Descriptive",
                     caption_length: str = "long",
                     extra_options: list = None,
                     name_input: str = "",
                     custom_prompt: str = "",
                     clip_model=None,
                     image_adapter=None):
    """
    Generate a caption for an image using the Gemini API.

    Args:
        input_image: PIL Image object
        caption_type: Type of caption ("Descriptive", "Training Prompt", "MidJourney")
        caption_length: Length specification ("any", "short", "long", etc., or a number as a string)
        extra_options: List of extra options appended to the prompt
        name_input: Name to use for a person/character in the image
        custom_prompt: Custom prompt that overrides the default settings
        clip_model: CLIP model (optional, for compatibility)
        image_adapter: Image adapter model (optional, for compatibility)

    Returns:
        tuple: (prompt_used, generated_caption)
    """
    if gemini_model is None:
        return "Error: Gemini API key not configured", "Please set the GOOGLE_API_KEY environment variable"

    if input_image is None:
        return "Error: No image provided", "Please provide an image"

    if extra_options is None:
        extra_options = []

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    length = None if caption_length == "any" else caption_length

    if isinstance(length, str):
        try:
            length = int(length)
        except ValueError:
            pass

    if length is None:
        map_idx = 0
    elif isinstance(length, int):
        map_idx = 1
    elif isinstance(length, str):
        map_idx = 2
    else:
        raise ValueError(f"Invalid caption length: {length}")

    prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx]

    if len(extra_options) > 0:
        prompt_str += " " + " ".join(extra_options)

    prompt_str = prompt_str.format(name=name_input, length=caption_length, word_count=caption_length)

    if custom_prompt.strip() != "":
        prompt_str = custom_prompt.strip()

    try:
        if clip_model is not None:
            image_features = get_image_features(input_image, clip_model, image_adapter)
            print(f"Extracted image features shape: {image_features.shape if hasattr(image_features, 'shape') else 'N/A'}")

        full_prompt = f"""You are a helpful image captioner.

{prompt_str}

Please analyze the provided image and generate a caption according to the instructions above. Return only the caption text, with no additional information."""

        response = gemini_model.generate_content([full_prompt, input_image])

        if response.text:
            caption = response.text.strip()
        else:
            caption = "Failed to generate caption"

    except Exception as e:
        print(f"Error generating caption: {str(e)}")
        return prompt_str, f"Error: {str(e)}"

    return prompt_str, caption


def caption_image_from_path(image_path: str, **kwargs):
    """Caption an image from a file path."""
    image = Image.open(image_path)
    return generate_caption(image, **kwargs)


def caption_image_simple(image_path: str, caption_type: str = "Descriptive"):
    """Simple interface to caption an image."""
    image = Image.open(image_path)
    prompt_used, caption = generate_caption(image, caption_type=caption_type)
    print(f"Caption: {caption}")
    return caption
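Two details of this API are easy to miss: the module reads `GOOGLE_API_KEY` at import time, so any `.env` loading has to happen before the import, and a numeric string passed as `caption_length` selects the `{word_count}` prompt template rather than the `{length}` one. A minimal local-development sketch illustrating both; the `.env` file and `photo.jpg` path are assumptions for illustration:

```python
# Load the key from a local .env file *before* importing generate_caption,
# since the module reads GOOGLE_API_KEY from the environment at import time.
from dotenv import load_dotenv
load_dotenv()  # assumes a .env file containing GOOGLE_API_KEY=<your key>

from PIL import Image
from generate_caption import generate_caption

# A numeric string picks the "... within {word_count} words." template.
prompt_used, caption = generate_caption(
    Image.open("photo.jpg"),      # placeholder image path
    caption_type="Descriptive",
    caption_length="50",
)
print(prompt_used)
print(caption)
```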
image_adapter.py
ADDED
@@ -0,0 +1,111 @@
from torch import nn
from transformers import AutoModel, AutoProcessor
from pathlib import Path
import torch
import torch.amp.autocast_mode
from PIL import Image
import os
import torchvision.transforms.functional as TVF
import base64
import io


class ImageAdapter(nn.Module):
    def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool):
        super().__init__()
        self.deep_extract = deep_extract

        if self.deep_extract:
            input_features = input_features * 5

        self.linear1 = nn.Linear(input_features, output_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(output_features, output_features)
        self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features)
        self.pos_emb = None if not pos_emb else nn.Parameter(torch.zeros(num_image_tokens, input_features))

        self.other_tokens = nn.Embedding(3, output_features)
        self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, vision_outputs: torch.Tensor):
        if self.deep_extract:
            # Concatenate hidden states from several vision layers for a richer representation
            x = torch.concat((
                vision_outputs[-2],
                vision_outputs[3],
                vision_outputs[7],
                vision_outputs[13],
                vision_outputs[20],
            ), dim=-1)
            assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"
            assert x.shape[-1] == vision_outputs[-2].shape[-1] * 5, f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}"
        else:
            x = vision_outputs[-2]

        x = self.ln1(x)

        if self.pos_emb is not None:
            assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}"
            x = x + self.pos_emb

        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)

        # Wrap the projected image tokens with learned begin/end-of-image tokens
        other_tokens = self.other_tokens(torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(x.shape[0], -1))
        assert other_tokens.shape == (x.shape[0], 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}"
        x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1)

        return x

    def get_eot_embedding(self):
        return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0)
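Since the adapter only indexes `hidden_states[-2]` when `deep_extract` is False, its shape contract can be checked without loading any pretrained weights. A minimal sketch with made-up dimensions (the real SigLIP hidden size and token count differ):

```python
# Shape check for ImageAdapter with deep_extract=False and no positional embedding;
# the dimensions below are illustrative, not SigLIP's real ones.
import torch
from image_adapter import ImageAdapter

batch, tokens, hidden, out_dim = 2, 16, 64, 128
adapter = ImageAdapter(hidden, out_dim, ln1=False, pos_emb=False,
                       num_image_tokens=tokens, deep_extract=False)

# forward() only indexes vision_outputs[-2], so a plain list stands in for
# the hidden_states tuple returned by the vision tower.
fake_hidden_states = [torch.randn(batch, tokens, hidden) for _ in range(4)]
embedded = adapter(fake_hidden_states)

# Two learned tokens wrap the projected image tokens, hence tokens + 2.
print(embedded.shape)  # torch.Size([2, 18, 128])
```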
initializer.py
ADDED
@@ -0,0 +1,13 @@
from model_initial import initialize_models
from generate_caption import generate_caption
from PIL import Image

if __name__ == "__main__" or "get_ipython" in globals():
    print("Initializing models...")
    try:
        clip_model, image_adapter = initialize_models()
        print("Models initialized successfully!")
    except Exception as e:
        print(f"Error initializing models: {e}")
        print("You can still use the basic caption functionality with Gemini API only")
        clip_model, image_adapter = None, None
model_initial.py
ADDED
@@ -0,0 +1,41 @@
from transformers import AutoModel, AutoProcessor
from pathlib import Path
from image_adapter import ImageAdapter
import torch

CLIP_PATH = "google/siglip-so400m-patch14-384"
CHECKPOINT_PATH = Path("Adieee5/Image-captioning")
# CHECKPOINT_PATH = Path("cheackpoints")


def initialize_models():
    """Initialize and load all models"""
    print("Loading CLIP")
    clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
    clip_model = AutoModel.from_pretrained(CLIP_PATH)
    clip_model = clip_model.vision_model

    if (CHECKPOINT_PATH / "clip_model.pt").exists():
        print("Loading VLM's custom vision model")
        checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu')
        checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()}
        clip_model.load_state_dict(checkpoint)
        del checkpoint
    else:
        print("Custom CLIP weights not found, using default weights")

    clip_model.eval()
    clip_model.requires_grad_(False)
    clip_model.to("cpu")

    image_adapter = None
    if (CHECKPOINT_PATH / "image_presenter.pt").exists():
        print("Loading image adapter")
        image_adapter = ImageAdapter(clip_model.config.hidden_size, 4096, False, False, 38, False)
        image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_presenter.pt", map_location="cpu"))
        image_adapter.eval()
        image_adapter.to("cpu")
    else:
        print("Image adapter not found, will use CLIP features directly")

    return clip_model, image_adapter
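Putting the pieces together: `initialize_models()` returns the vision tower and the optional adapter, and `generate_caption()` accepts them even though the caption itself comes from Gemini. A rough end-to-end sketch, assuming the optional checkpoints are reachable and `GOOGLE_API_KEY` is set; `example.jpg` is a placeholder path:

```python
# End-to-end wiring of model_initial and generate_caption.
from PIL import Image
from model_initial import initialize_models
from generate_caption import generate_caption

clip_model, image_adapter = initialize_models()   # image_adapter is None if its checkpoint is missing
prompt_used, caption = generate_caption(
    Image.open("example.jpg"),
    caption_type="Training Prompt",
    caption_length="long",
    clip_model=clip_model,        # optional: CLIP features are only extracted and logged
    image_adapter=image_adapter,
)
print(caption)
```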
requirements.txt
CHANGED
@@ -1,3 +1,19 @@
huggingface_hub
accelerate
torch
transformers
sentencepiece
peft
torchvision
protobuf
google-ai-generativelanguage==0.4.0
google-api-core==2.24.2
google-auth==2.38.0
google-generativeai==0.4.1
langchain==0.1.13
langchain-community==0.0.29
langchain-google-genai==0.0.11
python-dotenv
streamlit
watchdog