cptsubtext committed · Commit b8a909c · 1 Parent(s): 1640c7d

update files with app

- requirements.txt +4 -3
- src/streamlit_app.py +141 -38
requirements.txt CHANGED
@@ -1,3 +1,4 @@
-… (3 lines removed; contents not shown in this view)
+streamlit
+Pillow
+transformers
+torch
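For reference, a quick way to confirm that the four dependencies resolve in the Space's Python environment is an import smoke test (a minimal sketch, not part of this commit):

    # Smoke test: verify the packages pulled in by requirements.txt import cleanly.
    import PIL
    import streamlit
    import torch
    import transformers

    print("streamlit:", streamlit.__version__)
    print("Pillow:", PIL.__version__)
    print("transformers:", transformers.__version__)
    print("torch:", torch.__version__)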
src/streamlit_app.py CHANGED
@@ -1,40 +1,143 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-… (old lines 6-40, 35 lines in all, removed; contents not shown in this view)
+import os
+import json
+from PIL import Image
+from transformers import LlavaForConditionalGeneration, AutoProcessor
+import torch
+import base64
+from io import BytesIO

+# Configuration (similar to aim.py, but adapted for Streamlit)
+DEFAULT_KEYWORD_COUNT = 5
+DEFAULT_MODEL = "llava-hf/llava-1.5-7b-hf"  # A common Llava model on Hugging Face
+DEFAULT_TONE = "witty,curious"
+DEFAULT_TEMP = 0.5
+
+# Function to convert a PIL Image to base64 for display
+def convert_to_base64(pil_image):
+    buffered = BytesIO()
+    pil_image.save(buffered, format="JPEG")
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    return img_str
+
+# Function to extract keywords (from aim.py)
+def extract_keywords(keywords_string):
+    if keywords_string.startswith("Keywords: "):
+        keywords = keywords_string.replace("Keywords: ", "").strip().split(",")
+        return [keyword.strip() for keyword in keywords]
+    else:
+        return []
+
+# Load the Llava processor and model once, cached across Streamlit reruns
+@st.cache_resource
+def load_llava_model(model_name):
+    processor = AutoProcessor.from_pretrained(model_name)
+    model = LlavaForConditionalGeneration.from_pretrained(model_name)
+    return processor, model
+
+# Generate metadata using the Transformers Llava model
+def generate_metadata(image, prompt_template, model_name, temperature):
+    processor, model = load_llava_model(model_name)
+
+    # Llava models expect a conversation-style prompt with an <image>
+    # placeholder, e.g. "USER: <image>\nWhat is this?\nASSISTANT:".
+    # A more robust solution might use a chat template from the processor.
+    prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
+
+    inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+    # Generate the response
+    with torch.no_grad():
+        output = model.generate(**inputs, max_new_tokens=100, temperature=temperature, do_sample=True, top_p=0.9)
+
+    generated_text = processor.decode(output[0], skip_special_tokens=True)
+
+    # The decoded text contains the prompt followed by the model's answer,
+    # so keep only the part after the assistant marker.
+    if "ASSISTANT:" in generated_text:
+        model_response = generated_text.split("ASSISTANT:")[-1].strip()
+    else:
+        model_response = generated_text  # Fallback if the marker is not found
+
+    return model_response
+
+# Streamlit App
+st.set_page_config(layout="wide", page_title="Image Metadata Generator")
+
+st.title("📸 AI-Powered Image Metadata Generator")
+st.markdown("Upload an image and let the AI generate a catchy title, description, and keywords!")
+
+# Sidebar for configuration
+st.sidebar.header("Configuration")
+selected_model = st.sidebar.selectbox(
+    "Choose a Llava Model",
+    ["llava-hf/llava-1.5-7b-hf", "llava-hf/bakLlava-v1-hf"],  # Add more Llava models as needed
+    index=0,
+)
+temperature = st.sidebar.slider("Creativity (Temperature)", 0.0, 1.0, DEFAULT_TEMP, 0.05)
+keyword_count = st.sidebar.number_input("Number of Keywords", 1, 10, DEFAULT_KEYWORD_COUNT)
+tone_input = st.sidebar.text_input("Tone (e.g., witty, curious)", DEFAULT_TONE)
+tone = [t.strip() for t in tone_input.split(",")]
+
+uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+
+if uploaded_file is not None:
+    image = Image.open(uploaded_file).convert("RGB")
+
+    st.subheader("Uploaded Image")
+    st.image(image, caption="Uploaded Image", use_column_width=True)
+
+    if st.button("Generate Metadata"):
+        with st.spinner("Generating metadata... This might take a moment."):
+            prompt_template = f"""
+            As a photojournalist, analyze the following image and respond in a {tone[0]} and {tone[1] if len(tone) > 1 else tone[0]} tone with:
+            - Image Headline: A short, impactful title
+            - Image Description: A brief, informative summary
+            - {keyword_count} Image Keywords, separated by commas
+            Return the Image Headline, Image Description, and Image Keywords in the following format: Headline: ..., Description: ..., Keywords: ...
+            """
+
+            # Generate metadata using the selected Llava model
+            llava_response = generate_metadata(image, prompt_template, selected_model, temperature)
+
+            if llava_response:
+                st.subheader("Generated Metadata")
+
+                # Parse the response, similar to aim.py
+                lines = llava_response.split("\n")
+
+                headline = ""
+                description = ""
+                keywords = []
+
+                for line in lines:
+                    if line.startswith("Headline:"):
+                        headline = line.replace("Headline:", "").strip()
+                    elif line.startswith("Description:"):
+                        description = line.replace("Description:", "").strip()
+                    elif line.startswith("Keywords:"):
+                        keywords = extract_keywords(line)
+
+                # Remove stray quotation marks
+                headline = headline.strip('"')
+                description = description.strip('"')
+                lstkeywords = [x.strip('"') for x in keywords]
+
+                st.info(f"**Headline:** {headline}")
+                st.info(f"**Description:** {description}")
+                st.info(f"**Keywords:** {', '.join(lstkeywords)}")
+            else:
+                st.error("Failed to generate metadata. Please try again.")
+
+st.markdown("""
+---
+*This app utilizes Hugging Face's Transformers library and Llava models to generate image metadata.
+The quality of the generated metadata depends on the chosen model and the complexity of the image.*
+""")
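The comments in generate_metadata note that a hand-built "USER: <image>\n...\nASSISTANT:" string is a simplification, and that a chat template from the processor would be more robust. A minimal sketch of that variant (assuming a transformers version whose Llava processor ships a chat template; build_llava_prompt is an illustrative helper name, not part of this commit):

    from transformers import AutoProcessor

    def build_llava_prompt(processor, question):
        # One user turn holding the image slot plus the text question,
        # in the structured conversation format used by llava-hf models.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question},
                ],
            }
        ]
        # apply_chat_template renders the model-specific prompt string,
        # e.g. "USER: <image>\n{question} ASSISTANT:" for llava-1.5.
        return processor.apply_chat_template(conversation, add_generation_prompt=True)

    processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
    print(build_llava_prompt(processor, "Describe this image."))

On GPU hardware it may also be worth passing torch_dtype=torch.float16 to from_pretrained, which roughly halves the memory footprint of the 7B model; the commit loads the model at default precision.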