import os
# Fix for permission errors in Hugging Face Spaces
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"
os.environ["STREAMLIT_CACHE_DIR"] = "/tmp/.cache"
os.makedirs("/tmp/.streamlit", exist_ok=True)
os.makedirs("/tmp/.cache", exist_ok=True)
import streamlit as st
import json
from PIL import Image
from transformers import LlavaForConditionalGeneration, AutoProcessor
import torch
import base64
from io import BytesIO
# Configuration (similar to aim.py, but adapted for Streamlit)
DEFAULT_KEYWORD_COUNT = 5
DEFAULT_MODEL = "llava-hf/llava-1.5-7b-hf" # A common Llava model on Hugging Face
DEFAULT_TONE = "witty,curious"
DEFAULT_TEMP = 0.5
# Function to convert PIL Image to base64 for display
def convert_to_base64(pil_image):
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str
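# Note: convert_to_base64 is not called anywhere below; a hypothetical use is
# embedding the encoded image directly in HTML, e.g.:
#   img_b64 = convert_to_base64(image)
#   st.markdown(f'<img src="data:image/jpeg;base64,{img_b64}"/>', unsafe_allow_html=True)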
# Function to extract keywords (from aim.py)
def extract_keywords(keywords_string):
    # Accept "Keywords:" with or without a trailing space, to match the
    # startswith("Keywords:") check in the parsing loop below.
    if keywords_string.startswith("Keywords:"):
        keywords = keywords_string[len("Keywords:"):].strip().split(",")
        return [keyword.strip() for keyword in keywords]
    return []
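# Example: extract_keywords("Keywords: beach, sunset, waves")
# returns ["beach", "sunset", "waves"]; any other prefix yields [].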
# Function to generate metadata using Transformers Llava model
@st.cache_resource
def load_llava_model(model_name):
    processor = AutoProcessor.from_pretrained(model_name)
    model = LlavaForConditionalGeneration.from_pretrained(model_name)
    return processor, model
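# Note: from_pretrained loads llava-1.5-7b in full fp32 precision here
# (~28 GB of RAM). On GPU-backed hardware, passing torch_dtype=torch.float16
# would roughly halve the memory footprint.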
def generate_metadata(image, prompt_template, model_name, temperature):
    processor, model = load_llava_model(model_name)
    # LLaVA 1.5 checkpoints expect a conversation-style prompt containing an
    # <image> placeholder, e.g. "USER: <image>\nWhat is this?\nASSISTANT:".
    # (A more robust approach would use the processor's chat template.)
    prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    # Sample only when temperature > 0; fall back to greedy decoding
    # otherwise, since sampling with temperature 0 is undefined.
    gen_kwargs = {"max_new_tokens": 256}  # room for headline, description, and keywords
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    with torch.no_grad():
        output = model.generate(**inputs, **gen_kwargs)
    generated_text = processor.decode(output[0], skip_special_tokens=True)
    # The decoded text echoes the prompt, so keep only what follows the
    # final "ASSISTANT:" marker.
    if "ASSISTANT:" in generated_text:
        return generated_text.split("ASSISTANT:")[-1].strip()
    return generated_text.strip()  # Fallback if the marker is absent
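# Standalone usage sketch (assumes a local test image "test.jpg", which is
# not part of this app):
#   img = Image.open("test.jpg").convert("RGB")
#   print(generate_metadata(img, "Describe this image.", DEFAULT_MODEL, 0.5))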
# Streamlit App
st.set_page_config(layout="wide", page_title="Image Metadata Generator")
st.title("📸 AI-Powered Image Metadata Generator")
st.markdown("Upload an image and let the AI generate a catchy title, description, and keywords!")
# Sidebar for configuration
st.sidebar.header("Configuration")
selected_model = st.sidebar.selectbox(
    "Choose a LLaVA Model",
    [DEFAULT_MODEL, "llava-hf/bakLlava-v1-hf"],  # Add more LLaVA models as needed
    index=0,
)
temperature = st.sidebar.slider("Creativity (Temperature)", 0.0, 1.0, DEFAULT_TEMP, 0.05)
keyword_count = st.sidebar.number_input("Number of Keywords", 1, 10, DEFAULT_KEYWORD_COUNT)
tone_input = st.sidebar.text_input("Tone (e.g., witty, curious)", DEFAULT_TONE)
tone = [t.strip() for t in tone_input.split(',')]
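# e.g. tone_input = "witty, curious"  ->  tone == ["witty", "curious"]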
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
    image = Image.open(uploaded_file).convert("RGB")
    st.subheader("Uploaded Image")
    st.image(image, caption="Uploaded Image", use_container_width=True)
    if st.button("Generate Metadata"):
        with st.spinner("Generating metadata... This might take a moment."):
            prompt_template = f"""
As a photojournalist, analyze the following image and describe it in a {tone[0]} and {tone[1] if len(tone) > 1 else tone[0]} tone:
- Image Headline: A short, impactful title
- Image Description: A brief, informative summary
- {keyword_count} Image Keywords, separated by commas
- Return the result on three separate lines in exactly this format:
Headline: ...
Description: ...
Keywords: ...
"""
            # Generate metadata using the selected LLaVA model
            llava_response = generate_metadata(image, prompt_template, selected_model, temperature)
        if llava_response:
            st.subheader("Generated Metadata")
            # Parse the response line by line, as in aim.py
            lines = llava_response.split("\n")
            headline = ""
            description = ""
            keywords = []
            for line in lines:
                if line.startswith("Headline:"):
                    headline = line.replace("Headline:", "").strip()
                elif line.startswith("Description:"):
                    description = line.replace("Description:", "").strip()
                elif line.startswith("Keywords:"):
                    keywords = extract_keywords(line)
            # Strip any quotation marks the model may have added
            headline = headline.strip('"')
            description = description.strip('"')
            lstkeywords = [x.strip('"') for x in keywords]
            st.info(f"**Headline:** {headline}")
            st.info(f"**Description:** {description}")
            st.info(f"**Keywords:** {', '.join(lstkeywords)}")
        else:
            st.error("Failed to generate metadata. Please try again.")
st.markdown("""
---
*This app uses Hugging Face's Transformers library and LLaVA models to generate image metadata.
The quality of the generated metadata depends on the chosen model and the complexity of the image.*
""")