import os

# Fix for permission errors in Hugging Face Spaces
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"
os.environ["STREAMLIT_CACHE_DIR"] = "/tmp/.cache"
os.makedirs("/tmp/.streamlit", exist_ok=True)
os.makedirs("/tmp/.cache", exist_ok=True)

import base64
from io import BytesIO

import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Configuration (similar to aim.py, but adapted for Streamlit)
DEFAULT_KEYWORD_COUNT = 5
DEFAULT_MODEL = "llava-hf/llava-1.5-7b-hf"  # A common Llava model on Hugging Face
DEFAULT_TONE = "witty,curious"
DEFAULT_TEMP = 0.5


# Convert a PIL Image to a base64 string for display
def convert_to_base64(pil_image):
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


# Extract keywords from a "Keywords: ..." line (from aim.py)
def extract_keywords(keywords_string):
    if keywords_string.startswith("Keywords:"):
        keywords = keywords_string.replace("Keywords:", "", 1).strip().split(",")
        return [keyword.strip() for keyword in keywords]
    return []


# Load the Llava processor and model once; st.cache_resource keeps them in
# memory across Streamlit reruns so the model isn't reloaded on every click.
@st.cache_resource
def load_llava_model(model_name):
    processor = AutoProcessor.from_pretrained(model_name)
    # Half precision on GPU keeps memory usage manageable; fall back to
    # float32 on CPU, where float16 is poorly supported.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = LlavaForConditionalGeneration.from_pretrained(model_name, torch_dtype=dtype)
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    return processor, model


# Generate metadata using a Transformers Llava model
def generate_metadata(image, prompt_template, model_name, temperature):
    processor, model = load_llava_model(model_name)

    # Llava 1.5 models expect a conversation-style prompt with an <image>
    # placeholder token, which the processor replaces with the image features.
    # A more robust solution could use the processor's chat template instead.
    prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

    # Sampling with temperature=0 raises an error, so fall back to greedy
    # decoding when the slider is set to zero.
    do_sample = temperature > 0
    gen_kwargs = {"max_new_tokens": 100, "do_sample": do_sample}  # raise max_new_tokens if output is truncated
    if do_sample:
        gen_kwargs.update(temperature=temperature, top_p=0.9)

    # Generate the response
    with torch.no_grad():
        output = model.generate(**inputs, **gen_kwargs)
    generated_text = processor.decode(output[0], skip_special_tokens=True)
    # The decoded text contains the prompt followed by the model's answer,
    # so keep only the part after the final "ASSISTANT:" marker.
    if "ASSISTANT:" in generated_text:
        model_response = generated_text.split("ASSISTANT:")[-1].strip()
    else:
        model_response = generated_text  # Fallback if the marker is missing
    return model_response


# Streamlit app
st.set_page_config(layout="wide", page_title="Image Metadata Generator")

st.title("📸 AI-Powered Image Metadata Generator")
st.markdown("Upload an image and let the AI generate a catchy title, description, and keywords!")

# Sidebar for configuration
st.sidebar.header("Configuration")
selected_model = st.sidebar.selectbox(
    "Choose a Llava Model",
    ["llava-hf/llava-1.5-7b-hf", "llava-hf/bakLlava-v1-hf"],  # Add more Llava models as needed
    index=0,
)
temperature = st.sidebar.slider("Creativity (Temperature)", 0.0, 1.0, DEFAULT_TEMP, 0.05)
keyword_count = st.sidebar.number_input("Number of Keywords", 1, 10, DEFAULT_KEYWORD_COUNT)
tone_input = st.sidebar.text_input("Tone (e.g., witty, curious)", DEFAULT_TONE)
tone = [t.strip() for t in tone_input.split(",")]

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file).convert("RGB")
    st.subheader("Uploaded Image")
    st.image(image, caption="Uploaded Image", use_container_width=True)

    if st.button("Generate Metadata"):
        with st.spinner("Generating metadata... This might take a moment."):
            second_tone = tone[1] if len(tone) > 1 else tone[0]
            prompt_template = f"""As a photojournalist, analyze the following image and provide, in a {tone[0]} and {second_tone} tone:
- Image Headline: a short, impactful title
- Image Description: a brief, informative summary
- {keyword_count} Image Keywords, separated by commas
Return the Image Headline, Image Description, and Image Keywords in the following format:
Headline: ...
Description: ...
Keywords: ..."""

            # Generate metadata using the selected Llava model
            model_response = generate_metadata(image, prompt_template, selected_model, temperature)

            if model_response:
                st.subheader("Generated Metadata")

                # Parse the response line by line, similar to aim.py
                headline = ""
                description = ""
                keywords = []
                for line in model_response.split("\n"):
                    if line.startswith("Headline:"):
                        headline = line.replace("Headline:", "").strip()
                    elif line.startswith("Description:"):
                        description = line.replace("Description:", "").strip()
                    elif line.startswith("Keywords:"):
                        keywords = extract_keywords(line)

                # Remove stray quotation marks
                headline = headline.strip('"')
                description = description.strip('"')
                lstkeywords = [x.strip('"') for x in keywords]

                st.info(f"**Headline:** {headline}")
                st.info(f"**Description:** {description}")
                st.info(f"**Keywords:** {', '.join(lstkeywords)}")
            else:
                st.error("Failed to generate metadata. Please try again.")

st.markdown("""
---
*This app utilizes Hugging Face's Transformers library and Llava models to generate image metadata. The quality of the generated metadata depends on the chosen model and the complexity of the image.*
""")
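
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py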