import json
import base64
import os
import re

from dotenv import load_dotenv
from flask import Flask, jsonify
from langchain_groq import ChatGroq
from langgraph.prebuilt import create_react_agent
from transformers import BlipProcessor, BlipForConditionalGeneration

# Loads GROQ_API_KEY (and any other secrets) from a local .env file.
load_dotenv()
app = Flask(__name__)
static_image_path = os.path.join("images", "page2_print.jfif")
# Groq-hosted multimodal model used by the ReAct agent below.
llm = ChatGroq(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    temperature=0,
    max_tokens=None,
)
# Local BLIP captioning model. Note: analyze_with_blip is kept as a fallback
# captioner but is not called anywhere in this app yet.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")

def analyze_with_blip(image_pil):
    inputs = processor(image_pil, return_tensors="pt").to("cpu")
    out = model.generate(**inputs, max_new_tokens=100)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption
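# Hypothetical usage of the unused BLIP helper above (a sketch, not part of
# the request flow): open the static image with Pillow and caption it.
#   from PIL import Image
#   caption = analyze_with_blip(Image.open(static_image_path).convert("RGB"))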
@app.route("/", methods=["GET"])
def analyze_static_image():
    if not os.path.exists(static_image_path):
        return jsonify({"error": f"Image not found: {static_image_path}"}), 404

    # Load the image and encode it as a base64 string.
    with open(static_image_path, "rb") as image_file:
        image_bytes = image_file.read()
    img_base64 = base64.b64encode(image_bytes).decode("utf-8")
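    # The base64 payload is sent inline as a data URL
    # ("data:image/jpeg;base64,<payload>"), so the multimodal model can read
    # the image without it being hosted at a public URL.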
    # System prompt that constrains the model to a structured JSON answer.
    system_prompt = """
    You are an expert in visual scene understanding.
    Your job is to analyze an image and respond with structured JSON like this:
    - Any number of "Sprites": distinct characters, animals, or objects in the image that are **in front of the background** (e.g., cat, ball, crab, person, etc.).
    {
        "Sprite 1": {
            "name": "Cat",
            "description": "An orange cartoon cat with a cheerful expression, shown jumping playfully."
        },
        "Backdrop": {
            "name": "Beach Scene",
            "description": "A serene beach with sand, blue water, and a clear sky."
        }
    }
    Guidelines:
    - Focus only on the images given in square shape.
    - Don't consider blank areas in the image as the "Backdrop".
    - Do NOT classify the background scene as a sprite.
    - All characters or objects placed in the foreground should be "Sprites".
    - Use "Sprite 1", "Sprite 2", etc. for characters or figures.
    - Use "Backdrop" for the environmental setting or background behind the sprites.
    - Don't include any generic summary or explanation outside these fields.
    Return only valid JSON.
    """
    # Compose a multimodal message: instruction text plus the inline image.
    content = [
        {
            "type": "text",
            "text": "Analyze the image and describe the backdrops and characters as per the instructions."
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{img_base64}"
            }
        }
    ]

    # Tool-free ReAct agent; the system prompt steers the model's output.
    agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=system_prompt,
    )
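    # create_react_agent returns a compiled LangGraph graph: it is invoked with
    # a {"messages": [...]} dict and returns the full message history, so the
    # model's answer is the last message in response["messages"].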
    # Call the LLM through the agent.
    try:
        response = agent.invoke({"messages": [{"role": "user", "content": content}]})
        print(response)
        raw_response = response["messages"][-1].content
        # Strip optional ```json ... ``` fences around the model's answer.
        cleaned_json_str = re.sub(r"^```json\s*|\s*```$", "", raw_response.strip(), flags=re.DOTALL)
        try:
            detected_info = json.loads(cleaned_json_str)
        except json.JSONDecodeError as e:
            # If parsing fails, fall back to the raw string.
            print("JSON parsing error:", e)
            detected_info = cleaned_json_str
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    # Save the detected information to a JSON file alongside the image path.
    result = {
        "image_path": static_image_path,
        "detected_info": detected_info,
    }
    with open("detected_image_info.json", "w") as f:
        json.dump(result, f, indent=4)
    print("Detection results saved to detected_image_info.json")

    return jsonify(result)
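# Example request (assuming the default Flask dev server on port 5000):
#   curl http://127.0.0.1:5000/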
if __name__ == "__main__":
    app.run(debug=True)