import base64
import json
import os
import re

from dotenv import load_dotenv
from flask import Flask, jsonify
from langchain_groq import ChatGroq
from langgraph.prebuilt import create_react_agent
from transformers import BlipForConditionalGeneration, BlipProcessor
load_dotenv()  # loads GROQ_API_KEY (and any other settings) from a local .env file

app = Flask(__name__)

static_image_path = os.path.join("images", "page2_print.jfif")
# Multimodal chat model served by Groq; the client reads GROQ_API_KEY from the environment.
# (This must be defined: the agent below is built on top of it.)
llm = ChatGroq(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    temperature=0,
    max_tokens=None,
)
# Local BLIP captioning model (an optional helper; the Flask route below uses the Groq agent instead)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")


def analyze_with_blip(image_pil):
    """Return a short caption for a PIL image using the local BLIP model."""
    inputs = processor(image_pil, return_tensors="pt").to("cpu")
    out = model.generate(**inputs, max_new_tokens=100)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption
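
# Example usage of the BLIP helper (a sketch, not wired into the Flask app;
# it assumes the same static image file used by the route below):
#
#   from PIL import Image
#   img = Image.open(static_image_path).convert("RGB")
#   print(analyze_with_blip(img))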

@app.route("/analyze")  # route path is an assumption; the original code registered no route for this view
def analyze_static_image():
    if not os.path.exists(static_image_path):
        return jsonify({"error": f"Image not found: {static_image_path}"}), 404

    # Load the image and encode it as a base64 string for the multimodal message
    with open(static_image_path, "rb") as image_file:
        image_bytes = image_file.read()
    img_base64 = base64.b64encode(image_bytes).decode("utf-8")
    # Set the system prompt for the vision agent
    system_prompt = """
    You are an expert in visual scene understanding.
    Your job is to analyze an image and respond with structured JSON like this:
    - Any number of "Sprites": distinct characters, animals, or objects in the image that are **in front of the background** (e.g., cat, ball, crab, person, etc.).
    {
        "Sprite 1": {
            "name": "Cat",
            "description": "An orange cartoon cat with a cheerful expression, shown jumping playfully."
        },
        "Backdrop": {
            "name": "Beach Scene",
            "description": "A serene beach with sand, blue water, and a clear sky."
        }
    }
    Guidelines:
    - Focus only on the images given in square shape.
    - Don't consider blank areas of the image as the "Backdrop".
    - Do NOT classify the background scene as a sprite.
    - All characters or objects placed in the foreground should be "Sprites".
    - Use 'Sprite 1', 'Sprite 2', etc. for characters or figures.
    - Use 'Backdrop' for the environmental setting or background behind the sprites.
    - Don't include any generic summary or explanation outside these fields.
    Return only valid JSON.
    """
    # Compose the multimodal user message: an instruction plus the base64-encoded image
    content = [
        {
            "type": "text",
            "text": "Analyze the image and describe the backdrops and characters as per the instructions.",
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{img_base64}"
            },
        },
    ]

    agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=system_prompt,
    )
    # Call the agent and parse its JSON reply
    try:
        response = agent.invoke({"messages": [{"role": "user", "content": content}]})
        print(response)  # debug: full agent state
        raw_response = response["messages"][-1].content
        # Strip a ```json ... ``` fence if the model wrapped its answer in one
        cleaned_json_str = re.sub(r"^```json\s*|\s*```$", "", raw_response.strip(), flags=re.DOTALL)
        try:
            detected_info = json.loads(cleaned_json_str)
        except json.JSONDecodeError as e:
            # If parsing fails, fall back to the raw string
            print("JSON parsing error:", e)
            detected_info = cleaned_json_str
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    # Save the detected information to a JSON file and return it
    result = {
        "image_path": static_image_path,
        "detected_info": detected_info,
    }
    with open("detected_image_info.json", "w") as f:
        json.dump(result, f, indent=4)
    print("Detection results saved to detected_image_info.json")

    return jsonify(result)

if __name__ == "__main__":
    app.run(debug=True)
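
# To exercise the endpoint locally (a sketch; assumes the /analyze route added
# above and a valid GROQ_API_KEY in .env):
#
#   curl http://127.0.0.1:5000/analyze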