import base64
import json
import os
import re

from dotenv import load_dotenv
from flask import Flask, jsonify
from langchain_groq import ChatGroq
from langgraph.prebuilt import create_react_agent
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load environment variables; ChatGroq picks up GROQ_API_KEY from the environment.
load_dotenv()

app = Flask(__name__)

# Static image analyzed by the "/" route.
static_image_path = os.path.join("images", "page2_print.jfif")

# Groq-hosted multimodal model used by the agent below.
llm = ChatGroq(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    temperature=0,
    max_tokens=None,
)

# Local BLIP captioning model (CPU-only), used by analyze_with_blip below.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")

def analyze_with_blip(image_pil):
    """Generate a short caption for a PIL image with the local BLIP model."""
    inputs = processor(image_pil, return_tensors="pt").to("cpu")
    out = model.generate(**inputs, max_new_tokens=100)
    return processor.decode(out[0], skip_special_tokens=True)
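
# Hedged sketch (not called by the route below): assuming the static image exists
# on disk, a purely local caption could be produced like this:
#
#     from PIL import Image
#     caption = analyze_with_blip(Image.open(static_image_path).convert("RGB"))
#     print(caption)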

@app.route("/", methods=["GET"])
def analyze_static_image():
    if not os.path.exists(static_image_path):
        return jsonify({"error": f"Image not found: {static_image_path}"}), 404

    # Read the image and encode it as base64 for the multimodal message.
    with open(static_image_path, "rb") as image_file:
        image_bytes = image_file.read()
    img_base64 = base64.b64encode(image_bytes).decode("utf-8")

    # System prompt that constrains the model to structured JSON output.
    system_prompt = """
        You are an expert in visual scene understanding.

        Your job is to analyze an image and respond with structured JSON like this:

        - Any number of "Sprites": distinct characters, animals, or objects that
          appear **in front of the background** (e.g., cat, ball, crab, person).

        {
            "Sprite 1": {
                "name": "Cat",
                "description": "An orange cartoon cat with a cheerful expression, shown jumping playfully."
            },
            "Backdrop": {
                "name": "Beach Scene",
                "description": "A serene beach with sand, blue water, and a clear sky."
            }
        }

        Guidelines:

        - Focus only on the images given in square shapes.
        - Do not treat blank areas of the image as the "Backdrop".
        - Do NOT classify the background scene as a sprite.
        - Every character or object placed in the foreground is a "Sprite".
        - Use "Sprite 1", "Sprite 2", etc. for characters or figures.
        - Use "Backdrop" for the environmental setting behind the sprites.
        - Do not include any summary or explanation outside these fields.

        Return only valid JSON.
        """

    # Multimodal message content: a text instruction plus the base64-encoded image.
    content = [
        {
            "type": "text",
            "text": "Analyze the image and describe the backdrop and characters as instructed."
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{img_base64}"
            }
        }
    ]

    # With no tools registered, the ReAct agent simply forwards the messages to
    # the model with the system prompt prepended.
    agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=system_prompt,
    )
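
    # Hedged note: because tools is empty, this is roughly equivalent to invoking
    # the model directly (assuming langchain_core message classes):
    #
    #     from langchain_core.messages import SystemMessage, HumanMessage
    #     response = llm.invoke([SystemMessage(content=system_prompt),
    #                            HumanMessage(content=content)])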
    
    # Invoke the agent with the composed user message.
    try:
        response = agent.invoke({"messages": [{"role": "user", "content": content}]})
        print(response)
        
        raw_response = response["messages"][-1].content
        
        # Strip a leading/trailing ```json fence if the model wrapped its output in one.
        cleaned_json_str = re.sub(r"^```json\s*|\s*```$", "", raw_response.strip(), flags=re.DOTALL)
        try:
            detected_info = json.loads(cleaned_json_str)
        except json.JSONDecodeError as e:
            # If parsing fails, fall back to the raw string so the caller still gets output.
            print("JSON parsing error:", e)
            detected_info = cleaned_json_str
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    # Bundle the result for the HTTP response and an on-disk copy.
    result = {
        "image_path": static_image_path,
        "detected_info": detected_info,
    }

    # Save JSON result
    with open("detected_image_info.json", "w") as f:
        json.dump(result, f, indent=4)
    print("Detection results saved to detected_image_info.json")
    return jsonify(result)
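
# Hedged usage sketch: with the dev server below running (Flask defaults to port
# 5000), the endpoint can be exercised with:
#
#     curl http://127.0.0.1:5000/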

if __name__ == "__main__":
    app.run(debug=True)