prthm11 committed on
Commit
4012b1c
·
verified ·
1 Parent(s): 92be525

Update app_main.py

Browse files
Files changed (1) hide show
  1. app_main.py +177 -90
app_main.py CHANGED
@@ -7,24 +7,25 @@ from PIL import Image, ImageEnhance, ImageDraw
7
  from imutils.perspective import four_point_transform
8
  from dotenv import load_dotenv
9
  import pytesseract
10
- from transformers import AutoProcessor, AutoModelForImageTextToText
11
  from langchain_community.document_loaders.image_captions import ImageCaptionLoader
12
  from werkzeug.utils import secure_filename
13
- import tempfile, logging
 
 
 
14
 
15
- app = Flask(__name__)
 
 
16
 
17
- # Configure logging
18
- logging.basicConfig(
19
- level=logging.DEBUG, # Use INFO or ERROR in production
20
- format="%(asctime)s [%(levelname)s] %(message)s",
21
- handlers=[
22
- logging.FileHandler("app.log"),
23
- logging.StreamHandler()
24
- ]
25
  )
26
 
27
- logger = logging.getLogger(__name__)
28
 
29
  pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
30
  poppler_path=r"C:\poppler-23.11.0\Library\bin"
@@ -41,99 +42,185 @@ for path in [OUTPUT_FOLDER, IMAGE_FOLDER_PATH, DETECTED_IMAGE_FOLDER_PATH, JSON_
41
  os.makedirs(path, exist_ok=True)
42
 
43
  # Model Initialization
44
- smolvlm256m_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
45
- smolvlm256m_model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct").to("cpu")
 
 
 
 
 
 
 
 
46
 
47
  # SmolVLM Image Captioning functioning
48
  def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
49
- # Ensure exactly one <image> token
50
- if "<image>" not in prompt:
51
- prompt = f"<image> {prompt.strip()}"
52
-
53
- num_image_tokens = prompt.count("<image>")
54
- if num_image_tokens != 1:
55
- raise ValueError(f"Prompt must contain exactly 1 <image> token. Found {num_image_tokens}")
56
-
57
- inputs = smolvlm256m_processor(images=[image], text=[prompt], return_tensors="pt").to("cpu")
58
- output_ids = smolvlm256m_model.generate(**inputs, max_new_tokens=100)
59
- return smolvlm256m_processor.decode(output_ids[0], skip_special_tokens=True)
 
 
 
60
 
61
  # --- FUNCTION: Extract images from saved PDF ---
62
  def extract_images_from_pdf(pdf_path, output_json_path):
63
  ''' Extract images from PDF and generate structured sprite JSON '''
64
 
65
- pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0] # e.g., "scratch_crab"
66
- pdf_dir_path = os.path.dirname(pdf_path).replace("/", "\\")
67
-
68
- # Create subfolders
69
- extracted_image_subdir = os.path.join(DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
70
- json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
71
- os.makedirs(extracted_image_subdir, exist_ok=True)
72
- os.makedirs(json_subdir, exist_ok=True)
 
 
 
 
 
73
 
74
- # Output paths
75
- output_json_path = os.path.join(json_subdir, "extracted.json")
76
- final_json_path = os.path.join(json_subdir, "extracted_sprites.json")
 
 
 
 
 
 
77
 
78
- elements = partition_pdf(
79
- filename=pdf_path,
80
- strategy="hi_res",
81
- extract_image_block_types=["Image"],
82
- extract_image_block_to_payload=True, # Set to True to get base64 in output
83
- )
 
 
 
 
 
 
84
 
85
- with open(output_json_path, "w") as f:
86
- json.dump([element.to_dict() for element in elements], f, indent=4)
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
- # Display extracted images
89
- with open(output_json_path, 'r') as file:
90
- file_elements = json.load(file)
 
 
91
 
92
- # extracted_images_dir = os.path.join(os.path.dirname(output_json_path), "extracted_images")
93
- # os.makedirs(extracted_images_dir, exist_ok=True)
94
-
95
- # Prepare manipulated sprite JSON structure
96
- manipulated_json = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- # Final manipulated file (for captions)
99
- final_json_path = output_json_path.replace(".json", "_sprites.json")
100
-
101
- # If JSON already exists, load it and find the next available Sprite number
102
- if os.path.exists(final_json_path):
103
- with open(final_json_path, "r") as existing_file:
104
- manipulated = json.load(existing_file)
105
- # Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
106
- existing_keys = [int(k.replace("Sprite ", "")) for k in manipulated.keys()]
107
- start_count = max(existing_keys, default=0) + 1
108
- else:
109
- start_count = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- sprite_count = start_count
112
- for i,element in enumerate(file_elements):
113
- if "image_base64" in element["metadata"]:
114
- image_data = base64.b64decode(element["metadata"]["image_base64"])
115
- image = Image.open(io.BytesIO(image_data)).convert("RGB")
116
- image.show(title=f"Extracted Image {i+1}")
117
- image_path = os.path.join(extracted_image_subdir, f"Sprite_{i+1}.png")
118
- image.save(image_path)
119
-
120
- description = get_smolvlm_caption(image, prompt="Give a brief Description")
121
- name = get_smolvlm_caption(image, prompt="give a short name/title of this Image.")
122
-
123
- manipulated_json[f"Sprite {sprite_count}"] = {
124
- "name": name,
125
- "base64": element["metadata"]["image_base64"],
126
- "file-path": pdf_dir_path,
127
- "description":description
128
- }
129
- sprite_count += 1
130
-
131
- # Save manipulated JSON
132
- with open(final_json_path, "w") as sprite_file:
133
- json.dump(manipulated_json, sprite_file, indent=4)
134
-
135
- print(f"✅ Manipulated sprite JSON saved: {final_json_path}")
136
- return final_json_path, manipulated_json
137
 
138
  @app.route('/')
139
  def index():
 
7
  from imutils.perspective import four_point_transform
8
  from dotenv import load_dotenv
9
  import pytesseract
10
+ from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForVision2Seq
11
  from langchain_community.document_loaders.image_captions import ImageCaptionLoader
12
  from werkzeug.utils import secure_filename
13
+ import tempfile
14
+ import torch
15
+ from langchain_groq import ChatGroq
16
+ from langgraph.prebuilt import create_react_agent
17
 
18
+ load_dotenv()
19
+ # os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
20
+ groq_api_key = os.getenv("GROQ_API_KEY")
21
 
22
+ llm = ChatGroq(
23
+ model="meta-llama/llama-4-maverick-17b-128e-instruct",
24
+ temperature=0,
25
+ max_tokens=None,
 
 
 
 
26
  )
27
 
28
+ app = Flask(__name__)
29
 
30
  pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
31
  poppler_path=r"C:\poppler-23.11.0\Library\bin"
 
42
  os.makedirs(path, exist_ok=True)
43
 
44
  # Model Initialization
45
+ try:
46
+ smolvlm256m_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
47
+ # smolvlm256m_model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct").to("cpu")
48
+ smolvlm256m_model = AutoModelForVision2Seq.from_pretrained(
49
+ "HuggingFaceTB/SmolVLM-256M-Instruct",
50
+ torch_dtype=torch.bfloat16 if hasattr(torch, "bfloat16") else torch.float32,
51
+ _attn_implementation="eager"
52
+ ).to("cpu")
53
+ except Exception as e:
54
+ raise RuntimeError(f"❌ Failed to load SmolVLM model: {str(e)}")
55
 
56
  # SmolVLM Image Captioning functioning
57
  def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
58
+ try:
59
+ # Ensure exactly one <image> token
60
+ if "<image>" not in prompt:
61
+ prompt = f"<image> {prompt.strip()}"
62
+
63
+ num_image_tokens = prompt.count("<image>")
64
+ if num_image_tokens != 1:
65
+ raise ValueError(f"Prompt must contain exactly 1 <image> token. Found {num_image_tokens}")
66
+
67
+ inputs = smolvlm256m_processor(images=[image], text=[prompt], return_tensors="pt").to("cpu")
68
+ output_ids = smolvlm256m_model.generate(**inputs, max_new_tokens=100)
69
+ return smolvlm256m_processor.decode(output_ids[0], skip_special_tokens=True)
70
+ except Exception as e:
71
+ return f"❌ Error during caption generation: {str(e)}"
72
 
73
  # --- FUNCTION: Extract images from saved PDF ---
74
  def extract_images_from_pdf(pdf_path, output_json_path):
75
  ''' Extract images from PDF and generate structured sprite JSON '''
76
 
77
+ try:
78
+ pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0] # e.g., "scratch_crab"
79
+ pdf_dir_path = os.path.dirname(pdf_path).replace("/", "\\")
80
+
81
+ # Create subfolders
82
+ extracted_image_subdir = os.path.join(DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
83
+ json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
84
+ os.makedirs(extracted_image_subdir, exist_ok=True)
85
+ os.makedirs(json_subdir, exist_ok=True)
86
+
87
+ # Output paths
88
+ output_json_path = os.path.join(json_subdir, "extracted.json")
89
+ final_json_path = os.path.join(json_subdir, "extracted_sprites.json")
90
 
91
+ try:
92
+ elements = partition_pdf(
93
+ filename=pdf_path,
94
+ strategy="hi_res",
95
+ extract_image_block_types=["Image"],
96
+ extract_image_block_to_payload=True, # Set to True to get base64 in output
97
+ )
98
+ except Exception as e:
99
+ raise RuntimeError(f"❌ Failed to extract images from PDF: {str(e)}")
100
 
101
+ try:
102
+ with open(output_json_path, "w") as f:
103
+ json.dump([element.to_dict() for element in elements], f, indent=4)
104
+ except Exception as e:
105
+ raise RuntimeError(f"❌ Failed to write extracted.json: {str(e)}")
106
+
107
+ try:
108
+ # Display extracted images
109
+ with open(output_json_path, 'r') as file:
110
+ file_elements = json.load(file)
111
+ except Exception as e:
112
+ raise RuntimeError(f"❌ Failed to read extracted.json: {str(e)}")
113
 
114
+ # Prepare manipulated sprite JSON structure
115
+ manipulated_json = {}
116
+
117
+ # SET A SYSTEM PROMPT
118
+ system_prompt = """
119
+ You are an expert in visual scene understanding.
120
+ Your Job is to analyze an image and respond acoording if asked for name give simple name by analyzing it and if ask for descrption generate a short description covering its elements.
121
+
122
+ Guidelines:
123
+ - Focus only the images given in Square Shape.
124
+ - Don't Consider Blank areas in Image as.
125
+ - Don't include generic summary or explanation outside the fields.
126
+ Return only string.
127
+ """
128
 
129
+ agent = create_react_agent(
130
+ model = llm,
131
+ tools = [],
132
+ prompt = system_prompt
133
+ )
134
 
135
+ # If JSON already exists, load it and find the next available Sprite number
136
+ if os.path.exists(final_json_path):
137
+ with open(final_json_path, "r") as existing_file:
138
+ manipulated = json.load(existing_file)
139
+ # Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
140
+ existing_keys = [int(k.replace("Sprite ", "")) for k in manipulated.keys()]
141
+ start_count = max(existing_keys, default=0) + 1
142
+ else:
143
+ start_count = 1
144
+
145
+ sprite_count = start_count
146
+ for i,element in enumerate(file_elements):
147
+ if "image_base64" in element["metadata"]:
148
+ try:
149
+ image_data = base64.b64decode(element["metadata"]["image_base64"])
150
+ image = Image.open(io.BytesIO(image_data)).convert("RGB")
151
+ image.show(title=f"Extracted Image {i+1}")
152
+ image_path = os.path.join(extracted_image_subdir, f"Sprite_{i+1}.png")
153
+ image.save(image_path)
154
+ with open(image_path, "rb") as image_file:
155
+ image_bytes = image_file.read()
156
+ img_base64 = base64.b64encode(image_bytes).decode("utf-8")
157
+ # description = get_smolvlm_caption(image, prompt="Give a brief Description")
158
+ # name = get_smolvlm_caption(image, prompt="give a short name/title of this Image.")
159
+ def clean_caption_output(raw_output: str, prompt: str) -> str:
160
+ answer = raw_output.replace(prompt, '').replace("<image>", '').strip(" :-\n")
161
+ return answer
162
 
163
+ prompt_description = "Give a brief Captioning."
164
+ prompt_name = "give a short name caption of this Image."
165
+
166
+ content1 = [
167
+ {
168
+ "type": "text",
169
+ "text": f"{prompt_description}"
170
+ },
171
+ {
172
+ "type": "image_url",
173
+ "image_url": {
174
+ "url": f"data:image/jpeg;base64,{img_base64}"
175
+ }
176
+ }
177
+ ]
178
+ response1 = agent.invoke({"messages": [{"role": "user", "content":content1}]})
179
+ print(response1)
180
+ description = response1["messages"][-1].content
181
+
182
+ content2 = [
183
+ {
184
+ "type": "text",
185
+ "text": f"{prompt_name}"
186
+ },
187
+ {
188
+ "type": "image_url",
189
+ "image_url": {
190
+ "url": f"data:image/jpeg;base64,{img_base64}"
191
+ }
192
+ }
193
+ ]
194
+
195
+ response2 = agent.invoke({"messages": [{"role": "user", "content":content2}]})
196
+ print(response2)
197
+ name = response2["messages"][-1].content
198
+
199
+ #raw_description = get_smolvlm_caption(image, prompt=prompt_description)
200
+ #raw_name = get_smolvlm_caption(image, prompt=prompt_name)
201
+
202
+ #description = clean_caption_output(raw_description, prompt_description)
203
+ #name = clean_caption_output(raw_name, prompt_name)
204
+
205
+ manipulated_json[f"Sprite {sprite_count}"] = {
206
+ "name": name,
207
+ "base64": element["metadata"]["image_base64"],
208
+ "file-path": pdf_dir_path,
209
+ "description":description
210
+ }
211
+ sprite_count += 1
212
+ except Exception as e:
213
+ print(f"⚠️ Error processing Sprite {i+1}: {str(e)}")
214
+
215
+ # Save manipulated JSON
216
+ with open(final_json_path, "w") as sprite_file:
217
+ json.dump(manipulated_json, sprite_file, indent=4)
218
+
219
+ print(f"✅ Manipulated sprite JSON saved: {final_json_path}")
220
+ return final_json_path, manipulated_json
221
 
222
+ except Exception as e:
223
+ raise RuntimeError(f"❌ Error in extract_images_from_pdf: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  @app.route('/')
226
  def index():