prthm11 committed on
Commit
4959a61
·
verified ·
1 Parent(s): ea87f78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -55
app.py CHANGED
@@ -1651,31 +1651,31 @@ scratch_keywords = [
1651
  "touching", "sensing", "pen", "clear","Scratch","Code","scratch blocks"
1652
  ]
1653
 
1654
- # --- FUNCTION: Extract images from saved PDF ---
1655
- def extract_images_from_pdf(pdf_path, final_json_path_2):
1656
  ''' Extract images from PDF and generate structured sprite JSON '''
1657
  try:
1658
- pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0] # e.g., "scratch_crab"
1659
- pdf_dir_path = os.path.dirname(pdf_path).replace("/", "\\")
1660
-
1661
- # Create subfolders
1662
- extracted_image_subdir = os.path.join(
1663
- DETECTED_IMAGE_DIR, pdf_filename)
1664
- json_subdir = os.path.join(JSON_DIR, pdf_filename)
1665
- os.makedirs(extracted_image_subdir, exist_ok=True)
1666
- os.makedirs(json_subdir, exist_ok=True)
1667
-
1668
- # Output paths
1669
- output_json_path = os.path.join(json_subdir, "extracted.json")
1670
- final_json_path = os.path.join(json_subdir, "extracted_sprites.json")
1671
- final_json_path_2 = os.path.join(json_subdir, "extracted_sprites_2.json")
 
1672
 
1673
  try:
1674
  elements = partition_pdf(
1675
- filename=pdf_path,
1676
  strategy="hi_res",
1677
  extract_image_block_types=["Image"],
1678
- extract_image_block_to_payload=True, # Set to True to get base64 in output
1679
  )
1680
  except Exception as e:
1681
  raise RuntimeError(
@@ -1716,7 +1716,7 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
1716
  )
1717
 
1718
  # If JSON already exists, load it and find the next available Sprite number
1719
- if os.path.exists(final_json_path):
1720
  with open(final_json_path, "r") as existing_file:
1721
  manipulated = json.load(existing_file)
1722
  # Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
@@ -1732,17 +1732,22 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
1732
  try:
1733
  image_data = base64.b64decode(
1734
  element["metadata"]["image_base64"])
1735
- image = Image.open(io.BytesIO(image_data)).convert("RGB")
1736
 
1737
  image = upscale_image(image, scale=2)
1738
  # image.show(title=f"Extracted Image {i+1}")
1739
- image_path = os.path.join(extracted_image_subdir, f"Sprite_{i+1}.png")
1740
- image.save(image_path) # don't need to store image in local folder, process it from variable
1741
 
1742
- with open(image_path, "rb") as image_file:
1743
- image_bytes = image_file.read()
1744
- img_base64 = base64.b64encode(image_bytes).decode("utf-8")
1745
-
 
 
 
 
 
 
 
1746
  prompt_combined = """
1747
  Analyze this image and return JSON with keys:# modify prompt for "name", if it detects "code-blocks only then give name as 'scratch-block'"
1748
  {
@@ -1762,24 +1767,34 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
1762
  ]
1763
 
1764
  response = agent.invoke({"messages": [{"role": "user", "content": content}]})
1765
- result_json = json.loads(response["messages"][-1].content)
 
 
 
 
 
 
 
 
 
 
1766
  try:
1767
  name = result_json.get("name", "").strip()
1768
  description = result_json.get("description", "").strip()
1769
  except Exception as e:
1770
- logger.error(f"⚠️ Failed to extract name/description: {str(e)}")
1771
  name = "unknown"
1772
  description = "unknown"
1773
 
1774
  manipulated_json[f"Sprite {sprite_count}"] = {
1775
  "name": name,
1776
- "base64": element["metadata"]["image_base64"],
1777
- "file-path": pdf_dir_path,
1778
  "description": description
1779
  }
1780
  sprite_count += 1
1781
  except Exception as e:
1782
- print(f"⚠️ Error processing Sprite {i+1}: {str(e)}")
1783
 
1784
  # Save manipulated JSON
1785
  with open(final_json_path, "w") as sprite_file:
@@ -1800,17 +1815,13 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
1800
  else:
1801
  logger.info(f"🛑 Excluded code block-like image: {key}")
1802
 
1803
- # if not any(is_code_block(value.get("name","")) for value in manipulated_json.values()):
1804
- # return jsonify({"message":"Invalid Content"}), 400
1805
- # if not filtered_sprites:
1806
- # return "Invalid Content", {}
1807
-
1808
  # Overwrite with filtered content
1809
  with open(final_json_path_2, "w") as sprite_file:
1810
  json.dump(filtered_sprites, sprite_file, indent=4)
1811
- # print(f"✅ Manipulated sprite JSON saved: {final_json_path}")
1812
 
1813
- return final_json_path, manipulated_json
 
 
1814
  except Exception as e:
1815
  raise RuntimeError(f"❌ Error in extract_images_from_pdf: {str(e)}")
1816
 
@@ -2110,10 +2121,12 @@ def create_sb3_archive(project_folder: Path, project_id: str) -> Path | None:
2110
  zip_path = None
2111
  sb3_path = None
2112
  try:
 
2113
  zip_path_str = shutil.make_archive(str(output_base_name), 'zip', root_dir=str(project_folder))
2114
- zip_path = Path(zip_path_str)
2115
  logger.info(f"Project folder zipped to: {zip_path}")
2116
 
 
2117
  sb3_path = GEN_PROJECT_DIR / f"{project_id}.sb3"
2118
  os.rename(zip_path, sb3_path)
2119
  logger.info(f"Renamed {zip_path} to {sb3_path}")
@@ -2121,13 +2134,13 @@ def create_sb3_archive(project_folder: Path, project_id: str) -> Path | None:
2121
  return sb3_path
2122
  except Exception as e:
2123
  logger.error(f"Error creating SB3 archive for {project_id}: {e}", exc_info=True)
 
2124
  if zip_path and zip_path.exists():
2125
  os.remove(zip_path)
2126
  if sb3_path and sb3_path.exists():
2127
  os.remove(sb3_path)
2128
  return None
2129
 
2130
-
2131
  @app.route('/')
2132
  def index():
2133
  return render_template('app_index.html')
@@ -2160,7 +2173,8 @@ def download_sb3(project_id):
2160
  def process_pdf():
2161
  project_id = None
2162
  project_folder = None
2163
- temp_dir = None # Initialize temp_dir for finally block
 
2164
  try:
2165
  logger.info("Received request to process PDF.")
2166
  if 'pdf_file' not in request.files:
@@ -2184,24 +2198,23 @@ def process_pdf():
2184
 
2185
  logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")
2186
 
2187
- json_path = None
2188
- extracted_output_dir, result = extract_images_from_pdf(saved_pdf_path, json_path)
 
 
 
 
2189
 
2190
- # Ensure extracted_output_dir is a Path object for the '/' operator
2191
- # This was the source of the TypeError
2192
- if not isinstance(extracted_output_dir, Path):
2193
- extracted_output_dir = Path(extracted_output_dir)
2194
-
2195
- extracted_sprites_json_path = extracted_output_dir / "extracted_sprites.json"
2196
-
2197
- if not extracted_sprites_json_path.exists():
2198
- logger.error(f"No extracted_sprites.json found at {extracted_sprites_json_path}")
2199
  return jsonify({"error": "No extracted_sprites.json found"}), 500
2200
 
2201
- with open(extracted_sprites_json_path, 'r') as f:
2202
  sprite_data = json.load(f)
2203
 
2204
- project_output = similarity_matching(extracted_output_dir, project_folder)
 
2205
  logger.info("Similarity matching completed.")
2206
 
2207
  with open(project_output, 'r') as f:
@@ -2248,6 +2261,17 @@ def process_pdf():
2248
  if temp_dir and temp_dir.exists():
2249
  shutil.rmtree(temp_dir)
2250
  logger.info(f"Cleaned up temporary directory: {temp_dir}")
 
 
 
 
 
 
 
 
 
 
 
2251
 
2252
  @app.route('/list_projects', methods=['GET'])
2253
  def list_projects():
@@ -2268,4 +2292,4 @@ def list_projects():
2268
  return jsonify({"error": "Failed to list generated projects"}), 500
2269
 
2270
  if __name__ == '__main__':
2271
- app.run(host='0.0.0.0', port=7860, debug=True)
 
1651
  "touching", "sensing", "pen", "clear","Scratch","Code","scratch blocks"
1652
  ]
1653
 
1654
+ def extract_images_from_pdf(pdf_path: Path, json_base_dir: Path, image_base_dir: Path):
 
1655
  ''' Extract images from PDF and generate structured sprite JSON '''
1656
  try:
1657
+ pdf_filename = pdf_path.stem # e.g., "scratch_crab" from Path object
1658
+
1659
+ # Create subfolders under the provided base directories
1660
+ # This will create paths like:
1661
+ # /app/detected_images/pdf_filename/
1662
+ # /app/json_data/pdf_filename/
1663
+ extracted_image_subdir = image_base_dir / pdf_filename
1664
+ json_subdir = json_base_dir / pdf_filename
1665
+ extracted_image_subdir.mkdir(parents=True, exist_ok=True)
1666
+ json_subdir.mkdir(parents=True, exist_ok=True)
1667
+
1668
+ # Output paths (now using Path objects directly)
1669
+ output_json_path = json_subdir / "extracted.json"
1670
+ final_json_path = json_subdir / "extracted_sprites.json" # Path to extracted_sprites.json
1671
+ final_json_path_2 = json_subdir / "extracted_sprites_2.json"
1672
 
1673
  try:
1674
  elements = partition_pdf(
1675
+ filename=str(pdf_path), # partition_pdf might expect a string
1676
  strategy="hi_res",
1677
  extract_image_block_types=["Image"],
1678
+ extract_image_block_to_payload=True,
1679
  )
1680
  except Exception as e:
1681
  raise RuntimeError(
 
1716
  )
1717
 
1718
  # If JSON already exists, load it and find the next available Sprite number
1719
+ if final_json_path.exists(): # Use Path.exists()
1720
  with open(final_json_path, "r") as existing_file:
1721
  manipulated = json.load(existing_file)
1722
  # Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
 
1732
  try:
1733
  image_data = base64.b64decode(
1734
  element["metadata"]["image_base64"])
1735
+ image = Image.open(BytesIO(image_data)).convert("RGB") # Use BytesIO here
1736
 
1737
  image = upscale_image(image, scale=2)
1738
  # image.show(title=f"Extracted Image {i+1}")
 
 
1739
 
1740
+ # MODIFIED: Store image directly to BytesIO to avoid saving to disk if not needed
1741
+ # and then converting back to base64.
1742
+ img_buffer = BytesIO()
1743
+ image.save(img_buffer, format="PNG")
1744
+ img_bytes = img_buffer.getvalue()
1745
+ img_base64 = base64.b64encode(img_bytes).decode("utf-8")
1746
+
1747
+ # Also save the image to disk (done unconditionally; kept for debugging/permanent storage)
1748
+ image_path = extracted_image_subdir / f"Sprite_{i+1}.png"
1749
+ image.save(image_path)
1750
+
1751
  prompt_combined = """
1752
  Analyze this image and return JSON with keys:# modify prompt for "name", if it detects "code-blocks only then give name as 'scratch-block'"
1753
  {
 
1767
  ]
1768
 
1769
  response = agent.invoke({"messages": [{"role": "user", "content": content}]})
1770
+
1771
+ # Ensure response is handled correctly, it might be a string that needs json.loads
1772
+ try:
1773
+ # Assuming the agent returns a dictionary with 'messages' key,
1774
+ # and the last message's content is the JSON string.
1775
+ response_content_str = response.get("messages", [])[-1].content
1776
+ result_json = json.loads(response_content_str)
1777
+ except (json.JSONDecodeError, IndexError, AttributeError) as e:
1778
+ logger.error(f"⚠️ Failed to parse agent response as JSON: {e}. Response was: {response}", exc_info=True)
1779
+ result_json = {} # Default to empty dict if parsing fails
1780
+
1781
  try:
1782
  name = result_json.get("name", "").strip()
1783
  description = result_json.get("description", "").strip()
1784
  except Exception as e:
1785
+ logger.error(f"⚠️ Failed to extract name/description from result_json: {str(e)}", exc_info=True)
1786
  name = "unknown"
1787
  description = "unknown"
1788
 
1789
  manipulated_json[f"Sprite {sprite_count}"] = {
1790
  "name": name,
1791
+ "base64": element["metadata"]["image_base64"], # Keep original base64 if needed
1792
+ "file-path": str(extracted_image_subdir), # Store the directory path as string
1793
  "description": description
1794
  }
1795
  sprite_count += 1
1796
  except Exception as e:
1797
+ logger.error(f"⚠️ Error processing Sprite {i+1}: {str(e)}", exc_info=True)
1798
 
1799
  # Save manipulated JSON
1800
  with open(final_json_path, "w") as sprite_file:
 
1815
  else:
1816
  logger.info(f"🛑 Excluded code block-like image: {key}")
1817
 
 
 
 
 
 
1818
  # Overwrite with filtered content
1819
  with open(final_json_path_2, "w") as sprite_file:
1820
  json.dump(filtered_sprites, sprite_file, indent=4)
 
1821
 
1822
+ # MODIFIED RETURN VALUE: Return the Path to the primary extracted_sprites.json file
1823
+ # and the directory where it's located.
1824
+ return final_json_path, json_subdir # Return the file path and its parent directory
1825
  except Exception as e:
1826
  raise RuntimeError(f"❌ Error in extract_images_from_pdf: {str(e)}")
1827
 
 
2121
  zip_path = None
2122
  sb3_path = None
2123
  try:
2124
+ # shutil.make_archive automatically adds .zip extension
2125
  zip_path_str = shutil.make_archive(str(output_base_name), 'zip', root_dir=str(project_folder))
2126
+ zip_path = Path(zip_path_str) # Convert back to Path object
2127
  logger.info(f"Project folder zipped to: {zip_path}")
2128
 
2129
+ # 2. Rename the .zip file to .sb3
2130
  sb3_path = GEN_PROJECT_DIR / f"{project_id}.sb3"
2131
  os.rename(zip_path, sb3_path)
2132
  logger.info(f"Renamed {zip_path} to {sb3_path}")
 
2134
  return sb3_path
2135
  except Exception as e:
2136
  logger.error(f"Error creating SB3 archive for {project_id}: {e}", exc_info=True)
2137
+ # Clean up any partial files if an error occurs
2138
  if zip_path and zip_path.exists():
2139
  os.remove(zip_path)
2140
  if sb3_path and sb3_path.exists():
2141
  os.remove(sb3_path)
2142
  return None
2143
 
 
2144
  @app.route('/')
2145
  def index():
2146
  return render_template('app_index.html')
 
2173
  def process_pdf():
2174
  project_id = None
2175
  project_folder = None
2176
+ temp_dir = None
2177
+ extracted_json_parent_dir = None # Initialize for finally block cleanup or later use
2178
  try:
2179
  logger.info("Received request to process PDF.")
2180
  if 'pdf_file' not in request.files:
 
2198
 
2199
  logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")
2200
 
2201
+ # MODIFIED CALL: Pass JSON_DIR and DETECTED_IMAGE_DIR
2202
+ # extract_images_from_pdf now returns the file path to extracted_sprites.json
2203
+ # and the parent directory where it was created.
2204
+ extracted_sprites_json_file_path, extracted_json_parent_dir = extract_images_from_pdf(
2205
+ saved_pdf_path, JSON_DIR, DETECTED_IMAGE_DIR
2206
+ )
2207
 
2208
+ # Now, directly use extracted_sprites_json_file_path to check for its existence
2209
+ if not extracted_sprites_json_file_path.exists():
2210
+ logger.error(f"No extracted_sprites.json found at {extracted_sprites_json_file_path}")
 
 
 
 
 
 
2211
  return jsonify({"error": "No extracted_sprites.json found"}), 500
2212
 
2213
+ with open(extracted_sprites_json_file_path, 'r') as f:
2214
  sprite_data = json.load(f)
2215
 
2216
+ # MODIFIED CALL: Pass the extracted_json_parent_dir (the directory) to similarity_matching
2217
+ project_output = similarity_matching(extracted_json_parent_dir, project_folder)
2218
  logger.info("Similarity matching completed.")
2219
 
2220
  with open(project_output, 'r') as f:
 
2261
  if temp_dir and temp_dir.exists():
2262
  shutil.rmtree(temp_dir)
2263
  logger.info(f"Cleaned up temporary directory: {temp_dir}")
2264
+
2265
+ # Optional: Clean up the extracted JSON and image directories for this project_id
2266
+ # if extracted_json_parent_dir and extracted_json_parent_dir.exists():
2267
+ # shutil.rmtree(extracted_json_parent_dir)
2268
+ # logger.info(f"Cleaned up extracted JSON directory: {extracted_json_parent_dir}")
2269
+ # if pdf_filename_stem: # You'd need to get pdf_filename_stem from `filename` earlier
2270
+ # corresponding_image_dir = DETECTED_IMAGE_DIR / pdf_filename_stem
2271
+ # if corresponding_image_dir.exists():
2272
+ # shutil.rmtree(corresponding_image_dir)
2273
+ # logger.info(f"Cleaned up detected image directory: {corresponding_image_dir}")
2274
+
2275
 
2276
  @app.route('/list_projects', methods=['GET'])
2277
  def list_projects():
 
2292
  return jsonify({"error": "Failed to list generated projects"}), 500
2293
 
2294
  if __name__ == '__main__':
2295
+ app.run(host='0.0.0.0', port=7860, debug=True)  # listen on all interfaces, port 7860