Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1651,31 +1651,31 @@ scratch_keywords = [
|
|
1651 |
"touching", "sensing", "pen", "clear","Scratch","Code","scratch blocks"
|
1652 |
]
|
1653 |
|
1654 |
-
|
1655 |
-
def extract_images_from_pdf(pdf_path, final_json_path_2):
|
1656 |
''' Extract images from PDF and generate structured sprite JSON '''
|
1657 |
try:
|
1658 |
-
pdf_filename =
|
1659 |
-
|
1660 |
-
|
1661 |
-
#
|
1662 |
-
|
1663 |
-
|
1664 |
-
|
1665 |
-
|
1666 |
-
|
1667 |
-
|
1668 |
-
|
1669 |
-
|
1670 |
-
|
1671 |
-
|
|
|
1672 |
|
1673 |
try:
|
1674 |
elements = partition_pdf(
|
1675 |
-
filename=pdf_path,
|
1676 |
strategy="hi_res",
|
1677 |
extract_image_block_types=["Image"],
|
1678 |
-
extract_image_block_to_payload=True,
|
1679 |
)
|
1680 |
except Exception as e:
|
1681 |
raise RuntimeError(
|
@@ -1716,7 +1716,7 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
|
|
1716 |
)
|
1717 |
|
1718 |
# If JSON already exists, load it and find the next available Sprite number
|
1719 |
-
if
|
1720 |
with open(final_json_path, "r") as existing_file:
|
1721 |
manipulated = json.load(existing_file)
|
1722 |
# Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
|
@@ -1732,17 +1732,22 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
|
|
1732 |
try:
|
1733 |
image_data = base64.b64decode(
|
1734 |
element["metadata"]["image_base64"])
|
1735 |
-
image = Image.open(
|
1736 |
|
1737 |
image = upscale_image(image, scale=2)
|
1738 |
# image.show(title=f"Extracted Image {i+1}")
|
1739 |
-
image_path = os.path.join(extracted_image_subdir, f"Sprite_{i+1}.png")
|
1740 |
-
image.save(image_path) # don't need to store image in local folder, process it from variable
|
1741 |
|
1742 |
-
|
1743 |
-
|
1744 |
-
|
1745 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1746 |
prompt_combined = """
|
1747 |
Analyze this image and return JSON with keys:# modify prompt for "name", if it detects "code-blocks only then give name as 'scratch-block'"
|
1748 |
{
|
@@ -1762,24 +1767,34 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
|
|
1762 |
]
|
1763 |
|
1764 |
response = agent.invoke({"messages": [{"role": "user", "content": content}]})
|
1765 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1766 |
try:
|
1767 |
name = result_json.get("name", "").strip()
|
1768 |
description = result_json.get("description", "").strip()
|
1769 |
except Exception as e:
|
1770 |
-
logger.error(f"⚠️ Failed to extract name/description: {str(e)}")
|
1771 |
name = "unknown"
|
1772 |
description = "unknown"
|
1773 |
|
1774 |
manipulated_json[f"Sprite {sprite_count}"] = {
|
1775 |
"name": name,
|
1776 |
-
"base64": element["metadata"]["image_base64"],
|
1777 |
-
"file-path":
|
1778 |
"description": description
|
1779 |
}
|
1780 |
sprite_count += 1
|
1781 |
except Exception as e:
|
1782 |
-
|
1783 |
|
1784 |
# Save manipulated JSON
|
1785 |
with open(final_json_path, "w") as sprite_file:
|
@@ -1800,17 +1815,13 @@ def extract_images_from_pdf(pdf_path, final_json_path_2):
|
|
1800 |
else:
|
1801 |
logger.info(f"🛑 Excluded code block-like image: {key}")
|
1802 |
|
1803 |
-
# if not any(is_code_block(value.get("name","")) for value in manipulated_json.values()):
|
1804 |
-
# return jsonify({"message":"Invalid Content"}), 400
|
1805 |
-
# if not filtered_sprites:
|
1806 |
-
# return "Invalid Content", {}
|
1807 |
-
|
1808 |
# Overwrite with filtered content
|
1809 |
with open(final_json_path_2, "w") as sprite_file:
|
1810 |
json.dump(filtered_sprites, sprite_file, indent=4)
|
1811 |
-
# print(f"✅ Manipulated sprite JSON saved: {final_json_path}")
|
1812 |
|
1813 |
-
|
|
|
|
|
1814 |
except Exception as e:
|
1815 |
raise RuntimeError(f"❌ Error in extract_images_from_pdf: {str(e)}")
|
1816 |
|
@@ -2110,10 +2121,12 @@ def create_sb3_archive(project_folder: Path, project_id: str) -> Path | None:
|
|
2110 |
zip_path = None
|
2111 |
sb3_path = None
|
2112 |
try:
|
|
|
2113 |
zip_path_str = shutil.make_archive(str(output_base_name), 'zip', root_dir=str(project_folder))
|
2114 |
-
zip_path = Path(zip_path_str)
|
2115 |
logger.info(f"Project folder zipped to: {zip_path}")
|
2116 |
|
|
|
2117 |
sb3_path = GEN_PROJECT_DIR / f"{project_id}.sb3"
|
2118 |
os.rename(zip_path, sb3_path)
|
2119 |
logger.info(f"Renamed {zip_path} to {sb3_path}")
|
@@ -2121,13 +2134,13 @@ def create_sb3_archive(project_folder: Path, project_id: str) -> Path | None:
|
|
2121 |
return sb3_path
|
2122 |
except Exception as e:
|
2123 |
logger.error(f"Error creating SB3 archive for {project_id}: {e}", exc_info=True)
|
|
|
2124 |
if zip_path and zip_path.exists():
|
2125 |
os.remove(zip_path)
|
2126 |
if sb3_path and sb3_path.exists():
|
2127 |
os.remove(sb3_path)
|
2128 |
return None
|
2129 |
|
2130 |
-
|
2131 |
@app.route('/')
|
2132 |
def index():
|
2133 |
return render_template('app_index.html')
|
@@ -2160,7 +2173,8 @@ def download_sb3(project_id):
|
|
2160 |
def process_pdf():
|
2161 |
project_id = None
|
2162 |
project_folder = None
|
2163 |
-
temp_dir = None
|
|
|
2164 |
try:
|
2165 |
logger.info("Received request to process PDF.")
|
2166 |
if 'pdf_file' not in request.files:
|
@@ -2184,24 +2198,23 @@ def process_pdf():
|
|
2184 |
|
2185 |
logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")
|
2186 |
|
2187 |
-
|
2188 |
-
|
|
|
|
|
|
|
|
|
2189 |
|
2190 |
-
#
|
2191 |
-
|
2192 |
-
|
2193 |
-
extracted_output_dir = Path(extracted_output_dir)
|
2194 |
-
|
2195 |
-
extracted_sprites_json_path = extracted_output_dir / "extracted_sprites.json"
|
2196 |
-
|
2197 |
-
if not extracted_sprites_json_path.exists():
|
2198 |
-
logger.error(f"No extracted_sprites.json found at {extracted_sprites_json_path}")
|
2199 |
return jsonify({"error": "No extracted_sprites.json found"}), 500
|
2200 |
|
2201 |
-
with open(
|
2202 |
sprite_data = json.load(f)
|
2203 |
|
2204 |
-
|
|
|
2205 |
logger.info("Similarity matching completed.")
|
2206 |
|
2207 |
with open(project_output, 'r') as f:
|
@@ -2248,6 +2261,17 @@ def process_pdf():
|
|
2248 |
if temp_dir and temp_dir.exists():
|
2249 |
shutil.rmtree(temp_dir)
|
2250 |
logger.info(f"Cleaned up temporary directory: {temp_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2251 |
|
2252 |
@app.route('/list_projects', methods=['GET'])
|
2253 |
def list_projects():
|
@@ -2268,4 +2292,4 @@ def list_projects():
|
|
2268 |
return jsonify({"error": "Failed to list generated projects"}), 500
|
2269 |
|
2270 |
if __name__ == '__main__':
|
2271 |
-
app.run(host='0.0.0.0', port=7860, debug=True)
|
|
|
1651 |
"touching", "sensing", "pen", "clear","Scratch","Code","scratch blocks"
|
1652 |
]
|
1653 |
|
1654 |
+
def extract_images_from_pdf(pdf_path: Path, json_base_dir: Path, image_base_dir: Path):
|
|
|
1655 |
''' Extract images from PDF and generate structured sprite JSON '''
|
1656 |
try:
|
1657 |
+
pdf_filename = pdf_path.stem # e.g., "scratch_crab" from Path object
|
1658 |
+
|
1659 |
+
# Create subfolders under the provided base directories
|
1660 |
+
# This will create paths like:
|
1661 |
+
# /app/detected_images/pdf_filename/
|
1662 |
+
# /app/json_data/pdf_filename/
|
1663 |
+
extracted_image_subdir = image_base_dir / pdf_filename
|
1664 |
+
json_subdir = json_base_dir / pdf_filename
|
1665 |
+
extracted_image_subdir.mkdir(parents=True, exist_ok=True)
|
1666 |
+
json_subdir.mkdir(parents=True, exist_ok=True)
|
1667 |
+
|
1668 |
+
# Output paths (now using Path objects directly)
|
1669 |
+
output_json_path = json_subdir / "extracted.json"
|
1670 |
+
final_json_path = json_subdir / "extracted_sprites.json" # Path to extracted_sprites.json
|
1671 |
+
final_json_path_2 = json_subdir / "extracted_sprites_2.json"
|
1672 |
|
1673 |
try:
|
1674 |
elements = partition_pdf(
|
1675 |
+
filename=str(pdf_path), # partition_pdf might expect a string
|
1676 |
strategy="hi_res",
|
1677 |
extract_image_block_types=["Image"],
|
1678 |
+
extract_image_block_to_payload=True,
|
1679 |
)
|
1680 |
except Exception as e:
|
1681 |
raise RuntimeError(
|
|
|
1716 |
)
|
1717 |
|
1718 |
# If JSON already exists, load it and find the next available Sprite number
|
1719 |
+
if final_json_path.exists(): # Use Path.exists()
|
1720 |
with open(final_json_path, "r") as existing_file:
|
1721 |
manipulated = json.load(existing_file)
|
1722 |
# Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
|
|
|
1732 |
try:
|
1733 |
image_data = base64.b64decode(
|
1734 |
element["metadata"]["image_base64"])
|
1735 |
+
image = Image.open(BytesIO(image_data)).convert("RGB") # Use BytesIO here
|
1736 |
|
1737 |
image = upscale_image(image, scale=2)
|
1738 |
# image.show(title=f"Extracted Image {i+1}")
|
|
|
|
|
1739 |
|
1740 |
+
# MODIFIED: Store image directly to BytesIO to avoid saving to disk if not needed
|
1741 |
+
# and then converting back to base64.
|
1742 |
+
img_buffer = BytesIO()
|
1743 |
+
image.save(img_buffer, format="PNG")
|
1744 |
+
img_bytes = img_buffer.getvalue()
|
1745 |
+
img_base64 = base64.b64encode(img_bytes).decode("utf-8")
|
1746 |
+
|
1747 |
+
# Optionally save image to disk if desired for debugging/permanent storage
|
1748 |
+
image_path = extracted_image_subdir / f"Sprite_{i+1}.png"
|
1749 |
+
image.save(image_path)
|
1750 |
+
|
1751 |
prompt_combined = """
|
1752 |
Analyze this image and return JSON with keys:# modify prompt for "name", if it detects "code-blocks only then give name as 'scratch-block'"
|
1753 |
{
|
|
|
1767 |
]
|
1768 |
|
1769 |
response = agent.invoke({"messages": [{"role": "user", "content": content}]})
|
1770 |
+
|
1771 |
+
# Ensure response is handled correctly, it might be a string that needs json.loads
|
1772 |
+
try:
|
1773 |
+
# Assuming the agent returns a dictionary with 'messages' key,
|
1774 |
+
# and the last message's content is the JSON string.
|
1775 |
+
response_content_str = response.get("messages", [])[-1].content
|
1776 |
+
result_json = json.loads(response_content_str)
|
1777 |
+
except (json.JSONDecodeError, IndexError, AttributeError) as e:
|
1778 |
+
logger.error(f"⚠️ Failed to parse agent response as JSON: {e}. Response was: {response}", exc_info=True)
|
1779 |
+
result_json = {} # Default to empty dict if parsing fails
|
1780 |
+
|
1781 |
try:
|
1782 |
name = result_json.get("name", "").strip()
|
1783 |
description = result_json.get("description", "").strip()
|
1784 |
except Exception as e:
|
1785 |
+
logger.error(f"⚠️ Failed to extract name/description from result_json: {str(e)}", exc_info=True)
|
1786 |
name = "unknown"
|
1787 |
description = "unknown"
|
1788 |
|
1789 |
manipulated_json[f"Sprite {sprite_count}"] = {
|
1790 |
"name": name,
|
1791 |
+
"base64": element["metadata"]["image_base64"], # Keep original base64 if needed
|
1792 |
+
"file-path": str(extracted_image_subdir), # Store the directory path as string
|
1793 |
"description": description
|
1794 |
}
|
1795 |
sprite_count += 1
|
1796 |
except Exception as e:
|
1797 |
+
logger.error(f"⚠️ Error processing Sprite {i+1}: {str(e)}", exc_info=True)
|
1798 |
|
1799 |
# Save manipulated JSON
|
1800 |
with open(final_json_path, "w") as sprite_file:
|
|
|
1815 |
else:
|
1816 |
logger.info(f"🛑 Excluded code block-like image: {key}")
|
1817 |
|
|
|
|
|
|
|
|
|
|
|
1818 |
# Overwrite with filtered content
|
1819 |
with open(final_json_path_2, "w") as sprite_file:
|
1820 |
json.dump(filtered_sprites, sprite_file, indent=4)
|
|
|
1821 |
|
1822 |
+
# MODIFIED RETURN VALUE: Return the Path to the primary extracted_sprites.json file
|
1823 |
+
# and the directory where it's located.
|
1824 |
+
return final_json_path, json_subdir # Return the file path and its parent directory
|
1825 |
except Exception as e:
|
1826 |
raise RuntimeError(f"❌ Error in extract_images_from_pdf: {str(e)}")
|
1827 |
|
|
|
2121 |
zip_path = None
|
2122 |
sb3_path = None
|
2123 |
try:
|
2124 |
+
# shutil.make_archive automatically adds .zip extension
|
2125 |
zip_path_str = shutil.make_archive(str(output_base_name), 'zip', root_dir=str(project_folder))
|
2126 |
+
zip_path = Path(zip_path_str) # Convert back to Path object
|
2127 |
logger.info(f"Project folder zipped to: {zip_path}")
|
2128 |
|
2129 |
+
# 2. Rename the .zip file to .sb3
|
2130 |
sb3_path = GEN_PROJECT_DIR / f"{project_id}.sb3"
|
2131 |
os.rename(zip_path, sb3_path)
|
2132 |
logger.info(f"Renamed {zip_path} to {sb3_path}")
|
|
|
2134 |
return sb3_path
|
2135 |
except Exception as e:
|
2136 |
logger.error(f"Error creating SB3 archive for {project_id}: {e}", exc_info=True)
|
2137 |
+
# Clean up any partial files if an error occurs
|
2138 |
if zip_path and zip_path.exists():
|
2139 |
os.remove(zip_path)
|
2140 |
if sb3_path and sb3_path.exists():
|
2141 |
os.remove(sb3_path)
|
2142 |
return None
|
2143 |
|
|
|
2144 |
@app.route('/')
def index():
    """Handle requests to the root URL by rendering the ``app_index.html`` template."""
    landing_page = render_template('app_index.html')
    return landing_page
|
|
|
2173 |
def process_pdf():
|
2174 |
project_id = None
|
2175 |
project_folder = None
|
2176 |
+
temp_dir = None
|
2177 |
+
extracted_json_parent_dir = None # Initialize for finally block cleanup or later use
|
2178 |
try:
|
2179 |
logger.info("Received request to process PDF.")
|
2180 |
if 'pdf_file' not in request.files:
|
|
|
2198 |
|
2199 |
logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")
|
2200 |
|
2201 |
+
# MODIFIED CALL: Pass JSON_DIR and DETECTED_IMAGE_DIR
|
2202 |
+
# extract_images_from_pdf now returns the file path to extracted_sprites.json
|
2203 |
+
# and the parent directory where it was created.
|
2204 |
+
extracted_sprites_json_file_path, extracted_json_parent_dir = extract_images_from_pdf(
|
2205 |
+
saved_pdf_path, JSON_DIR, DETECTED_IMAGE_DIR
|
2206 |
+
)
|
2207 |
|
2208 |
+
# Now, directly use extracted_sprites_json_file_path to check for its existence
|
2209 |
+
if not extracted_sprites_json_file_path.exists():
|
2210 |
+
logger.error(f"No extracted_sprites.json found at {extracted_sprites_json_file_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
2211 |
return jsonify({"error": "No extracted_sprites.json found"}), 500
|
2212 |
|
2213 |
+
with open(extracted_sprites_json_file_path, 'r') as f:
|
2214 |
sprite_data = json.load(f)
|
2215 |
|
2216 |
+
# MODIFIED CALL: Pass the extracted_json_parent_dir (the directory) to similarity_matching
|
2217 |
+
project_output = similarity_matching(extracted_json_parent_dir, project_folder)
|
2218 |
logger.info("Similarity matching completed.")
|
2219 |
|
2220 |
with open(project_output, 'r') as f:
|
|
|
2261 |
if temp_dir and temp_dir.exists():
|
2262 |
shutil.rmtree(temp_dir)
|
2263 |
logger.info(f"Cleaned up temporary directory: {temp_dir}")
|
2264 |
+
|
2265 |
+
# Optional: Clean up the extracted JSON and image directories for this project_id
|
2266 |
+
# if extracted_json_parent_dir and extracted_json_parent_dir.exists():
|
2267 |
+
# shutil.rmtree(extracted_json_parent_dir)
|
2268 |
+
# logger.info(f"Cleaned up extracted JSON directory: {extracted_json_parent_dir}")
|
2269 |
+
# if pdf_filename_stem: # You'd need to get pdf_filename_stem from `filename` earlier
|
2270 |
+
# corresponding_image_dir = DETECTED_IMAGE_DIR / pdf_filename_stem
|
2271 |
+
# if corresponding_image_dir.exists():
|
2272 |
+
# shutil.rmtree(corresponding_image_dir)
|
2273 |
+
# logger.info(f"Cleaned up detected image directory: {corresponding_image_dir}")
|
2274 |
+
|
2275 |
|
2276 |
@app.route('/list_projects', methods=['GET'])
|
2277 |
def list_projects():
|
|
|
2292 |
return jsonify({"error": "Failed to list generated projects"}), 500
|
2293 |
|
2294 |
if __name__ == '__main__':
    # Start the web server only when this file is executed directly
    # (not when it is imported by another module).
    # NOTE(review): debug=True together with host='0.0.0.0' presumably makes
    # the interactive debugger reachable from other machines — confirm this
    # entry point is never used for a public deployment.
    app.run(host='0.0.0.0', port=7860, debug=True)
|