Spaces:
Running
on
Zero
Running
on
Zero
delete docs
Browse files
app.py
CHANGED
@@ -1158,6 +1158,114 @@ class PDFSearchApp:
|
|
1158 |
else:
|
1159 |
return "medium"
|
1160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1161 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
1162 |
"""
|
1163 |
Optimize selection to include consecutive pages when beneficial
|
@@ -3498,14 +3606,45 @@ def create_ui():
|
|
3498 |
visible=True
|
3499 |
)
|
3500 |
|
3501 |
-
|
3502 |
-
|
3503 |
-
|
|
|
|
|
3504 |
|
3505 |
-
|
3506 |
-
|
3507 |
-
|
3508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3509 |
|
3510 |
|
3511 |
# Event handlers
|
@@ -3522,6 +3661,46 @@ def create_ui():
|
|
3522 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
3523 |
)
|
3524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3525 |
|
3526 |
|
3527 |
|
|
|
1158 |
else:
|
1159 |
return "medium"
|
1160 |
|
1161 |
+
def delete_documents(self, collection_name=None):
    """
    Delete documents and their associated collections from the system.

    For each targeted collection this drops the Milvus collection, removes
    the collection's page-image directory under the base output directory,
    and removes the entry from ``self.indexed_docs``. The Milvus drop and
    the directory removal are best-effort: a failure is printed as a
    warning and the collection is still removed from ``self.indexed_docs``.

    Args:
        collection_name: Name of the collection to delete. If None (or any
            other falsy value), deletes all collections.

    Returns:
        Status message about the deletion operation. Errors are reported in
        the returned message rather than raised.
    """
    try:
        print("🗑️ DELETE DOCUMENTS REQUESTED")
        print(f"📋 Collection to delete: {collection_name if collection_name else 'ALL COLLECTIONS'}")

        if not self.indexed_docs:
            return "❌ No documents found to delete. Please upload some documents first."

        # Only names already registered in indexed_docs are ever joined onto
        # the base directory below, so an arbitrary typed-in value cannot
        # point rmtree at an unrelated path.
        if collection_name and collection_name not in self.indexed_docs:
            return f"❌ Collection '{collection_name}' not found. Available collections: {list(self.indexed_docs.keys())}"

        # Snapshot the names first: entries are removed while looping.
        if collection_name:
            targets = [collection_name]
        else:
            targets = list(self.indexed_docs)

        deleted_collections = []
        deleted_files = []

        for coll_name in targets:
            # Delete from Milvus (best-effort)
            try:
                middleware = Middleware(coll_name, create_collection=False)
                middleware.drop_collection()
                print(f"✅ Dropped Milvus collection: {coll_name}")
            except Exception as e:
                print(f"⚠️ Warning: Could not drop Milvus collection {coll_name}: {e}")

            # Delete page images (best-effort)
            try:
                base_output_dir = self._ensure_base_directory()
                collection_dir = os.path.join(base_output_dir, coll_name)
                if os.path.exists(collection_dir):
                    shutil.rmtree(collection_dir)
                    print(f"✅ Deleted page images directory: {collection_dir}")
                    deleted_files.append(f"Page images: {collection_dir}")
            except Exception as e:
                print(f"⚠️ Warning: Could not delete page images for {coll_name}: {e}")

            # Remove from indexed_docs
            self.indexed_docs.pop(coll_name, None)
            deleted_collections.append(coll_name)

        if collection_name:
            return f"✅ Successfully deleted collection '{collection_name}'\n📁 Deleted: {len(deleted_files)} file/directory items"

        return f"✅ Successfully deleted ALL collections ({len(deleted_collections)} total)\n📁 Deleted: {len(deleted_files)} file/directory items\n🗑️ Collections deleted: {', '.join(deleted_collections)}"

    except Exception as e:
        error_msg = f"❌ Error during document deletion: {e}"
        print(error_msg)
        print(f"❌ Traceback: {traceback.format_exc()}")
        return error_msg
|
1246 |
+
|
1247 |
+
def get_available_collections(self):
    """
    Build a human-readable summary of the collections available for deletion.

    Each collection in ``self.indexed_docs`` contributes a line with its
    name. When the stored value is a dict, a file count is added if it has
    a 'files' key and the page value is added if it has a 'pages' key. A
    blank line follows every collection.

    Returns:
        A newline-joined string describing the collections, or a fixed
        message when ``self.indexed_docs`` is empty.
    """
    if not self.indexed_docs:
        return "No collections available for deletion."

    collection_list = []
    for collection_name, collection_info in self.indexed_docs.items():
        collection_list.append(f"📁 {collection_name}")
        if isinstance(collection_info, dict):
            if 'files' in collection_info:
                collection_list.append(f" 📄 Files: {len(collection_info['files'])}")
            if 'pages' in collection_info:
                collection_list.append(f" 📊 Pages: {collection_info['pages']}")
        # Blank separator line between collections.
        collection_list.append("")

    return "\n".join(collection_list)
|
1268 |
+
|
1269 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
1270 |
"""
|
1271 |
Optimize selection to include consecutive pages when beneficial
|
|
|
3606 |
visible=True
|
3607 |
)
|
3608 |
|
3609 |
+
# Delete Documents Tab
|
3610 |
+
with gr.Tab("🗑️ Delete Documents"):
    with gr.Column():
        gr.Markdown("### Delete Document Collections")
        gr.Markdown("⚠️ **Warning**: This will permanently delete documents and their associated data from the system.")

        # Show available collections
        gr.Markdown("#### Available Collections")
        collections_display = gr.Textbox(
            label="Current Collections",
            interactive=False,
            lines=8,
            value="No collections available. Upload some documents first."
        )

        # Collection selection. An empty selection is rejected by the
        # "Delete Selected Collection" handler, so the hint points users at
        # the dedicated button for removing everything.
        collection_dropdown = gr.Dropdown(
            label="Select Collection to Delete",
            choices=[],
            value=None,
            allow_custom_value=True,
            info="Select a specific collection to delete. To remove everything, use the 'Delete ALL Collections' button."
        )

        # Delete options
        with gr.Row():
            delete_specific_btn = gr.Button("🗑️ Delete Selected Collection", variant="secondary")
            delete_all_btn = gr.Button("🗑️ Delete ALL Collections", variant="stop")

        # Status output
        delete_status = gr.Textbox(
            label="Deletion Status",
            interactive=False,
            lines=6
        )

        # Refresh button
        refresh_collections_btn = gr.Button("🔄 Refresh Collections List", variant="secondary")
|
3647 |
+
|
3648 |
|
3649 |
|
3650 |
# Event handlers
|
|
|
3661 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
3662 |
)
|
3663 |
|
3664 |
+
# Delete events
|
3665 |
+
def refresh_collections():
    """Return the collections summary text and a dropdown update with the current collection names."""
    summary_text = app.get_available_collections()
    if app.indexed_docs:
        names = list(app.indexed_docs.keys())
    else:
        names = []
    return summary_text, gr.Dropdown(choices=names)
|
3670 |
+
|
3671 |
+
def delete_specific_collection(collection_name):
    """Delete the collection chosen in the dropdown.

    Returns the status message from ``app.delete_documents``, or a prompt
    to pick a collection when the selection is missing, empty, or only
    whitespace.
    """
    # Normalise once so the emptiness check and the delete call see the same value.
    name = (collection_name or "").strip()
    if not name:
        return "❌ Please select a collection to delete."
    return app.delete_documents(name)
|
3676 |
+
|
3677 |
+
def delete_all_collections():
    """Delete every collection and return the resulting status message."""
    # No collection name is passed, so delete_documents runs with its default argument.
    status_message = app.delete_documents()
    return status_message
|
3680 |
+
|
3681 |
+
# Delete event handlers
|
3682 |
+
refresh_collections_btn.click(
    fn=refresh_collections,
    outputs=[collections_display, collection_dropdown]
)

# After a deletion finishes, re-run the refresh so the collections list and
# the dropdown choices do not keep showing collections that no longer exist.
delete_specific_btn.click(
    fn=delete_specific_collection,
    inputs=[collection_dropdown],
    outputs=[delete_status]
).then(
    fn=refresh_collections,
    outputs=[collections_display, collection_dropdown]
)

delete_all_btn.click(
    fn=delete_all_collections,
    outputs=[delete_status]
).then(
    fn=refresh_collections,
    outputs=[collections_display, collection_dropdown]
)

# Initialize collections on page load
demo.load(
    fn=refresh_collections,
    outputs=[collections_display, collection_dropdown]
)
|
3703 |
+
|
3704 |
|
3705 |
|
3706 |
|