Kazel committed on
Commit
31c8759
·
1 Parent(s): cfd58b0

delete docs

Browse files
Files changed (1) hide show
  1. app.py +186 -7
app.py CHANGED
@@ -1158,6 +1158,114 @@ class PDFSearchApp:
1158
  else:
1159
  return "medium"
1160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1161
  def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
1162
  """
1163
  Optimize selection to include consecutive pages when beneficial
@@ -3498,14 +3606,45 @@ def create_ui():
3498
  visible=True
3499
  )
3500
 
3501
-
3502
-
3503
-
 
 
3504
 
3505
-
3506
-
3507
-
3508
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3509
 
3510
 
3511
  # Event handlers
@@ -3522,6 +3661,46 @@ def create_ui():
3522
  outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
3523
  )
3524
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3525
 
3526
 
3527
 
 
1158
  else:
1159
  return "medium"
1160
 
1161
def delete_documents(self, collection_name=None):
    """
    Delete document collections and their associated data from the system.

    For every targeted collection this drops the Milvus collection, removes
    the collection's directory under the path returned by
    ``self._ensure_base_directory()``, and removes the entry from
    ``self.indexed_docs``. A failure of the Milvus drop or of the directory
    removal is printed as a warning and does not stop the remaining cleanup.

    Args:
        collection_name: Name of the collection to delete. Only ``None``
            means "delete all collections". Any other value, including an
            empty string, is looked up as a collection name, so a blank
            name can never wipe every collection by accident.

    Returns:
        Status message about the deletion operation
    """
    try:
        # Explicit None check: a falsy-but-not-None name (e.g. "") must not
        # be silently promoted to "delete everything".
        delete_all = collection_name is None

        print("πŸ—‘οΈ DELETE DOCUMENTS REQUESTED")
        print(f"πŸ“ Collection to delete: {'ALL COLLECTIONS' if delete_all else collection_name}")

        if not self.indexed_docs:
            return "❌ No documents found to delete. Please upload some documents first."

        # Resolve the list of collections to remove up front so that both
        # modes share a single deletion loop.
        if delete_all:
            targets = list(self.indexed_docs.keys())
        elif collection_name in self.indexed_docs:
            targets = [collection_name]
        else:
            return f"❌ Collection '{collection_name}' not found. Available collections: {list(self.indexed_docs.keys())}"

        deleted_collections = []
        deleted_files = []

        for coll_name in targets:
            # Delete from Milvus (best effort: a failure is only a warning)
            try:
                middleware = Middleware(coll_name, create_collection=False)
                middleware.drop_collection()
                print(f"βœ… Dropped Milvus collection: {coll_name}")
            except Exception as e:
                print(f"⚠️ Warning: Could not drop Milvus collection {coll_name}: {e}")

            # Delete page images (best effort: a failure is only a warning)
            try:
                base_output_dir = self._ensure_base_directory()
                collection_dir = os.path.join(base_output_dir, coll_name)
                if os.path.exists(collection_dir):
                    shutil.rmtree(collection_dir)
                    print(f"βœ… Deleted page images directory: {collection_dir}")
                    deleted_files.append(f"Page images: {collection_dir}")
            except Exception as e:
                print(f"⚠️ Warning: Could not delete page images for {coll_name}: {e}")

            # Remove from indexed_docs
            self.indexed_docs.pop(coll_name, None)
            deleted_collections.append(coll_name)

        if delete_all:
            return f"βœ… Successfully deleted ALL collections ({len(deleted_collections)} total)\nπŸ“ Deleted: {len(deleted_files)} file/directory items\nπŸ—‘οΈ Collections deleted: {', '.join(deleted_collections)}"

        return f"βœ… Successfully deleted collection '{collection_name}'\nπŸ“ Deleted: {len(deleted_files)} file/directory items"

    except Exception as e:
        # Top-level boundary for the UI: report the failure as a status
        # message instead of raising.
        error_msg = f"❌ Error during document deletion: {str(e)}"
        print(f"{error_msg}")
        print(f"❌ Traceback: {traceback.format_exc()}")
        return error_msg
1246
+
1247
def get_available_collections(self):
    """
    Build a human-readable overview of the collections in ``self.indexed_docs``.

    Each collection contributes a header line with its name, optional
    "Files" / "Pages" detail lines when its info is a dict carrying those
    keys, and a trailing blank line as separator.

    Returns:
        A newline-joined string describing the collections, or a fixed
        placeholder message when there are none.
    """
    docs = self.indexed_docs
    if not docs:
        return "No collections available for deletion."

    lines = []
    for name, info in docs.items():
        entry = [f"πŸ“ {name}"]
        # Only dict-shaped info can carry the optional detail keys.
        details = info if isinstance(info, dict) else {}
        if 'files' in details:
            entry.append(f" πŸ“„ Files: {len(details['files'])}")
        if 'pages' in details:
            entry.append(f" πŸ“„ Pages: {details['pages']}")
        entry.append("")
        lines.extend(entry)

    return "\n".join(lines)
1268
+
1269
  def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
1270
  """
1271
  Optimize selection to include consecutive pages when beneficial
 
3606
  visible=True
3607
  )
3608
 
3609
+ # Delete Documents Tab
3610
+ with gr.Tab("πŸ—‘οΈ Delete Documents"):
3611
+ with gr.Column():
3612
+ gr.Markdown("### Delete Document Collections")
3613
+ gr.Markdown("⚠️ **Warning**: This will permanently delete documents and their associated data from the system.")
3614
 
3615
+ # Show available collections
3616
+ gr.Markdown("#### Available Collections")
3617
+ collections_display = gr.Textbox(
3618
+ label="Current Collections",
3619
+ interactive=False,
3620
+ lines=8,
3621
+ value="No collections available. Upload some documents first."
3622
+ )
3623
+
3624
+ # Collection selection
3625
+ collection_dropdown = gr.Dropdown(
3626
+ label="Select Collection to Delete",
3627
+ choices=[],
3628
+ value=None,
3629
+ allow_custom_value=True,
3630
+ info="Select a specific collection to delete, or leave empty to delete all collections"
3631
+ )
3632
+
3633
+ # Delete options
3634
+ with gr.Row():
3635
+ delete_specific_btn = gr.Button("πŸ—‘οΈ Delete Selected Collection", variant="secondary")
3636
+ delete_all_btn = gr.Button("πŸ—‘οΈ Delete ALL Collections", variant="stop")
3637
+
3638
+ # Status output
3639
+ delete_status = gr.Textbox(
3640
+ label="Deletion Status",
3641
+ interactive=False,
3642
+ lines=6
3643
+ )
3644
+
3645
+ # Refresh button
3646
+ refresh_collections_btn = gr.Button("πŸ”„ Refresh Collections List", variant="secondary")
3647
+
3648
 
3649
 
3650
  # Event handlers
 
3661
  outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
3662
  )
3663
 
3664
+ # Delete events
3665
def refresh_collections():
    """Return the current collections summary text and a dropdown update.

    The first element is the text from ``app.get_available_collections()``;
    the second is a ``gr.Dropdown`` whose choices are the keys of
    ``app.indexed_docs`` (an empty list when there are none).
    """
    summary = app.get_available_collections()
    names = []
    if app.indexed_docs:
        names = list(app.indexed_docs.keys())
    return summary, gr.Dropdown(choices=names)
3670
+
3671
def delete_specific_collection(collection_name):
    """Delete the collection chosen in the dropdown.

    A missing or whitespace-only selection is rejected with a status
    message; otherwise the trimmed name is passed to
    ``app.delete_documents`` and its status message is returned.
    """
    cleaned = collection_name.strip() if collection_name else ""
    if cleaned == "":
        return "❌ Please select a collection to delete."
    return app.delete_documents(cleaned)
3676
+
3677
def delete_all_collections():
    """Delete every collection and return the resulting status message."""
    # No collection name means "all collections" for app.delete_documents.
    status = app.delete_documents()
    return status
3680
+
3681
+ # Delete event handlers
3682
+ refresh_collections_btn.click(
3683
+ fn=refresh_collections,
3684
+ outputs=[collections_display, collection_dropdown]
3685
+ )
3686
+
3687
+ delete_specific_btn.click(
3688
+ fn=delete_specific_collection,
3689
+ inputs=[collection_dropdown],
3690
+ outputs=[delete_status]
3691
+ )
3692
+
3693
+ delete_all_btn.click(
3694
+ fn=delete_all_collections,
3695
+ outputs=[delete_status]
3696
+ )
3697
+
3698
+ # Initialize collections on page load
3699
+ demo.load(
3700
+ fn=refresh_collections,
3701
+ outputs=[collections_display, collection_dropdown]
3702
+ )
3703
+
3704
 
3705
 
3706