Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Cleanup script that removes all web crawler data from MongoDB and,
after confirmation, lists and deletes the files related to simple_crawler.
"""
import os | |
import sys | |
import logging | |
import shutil | |
from pymongo import MongoClient | |
# Logging setup: emit INFO and above, each record prefixed with a timestamp,
# the logger name and the level. All messages from this script go through the
# "cleanup" logger.
logging.basicConfig(
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("cleanup")
def cleanup_mongodb():
    """Drop every collection in the local ``crawler`` MongoDB database.

    Connects to ``mongodb://localhost:27017/``, lists the collections of the
    ``crawler`` database and drops them one by one, logging each step.

    Returns:
        bool: True when the cleanup ran without raising (including the case
        where there was nothing to drop), False when any exception occurred;
        the exception is logged, not re-raised.
    """
    client = None
    try:
        logger.info("Connecting to MongoDB...")
        client = MongoClient("mongodb://localhost:27017/")
        db = client["crawler"]

        collections = db.list_collection_names()
        if not collections:
            logger.info("No collections found in the crawler database")
        else:
            logger.info(f"Found {len(collections)} collections to drop: {collections}")
            for collection in collections:
                logger.info(f"Dropping collection: {collection}")
                db[collection].drop()
            logger.info("All crawler collections dropped successfully")

        # Collections are dropped individually; client.drop_database("crawler")
        # is deliberately not called here.
        logger.info("MongoDB cleanup completed")
    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}")
        return False
    finally:
        # Always close the client so the connection is not leaked, whether
        # the cleanup succeeded or failed part-way through.
        if client is not None:
            client.close()
    return True
def _remove_path(path):
    """Delete the file, symlink or directory tree located at *path*."""
    # shutil.rmtree refuses to operate on a symlink, so only real directories
    # go through it; symlinks (even to directories) are unlinked like files.
    if os.path.isdir(path) and not os.path.islink(path):
        logger.info(f"Removing directory: {path}")
        shutil.rmtree(path)
    else:
        logger.info(f"Removing file: {path}")
        os.remove(path)


def cleanup_files():
    """List the simple_crawler files next to this script and remove them.

    The candidates are ``simple_crawler.py``, ``README_SIMPLE.md``,
    ``simple_crawler.log`` and, when present, the ``storage`` directory, all
    resolved relative to the directory containing this script. The list is
    logged and the user is asked to confirm before anything is deleted.

    Returns:
        bool: True when every existing candidate was removed. False when the
        user declined, when any removal failed, or when an unexpected
        exception occurred (errors are logged, not re-raised).
    """
    try:
        crawler_dir = os.path.dirname(os.path.abspath(__file__))

        # Files directly related to simple_crawler
        simple_crawler_files = [
            os.path.join(crawler_dir, name)
            for name in ("simple_crawler.py", "README_SIMPLE.md", "simple_crawler.log")
        ]

        # lexists (not exists) so that a dangling symlink is still detected.
        storage_dir = os.path.join(crawler_dir, "storage")
        if os.path.lexists(storage_dir):
            logger.info(f"Will remove storage directory: {storage_dir}")
            simple_crawler_files.append(storage_dir)

        # Show the user exactly what is about to be deleted.
        logger.info("The following files will be removed:")
        for file_path in simple_crawler_files:
            if os.path.lexists(file_path):
                logger.info(f" - {file_path}")
            else:
                logger.info(f" - {file_path} (not found)")

        # Tolerate surrounding whitespace and a spelled-out "yes".
        confirm = input("Do you want to proceed with removal? (y/n): ")
        if confirm.strip().lower() not in ("y", "yes"):
            logger.info("File removal cancelled")
            return False

        # Remove each path independently so one failure does not leave the
        # remaining candidates untouched.
        all_removed = True
        for file_path in simple_crawler_files:
            if not os.path.lexists(file_path):
                continue
            try:
                _remove_path(file_path)
            except OSError as e:
                logger.error(f"Error cleaning up files: {e}")
                all_removed = False

        if not all_removed:
            return False
        logger.info("File cleanup completed")
    except Exception as e:
        logger.error(f"Error cleaning up files: {e}")
        return False
    return True
if __name__ == "__main__":
    # Banner describing what the utility is about to do, followed by a
    # blank line before the prompt.
    for banner_line in (
        "Web Crawler Cleanup Utility",
        "---------------------------",
        "This script will:",
        "1. Remove all web crawler collections from MongoDB",
        "2. List and remove files related to simple_crawler",
        "",
    ):
        print(banner_line)

    # Anything other than "y"/"Y" aborts with a success exit status.
    answer = input("Do you want to proceed? (y/n): ")
    if answer.lower() != 'y':
        print("Cleanup cancelled")
        sys.exit(0)

    # Run both steps in order; the file step runs even if MongoDB failed.
    print("\nStep 1: Cleaning up MongoDB...")
    mongo_ok = cleanup_mongodb()

    print("\nStep 2: Cleaning up files...")
    files_ok = cleanup_files()

    # Report the outcome of each step.
    print("\nCleanup Summary:")
    for step_label, step_ok in (("MongoDB cleanup", mongo_ok), ("File cleanup", files_ok)):
        outcome = "Completed" if step_ok else "Failed"
        print(f"{step_label}: {outcome}")