Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
""" | |
Script to remove all web crawler data from MongoDB.
Asks for interactive confirmation unless run with the --force argument.
""" | |
import logging | |
from pymongo import MongoClient | |
import sys | |
# Logging setup: the root logger emits INFO and above, each record prefixed
# with a timestamp, the logger name and the level. This script logs through
# its own named logger, "mongo_cleanup".
logging.basicConfig(
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("mongo_cleanup")
def cleanup_mongodb():
    """Remove all web crawler data from MongoDB.

    Connects to ``mongodb://localhost:27017/``, drops every collection in the
    ``crawler`` database and then drops the database itself. Afterwards every
    remaining database except ``admin``, ``config`` and ``local`` is scanned:
    if it contains a ``urls`` or ``pages`` collection, its ``urls``, ``pages``,
    ``domains`` and ``stats`` collections are dropped too.

    WARNING: this function never asks for confirmation. Collections in other
    databases are matched purely by name and dropped immediately.

    Returns:
        bool: True when the cleanup ran to completion, False when any
        exception was raised (the error is logged with its traceback).
    """
    system_dbs = ('admin', 'config', 'local')
    # Presence of any of these marks a database as crawler-related.
    marker_collections = ('urls', 'pages')
    # Collections removed from a database once it is marked.
    related_collections = ('urls', 'pages', 'domains', 'stats')

    try:
        logger.info("Connecting to MongoDB...")
        # The context manager closes the client (and its connection pool)
        # on both the success and the error path.
        with MongoClient("mongodb://localhost:27017/") as client:
            db = client["crawler"]

            # Drop collections one by one so each removal is logged.
            collections = db.list_collection_names()
            if not collections:
                logger.info("No collections found in the crawler database")
            else:
                logger.info(f"Found {len(collections)} collections to drop: {collections}")
                for collection in collections:
                    logger.info(f"Dropping collection: {collection}")
                    db[collection].drop()
                logger.info("All crawler collections dropped successfully")

            logger.info("Dropping entire crawler database")
            client.drop_database("crawler")

            # Look for crawler-related collections left in other databases.
            for db_name in client.list_database_names():
                if db_name in system_dbs:
                    continue
                other_db = client[db_name]
                # Fetch the collection names once per database instead of
                # issuing a server round-trip for every membership test.
                existing = set(other_db.list_collection_names())
                if not existing.intersection(marker_collections):
                    continue
                logger.info(f"Found crawler-related collections in database: {db_name}")
                for collection in related_collections:
                    if collection in existing:
                        logger.info(f"Dropping collection {db_name}.{collection}")
                        other_db[collection].drop()

        logger.info("MongoDB cleanup completed successfully")
        return True
    except Exception as e:
        # Top-level boundary: report failure through the return value, but
        # keep the traceback in the log for diagnosis.
        logger.exception(f"Error cleaning up MongoDB: {e}")
        return False
def main(argv=None):
    """Run the cleanup from the command line and return the exit status.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``. If the first argument is ``--force`` the cleanup
            runs without prompting, otherwise the user is asked to confirm.

    Returns:
        int: 0 when the cleanup succeeded or the user cancelled, 1 when
        ``cleanup_mongodb()`` reported failure.
    """
    if argv is None:
        argv = sys.argv

    print("MongoDB Crawler Data Cleanup")
    print("--------------------------")
    print("This script will remove all web crawler collections from MongoDB")
    print()

    if len(argv) > 1 and argv[1] == '--force':
        # Non-interactive mode for scripting
        return 0 if cleanup_mongodb() else 1

    # Interactive mode
    try:
        proceed = input("Do you want to proceed with MongoDB cleanup? (y/n): ")
    except EOFError:
        # stdin is closed or empty: nobody confirmed, so treat it as "no".
        print()
        proceed = ""

    if proceed.strip().lower() not in ('y', 'yes'):
        print("Cleanup cancelled")
        return 0

    success = cleanup_mongodb()
    print(f"\nMongoDB cleanup: {'Completed' if success else 'Failed'}")
    # Report failure through the exit status, as the --force path does.
    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())