#!/usr/bin/env python3
"""
Main script to run the web crawler with command line arguments
"""
import os
import sys
import time
import logging
import argparse
import signal
from urllib.parse import urlparse

# Add the current directory to path if needed
script_dir = os.path.dirname(os.path.abspath(__file__))
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)

# Configure logging - do this first
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'crawler.log'))
    ]
)
logger = logging.getLogger("run_crawler")

# Now import the crawler components
logger.info("Importing crawler modules...")
try:
    from crawler import Crawler
    from models import Priority
    logger.info("Successfully imported crawler modules")
except Exception as e:
    logger.error(f"Error importing crawler modules: {e}", exc_info=True)
    sys.exit(1)


def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Run the web crawler with custom settings')
    parser.add_argument('--seed', nargs='+', metavar='URL',
                        help='One or more seed URLs to start crawling')
    parser.add_argument('--depth', type=int, default=None,
                        help='Maximum crawl depth')
    parser.add_argument('--workers', type=int, default=None,
                        help='Number of worker threads')
    parser.add_argument('--delay', type=float, default=None,
                        help='Delay between requests to the same domain (in seconds)')
    parser.add_argument('--respect-robots', dest='respect_robots', action='store_true',
                        help='Respect robots.txt rules')
    parser.add_argument('--ignore-robots', dest='respect_robots', action='store_false',
                        help='Ignore robots.txt rules')
    parser.add_argument('--user-agent', type=str, default=None,
                        help='User agent to use for requests')
    parser.add_argument('--async', dest='async_mode', action='store_true',
                        help='Use async mode for requests')
    parser.add_argument('--domain-filter', type=str, default=None,
                        help='Only crawl URLs that match this domain')
    parser.add_argument('--reset-db', action='store_true',
                        help='Reset MongoDB and flush Redis data before starting')
    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose logging')

    # Keep respect_robots as None unless one of the robots flags was passed,
    # so the config default is only overridden when the user explicitly asks.
    parser.set_defaults(respect_robots=None)

    args = parser.parse_args()

    # Set log level based on verbose flag
    if args.verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")

    return args


def reset_databases():
    """Reset MongoDB and flush Redis data"""
    success = True

    # Reset MongoDB
    try:
        logger.info("Starting MongoDB cleanup...")
        from mongo_cleanup import cleanup_mongodb
        mongo_success = cleanup_mongodb()
        if not mongo_success:
            logger.warning("MongoDB cleanup may not have been completely successful")
            success = False
        else:
            logger.info("MongoDB cleanup completed successfully")
    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}", exc_info=True)
        success = False

    # Flush Redis
    try:
        logger.info("Starting Redis flush...")
        import redis
        logger.debug("Connecting to Redis to flush data...")

        # Set a timeout for Redis connection
        r = redis.Redis(host='localhost', port=6379, db=0, socket_timeout=5)

        # Check if Redis is available
        try:
            logger.debug("Testing Redis connection...")
            ping_result = r.ping()
            logger.debug(f"Redis ping result: {ping_result}")

            # If connection works, flush all data
            logger.info("Flushing all Redis data...")
            result = r.flushall()
            logger.info(f"Redis flush result: {result}")
        except redis.ConnectionError as e:
            logger.error(f"Redis connection error: {e}")
            success = False
    except Exception as e:
e: logger.error(f"Error flushing Redis: {e}", exc_info=True) success = False return success def setup_signal_handlers(crawler_instance): """Setup signal handlers for graceful shutdown""" def signal_handler(sig, frame): logger.info(f"Received signal {sig}, shutting down gracefully...") if crawler_instance and crawler_instance.running: logger.info("Stopping crawler...") crawler_instance.stop() sys.exit(0) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) def run_crawler(): """Run the crawler with command-line arguments""" args = parse_arguments() crawler = None try: logger.info("Starting the web crawler...") # Reset database if requested if args.reset_db: logger.info("Resetting MongoDB and flushing Redis data...") if not reset_databases(): logger.warning("Database reset was not completely successful") # Create crawler instance logger.info("Creating crawler instance...") crawler = Crawler() logger.info("Crawler instance created successfully") # Setup signal handlers setup_signal_handlers(crawler) # Override settings from command line if provided if args.depth is not None: import config config.MAX_DEPTH = args.depth logger.info(f"Setting maximum depth to {args.depth}") if args.delay is not None: import config config.DELAY_BETWEEN_REQUESTS = args.delay logger.info(f"Setting delay between requests to {args.delay} seconds") if args.respect_robots is not None: import config config.RESPECT_ROBOTS_TXT = args.respect_robots logger.info(f"Respect robots.txt: {args.respect_robots}") if args.user_agent is not None: import config config.USER_AGENT = args.user_agent logger.info(f"Using user agent: {args.user_agent}") # Add seed URLs if provided if args.seed: logger.info(f"Adding {len(args.seed)} seed URLs") seed_urls = [] for url in args.seed: if not (url.startswith('http://') or url.startswith('https://')): url = 'https://' + url seed_urls.append(url) logger.debug(f"Added seed URL: {url}") # Add the URLs to the frontier logger.info("Adding seed URLs to frontier...") added = crawler.add_seed_urls(seed_urls, Priority.VERY_HIGH) logger.info(f"Successfully added {added} seed URLs to the frontier") # Apply domain filter if provided if args.domain_filter: import config # Allow both domain.com or http://domain.com formats domain = args.domain_filter if domain.startswith('http://') or domain.startswith('https://'): domain = urlparse(domain).netloc config.ALLOWED_DOMAINS = [domain] logger.info(f"Filtering to domain: {domain}") # Start the crawler num_workers = args.workers if args.workers is not None else 4 logger.info(f"Starting crawler with {num_workers} workers...") crawler.start(num_workers=num_workers, async_mode=args.async_mode) # If we get here, crawler has finished or was stopped logger.info("Crawler finished") except KeyboardInterrupt: logger.info("Crawler interrupted by user") if crawler and crawler.running: logger.info("Stopping crawler...") crawler.stop() except Exception as e: logger.error(f"Error running crawler: {e}", exc_info=True) if crawler and crawler.running: try: logger.info("Attempting to stop crawler after error...") crawler.stop() except: pass if __name__ == "__main__": run_crawler()