#!/usr/bin/env python3
"""
Main script to run the web crawler with command line arguments
"""
import os
import sys
import time
import logging
import argparse
import signal
from urllib.parse import urlparse
# Add the current directory to path if needed
script_dir = os.path.dirname(os.path.abspath(__file__))
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)
# Configure logging - do this first
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'crawler.log'))
    ]
)
logger = logging.getLogger("run_crawler")
# Now import the crawler components
logger.info("Importing crawler modules...")
try:
    from crawler import Crawler
    from models import Priority
    logger.info("Successfully imported crawler modules")
except Exception as e:
    logger.error(f"Error importing crawler modules: {e}", exc_info=True)
    sys.exit(1)

def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Run the web crawler with custom settings')
    parser.add_argument('--seed', nargs='+', metavar='URL',
                        help='One or more seed URLs to start crawling')
    parser.add_argument('--depth', type=int, default=None,
                        help='Maximum crawl depth')
    parser.add_argument('--workers', type=int, default=None,
                        help='Number of worker threads')
    parser.add_argument('--delay', type=float, default=None,
                        help='Delay between requests to the same domain (in seconds)')
    parser.add_argument('--respect-robots', dest='respect_robots', action='store_true',
                        help='Respect robots.txt rules')
    parser.add_argument('--ignore-robots', dest='respect_robots', action='store_false',
                        help='Ignore robots.txt rules')
    # Leave respect_robots as None when neither flag is given so the
    # config default is preserved (see the check in run_crawler()).
    parser.set_defaults(respect_robots=None)
    parser.add_argument('--user-agent', type=str, default=None,
                        help='User agent to use for requests')
    parser.add_argument('--async', dest='async_mode', action='store_true',
                        help='Use async mode for requests')
    parser.add_argument('--domain-filter', type=str, default=None,
                        help='Only crawl URLs that match this domain')
    parser.add_argument('--reset-db', action='store_true',
                        help='Reset MongoDB and flush Redis data before starting')
    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose logging')
    args = parser.parse_args()
    # Set log level based on verbose flag
    if args.verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")
    return args

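# Example (hypothetical values): `python run_crawler.py --seed example.com --depth 2 --verbose`
# yields args.seed == ['example.com'], args.depth == 2 and args.verbose == True;
# options that were not passed (--workers, --delay, --user-agent,
# --respect-robots/--ignore-robots, --domain-filter) remain None.
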
def reset_databases():
    """Reset MongoDB and flush Redis data"""
    success = True
    # Reset MongoDB
    try:
        logger.info("Starting MongoDB cleanup...")
        from mongo_cleanup import cleanup_mongodb
        mongo_success = cleanup_mongodb()
        if not mongo_success:
            logger.warning("MongoDB cleanup may not have been completely successful")
            success = False
        else:
            logger.info("MongoDB cleanup completed successfully")
    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}", exc_info=True)
        success = False
    # Flush Redis
    try:
        logger.info("Starting Redis flush...")
        import redis
        logger.debug("Connecting to Redis to flush data...")
        # Set a timeout for Redis connection
        r = redis.Redis(host='localhost', port=6379, db=0, socket_timeout=5)
        # Check if Redis is available
        try:
            logger.debug("Testing Redis connection...")
            ping_result = r.ping()
            logger.debug(f"Redis ping result: {ping_result}")
            # If connection works, flush all data
            logger.info("Flushing all Redis data...")
            result = r.flushall()
            logger.info(f"Redis flush result: {result}")
        except redis.ConnectionError as e:
            logger.error(f"Redis connection error: {e}")
            success = False
    except Exception as e:
        logger.error(f"Error flushing Redis: {e}", exc_info=True)
        success = False
    return success

def setup_signal_handlers(crawler_instance):
    """Setup signal handlers for graceful shutdown"""
    def signal_handler(sig, frame):
        logger.info(f"Received signal {sig}, shutting down gracefully...")
        if crawler_instance and crawler_instance.running:
            logger.info("Stopping crawler...")
            crawler_instance.stop()
        sys.exit(0)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

def run_crawler():
    """Run the crawler with command-line arguments"""
    args = parse_arguments()
    crawler = None
    try:
        logger.info("Starting the web crawler...")
        # Reset databases if requested
        if args.reset_db:
            logger.info("Resetting MongoDB and flushing Redis data...")
            if not reset_databases():
                logger.warning("Database reset was not completely successful")
        # Create crawler instance
        logger.info("Creating crawler instance...")
        crawler = Crawler()
        logger.info("Crawler instance created successfully")
        # Setup signal handlers
        setup_signal_handlers(crawler)
        # Override settings from command line if provided
        import config
        if args.depth is not None:
            config.MAX_DEPTH = args.depth
            logger.info(f"Setting maximum depth to {args.depth}")
        if args.delay is not None:
            config.DELAY_BETWEEN_REQUESTS = args.delay
            logger.info(f"Setting delay between requests to {args.delay} seconds")
        if args.respect_robots is not None:
            config.RESPECT_ROBOTS_TXT = args.respect_robots
            logger.info(f"Respect robots.txt: {args.respect_robots}")
        if args.user_agent is not None:
            config.USER_AGENT = args.user_agent
            logger.info(f"Using user agent: {args.user_agent}")
        # Add seed URLs if provided
        if args.seed:
            logger.info(f"Adding {len(args.seed)} seed URLs")
            seed_urls = []
            for url in args.seed:
                if not (url.startswith('http://') or url.startswith('https://')):
                    url = 'https://' + url
                seed_urls.append(url)
                logger.debug(f"Added seed URL: {url}")
            # Add the URLs to the frontier
            logger.info("Adding seed URLs to frontier...")
            added = crawler.add_seed_urls(seed_urls, Priority.VERY_HIGH)
            logger.info(f"Successfully added {added} seed URLs to the frontier")
        # Apply domain filter if provided
        if args.domain_filter:
            # Allow both domain.com and http://domain.com formats
            domain = args.domain_filter
            if domain.startswith('http://') or domain.startswith('https://'):
                domain = urlparse(domain).netloc
            config.ALLOWED_DOMAINS = [domain]
            logger.info(f"Filtering to domain: {domain}")
        # Start the crawler
        num_workers = args.workers if args.workers is not None else 4
        logger.info(f"Starting crawler with {num_workers} workers...")
        crawler.start(num_workers=num_workers, async_mode=args.async_mode)
        # If we get here, the crawler has finished or was stopped
        logger.info("Crawler finished")
    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
        if crawler and crawler.running:
            logger.info("Stopping crawler...")
            crawler.stop()
    except Exception as e:
        logger.error(f"Error running crawler: {e}", exc_info=True)
        if crawler and crawler.running:
            try:
                logger.info("Attempting to stop crawler after error...")
                crawler.stop()
            except Exception:
                pass


if __name__ == "__main__":
    run_crawler()