#!/usr/bin/env python3
"""
Main script to run the web crawler with command line arguments
"""
import os
import sys
import time
import logging
import argparse
import signal
from urllib.parse import urlparse
# Add the current directory to path if needed
script_dir = os.path.dirname(os.path.abspath(__file__))
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)
# Configure logging - do this first
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'crawler.log'))
    ]
)
logger = logging.getLogger("run_crawler")
# Now import the crawler components
logger.info("Importing crawler modules...")
try:
    from crawler import Crawler
    from models import Priority
    logger.info("Successfully imported crawler modules")
except Exception as e:
    logger.error(f"Error importing crawler modules: {e}", exc_info=True)
    sys.exit(1)

def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Run the web crawler with custom settings')
    parser.add_argument('--seed', nargs='+', metavar='URL',
                        help='One or more seed URLs to start crawling')
    parser.add_argument('--depth', type=int, default=None,
                        help='Maximum crawl depth')
    parser.add_argument('--workers', type=int, default=None,
                        help='Number of worker threads')
    parser.add_argument('--delay', type=float, default=None,
                        help='Delay between requests to the same domain (in seconds)')
    parser.add_argument('--respect-robots', dest='respect_robots', action='store_true',
                        help='Respect robots.txt rules')
    parser.add_argument('--ignore-robots', dest='respect_robots', action='store_false',
                        help='Ignore robots.txt rules')
    # Leave respect_robots as None when neither flag is given so the
    # config default is preserved (see the check in run_crawler()).
    parser.set_defaults(respect_robots=None)
    parser.add_argument('--user-agent', type=str, default=None,
                        help='User agent to use for requests')
    parser.add_argument('--async', dest='async_mode', action='store_true',
                        help='Use async mode for requests')
    parser.add_argument('--domain-filter', type=str, default=None,
                        help='Only crawl URLs that match this domain')
    parser.add_argument('--reset-db', action='store_true',
                        help='Reset MongoDB and flush Redis data before starting')
    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose logging')
    args = parser.parse_args()
    # Set log level based on verbose flag
    if args.verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")
    return args

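# Example (hypothetical values): `python run_crawler.py --seed example.com --depth 2 --verbose`
# yields args.seed == ['example.com'], args.depth == 2 and args.verbose == True;
# options that were not passed (--workers, --delay, --user-agent,
# --respect-robots/--ignore-robots, --domain-filter) remain None.
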
def reset_databases():
    """Reset MongoDB and flush Redis data"""
    success = True
    # Reset MongoDB
    try:
        logger.info("Starting MongoDB cleanup...")
        from mongo_cleanup import cleanup_mongodb
        mongo_success = cleanup_mongodb()
        if not mongo_success:
            logger.warning("MongoDB cleanup may not have been completely successful")
            success = False
        else:
            logger.info("MongoDB cleanup completed successfully")
    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}", exc_info=True)
        success = False
    # Flush Redis
    try:
        logger.info("Starting Redis flush...")
        import redis
        logger.debug("Connecting to Redis to flush data...")
        # Set a timeout for Redis connection
        r = redis.Redis(host='localhost', port=6379, db=0, socket_timeout=5)
        # Check if Redis is available
        try:
            logger.debug("Testing Redis connection...")
            ping_result = r.ping()
            logger.debug(f"Redis ping result: {ping_result}")
            # If connection works, flush all data
            logger.info("Flushing all Redis data...")
            result = r.flushall()
            logger.info(f"Redis flush result: {result}")
        except redis.ConnectionError as e:
            logger.error(f"Redis connection error: {e}")
            success = False
    except Exception as e:
        logger.error(f"Error flushing Redis: {e}", exc_info=True)
        success = False
    return success

def setup_signal_handlers(crawler_instance):
    """Setup signal handlers for graceful shutdown"""
    def signal_handler(sig, frame):
        logger.info(f"Received signal {sig}, shutting down gracefully...")
        if crawler_instance and crawler_instance.running:
            logger.info("Stopping crawler...")
            crawler_instance.stop()
        sys.exit(0)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

def run_crawler():
    """Run the crawler with command-line arguments"""
    args = parse_arguments()
    crawler = None
    try:
        logger.info("Starting the web crawler...")
        # Reset databases if requested
        if args.reset_db:
            logger.info("Resetting MongoDB and flushing Redis data...")
            if not reset_databases():
                logger.warning("Database reset was not completely successful")
        # Create crawler instance
        logger.info("Creating crawler instance...")
        crawler = Crawler()
        logger.info("Crawler instance created successfully")
        # Setup signal handlers
        setup_signal_handlers(crawler)
        # Override settings from command line if provided
        import config
        if args.depth is not None:
            config.MAX_DEPTH = args.depth
            logger.info(f"Setting maximum depth to {args.depth}")
        if args.delay is not None:
            config.DELAY_BETWEEN_REQUESTS = args.delay
            logger.info(f"Setting delay between requests to {args.delay} seconds")
        if args.respect_robots is not None:
            config.RESPECT_ROBOTS_TXT = args.respect_robots
            logger.info(f"Respect robots.txt: {args.respect_robots}")
        if args.user_agent is not None:
            config.USER_AGENT = args.user_agent
            logger.info(f"Using user agent: {args.user_agent}")
        # Add seed URLs if provided
        if args.seed:
            logger.info(f"Adding {len(args.seed)} seed URLs")
            seed_urls = []
            for url in args.seed:
                if not (url.startswith('http://') or url.startswith('https://')):
                    url = 'https://' + url
                seed_urls.append(url)
                logger.debug(f"Added seed URL: {url}")
            # Add the URLs to the frontier
            logger.info("Adding seed URLs to frontier...")
            added = crawler.add_seed_urls(seed_urls, Priority.VERY_HIGH)
            logger.info(f"Successfully added {added} seed URLs to the frontier")
        # Apply domain filter if provided
        if args.domain_filter:
            # Allow both domain.com and http://domain.com formats
            domain = args.domain_filter
            if domain.startswith('http://') or domain.startswith('https://'):
                domain = urlparse(domain).netloc
            config.ALLOWED_DOMAINS = [domain]
            logger.info(f"Filtering to domain: {domain}")
        # Start the crawler
        num_workers = args.workers if args.workers is not None else 4
        logger.info(f"Starting crawler with {num_workers} workers...")
        crawler.start(num_workers=num_workers, async_mode=args.async_mode)
        # If we get here, the crawler has finished or was stopped
        logger.info("Crawler finished")
    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
        if crawler and crawler.running:
            logger.info("Stopping crawler...")
            crawler.stop()
    except Exception as e:
        logger.error(f"Error running crawler: {e}", exc_info=True)
        if crawler and crawler.running:
            try:
                logger.info("Attempting to stop crawler after error...")
                crawler.stop()
            except Exception:
                pass


if __name__ == "__main__":
    run_crawler()