""" Configuration settings for the web crawler """ import os from typing import Dict, List, Any, Optional # General settings MAX_WORKERS = 100 # Maximum number of worker threads/processes MAX_DEPTH = 10 # Maximum depth to crawl from seed URLs CRAWL_TIMEOUT = 30 # Timeout for HTTP requests in seconds USER_AGENT = "Mozilla/5.0 WebCrawler/1.0 (+https://example.org/bot)" # Politeness settings ROBOTSTXT_OBEY = True # Whether to obey robots.txt rules DOWNLOAD_DELAY = 1.0 # Delay between requests to the same domain (seconds) MAX_REQUESTS_PER_DOMAIN = 10 # Maximum concurrent requests per domain RESPECT_CRAWL_DELAY = True # Respect Crawl-delay in robots.txt RETRY_TIMES = 3 # Number of retries for failed requests RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429] # HTTP codes to retry # URL settings ALLOWED_DOMAINS: Optional[List[str]] = None # Domains to restrict crawling to (None = all domains) EXCLUDED_DOMAINS: List[str] = [] # Domains to exclude from crawling ALLOWED_SCHEMES = ["http", "https"] # URL schemes to allow URL_FILTERS = [ # Only filter out binary and media files r".*\.(jpg|jpeg|gif|png|ico|mp3|mp4|wav|avi|mov|mpeg|pdf|zip|rar|gz|exe|dmg|pkg|iso|bin)$", ] # Regex patterns to filter out URLs # Storage settings MONGODB_URI = "mongodb://localhost:27017/" MONGODB_DB = "webcrawler" REDIS_URI = "redis://localhost:6379/0" STORAGE_PATH = os.path.join(os.path.dirname(__file__), "storage") HTML_STORAGE_PATH = os.path.join(STORAGE_PATH, "html") LOG_PATH = os.path.join(STORAGE_PATH, "logs") # Frontier settings FRONTIER_QUEUE_SIZE = 100000 # Maximum number of URLs in the frontier queue PRIORITY_QUEUE_NUM = 5 # Number of priority queues HOST_QUEUE_NUM = 1000 # Number of host queues for politeness # Content settings MAX_CONTENT_SIZE = 10 * 1024 * 1024 # Maximum size of HTML content to download (10MB) ALLOWED_CONTENT_TYPES = [ "text/html", "application/xhtml+xml", "text/plain", # Some servers might serve HTML as text/plain "application/html", "*/*", # Accept any content type ] # Allowed content types # DNS settings DNS_CACHE_SIZE = 10000 # Maximum number of entries in DNS cache DNS_CACHE_TIMEOUT = 3600 # DNS cache timeout in seconds # Logging settings LOG_LEVEL = "INFO" LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s" # Seed URLs SEED_URLS = [ "https://en.wikipedia.org/", "https://www.nytimes.com/", "https://www.bbc.com/", "https://www.github.com/", "https://www.reddit.com/", ] # Override settings with environment variables def get_env_settings() -> Dict[str, Any]: """Get settings from environment variables""" env_settings = {} for key, value in globals().items(): if key.isupper(): # Only consider uppercase variables as settings env_value = os.environ.get(f"WEBCRAWLER_{key}") if env_value is not None: # Convert to appropriate type based on default value if isinstance(value, bool): env_settings[key] = env_value.lower() in ("true", "1", "yes") elif isinstance(value, int): env_settings[key] = int(env_value) elif isinstance(value, float): env_settings[key] = float(env_value) elif isinstance(value, list): # Assume comma-separated values env_settings[key] = [item.strip() for item in env_value.split(",")] else: env_settings[key] = env_value return env_settings # Update settings with environment variables globals().update(get_env_settings())