"""
Configuration settings for the web crawler
"""

import os
from typing import Dict, List, Any, Optional

# General settings
MAX_WORKERS = 100  # Maximum number of worker threads/processes
MAX_DEPTH = 10  # Maximum depth to crawl from seed URLs
CRAWL_TIMEOUT = 30  # Timeout for HTTP requests in seconds
USER_AGENT = "Mozilla/5.0 WebCrawler/1.0 (+https://example.org/bot)"

# Politeness settings
ROBOTSTXT_OBEY = True  # Whether to obey robots.txt rules
DOWNLOAD_DELAY = 1.0  # Delay between requests to the same domain (seconds)
MAX_REQUESTS_PER_DOMAIN = 10  # Maximum concurrent requests per domain
RESPECT_CRAWL_DELAY = True  # Respect Crawl-delay in robots.txt
RETRY_TIMES = 3  # Number of retries for failed requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]  # HTTP codes to retry
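
# A minimal sketch of how the retry settings above might drive a fetch loop,
# assuming the third-party `requests` library (an assumption; the crawler may
# use a different HTTP client). `fetch_with_retries` is a hypothetical helper,
# and DOWNLOAD_DELAY is reused here as a convenient backoff base.
import time
import requests

def fetch_with_retries(url: str):
    """GET `url`, retrying on the configured HTTP codes and network errors."""
    response = None
    for attempt in range(RETRY_TIMES + 1):
        try:
            response = requests.get(
                url,
                timeout=CRAWL_TIMEOUT,
                headers={"User-Agent": USER_AGENT},
            )
        except requests.RequestException:
            response = None
        if response is not None and response.status_code not in RETRY_HTTP_CODES:
            return response
        if attempt < RETRY_TIMES:
            time.sleep(DOWNLOAD_DELAY * (attempt + 1))  # simple linear backoff
    return response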

# URL settings
ALLOWED_DOMAINS: Optional[List[str]] = None  # Domains to restrict crawling to (None = all domains)
EXCLUDED_DOMAINS: List[str] = []  # Domains to exclude from crawling
ALLOWED_SCHEMES = ["http", "https"]  # URL schemes to allow
URL_FILTERS = [
    # Only filter out binary and media files
    r".*\.(jpg|jpeg|gif|png|ico|mp3|mp4|wav|avi|mov|mpeg|pdf|zip|rar|gz|exe|dmg|pkg|iso|bin)$",
]  # Regex patterns to filter out URLs
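
# A minimal sketch of combining the URL settings above into a single check.
# `is_url_allowed` is a hypothetical helper, not an existing API; the real
# filtering code may differ. Patterns and domain lists are read at call time,
# so the environment overrides applied at the bottom of this module apply.
import re
from urllib.parse import urlparse

def is_url_allowed(url: str) -> bool:
    """Return True if `url` passes the scheme, domain, and filter settings."""
    parsed = urlparse(url)
    if parsed.scheme not in ALLOWED_SCHEMES:
        return False
    host = parsed.hostname or ""
    if host in EXCLUDED_DOMAINS:
        return False
    if ALLOWED_DOMAINS is not None and host not in ALLOWED_DOMAINS:
        return False
    # IGNORECASE so uppercase extensions (e.g. ".JPG") are also filtered
    return not any(re.match(p, url, re.IGNORECASE) for p in URL_FILTERS)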

# Storage settings
MONGODB_URI = "mongodb://localhost:27017/"
MONGODB_DB = "webcrawler"
REDIS_URI = "redis://localhost:6379/0"
STORAGE_PATH = os.path.join(os.path.dirname(__file__), "storage")
HTML_STORAGE_PATH = os.path.join(STORAGE_PATH, "html")
LOG_PATH = os.path.join(STORAGE_PATH, "logs")
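
# Optional convenience helper (an addition, not part of the original file's
# interface): create the storage directories before the first write. Call it
# once at startup, after the environment overrides below have been applied.
def ensure_storage_dirs() -> None:
    for path in (STORAGE_PATH, HTML_STORAGE_PATH, LOG_PATH):
        os.makedirs(path, exist_ok=True)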

# Frontier settings
FRONTIER_QUEUE_SIZE = 100000  # Maximum number of URLs in the frontier queue
PRIORITY_QUEUE_NUM = 5  # Number of priority queues
HOST_QUEUE_NUM = 1000  # Number of host queues for politeness
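
# In Mercator-style crawler frontiers, PRIORITY_QUEUE_NUM front queues order
# URLs by priority and HOST_QUEUE_NUM back queues enforce per-host politeness.
# `host_queue_index` is a hypothetical helper illustrating how hosts might be
# sharded across the politeness queues, not an existing API.
import hashlib

def host_queue_index(host: str) -> int:
    """Map a hostname to one of the HOST_QUEUE_NUM politeness queues."""
    digest = hashlib.md5(host.encode("utf-8")).digest()
    return int.from_bytes(digest[:4], "big") % HOST_QUEUE_NUM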

# Content settings
MAX_CONTENT_SIZE = 10 * 1024 * 1024  # Maximum size of HTML content to download (10MB)
ALLOWED_CONTENT_TYPES = [
    "text/html",
    "application/xhtml+xml",
    "text/plain",  # Some servers might serve HTML as text/plain
    "application/html",
    "*/*",  # Accept any content type
]  # Allowed content types

# DNS settings
DNS_CACHE_SIZE = 10000  # Maximum number of entries in DNS cache
DNS_CACHE_TIMEOUT = 3600  # DNS cache timeout in seconds
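
# Illustrative only: a tiny LRU+TTL cache showing how DNS_CACHE_SIZE and
# DNS_CACHE_TIMEOUT interact. The crawler presumably has its own resolver;
# `resolve` and `_dns_cache` here are assumptions made for the sketch.
import socket
import time
from collections import OrderedDict

_dns_cache: OrderedDict = OrderedDict()  # host -> (inserted_at, address)

def resolve(host: str) -> str:
    now = time.monotonic()
    entry = _dns_cache.get(host)
    if entry is not None and now - entry[0] < DNS_CACHE_TIMEOUT:
        _dns_cache.move_to_end(host)  # keep recently used entries alive
        return entry[1]
    address = socket.gethostbyname(host)  # blocking lookup; cache the result
    _dns_cache[host] = (now, address)
    _dns_cache.move_to_end(host)
    while len(_dns_cache) > DNS_CACHE_SIZE:
        _dns_cache.popitem(last=False)  # evict the least recently used host
    return address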

# Logging settings
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
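
# A minimal sketch of wiring LOG_LEVEL and LOG_FORMAT into the standard
# library logger. `configure_logging` is a hypothetical helper; call it once
# at startup, after the environment overrides below have been applied. A file
# handler writing under LOG_PATH could be added in the same way.
import logging

def configure_logging() -> None:
    logging.basicConfig(
        level=getattr(logging, LOG_LEVEL, logging.INFO),
        format=LOG_FORMAT,
    )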

# Seed URLs
SEED_URLS = [
    "https://en.wikipedia.org/",
    "https://www.nytimes.com/",
    "https://www.bbc.com/",
    "https://www.github.com/",
    "https://www.reddit.com/",
]

# Override settings with environment variables
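# Example (shell) — the WEBCRAWLER_<SETTING> naming convention is implemented
# below; the `python -m crawler` entry point is illustrative:
#
#   WEBCRAWLER_MAX_WORKERS=50 \
#   WEBCRAWLER_DOWNLOAD_DELAY=2.5 \
#   WEBCRAWLER_EXCLUDED_DOMAINS=ads.example.com,tracker.example.net \
#   python -m crawler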
def get_env_settings() -> Dict[str, Any]:
    """Get settings from environment variables"""
    env_settings = {}
    
    for key, value in globals().items():
        if key.isupper():  # Only consider uppercase variables as settings
            env_value = os.environ.get(f"WEBCRAWLER_{key}")
            if env_value is not None:
                # Convert to the appropriate type based on the default value
                if isinstance(value, bool):
                    # Checked before int: bool is a subclass of int in Python
                    env_settings[key] = env_value.lower() in ("true", "1", "yes")
                elif isinstance(value, int):
                    env_settings[key] = int(env_value)
                elif isinstance(value, float):
                    env_settings[key] = float(env_value)
                elif isinstance(value, list):
                    # Assume comma-separated values; preserve the element type
                    # of the default (e.g. RETRY_HTTP_CODES holds ints)
                    items = [item.strip() for item in env_value.split(",")] if env_value else []
                    if value and isinstance(value[0], int):
                        items = [int(item) for item in items]
                    env_settings[key] = items
                elif key == "ALLOWED_DOMAINS":
                    # The default is None, so the type cannot be inferred from
                    # the value; it is documented above as an optional list
                    env_settings[key] = [item.strip() for item in env_value.split(",")]
                else:
                    env_settings[key] = env_value
    
    return env_settings

# Update settings with environment variables
globals().update(get_env_settings())
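
# Debugging aid (an addition, not part of the original interface): running
# this module directly prints the effective settings after the environment
# overrides above have been applied.
if __name__ == "__main__":
    for _name in sorted(name for name in globals() if name.isupper()):
        print(f"{_name} = {globals()[_name]!r}")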