# AI_SEO_Crawler/config.py
"""
Configuration settings for the web crawler
"""
import os
from typing import Dict, List, Any, Optional
# General settings
MAX_WORKERS = 100 # Maximum number of worker threads/processes
MAX_DEPTH = 10 # Maximum depth to crawl from seed URLs
CRAWL_TIMEOUT = 30 # Timeout for HTTP requests in seconds
USER_AGENT = "Mozilla/5.0 WebCrawler/1.0 (+https://example.org/bot)"
# Politeness settings
ROBOTSTXT_OBEY = True # Whether to obey robots.txt rules
DOWNLOAD_DELAY = 1.0 # Delay between requests to the same domain (seconds)
MAX_REQUESTS_PER_DOMAIN = 10 # Maximum concurrent requests per domain
RESPECT_CRAWL_DELAY = True # Respect Crawl-delay in robots.txt
RETRY_TIMES = 3 # Number of retries for failed requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429] # HTTP codes to retry
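# Illustrative sketch (an assumption, not the project's actual fetch path):
# how the politeness and retry settings above might drive a single HTTP fetch.
# The function name and the use of urllib here are for demonstration only.
import time
import urllib.error
import urllib.request

def _fetch_with_retries(url: str) -> Optional[bytes]:
    """Fetch a URL, retrying on the status codes listed in RETRY_HTTP_CODES."""
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    for attempt in range(RETRY_TIMES + 1):
        try:
            with urllib.request.urlopen(request, timeout=CRAWL_TIMEOUT) as response:
                return response.read()
        except urllib.error.HTTPError as exc:
            # Give up immediately on non-retryable codes or once retries are exhausted.
            if exc.code not in RETRY_HTTP_CODES or attempt == RETRY_TIMES:
                raise
        except urllib.error.URLError:
            if attempt == RETRY_TIMES:
                raise
        time.sleep(DOWNLOAD_DELAY)  # Back off before hitting the same domain again.
    return None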
# URL settings
ALLOWED_DOMAINS: Optional[List[str]] = None # Domains to restrict crawling to (None = all domains)
EXCLUDED_DOMAINS: List[str] = [] # Domains to exclude from crawling
ALLOWED_SCHEMES = ["http", "https"] # URL schemes to allow
URL_FILTERS = [
# Only filter out binary and media files
r".*\.(jpg|jpeg|gif|png|ico|mp3|mp4|wav|avi|mov|mpeg|pdf|zip|rar|gz|exe|dmg|pkg|iso|bin)$",
] # Regex patterns to filter out URLs
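# Illustrative sketch (an assumption; the project's real filtering likely lives
# in its frontier or spider code): how the URL settings above can be combined
# into a single allow/deny check for a candidate URL.
import re
from urllib.parse import urlparse

def _url_is_crawlable(url: str) -> bool:
    """Apply the scheme, domain, and pattern filters from this config to a URL."""
    parsed = urlparse(url)
    if parsed.scheme not in ALLOWED_SCHEMES:
        return False
    host = parsed.netloc.lower()
    if ALLOWED_DOMAINS is not None and not any(host.endswith(d) for d in ALLOWED_DOMAINS):
        return False
    if any(host.endswith(d) for d in EXCLUDED_DOMAINS):
        return False
    return not any(re.match(pattern, url) for pattern in URL_FILTERS)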
# Storage settings
MONGODB_URI = "mongodb://localhost:27017/"
MONGODB_DB = "webcrawler"
REDIS_URI = "redis://localhost:6379/0"
STORAGE_PATH = os.path.join(os.path.dirname(__file__), "storage")
HTML_STORAGE_PATH = os.path.join(STORAGE_PATH, "html")
LOG_PATH = os.path.join(STORAGE_PATH, "logs")
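# Illustrative helper (an assumption, not taken from the project): make sure the
# storage directories above exist before the crawler writes HTML snapshots or logs.
def _ensure_storage_dirs() -> None:
    for path in (STORAGE_PATH, HTML_STORAGE_PATH, LOG_PATH):
        os.makedirs(path, exist_ok=True)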
# Frontier settings
FRONTIER_QUEUE_SIZE = 100000 # Maximum number of URLs in the frontier queue
PRIORITY_QUEUE_NUM = 5 # Number of priority queues
HOST_QUEUE_NUM = 1000 # Number of host queues for politeness
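# Illustrative sketch (an assumption about the frontier design, shown only to
# clarify the setting): a common way to map a hostname to one of the
# HOST_QUEUE_NUM politeness queues is by hashing the host.
import hashlib

def _host_queue_index(host: str) -> int:
    """Deterministically assign a host to one of the politeness queues."""
    digest = hashlib.md5(host.encode("utf-8")).hexdigest()
    return int(digest, 16) % HOST_QUEUE_NUM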
# Content settings
MAX_CONTENT_SIZE = 10 * 1024 * 1024 # Maximum size of HTML content to download (10MB)
ALLOWED_CONTENT_TYPES = [
"text/html",
"application/xhtml+xml",
"text/plain", # Some servers might serve HTML as text/plain
"application/html",
"*/*", # Accept any content type
] # Allowed content types
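# Illustrative sketch (function name and header handling are assumptions): how a
# fetcher might combine MAX_CONTENT_SIZE and ALLOWED_CONTENT_TYPES to decide
# whether a response body is worth downloading and parsing.
def _response_is_downloadable(content_type: str, content_length: Optional[int]) -> bool:
    """Return True if the response looks like parseable HTML of acceptable size."""
    if content_length is not None and content_length > MAX_CONTENT_SIZE:
        return False
    # Content-Type headers often look like "text/html; charset=utf-8".
    bare_type = content_type.split(";")[0].strip().lower()
    return "*/*" in ALLOWED_CONTENT_TYPES or bare_type in ALLOWED_CONTENT_TYPES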
# DNS settings
DNS_CACHE_SIZE = 10000 # Maximum number of entries in DNS cache
DNS_CACHE_TIMEOUT = 3600 # DNS cache timeout in seconds
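# Illustrative sketch (an assumption; the crawler may use a different resolver):
# a minimal DNS cache bounded by DNS_CACHE_SIZE entries and DNS_CACHE_TIMEOUT
# seconds, built on the standard library resolver.
import socket
import time

_dns_cache: Dict[str, Any] = {}  # hostname -> (resolved_ip, timestamp)

def _resolve_cached(host: str) -> str:
    """Resolve a hostname, caching the result for DNS_CACHE_TIMEOUT seconds."""
    now = time.monotonic()
    cached = _dns_cache.get(host)
    if cached is not None and now - cached[1] < DNS_CACHE_TIMEOUT:
        return cached[0]
    ip_address = socket.gethostbyname(host)
    if len(_dns_cache) >= DNS_CACHE_SIZE:
        _dns_cache.clear()  # Crude eviction; a real cache would use LRU.
    _dns_cache[host] = (ip_address, now)
    return ip_address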
# Logging settings
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
# Seed URLs
SEED_URLS = [
"https://en.wikipedia.org/",
"https://www.nytimes.com/",
"https://www.bbc.com/",
"https://www.github.com/",
"https://www.reddit.com/",
]
# Override settings with environment variables
def get_env_settings() -> Dict[str, Any]:
"""Get settings from environment variables"""
env_settings = {}
for key, value in globals().items():
if key.isupper(): # Only consider uppercase variables as settings
env_value = os.environ.get(f"WEBCRAWLER_{key}")
if env_value is not None:
# Convert to appropriate type based on default value
if isinstance(value, bool):
env_settings[key] = env_value.lower() in ("true", "1", "yes")
elif isinstance(value, int):
env_settings[key] = int(env_value)
elif isinstance(value, float):
env_settings[key] = float(env_value)
elif isinstance(value, list):
# Assume comma-separated values
env_settings[key] = [item.strip() for item in env_value.split(",")]
else:
env_settings[key] = env_value
return env_settings
# Update settings with environment variables
globals().update(get_env_settings())
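# Usage example (illustrative): any uppercase setting above can be overridden by
# exporting WEBCRAWLER_<SETTING_NAME> before the crawler starts, e.g.
#
#   WEBCRAWLER_MAX_DEPTH=5 WEBCRAWLER_DOWNLOAD_DELAY=2.5 python your_entrypoint.py
#   WEBCRAWLER_SEED_URLS="https://example.com/,https://example.org/"
#
# Booleans accept "true"/"1"/"yes" (case-insensitive); list settings are parsed as
# comma-separated values. "your_entrypoint.py" is a placeholder for whatever script
# imports this module.
if __name__ == "__main__":
    # When run directly, print the effective settings for quick inspection.
    for _key, _value in sorted(globals().items()):
        if _key.isupper():
            print(f"{_key} = {_value!r}")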