Spaces:
Sleeping
Sleeping
"""
DNS resolver with caching for web crawler
"""
import socket | |
import logging | |
import time | |
from typing import Dict, Optional, Tuple | |
from urllib.parse import urlparse | |
from datetime import datetime, timedelta | |
from cachetools import TTLCache | |
import threading | |
import dns | |
import dns.resolver | |
import config | |
# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings: every UPPER_CASE
    # attribute of local_config replaces the same-named attribute on config.
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    _local_config_loaded = True
except ImportError:
    # No local overrides present; keep the defaults from config.
    _local_config_loaded = False

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)

# Report the override only after basicConfig(): a root-logger call made
# before it would implicitly install a default handler, turning the
# basicConfig() call above into a no-op and discarding LOG_LEVEL/LOG_FORMAT.
if _local_config_loaded:
    logging.info("Loaded local configuration")
class DNSResolver:
    """
    DNS resolver with caching to improve performance

    DNS resolution can be a bottleneck for crawlers due to the synchronous
    nature of many DNS interfaces. This class provides a cached resolver
    to reduce the number of DNS lookups.

    Successful lookups are stored in a TTLCache keyed by hostname. Cache
    access and the hit/miss counters are guarded by a re-entrant lock; the
    DNS query itself runs outside the lock.
    """

    def __init__(self, cache_size: int = 10000, cache_ttl: int = 3600):
        """
        Initialize DNS resolver

        Args:
            cache_size: Maximum number of DNS records to cache
            cache_ttl: Time to live for cache entries in seconds
        """
        self.cache = TTLCache(maxsize=cache_size, ttl=cache_ttl)
        self.lock = threading.RLock()  # Guards cache and counters
        self.resolver = dns.resolver.Resolver()
        self.resolver.timeout = 3.0  # Timeout for DNS requests in seconds
        self.resolver.lifetime = 5.0  # Total timeout for all DNS requests

        # Stats tracking
        self.hit_count = 0
        self.miss_count = 0

    def resolve(self, url: str) -> Optional[str]:
        """
        Resolve a URL to an IP address

        Args:
            url: URL to resolve

        Returns:
            IP address, or None if the URL has no host component or
            resolution fails. Errors are logged, never raised.
        """
        try:
            # .hostname drops userinfo and port, unwraps bracketed IPv6
            # literals and lower-cases the host, so equivalent URLs share
            # a single cache entry.
            hostname = urlparse(url).hostname
            if not hostname:
                logger.debug("No hostname found in URL %s", url)
                return None

            # Check cache first. A single get() avoids the window in which
            # an entry could expire between a membership test and the read.
            with self.lock:
                cached = self.cache.get(hostname)
                if cached is not None:
                    logger.debug("DNS cache hit for %s", hostname)
                    self.hit_count += 1
                    return cached
                # Count every miss, whether or not the lookup succeeds.
                self.miss_count += 1

            # Cache miss - resolve hostname (outside the lock)
            ip_address = self._resolve_hostname(hostname)

            # Only successful lookups are cached
            if ip_address:
                with self.lock:
                    self.cache[hostname] = ip_address

            return ip_address
        except Exception as e:
            logger.warning("Error resolving DNS for %s: %s", url, e)
            return None

    def _resolve_hostname(self, hostname: str) -> Optional[str]:
        """
        Resolve hostname to IP address

        Queries an A record through the configured dnspython resolver and
        falls back to socket.gethostbyname when that fails or returns no
        answers.

        Args:
            hostname: Hostname to resolve

        Returns:
            IP address or None if resolution fails
        """
        try:
            # First try using dnspython for more control
            answers = self.resolver.resolve(hostname, 'A')
            if answers:
                # Return first IP address
                return str(answers[0])
        except dns.exception.DNSException as e:
            logger.debug(
                "dnspython DNS resolution failed for %s: %s", hostname, e
            )

        # Fall back to socket.gethostbyname
        try:
            return socket.gethostbyname(hostname)
        except socket.gaierror as e:
            logger.warning(
                "Socket DNS resolution failed for %s: %s", hostname, e
            )
            return None

    def bulk_resolve(self, urls: list) -> Dict[str, Optional[str]]:
        """
        Resolve multiple URLs to IP addresses

        URLs are resolved one after another via resolve().

        Args:
            urls: List of URLs to resolve

        Returns:
            Dictionary mapping URLs to IP addresses (None where resolution
            failed)
        """
        return {url: self.resolve(url) for url in urls}

    def clear_cache(self) -> None:
        """Clear the DNS cache (hit/miss counters are left untouched)"""
        with self.lock:
            self.cache.clear()

    def get_stats(self) -> Dict[str, float]:
        """
        Get statistics about the DNS cache

        Returns:
            Dictionary with cache statistics: 'size', 'max_size', 'ttl',
            'hit_count', 'miss_count' and 'hit_ratio' (hits divided by
            hits plus misses, or 0 when nothing has been looked up yet)
        """
        with self.lock:
            lookups = self.hit_count + self.miss_count
            return {
                'size': len(self.cache),
                'max_size': self.cache.maxsize,
                'ttl': self.cache.ttl,
                'hit_count': self.hit_count,
                'miss_count': self.miss_count,
                'hit_ratio': self.hit_count / lookups if lookups > 0 else 0,
            }