import time
import urllib.robotparser
from urllib.parse import urlparse


class RobotsHandler:
    def __init__(self):
        self.cache = {}  # domain -> (RobotFileParser, fetch timestamp)
        self.cache_timeout = 3600  # re-fetch robots.txt after 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        parsed = urlparse(url)
        domain = parsed.netloc
        current_time = time.time()

        # Check cache first; serve from it while the entry is still fresh
        if domain in self.cache:
            parser, timestamp = self.cache[domain]
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        # Cache miss or stale entry: fetch robots.txt with a fresh parser.
        # A new parser per domain is required; reusing one shared instance
        # would let each fetch overwrite the rules in every cached entry.
        scheme = parsed.scheme or "https"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"{scheme}://{domain}/robots.txt")
        try:
            parser.read()
        except OSError:
            # robots.txt unavailable (network error), assume allowed.
            # Catching OSError (which covers urllib.error.URLError) avoids
            # the bare except's habit of swallowing KeyboardInterrupt too.
            return True
        self.cache[domain] = (parser, current_time)
        return parser.can_fetch(user_agent, url)
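
# Minimal usage sketch. The target URL and user-agent string below are
# illustrative placeholders, not values from the original code.
if __name__ == "__main__":
    handler = RobotsHandler()
    allowed = handler.can_fetch("https://example.com/page", "MyCrawler/1.0")
    print(f"Allowed to fetch: {allowed}")
    # A second call for the same domain within cache_timeout is answered
    # from the cache without re-downloading robots.txt.
    handler.can_fetch("https://example.com/other", "MyCrawler/1.0")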