import time
import urllib.robotparser
from urllib.parse import urlparse


class RobotsHandler:
    """Checks robots.txt rules, caching one parser per domain."""

    def __init__(self):
        self.cache = {}            # domain -> (RobotFileParser, fetch timestamp)
        self.cache_timeout = 3600  # re-fetch robots.txt after 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        parsed = urlparse(url)
        domain = parsed.netloc
        current_time = time.time()

        # Serve from the cache while the entry is still fresh
        if domain in self.cache:
            parser, timestamp = self.cache[domain]
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        # Cache miss or stale entry: fetch robots.txt with a fresh parser
        # (a per-domain parser avoids a single shared instance being overwritten)
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"{parsed.scheme}://{domain}/robots.txt")
        try:
            parser.read()
            self.cache[domain] = (parser, current_time)
            return parser.can_fetch(user_agent, url)
        except Exception:
            return True  # If robots.txt is unavailable, assume crawling is allowed