import time
import urllib.robotparser
from urllib.parse import urlparse


class RobotsHandler:
    """Checks robots.txt permissions, caching one parser per domain."""

    def __init__(self):
        self.cache = {}  # domain -> (RobotFileParser, fetch timestamp)
        self.cache_timeout = 3600  # 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        domain = urlparse(url).netloc
        current_time = time.time()

        # Check cache first
        if domain in self.cache:
            parser, timestamp = self.cache[domain]
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        # Fetch a fresh copy; creating a new parser per domain keeps
        # cached entries from being overwritten by later lookups
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"https://{domain}/robots.txt")
        try:
            parser.read()
        except Exception:
            return True  # If robots.txt is unavailable, assume allowed
        self.cache[domain] = (parser, current_time)
        return parser.can_fetch(user_agent, url)
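
A quick usage sketch follows; the crawl URL and user-agent string here are hypothetical placeholders, not part of the handler itself.

if __name__ == "__main__":
    handler = RobotsHandler()
    # Hypothetical target URL and user agent, for illustration only
    page = "https://example.com/some/page"
    if handler.can_fetch(page, "MyCrawler/1.0"):
        print(f"Allowed to fetch {page}")
    else:
        print(f"Blocked by robots.txt: {page}")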