Spaces:
Running
Running
File size: 945 Bytes
23e5664 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
import urllib.robotparser
import time
from urllib.parse import urlparse
class RobotsHandler:
    """Answer "may this URL be fetched?" using per-domain cached robots.txt rules.

    Each domain (the URL's network location) gets its own
    ``RobotFileParser``; parsed rules are reused until ``cache_timeout``
    seconds have elapsed, after which robots.txt is downloaded again.
    """

    def __init__(self):
        # Kept for backward compatibility with code that reads ``handler.parser``;
        # after a successful download it refers to the most recently loaded parser.
        self.parser = urllib.robotparser.RobotFileParser()
        # Maps domain -> (parser holding that domain's rules, time.time() at download).
        self.cache = {}
        self.cache_timeout = 3600  # 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        """Return whether ``user_agent`` is allowed to fetch ``url``.

        Args:
            url: URL to check; its network location selects which
                robots.txt applies.
            user_agent: User-agent string matched against the robots.txt rules.

        Returns:
            The verdict of the domain's robots.txt rules, or ``True`` when
            the URL has no host or robots.txt could not be retrieved.
        """
        domain = urlparse(url).netloc
        if not domain:
            # No host means there is no robots.txt to consult; skip the
            # doomed request and report the same "allowed" answer.
            return True

        current_time = time.time()

        # Serve from the cache while the entry is still fresh.
        cached = self.cache.get(domain)
        if cached is not None:
            parser, timestamp = cached
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        try:
            parser = self._load_parser(f"https://{domain}/robots.txt")
        except Exception:
            # Best effort: if robots.txt is unavailable, assume allowed.
            # Nothing is cached, so the next call tries the download again.
            return True

        # Every domain stores its *own* parser object. Sharing a single
        # parser across cache entries would make all domains answer with
        # the rules of whichever robots.txt was downloaded last.
        self.parser = parser
        self.cache[domain] = (parser, current_time)
        return parser.can_fetch(user_agent, url)

    def _load_parser(self, robots_url):
        """Download ``robots_url`` into a fresh parser and return that parser."""
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        return parser