import time
import urllib.robotparser
from urllib.parse import urlparse


class RobotsHandler:
    """Checks robots.txt rules, caching one parser per domain."""

    def __init__(self):
        self.cache = {}            # domain -> (RobotFileParser, fetch timestamp)
        self.cache_timeout = 3600  # re-fetch robots.txt after 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        parsed = urlparse(url)
        domain = parsed.netloc
        current_time = time.time()

        # Serve from the cache while the entry is still fresh
        if domain in self.cache:
            parser, timestamp = self.cache[domain]
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        # Cache miss or stale entry: fetch robots.txt with a fresh parser
        # (a per-domain parser avoids a single shared instance being overwritten)
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"{parsed.scheme}://{domain}/robots.txt")
        try:
            parser.read()
            self.cache[domain] = (parser, current_time)
            return parser.can_fetch(user_agent, url)
        except Exception:
            return True  # If robots.txt is unavailable, assume crawling is allowed