Create robots_handler.py
robots_handler.py ADDED (+28 -0)
@@ -0,0 +1,28 @@
import urllib.robotparser
import time
from urllib.parse import urlparse

class RobotsHandler:
    def __init__(self):
        # Per-domain cache: domain -> (RobotFileParser, timestamp of last fetch)
        self.cache = {}
        self.cache_timeout = 3600  # 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        domain = urlparse(url).netloc
        current_time = time.time()

        # Check cache first
        if domain in self.cache:
            parser, timestamp = self.cache[domain]
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        # Cache miss or expired entry: fetch robots.txt with a fresh parser,
        # so cached rules for other domains are not overwritten
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"https://{domain}/robots.txt")
        try:
            parser.read()
            self.cache[domain] = (parser, current_time)
            return parser.can_fetch(user_agent, url)
        except Exception:
            return True  # If robots.txt is unavailable, assume allowed
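
A minimal usage sketch, assuming the module is importable as robots_handler; the user-agent string and URL below are placeholders for illustration.

    from robots_handler import RobotsHandler

    handler = RobotsHandler()
    url = "https://example.com/some/page"
    # "MyCrawler/1.0" is a hypothetical user-agent string
    if handler.can_fetch(url, "MyCrawler/1.0"):
        print(f"Allowed to crawl {url}")
    else:
        print(f"Disallowed by robots.txt: {url}")

Repeated calls for URLs on the same domain within the one-hour cache window reuse the cached parser instead of re-downloading robots.txt.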