acecalisto3 commited on
Commit
23e5664
·
verified ·
1 Parent(s): b39dfc3

Create robots_handler.py

Browse files
Files changed (1) hide show
  1. robots_handler.py +28 -0
robots_handler.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.robotparser
2
+ import time
3
+ from urllib.parse import urlparse
4
+
5
class RobotsHandler:
    """Check robots.txt permissions for URLs, caching results per domain.

    A fresh ``RobotFileParser`` is fetched for each domain and cached for
    ``cache_timeout`` seconds; if robots.txt cannot be retrieved, fetching
    is assumed to be allowed (best-effort politeness, not enforcement).
    """

    def __init__(self):
        # Kept for backward compatibility with any external code that
        # touched it; per-domain parsers now live in ``self.cache``.
        self.parser = urllib.robotparser.RobotFileParser()
        # domain -> (RobotFileParser, fetch timestamp in seconds)
        self.cache = {}
        self.cache_timeout = 3600  # 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        """Return True if *user_agent* may fetch *url* per robots.txt.

        Args:
            url: Absolute URL to check.
            user_agent: User-agent string to evaluate the rules for.

        Returns:
            True when allowed, when no rule forbids it, or when
            robots.txt is unavailable; False when explicitly disallowed.
        """
        parts = urlparse(url)
        domain = parts.netloc
        current_time = time.time()

        # Serve from cache while the entry is still fresh.
        if domain in self.cache:
            parser, timestamp = self.cache[domain]
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        # Build a NEW parser per domain. The original stored the single
        # shared ``self.parser`` in the cache, so every cached entry
        # aliased one object and each new domain's read() clobbered the
        # rules previously cached for other domains.
        parser = urllib.robotparser.RobotFileParser()
        # Preserve the URL's own scheme (http sites exist); default https.
        scheme = parts.scheme or "https"
        parser.set_url(f"{scheme}://{domain}/robots.txt")
        try:
            parser.read()
        except (OSError, ValueError):
            # URLError subclasses OSError. robots.txt unreachable or
            # unparseable: assume allowed, and do NOT cache the failure
            # so the next call retries the fetch.
            return True
        self.cache[domain] = (parser, current_time)
        return parser.can_fetch(user_agent, url)