Create robots_handler.py
robots_handler.py ADDED (+28 -0)
@@ -0,0 +1,28 @@
import urllib.robotparser
import time
from urllib.parse import urlparse

class RobotsHandler:
    def __init__(self):
        # Per-domain cache: domain -> (RobotFileParser, timestamp of last fetch)
        self.cache = {}
        self.cache_timeout = 3600  # 1 hour

    def can_fetch(self, url: str, user_agent: str) -> bool:
        domain = urlparse(url).netloc
        current_time = time.time()

        # Check cache first
        if domain in self.cache:
            parser, timestamp = self.cache[domain]
            if current_time - timestamp < self.cache_timeout:
                return parser.can_fetch(user_agent, url)

        # Cache miss or expired entry: fetch robots.txt with a fresh parser,
        # so cached rules for other domains are not overwritten
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"https://{domain}/robots.txt")
        try:
            parser.read()
            self.cache[domain] = (parser, current_time)
            return parser.can_fetch(user_agent, url)
        except Exception:
            return True  # If robots.txt is unavailable, assume allowed
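
A minimal usage sketch, assuming the module is importable as robots_handler; the user-agent string and URL below are placeholders for illustration.

    from robots_handler import RobotsHandler

    handler = RobotsHandler()
    url = "https://example.com/some/page"
    # "MyCrawler/1.0" is a hypothetical user-agent string
    if handler.can_fetch(url, "MyCrawler/1.0"):
        print(f"Allowed to crawl {url}")
    else:
        print(f"Disallowed by robots.txt: {url}")

Repeated calls for URLs on the same domain within the one-hour cache window reuse the cached parser instead of re-downloading robots.txt.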