Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -31,17 +31,23 @@ logging.basicConfig(
|
|
31 |
logger = logging.getLogger(__name__)
|
32 |
|
33 |
class URLProcessor:
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
self.cache_dir = Path(cache_dir)
|
36 |
self.cache_dir.mkdir(exist_ok=True)
|
37 |
-
|
38 |
-
# Persistent disk-based caches
|
39 |
self.url_cache = Cache(str(self.cache_dir / 'url_cache'))
|
40 |
self.content_cache = Cache(str(self.cache_dir / 'content_cache'), size_limit=2**30)
|
41 |
-
|
|
|
42 |
self.session = requests.Session()
|
43 |
self.session.headers.update({
|
44 |
-
'User
|
45 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
46 |
'Accept-Language': 'en-US,en;q=0.5',
|
47 |
'Connection': 'keep-alive'
|
|
|
31 |
logger = logging.getLogger(__name__)
|
32 |
|
33 |
class URLProcessor:
|
34 |
+
"""Class to handle URL processing with advanced features"""
|
35 |
+
|
36 |
+
def __init__(self, timeout=15, max_retries=3, cache_dir='cache'):
|
37 |
+
self.ua = UserAgent() # Initialize UserAgent first
|
38 |
+
self.timeout = timeout
|
39 |
+
self.max_retries = max_retries
|
40 |
+
|
41 |
+
# Persistent caching setup
|
42 |
self.cache_dir = Path(cache_dir)
|
43 |
self.cache_dir.mkdir(exist_ok=True)
|
|
|
|
|
44 |
self.url_cache = Cache(str(self.cache_dir / 'url_cache'))
|
45 |
self.content_cache = Cache(str(self.cache_dir / 'content_cache'), size_limit=2**30)
|
46 |
+
|
47 |
+
# Session configuration
|
48 |
self.session = requests.Session()
|
49 |
self.session.headers.update({
|
50 |
+
'User-Agent': self.ua.random, # Correct header key
|
51 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
52 |
'Accept-Language': 'en-US,en;q=0.5',
|
53 |
'Connection': 'keep-alive'
|