Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -100,15 +100,34 @@ class URLProcessor:
|
|
100 |
@sleep_and_retry
|
101 |
@limits(calls=20, period=60) # Refined rate limiting
|
102 |
def fetch_content(self, url: str) -> Optional[Dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
cached = self.content_cache.get(url)
|
104 |
if cached:
|
105 |
return cached
|
106 |
|
107 |
try:
|
108 |
response = self.session.get(url, timeout=self.timeout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
soup = BeautifulSoup(response.text, 'html.parser')
|
110 |
-
|
111 |
-
# Extract structured elements
|
112 |
title = soup.title.text.strip() if soup.title else ''
|
113 |
meta_desc = soup.find('meta', {'name': 'description'})
|
114 |
description = meta_desc['content'].strip() if meta_desc else ''
|
|
|
100 |
@sleep_and_retry
|
101 |
@limits(calls=20, period=60) # Refined rate limiting
|
102 |
def fetch_content(self, url: str) -> Optional[Dict]:
|
103 |
+
"""Fetch and structure content from URL, handling Google Drive and Calendar links."""
|
104 |
+
if 'drive.google.com' in url:
|
105 |
+
# Convert Google Drive URL to direct download link
|
106 |
+
file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
|
107 |
+
if file_id:
|
108 |
+
url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
|
109 |
+
else:
|
110 |
+
logger.error(f"Invalid Google Drive URL: {url}")
|
111 |
+
return None
|
112 |
+
|
113 |
cached = self.content_cache.get(url)
|
114 |
if cached:
|
115 |
return cached
|
116 |
|
117 |
try:
|
118 |
response = self.session.get(url, timeout=self.timeout)
|
119 |
+
response.raise_for_status()
|
120 |
+
|
121 |
+
# Handle ICS files (Google Calendar)
|
122 |
+
if 'text/calendar' in response.headers.get('Content-Type', ''):
|
123 |
+
return {
|
124 |
+
'content': response.text,
|
125 |
+
'content_type': 'text/calendar',
|
126 |
+
'timestamp': datetime.now().isoformat()
|
127 |
+
}
|
128 |
+
|
129 |
+
# Handle HTML content
|
130 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
|
131 |
title = soup.title.text.strip() if soup.title else ''
|
132 |
meta_desc = soup.find('meta', {'name': 'description'})
|
133 |
description = meta_desc['content'].strip() if meta_desc else ''
|