acecalisto3 committed on
Commit
eed2b1e
·
verified ·
1 Parent(s): 83a6fef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -2
app.py CHANGED
@@ -100,15 +100,34 @@ class URLProcessor:
100
  @sleep_and_retry
101
  @limits(calls=20, period=60) # Refined rate limiting
102
  def fetch_content(self, url: str) -> Optional[Dict]:
 
 
 
 
 
 
 
 
 
 
103
  cached = self.content_cache.get(url)
104
  if cached:
105
  return cached
106
 
107
  try:
108
  response = self.session.get(url, timeout=self.timeout)
 
 
 
 
 
 
 
 
 
 
 
109
  soup = BeautifulSoup(response.text, 'html.parser')
110
-
111
- # Extract structured elements
112
  title = soup.title.text.strip() if soup.title else ''
113
  meta_desc = soup.find('meta', {'name': 'description'})
114
  description = meta_desc['content'].strip() if meta_desc else ''
 
100
  @sleep_and_retry
101
  @limits(calls=20, period=60) # Refined rate limiting
102
  def fetch_content(self, url: str) -> Optional[Dict]:
103
+ """Fetch and structure content from URL, handling Google Drive and Calendar links."""
104
+ if 'drive.google.com' in url:
105
+ # Convert Google Drive URL to direct download link
106
+ file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
107
+ if file_id:
108
+ url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
109
+ else:
110
+ logger.error(f"Invalid Google Drive URL: {url}")
111
+ return None
112
+
113
  cached = self.content_cache.get(url)
114
  if cached:
115
  return cached
116
 
117
  try:
118
  response = self.session.get(url, timeout=self.timeout)
119
+ response.raise_for_status()
120
+
121
+ # Handle ICS files (Google Calendar)
122
+ if 'text/calendar' in response.headers.get('Content-Type', ''):
123
+ return {
124
+ 'content': response.text,
125
+ 'content_type': 'text/calendar',
126
+ 'timestamp': datetime.now().isoformat()
127
+ }
128
+
129
+ # Handle HTML content
130
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
131
  title = soup.title.text.strip() if soup.title else ''
132
  meta_desc = soup.find('meta', {'name': 'description'})
133
  description = meta_desc['content'].strip() if meta_desc else ''