acecalisto3 committed on
Commit
3f75c63
·
verified ·
1 Parent(s): f25da5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -11
app.py CHANGED
@@ -139,8 +139,14 @@ class URLProcessor:
139
  # Try with a different user agent if it's a social media site
140
  if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
141
  self.session.headers.update({
142
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
 
 
143
  })
 
 
 
 
144
 
145
  response = self.session.get(url, timeout=self.timeout)
146
  response.raise_for_status()
@@ -148,8 +154,10 @@ class URLProcessor:
148
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
149
 
150
  # Save the raw HTML for debugging if needed
151
- with open(f"debug_raw_{int(time.time())}.html", "w", encoding="utf-8") as f:
 
152
  f.write(response.text)
 
153
 
154
  soup = BeautifulSoup(response.text, 'html.parser')
155
 
@@ -157,17 +165,72 @@ class URLProcessor:
157
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
158
  element.decompose()
159
 
160
- # Remove login walls and overlays common on social media sites
161
- for element in soup.select('.login-wall, .signup-wall, .overlay, .modal, [role="dialog"], [aria-modal="true"]'):
162
- element.decompose()
 
 
 
 
 
 
 
 
 
 
 
163
 
164
- # Remove specific elements for known sites
165
  if 'facebook.com' in url:
166
- for element in soup.select('[data-testid="cookie-policy-manage-dialog"], [role="banner"], [role="complementary"]'):
167
- element.decompose()
168
- elif 'instagram.com' in url or 'twitter.com' in url or 'x.com' in url:
169
- for element in soup.select('[role="presentation"], [role="banner"], [role="complementary"]'):
170
- element.decompose()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  elif 'huggingface.co' in url:
172
  # Special handling for Hugging Face
173
  logger.info("Applying special handling for Hugging Face")
 
139
  # Try with a different user agent if it's a social media site
140
  if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
141
  self.session.headers.update({
142
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
143
+ # Add cookie consent headers to bypass some login walls
144
+ 'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080'
145
  })
146
+ # For Facebook, try to access the mobile version which often has fewer restrictions
147
+ if 'facebook.com' in url and 'm.facebook.com' not in url:
148
+ url = url.replace('www.facebook.com', 'm.facebook.com')
149
+ logger.info(f"Switched to mobile Facebook URL: {url}")
150
 
151
  response = self.session.get(url, timeout=self.timeout)
152
  response.raise_for_status()
 
154
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
155
 
156
  # Save the raw HTML for debugging if needed
157
+ debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
158
+ with open(debug_path, "w", encoding="utf-8") as f:
159
  f.write(response.text)
160
+ logger.info(f"Saved raw HTML to {debug_path}")
161
 
162
  soup = BeautifulSoup(response.text, 'html.parser')
163
 
 
165
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
166
  element.decompose()
167
 
168
+ # Simulate "ESC key" by removing login walls and overlays common on social media sites
169
+ login_wall_selectors = [
170
+ '.login-wall', '.signup-wall', '.overlay', '.modal',
171
+ '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
172
+ '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
173
+ '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
174
+ '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
175
+ '.login-wrapper', '.login-container', '.signup-container',
176
+ '.login-modal', '.signup-modal', '.auth-modal', '.auth-wall'
177
+ ]
178
+ for selector in login_wall_selectors:
179
+ for element in soup.select(selector):
180
+ logger.info(f"Removing login wall element: {selector}")
181
+ element.decompose()
182
 
183
+ # Enhanced removal for social media sites
184
  if 'facebook.com' in url:
185
+ # Facebook specific elements - simulating ESC key
186
+ fb_selectors = [
187
+ '[data-testid="cookie-policy-manage-dialog"]',
188
+ '[role="banner"]', '[role="complementary"]',
189
+ '.login_form_container', '.login_form', '#login_form',
190
+ '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
191
+ '._5hn6', '._67m7', '.nonLoggedInSignUp',
192
+ '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
193
+ ]
194
+ for selector in fb_selectors:
195
+ for element in soup.select(selector):
196
+ element.decompose()
197
+
198
+ # Look for the main content in mobile version
199
+ main_content = soup.select_one('#m_story_permalink_view') or soup.select_one('#mobile_injected_video_feed_pagelet')
200
+ if main_content:
201
+ logger.info("Found Facebook mobile main content")
202
+
203
+ elif 'instagram.com' in url:
204
+ # Instagram specific elements - simulating ESC key
205
+ ig_selectors = [
206
+ '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
207
+ '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
208
+ '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
209
+ '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
210
+ '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
211
+ ]
212
+ for selector in ig_selectors:
213
+ for element in soup.select(selector):
214
+ element.decompose()
215
+
216
+ # Try to find the main content
217
+ insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one('._ab1y')
218
+ if insta_content:
219
+ logger.info("Found Instagram main content")
220
+
221
+ elif 'twitter.com' in url or 'x.com' in url:
222
+ # X/Twitter already works well for public content, but clean up any remaining overlays
223
+ x_selectors = [
224
+ '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
225
+ '[data-testid="sheetDialog"]', '[data-testid="mask"]',
226
+ '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
227
+ '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
228
+ '.r-kemksi', '[data-testid="BottomBar"]'
229
+ ]
230
+ for selector in x_selectors:
231
+ for element in soup.select(selector):
232
+ element.decompose()
233
+
234
  elif 'huggingface.co' in url:
235
  # Special handling for Hugging Face
236
  logger.info("Applying special handling for Hugging Face")