Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -139,8 +139,14 @@ class URLProcessor:
|
|
139 |
# Try with a different user agent if it's a social media site
|
140 |
if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
|
141 |
self.session.headers.update({
|
142 |
-
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
|
|
|
|
|
143 |
})
|
|
|
|
|
|
|
|
|
144 |
|
145 |
response = self.session.get(url, timeout=self.timeout)
|
146 |
response.raise_for_status()
|
@@ -148,8 +154,10 @@ class URLProcessor:
|
|
148 |
logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
|
149 |
|
150 |
# Save the raw HTML for debugging if needed
|
151 |
-
|
|
|
152 |
f.write(response.text)
|
|
|
153 |
|
154 |
soup = BeautifulSoup(response.text, 'html.parser')
|
155 |
|
@@ -157,17 +165,72 @@ class URLProcessor:
|
|
157 |
for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
|
158 |
element.decompose()
|
159 |
|
160 |
-
#
|
161 |
-
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
-
#
|
165 |
if 'facebook.com' in url:
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
elif 'huggingface.co' in url:
|
172 |
# Special handling for Hugging Face
|
173 |
logger.info("Applying special handling for Hugging Face")
|
|
|
139 |
# Try with a different user agent if it's a social media site
|
140 |
if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
|
141 |
self.session.headers.update({
|
142 |
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
|
143 |
+
# Send a Cookie header with placeholder values to try to get past some login walls
|
144 |
+
'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080'
|
145 |
})
|
146 |
+
# For Facebook, try to access the mobile version, which often has fewer restrictions
|
147 |
+
if 'facebook.com' in url and 'm.facebook.com' not in url:
|
148 |
+
url = url.replace('www.facebook.com', 'm.facebook.com')
|
149 |
+
logger.info(f"Switched to mobile Facebook URL: {url}")
|
150 |
|
151 |
response = self.session.get(url, timeout=self.timeout)
|
152 |
response.raise_for_status()
|
|
|
154 |
logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
|
155 |
|
156 |
# Save the raw HTML for debugging if needed
|
157 |
+
debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
|
158 |
+
with open(debug_path, "w", encoding="utf-8") as f:
|
159 |
f.write(response.text)
|
160 |
+
logger.info(f"Saved raw HTML to {debug_path}")
|
161 |
|
162 |
soup = BeautifulSoup(response.text, 'html.parser')
|
163 |
|
|
|
165 |
for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
|
166 |
element.decompose()
|
167 |
|
168 |
+
# Simulate "ESC key" by removing login walls and overlays common on social media sites
|
169 |
+
login_wall_selectors = [
|
170 |
+
'.login-wall', '.signup-wall', '.overlay', '.modal',
|
171 |
+
'[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
|
172 |
+
'.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
|
173 |
+
'[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
|
174 |
+
'.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
|
175 |
+
'.login-wrapper', '.login-container', '.signup-container',
|
176 |
+
'.login-modal', '.signup-modal', '.auth-modal', '.auth-wall'
|
177 |
+
]
|
178 |
+
for selector in login_wall_selectors:
|
179 |
+
for element in soup.select(selector):
|
180 |
+
logger.info(f"Removing login wall element: {selector}")
|
181 |
+
element.decompose()
|
182 |
|
183 |
+
# Enhanced removal for social media sites
|
184 |
if 'facebook.com' in url:
|
185 |
+
# Facebook specific elements - simulating ESC key
|
186 |
+
fb_selectors = [
|
187 |
+
'[data-testid="cookie-policy-manage-dialog"]',
|
188 |
+
'[role="banner"]', '[role="complementary"]',
|
189 |
+
'.login_form_container', '.login_form', '#login_form',
|
190 |
+
'.uiLayer', '.pluginConnectButton', '.fbPageBanner',
|
191 |
+
'._5hn6', '._67m7', '.nonLoggedInSignUp',
|
192 |
+
'#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
|
193 |
+
]
|
194 |
+
for selector in fb_selectors:
|
195 |
+
for element in soup.select(selector):
|
196 |
+
element.decompose()
|
197 |
+
|
198 |
+
# Look for the main content in mobile version
|
199 |
+
main_content = soup.select_one('#m_story_permalink_view') or soup.select_one('#mobile_injected_video_feed_pagelet')
|
200 |
+
if main_content:
|
201 |
+
logger.info("Found Facebook mobile main content")
|
202 |
+
|
203 |
+
elif 'instagram.com' in url:
|
204 |
+
# Instagram specific elements - simulating ESC key
|
205 |
+
ig_selectors = [
|
206 |
+
'[role="presentation"]', '[role="banner"]', '[role="complementary"]',
|
207 |
+
'.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
|
208 |
+
'.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
|
209 |
+
'._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
|
210 |
+
'.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
|
211 |
+
]
|
212 |
+
for selector in ig_selectors:
|
213 |
+
for element in soup.select(selector):
|
214 |
+
element.decompose()
|
215 |
+
|
216 |
+
# Try to find the main content
|
217 |
+
insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one('._ab1y')
|
218 |
+
if insta_content:
|
219 |
+
logger.info("Found Instagram main content")
|
220 |
+
|
221 |
+
elif 'twitter.com' in url or 'x.com' in url:
|
222 |
+
# X/Twitter already works well for public content, but clean up any remaining overlays
|
223 |
+
x_selectors = [
|
224 |
+
'[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
|
225 |
+
'[data-testid="sheetDialog"]', '[data-testid="mask"]',
|
226 |
+
'.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
|
227 |
+
'.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
|
228 |
+
'.r-kemksi', '[data-testid="BottomBar"]'
|
229 |
+
]
|
230 |
+
for selector in x_selectors:
|
231 |
+
for element in soup.select(selector):
|
232 |
+
element.decompose()
|
233 |
+
|
234 |
elif 'huggingface.co' in url:
|
235 |
# Special handling for Hugging Face
|
236 |
logger.info("Applying special handling for Hugging Face")
|