acecalisto3 committed on
Commit
1206535
·
verified ·
1 Parent(s): e14c07b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -6
app.py CHANGED
@@ -119,9 +119,51 @@ class URLProcessor:
119
  if 'calendar.google.com' in url and 'ical' in url:
120
  return self._handle_google_calendar(url)
121
 
122
- # Standard HTML processing
123
  result = self._fetch_html_content(url)
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  # Log the result status
126
  if result:
127
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
@@ -138,18 +180,42 @@ class URLProcessor:
138
  try:
139
  # Try with a different user agent if it's a social media site
140
  if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
 
141
  self.session.headers.update({
142
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
143
  # Add cookie consent headers to bypass some login walls
144
- 'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080'
 
 
 
 
 
 
 
 
 
145
  })
146
  # For Facebook, try to access the mobile version which often has fewer restrictions
147
  if 'facebook.com' in url and 'm.facebook.com' not in url:
148
  url = url.replace('www.facebook.com', 'm.facebook.com')
149
  logger.info(f"Switched to mobile Facebook URL: {url}")
150
 
151
- response = self.session.get(url, timeout=self.timeout)
152
- response.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
155
 
@@ -159,6 +225,11 @@ class URLProcessor:
159
  f.write(response.text)
160
  logger.info(f"Saved raw HTML to {debug_path}")
161
 
 
 
 
 
 
162
  soup = BeautifulSoup(response.text, 'html.parser')
163
 
164
  # Remove unwanted elements
@@ -628,4 +699,66 @@ def main():
628
  )
629
 
630
  if __name__ == "__main__":
631
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  if 'calendar.google.com' in url and 'ical' in url:
120
  return self._handle_google_calendar(url)
121
 
122
+ # Try standard HTML processing first
123
  result = self._fetch_html_content(url)
124
 
125
+ # If standard processing failed or returned minimal content, try with Selenium
126
+ if not result or len(result.get('content', '')) < 100:
127
+ logger.info(f"Standard processing failed or returned minimal content for {url}, trying Selenium")
128
+ selenium_html = self._fetch_with_selenium(url)
129
+
130
+ if selenium_html:
131
+ # Process the Selenium HTML
132
+ soup = BeautifulSoup(selenium_html, 'html.parser')
133
+
134
+ # Remove unwanted elements
135
+ for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
136
+ element.decompose()
137
+
138
+ # Apply the same content extraction strategies as in _fetch_html_content
139
+ # Strategy 1: Look for semantic HTML5 elements
140
+ main_content = None
141
+ for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
142
+ elements = soup.select(selector)
143
+ if elements:
144
+ main_content = elements[0]
145
+ logger.info(f"Found content with selector: {selector}")
146
+ break
147
+
148
+ # If no main content found, use body
149
+ if not main_content or not main_content.get_text(strip=True):
150
+ main_content = soup.body if soup.body else soup
151
+
152
+ # Extract text
153
+ text_content = main_content.get_text(separator='\n', strip=True)
154
+
155
+ # Clean content
156
+ cleaned_content = self.advanced_text_cleaning(text_content)
157
+
158
+ if len(cleaned_content) >= 20:
159
+ result = {
160
+ 'content': cleaned_content,
161
+ 'content_type': 'text/html',
162
+ 'timestamp': datetime.now().isoformat(),
163
+ 'url': url,
164
+ 'source': 'selenium' # Mark that this came from Selenium
165
+ }
166
+
167
  # Log the result status
168
  if result:
169
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
 
180
  try:
181
  # Try with a different user agent if it's a social media site
182
  if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
183
+ # Use a more realistic browser user agent instead of random one
184
  self.session.headers.update({
185
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
186
  # Add cookie consent headers to bypass some login walls
187
+ 'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080; consent_accepted=true; cookie_consent=accepted',
188
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
189
+ 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
190
+ 'sec-ch-ua-mobile': '?0',
191
+ 'sec-ch-ua-platform': '"macOS"',
192
+ 'Sec-Fetch-Dest': 'document',
193
+ 'Sec-Fetch-Mode': 'navigate',
194
+ 'Sec-Fetch-Site': 'none',
195
+ 'Sec-Fetch-User': '?1',
196
+ 'Upgrade-Insecure-Requests': '1'
197
  })
198
  # For Facebook, try to access the mobile version which often has fewer restrictions
199
  if 'facebook.com' in url and 'm.facebook.com' not in url:
200
  url = url.replace('www.facebook.com', 'm.facebook.com')
201
  logger.info(f"Switched to mobile Facebook URL: {url}")
202
 
203
+ # Add a delay to simulate human browsing
204
+ time.sleep(1)
205
+
206
+ # Try to get the page with multiple attempts
207
+ max_attempts = 3
208
+ for attempt in range(max_attempts):
209
+ try:
210
+ response = self.session.get(url, timeout=self.timeout)
211
+ response.raise_for_status()
212
+ break
213
+ except (requests.exceptions.RequestException, Exception) as e:
214
+ if attempt < max_attempts - 1:
215
+ logger.warning(f"Attempt {attempt+1} failed for {url}: {e}. Retrying...")
216
+ time.sleep(2) # Wait longer between retries
217
+ else:
218
+ raise
219
 
220
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
221
 
 
225
  f.write(response.text)
226
  logger.info(f"Saved raw HTML to {debug_path}")
227
 
228
+ # Check if we got a valid response with content
229
+ if not response.text or len(response.text) < 100:
230
+ logger.error(f"Empty or very short response from {url}")
231
+ return None
232
+
233
  soup = BeautifulSoup(response.text, 'html.parser')
234
 
235
  # Remove unwanted elements
 
699
  )
700
 
701
  if __name__ == "__main__":
702
+ main()
703
+
704
+
705
def _fetch_with_selenium(self, url: str) -> Optional[str]:
    """Fetch fully-rendered page HTML with headless Chrome as a fallback.

    Used when plain HTTP fetching returns nothing or minimal content
    (e.g. JavaScript-heavy or bot-protected sites).

    Args:
        url: The URL to load in the browser.

    Returns:
        The rendered page source as a string, or ``None`` if Selenium is
        not installed or the fetch fails for any reason.

    NOTE(review): this function is defined at module level *after* the
    ``__main__`` guard, yet it is invoked as ``self._fetch_with_selenium(url)``
    from URLProcessor — it must be moved into that class (or the call sites
    changed) for the fallback to work; as written the call raises
    AttributeError.
    """
    try:
        # Imported lazily so the module still loads when Selenium is absent.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.common.by import By
        from selenium.webdriver.common.keys import Keys
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        import os
        import tempfile
        import time

        logger.info(f"Attempting to fetch {url} with Selenium")

        # Headless Chrome configured to look like a regular desktop browser.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        # Plain string literal (the original used an f-string with no
        # placeholders).
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)

            # Wait up to 10s for the document body to exist before reading it.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Press ESC to dismiss modal overlays / cookie banners, then give
            # any animations a moment to settle.
            webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
            time.sleep(2)

            page_source = driver.page_source

            # Save the rendered HTML for debugging. Write to the system temp
            # directory — the original hard-coded a user-specific macOS path
            # (/Users/a2014/urld/…) that fails on any other machine — and
            # never let a debug-save failure break the fetch itself.
            try:
                debug_path = os.path.join(
                    tempfile.gettempdir(),
                    f"debug_selenium_{int(time.time())}.html",
                )
                with open(debug_path, "w", encoding="utf-8") as f:
                    f.write(page_source)
                logger.info(f"Saved Selenium HTML to {debug_path}")
            except OSError as e:
                logger.warning(f"Could not save Selenium debug HTML: {e}")

            return page_source
        finally:
            # Always release the browser process, even on failure.
            driver.quit()

    except ImportError:
        logger.error("Selenium is not installed. Cannot use browser automation.")
        return None
    except Exception as e:
        logger.error(f"Selenium processing failed for {url}: {e}")
        return None