Update app.py

app.py CHANGED
@@ -119,9 +119,51 @@ class URLProcessor:
         if 'calendar.google.com' in url and 'ical' in url:
             return self._handle_google_calendar(url)
 
-        #
+        # Try standard HTML processing first
         result = self._fetch_html_content(url)
 
+        # If standard processing failed or returned minimal content, try with Selenium
+        if not result or len(result.get('content', '')) < 100:
+            logger.info(f"Standard processing failed or returned minimal content for {url}, trying Selenium")
+            selenium_html = self._fetch_with_selenium(url)
+
+            if selenium_html:
+                # Process the Selenium HTML
+                soup = BeautifulSoup(selenium_html, 'html.parser')
+
+                # Remove unwanted elements
+                for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
+                    element.decompose()
+
+                # Apply the same content extraction strategies as in _fetch_html_content
+                # Strategy 1: Look for semantic HTML5 elements
+                main_content = None
+                for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
+                    elements = soup.select(selector)
+                    if elements:
+                        main_content = elements[0]
+                        logger.info(f"Found content with selector: {selector}")
+                        break
+
+                # If no main content found, use body
+                if not main_content or not main_content.get_text(strip=True):
+                    main_content = soup.body if soup.body else soup
+
+                # Extract text
+                text_content = main_content.get_text(separator='\n', strip=True)
+
+                # Clean content
+                cleaned_content = self.advanced_text_cleaning(text_content)
+
+                if len(cleaned_content) >= 20:
+                    result = {
+                        'content': cleaned_content,
+                        'content_type': 'text/html',
+                        'timestamp': datetime.now().isoformat(),
+                        'url': url,
+                        'source': 'selenium'  # Mark that this came from Selenium
+                    }
+
         # Log the result status
         if result:
             logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
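Review note: the hunk above wires a two-tier fetch: the cheap static path runs first, and the headless-browser path is paid only when the extracted text is under 100 characters. A minimal standalone sketch of that pattern, where fetch_static and fetch_rendered are hypothetical stand-ins for _fetch_html_content and _fetch_with_selenium (not names from the app):

from typing import Callable, Optional

MIN_CONTENT_CHARS = 100  # same threshold the hunk uses

def fetch_with_fallback(url: str,
                        fetch_static: Callable[[str], Optional[dict]],
                        fetch_rendered: Callable[[str], Optional[str]]) -> Optional[dict]:
    # Cheap path first: plain HTTP fetch plus HTML parsing.
    result = fetch_static(url)
    if result and len(result.get('content', '')) >= MIN_CONTENT_CHARS:
        return result
    # Expensive path: a rendered-DOM fetch (e.g. a headless browser),
    # only when the static pass produced little or nothing.
    html = fetch_rendered(url)
    if html is None:
        return result  # keep whatever the static pass managed
    return {'content': html, 'source': 'rendered'}
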
@@ -138,18 +180,42 @@ class URLProcessor:
         try:
             # Try with a different user agent if it's a social media site
             if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
+                # Use a more realistic browser user agent instead of random one
                 self.session.headers.update({
-                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
+                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                     # Add cookie consent headers to bypass some login walls
-                    'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080'
+                    'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080; consent_accepted=true; cookie_consent=accepted',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"macOS"',
+                    'Sec-Fetch-Dest': 'document',
+                    'Sec-Fetch-Mode': 'navigate',
+                    'Sec-Fetch-Site': 'none',
+                    'Sec-Fetch-User': '?1',
+                    'Upgrade-Insecure-Requests': '1'
                 })
             # For Facebook, try to access the mobile version which often has fewer restrictions
             if 'facebook.com' in url and 'm.facebook.com' not in url:
                 url = url.replace('www.facebook.com', 'm.facebook.com')
                 logger.info(f"Switched to mobile Facebook URL: {url}")
 
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
+            # Add a delay to simulate human browsing
+            time.sleep(1)
+
+            # Try to get the page with multiple attempts
+            max_attempts = 3
+            for attempt in range(max_attempts):
+                try:
+                    response = self.session.get(url, timeout=self.timeout)
+                    response.raise_for_status()
+                    break
+                except (requests.exceptions.RequestException, Exception) as e:
+                    if attempt < max_attempts - 1:
+                        logger.warning(f"Attempt {attempt+1} failed for {url}: {e}. Retrying...")
+                        time.sleep(2)  # Wait longer between retries
+                    else:
+                        raise
 
             logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
 
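Review note: in the retry loop above, except (requests.exceptions.RequestException, Exception) is effectively just except Exception, since RequestException is a subclass; and the fixed time.sleep(2) is a flat rather than growing back-off. If the manual loop becomes a burden, requests can do the same thing declaratively via urllib3's Retry. A sketch under the assumption of urllib3 >= 1.26 (older versions spell allowed_methods as method_whitelist); the URL and timeout are placeholders:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(
    total=3,                                     # same attempt budget as the loop above
    backoff_factor=2,                            # exponential back-off between attempts
    status_forcelist=[429, 500, 502, 503, 504],  # also retry on these HTTP statuses
    allowed_methods=["GET"],
)
session.mount("https://", HTTPAdapter(max_retries=retry))
session.mount("http://", HTTPAdapter(max_retries=retry))

response = session.get("https://example.com", timeout=10)
response.raise_for_status()
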
@@ -159,6 +225,11 @@ class URLProcessor:
                 f.write(response.text)
             logger.info(f"Saved raw HTML to {debug_path}")
 
+            # Check if we got a valid response with content
+            if not response.text or len(response.text) < 100:
+                logger.error(f"Empty or very short response from {url}")
+                return None
+
             soup = BeautifulSoup(response.text, 'html.parser')
 
             # Remove unwanted elements
@@ -628,4 +699,66 @@ def main():
     )
 
 if __name__ == "__main__":
-    main()
+    main()
+
+
+def _fetch_with_selenium(self, url: str) -> Optional[str]:
+    """Use Selenium as a fallback for difficult sites"""
+    try:
+        from selenium import webdriver
+        from selenium.webdriver.chrome.options import Options
+        from selenium.webdriver.common.by import By
+        from selenium.webdriver.support.ui import WebDriverWait
+        from selenium.webdriver.support import expected_conditions as EC
+        from selenium.common.exceptions import TimeoutException
+        import time
+
+        logger.info(f"Attempting to fetch {url} with Selenium")
+
+        # Set up Chrome options
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--window-size=1920,1080")
+        chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+        # Initialize the driver
+        driver = webdriver.Chrome(options=chrome_options)
+
+        try:
+            # Navigate to the URL
+            driver.get(url)
+
+            # Wait for the page to load
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+
+            # Simulate pressing ESC key to dismiss overlays
+            from selenium.webdriver.common.keys import Keys
+            webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
+
+            # Wait a bit for any animations to complete
+            time.sleep(2)
+
+            # Get the page source
+            page_source = driver.page_source
+
+            # Save the Selenium HTML for debugging
+            debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+            with open(debug_path, "w", encoding="utf-8") as f:
+                f.write(page_source)
+            logger.info(f"Saved Selenium HTML to {debug_path}")
+
+            return page_source
+        finally:
+            driver.quit()
+
+    except ImportError:
+        logger.error("Selenium is not installed. Cannot use browser automation.")
+        return None
+    except Exception as e:
+        logger.error(f"Selenium processing failed for {url}: {e}")
+        return None
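Review note: as committed, _fetch_with_selenium is appended after the if __name__ == "__main__": guard at module level, yet the first hunk calls it as self._fetch_with_selenium(url); that call will raise AttributeError until the def moves inside URLProcessor. The hardcoded debug path /Users/a2014/urld/ is also a local macOS path that will not exist on a Space, and the broad except Exception will silently turn that write failure into a None return. A trimmed sketch of the placement that would let the first hunk's call resolve, with the body reduced to the essential calls (not the full committed method):

from typing import Optional

class URLProcessor:
    def _fetch_with_selenium(self, url: str) -> Optional[str]:
        """Fallback fetch via a headless browser; returns page HTML or None."""
        try:
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
        except ImportError:
            return None  # Selenium not installed
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            return driver.page_source
        finally:
            driver.quit()  # always release the browser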