acecalisto3 committed (verified)
Commit fa62c34 · 1 Parent(s): 17fdb3b

Update app.py

Files changed (1)
  1. app.py +93 -124
app.py CHANGED
@@ -30,11 +30,26 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


+# Add these imports at the top
+from config import Config
+from proxy_handler import ProxyHandler
+from robots_handler import RobotsHandler
+import asyncio
+import aiohttp
+from tqdm import tqdm
+
 class URLProcessor:
     def __init__(self):
-        self.session = requests.Session()
-        self.timeout = 10  # seconds
-        self.session.headers.update({
+        self.config = Config()
+        self.proxy_handler = ProxyHandler(self.config.get('PROXY_URL'))
+        self.robots_handler = RobotsHandler()
+        self.session = self._create_session()
+
+    def _create_session(self):
+        session = requests.Session()
+        if self.config.get('USE_PROXY'):
+            session.proxies = self.proxy_handler.get_proxy_config()
+        session.headers.update({
             'User-Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
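
The hunk above imports config and proxy_handler modules that are not part of this commit. A minimal sketch of the interface that __init__ and _create_session appear to assume (an environment-backed Config.get(key) and a ProxyHandler.get_proxy_config() returning a requests-style proxies mapping); the class and method names come from the diff, the bodies below are assumptions:

# config.py -- hypothetical sketch; only Config.get() is implied by the diff
import os

class Config:
    """Looks up settings such as PROXY_URL, USE_PROXY and RESPECT_ROBOTS."""

    def get(self, key, default=None):
        value = os.environ.get(key, default)
        # Interpret "true"/"false" strings so flags like USE_PROXY behave as booleans
        if isinstance(value, str) and value.lower() in ("true", "false"):
            return value.lower() == "true"
        return value


# proxy_handler.py -- hypothetical sketch; only get_proxy_config() is implied by the diff
class ProxyHandler:
    def __init__(self, proxy_url=None):
        self.proxy_url = proxy_url

    def get_proxy_config(self):
        # requests expects a mapping of scheme to proxy URL
        if not self.proxy_url:
            return {}
        return {"http": self.proxy_url, "https": self.proxy_url}
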
@@ -42,133 +57,86 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
+        return session

-    def advanced_text_cleaning(self, text: str) -> str:
-        """Robust text cleaning with version compatibility"""
-        try:
-            cleaned_text = clean(
-                text,
-                to_ascii=True,
-                lower=True,
-                no_line_breaks=True,
-                no_urls=True,
-                no_emails=True,
-                no_phone_numbers=True,
-                no_numbers=False,
-                no_digits=False,
-                no_currency_symbols=True,
-                no_punct=False
-            ).strip()
-            return cleaned_text
-        except Exception as e:
-            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
-            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
-            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
-            return text.strip()
-
-    def validate_url(self, url: str) -> Dict:
-        """Validate URL format and accessibility"""
+    def _fetch_with_selenium(self, url: str) -> Optional[str]:
         try:
-            if not validators.url(url):
-                return {'is_valid': False, 'message': 'Invalid URL format'}
-            # Try with DNS resolution retry
-            for attempt in range(3):  # Try up to 3 times
-                try:
-                    # Some sites block HEAD requests but allow GET
-                    try:
-                        response = self.session.head(url, timeout=self.timeout)
-                        response.raise_for_status()
-                    except (requests.exceptions.RequestException, Exception) as e:
-                        logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
-                        # Try with GET request if HEAD fails
-                        response = self.session.get(url, timeout=self.timeout, stream=True)
-                        response.raise_for_status()
-                        # Close the connection to avoid downloading the entire content
-                        response.close()
-
-                    return {'is_valid': True, 'message': 'URL is valid and accessible'}
-                except requests.exceptions.ConnectionError as e:
-                    if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
-                        logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
-                        time.sleep(1)  # Wait a bit before retrying
-                        continue
-                    else:
-                        raise
-                except Exception as e:
-                    raise
-            # If we get here, all attempts failed
-            return {'is_valid': False,
-                    'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
-        except Exception as e:
-            logger.error(f"URL validation failed for {url}: {str(e)}")
-            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
+            chrome_options = Options()
+            from selenium import webdriver
+            from selenium.webdriver.chrome.options import Options
+            from selenium.webdriver.common.by import By
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.common.exceptions import TimeoutException
+            import time

-    def fetch_content(self, url: str) -> Optional[Dict]:
-        """Universal content fetcher with special case handling"""
-        try:
-            logger.info(f"Fetching content from: {url}")
-
-            # Google Drive document handling
-            if 'drive.google.com' in url:
-                return self._handle_google_drive(url)
-            # Google Calendar ICS handling
-            if 'calendar.google.com' in url and 'ical' in url:
-                return self._handle_google_calendar(url)
-            # Try standard HTML processing first
-            result = self._fetch_html_content(url)
-
-            # If standard processing failed or returned minimal content, try with Selenium
-            if not result or len(result.get('content', '')) < 100:
-                logger.info(
-                    f"Standard processing failed or returned minimal content for {url}, trying Selenium")
-                selenium_html = self._fetch_with_selenium(url)
-                if selenium_html:
-                    # Process the Selenium HTML
-                    soup = BeautifulSoup(selenium_html, 'html.parser')
-                    # Remove unwanted elements
-                    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
-                        element.decompose()
+            logger.info(f"Attempting to fetch {url} with Selenium")

-                    # Apply the same content extraction strategies as in _fetch_html_content
-                    # Strategy 1: Look for semantic HTML5 elements
-                    main_content = None
-                    for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post',
-                                     '.entry', '.page']:
-                        elements = soup.select(selector)
-                        if elements:
-                            main_content = elements[0]
-                            logger.info(f"Found content with selector: {selector}")
-                            break
-
-                    # If no main content found, use body
-                    if not main_content or not main_content.get_text(strip=True):
-                        main_content = soup.body if soup.body else soup
-
-                    # Extract text
-                    text_content = main_content.get_text(separator='\n', strip=True)
-
-                    # Clean content
-                    cleaned_content = self.advanced_text_cleaning(text_content)
-
-                    if len(cleaned_content) >= 20:
-                        result = {
-                            'content': cleaned_content,
-                            'content_type': 'text/html',
-                            'timestamp': datetime.now().isoformat(),
-                            'url': url,
-                            'source': 'selenium'  # Mark that this came from Selenium
-                        }
-            # Log the result status
-            if result:
-                logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
-            else:
-                logger.error(f"Failed to extract content from {url}")
-            return result
+            # Set up Chrome options
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument(
+                "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+            # Initialize the driver
+            driver = webdriver.Chrome(options=chrome_options)
+
+            try:
+                # Navigate to the URL
+                driver.get(url)
+
+                # Wait for the page to load
+                WebDriverWait(driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "body"))
+                )
+
+                # Simulate pressing ESC key to dismiss overlays
+                from selenium.webdriver.common.keys import Keys
+                action_chains = webdriver.ActionChains(driver)
+                action_chains.send_keys(Keys.ESCAPE).perform()
+                time.sleep(1)  # give it a moment to take effect
+                action_chains.reset_actions()  # Clear actions
+
+                # try again
+                action_chains.send_keys(Keys.ESCAPE).perform()
+                time.sleep(1)  # give it a moment to take effect
+                action_chains.reset_actions()
+
+                # Get the page source
+                page_source = driver.page_source
+
+                # Save the Selenium HTML for debugging
+                debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+                with open(debug_path, "w", encoding="utf-8") as f:
+                    f.write(page_source)
+                logger.info(f"Saved Selenium HTML to {debug_path}")
+
+                return page_source
+            finally:
+                driver.quit()
+
+        except ImportError:
+            logger.error("Selenium is not installed. Cannot use browser automation.")
+            return None
         except Exception as e:
-            logger.error(f"Content fetch failed for {url}: {e}")
+            logger.error(f"Selenium processing failed for {url}: {e}")
             return None

+    async def fetch_urls_async(self, urls: List[str]) -> List[Dict]:
+        async with aiohttp.ClientSession() as session:
+            tasks = []
+            for url in urls:
+                if self.config.get('RESPECT_ROBOTS'):
+                    if not self.robots_handler.can_fetch(url, self.session.headers['User-Agent']):
+                        logger.warning(f"Skipping {url} due to robots.txt restrictions")
+                        continue
+                tasks.append(self.fetch_content_async(session, url))
+            return await asyncio.gather(*tasks)
+
     def _fetch_html_content(self, url: str) -> Optional[Dict]:
         """Standard HTML content processing"""
         try:
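
fetch_urls_async in the hunk above skips URLs disallowed by robots.txt through self.robots_handler.can_fetch(url, user_agent), but the robots_handler module is not included in this commit either. A plausible sketch built on the standard-library urllib.robotparser; only the class and method names are taken from the diff, the implementation is an assumption:

# robots_handler.py -- hypothetical sketch using urllib.robotparser
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

class RobotsHandler:
    def __init__(self):
        self._parsers = {}  # cache one parser per scheme://host

    def can_fetch(self, url: str, user_agent: str) -> bool:
        root = "{0.scheme}://{0.netloc}".format(urlparse(url))
        parser = self._parsers.get(root)
        if parser is None:
            parser = RobotFileParser(urljoin(root, "/robots.txt"))
            try:
                parser.read()
            except Exception:
                return True  # treat an unreachable robots.txt as allowing the fetch
            self._parsers[root] = parser
        return parser.can_fetch(user_agent, url)
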
@@ -682,6 +650,7 @@ def create_interface():
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
+
     with gr.Blocks(css=css, title="Advanced Text & URL Processing") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

@@ -805,4 +774,4 @@ def main():

 if __name__ == "__main__":
     main()
-
+
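
fetch_urls_async also awaits self.fetch_content_async(session, url), which does not appear in any hunk of this commit. A self-contained sketch of what such a coroutine could look like with aiohttp, plus how the batch fetch can be driven from synchronous code; the standalone form and the helper name fetch_urls are illustrative assumptions, not the app's actual method:

# Hypothetical sketch of the missing coroutine; in app.py it would be a method on URLProcessor.
import asyncio
import logging
from datetime import datetime
from typing import Dict, List, Optional

import aiohttp

logger = logging.getLogger(__name__)


async def fetch_content_async(session: aiohttp.ClientSession, url: str) -> Optional[Dict]:
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
            response.raise_for_status()
            text = await response.text()
            return {
                'content': text,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat(),
                'url': url,
            }
    except Exception as e:
        logger.error(f"Async fetch failed for {url}: {e}")
        return None


async def fetch_urls(urls: List[str]) -> List[Optional[Dict]]:
    # Mirrors the gather pattern used by fetch_urls_async in the diff
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_content_async(session, u) for u in urls))


if __name__ == "__main__":
    print(asyncio.run(fetch_urls(["https://example.com"])))
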
 