acecalisto3 committed (verified)
Commit 044ca3f · 1 Parent(s): 1206535

Update app.py

Files changed (1)
  1. app.py +188 -150
app.py CHANGED
@@ -72,7 +72,6 @@ class URLProcessor:
         try:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format'}
-
            # Try with DNS resolution retry
            for attempt in range(3): # Try up to 3 times
                try:
@@ -87,21 +86,20 @@ class URLProcessor:
                    response.raise_for_status()
                    # Close the connection to avoid downloading the entire content
                    response.close()
-
                    return {'is_valid': True, 'message': 'URL is valid and accessible'}
                except requests.exceptions.ConnectionError as e:
                    if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
-                        logger.warning(f"DNS resolution failed for {url}, attempt {attempt+1}/3")
                        time.sleep(1) # Wait a bit before retrying
                        continue
                    else:
                        raise
                except Exception as e:
                    raise
-
            # If we get here, all attempts failed
-            return {'is_valid': False, 'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
-
        except Exception as e:
            logger.error(f"URL validation failed for {url}: {str(e)}")
            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
@@ -110,51 +108,49 @@ class URLProcessor:
110
  """Universal content fetcher with special case handling"""
111
  try:
112
  logger.info(f"Fetching content from: {url}")
113
-
114
  # Google Drive document handling
115
  if 'drive.google.com' in url:
116
  return self._handle_google_drive(url)
117
-
118
  # Google Calendar ICS handling
119
  if 'calendar.google.com' in url and 'ical' in url:
120
  return self._handle_google_calendar(url)
121
-
122
  # Try standard HTML processing first
123
  result = self._fetch_html_content(url)
124
-
125
  # If standard processing failed or returned minimal content, try with Selenium
126
  if not result or len(result.get('content', '')) < 100:
127
- logger.info(f"Standard processing failed or returned minimal content for {url}, trying Selenium")
 
128
  selenium_html = self._fetch_with_selenium(url)
129
-
130
  if selenium_html:
131
  # Process the Selenium HTML
132
  soup = BeautifulSoup(selenium_html, 'html.parser')
133
-
134
  # Remove unwanted elements
135
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
136
  element.decompose()
137
-
138
  # Apply the same content extraction strategies as in _fetch_html_content
139
  # Strategy 1: Look for semantic HTML5 elements
140
  main_content = None
141
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
 
142
  elements = soup.select(selector)
143
  if elements:
144
  main_content = elements[0]
145
  logger.info(f"Found content with selector: {selector}")
146
  break
147
-
148
  # If no main content found, use body
149
  if not main_content or not main_content.get_text(strip=True):
150
  main_content = soup.body if soup.body else soup
151
-
152
  # Extract text
153
  text_content = main_content.get_text(separator='\n', strip=True)
154
-
155
  # Clean content
156
  cleaned_content = self.advanced_text_cleaning(text_content)
157
-
158
  if len(cleaned_content) >= 20:
159
  result = {
160
  'content': cleaned_content,
@@ -163,13 +159,11 @@ class URLProcessor:
163
  'url': url,
164
  'source': 'selenium' # Mark that this came from Selenium
165
  }
166
-
167
- # Log the result status
168
  if result:
169
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
170
  else:
171
  logger.error(f"Failed to extract content from {url}")
172
-
173
  return result
174
  except Exception as e:
175
  logger.error(f"Content fetch failed for {url}: {e}")
@@ -179,7 +173,8 @@ class URLProcessor:
179
  """Standard HTML content processing"""
180
  try:
181
  # Try with a different user agent if it's a social media site
182
- if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
 
183
  # Use a more realistic browser user agent instead of random one
184
  self.session.headers.update({
185
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
@@ -199,10 +194,10 @@ class URLProcessor:
199
  if 'facebook.com' in url and 'm.facebook.com' not in url:
200
  url = url.replace('www.facebook.com', 'm.facebook.com')
201
  logger.info(f"Switched to mobile Facebook URL: {url}")
202
-
203
- # Add a delay to simulate human browsing
204
- time.sleep(1)
205
-
206
  # Try to get the page with multiple attempts
207
  max_attempts = 3
208
  for attempt in range(max_attempts):
@@ -212,34 +207,32 @@ class URLProcessor:
212
  break
213
  except (requests.exceptions.RequestException, Exception) as e:
214
  if attempt < max_attempts - 1:
215
- logger.warning(f"Attempt {attempt+1} failed for {url}: {e}. Retrying...")
216
  time.sleep(2) # Wait longer between retries
217
  else:
218
  raise
219
-
220
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
221
-
222
  # Save the raw HTML for debugging if needed
223
  debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
224
  with open(debug_path, "w", encoding="utf-8") as f:
225
  f.write(response.text)
226
  logger.info(f"Saved raw HTML to {debug_path}")
227
-
228
  # Check if we got a valid response with content
229
  if not response.text or len(response.text) < 100:
230
  logger.error(f"Empty or very short response from {url}")
231
  return None
232
-
233
- soup = BeautifulSoup(response.text, 'html.parser')
234
 
 
235
  # Remove unwanted elements
236
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
237
  element.decompose()
238
-
239
  # Simulate "ESC key" by removing login walls and overlays common on social media sites
240
  login_wall_selectors = [
241
- '.login-wall', '.signup-wall', '.overlay', '.modal',
242
- '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
243
  '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
244
  '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
245
  '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
@@ -250,85 +243,89 @@ class URLProcessor:
250
  for element in soup.select(selector):
251
  logger.info(f"Removing login wall element: {selector}")
252
  element.decompose()
253
-
254
  # Enhanced removal for social media sites
255
  if 'facebook.com' in url:
256
  # Facebook specific elements - simulating ESC key
257
  fb_selectors = [
258
- '[data-testid="cookie-policy-manage-dialog"]',
259
- '[role="banner"]', '[role="complementary"]',
260
- '.login_form_container', '.login_form', '#login_form',
261
- '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
262
- '._5hn6', '._67m7', '.nonLoggedInSignUp',
263
  '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
264
  ]
265
  for selector in fb_selectors:
266
  for element in soup.select(selector):
267
  element.decompose()
268
-
269
  # Look for the main content in mobile version
270
- main_content = soup.select_one('#m_story_permalink_view') or soup.select_one('#mobile_injected_video_feed_pagelet')
 
271
  if main_content:
272
  logger.info("Found Facebook mobile main content")
273
-
274
  elif 'instagram.com' in url:
275
  # Instagram specific elements - simulating ESC key
276
  ig_selectors = [
277
- '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
278
- '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
279
- '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
280
  '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
281
  '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
282
  ]
283
  for selector in ig_selectors:
284
  for element in soup.select(selector):
285
  element.decompose()
286
-
287
  # Try to find the main content
288
- insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one('._ab1y')
 
289
  if insta_content:
290
  logger.info("Found Instagram main content")
291
-
292
  elif 'twitter.com' in url or 'x.com' in url:
293
  # X/Twitter already works well for public content, but clean up any remaining overlays
294
  x_selectors = [
295
  '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
296
  '[data-testid="sheetDialog"]', '[data-testid="mask"]',
297
- '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
298
- '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
299
  '.r-kemksi', '[data-testid="BottomBar"]'
300
  ]
301
  for selector in x_selectors:
302
  for element in soup.select(selector):
303
  element.decompose()
304
-
305
  elif 'huggingface.co' in url:
306
  # Special handling for Hugging Face
307
  logger.info("Applying special handling for Hugging Face")
308
  # Try to find the main content
309
- hf_selectors = ['.prose', '.space-content', '.model-description', '.dataset-description', 'article', '.markdown']
 
310
  for selector in hf_selectors:
311
  elements = soup.select(selector)
312
  if elements:
313
  logger.info(f"Found Hugging Face content with selector: {selector}")
314
  break
315
-
316
  # Extract content using a general approach - try multiple strategies
317
  # Strategy 1: Look for semantic HTML5 elements
318
  main_content = None
319
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
 
320
  elements = soup.select(selector)
321
  if elements:
322
  main_content = elements[0]
323
  logger.info(f"Found content with selector: {selector}")
324
  break
325
-
326
  # Strategy 2: If no semantic elements, try common class names
327
  if not main_content or not main_content.get_text(strip=True):
328
  for div in soup.find_all('div'):
329
  class_name = div.get('class', [])
330
  id_name = div.get('id', '')
331
- if any(term in ' '.join(class_name).lower() for term in ['content', 'main', 'body', 'article', 'post']):
 
332
  main_content = div
333
  logger.info(f"Found content with div class: {class_name}")
334
  break
@@ -336,15 +333,15 @@ class URLProcessor:
336
  main_content = div
337
  logger.info(f"Found content with div id: {id_name}")
338
  break
339
-
340
  # Strategy 3: Fall back to body
341
  if not main_content or not main_content.get_text(strip=True):
342
  logger.info(f"No main content container found for {url}, using body")
343
  main_content = soup.body if soup.body else soup
344
-
345
  # Extract text with proper spacing
346
  text_content = main_content.get_text(separator='\n', strip=True)
347
-
348
  # Strategy 4: If content is too short, extract all visible text
349
  if len(text_content) < 100:
350
  logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
@@ -353,17 +350,16 @@ class URLProcessor:
353
  if element.get_text(strip=True):
354
  visible_text.append(element.get_text(strip=True))
355
  text_content = '\n'.join(visible_text)
356
-
357
  # Strategy 5: Last resort - get all text from the page
358
  if len(text_content) < 50:
359
  logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
360
  text_content = soup.get_text(separator='\n', strip=True)
361
-
362
  # Clean and structure content
363
  cleaned_content = self.advanced_text_cleaning(text_content)
364
-
365
  logger.info(f"Final content length: {len(cleaned_content)} chars")
366
-
367
  # If we still have no content, this is a failure
368
  if len(cleaned_content) < 20:
369
  logger.error(f"Failed to extract meaningful content from {url}")
@@ -379,6 +375,117 @@ class URLProcessor:
379
  logger.error(f"HTML processing failed for {url}: {e}")
380
  return None
381
382
 
383
  class FileProcessor:
384
  """Class to handle file processing"""
@@ -411,7 +518,7 @@ class FileProcessor:
411
  dataset.extend(self._process_zip_file(file.name, temp_dir))
412
  else:
413
  dataset.extend(self._process_single_file(file))
414
- except Exception as e:
415
  logger.error(f"Error processing file: {str(e)}")
416
  return []
417
  return dataset
@@ -436,17 +543,14 @@ class FileProcessor:
436
  def _process_single_file(self, file) -> List[Dict]:
437
  try:
438
  file_stat = os.stat(file.name)
439
-
440
  # For very large files, read in chunks and summarize
441
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
442
  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
443
-
444
  # Read first and last 1MB for extremely large files
445
  content = ""
446
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
447
  content = f.read(1 * 1024 * 1024) # First 1MB
448
  content += "\n...[Content truncated due to large file size]...\n"
449
-
450
  # Seek to the last 1MB
451
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
452
  content += f.read() # Last 1MB
@@ -454,7 +558,6 @@ class FileProcessor:
454
  # Regular file processing
455
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
456
  content = f.read()
457
-
458
  return [{
459
  'source': 'file',
460
  'filename': os.path.basename(file.name),
@@ -469,6 +572,7 @@ class FileProcessor:
469
  logger.error(f"File processing error: {e}")
470
  return []
471
 
 
472
  # Move process_all_inputs outside of the FileProcessor class
473
  def process_all_inputs(urls, file, text, notes):
474
  """Process all input types with progress tracking"""
@@ -493,11 +597,9 @@ def process_all_inputs(urls, file, text, notes):
493
  'content': content,
494
  'timestamp': datetime.now().isoformat()
495
  })
496
-
497
  # Process files
498
  if file:
499
  results.extend(file_processor.process_file(file))
500
-
501
  # Process text input
502
  if text:
503
  cleaned_text = processor.advanced_text_cleaning(text)
@@ -506,7 +608,6 @@ def process_all_inputs(urls, file, text, notes):
506
  'content': cleaned_text,
507
  'timestamp': datetime.now().isoformat()
508
  })
509
-
510
  # Generate output
511
  if results:
512
  output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
@@ -515,23 +616,23 @@ def process_all_inputs(urls, file, text, notes):
515
 
516
  with open(output_path, 'w', encoding='utf-8') as f:
517
  json.dump(results, f, ensure_ascii=False, indent=2)
518
-
519
  summary = f"Processed {len(results)} items successfully!"
520
  json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
521
  return str(output_path), summary, json_data # Return JSON for editor
522
  else:
523
  return None, "No valid content to process.", ""
524
-
525
  except Exception as e:
526
  logger.error(f"Processing error: {e}")
527
  return None, f"Error: {str(e)}", ""
528
 
 
529
  # Also move generate_qr_code outside of the FileProcessor class
530
  def generate_qr_code(json_data):
531
  """Generate QR code from JSON data and return the file path."""
532
  if json_data:
533
  return generate_qr(json_data)
534
 
 
535
  # Move generate_qr outside of the FileProcessor class as well
536
  def generate_qr(json_data):
537
  """Generate QR code from JSON data and return the file path."""
@@ -544,7 +645,7 @@ def generate_qr(json_data):
544
  )
545
  qr.add_data(json_data)
546
  qr.make(fit=True)
547
-
548
  img = qrcode.make_image(fill_color="black", back_color="white")
549
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
550
  img.save(temp_file.name)
@@ -552,7 +653,7 @@ def generate_qr(json_data):
552
  except Exception as e:
553
  # If the data is too large for a QR code
554
  logger.error(f"QR generation error: {e}")
555
-
556
  # Create a simple QR with error message
557
  qr = qrcode.QRCode(
558
  version=1,
@@ -562,12 +663,13 @@ def generate_qr(json_data):
562
  )
563
  qr.add_data("Error: Data too large for QR code")
564
  qr.make(fit=True)
565
-
566
  img = qrcode.make_image(fill_color="black", back_color="white")
567
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
568
  img.save(temp_file.name)
569
  return temp_file.name
570
 
 
571
  def create_interface():
572
  """Create a comprehensive Gradio interface with advanced features"""
573
  css = """
@@ -575,7 +677,6 @@ def create_interface():
575
  .warning { background-color: #fff3cd; color: #856404; }
576
  .error { background-color: #f8d7da; color: #721c24; }
577
  """
578
-
579
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
580
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
581
 
@@ -628,13 +729,11 @@ def create_interface():
628
  inputs=[url_input, file_input, text_input, scratchpad],
629
  outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
630
  )
631
-
632
  qr_btn.click(
633
  generate_qr_code,
634
  inputs=json_editor,
635
  outputs=qr_output
636
  )
637
-
638
  gr.Markdown("""
639
  ### Usage Guidelines
640
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
@@ -646,11 +745,12 @@ def create_interface():
646
  """)
647
  return interface
648
 
 
649
  def check_network_connectivity():
650
  """Check if the network is working properly by testing connection to common sites"""
651
  test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
652
  results = []
653
-
654
  for site in test_sites:
655
  try:
656
  response = requests.get(site, timeout=5)
@@ -665,29 +765,28 @@ def check_network_connectivity():
665
  "status": f"Error: {str(e)}",
666
  "response_time": None
667
  })
668
-
669
  # If all sites failed, there might be a network issue
670
  if all(result["status"].startswith("Error") for result in results):
671
  logger.error("Network connectivity issue detected. All test sites failed.")
672
  return False, results
673
-
674
  return True, results
675
 
 
676
  # Add this to the main function
677
  def main():
678
  # Configure system settings
679
  mimetypes.init()
680
-
681
  # Check network connectivity
682
  network_ok, network_results = check_network_connectivity()
683
  if not network_ok:
684
  logger.warning("Network connectivity issues detected. Some features may not work properly.")
685
  for result in network_results:
686
  logger.warning(f"Test site {result['site']}: {result['status']}")
687
-
688
  # Create and launch interface
689
  interface = create_interface()
690
-
691
  # Launch with proper configuration
692
  interface.launch(
693
  server_name="0.0.0.0",
@@ -698,67 +797,6 @@ def main():
  debug=True
  )

 if __name__ == "__main__":
  main()
-
-
- def _fetch_with_selenium(self, url: str) -> Optional[str]:
- """Use Selenium as a fallback for difficult sites"""
- try:
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import TimeoutException
- import time
-
- logger.info(f"Attempting to fetch {url} with Selenium")
-
- # Set up Chrome options
- chrome_options = Options()
- chrome_options.add_argument("--headless")
- chrome_options.add_argument("--no-sandbox")
- chrome_options.add_argument("--disable-dev-shm-usage")
- chrome_options.add_argument("--disable-gpu")
- chrome_options.add_argument("--window-size=1920,1080")
- chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
- # Initialize the driver
- driver = webdriver.Chrome(options=chrome_options)
-
- try:
- # Navigate to the URL
- driver.get(url)
-
- # Wait for the page to load
- WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.TAG_NAME, "body"))
- )
-
- # Simulate pressing ESC key to dismiss overlays
- from selenium.webdriver.common.keys import Keys
- webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
-
- # Wait a bit for any animations to complete
- time.sleep(2)
-
- # Get the page source
- page_source = driver.page_source
-
- # Save the Selenium HTML for debugging
- debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
- with open(debug_path, "w", encoding="utf-8") as f:
- f.write(page_source)
- logger.info(f"Saved Selenium HTML to {debug_path}")
-
- return page_source
- finally:
- driver.quit()
-
- except ImportError:
- logger.error("Selenium is not installed. Cannot use browser automation.")
- return None
- except Exception as e:
- logger.error(f"Selenium processing failed for {url}: {e}")
- return None
 
72
  try:
73
  if not validators.url(url):
74
  return {'is_valid': False, 'message': 'Invalid URL format'}
 
75
  # Try with DNS resolution retry
76
  for attempt in range(3): # Try up to 3 times
77
  try:
 
86
  response.raise_for_status()
87
  # Close the connection to avoid downloading the entire content
88
  response.close()
89
+
90
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
91
  except requests.exceptions.ConnectionError as e:
92
  if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
93
+ logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
94
  time.sleep(1) # Wait a bit before retrying
95
  continue
96
  else:
97
  raise
98
  except Exception as e:
99
  raise
 
100
  # If we get here, all attempts failed
101
+ return {'is_valid': False,
102
+ 'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
103
  except Exception as e:
104
  logger.error(f"URL validation failed for {url}: {str(e)}")
105
  return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
 
108
  """Universal content fetcher with special case handling"""
109
  try:
110
  logger.info(f"Fetching content from: {url}")
111
+
112
  # Google Drive document handling
113
  if 'drive.google.com' in url:
114
  return self._handle_google_drive(url)
 
115
  # Google Calendar ICS handling
116
  if 'calendar.google.com' in url and 'ical' in url:
117
  return self._handle_google_calendar(url)
 
118
  # Try standard HTML processing first
119
  result = self._fetch_html_content(url)
120
+
121
  # If standard processing failed or returned minimal content, try with Selenium
122
  if not result or len(result.get('content', '')) < 100:
123
+ logger.info(
124
+ f"Standard processing failed or returned minimal content for {url}, trying Selenium")
125
  selenium_html = self._fetch_with_selenium(url)
 
126
  if selenium_html:
127
  # Process the Selenium HTML
128
  soup = BeautifulSoup(selenium_html, 'html.parser')
 
129
  # Remove unwanted elements
130
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
131
  element.decompose()
132
+
133
  # Apply the same content extraction strategies as in _fetch_html_content
134
  # Strategy 1: Look for semantic HTML5 elements
135
  main_content = None
136
+ for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post',
137
+ '.entry', '.page']:
138
  elements = soup.select(selector)
139
  if elements:
140
  main_content = elements[0]
141
  logger.info(f"Found content with selector: {selector}")
142
  break
143
+
144
  # If no main content found, use body
145
  if not main_content or not main_content.get_text(strip=True):
146
  main_content = soup.body if soup.body else soup
147
+
148
  # Extract text
149
  text_content = main_content.get_text(separator='\n', strip=True)
150
+
151
  # Clean content
152
  cleaned_content = self.advanced_text_cleaning(text_content)
153
+
154
  if len(cleaned_content) >= 20:
155
  result = {
156
  'content': cleaned_content,
 
159
  'url': url,
160
  'source': 'selenium' # Mark that this came from Selenium
161
  }
162
+ # Log the result status
 
163
  if result:
164
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
165
  else:
166
  logger.error(f"Failed to extract content from {url}")
 
167
  return result
168
  except Exception as e:
169
  logger.error(f"Content fetch failed for {url}: {e}")
 
173
  """Standard HTML content processing"""
174
  try:
175
  # Try with a different user agent if it's a social media site
176
+ if any(domain in url for domain in
177
+ ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
178
  # Use a more realistic browser user agent instead of random one
179
  self.session.headers.update({
180
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
 
194
  if 'facebook.com' in url and 'm.facebook.com' not in url:
195
  url = url.replace('www.facebook.com', 'm.facebook.com')
196
  logger.info(f"Switched to mobile Facebook URL: {url}")
197
+
198
+ # Add a delay to simulate human browsing
199
+ time.sleep(1)
200
+
201
  # Try to get the page with multiple attempts
202
  max_attempts = 3
203
  for attempt in range(max_attempts):
 
207
  break
208
  except (requests.exceptions.RequestException, Exception) as e:
209
  if attempt < max_attempts - 1:
210
+ logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}. Retrying...")
211
  time.sleep(2) # Wait longer between retries
212
  else:
213
  raise
 
214
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
215
+
216
  # Save the raw HTML for debugging if needed
217
  debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
218
  with open(debug_path, "w", encoding="utf-8") as f:
219
  f.write(response.text)
220
  logger.info(f"Saved raw HTML to {debug_path}")
221
+
222
  # Check if we got a valid response with content
223
  if not response.text or len(response.text) < 100:
224
  logger.error(f"Empty or very short response from {url}")
225
  return None
 
 
226
 
227
+ soup = BeautifulSoup(response.text, 'html.parser')
228
  # Remove unwanted elements
229
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
230
  element.decompose()
231
+
232
  # Simulate "ESC key" by removing login walls and overlays common on social media sites
233
  login_wall_selectors = [
234
+ '.login-wall', '.signup-wall', '.overlay', '.modal',
235
+ '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
236
  '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
237
  '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
238
  '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
 
243
  for element in soup.select(selector):
244
  logger.info(f"Removing login wall element: {selector}")
245
  element.decompose()
246
+
247
  # Enhanced removal for social media sites
248
  if 'facebook.com' in url:
249
  # Facebook specific elements - simulating ESC key
250
  fb_selectors = [
251
+ '[data-testid="cookie-policy-manage-dialog"]',
252
+ '[role="banner"]', '[role="complementary"]',
253
+ '.login_form_container', '.login_form', '#login_form',
254
+ '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
255
+ '._5hn6', '._67m7', '.nonLoggedInSignUp',
256
  '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
257
  ]
258
  for selector in fb_selectors:
259
  for element in soup.select(selector):
260
  element.decompose()
261
+
262
  # Look for the main content in mobile version
263
+ main_content = soup.select_one('#m_story_permalink_view') or soup.select_one(
264
+ '#mobile_injected_video_feed_pagelet')
265
  if main_content:
266
  logger.info("Found Facebook mobile main content")
267
+
268
  elif 'instagram.com' in url:
269
  # Instagram specific elements - simulating ESC key
270
  ig_selectors = [
271
+ '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
272
+ '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
273
+ '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
274
  '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
275
  '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
276
  ]
277
  for selector in ig_selectors:
278
  for element in soup.select(selector):
279
  element.decompose()
280
+
281
  # Try to find the main content
282
+ insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one(
283
+ '._ab1y')
284
  if insta_content:
285
  logger.info("Found Instagram main content")
286
+
287
  elif 'twitter.com' in url or 'x.com' in url:
288
  # X/Twitter already works well for public content, but clean up any remaining overlays
289
  x_selectors = [
290
  '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
291
  '[data-testid="sheetDialog"]', '[data-testid="mask"]',
292
+ '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
293
+ '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
294
  '.r-kemksi', '[data-testid="BottomBar"]'
295
  ]
296
  for selector in x_selectors:
297
  for element in soup.select(selector):
298
  element.decompose()
299
+
300
  elif 'huggingface.co' in url:
301
  # Special handling for Hugging Face
302
  logger.info("Applying special handling for Hugging Face")
303
  # Try to find the main content
304
+ hf_selectors = ['.prose', '.space-content', '.model-description',
305
+ '.dataset-description', 'article', '.markdown']
306
  for selector in hf_selectors:
307
  elements = soup.select(selector)
308
  if elements:
309
  logger.info(f"Found Hugging Face content with selector: {selector}")
310
  break
 
311
  # Extract content using a general approach - try multiple strategies
312
  # Strategy 1: Look for semantic HTML5 elements
313
  main_content = None
314
+ for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry',
315
+ '.page']:
316
  elements = soup.select(selector)
317
  if elements:
318
  main_content = elements[0]
319
  logger.info(f"Found content with selector: {selector}")
320
  break
321
+
322
  # Strategy 2: If no semantic elements, try common class names
323
  if not main_content or not main_content.get_text(strip=True):
324
  for div in soup.find_all('div'):
325
  class_name = div.get('class', [])
326
  id_name = div.get('id', '')
327
+ if any(term in ' '.join(class_name).lower() for term in
328
+ ['content', 'main', 'body', 'article', 'post']):
329
  main_content = div
330
  logger.info(f"Found content with div class: {class_name}")
331
  break
 
333
  main_content = div
334
  logger.info(f"Found content with div id: {id_name}")
335
  break
336
+
337
  # Strategy 3: Fall back to body
338
  if not main_content or not main_content.get_text(strip=True):
339
  logger.info(f"No main content container found for {url}, using body")
340
  main_content = soup.body if soup.body else soup
341
+
342
  # Extract text with proper spacing
343
  text_content = main_content.get_text(separator='\n', strip=True)
344
+
345
  # Strategy 4: If content is too short, extract all visible text
346
  if len(text_content) < 100:
347
  logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
 
350
  if element.get_text(strip=True):
351
  visible_text.append(element.get_text(strip=True))
352
  text_content = '\n'.join(visible_text)
353
+
354
  # Strategy 5: Last resort - get all text from the page
355
  if len(text_content) < 50:
356
  logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
357
  text_content = soup.get_text(separator='\n', strip=True)
 
358
  # Clean and structure content
359
  cleaned_content = self.advanced_text_cleaning(text_content)
360
+
361
  logger.info(f"Final content length: {len(cleaned_content)} chars")
362
+
363
  # If we still have no content, this is a failure
364
  if len(cleaned_content) < 20:
365
  logger.error(f"Failed to extract meaningful content from {url}")
 
375
  logger.error(f"HTML processing failed for {url}: {e}")
376
  return None
377
 
+    def _handle_google_drive(self, url: str) -> Optional[Dict]:
+        """Handle Google Drive document URLs"""
+        try:
+            # Construct direct download URL
+            file_id = url.split("/d/")[1].split("/")[0]
+            download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+            response = self.session.get(download_url, stream=True, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Read content (limit to the first 1MB)
+            content = b""
+            for chunk in response.iter_content(chunk_size=8192): # 8KB chunks
+                content += chunk
+                if len(content) > 1024 * 1024: # 1MB limit
+                    content = content[:1024 * 1024]
+                    logger.warning(f"Truncated Google Drive file after 1MB")
+                    break
+            text_content = content.decode('utf-8', errors='ignore')
+            cleaned_text = self.advanced_text_cleaning(text_content)
+
+            return {
+                'content': cleaned_text,
+                'content_type': 'text/plain', # Assume plain text for simplicity
+                'timestamp': datetime.now().isoformat(),
+                'url': url,
+                'source': 'google_drive'
+            }
+        except Exception as e:
+            logger.error(f"Error handling Google Drive URL {url}: {e}")
+            return None
+
+    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+        """Handle Google Calendar ICS URLs"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            text_content = response.text
+            cleaned_text = self.advanced_text_cleaning(text_content)
+            return {
+                'content': cleaned_text,
+                'content_type': 'text/calendar', # Correct MIME type
+                'timestamp': datetime.now().isoformat(),
+                'url': url,
+                'source': 'google_calendar'
+            }
+        except Exception as e:
+            logger.error(f"Error handling Google Calendar URL {url}: {e}")
+            return None
+
+    def _fetch_with_selenium(self, url: str) -> Optional[str]:
+        """Use Selenium as a fallback for difficult sites"""
+        try:
+            from selenium import webdriver
+            from selenium.webdriver.chrome.options import Options
+            from selenium.webdriver.common.by import By
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.common.exceptions import TimeoutException
+            import time
+
+            logger.info(f"Attempting to fetch {url} with Selenium")
+
+            # Set up Chrome options
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument(
+                "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+            # Initialize the driver
+            driver = webdriver.Chrome(options=chrome_options)
+
+            try:
+                # Navigate to the URL
+                driver.get(url)
+
+                # Wait for the page to load
+                WebDriverWait(driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "body"))
+                )
+
+                # Simulate pressing ESC key to dismiss overlays
+                from selenium.webdriver.common.keys import Keys
+                webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
+
+                # Wait a bit for any animations to complete
+                time.sleep(2)
+
+                # Get the page source
+                page_source = driver.page_source
+
+                # Save the Selenium HTML for debugging
+                debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+                with open(debug_path, "w", encoding="utf-8") as f:
+                    f.write(page_source)
+                logger.info(f"Saved Selenium HTML to {debug_path}")
+
+                return page_source
+            finally:
+                driver.quit()
+
+        except ImportError:
+            logger.error("Selenium is not installed. Cannot use browser automation.")
+            return None
+        except Exception as e:
+            logger.error(f"Selenium processing failed for {url}: {e}")
+            return None
+
+
 
490
  class FileProcessor:
491
  """Class to handle file processing"""
 
518
  dataset.extend(self._process_zip_file(file.name, temp_dir))
519
  else:
520
  dataset.extend(self._process_single_file(file))
521
+ except Exception as e:
522
  logger.error(f"Error processing file: {str(e)}")
523
  return []
524
  return dataset
 
543
  def _process_single_file(self, file) -> List[Dict]:
544
  try:
545
  file_stat = os.stat(file.name)
 
546
  # For very large files, read in chunks and summarize
547
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
548
  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
 
549
  # Read first and last 1MB for extremely large files
550
  content = ""
551
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
552
  content = f.read(1 * 1024 * 1024) # First 1MB
553
  content += "\n...[Content truncated due to large file size]...\n"
 
554
  # Seek to the last 1MB
555
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
556
  content += f.read() # Last 1MB
 
558
  # Regular file processing
559
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
560
  content = f.read()
 
561
  return [{
562
  'source': 'file',
563
  'filename': os.path.basename(file.name),
 
572
  logger.error(f"File processing error: {e}")
573
  return []
574
 
575
+
576
  # Move process_all_inputs outside of the FileProcessor class
577
  def process_all_inputs(urls, file, text, notes):
578
  """Process all input types with progress tracking"""
 
597
  'content': content,
598
  'timestamp': datetime.now().isoformat()
599
  })
 
600
  # Process files
601
  if file:
602
  results.extend(file_processor.process_file(file))
 
603
  # Process text input
604
  if text:
605
  cleaned_text = processor.advanced_text_cleaning(text)
 
608
  'content': cleaned_text,
609
  'timestamp': datetime.now().isoformat()
610
  })
 
611
  # Generate output
612
  if results:
613
  output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
 
616
 
617
  with open(output_path, 'w', encoding='utf-8') as f:
618
  json.dump(results, f, ensure_ascii=False, indent=2)
 
619
  summary = f"Processed {len(results)} items successfully!"
620
  json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
621
  return str(output_path), summary, json_data # Return JSON for editor
622
  else:
623
  return None, "No valid content to process.", ""
 
624
  except Exception as e:
625
  logger.error(f"Processing error: {e}")
626
  return None, f"Error: {str(e)}", ""
627
 
628
+
629
  # Also move generate_qr_code outside of the FileProcessor class
630
  def generate_qr_code(json_data):
631
  """Generate QR code from JSON data and return the file path."""
632
  if json_data:
633
  return generate_qr(json_data)
634
 
635
+
636
  # Move generate_qr outside of the FileProcessor class as well
637
  def generate_qr(json_data):
638
  """Generate QR code from JSON data and return the file path."""
 
645
  )
646
  qr.add_data(json_data)
647
  qr.make(fit=True)
648
+
649
  img = qrcode.make_image(fill_color="black", back_color="white")
650
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
651
  img.save(temp_file.name)
 
653
  except Exception as e:
654
  # If the data is too large for a QR code
655
  logger.error(f"QR generation error: {e}")
656
+
657
  # Create a simple QR with error message
658
  qr = qrcode.QRCode(
659
  version=1,
 
663
  )
664
  qr.add_data("Error: Data too large for QR code")
665
  qr.make(fit=True)
666
+
667
  img = qrcode.make_image(fill_color="black", back_color="white")
668
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
669
  img.save(temp_file.name)
670
  return temp_file.name
671
 
672
+
673
  def create_interface():
674
  """Create a comprehensive Gradio interface with advanced features"""
675
  css = """
 
677
  .warning { background-color: #fff3cd; color: #856404; }
678
  .error { background-color: #f8d7da; color: #721c24; }
679
  """
 
680
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
681
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
682
 
 
729
  inputs=[url_input, file_input, text_input, scratchpad],
730
  outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
731
  )
 
732
  qr_btn.click(
733
  generate_qr_code,
734
  inputs=json_editor,
735
  outputs=qr_output
736
  )
 
737
  gr.Markdown("""
738
  ### Usage Guidelines
739
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
 
745
  """)
746
  return interface
747
 
748
+
749
  def check_network_connectivity():
750
  """Check if the network is working properly by testing connection to common sites"""
751
  test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
752
  results = []
753
+
754
  for site in test_sites:
755
  try:
756
  response = requests.get(site, timeout=5)
 
765
  "status": f"Error: {str(e)}",
766
  "response_time": None
767
  })
 
768
  # If all sites failed, there might be a network issue
769
  if all(result["status"].startswith("Error") for result in results):
770
  logger.error("Network connectivity issue detected. All test sites failed.")
771
  return False, results
772
+
773
  return True, results
774
 
775
+
776
  # Add this to the main function
777
  def main():
778
  # Configure system settings
779
  mimetypes.init()
780
+
781
  # Check network connectivity
782
  network_ok, network_results = check_network_connectivity()
783
  if not network_ok:
784
  logger.warning("Network connectivity issues detected. Some features may not work properly.")
785
  for result in network_results:
786
  logger.warning(f"Test site {result['site']}: {result['status']}")
 
787
  # Create and launch interface
788
  interface = create_interface()
789
+
790
  # Launch with proper configuration
791
  interface.launch(
792
  server_name="0.0.0.0",
 
797
  debug=True
798
  )
799
 
800
+
801
  if __name__ == "__main__":
802
  main()