Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Mar 23

Commit

a2b3dd2

verified ·

1 Parent(s): 41b4ac1

Update app2.py

Browse files

Files changed (1) hide show

app2.py +276 -63

app2.py CHANGED Viewed

@@ -19,6 +19,8 @@ from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from cleantext import clean
 import qrcode# Setup logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
@@ -41,22 +43,227 @@ class URLProcessor:
         self.respect_robots = True
         self.use_proxy = False
         self.proxy_url = None
-        # Update session headers
-        self.session.headers.update({
-            'User-Agent': UserAgent().random,
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1'
-        })
         if self.use_proxy and self.proxy_url:
             self.session.proxies = {
                 'http': self.proxy_url,
                 'https': self.proxy_url
             }
     def check_robots_txt(self, url: str) -> bool:
         """Check if URL is allowed by robots.txt"""
@@ -368,7 +575,7 @@ class FileProcessor:
                             qr.add_data(json_str)
                             qr.make(fit=True)
-                            img = qr.make_image(fill_color="black", back_color="white")
                             output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
                             img.save(str(output_path))
                             paths.append(str(output_path))
@@ -395,70 +602,76 @@ class FileProcessor:
         except Exception as e:
             logger.error(f"QR generation error: {e}")
             return []
-    def decode_qr_code(image_path: str) -> Optional[str]:
-        """Decode QR code from an image file using ZXing"""
-        try:
-            reader = zxing.BarCodeReader()
-            result = reader.decode(image_path)
-            if result and result.parsed:
-                return result.parsed
-            logger.warning("No QR code found in image")
-            return None
-        except Exception as e:
-            logger.error(f"QR decoding error: {e}")
             return None
-    def decode_qr(image) -> List[str]:
-        """Decode all QR codes found in an image using ZXing"""
-        try:
-            if isinstance(image, str):
-                image_path = image
-            else:
-                # Save temporary image if input is not a path
-                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
-                    Image.fromarray(image).save(tmp.name)
-                    image_path = tmp.name
-            reader = zxing.BarCodeReader()
-            result = reader.decode(image_path)
-            if result and result.parsed:
-                return [result.parsed]
-            return []
-        except Exception as e:
-            logger.error(f"QR decoding error: {e}")
-            return []
-            raise ValueError("Unable to decode QR code")
-        except Exception as e:
-            logger.error(f"QR decoding error: {e}")
-            return None, None  # Return None for both data and resolution in case of error
-def datachat_trained(data_input: str, query: str) -> str:
-    """Handle trained data interaction logic"""
-    data = clean_json(data_input)
-    if not data:
-        return "Invalid JSON data provided."
-    return f"[Trained Mode]\nData: {json.dumps(data, indent=2)}\nQuery: {query}"
-def datachat_simple(data_input: str, query: str) -> str:
-    """Handle simple chat interaction logic"""
-    data = clean_json(data_input)
-    if not data:
-        return "Invalid JSON data provided."
-    return f"[Chat Mode]\nData: {json.dumps(data, indent=2)}\nQuestion: {query}"
 def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
-    """Interface for DataChat functionality"""
     data = None
     if data_source == "JSON Input":
         data = json_input
     elif data_source == "QR Code":
         try:
             decoded_data = decode_qr_code(qr_image)
-            data = decoded_data
             if not data:
                 return "No QR code found in the provided image."
         except Exception as e:

 from fake_useragent import UserAgent
 from cleantext import clean
 import qrcode# Setup logging
+import base64
+import io
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
         self.respect_robots = True
         self.use_proxy = False
         self.proxy_url = None
+        self.rate_limits = {}  # Track rate limits per domain
+        self.selenium_driver = None
+        # Update session headers with rotating user agents
+        self.update_user_agent()
         if self.use_proxy and self.proxy_url:
             self.session.proxies = {
                 'http': self.proxy_url,
                 'https': self.proxy_url
             }
+    def update_user_agent(self):
+        """Refresh the browser-like headers on ``self.session``.
+
+        A random User-Agent is requested from ``UserAgent().random``. If that
+        lookup raises, a warning is logged and a fixed desktop Chrome
+        User-Agent is used instead. The remaining headers (Accept,
+        Accept-Language, Accept-Encoding, ...) are applied in both cases, so a
+        failed lookup cannot leave the session with only a User-Agent set.
+        """
+        try:
+            user_agent = UserAgent().random
+        except Exception as e:
+            logger.warning(f"Failed to update user agent: {e}")
+            # Fallback to a common user agent
+            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        self.session.headers.update({
+            'User-Agent': user_agent,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Cache-Control': 'max-age=0'
+        })
+    def get_selenium_driver(self):
+        """Return the cached Chrome WebDriver, creating it on first use.
+
+        The driver is started headless with the session's current User-Agent
+        and stored on ``self.selenium_driver`` so later calls reuse it.
+
+        Returns:
+            The WebDriver instance, or ``None`` when Selenium could not be
+            imported or started (the error is logged).
+        """
+        if self.selenium_driver is None:
+            try:
+                # Imported lazily so Selenium is only required when it is used.
+                from selenium import webdriver
+                from selenium.webdriver.chrome.service import Service
+                from selenium.webdriver.chrome.options import Options
+                from webdriver_manager.chrome import ChromeDriverManager
+                chrome_flags = (
+                    "--headless",
+                    "--no-sandbox",
+                    "--disable-dev-shm-usage",
+                    f"user-agent={self.session.headers['User-Agent']}",
+                    "--disable-notifications",
+                    "--disable-popup-blocking",
+                    "--disable-extensions",
+                )
+                chrome_options = Options()
+                for flag in chrome_flags:
+                    chrome_options.add_argument(flag)
+                chrome_service = Service(ChromeDriverManager().install())
+                self.selenium_driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
+            except Exception as e:
+                logger.error(f"Failed to initialize Selenium: {e}")
+                return None
+        return self.selenium_driver
+    def handle_rate_limits(self, domain):
+        """Sleep as needed so one host is not requested faster than its minimum delay.
+
+        Args:
+            domain: URL (or bare host) about to be requested. Despite the
+                name, ``fetch_content`` passes the full URL.
+
+        ``self.rate_limits`` maps each host to ``(last_access_time, count)``.
+        Hosts are keyed by lower-cased hostname without port or credentials,
+        so ``https://Example.com:8443/a`` and ``example.com/b`` share a bucket.
+        """
+        from urllib.parse import urlparse
+        import time
+        # urlparse only recognises the host after a scheme or "//"; re-parse
+        # scheme-less input as a network location so it gets its own bucket
+        # instead of collapsing into the empty-string key.
+        parsed = urlparse(domain)
+        if not parsed.netloc:
+            parsed = urlparse(f"//{domain}")
+        host = parsed.hostname or parsed.netloc
+        # Compare whole DNS labels so "gov" matches nasa.gov / www.gov.uk but
+        # not hosts that merely contain the letters (e.g. govtrack.us).
+        labels = host.split('.')
+        current_time = time.time()
+        if host not in self.rate_limits:
+            # First time accessing this host
+            self.rate_limits[host] = (current_time, 1)
+            return
+        last_access, count = self.rate_limits[host]
+        # Different delay strategies for different hosts
+        if "facebook" in labels or "instagram" in labels:
+            min_delay = 5.0  # Longer delay for social media sites
+        elif "gov" in labels:
+            min_delay = 2.0  # Be respectful with government sites
+        else:
+            min_delay = self.request_delay
+        # Double the delay once this host has been requested more than 10 times
+        if count > 10:
+            min_delay *= 2
+        # Wait out whatever is left of the delay window
+        elapsed = current_time - last_access
+        if elapsed < min_delay:
+            time.sleep(min_delay - elapsed)
+        # Record the access time after any sleep, and bump the counter
+        self.rate_limits[host] = (time.time(), count + 1)
+    def handle_interactive_site(self, url):
+        """Load ``url`` in the Selenium browser and return the rendered page.
+
+        After a fixed 3-second wait, Facebook/Instagram and Google URLs get
+        their site-specific popup handling before the page source is read.
+
+        Returns:
+            A dict with ``content``, ``content_type``, ``url`` and ``title``,
+            or ``None`` when no driver is available or any step raises.
+        """
+        browser = self.get_selenium_driver()
+        if not browser:
+            return None
+        try:
+            browser.get(url)
+            # Fixed pause to let the page finish loading
+            import time
+            time.sleep(3)
+            is_social = "facebook.com" in url or "instagram.com" in url
+            if is_social:
+                self._handle_social_media_site(browser)
+            elif "google.com" in url:
+                self._handle_google_site(browser)
+            # Snapshot the DOM only after the site-specific interaction
+            html = browser.page_source
+            page_title = browser.title
+            return {
+                'content': html,
+                'content_type': 'text/html',
+                'url': url,
+                'title': page_title
+            }
+        except Exception as e:
+            logger.error(f"Error handling interactive site {url}: {e}")
+            return None
+    def _handle_social_media_site(self, driver):
+        """Best-effort dismissal of Facebook/Instagram login overlays.
+
+        Clicks the first button whose aria-label contains "Close", sends ESC
+        to dismiss remaining popups, then scrolls to mid-page to trigger
+        lazy-loaded content. Any Selenium error is logged as a warning and
+        swallowed, so the caller can still read whatever did load.
+
+        Args:
+            driver: Selenium WebDriver already navigated to the target page.
+        """
+        import time
+        # ActionChains is imported here explicitly: no ``webdriver`` name is
+        # in scope in this method, so ``webdriver.ActionChains`` would raise
+        # NameError and silently skip the ESC press and the scroll below.
+        from selenium.webdriver.common.action_chains import ActionChains
+        from selenium.webdriver.common.by import By
+        from selenium.webdriver.common.keys import Keys
+        try:
+            # Try to find and close login popups
+            close_buttons = driver.find_elements(By.XPATH, "//button[contains(@aria-label, 'Close')]")
+            if close_buttons:
+                close_buttons[0].click()
+                time.sleep(1)
+            # Press ESC key to dismiss popups
+            ActionChains(driver).send_keys(Keys.ESCAPE).perform()
+            time.sleep(1)
+            # Scroll down to load more content
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
+            time.sleep(2)
+        except Exception as e:
+            logger.warning(f"Error handling social media site: {e}")
+    def _handle_google_site(self, driver):
+        """Click through Google consent prompts, if any are present.
+
+        Looks first for an "Accept all" button and then for an "I agree"
+        button, clicking the first match of each and pausing one second
+        after every click. Errors are logged as warnings and swallowed.
+
+        Args:
+            driver: Selenium WebDriver already navigated to the target page.
+        """
+        from selenium.webdriver.common.by import By
+        # Checked in this order; the second lookup runs after the first click.
+        button_xpaths = (
+            "//button[contains(text(), 'Accept all')]",
+            "//button[contains(text(), 'I agree')]",
+        )
+        try:
+            for xpath in button_xpaths:
+                matches = driver.find_elements(By.XPATH, xpath)
+                if not matches:
+                    continue
+                matches[0].click()
+                time.sleep(1)
+        except Exception as e:
+            logger.warning(f"Error handling Google site: {e}")
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        """Fetch ``url`` via Selenium, cloudscraper or the plain session, chosen by host.
+
+        Routing is decided from the parsed hostname rather than by substring
+        search over the whole URL, so e.g. ``netflix.com`` is not mistaken
+        for ``x.com`` and a ``.gov`` appearing in a path or query string does
+        not select cloudscraper.
+
+        Args:
+            url: Absolute URL to fetch.
+
+        Returns:
+            For HTTP fetches, a dict with ``content``, ``content_type``,
+            ``url`` and ``status_code``. For Selenium-handled sites, whatever
+            ``handle_interactive_site`` returns. ``None`` when robots.txt
+            disallows the URL or every attempt fails.
+        """
+        import random
+        import time
+        from urllib.parse import urlparse
+        # Check if URL is allowed by robots.txt
+        if self.respect_robots and not self.check_robots_txt(url):
+            logger.warning(f"URL {url} is disallowed by robots.txt")
+            return None
+        # Apply rate limiting
+        self.handle_rate_limits(url)
+        # Rotate user agent occasionally
+        if random.random() < 0.3:  # 30% chance to rotate
+            self.update_user_agent()
+        parsed = urlparse(url)
+        host = (parsed.hostname or '').lower()
+        path = parsed.path.lower()
+
+        def host_is(site):
+            """True when host is ``site`` itself or one of its subdomains."""
+            return host == site or host.endswith('.' + site)
+
+        # Sites that need a real browser to get past login/consent walls
+        needs_selenium = any(host_is(site) for site in (
+            'facebook.com', 'instagram.com', 'linkedin.com',
+            'twitter.com', 'x.com'
+        )) or (host_is('google.com') and path.startswith('/search'))
+        # Hosts with a "gov" label (nasa.gov, www.gov.uk) or a Cloudflare
+        # hostname go through cloudscraper to cope with anti-bot checks
+        use_cloudscraper = 'cloudflare' in host or 'gov' in host.split('.')
+        for attempt in range(self.max_retries):
+            try:
+                if needs_selenium:
+                    # handle_interactive_site reports failure by returning
+                    # None, so this returns on the first attempt either way
+                    return self.handle_interactive_site(url)
+                if use_cloudscraper:
+                    import cloudscraper
+                    scraper = cloudscraper.create_scraper(
+                        browser={'browser': 'chrome', 'platform': 'darwin', 'mobile': False}
+                    )
+                    response = scraper.get(url, timeout=self.timeout)
+                else:
+                    # Standard request for most sites
+                    response = self.session.get(url, timeout=self.timeout)
+                response.raise_for_status()
+                return {
+                    'content': response.text,
+                    'content_type': response.headers.get('Content-Type', ''),
+                    'url': url,
+                    'status_code': response.status_code
+                }
+            except Exception as e:
+                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
+                if attempt < self.max_retries - 1:
+                    # Exponential backoff
+                    time.sleep(self.request_delay * (2 ** attempt))
+        logger.error(f"All attempts failed for {url}")
+        return None
     def check_robots_txt(self, url: str) -> bool:
         """Check if URL is allowed by robots.txt"""
                             qr.add_data(json_str)
                             qr.make(fit=True)
+                            img = qrcode.make_image(fill_color="black", back_color="white")
                             output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
                             img.save(str(output_path))
                             paths.append(str(output_path))
         except Exception as e:
             logger.error(f"QR generation error: {e}")
             return []
+def decode_qr_code(image_path: str) -> Optional[str]:
+    """Decode a QR code from an image file using OpenCV.
+
+    Args:
+        image_path: Path of the image file to read.
+
+    Returns:
+        The decoded text. When the text starts with ``%PDF`` or contains
+        non-ASCII characters it is returned as ``"base64:<payload>"``, where
+        the payload is the base64 of the text's UTF-8 bytes. ``None`` when
+        the image cannot be read, no QR code is found, or decoding raises.
+    """
+    try:
+        # Read image using OpenCV
+        img = cv2.imread(image_path)
+        if img is None:
+            logger.error(f"Failed to read image: {image_path}")
             return None
+        # Detect and decode on a grayscale copy
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        detector = cv2.QRCodeDetector()
+        data, vertices, _ = detector.detectAndDecode(gray)
+        if vertices is None or not data:
+            logger.warning("No QR code found in image")
+            return None
+        looks_binary = data.startswith("%PDF") or any(ord(c) > 127 for c in data)
+        if not looks_binary:
+            return data
+        try:
+            # UTF-8 can encode every character OpenCV hands back and matches
+            # consumers that decode the payload as UTF-8; latin-1 fails above
+            # U+00FF and garbles accented text on the way back.
+            # NOTE(review): assumes detectAndDecode returns UTF-8-decoded text - confirm
+            base64_data = base64.b64encode(data.encode('utf-8')).decode('ascii')
+            return f"base64:{base64_data}"
+        except Exception as e:
+            logger.error(f"Error encoding binary data: {e}")
+            # Fall back to the raw decoded text
+            return data
+    except Exception as e:
+        logger.error(f"QR decoding error: {e}")
+        return None
+# datachat_interface unwraps the "base64:"-prefixed payloads that decode_qr_code returns
 def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
+    """Interface for DataChat functionality with binary data support"""
     data = None
     if data_source == "JSON Input":
         data = json_input
     elif data_source == "QR Code":
         try:
             decoded_data = decode_qr_code(qr_image)
+            # Handle base64 encoded data
+            if decoded_data and decoded_data.startswith("base64:"):
+                base64_part = decoded_data[7:]  # Remove the "base64:" prefix
+                try:
+                    # For PDFs and other binary data, provide info about the content
+                    binary_data = base64.b64decode(base64_part)
+                    if binary_data.startswith(b"%PDF"):
+                        data = "The QR code contains a PDF document. Binary data cannot be processed directly."
+                    else:
+                        # Try to decode as text as a fallback
+                        data = binary_data.decode('utf-8', errors='replace')
+                except Exception as e:
+                    logger.error(f"Error processing base64 data: {e}")
+                    data = "The QR code contains binary data that cannot be processed directly."
+            else:
+                data = decoded_data
             if not data:
                 return "No QR code found in the provided image."
         except Exception as e: