Update app2.py

app2.py CHANGED
@@ -1,126 +1,120 @@
1   |   import json
2   | - import os
3   | - import re
4   |   import logging
5   |   import mimetypes
6   | - import
7   |   from PIL import Image
8   | - import
9   | - import
10  | - import
11  | - import tempfile
12  | - from datetime import datetime
13  | - from typing import List, Dict, Optional, Union, Any
14  | - from pathlib import Path
15  |   import requests
16  |   import validators
17  | - import
18  |   from bs4 import BeautifulSoup
19  | - from fake_useragent import UserAgent
20  |   from cleantext import clean
21  | - import
22-30 | - (imports and logging setup truncated in this view)
31  | -     handlers=[
32  | -         logging.StreamHandler(),
33  | -         logging.FileHandler('app.log', encoding='utf-8')
34  | -     ]
35  | - )
36  | - logger = logging.getLogger(__name__)
37  | -
38  | - # Ensure output directories exist
39  | - Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
40  |
41  |   class URLProcessor:
42-44 | - (truncated in this view)
45-49 | -     self. (assignments truncated in this view)
50  | -         self.rate_limits = {}  # Track rate limits per domain
51  | -         self.selenium_driver = None
52  |
53  | -         #
54  |           self.update_user_agent()
55  |
56-57 | - (truncated in this view)
58  | -             'http': self.proxy_url,
59  | -             'https': self.proxy_url
60  | -         }
61  |
62  |       def update_user_agent(self):
63  | -         """Rotate user
64-78 | - (method body truncated in this view)
79  | -         })
80  |
81  |       def get_selenium_driver(self):
82  | -         """
83  | -         if self.
84  | -             return self.
85  |
86  |           try:
87  | -             from selenium import webdriver
88  | -             from selenium.webdriver.chrome.service import Service
89  |               from selenium.webdriver.chrome.options import Options
90  |               from webdriver_manager.chrome import ChromeDriverManager
91  |
92  |               options = Options()
93-97 | -           options.add_argument( (arguments truncated in this view)
98  | -             options.add_argument("
99  | -             options.add_argument("--disable-extensions")
100 |
101 |               service = Service(ChromeDriverManager().install())
102 | -             self.
103 | -             return self.
104 |           except Exception as e:
105 |               logger.error(f"Failed to initialize Selenium: {e}")
106 |               return None
107 |
108 | -     def
109 | -         """
110-114 | - (method body truncated in this view)
115 |
116 | -         # Check if we've accessed this domain recently
117 |           current_time = time.time()
118 |           if parsed_domain in self.rate_limits:
119 |               last_access, count = self.rate_limits[parsed_domain]
120 |
121-123 | - (truncated in this view)
124 |           elif "gov" in parsed_domain:
125 |               min_delay = 2.0  # Be respectful with government sites
126 |           else:
@@ -216,59 +210,6 @@ class URLProcessor:
216 |           except Exception as e:
217 |               logger.warning(f"Error handling Google site: {e}")
218 |
219 | -     def fetch_content(self, url: str) -> Optional[Dict]:
220 | -         """Fetch content with smart handling for different sites"""
221 | -         # Check if URL is allowed by robots.txt
222 | -         if self.respect_robots and not self.check_robots_txt(url):
223 | -             logger.warning(f"URL {url} is disallowed by robots.txt")
224 | -             return None
225 | -
226 | -         # Apply rate limiting
227 | -         self.handle_rate_limits(url)
228 | -
229 | -         # Rotate user agent occasionally
230 | -         if random.random() < 0.3:  # 30% chance to rotate
231 | -             self.update_user_agent()
232 | -
233 | -         # Determine if site needs special handling
234 | -         needs_selenium = any(domain in url.lower() for domain in [
235 | -             'facebook.com', 'instagram.com', 'linkedin.com',
236 | -             'google.com/search', 'twitter.com', 'x.com'
237 | -         ])
238 | -
239 | -         for attempt in range(self.max_retries):
240 | -             try:
241 | -                 if needs_selenium:
242 | -                     return self.handle_interactive_site(url)
243 | -
244 | -                 # Try with cloudscraper first for sites with anti-bot measures
245 | -                 if any(domain in url.lower() for domain in ['cloudflare', '.gov']):
246 | -                     import cloudscraper
247 | -                     scraper = cloudscraper.create_scraper(
248 | -                         browser={'browser': 'chrome', 'platform': 'darwin', 'mobile': False}
249 | -                     )
250 | -                     response = scraper.get(url, timeout=self.timeout)
251 | -                 else:
252 | -                     # Standard request for most sites
253 | -                     response = self.session.get(url, timeout=self.timeout)
254 | -
255 | -                 response.raise_for_status()
256 | -
257 | -                 return {
258 | -                     'content': response.text,
259 | -                     'content_type': response.headers.get('Content-Type', ''),
260 | -                     'url': url,
261 | -                     'status_code': response.status_code
262 | -                 }
263 | -             except Exception as e:
264 | -                 logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
265 | -                 if attempt < self.max_retries - 1:
266 | -                     # Exponential backoff
267 | -                     time.sleep(self.request_delay * (2 ** attempt))
268 | -
269 | -         logger.error(f"All attempts failed for {url}")
270 | -         return None
271 | -
272 |       def check_robots_txt(self, url: str) -> bool:
273 |           """Check if URL is allowed by robots.txt"""
274 |           if not self.respect_robots:
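The removed fetch_content above retried failed requests with exponential backoff (`request_delay * 2 ** attempt`). A minimal standalone sketch of that retry pattern, assuming a `requests.Session` and the same `max_retries`/`request_delay` knobs as the class (the function name is illustrative, not from the file):

```python
import time
import requests

def fetch_with_backoff(session: requests.Session, url: str,
                       max_retries: int = 3, request_delay: float = 1.0,
                       timeout: int = 30):
    """Retry a GET with exponential backoff, as in the removed fetch_content."""
    for attempt in range(max_retries):
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt < max_retries - 1:
                # waits 1.0s, 2.0s, 4.0s, ... between attempts
                time.sleep(request_delay * (2 ** attempt))
    return None
```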
@@ -290,65 +231,6 @@ class URLProcessor:
290 |               logger.warning(f"Error checking robots.txt: {e}")
291 |               return True
292 |
293 | -     def fetch_content(self, url: str) -> Optional[Dict]:
294 | -         """Fetch content with built-in rate limiting and robots.txt checking"""
295 | -         if not self.check_robots_txt(url):
296 | -             logger.warning(f"URL {url} is disallowed by robots.txt")
297 | -             return None
298 | -
299 | -         time.sleep(self.request_delay)  # Basic rate limiting
300 | -
301 | -         for attempt in range(self.max_retries):
302 | -             try:
303 | -                 if 'drive.google.com' in url:
304 | -                     return self._handle_google_drive(url)
305 | -                 if 'calendar.google.com' in url:
306 | -                     return self._handle_google_calendar(url)
307 | -                 return self._fetch_html_content(url)
308 | -             except Exception as e:
309 | -                 logger.error(f"Attempt {attempt + 1} failed: {e}")
310 | -                 if attempt < self.max_retries - 1:
311 | -                     time.sleep(self.request_delay * (attempt + 1))
312 | -
313 | -         return None
314 | -
315 | -     def advanced_text_cleaning(self, text: str) -> str:
316 | -         """Robust text cleaning with version compatibility"""
317 | -         try:
318 | -             cleaned_text = clean(
319 | -                 text,
320 | -                 fix_unicode=True,
321 | -                 to_ascii=True,
322 | -                 lower=True,
323 | -                 no_line_breaks=True,
324 | -                 no_urls=True,
325 | -                 no_emails=True,
326 | -                 no_phone_numbers=True,
327 | -                 no_numbers=False,
328 | -                 no_digits=False,
329 | -                 no_currency_symbols=True,
330 | -                 no_punct=False
331 | -             ).strip()
332 | -             return cleaned_text
333 | -         except Exception as e:
334 | -             logger.warning(f"Text cleaning error: {e}. Using fallback method.")
335 | -             text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
336 | -             text = text.encode('ascii', 'ignore').decode('ascii')
337 | -             text = re.sub(r'\s+', ' ', text)
338 | -             return text.strip()
339 | -
340 | -     def validate_url(self, url: str) -> Dict:
341 | -         """Validate URL format and accessibility"""
342 | -         try:
343 | -             if not validators.url(url):
344 | -                 return {'is_valid': False, 'message': 'Invalid URL format'}
345 | -
346 | -             response = self.session.head(url, timeout=self.timeout)
347 | -             response.raise_for_status()
348 | -             return {'is_valid': True, 'message': 'URL is valid and accessible'}
349 | -         except Exception as e:
350 | -             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
351 | -
352 |       def fetch_content(self, url: str) -> Optional[Dict]:
353 |           """Universal content fetcher with special case handling"""
354 |           try:
@@ -397,30 +279,90 @@ class URLProcessor:
397 |               return None
398 |
399 |       def _fetch_html_content(self, url: str) -> Optional[Dict]:
400 | -         """
401 |           try:
402 |               response = self.session.get(url, timeout=self.timeout)
403 |               response.raise_for_status()
404 |
405 |               soup = BeautifulSoup(response.text, 'html.parser')
406-420 | - (extraction logic truncated in this view)
421 |
422 |               return {
423 | -                 '
424 |                   'content_type': response.headers.get('Content-Type', ''),
425 |                   'timestamp': datetime.now().isoformat()
426 |               }
@@ -428,6 +370,146 @@ class URLProcessor:
428 |               logger.error(f"HTML processing failed: {e}")
429 |               return None
430 |
431 |   class FileProcessor:
432 |       """Class to handle file processing with enhanced capabilities"""
433 |
@@ -679,6 +761,7 @@ class FileProcessor:
679 |           if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
680 |               logger.info(f"Processing large file: {file_path} ({file_stat.st_size} bytes)")
681 |
682 |               with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
683 |                   content = f.read(1 * 1024 * 1024)  # First 1MB
684 |                   content += "\n...[Content truncated due to large file size]...\n"
@@ -686,312 +769,480 @@ class FileProcessor:
686 |               f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
687 |               content += f.read()  # Last 1MB
688 |           else:
689 | -             with open(
690 |                   content = f.read()
691 |       else:
692 | -         # For binary files,
693-694 | - (truncated in this view)
695 |           return [{
696 | -             'source': '
697 | -             'filename': os.path.basename(file.name),
698 | -             '
699 | -             'mime_type': mimetypes.guess_type(file.name)[0],
700 | -             'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
701 | -             'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
702 | -             'content': content,
703 |               'timestamp': datetime.now().isoformat()
704 |           }]
705 | -     except Exception as e:
706 | -         logger.error(f"File processing error: {e}")
707 | -         return []
708 |
709 | -     def
710 | -         """
711 |           try:
712-720 | - (method body truncated in this view)
721 |           except Exception as e:
722 | -             logger.error(f"
723 | -             return
724-726 | - (truncated in this view)
727 |           try:
728-735 | - (QR code setup truncated in this view)
736 | -                 error_correction=qrcode.constants.ERROR_CORRECT_L,
737 | -                 box_size=10,
738 | -                 border=4,
739 | -             )
740 | -             json_str = json.dumps(cleaned_data, ensure_ascii=False)
741 | -             qr.add_data(json_str)
742 | -             qr.make(fit=True)
743 | -
744 | -             img = qr.make_image(fill_color="black", back_color="white")
745 | -             output_path = output_dir / f'combined_qr_{int(time.time())}.png'
746 | -             img.save(str(output_path))
747 | -             return [str(output_path)]
748 | -         else:
749 | -             if isinstance(data, list):
750 | -                 paths = []
751 | -                 for idx, item in enumerate(data):
752 | -                     cleaned_item = self.clean_json(item)
753 | -                     if cleaned_item:
754 | -                         qr = qrcode.QRCode(
755 | -                             version=None,
756 | -                             error_correction=qrcode.constants.ERROR_CORRECT_L,
757 | -                             box_size=10,
758 | -                             border=4,
759 | -                         )
760 | -                         json_str = json.dumps(cleaned_item, ensure_ascii=False)
761 | -                         qr.add_data(json_str)
762 | -                         qr.make(fit=True)
763 | -
764 | -                         img = qrcode.make_image(fill_color="black", back_color="white")
765 | -                         output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
766 | -                         img.save(str(output_path))
767 | -                         paths.append(str(output_path))
768 | -                 return paths
769 | -             else:
770 | -                 cleaned_item = self.clean_json(data)
771 | -                 if cleaned_item:
772 | -                     qr = qrcode.QRCode(
773 | -                         version=None,
774 | -                         error_correction=qrcode.constants.ERROR_CORRECT_L,
775 | -                         box_size=10,
776 | -                         border=4,
777 | -                     )
778 | -                     json_str = json.dumps(cleaned_item, ensure_ascii=False)
779 | -                     qr.add_data(json_str)
780 | -                     qr.make(fit=True)
781 | -
782 | -                     img = qrcode.make_image(fill_color="black", back_color="white")
783 | -                     output_path = output_dir / f'single_qr_{int(time.time())}.png'
784 | -                     img.save(str(output_path))
785 | -                     return [str(output_path)]
786 |
787 | - (truncated in this view)
788 |           except Exception as e:
789 | -             logger.error(f"
790 | -             return [
791-798 | - (truncated in this view)
799 |
800 | - (truncated in this view)
801 | -         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
802 | -
803 | -         # Initialize QRCode detector
804 | -         detector = cv2.QRCodeDetector()
805 | -
806 | -         # Detect and decode
807 | -         data, vertices, _ = detector.detectAndDecode(gray)
808 | -
809 | -         if vertices is not None and data:
810 | -             # Check if this might be binary data (like a PDF)
811 | -             if data.startswith("%PDF") or not all(ord(c) < 128 for c in data):
812 | -                 # This is likely binary data, encode as base64
813 | -                 try:
814 | -                     # If it's already a string representation, convert to bytes first
815 | -                     if isinstance(data, str):
816 | -                         data_bytes = data.encode('latin-1')  # Use latin-1 to preserve byte values
817 | -                     else:
818 | -                         data_bytes = data
819 | -
820 | -                     # Encode as base64
821 | -                     base64_data = base64.b64encode(data_bytes).decode('ascii')
822 | -                     return f"base64:{base64_data}"
823 | -                 except Exception as e:
824 | -                     logger.error(f"Error encoding binary data: {e}")
825 |
826-828 | - (truncated in this view)
829 | -         return None
830 | -     except Exception as e:
831 | -         logger.error(f"QR decoding error: {e}")
832 | -         return None
833 | -
834 | - # Also update the datachat_interface function to handle base64 data
835 | - def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
836 | -     """Interface for DataChat functionality with binary data support"""
837 | -     data = None
838 | -     if data_source == "JSON Input":
839 | -         data = json_input
840 | -     elif data_source == "QR Code":
841 | -         try:
842 | -             decoded_data = decode_qr_code(qr_image)
843 |
844 | -             #
845-851 | - (truncated in this view)
852 | -             else:
853 | -                 # Try to decode as text as a fallback
854 | -                 data = binary_data.decode('utf-8', errors='replace')
855 | -         except Exception as e:
856 | -             logger.error(f"Error processing base64 data: {e}")
857 | -             data = "The QR code contains binary data that cannot be processed directly."
858 | -     else:
859 | -         data = decoded_data
860 |
861-862 | - (truncated in this view)
863 |       except Exception as e:
864-874 | - (truncated in this view)
875 | - # Replace the create_interface function with this version
876 | - def create_interface():
877 | -     """Create a comprehensive Gradio interface with advanced features"""
878 | -     css = """
879 | -     .container { max-width: 1200px; margin: auto; }
880 | -     .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
881 | -     .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
882 | -     .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
883 | -     """
884 |
885-887 | - (truncated in this view)
888 | -         inputs=[
889 | -             gr.Radio(["Trained with Data", "Chat about Data"], label="Mode"),
890 | -             gr.Radio(["JSON Input", "QR Code"], label="Data Source"),
891 | -             gr.Textbox(lines=8, label="JSON Data"),
892 | -             gr.Image(label="QR Code Image", type="filepath"),
893 | -             gr.Textbox(label="Query")
894 | -         ],
895 | -         outputs=gr.Textbox(label="Response"),
896 | -         title="Advanced Data Processor & QR Code Generator",
897 | -         description="# 🌐 Advanced Data Processing & QR Code Generator",
898 | -         css=css
899 | -     )
900 |
901-913 | - (truncated in this view)
914 |
915-926 | - (truncated in this view)
927 |       else:
928-930 | - (truncated in this view)
931 |
932-933 | - (truncated in this view)
934 |       try:
935-937 | - (truncated in this view)
938 | -             sys.exit(1)
939 | -
940 | -         from interface import Interface
941 | -
942 | -         logger.info("Starting web interface...")
943 | -         interface = Interface()
944 | -         interface.launch(share=args.share)
945 | -
946 | -     # Run in CLI mode
947 | -     elif args.mode == 'cli':
948 | -         if not args.url and not args.file:
949 | -             logger.error("In CLI mode, you must provide either --url or --file")
950 | -             sys.exit(1)
951 | -
952 | -         results = []
953 | -
954 | -         # Process URL if provided
955 | -         if args.url:
956 | -             from url_processor import URLProcessor
957 | -
958 | -             logger.info(f"Processing URL: {args.url}")
959 | -             url_processor = URLProcessor()
960 | -             url_results = url_processor.process_urls([args.url])
961 | -             results.extend(url_results)
962 | -
963 | -         # Process file if provided
964 | -         if args.file:
965 | -             from file_processor import FileProcessor
966 |
967-973 | - (truncated in this view)
974 | -             #
975-976 | - (truncated in this view)
977 | -             self.name = path
978 |
979-981 | - (truncated in this view)
982 | -         # Save results
983 | -         if results:
984 | -             from utils import save_results
985 |
986 | - (truncated in this view)
987 | -             filepath = save_results(results, output_dir)
988 |
989-994 | - (truncated in this view)
995 |
996 |   if __name__ == "__main__":
997 |       main()
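The removed decode_qr_code above wrapped binary QR payloads as a "base64:..." string, and the removed datachat_interface then had to unwrap that prefix before use. A small sketch of that round trip, assuming the same prefix convention (the helper names are illustrative, not from the file):

```python
import base64

PREFIX = "base64:"

def wrap_binary(data_bytes: bytes) -> str:
    """Encode binary QR payloads the way the removed decode_qr_code did."""
    return PREFIX + base64.b64encode(data_bytes).decode("ascii")

def unwrap_payload(payload: str):
    """Return raw bytes for wrapped payloads, otherwise the text unchanged."""
    if payload.startswith(PREFIX):
        return base64.b64decode(payload[len(PREFIX):])
    return payload

# Round trip: a PDF-like byte string survives the text-only QR channel.
restored = unwrap_payload(wrap_binary(b"%PDF-1.4 ..."))
```

The updated file follows below.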
1   | + import base64
2   | + import gradio as gr
3   | + import hashlib
4   | + import io
5   |   import json
6   |   import logging
7   |   import mimetypes
8   | + import os
9   |   from PIL import Image
10  | + import qrcode# Setup logging
11  | + import random
12  | + import re
13  |   import requests
14  | + import tempfile
15  | + import time
16  |   import validators
17  | + import zipfile
18  | + import zxing
19  |   from bs4 import BeautifulSoup
20  |   from cleantext import clean
21  | + from datetime import datetime
22  | + from fake_useragent import UserAgent
23  | + from file_processor import FileProcessor
24  | + from pathlib import Path
25  | + from qr_processor import QRProcessor
26  | + from selenium import webdriver
27  | + from typing import List, Dict, Optional, Union, Any
28  | + from url_processor import URLProcessor
29  | + from urllib.parse import urlparse
30  | + from utils import save_results, extract_urls_from_text, format_results_as_markdown
31  |
32  | + # Configure logging
33  | + import logging
34  | + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
35  | + logger = logging.getLogger('App')
36  | +
37  | + # URLProcessor class
38  | + # ===================
39  |   class URLProcessor:
40  | +     """Class to handle URL processing with advanced features"""
41  | +
42  | +     def __init__(self, request_delay: float = 1.0, timeout: int = 30, max_retries: int = 3, respect_robots: bool = True):
43  | +         self.request_delay = request_delay
44  | +         self.timeout = timeout
45  | +         self.max_retries = max_retries
46  | +         self.respect_robots = respect_robots
47  | +         self.rate_limits = {}  # Domain -> (last_access_time, count)
48  |
49  | +         # Initialize session with rotating user agents
50  | +         self.session = requests.Session()
51  |           self.update_user_agent()
52  |
53  | +         # Selenium driver (lazy initialization)
54  | +         self._driver = None
55  |
56  |       def update_user_agent(self):
57  | +         """Rotate user agent to avoid detection"""
58  | +         user_agents = [
59  | +             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
60  | +             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
61  | +             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
62  | +             'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
63  | +         ]
64  | +         self.session.headers.update({
65  | +             'User-Agent': random.choice(user_agents),
66  | +             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
67  | +             'Accept-Language': 'en-US,en;q=0.5',
68  | +             'Connection': 'keep-alive',
69  | +             'Upgrade-Insecure-Requests': '1',
70  | +             'Pragma': 'no-cache',
71  | +             'Cache-Control': 'no-cache',
72  | +         })
73  |
74  |       def get_selenium_driver(self):
75  | +         """Get or create Selenium WebDriver with proper settings"""
76  | +         if self._driver is not None:
77  | +             return self._driver
78  |
79  |           try:
80  |               from selenium.webdriver.chrome.options import Options
81  | +             from selenium.webdriver.chrome.service import Service
82  |               from webdriver_manager.chrome import ChromeDriverManager
83  |
84  |               options = Options()
85  | +             options.add_argument('--headless')
86  | +             options.add_argument('--no-sandbox')
87  | +             options.add_argument('--disable-dev-shm-usage')
88  | +             options.add_argument('--disable-gpu')
89  | +             options.add_argument('--window-size=1920,1080')
90  | +             options.add_argument(f'user-agent={self.session.headers["User-Agent"]}')
91  |
92  |               service = Service(ChromeDriverManager().install())
93  | +             self._driver = webdriver.Chrome(service=service, options=options)
94  | +             return self._driver
95  |           except Exception as e:
96  |               logger.error(f"Failed to initialize Selenium: {e}")
97  |               return None
98  |
99  | +     def close(self):
100 | +         """Close resources"""
101 | +         if self._driver is not None:
102 | +             self._driver.quit()
103 | +             self._driver = None
104 | +
105 | +     def handle_rate_limits(self, url: str):
106 | +         """Implement rate limiting per domain"""
107 | +         parsed_url = urlparse(url)
108 | +         parsed_domain = parsed_url.netloc
109 |
110 |           current_time = time.time()
111 |           if parsed_domain in self.rate_limits:
112 |               last_access, count = self.rate_limits[parsed_domain]
113 |
114 | +         # Determine appropriate delay based on domain
115 | +         min_delay = self.request_delay
116 | +         if "linkedin.com" in parsed_domain:
117 | +             min_delay = 5.0  # LinkedIn is sensitive to scraping
118 |           elif "gov" in parsed_domain:
119 |               min_delay = 2.0  # Be respectful with government sites
120 |           else:
210 |           except Exception as e:
211 |               logger.warning(f"Error handling Google site: {e}")
212 |
213 |       def check_robots_txt(self, url: str) -> bool:
214 |           """Check if URL is allowed by robots.txt"""
215 |           if not self.respect_robots:

231 |               logger.warning(f"Error checking robots.txt: {e}")
232 |               return True
233 |

234 |       def fetch_content(self, url: str) -> Optional[Dict]:
235 |           """Universal content fetcher with special case handling"""
236 |           try:
279 |               return None
280 |
281 |       def _fetch_html_content(self, url: str) -> Optional[Dict]:
282 | +         """Enhanced HTML content processing to extract everything"""
283 |           try:
284 |               response = self.session.get(url, timeout=self.timeout)
285 |               response.raise_for_status()
286 |
287 | +             # Store the original HTML
288 | +             original_html = response.text
289 | +
290 | +             # Parse with BeautifulSoup
291 |               soup = BeautifulSoup(response.text, 'html.parser')
292 | +
293 | +             # Extract all text content
294 | +             text_content = soup.get_text(separator='\n', strip=True)
295 | +
296 | +             # Extract all links
297 | +             links = []
298 | +             for link in soup.find_all('a', href=True):
299 | +                 href = link['href']
300 | +                 # Convert relative URLs to absolute
301 | +                 if href.startswith('/'):
302 | +                     from urllib.parse import urlparse, urljoin
303 | +                     parsed_url = urlparse(url)
304 | +                     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
305 | +                     href = urljoin(base_url, href)
306 |
307 | +                 link_text = link.get_text(strip=True)
308 | +                 links.append({
309 | +                     'url': href,
310 | +                     'text': link_text if link_text else '[No text]'
311 | +                 })
312 |
313 | +             # Extract all images
314 | +             images = []
315 | +             for img in soup.find_all('img', src=True):
316 | +                 src = img['src']
317 | +                 # Convert relative URLs to absolute
318 | +                 if src.startswith('/'):
319 | +                     from urllib.parse import urlparse, urljoin
320 | +                     parsed_url = urlparse(url)
321 | +                     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
322 | +                     src = urljoin(base_url, src)
323 | +
324 | +                 alt_text = img.get('alt', '')
325 | +                 images.append({
326 | +                     'src': src,
327 | +                     'alt': alt_text if alt_text else '[No alt text]'
328 | +                 })
329 | +
330 | +             # Extract all scripts
331 | +             scripts = []
332 | +             for script in soup.find_all('script'):
333 | +                 script_content = script.string
334 | +                 if script_content:
335 | +                     scripts.append(script_content)
336 | +
337 | +             # Extract all styles
338 | +             styles = []
339 | +             for style in soup.find_all('style'):
340 | +                 style_content = style.string
341 | +                 if style_content:
342 | +                     styles.append(style_content)
343 | +
344 | +             # Extract metadata
345 | +             metadata = {}
346 | +             for meta in soup.find_all('meta'):
347 | +                 if meta.get('name') and meta.get('content'):
348 | +                     metadata[meta['name']] = meta['content']
349 | +                 elif meta.get('property') and meta.get('content'):
350 | +                     metadata[meta['property']] = meta['content']
351 |
352 | +             # Extract title
353 | +             title = soup.title.string if soup.title else ''
354 |
355 | +             # Return comprehensive data
356 |               return {
357 | +                 'url': url,
358 | +                 'title': title,
359 | +                 'metadata': metadata,
360 | +                 'content': text_content,
361 | +                 'html': original_html,
362 | +                 'links': links,
363 | +                 'images': images,
364 | +                 'scripts': scripts,
365 | +                 'styles': styles,
366 |                   'content_type': response.headers.get('Content-Type', ''),
367 |                   'timestamp': datetime.now().isoformat()
368 |               }

370 |               logger.error(f"HTML processing failed: {e}")
371 |               return None
372 |
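A short sketch of how a caller might consume the dictionary returned by the new _fetch_html_content above; the field names come from its return statement, while the class instance and URL are only example assumptions:

```python
# assumes the in-file URLProcessor class defined above
processor = URLProcessor(request_delay=1.0, timeout=30)
result = processor._fetch_html_content("https://example.com")  # example URL

if result is not None:
    print(result["title"])                       # <title> text, may be ''
    print(len(result["links"]), "links found")   # each: {'url': ..., 'text': ...}
    print(len(result["images"]), "images found") # each: {'src': ..., 'alt': ...}
    first_lines = result["content"].splitlines()[:5]  # plain-text extraction
```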
373 | +     def advanced_text_cleaning(self, text: str) -> str:
374 | +         """Robust text cleaning with version compatibility"""
375 | +         try:
376 | +             # Try to use cleantext if available
377 | +             import importlib.util
378 | +             if importlib.util.find_spec("cleantext") is not None:
379 | +                 from cleantext import clean
380 | +                 cleaned_text = clean(
381 | +                     text,
382 | +                     fix_unicode=True,
383 | +                     to_ascii=True,
384 | +                     lower=True,
385 | +                     no_line_breaks=True,
386 | +                     no_urls=True,
387 | +                     no_emails=True,
388 | +                     no_phone_numbers=True,
389 | +                     no_numbers=False,
390 | +                     no_digits=False,
391 | +                     no_currency_symbols=True,
392 | +                     no_punct=False
393 | +                 ).strip()
394 | +                 return cleaned_text
395 | +             else:
396 | +                 # Fallback cleaning
397 | +                 text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
398 | +                 text = text.encode('ascii', 'ignore').decode('ascii')
399 | +                 text = re.sub(r'\s+', ' ', text)
400 | +                 return text.strip()
401 | +         except Exception as e:
402 | +             logger.warning(f"Text cleaning error: {e}")
403 | +             return text.strip() if text else ""
404 | +
405 | +     def process_urls(self, urls: List[str], mode: str = 'basic') -> List[Dict]:
406 | +         """Process a list of URLs with different modes"""
407 | +         results = []
408 | +
409 | +         for url in urls:
410 | +             # Validate URL
411 | +             if not validators.url(url):
412 | +                 results.append({
413 | +                     'url': url,
414 | +                     'error': 'Invalid URL format',
415 | +                     'timestamp': datetime.now().isoformat()
416 | +                 })
417 | +                 continue
418 | +
419 | +             # Check robots.txt
420 | +             if not self.check_robots_txt(url):
421 | +                 results.append({
422 | +                     'url': url,
423 | +                     'error': 'Access disallowed by robots.txt',
424 | +                     'timestamp': datetime.now().isoformat()
425 | +                 })
426 | +                 continue
427 | +
428 | +             # Apply rate limiting
429 | +             self.handle_rate_limits(url)
430 | +
431 | +             # Process based on mode
432 | +             try:
433 | +                 if mode == 'basic':
434 | +                     content = self.fetch_content(url)
435 | +                     if content:
436 | +                         results.append(content)
437 | +                     else:
438 | +                         results.append({
439 | +                             'url': url,
440 | +                             'error': 'Failed to fetch content',
441 | +                             'timestamp': datetime.now().isoformat()
442 | +                         })
443 | +
444 | +                 elif mode == 'interactive':
445 | +                     content = self.handle_interactive_site(url)
446 | +                     if content:
447 | +                         results.append(content)
448 | +                     else:
449 | +                         # Fallback to basic mode
450 | +                         content = self.fetch_content(url)
451 | +                         if content:
452 | +                             results.append(content)
453 | +                         else:
454 | +                             results.append({
455 | +                                 'url': url,
456 | +                                 'error': 'Failed to fetch content in interactive mode',
457 | +                                 'timestamp': datetime.now().isoformat()
458 | +                             })
459 | +
460 | +                 elif mode == 'deep':
461 | +                     # Deep mode: get main content and follow some links
462 | +                     main_content = self.fetch_content(url)
463 | +                     if not main_content:
464 | +                         results.append({
465 | +                             'url': url,
466 | +                             'error': 'Failed to fetch main content',
467 | +                             'timestamp': datetime.now().isoformat()
468 | +                         })
469 | +                         continue
470 | +
471 | +                     results.append(main_content)
472 | +
473 | +                     # Follow up to 5 links from the main page
474 | +                     if 'links' in main_content and main_content['links']:
475 | +                         followed_count = 0
476 | +                         for link_data in main_content['links'][:10]:  # Consider first 10 links
477 | +                             link_url = link_data['url']
478 | +
479 | +                             # Skip external links and non-http(s) links
480 | +                             if not link_url.startswith(('http://', 'https://')):
481 | +                                 continue
482 | +
483 | +                             # Skip if not same domain
484 | +                             main_domain = urlparse(url).netloc
485 | +                             link_domain = urlparse(link_url).netloc
486 | +                             if main_domain != link_domain:
487 | +                                 continue
488 | +
489 | +                             # Apply rate limiting
490 | +                             self.handle_rate_limits(link_url)
491 | +
492 | +                             # Fetch the linked content
493 | +                             link_content = self.fetch_content(link_url)
494 | +                             if link_content:
495 | +                                 results.append(link_content)
496 | +                                 followed_count += 1
497 | +
498 | +                             # Limit to 5 followed links
499 | +                             if followed_count >= 5:
500 | +                                 break
501 | +
502 | +             except Exception as e:
503 | +                 logger.error(f"Error processing URL {url}: {e}")
504 | +                 results.append({
505 | +                     'url': url,
506 | +                     'error': f"Processing error: {str(e)}",
507 | +                     'timestamp': datetime.now().isoformat()
508 | +                 })
509 | +         return results
510 | +
511 | + # FileProcessor class
512 | + # ===================
513 |   class FileProcessor:
514 |       """Class to handle file processing with enhanced capabilities"""
515 |

761 |           if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
762 |               logger.info(f"Processing large file: {file_path} ({file_stat.st_size} bytes)")
763 |
764 | +             content = ""
765 |               with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
766 |                   content = f.read(1 * 1024 * 1024)  # First 1MB
767 |                   content += "\n...[Content truncated due to large file size]...\n"

769 |                   f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
770 |                   content += f.read()  # Last 1MB
771 |           else:
772 | +             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
773 |                   content = f.read()
774 | +
775 | +             return [{
776 | +                 'source': 'file',
777 | +                 'filename': filename,
778 | +                 'file_size': file_stat.st_size,
779 | +                 'mime_type': mime_type,
780 | +                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
781 | +                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
782 | +                 'content': content,
783 | +                 'timestamp': datetime.now().isoformat()
784 | +             }]
785 |       else:
786 | +         # For binary files, extract metadata and try specialized extraction
787 | +         if file_path.endswith(('.pdf', '.doc', '.docx')):
788 | +             return self._process_document_file(file_path)
789 | +         elif file_path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp')):
790 | +             return self._process_image_file(file_path)
791 | +         elif file_path.endswith(('.mp3', '.wav', '.ogg', '.mp4', '.avi', '.mov')):
792 | +             return self._process_media_file(file_path)
793 | +         else:
794 | +             # Generic binary file handling
795 | +             return [{
796 | +                 'source': 'binary_file',
797 | +                 'filename': filename,
798 | +                 'file_size': file_stat.st_size,
799 | +                 'mime_type': mime_type,
800 | +                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
801 | +                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
802 | +                 'content': f"[Binary file: {mime_type or 'unknown type'}]",
803 | +                 'timestamp': datetime.now().isoformat()
804 | +             }]
805 | +     except Exception as e:
806 | +         logger.error(f"File processing error: {e}")
807 |           return [{
808 | +             'source': 'error',
809 | +             'filename': os.path.basename(file.name) if file else 'unknown',
810 | +             'error': str(e),
811 |               'timestamp': datetime.now().isoformat()
812 |           }]
813 |
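Note that the binary-file branch above dispatches .pdf/.doc/.docx files to _process_document_file, while the handler defined below is named _process_pdf_file; whether a separate _process_document_file exists elsewhere is not visible in this diff. Also, process_urls as committed builds its results list but did not return it (a return is shown in the cleaned listing above for readability). A minimal sketch of an extension-based dispatcher consistent with the handlers that are shown — the mapping itself is an assumption, not the file's code:

```python
import os

def route_binary_file(processor, file_path: str):
    """Pick a FileProcessor handler from the extension; None means 'use the generic record'."""
    handlers = {
        (".pdf",): processor._process_pdf_file,                                   # defined below
        (".jpg", ".jpeg", ".png", ".gif", ".bmp"): processor._process_image_file,  # defined below
        (".mp3", ".wav", ".ogg", ".mp4", ".avi", ".mov"): processor._process_media_file,
    }
    ext = os.path.splitext(file_path)[1].lower()
    for extensions, handler in handlers.items():
        if ext in extensions:
            return handler(file_path)
    return None
```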
814 |
+
def _process_pdf_file(self, file_path: str) -> List[Dict]:
|
815 |
+
"""Extract text from PDF files"""
|
816 |
try:
|
817 |
+
# Try to import PyPDF2 module
|
818 |
+
import importlib.util
|
819 |
+
if importlib.util.find_spec("PyPDF2") is None:
|
820 |
+
return [{
|
821 |
+
"error": "PDF processing requires the 'PyPDF2' module. Install with 'pip install PyPDF2'."
|
822 |
+
}]
|
823 |
+
|
824 |
+
import PyPDF2
|
825 |
+
|
826 |
+
with open(file_path, 'rb') as file:
|
827 |
+
reader = PyPDF2.PdfReader(file)
|
828 |
+
num_pages = len(reader.pages)
|
829 |
+
|
830 |
+
# Extract text from each page
|
831 |
+
all_text = ""
|
832 |
+
page_texts = []
|
833 |
+
|
834 |
+
for i in range(num_pages):
|
835 |
+
page = reader.pages[i]
|
836 |
+
text = page.extract_text()
|
837 |
+
all_text += text + "\n\n"
|
838 |
+
page_texts.append({
|
839 |
+
"page_number": i + 1,
|
840 |
+
"content": text
|
841 |
+
})
|
842 |
+
|
843 |
+
# Get file metadata
|
844 |
+
file_stat = os.stat(file_path)
|
845 |
+
|
846 |
+
return [{
|
847 |
+
"source": "pdf",
|
848 |
+
"filename": os.path.basename(file_path),
|
849 |
+
"file_size": file_stat.st_size,
|
850 |
+
"mime_type": "application/pdf",
|
851 |
+
"created": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
852 |
+
"modified": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
853 |
+
"num_pages": num_pages,
|
854 |
+
"content": all_text,
|
855 |
+
"pages": page_texts,
|
856 |
+
"timestamp": datetime.now().isoformat()
|
857 |
+
}]
|
858 |
except Exception as e:
|
859 |
+
logger.error(f"Error processing PDF file: {str(e)}")
|
860 |
+
return [{
|
861 |
+
"source": "error",
|
862 |
+
"filename": os.path.basename(file_path),
|
863 |
+
"error": f"Error processing PDF file: {str(e)}",
|
864 |
+
"timestamp": datetime.now().isoformat()
|
865 |
+
}]
|
866 |
+
|
867 |
+
def _process_image_file(self, file_path: str) -> List[Dict]:
|
868 |
+
"""Extract metadata and attempt OCR on image files"""
|
869 |
try:
|
870 |
+
# Try to import PIL module
|
871 |
+
import importlib.util
|
872 |
+
if importlib.util.find_spec("PIL") is None:
|
873 |
+
return [{
|
874 |
+
"error": "Image processing requires the 'Pillow' module. Install with 'pip install Pillow'."
|
875 |
+
}]
|
876 |
+
|
877 |
+
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
878 |
|
879 |
+
# Open image and get basic metadata
|
880 |
+
with Image.open(file_path) as img:
|
881 |
+
width, height = img.size
|
882 |
+
format_name = img.format
|
883 |
+
mode = img.mode
|
884 |
+
|
885 |
+
# Extract EXIF data if available
|
886 |
+
exif_data = {}
|
887 |
+
if hasattr(img, '_getexif') and img._getexif():
|
888 |
+
exif = img._getexif()
|
889 |
+
if exif:
|
890 |
+
for tag_id, value in exif.items():
|
891 |
+
tag_name = f"tag_{tag_id}"
|
892 |
+
exif_data[tag_name] = str(value)
|
893 |
+
|
894 |
+
# Try OCR if pytesseract is available
|
895 |
+
ocr_text = None
|
896 |
+
if importlib.util.find_spec("pytesseract") is not None:
|
897 |
+
try:
|
898 |
+
import pytesseract
|
899 |
+
ocr_text = pytesseract.image_to_string(img)
|
900 |
+
except Exception as e:
|
901 |
+
logger.warning(f"OCR failed: {e}")
|
902 |
+
|
903 |
+
# Get file metadata
|
904 |
+
file_stat = os.stat(file_path)
|
905 |
+
|
906 |
+
return [{
|
907 |
+
"source": "image",
|
908 |
+
"filename": os.path.basename(file_path),
|
909 |
+
"file_size": file_stat.st_size,
|
910 |
+
"mime_type": f"image/{format_name.lower()}" if format_name else "image/unknown",
|
911 |
+
"created": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
912 |
+
"modified": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
913 |
+
"width": width,
|
914 |
+
"height": height,
|
915 |
+
"format": format_name,
|
916 |
+
"mode": mode,
|
917 |
+
"exif": exif_data,
|
918 |
+
"ocr_text": ocr_text,
|
919 |
+
"content": ocr_text if ocr_text else f"[Image: {width}x{height} {format_name}]",
|
920 |
+
"timestamp": datetime.now().isoformat()
|
921 |
+
}]
|
922 |
except Exception as e:
|
923 |
+
logger.error(f"Error processing image file: {str(e)}")
|
924 |
+
return [{
|
925 |
+
"source": "error",
|
926 |
+
"filename": os.path.basename(file_path),
|
927 |
+
"error": f"Error processing image file: {str(e)}",
|
928 |
+
"timestamp": datetime.now().isoformat()
|
929 |
+
}]
|
930 |
+
|
931 |
+
def _process_media_file(self, file_path: str) -> List[Dict]:
|
932 |
+
"""Extract metadata from audio/video files"""
|
933 |
+
try:
|
934 |
+
# Try to import mutagen module
|
935 |
+
import importlib.util
|
936 |
+
if importlib.util.find_spec("mutagen") is None:
|
937 |
+
return [{
|
938 |
+
"error": "Media processing requires the 'mutagen' module. Install with 'pip install mutagen'."
|
939 |
+
}]
|
940 |
|
941 |
+
import mutagen
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
942 |
|
943 |
+
# Get file metadata
|
944 |
+
file_stat = os.stat(file_path)
|
945 |
+
mime_type, _ = mimetypes.guess_type(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
946 |
|
947 |
+
# Extract media metadata
|
948 |
+
media_info = mutagen.File(file_path)
|
949 |
+
|
950 |
+
metadata = {}
|
951 |
+
if media_info:
|
952 |
+
# Extract common metadata
|
953 |
+
if hasattr(media_info, 'info') and hasattr(media_info.info, 'length'):
|
954 |
+
metadata['duration'] = media_info.info.length
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
955 |
|
956 |
+
# Extract tags
|
957 |
+
for key, value in media_info.items():
|
958 |
+
if isinstance(value, list) and len(value) == 1:
|
959 |
+
metadata[key] = str(value[0])
|
960 |
+
else:
|
961 |
+
metadata[key] = str(value)
|
962 |
+
|
963 |
+
return [{
|
964 |
+
"source": "media",
|
965 |
+
"filename": os.path.basename(file_path),
|
966 |
+
"file_size": file_stat.st_size,
|
967 |
+
"mime_type": mime_type,
|
968 |
+
"created": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
969 |
+
"modified": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
970 |
+
"metadata": metadata,
|
971 |
+
"content": f"[Media file: {mime_type or 'unknown type'}]",
|
972 |
+
"timestamp": datetime.now().isoformat()
|
973 |
+
}]
|
974 |
except Exception as e:
|
975 |
+
logger.error(f"Error processing media file: {str(e)}")
|
976 |
+
return [{
|
977 |
+
"source": "error",
|
978 |
+
"filename": os.path.basename(file_path),
|
979 |
+
"error": f"Error processing media file: {str(e)}",
|
980 |
+
"timestamp": datetime.now().isoformat()
|
981 |
+
|
982 |
+
# QRProcessor class
|
983 |
+
# =================
|
984 |
+
class QRProcessor:
|
985 |
+
"""Class to handle QR code processing"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
986 |
|
987 |
+
def __init__(self):
|
988 |
+
# Check for required libraries
|
989 |
+
self._check_dependencies()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
990 |
|
991 |
+
def _check_dependencies(self):
|
992 |
+
"""Check if required libraries are installed"""
|
993 |
+
try:
|
994 |
+
import importlib.util
|
995 |
+
|
996 |
+
# Check for pyzbar
|
997 |
+
if importlib.util.find_spec("pyzbar") is None:
|
998 |
+
logger.warning("pyzbar library not found. QR code detection will not work. Install with 'pip install pyzbar'")
|
999 |
+
|
1000 |
+
# Check for qrcode
|
1001 |
+
if importlib.util.find_spec("qrcode") is None:
|
1002 |
+
logger.warning("qrcode library not found. QR code generation will not work. Install with 'pip install qrcode'")
|
1003 |
+
|
1004 |
+
except ImportError as e:
|
1005 |
+
logger.error(f"Error checking dependencies: {e}")
|
1006 |
|
1007 |
+
def detect_qr_codes(self, image_path: str) -> List[Dict]:
|
1008 |
+
"""Detect QR codes in an image"""
|
1009 |
+
try:
|
1010 |
+
import importlib.util
|
1011 |
+
if importlib.util.find_spec("pyzbar") is None:
|
1012 |
+
return [{"error": "pyzbar library not found. Install with 'pip install pyzbar'"}]
|
1013 |
+
|
1014 |
+
from pyzbar.pyzbar import decode
|
1015 |
+
from PIL import Image
|
1016 |
+
|
1017 |
+
# Open the image
|
1018 |
+
image = Image.open(image_path)
|
1019 |
+
|
1020 |
+
# Decode QR codes
|
1021 |
+
decoded_objects = decode(image)
|
1022 |
+
|
1023 |
+
results = []
|
1024 |
+
for obj in decoded_objects:
|
1025 |
+
# Get the bounding box
|
1026 |
+
rect = obj.rect
|
1027 |
+
bbox = {
|
1028 |
+
'left': rect.left,
|
1029 |
+
'top': rect.top,
|
1030 |
+
'width': rect.width,
|
1031 |
+
'height': rect.height
|
1032 |
+
}
|
1033 |
+
|
1034 |
+
# Get the data
|
1035 |
+
data = obj.data.decode('utf-8', errors='replace')
|
1036 |
+
|
1037 |
+
# Get the type
|
1038 |
+
qr_type = obj.type
|
1039 |
+
|
1040 |
+
results.append({
|
1041 |
+
'type': qr_type,
|
1042 |
+
'data': data,
|
1043 |
+
'bbox': bbox,
|
1044 |
+
'timestamp': datetime.now().isoformat()
|
1045 |
+
})
|
1046 |
+
|
1047 |
+
if not results:
|
1048 |
+
results.append({
|
1049 |
+
'warning': 'No QR codes detected in the image',
|
1050 |
+
'timestamp': datetime.now().isoformat()
|
1051 |
+
})
|
1052 |
+
|
1053 |
+
return results
|
1054 |
+
|
1055 |
+
except Exception as e:
|
1056 |
+
logger.error(f"Error detecting QR codes: {e}")
|
1057 |
+
return [{"error": f"Error detecting QR codes: {str(e)}"}]
|
1058 |
|
1059 |
+
def generate_qr_code(self, data: str, output_path: Optional[str] = None, size: int = 10) -> Dict:
|
1060 |
+
"""Generate a QR code from data"""
|
1061 |
+
try:
|
1062 |
+
import importlib.util
|
1063 |
+
if importlib.util.find_spec("qrcode") is None:
|
1064 |
+
return {"error": "qrcode library not found. Install with 'pip install qrcode'"}
|
1065 |
+
|
1066 |
+
import qrcode
|
1067 |
+
|
1068 |
+
# Create QR code instance
|
1069 |
+
qr = qrcode.QRCode(
|
1070 |
+
version=1,
|
1071 |
+
error_correction=qrcode.constants.ERROR_CORRECT_L,
|
1072 |
+
box_size=size,
|
1073 |
+
border=4,
|
1074 |
+
)
|
1075 |
+
|
1076 |
+
# Add data
|
1077 |
+
qr.add_data(data)
|
1078 |
+
qr.make(fit=True)
|
1079 |
+
|
1080 |
+
# Create an image from the QR Code instance
|
1081 |
+
img = qr.make_image(fill_color="black", back_color="white")
|
1082 |
+
|
1083 |
+
# Save the image if output path is provided
|
1084 |
+
if output_path:
|
1085 |
+
img.save(output_path)
|
1086 |
+
return {
|
1087 |
+
'success': True,
|
1088 |
+
'data': data,
|
1089 |
+
'output_path': output_path,
|
1090 |
+
'timestamp': datetime.now().isoformat()
|
1091 |
+
}
|
1092 |
else:
|
1093 |
+
# Save to a temporary file
|
1094 |
+
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
|
1095 |
+
temp_path = tmp.name
|
1096 |
+
img.save(temp_path)
|
1097 |
+
return {
|
1098 |
+
'success': True,
|
1099 |
+
'data': data,
|
1100 |
+
'output_path': temp_path,
|
1101 |
+
'timestamp': datetime.now().isoformat()
|
1102 |
+
}
|
1103 |
+
|
1104 |
+
except Exception as e:
|
1105 |
+
logger.error(f"Error generating QR code: {e}")
|
1106 |
+
return {"error": f"Error generating QR code: {str(e)}"}
|
1107 |
|
1108 |
+
def extract_qr_from_url(self, url_processor, url: str) -> List[Dict]:
|
1109 |
+
"""Extract QR codes from an image URL"""
|
1110 |
try:
|
1111 |
+
# Fetch the image from the URL
|
1112 |
+
response = url_processor.session.get(url, stream=True)
|
1113 |
+
response.raise_for_status()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1114 |
|
1115 |
+
# Save to a temporary file
|
1116 |
+
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
|
1117 |
+
temp_path = tmp.name
|
1118 |
+
for chunk in response.iter_content(chunk_size=128):
|
1119 |
+
tmp.write(chunk)
|
1120 |
|
1121 |
+
# Process the image
|
1122 |
+
results = self.detect_qr_codes(temp_path)
|
1123 |
|
1124 |
+
# Add source information
|
1125 |
+
for result in results:
|
1126 |
+
result['source_url'] = url
|
|
|
1127 |
|
1128 |
+
# Clean up
|
1129 |
+
os.unlink(temp_path)
|
|
|
|
|
|
|
|
|
1130 |
|
1131 |
+
return results
|
|
|
1132 |
|
1133 |
+
except Exception as e:
|
1134 |
+
logger.error(f"Error extracting QR from URL: {e}")
|
1135 |
+
return [{"error": f"Error extracting QR from URL: {str(e)}"}]
|
1136 |
+
|
1137 |
+
def batch_process_images(self, image_paths: List[str]) -> Dict[str, List[Dict]]:
|
1138 |
+
"""Process multiple images for QR codes"""
|
1139 |
+
results = {}
|
1140 |
+
|
1141 |
+
for image_path in image_paths:
|
1142 |
+
try:
|
1143 |
+
if os.path.exists(image_path):
|
1144 |
+
image_results = self.detect_qr_codes(image_path)
|
1145 |
+
results[image_path] = image_results
|
1146 |
+
else:
|
1147 |
+
results[image_path] = [{"error": f"Image file not found: {image_path}"}]
|
1148 |
+
except Exception as e:
|
1149 |
+
logger.error(f"Error processing image {image_path}: {e}")
|
1150 |
+
results[image_path] = [{"error": f"Processing error: {str(e)}"}]
|
1151 |
+
|
1152 |
+
def create_interface():
|
1153 |
+
"""Create a comprehensive Gradio interface with advanced features"""
|
1154 |
+
css = """
|
1155 |
+
.container { max-width: 1200px; margin: auto; }
|
1156 |
+
.warning { background-color: #fff3cd; color: #856404; }
|
1157 |
+
.error { background-color: #f8d7da; color: #721c24; }
|
1158 |
+
"""
|
1159 |
+
|
1160 |
+
with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
|
1161 |
+
gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
|
1162 |
+
|
1163 |
+
with gr.Tab("URL Processing"):
|
1164 |
+
url_input = gr.Textbox(
|
1165 |
+
label="Enter URLs (comma or newline separated)",
|
1166 |
+
lines=5,
|
1167 |
+
placeholder="https://example1.com\nhttps://example2.com"
|
1168 |
+
)
|
1169 |
+
|
1170 |
+
with gr.Tab("File Input"):
|
1171 |
+
file_input = gr.File(
|
1172 |
+
label="Upload text file or ZIP archive",
|
1173 |
+
file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
|
1174 |
+
)
|
1175 |
+
|
1176 |
+
with gr.Tab("Text Input"):
|
1177 |
+
text_input = gr.Textbox(
|
1178 |
+
label="Raw Text Input",
|
1179 |
+
lines=5,
|
1180 |
+
placeholder="Paste your text here..."
|
1181 |
+
)
|
1182 |
+
|
1183 |
+
with gr.Tab("JSON Editor"):
|
1184 |
+
json_editor = gr.Textbox(
|
1185 |
+
label="JSON Editor",
|
1186 |
+
lines=20,
|
1187 |
+
placeholder="View and edit your JSON data here...",
|
1188 |
+
interactive=True,
|
1189 |
+
elem_id="json-editor" # Optional: for custom styling
|
1190 |
+
)
|
1191 |
+
|
1192 |
+
with gr.Tab("Scratchpad"):
|
1193 |
+
scratchpad = gr.Textbox(
|
1194 |
+
label="Scratchpad",
|
1195 |
+
lines=10,
|
1196 |
+
placeholder="Quick notes or text collections...",
|
1197 |
+
interactive=True
|
1198 |
+
)
|
1199 |
+
|
1200 |
+
process_btn = gr.Button("Process Input", variant="primary")
|
1201 |
+
qr_btn = gr.Button("Generate QR Code", variant="secondary")
|
1202 |
+
|
1203 |
+
output_text = gr.Textbox(label="Processing Results", interactive=False)
|
1204 |
+
output_file = gr.File(label="Processed Output")
|
1205 |
+
qr_output = gr.Image(label="QR Code", type="filepath") # To display the generated QR code
|
1206 |
+
|
1207 |
+
process_btn.click(
|
1208 |
+
process_all_inputs,
|
1209 |
+
inputs=[url_input, file_input, text_input, scratchpad],
|
1210 |
+
outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
|
1211 |
+
)
|
1212 |
+
|
1213 |
+
qr_btn.click(
|
1214 |
+
generate_qr_code,
|
1215 |
+
inputs=json_editor,
|
1216 |
+
outputs=qr_output
|
1217 |
+
)
|
1218 |
+
|
1219 |
+
gr.Markdown("""
|
1220 |
+
### Usage Guidelines
|
1221 |
+
- **URL Processing**: Enter valid HTTP/HTTPS URLs
|
1222 |
+
- **File Input**: Upload text files or ZIP archives
|
1223 |
+
- **Text Input**: Direct text processing
|
1224 |
+
- **JSON Editor**: View and edit your JSON data
|
1225 |
+
- **Scratchpad**: Quick notes or text collections
|
1226 |
+
- Advanced cleaning and validation included
|
1227 |
+
""")
|
1228 |
+
return interface
|
1229 |
+
|
1230 |
+
def main():
|
1231 |
+
# Configure system settings
|
1232 |
+
mimetypes.init()
|
1233 |
+
|
1234 |
+
# Create and launch interface
|
1235 |
+
interface = create_interface()
|
1236 |
+
|
1237 |
+
# Launch with proper configuration
|
1238 |
+
interface.launch(
|
1239 |
+
server_name="0.0.0.0",
|
1240 |
+
server_port=7860,
|
1241 |
+
show_error=True,
|
1242 |
+
share=False,
|
1243 |
+
inbrowser=True,
|
1244 |
+
debug=True
|
1245 |
+
)
|
1246 |
|
1247 |
if __name__ == "__main__":
|
1248 |
main()
|
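The Blocks interface above wires its buttons to process_all_inputs and generate_qr_code, neither of which appears in the hunks shown here, and batch_process_images builds a results dict that the shown lines never return. A minimal sketch of callbacks with the signatures the click handlers imply — the names, bodies, and return shapes are assumptions inferred from the inputs/outputs lists, not code from the file:

```python
import json
import tempfile

def process_all_inputs(urls_text, uploaded_file, raw_text, notes):
    """Return (output file path, status message, JSON string) for the three outputs."""
    collected = {"urls": urls_text, "text": raw_text, "notes": notes}
    out = tempfile.NamedTemporaryFile(suffix=".json", delete=False, mode="w")
    json.dump(collected, out, indent=2)
    out.close()
    return out.name, f"Collected {len(collected)} fields", json.dumps(collected, indent=2)

def generate_qr_code(json_text):
    """Return a filepath for the QR image, matching the gr.Image(type='filepath') output."""
    qr = QRProcessor()  # the in-file class defined above
    outcome = qr.generate_qr_code(json_text or "{}")
    return outcome.get("output_path")  # None if generation failed
```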