Update app2.py

app2.py
CHANGED
@@ -1,26 +1,26 @@
 import json
-import sys
-sys.path.append('./config')
-import config
 import os
 import re
 import time
 import logging
 import mimetypes
+import concurrent.futures
+import string
+import zipfile
 import tempfile
 from datetime import datetime
+from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
-
+
 import requests
 import validators
 import gradio as gr
+from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
+from ratelimit import limits, sleep_and_retry
 from cleantext import clean
-import qrcode
-import zipfile
-
 
 # Setup logging with detailed configuration
 logging.basicConfig(
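The import block is the real summary of this commit: the config/ shim, qrcode, and the stray sys import go away, the typing names arrive (the old file annotated fetch_content with Optional[Dict] without importing either), and ratelimit and diskcache appear. Note that nothing in this revision actually uses Cache or the rate-limit decorators the old fetch_content carried; a minimal sketch of how they could be wired back up, assuming the same 60-calls-per-minute budget the old code declared:

from ratelimit import limits, sleep_and_retry
from diskcache import Cache

cache = Cache('./cache')  # hypothetical on-disk cache directory, not in this diff

@sleep_and_retry
@limits(calls=60, period=60)  # same budget as the removed decorators
def rate_limited_fetch(processor, url):
    cached = cache.get(url)          # reuse a prior fetch if we have one
    if cached is not None:
        return cached
    result = processor.fetch_content(url)
    if result is not None:
        cache.set(url, result, expire=3600)  # keep results for an hour
    return result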
@@ -29,159 +29,242 @@ logging.basicConfig(
     handlers=[
         logging.StreamHandler(),
         logging.FileHandler('app.log', encoding='utf-8')
-    ]
+    ]
+)
 logger = logging.getLogger(__name__)
 
-
-
-
-
-
-
-
 
-
-
-
-
-
-
 
-class URLProcessor:
-    def __init__(self):
-        self.config = Config()
-        self.proxy_handler = ProxyHandler(self.config.get('PROXY_URL'))
-        self.robots_handler = RobotsHandler()
-        self.session = self._create_session()
-        self.rate_limit = self.config.get('RATE_LIMIT', 60)  # requests per minute
-        self.timeout = self.config.get('TIMEOUT', 10)
-
-    @sleep_and_retry
-    @limits(calls=60, period=60)  # Rate limiting decorator
+class URLProcessor:
+    def __init__(self):
+        self.session = requests.Session()
+        self.timeout = 10  # seconds
+        self.session.headers.update({
+            'User-Agent': UserAgent().random,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        })
 
+    def advanced_text_cleaning(self, text: str) -> str:
+        """Robust text cleaning with version compatibility"""
+        try:
+            cleaned_text = clean(
+                text,
+                fix_unicode=True,
+                to_ascii=True,
+                lower=True,
+                no_line_breaks=True,
+                no_urls=True,
+                no_emails=True,
+                no_phone_numbers=True,
+                no_numbers=False,
+                no_digits=False,
+                no_currency_symbols=True,
+                no_punct=False
+            ).strip()
+            return cleaned_text
+        except Exception as e:
+            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
+            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
+            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
+            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+            return text.strip()
 
+    def validate_url(self, url: str) -> Dict:
+        """Validate URL format and accessibility"""
+        try:
+            if not validators.url(url):
+                return {'is_valid': False, 'message': 'Invalid URL format'}
+
+            response = self.session.head(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {'is_valid': True, 'message': 'URL is valid and accessible'}
+        except Exception as e:
+            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
 
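A caveat on the new validate_url: it probes with a HEAD request, and some servers answer HEAD with 405 Method Not Allowed even though a GET would succeed, so those URLs are reported as invalid. A possible fallback inside the try block, sketched under that assumption:

response = self.session.head(url, timeout=self.timeout)
if response.status_code == 405:  # server refuses HEAD; retry with a streamed GET
    response = self.session.get(url, timeout=self.timeout, stream=True)
    response.close()  # the status and headers are enough; skip the body
response.raise_for_status()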
     def fetch_content(self, url: str) -> Optional[Dict]:
-        """
-
-
-
-        return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        chrome_options = Options()
-        from selenium import webdriver
-        from selenium.webdriver.chrome.options import Options
-        from selenium.webdriver.common.by import By
-        from selenium.webdriver.support.ui import WebDriverWait
-        from selenium.webdriver.support import expected_conditions as EC
-        from selenium.common.exceptions import TimeoutException
-        import time
-
-        logger.info(f"Attempting to fetch {url} with Selenium")
-
-        # Set up Chrome options
-        chrome_options = Options()
-        chrome_options.add_argument("--headless")
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--disable-dev-shm-usage")
-        chrome_options.add_argument("--disable-gpu")
-        chrome_options.add_argument("--window-size=1920,1080")
-        chrome_options.add_argument(
-            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
-        # Initialize the driver
-        driver = webdriver.Chrome(options=chrome_options)
-
-        try:
-            # Navigate to the URL
-            driver.get(url)
-
-            # Wait for the page to load
-            WebDriverWait(driver, 10).until(
-                EC.presence_of_element_located((By.TAG_NAME, "body"))
-            )
-
-            # Simulate pressing ESC key to dismiss overlays
-            from selenium.webdriver.common.keys import Keys
-            action_chains = webdriver.ActionChains(driver)
-            action_chains.send_keys(Keys.ESCAPE).perform()
-            time.sleep(1)  # give it a moment to take effect
-            action_chains.reset_actions()  # Clear actions
-
-            # try again
-            action_chains.send_keys(Keys.ESCAPE).perform()
-            time.sleep(1)  # give it a moment to take effect
-            action_chains.reset_actions()
-
-            # Get the page source
-            page_source = driver.page_source
-
-            # Save the Selenium HTML for debugging
-            debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
-            with open(debug_path, "w", encoding="utf-8") as f:
-                f.write(page_source)
-            logger.info(f"Saved Selenium HTML to {debug_path}")
-
-            return page_source
-        finally:
-            driver.quit()
-
-        except ImportError:
-            logger.error("Selenium is not installed. Cannot use browser automation.")
-            return None
-        except Exception as e:
-            logger.error(f"Selenium processing failed for {url}: {e}")
+        """Universal content fetcher with special case handling"""
+        try:
+            # Google Drive document handling
+            if 'drive.google.com' in url:
+                return self._handle_google_drive(url)
+
+            # Google Calendar ICS handling
+            if 'calendar.google.com' in url and 'ical' in url:
+                return self._handle_google_calendar(url)
+
+            # Standard HTML processing
+            return self._fetch_html_content(url)
+        except Exception as e:
+            logger.error(f"Content fetch failed: {e}")
+            return None
+
+    def _handle_google_drive(self, url: str) -> Optional[Dict]:
+        """Process Google Drive file links"""
+        try:
+            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
+            if not file_id:
+                logger.error(f"Invalid Google Drive URL: {url}")
                 return None
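This hunk swaps the whole headless-Chrome path (including the hard-coded debug dump under /Users/a2014/urld/) for plain requests, so pages that only render their content via JavaScript will now yield whatever HTML the server sends. Assumed usage of the new dispatcher; FILE_ID is a placeholder, not a real id:

processor = URLProcessor()
result = processor.fetch_content("https://drive.google.com/file/d/FILE_ID/view")
if result is not None:
    print(result['content_type'], result['timestamp'])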
-
-
-
-
-
-
-
-
-
-
+
+            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+            response = self.session.get(direct_url, timeout=self.timeout)
+            response.raise_for_status()
+
+            return {
+                'content': response.text,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Google Drive processing failed: {e}")
+            return None
+
+    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+        """Process Google Calendar ICS feeds"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {
+                'content': response.text,
+                'content_type': 'text/calendar',
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Calendar fetch failed: {e}")
+            return None
+
+    def _fetch_html_content(self, url: str) -> Optional[Dict]:
+        """Standard HTML content processing"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
+                element.decompose()
+
+            # Extract main content
+            main_content = soup.find('main') or soup.find('article') or soup.body
+
+            # Clean and structure content
+            text_content = main_content.get_text(separator='\n', strip=True)
+            cleaned_content = self.advanced_text_cleaning(text_content)
+
+            return {
+                'content': cleaned_content,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"HTML processing failed: {e}")
+            return None
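In _fetch_html_content above, soup.find('main') or soup.find('article') or soup.body can still be None, for instance when a non-HTML payload comes back with a 200; get_text then raises AttributeError, which the broad except converts into a None result with only a log line. An explicit guard, as a sketch:

main_content = soup.find('main') or soup.find('article') or soup.body
if main_content is None:  # e.g. an empty or non-HTML document
    logger.warning(f"No content container found for {url}")
    return None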
+
+class FileProcessor:
+    """Class to handle file processing"""
 
+    def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
+        self.max_file_size = max_file_size
+        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+
+    def is_text_file(self, filepath: str) -> bool:
+        """Check if file is a text file"""
+        try:
+            mime_type, _ = mimetypes.guess_type(filepath)
+            return (mime_type and mime_type.startswith('text/')) or \
+                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
+        except Exception:
+            return False
+
+    def process_file(self, file) -> List[Dict]:
+        """Process uploaded file with enhanced error handling"""
+        if not file:
+            return []
+
+        dataset = []
+        try:
+            file_size = os.path.getsize(file.name)
+            if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                return []
+
+            with tempfile.TemporaryDirectory() as temp_dir:
+                if zipfile.is_zipfile(file.name):
+                    dataset.extend(self._process_zip_file(file.name, temp_dir))
+                else:
+                    dataset.extend(self._process_single_file(file))
+
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
+
+        return dataset
+
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Process ZIP file contents"""
+        results = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
+
+    def _process_single_file(self, file) -> List[Dict]:
+        try:
+            file_stat = os.stat(file.name)
+            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"File processing error: {e}")
+            return []
 
 def create_interface():
+    """Create a comprehensive Gradio interface with advanced features"""
+
     css = """
     .container { max-width: 1200px; margin: auto; }
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
-
+
     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
-        with gr.Tab("Settings"):
-            respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
-            use_proxy = gr.Checkbox(label="Use Proxy", value=False)
-            proxy_url = gr.Textbox(label="Proxy URL", placeholder="http://proxy:port")
-            request_delay = gr.Slider(minimum=0, maximum=10, value=1, label="Request Delay (seconds)")
-            output_format = gr.Dropdown(choices=["json", "csv", "txt"], value="json", label="Output Format")
-
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
-
+
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
+                label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
-
+
         with gr.Tab("File Input"):
             file_input = gr.File(
                 label="Upload text file or ZIP archive",
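_process_zip_file above extracts user-supplied archives with extractall; a crafted member name such as ../../evil can escape temp_dir (zip-slip). A hardening sketch using only modules already imported; _safe_extract is a hypothetical helper name, not part of this diff:

def _safe_extract(self, zip_ref: zipfile.ZipFile, temp_dir: str) -> None:
    root = os.path.realpath(temp_dir)
    for member in zip_ref.namelist():
        target = os.path.realpath(os.path.join(temp_dir, member))
        if not target.startswith(root + os.sep):  # member would land outside temp_dir
            raise ValueError(f"Unsafe path in archive: {member}")
    zip_ref.extractall(temp_dir)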
@@ -190,171 +273,103 @@ def create_interface():
 
     with gr.Tab("Text Input"):
         text_input = gr.Textbox(
-            label="Raw Text Input",
+            label="Raw Text Input",
             lines=5,
             placeholder="Paste your text here..."
         )
-
-        with gr.Tab("JSON Editor"):
-            json_editor = gr.Textbox(
-                label="JSON Editor",
-                lines=20,
-                placeholder="View and edit your JSON data here...",
-                interactive=True,
-                elem_id="json-editor"  # Optional: for custom styling
-            )
-
-        with gr.Tab("Scratchpad"):
-            scratchpad = gr.Textbox(
-                label="Scratchpad",
-                lines=10,
-                placeholder="Quick notes or text collections...",
-                interactive=True
-            )
-
+
         process_btn = gr.Button("Process Input", variant="primary")
-
-
+
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
-        qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code
-
-        process_btn.click(
-            process_all_inputs,
-            inputs=[url_input, file_input, text_input, scratchpad],
-            outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
-        )
-        qr_btn.click(
-            generate_qr_code,
-            inputs=json_editor,
-            outputs=qr_output
-        )
-        gr.Markdown("""
-        ### Usage Guidelines
-        - **URL Processing**: Enter valid HTTP/HTTPS URLs
-        - **File Input**: Upload text files or ZIP archives
-        - ** Text Input**: Direct text processing
-        - **JSON Editor**: View and edit your JSON data
-        - **Scratchpad**: Quick notes or text collections
-        - Advanced cleaning and validation included
-        """)
-    return interface
-
-
-def check_network_connectivity():
-    """Check if the network is working properly by testing connection to common sites"""
-    test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
-    results = []
-
-    for site in test_sites:
-        try:
-            response = requests.get(site, timeout=5)
-            results.append({
-                "site": site,
-                "status": "OK" if response.status_code == 200 else f"Error: {response.status_code}",
-                "response_time": response.elapsed.total_seconds()
-            })
-        except Exception as e:
-            results.append({
-                "site": site,
-                "status": f"Error: {str(e)}",
-                "response_time": None
-            })
-    # If all sites failed, there might be a network issue
-    if all(result["status"].startswith("Error") for result in results):
-        logger.error("Network connectivity issue detected. All test sites failed.")
-        return False, results
-
-    return True, results
-
-
-def validate_config(config: Dict[str, Any]) -> Dict[str, str]:
-    """Validate configuration settings"""
-    errors = {}
-    if config.get('RATE_LIMIT', 0) < 1:
-        errors['rate_limit'] = "Rate limit must be positive"
-    if config.get('TIMEOUT', 0) < 1:
-        errors['timeout'] = "Timeout must be positive"
-    if config.get('USE_PROXY') and not config.get('PROXY_URL'):
-        errors['proxy'] = "Proxy URL required when proxy is enabled"
-    return errors
-
-def update_settings(respect_robots: bool, use_proxy: bool, proxy_url: str,
-                    request_delay: float, output_format: str) -> str:
-    """Update application settings"""
-    config = Config()
-    new_settings = {
-        'RESPECT_ROBOTS': respect_robots,
-        'USE_PROXY': use_proxy,
-        'PROXY_URL': proxy_url,
-        'REQUEST_DELAY': request_delay,
-        'OUTPUT_FORMAT': output_format
-    }
-
-    # Validate settings before updating
-    errors = validate_config(new_settings)
-    if errors:
-        return f"Configuration error: {', '.join(errors.values())}"
 
-
-
-
-
-
-
-
-
-
-
-
 
-
-
 
-
-
-        inputs=[
-        outputs=
+        def process_all_inputs(urls, file, text):
+            """Process all input types with progress tracking"""
+            try:
+                processor = URLProcessor()
+                file_processor = FileProcessor()
+                results = []
+
+                # Process URLs
+                if urls:
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+
+                    for url in url_list:
+                        validation = processor.validate_url(url)
+                        if validation.get('is_valid'):
+                            content = processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+
+                # Process files
+                if file:
+                    results.extend(file_processor.process_file(file))
+
+                # Process text input
+                if text:
+                    cleaned_text = processor.advanced_text_cleaning(text)
+                    results.append({
+                        'source': 'direct_input',
+                        'content': cleaned_text,
+                        'timestamp': datetime.now().isoformat()
+                    })
+
+                # Generate output
+                if results:
+                    output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
+                    output_dir.mkdir(parents=True, exist_ok=True)
+                    output_path = output_dir / f'processed_{int(time.time())}.json'
+
+                    with open(output_path, 'w', encoding='utf-8') as f:
+                        json.dump(results, f, ensure_ascii=False, indent=2)
+
+                    summary = f"Processed {len(results)} items successfully!"
+                    # Convert Path object to string here
+                    return str(output_path), summary
+                else:
+                    return None, "No valid content to process."
+
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, f"Error: {str(e)}"
+
+        process_btn.click(
+            process_all_inputs,
+            inputs=[url_input, file_input, text_input],
+            outputs=[output_file, output_text]
         )
 
-
+        gr.Markdown("""
+        ### Usage Guidelines
+        - **URL Processing**: Enter valid HTTP/HTTPS URLs
+        - **File Input**: Upload text files or ZIP archives
+        - **Text Input**: Direct text processing
+        - Advanced cleaning and validation included
+        """)
+
+    return interface
 
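One behavioural quirk in process_all_inputs above: fetch_content already returns a dict with content, content_type, and timestamp keys, and the URL branch stores that whole dict under a second 'content' key with a second timestamp, so URL records come out nested one level deeper than file or text records. A sketch of how the branch could unwrap it instead:

content = processor.fetch_content(url)
if content:
    results.append({
        'source': 'url',
        'url': url,
        'content': content['content'],            # unwrap the fetched payload
        'content_type': content['content_type'],
        'timestamp': content['timestamp'],        # reuse the fetch timestamp
    })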
 def main():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    logger.warning("Network connectivity issues detected. Some features may not work properly.")
-    for result in network_results:
-        logger.warning(f"Test site {result['site']}: {result['status']}")
-
-    # Create and launch interface
-    interface = create_interface()
-
-    # Launch with proper configuration
-    interface.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True,
-        share=False,
-        inbrowser=True,
-        debug=True
-    )
-
-    except Exception as e:
-        logger.error(f"Application startup failed: {str(e)}")
-        sys.exit(1)
-
+    # Configure system settings
+    mimetypes.init()
+
+    # Create and launch interface
+    interface = create_interface()
+
+    # Launch with proper configuration
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        share=False,
+        inbrowser=True,
+        debug=True
+    )
 if __name__ == "__main__":
-    main()
+    main()