Update app.py

app.py (CHANGED)
@@ -1,364 +1,416 @@
[Old version removed (364 lines; the deleted side was truncated by the diff viewer, so only fragments are recoverable). It imported requests, BeautifulSoup, tempfile, zipfile, mimetypes, tqdm, typing, and urllib.parse; defined a single URL-processor class that fetched pages through a concurrent.futures.ThreadPoolExecutor batch loop; a standalone extract_text(html) helper; ZIP and single-file handlers (_process_zip_file, _process_single_file); a separator-to-comma text normalizer; and a three-tab Gradio interface (URL / File / Text input) whose process_all_inputs wrote results to processed_dataset.json. The old app launched with share=True.]

import json
import os
import re
import time
import logging
import mimetypes
import concurrent.futures
import string
from typing import List, Dict, Optional, Union, Any
from pathlib import Path
from urllib.parse import urlparse

import requests
import validators
import gradio as gr
import torch
import cachetools
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from ratelimit import limits, sleep_and_retry

# Advanced Logging Configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('app_advanced.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)

class AdvancedURLProcessor:
    """Enhanced URL processing with advanced features"""

    def __init__(
        self,
        timeout: int = 15,
        max_retries: int = 3,
        concurrent_requests: int = 5,
        cache_size: int = 100
    ):
        self.timeout = timeout
        self.max_retries = max_retries
        self.concurrent_requests = concurrent_requests
        self.ua = UserAgent()

        # Implement multilevel caching
        self.url_cache = cachetools.LRUCache(maxsize=cache_size)
        self.content_cache = cachetools.TTLCache(maxsize=cache_size, ttl=3600)  # 1-hour cache

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive'
        })

    @sleep_and_retry
    @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
    def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
        """Enhanced URL validation with comprehensive checks"""
        try:
            # Check cache first
            if url in self.url_cache:
                return self.url_cache[url]

            # Comprehensive URL validation
            result = urlparse(url)
            validation_result = {
                'is_valid': False,
                'message': 'Invalid URL',
                'scheme': result.scheme,
                'netloc': result.netloc
            }

            if not all([result.scheme, result.netloc]):
                validation_result['message'] = 'Missing scheme or network location'
                return validation_result

            # Use validators for additional checks
            if not validators.url(url):
                validation_result['message'] = 'URL format validation failed'
                return validation_result

            # Perform HEAD request for accessibility
            try:
                response = self.session.head(
                    url,
                    timeout=self.timeout,
                    allow_redirects=True
                )
                validation_result['is_valid'] = response.status_code in [200, 301, 302]
                validation_result['status_code'] = response.status_code
                validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
            except requests.RequestException as e:
                validation_result['message'] = f"Connection error: {str(e)}"

            # Cache the result
            self.url_cache[url] = validation_result
            return validation_result

        except Exception as e:
            logger.error(f"Unexpected error validating URL {url}: {e}")
            return {
                'is_valid': False,
                'message': f"Unexpected validation error: {str(e)}"
            }
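
    # For reference, a successful check is expected to come back shaped
    # roughly like this (illustrative values, not output captured from a
    # real run):
    #
    #   >>> AdvancedURLProcessor().validate_url("https://example.com")
    #   {'is_valid': True, 'message': 'URL is valid', 'scheme': 'https',
    #    'netloc': 'example.com', 'status_code': 200}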

    @sleep_and_retry
    @limits(calls=10, period=60)
    def fetch_content(self, url: str) -> Optional[str]:
        """Fetch content from URL with retry mechanism and caching"""
        # Synchronous on purpose: requests and time.sleep block, and the
        # ratelimit decorators expect a plain callable, so the original
        # `async def` bought nothing here.
        try:
            # Check content cache first
            if url in self.content_cache:
                logger.info(f"Cache hit for URL: {url}")
                return self.content_cache[url]

            for attempt in range(self.max_retries):
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    response.raise_for_status()
                    content = response.text

                    # Cache the content
                    self.content_cache[url] = content
                    return content

                except requests.RequestException as e:
                    logger.warning(f"Attempt {attempt + 1}/{self.max_retries} failed for {url}: {str(e)}")
                    if attempt == self.max_retries - 1:
                        raise
                    time.sleep(1)  # Delay between retries

        except Exception as e:
            logger.error(f"Error fetching content from {url}: {e}")
            return None

class ContentExtractor:
    """Advanced content extraction and processing"""

    def __init__(self):
        # Soup-level cleaners run before extraction; text-level cleaners
        # run on the extracted string afterwards. Running the text-level
        # passes on the soup (as the original code did) rebuilt it as a
        # single <div> and destroyed the p/h1-h6 structure that
        # _process_content relies on.
        self.soup_cleaners = [
            self._remove_scripts,
            self._remove_styles
        ]
        self.text_cleaners = [
            self._remove_special_chars,
            self._normalize_whitespace
        ]

    def extract_text(self, html: str, url: str = "") -> Dict[str, Union[str, Dict]]:
        """Extract and clean text content with metadata"""
        try:
            if not html:
                return {
                    "success": False,
                    "content": "",
                    "metadata": {"error": "Empty HTML content"}
                }

            soup = BeautifulSoup(html, 'html.parser')

            # Extract metadata
            metadata = self._extract_metadata(soup, url)

            # Clean content
            content = self._process_content(soup)

            return {
                "success": True,
                "content": content,
                "metadata": metadata
            }

        except Exception as e:
            logger.error(f"Content extraction error for {url}: {e}")
            return {
                "success": False,
                "content": "",
                "metadata": {"error": str(e)}
            }

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata"""
        metadata = {
            "title": self._get_title(soup),
            "description": self._get_meta_description(soup),
            "keywords": self._get_meta_keywords(soup),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "url": url
        }
        return metadata

    def _process_content(self, soup: BeautifulSoup) -> str:
        """Process and clean content through multiple passes"""
        for cleaner in self.soup_cleaners:
            soup = cleaner(soup)

        # Extract text with preserved structure
        lines = []
        for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = element.get_text(strip=True)
            if text:
                lines.append(text)

        text = "\n".join(lines)
        for cleaner in self.text_cleaners:
            text = cleaner(text)
        return text

    @staticmethod
    def _remove_scripts(soup: BeautifulSoup) -> BeautifulSoup:
        for script in soup(["script", "style", "iframe", "noscript"]):
            script.decompose()
        return soup

    @staticmethod
    def _remove_styles(soup: BeautifulSoup) -> BeautifulSoup:
        for element in soup.find_all(style=True):
            del element['style']
        return soup

    @staticmethod
    def _remove_special_chars(text: str) -> str:
        return re.sub(r'[^\w\s\.\,\!\?\-]', '', text)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        # Collapse runs of spaces/tabs but keep the newlines that
        # separate extracted headings and paragraphs.
        return re.sub(r'[ \t]+', ' ', text).strip()

    @staticmethod
    def _get_title(soup: BeautifulSoup) -> str:
        title = soup.find('title')
        return title.get_text(strip=True) if title else ""

    @staticmethod
    def _get_meta_description(soup: BeautifulSoup) -> str:
        meta = soup.find('meta', attrs={'name': 'description'})
        return meta.get('content', '') if meta else ""

    @staticmethod
    def _get_meta_keywords(soup: BeautifulSoup) -> str:
        meta = soup.find('meta', attrs={'name': 'keywords'})
        return meta.get('content', '') if meta else ""

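# Illustrative use of ContentExtractor (expected shape, not output captured
# from a real run):
#
#   >>> ContentExtractor().extract_text(
#   ...     "<html><head><title>T</title></head>"
#   ...     "<body><h1>Hi</h1><p>Some text.</p></body></html>")
#   {'success': True,
#    'content': 'Hi\nSome text.',
#    'metadata': {'title': 'T', 'description': '', 'keywords': '', ...}}
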
class ContentProcessor:
    """Main content processing orchestrator"""

    def __init__(self):
        self.url_processor = AdvancedURLProcessor()
        self.content_extractor = ContentExtractor()
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)

    async def process_urls(self, urls: List[str]) -> Dict[str, Any]:
        """Process multiple URLs concurrently with advanced error handling"""
        results = {
            "successful": [],
            "failed": [],
            "metadata": {
                "total_urls": len(urls),
                "start_time": time.strftime("%Y-%m-%d %H:%M:%S")
            }
        }

        try:
            # Validate URLs first
            valid_urls = []
            for url in urls:
                validation_result = self.url_processor.validate_url(url)
                if validation_result['is_valid']:
                    valid_urls.append(url)
                else:
                    results['failed'].append({
                        "url": url,
                        "error": validation_result['message']
                    })

            # Process valid URLs concurrently
            futures = []
            for url in valid_urls:
                future = self.executor.submit(self._process_single_url, url)
                futures.append((url, future))

            # Collect results
            for url, future in futures:
                try:
                    result = future.result(timeout=30)  # 30-second timeout
                    if result["success"]:
                        results["successful"].append(result)
                    else:
                        results["failed"].append({
                            "url": url,
                            "error": "Processing failed"
                        })
                except Exception as e:
                    logger.error(f"Error processing {url}: {e}")
                    results["failed"].append({
                        "url": url,
                        "error": str(e)
                    })

            # Update metadata
            results["metadata"].update({
                "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                "successful_count": len(results["successful"]),
                "failed_count": len(results["failed"])
            })

            return results

        except Exception as e:
            logger.error(f"Batch processing error: {e}")
            raise

    def _process_single_url(self, url: str) -> Dict:
        """Process a single URL with comprehensive error handling"""
        try:
            response = self.url_processor.session.get(
                url,
                timeout=self.url_processor.timeout
            )
            response.raise_for_status()

            result = self.content_extractor.extract_text(
                response.text,
                url
            )

            result["url"] = url
            result["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")

            return result

        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            return {
                "success": False,
                "url": url,
                "error": str(e)
            }

def create_interface():
    """Create Gradio interface with advanced features"""
    processor = ContentProcessor()

    with gr.Blocks(title="Advanced URL Content Processor") as interface:
        gr.Markdown("# Advanced URL Content Processor")

        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(
                    label="Enter URLs (one per line)",
                    placeholder="https://example.com\nhttps://example.org",
                    lines=5
                )

                with gr.Row():
                    process_btn = gr.Button("Process URLs", variant="primary")
                    clear_btn = gr.Button("Clear")

            with gr.Column():
                status_output = gr.JSON(
                    label="Processing Results",
                    show_label=True
                )

        gr.Markdown("## Processing Status")
        with gr.Row():
            progress_output = gr.Textbox(
                label="Progress",
                show_label=True
            )

        async def process_urls(urls):
            if not urls.strip():
                return {"error": "No URLs provided"}

            url_list = [url.strip() for url in urls.splitlines() if url.strip()]
            results = await processor.process_urls(url_list)
            return results

        def clear_inputs():
            return None, None

        process_btn.click(
            fn=process_urls,
            inputs=[url_input],
            outputs=[status_output]
        )

        clear_btn.click(
            fn=clear_inputs,
            inputs=[],
            outputs=[url_input, status_output]
        )

    return interface

if __name__ == "__main__":
    # Initialize mimetypes
    mimetypes.init()

    # Create and launch interface
    interface = create_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )
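
For a quick sanity check of the new pipeline without launching the UI, something along these lines should work (a minimal sketch: smoke_test.py and the URLs are hypothetical, and it assumes the file above is saved as app.py on the import path):

# smoke_test.py (hypothetical) -- drive ContentProcessor without Gradio.
import asyncio

from app import ContentProcessor  # assumes the file above is app.py

async def main():
    processor = ContentProcessor()
    results = await processor.process_urls([
        "https://example.com",  # should validate and be fetched
        "not-a-url",            # should land in results["failed"]
    ])
    # metadata carries totals plus start/end timestamps
    print(results["metadata"])

if __name__ == "__main__":
    asyncio.run(main())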