Update app.py
app.py CHANGED
@@ -1,323 +1,48 @@
-# stdlib imports (restored; the file's first lines were lost in extraction, but these modules are all used below)
-import logging
-import re
-import os
-import json
-import time
-import zipfile
-import tempfile
-import mimetypes
-from datetime import datetime
-import concurrent.futures
-import string
-from typing import List, Dict, Optional, Union
-from pathlib import Path
-from urllib.parse import urlparse
-
-import requests
-import validators
-import gradio as gr
-from diskcache import Cache
-from bs4 import BeautifulSoup
-from fake_useragent import UserAgent
-from ratelimit import limits, sleep_and_retry
-from cleantext import clean
-
-# Setup logging with detailed configuration
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler('app.log', encoding='utf-8')
-    ]
-)
-logger = logging.getLogger(__name__)
-
-class URLProcessor:
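-    # NOTE (reconstruction): no __init__ appears in this hunk, yet the methods
-    # below rely on self.session, self.timeout, and a validate_url() helper.
-    # The class presumably set up something like the following (names and
-    # values are assumptions, not shown in the diff):
-    #     def __init__(self, timeout: int = 10):
-    #         self.session = requests.Session()
-    #         self.session.headers.update({'User-Agent': UserAgent().random})
-    #         self.timeout = timeout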
-    def advanced_text_cleaning(self, text: str) -> str:
-        """Robust text cleaning with version compatibility"""
-        try:
-            # Modern clean-text parameters
-            return clean(text,
-                         fix_unicode=True,
-                         to_ascii=True,
-                         lower=True,
-                         no_line_breaks=True,
-                         no_urls=True,
-                         no_emails=True,
-                         no_phone_numbers=True,
-                         no_numbers=False,
-                         no_digits=False,
-                         no_currency_symbols=True,
-                         no_punct=False
-                         ).strip()
-        except TypeError as e:
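-            # clean-text's keyword set has changed between releases; an older
-            # version raises TypeError on unknown parameters, which is what
-            # triggers the regex-based fallback below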
-            # Fallback to basic cleaning
-            logger.warning("Using fallback text cleaning method")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Control chars
-            text = text.encode('ascii', 'ignore').decode('ascii')  # Unicode
-            text = re.sub(r'\s+', ' ', text)  # Whitespace
-            return text.strip()
-
-    def fetch_content(self, url: str) -> Optional[Dict]:
-        """Universal content fetcher with special case handling"""
-        # Google Drive document handling
-        if 'drive.google.com' in url:
-            return self._handle_google_drive(url)
-
-        # Google Calendar ICS handling
-        if 'calendar.google.com' in url:
-            return self._handle_google_calendar(url)
-
-        # Standard HTML processing
-        return self._fetch_html_content(url)
-
-    def _handle_google_drive(self, url: str) -> Optional[Dict]:
-        """Process Google Drive file links"""
-        try:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if not file_id:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
-
-            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
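-            # Drive's uc?export=download endpoint serves the raw file bytes
-            # instead of the HTML preview page (for link-shared files)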
-            response = self.session.get(direct_url, timeout=self.timeout)
-            response.raise_for_status()
-
-            return {
-                'content': response.text,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"Google Drive processing failed: {e}")
-            return None
-
-    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
-        """Process Google Calendar ICS feeds"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-            return {
-                'content': response.text,
-                'content_type': 'text/calendar',
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"Calendar fetch failed: {e}")
-            return None
-
-    def _fetch_html_content(self, url: str) -> Optional[Dict]:
-        """Standard HTML content processing"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            soup = BeautifulSoup(response.text, 'html.parser')
-            # ... existing HTML processing logic ...
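-            # (assumed) the elided logic built structured_data from soup; a
-            # minimal stand-in so the return below is self-consistent:
-            structured_data = {
-                'content': soup.get_text(separator=' ', strip=True),
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }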
-            return structured_data
-        except Exception as e:
-            logger.error(f"HTML processing failed: {e}")
-            return None
-
-
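-# NOTE (reconstruction): the FileProcessor class header and __init__ were lost
-# from this hunk; the attributes used below imply roughly the following
-# (the 10 MB default and the MIME heuristic are assumptions):
-class FileProcessor:
-    def __init__(self, max_file_size: int = 10 * 1024 * 1024):
-        self.max_file_size = max_file_size
-
-    def is_text_file(self, filepath: str) -> bool:
-        mime_type, _ = mimetypes.guess_type(filepath)
-        return bool(mime_type and mime_type.startswith('text/'))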
-    def process_file(self, file) -> List[Dict]:
-        """Process uploaded file with enhanced error handling"""
-        if not file:
-            return []
-
-        dataset = []
-        try:
-            file_size = os.path.getsize(file.name)
-            if file_size > self.max_file_size:
-                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                return []
-
-            with tempfile.TemporaryDirectory() as temp_dir:
-                if zipfile.is_zipfile(file.name):
-                    dataset.extend(self._process_zip_file(file.name, temp_dir))
-                else:
-                    dataset.extend(self._process_single_file(file))
-
-        except Exception as e:
-            logger.error(f"Error processing file: {str(e)}")
-            return []
-
-        return dataset
-
-    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
-        """Process ZIP file contents"""
-        results = []
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(temp_dir)
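-            # caution: extractall() on an untrusted archive is exposed to
-            # zip-slip path traversal; member paths should be validated first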
-            for root, _, files in os.walk(temp_dir):
-                for filename in files:
-                    filepath = os.path.join(root, filename)
-                    if self.is_text_file(filepath):
-                        try:
-                            with open(filepath, 'r', errors='ignore') as f:
-                                content = f.read()
-                                if content.strip():
-                                    results.append({
-                                        "source": "file",
-                                        "filename": filename,
-                                        "content": content,
-                                        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-                                    })
-                        except Exception as e:
-                            logger.error(f"Error reading file {filename}: {str(e)}")
-        return results
-
-    def _process_single_file(self, file) -> List[Dict]:
-        try:
-            file_stat = os.stat(file.name)
-            content = file.read().decode('utf-8', errors='ignore')
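-            # assumes a Gradio file object exposing both .name and .read();
-            # newer Gradio versions hand back a plain filepath string instead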
-
-            return [{
-                'filename': os.path.basename(file.name),
-                'file_size': file_stat.st_size,
-                'mime_type': mimetypes.guess_type(file.name)[0],
-                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                'content': content,
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"File processing error: {e}")
-            return []
-
-def create_interface():
-    """Create a comprehensive Gradio interface with advanced features"""
-
-    css = """
-    .container { max-width: 1200px; margin: auto; }
-    .warning { background-color: #fff3cd; color: #856404; }
-    .error { background-color: #f8d7da; color: #721c24; }
-    """
-
-    with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
-        gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
-
-        with gr.Tab("URL Processing"):
-            url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
-                lines=5,
-                placeholder="https://example1.com\nhttps://example2.com"
-            )
-
-        with gr.Tab("File Input"):
-            file_input = gr.File(
-                label="Upload text file or ZIP archive",
-                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
-            )
-
-        with gr.Tab("Text Input"):
-            text_input = gr.Textbox(
-                label="Raw Text Input",
-                lines=5,
-                placeholder="Paste your text here..."
-            )
-
-        process_btn = gr.Button("Process Input", variant="primary")
-
-        output_text = gr.Textbox(label="Processing Results", interactive=False)
-        output_file = gr.File(label="Processed Output")
-
-        def process_all_inputs(urls, file, text):
-            """Process all input types with progress tracking"""
-            try:
-                processor = URLProcessor()
-                file_processor = FileProcessor()
-                results = []
-
-                # Process URLs
-                if urls:
-                    url_list = re.split(r'[,\n]', urls)
-                    url_list = [url.strip() for url in url_list if url.strip()]
-
-                    for url in url_list:
-                        validation = processor.validate_url(url)
-                        if validation.get('is_valid'):
-                            content = processor.fetch_content(url)
-                            if content:
-                                results.append({
-                                    'source': 'url',
-                                    'url': url,
-                                    'content': content,
-                                    'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
-                                })
-
-                # Process files
-                if file:
-                    results.extend(file_processor.process_file(file))
-
-                # Process text input
-                if text:
-                    cleaned_text = processor.advanced_text_cleaning(text)
-                    results.append({
-                        'source': 'direct_input',
-                        'content': cleaned_text,
-                        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
-                    })
-
-                # Generate output
-                if results:
-                    output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
-                    output_dir.mkdir(parents=True, exist_ok=True)
-                    output_path = output_dir / f'processed_{int(time.time())}.json'
-
-                    with open(output_path, 'w', encoding='utf-8') as f:
-                        json.dump(results, f, ensure_ascii=False, indent=2)
-
-                    summary = f"Processed {len(results)} items successfully!"
-                    return output_path, summary
-                else:
-                    return None, "No valid content to process."
-
-            except Exception as e:
-                return None, f"Error: {str(e)}"
-
-        process_btn.click(
-            process_all_inputs,
-            inputs=[url_input, file_input, text_input],
-            outputs=[output_file, output_text]
-        )
-
-        gr.Markdown("""
-        ### Usage Guidelines
-        - **URL Processing**: Enter valid HTTP/HTTPS URLs
-        - **File Input**: Upload text files or ZIP archives
-        - **Text Input**: Direct text processing
-        - Advanced cleaning and validation included
-        """)
-
-    return interface
-
-def main():
-    # Configure system settings
-    mimetypes.init()
-
-    # Create and launch interface
-    interface = create_interface()
-    interface.launch(
-        share=True,
-        server_name="0.0.0.0",
-        server_port=7860,
-        debug=True
-    )
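-    # share=True requests a public gradio.live tunnel on top of serving on
-    # 0.0.0.0:7860; convenient on a Space, worth disabling elsewhere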
-
-if __name__ == "__main__":
-    main()
+def process_all_inputs(urls, file, text):
+    """Process all input types with progress tracking"""
+    try:
+        processor = URLProcessor()
+        file_processor = FileProcessor()
+        results = []
+
+        # Process URLs
+        if urls:
+            url_list = re.split(r'[,\n]', urls)
+            url_list = [url.strip() for url in url_list if url.strip()]
+
+            for url in url_list:
+                validation = processor.validate_url(url)
+                if validation.get('is_valid'):
+                    content = processor.fetch_content(url)
+                    if content:
+                        results.append({
+                            'source': 'url',
+                            'url': url,
+                            'content': content,
+                            'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+                        })
+
+        # Process files
+        if file:
+            results.extend(file_processor.process_file(file))
+
+        # Process text input
+        if text:
+            cleaned_text = processor.advanced_text_cleaning(text)
+            results.append({
+                'source': 'direct_input',
+                'content': cleaned_text,
+                'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+            })
+
+        # Generate output
+        if results:
+            output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
+            output_dir.mkdir(parents=True, exist_ok=True)
+            output_path = output_dir / f'processed_{int(time.time())}.json'
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                json.dump(results, f, ensure_ascii=False, indent=2)
+
+            summary = f"Processed {len(results)} items successfully!"
+            return output_path, summary
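+        # NOTE (editor): as committed, the 48-line file ends here. The try:
+        # above has no except clause and no else branch for empty results,
+        # and the imports plus the URLProcessor/FileProcessor definitions and
+        # the Gradio interface from the removed version are gone, so this
+        # new app.py will not even parse, let alone run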