Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Mar 26

Commit

7d538d5

verified ·

1 Parent(s): faa89e2

Update app.py

Browse files

Files changed (1) hide show

app.py +373 -224

app.py CHANGED Viewed

@@ -1,279 +1,428 @@
-"""
-Advanced URL & Text Processing Suite - Main Application
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-A sophisticated Gradio interface with URL processing, file manipulation, QR operations,
-and advanced data chat capabilities.
-"""
-import gradio as gr
-import logging
 import json
 import os
-import sys
-import zipfile
-import pandas as pd
-import numpy as np
 from datetime import datetime
 from pathlib import Path
-from typing import Dict, List, Optional, Union, Any, Tuple
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
-    format='%(asctime)s.%(msecs)03d [%(levelname)s] %(name)s - %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
 )
 logger = logging.getLogger(__name__)
-# Modern UI Configuration
-THEME = gr.themes.Soft(
-    primary_hue="indigo",
-    secondary_hue="blue",
-    neutral_hue="slate",
-    spacing_size=gr.themes.sizes.spacing_md,
-    radius_size=gr.themes.sizes.radius_md,
-    text_size=gr.themes.sizes.text_md,
-)
-class DataChatProcessor:
     def __init__(self):
-        self.trained_data = {}
-        self.current_dataset = None
-    def process_zip_file(self, file_obj, mode):
         try:
-            if not file_obj:
-                return "Please upload a ZIP file", []
-            # Extract ZIP contents
-            with zipfile.ZipFile(file_obj.name, 'r') as zip_ref:
-                temp_dir = Path('temp_data')
-                temp_dir.mkdir(exist_ok=True)
-                zip_ref.extractall(temp_dir)
-            # Process based on mode
-            if mode == "TrainedOnData":
-                return self._train_on_data(temp_dir)
-            else:  # TalkAboutData
-                return self._analyze_data(temp_dir)
         except Exception as e:
-            logger.error(f"Error processing ZIP file: {e}")
-            return f"Error: {str(e)}", []
-    def _train_on_data(self, data_dir):
         try:
-            datasets = []
-            for file in data_dir.glob('**/*.csv'):
-                df = pd.read_csv(file)
-                datasets.append({
-                    'name': file.name,
-                    'data': df,
-                    'summary': {
-                        'rows': len(df),
-                        'columns': len(df.columns),
-                        'dtypes': df.dtypes.astype(str).to_dict()
-                    }
-                })
-            self.trained_data = {
-                'datasets': datasets,
                 'timestamp': datetime.now().isoformat()
             }
-            summary = f"Trained on {len(datasets)} datasets"
-            messages = [
-                {"role": "assistant", "content": "Training completed successfully."},
-                {"role": "assistant", "content": summary}
-            ]
-            return summary, messages
         except Exception as e:
-            logger.error(f"Error training on data: {e}")
-            return f"Error during training: {str(e)}", []
-    def _analyze_data(self, data_dir):
         try:
-            analyses = []
-            for file in data_dir.glob('**/*.csv'):
-                df = pd.read_csv(file)
-                analyses.append({
-                    'file': file.name,
-                    'shape': df.shape,
-                    'dtypes': df.dtypes.astype(str).to_dict()
-                })
-            self.current_dataset = {
-                'analyses': analyses,
                 'timestamp': datetime.now().isoformat()
             }
-            summary = f"Analyzed {len(analyses)} files"
-            messages = [
-                {"role": "assistant", "content": "Analysis completed successfully."},
-                {"role": "assistant", "content": summary}
-            ]
-            return summary, messages
         except Exception as e:
-            logger.error(f"Error analyzing data: {e}")
-            return f"Error during analysis: {str(e)}", []
-    def chat(self, message, history, mode):
-        if not message:
-            return "", history
-        history.append({"role": "user", "content": message})
         try:
-            if mode == "TrainedOnData":
-                if not self.trained_data:
-                    response = "Please upload and train on data first."
-                else:
-                    response = self._generate_trained_response(message)
-            else:
-                if not self.current_dataset:
-                    response = "Please upload data for analysis first."
                 else:
-                    response = self._generate_analysis_response(message)
-            history.append({"role": "assistant", "content": response})
-            return "", history
         except Exception as e:
-            logger.error(f"Error in chat: {e}")
-            history.append({"role": "assistant", "content": f"Error: {str(e)}"})
-            return "", history
-    def _generate_trained_response(self, message):
-        datasets = self.trained_data['datasets']
-        if "how many" in message.lower():
-            return f"There are {len(datasets)} datasets."
-        if "summary" in message.lower():
-            summaries = []
-            for ds in datasets:
-                summaries.append(
-                    f"Dataset '{ds['name']}': {ds['summary']['rows']} rows, "
-                    f"{ds['summary']['columns']} columns"
-                )
-            return "\n".join(summaries)
-        return "I can help you analyze the trained datasets. Ask about number of datasets or summaries."
-    def _generate_analysis_response(self, message):
-        analyses = self.current_dataset['analyses']
-        if "how many" in message.lower():
-            return f"There are {len(analyses)} files."
-        if "summary" in message.lower():
-            summaries = []
-            for analysis in analyses:
-                summaries.append(
-                    f"File '{analysis['file']}': {analysis['shape'][0]} rows, "
-                    f"{analysis['shape'][1]} columns"
-                )
-            return "\n".join(summaries)
-        return "I can help you explore the current dataset. Ask about file count or summaries."
-def create_interface():
-    data_chat = DataChatProcessor()
-    with gr.Blocks(theme=THEME) as interface:
-        gr.Markdown(
-            """
-            # 🌐 Advanced Data Processing & Analysis Suite
-            Enterprise-grade toolkit for data processing, analysis, and interactive chat capabilities.
-            """
-        )
-        with gr.Tab("💬 DataChat"):
-            with gr.Row():
-                # Left column for file upload and mode selection
-                with gr.Column(scale=1):
-                    data_file = gr.File(
-                        label="Upload ZIP File",
-                        file_types=[".zip"]
-                    )
-                    mode = gr.Radio(
-                        choices=["TrainedOnData", "TalkAboutData"],
-                        value="TrainedOnData",
-                        label="Chat Mode"
-                    )
-                    process_btn = gr.Button("Process Data", variant="primary")
-                    status_output = gr.Textbox(
-                        label="Status",
-                        interactive=False
-                    )
-                # Right column for chat interface
-                with gr.Column(scale=2):
-                    chatbot = gr.Chatbot(
-                        label="Chat History",
-                        height=400,
-                        show_label=True,
-                        type="messages"  # Specify OpenAI-style message format
-                    )
-                    msg = gr.Textbox(
-                        label="Your Message",
-                        placeholder="Ask questions about your data...",
-                        lines=2
-                    )
-                    with gr.Row():
-                        submit_btn = gr.Button("Send", variant="primary")
-                        clear_btn = gr.Button("Clear Chat", variant="secondary")
-        # Event handlers
-        process_btn.click(
-            fn=data_chat.process_zip_file,
-            inputs=[data_file, mode],
-            outputs=[status_output, chatbot]
-        )
-        submit_btn.click(
-            fn=data_chat.chat,
-            inputs=[msg, chatbot, mode],
-            outputs=[msg, chatbot]
-        )
-        msg.submit(
-            fn=data_chat.chat,
-            inputs=[msg, chatbot, mode],
-            outputs=[msg, chatbot]
         )
-        clear_btn.click(
-            fn=lambda: ([], "Chat cleared"),
-            outputs=[chatbot, status_output]
         )
-        return interface
 def main():
-    try:
-        interface = create_interface()
-        if interface:
-            interface.launch(
-                server_name="0.0.0.0",
-                server_port=8000
-            )
-        else:
-            logger.error("Failed to create interface")
-            sys.exit(1)
-    except Exception as e:
-        logger.error(f"Application startup error: {e}", exc_info=True)
-        sys.exit(1)
 if __name__ == "__main__":
-    main()

 import json
 import os
+import re
+import time
+import logging
+import mimetypes
+import tempfile
+import zipfile
 from datetime import datetime
 from pathlib import Path
+from urllib.parse import urlparse
+from typing import List, Dict, Tuple, Union, Optional
+import requests
+import validators
+import gradio as gr
+from diskcache import Cache
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
+from cleantext import clean
+import qrcode
+# Setup logging with detailed configuration
 logging.basicConfig(
     level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('app.log', encoding='utf-8')
+    ]
 )
 logger = logging.getLogger(__name__)
+class URLProcessor:
+    """Validate URLs and fetch/clean their content through one shared requests session."""
     def __init__(self):
+        """Create the HTTP session, set the request timeout, and install browser-like headers."""
+        self.session = requests.Session()
+        self.timeout = 10  # seconds
+        self.session.headers.update({
+            'User-Agent': UserAgent().random,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        })
+    def advanced_text_cleaning(self, text: str) -> str:
+        """Clean ``text`` with ``cleantext.clean``; fall back to regex cleanup if that raises.
+
+        The fallback strips control characters, drops non-ASCII characters,
+        and collapses runs of whitespace into single spaces.
+        """
+        try:
+            cleaned_text = clean(
+                text,
+                fix_unicode=True,
+                to_ascii=True,
+                lower=True,
+                no_line_breaks=True,
+                no_urls=True,
+                no_emails=True,
+                no_phone_numbers=True,
+                no_numbers=False,
+                no_digits=False,
+                no_currency_symbols=True,
+                no_punct=False
+            ).strip()
+            return cleaned_text
+        except Exception as e:
+            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
+            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
+            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
+            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+            return text.strip()
+    def validate_url(self, url: str) -> Dict:
+        """Check URL syntax, then probe it with a HEAD request.
+
+        Returns a dict with ``is_valid`` (bool) and a human-readable ``message``.
+        Any exception, including an HTTP error status, is reported as invalid.
+        """
         try:
+            if not validators.url(url):
+                return {'is_valid': False, 'message': 'Invalid URL format'}
+            response = self.session.head(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {'is_valid': True, 'message': 'URL is valid and accessible'}
+        except Exception as e:
+            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        """Fetch ``url``, routing Google Drive and Google Calendar links to dedicated handlers.
+
+        Returns the handler's result dict, or None if the handler fails or raises.
+        """
+        try:
+            # Google Drive document handling
+            if 'drive.google.com' in url:
+                return self._handle_google_drive(url)
+            # Google Calendar ICS handling
+            if 'calendar.google.com' in url and 'ical' in url:
+                return self._handle_google_calendar(url)
+            # Standard HTML processing
+            return self._fetch_html_content(url)
         except Exception as e:
+            logger.error(f"Content fetch failed: {e}")
+            return None
+    def _handle_google_drive(self, url: str) -> Optional[Dict]:
+        """Download a Google Drive file given a ``/file/d/<id>`` style link.
+
+        Returns a dict with the response text, its Content-Type header, and a
+        timestamp, or None when the link has no file id or the request fails.
+        """
         try:
+            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
+            if not file_id:
+                logger.error(f"Invalid Google Drive URL: {url}")
+                return None
+            # Rewrite the share link into the direct-download endpoint.
+            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+            response = self.session.get(direct_url, timeout=self.timeout)
+            response.raise_for_status()
+            return {
+                'content': response.text,
+                'content_type': response.headers.get('Content-Type', ''),
                 'timestamp': datetime.now().isoformat()
             }
         except Exception as e:
+            logger.error(f"Google Drive processing failed: {e}")
+            return None
+    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+        """Fetch a Google Calendar ICS feed and return its raw text.
+
+        The content type is reported as ``text/calendar`` regardless of the
+        response headers. Returns None when the request fails.
+        """
         try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {
+                'content': response.text,
+                'content_type': 'text/calendar',
                 'timestamp': datetime.now().isoformat()
             }
+        except Exception as e:
+            logger.error(f"Calendar fetch failed: {e}")
+            return None
+    def _fetch_html_content(self, url: str) -> Optional[Dict]:
+        """Fetch an HTML page and return its cleaned main text.
+
+        Returns a dict with the cleaned text, the Content-Type header, and a
+        timestamp, or None when the request or parsing fails.
+        """
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
+                element.decompose()
+            # Extract main content
+            # NOTE(review): main_content is presumably None when the page has no
+            # <main>, <article>, or <body>; the resulting AttributeError would be
+            # caught by the except below and reported as a failed fetch.
+            main_content = soup.find('main') or soup.find('article') or soup.body
+            # Clean and structure content
+            text_content = main_content.get_text(separator='\n', strip=True)
+            cleaned_content = self.advanced_text_cleaning(text_content)
+            return {
+                'content': cleaned_content,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
         except Exception as e:
+            logger.error(f"HTML processing failed: {e}")
+            return None
+class FileProcessor:
+    """Convert uploaded text files and ZIP archives into dataset records."""
+    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
+        """Store the upload size limit (bytes) and the extensions treated as text."""
+        self.max_file_size = max_file_size
+        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+    def is_text_file(self, filepath: str) -> bool:
+        """Return True if the guessed MIME type is text/* or the extension is supported."""
         try:
+            mime_type, _ = mimetypes.guess_type(filepath)
+            return (mime_type and mime_type.startswith('text/')) or \
+                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
+        except Exception:
+            return False
+    def process_file(self, file) -> List[Dict]:
+        """Process an uploaded file and return a list of content records.
+
+        ``file`` is expected to expose the upload's path as ``file.name``.
+        ZIP archives are expanded and each non-empty text member becomes a
+        record; anything else is read as a single text file. Returns an empty
+        list when no file is given, the size limit is exceeded, or an error
+        occurs (the error is logged).
+        """
+        if not file:
+            return []
+        dataset = []
+        try:
+            file_size = os.path.getsize(file.name)
+            if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                return []
+            with tempfile.TemporaryDirectory() as temp_dir:
+                if zipfile.is_zipfile(file.name):
+                    dataset.extend(self._process_zip_file(file.name, temp_dir))
                 else:
+                    dataset.extend(self._process_single_file(file))
         except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
+        return dataset
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Extract ``zip_path`` into ``temp_dir`` and collect its non-empty text files.
+
+        Returns one record per readable text file. Archives whose declared
+        uncompressed size exceeds ``max_file_size`` are rejected with an
+        empty result.
+        """
+        results = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            # The upload limit only covers the compressed size; refuse archives
+            # that would expand beyond it before writing anything to disk.
+            total_size = sum(info.file_size for info in zip_ref.infolist())
+            if total_size > self.max_file_size:
+                logger.warning(f"ZIP contents ({total_size} bytes) exceed maximum allowed size")
+                return results
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            # One unreadable member should not discard the rest.
+                            logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
+    def _process_single_file(self, file) -> List[Dict]:
+        """Read one uploaded file and return a single-record list with its metadata.
+
+        Files larger than 100MB are truncated to their first and last 1MB with
+        a marker in between. Returns an empty list if the file cannot be read.
+        """
+        try:
+            file_stat = os.stat(file.name)
+            # For very large files, keep only the head and the tail
+            if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
+                logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
+                chunk_size = 1 * 1024 * 1024  # 1MB
+                # Read in binary mode: seeking to an arbitrary byte offset is
+                # not supported on text-mode files.
+                with open(file.name, 'rb') as f:
+                    head = f.read(chunk_size)
+                    f.seek(max(0, file_stat.st_size - chunk_size))
+                    tail = f.read()
+                content = (
+                    head.decode('utf-8', errors='ignore')
+                    + "\n...[Content truncated due to large file size]...\n"
+                    + tail.decode('utf-8', errors='ignore')
+                )
+            else:
+                # Regular file processing
+                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"File processing error: {e}")
+            return []
+def generate_qr_code(json_data):
+    """Generate a QR code image from JSON data and return its file path.
+
+    The image is always written to ``output/qr_code.png``, so each call
+    overwrites the previous one.
+    """
+    qr = qrcode.make(json_data)
+    qr_path = "output/qr_code.png"
+    # The output directory may not exist yet if nothing has been processed.
+    os.makedirs(os.path.dirname(qr_path), exist_ok=True)
+    qr.save(qr_path)
+    return qr_path
+def create_interface():
+    """Build and return the Gradio Blocks interface.
+
+    The interface has tabs for URL, file, text, JSON editor, and scratchpad
+    input, a button that processes the inputs into a JSON file, and a button
+    that renders the JSON editor's contents as a QR code.
+    """
+    css = """
+    .container { max-width: 1200px; margin: auto; }
+    .warning { background-color: #fff3cd; color: #856404; }
+    .error { background-color: #f8d7da; color: #721c24; }
+    """
+    with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
+        gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
+        with gr.Tab("URL Processing"):
+            url_input = gr.Textbox(
+                label="Enter URLs (comma or newline separated)",
+                lines=5,
+                placeholder="https://example1.com\nhttps://example2.com"
+            )
+        with gr.Tab("File Input"):
+            file_input = gr.File(
+                label="Upload text file or ZIP archive",
+                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
+            )
+        with gr.Tab("Text Input"):
+            text_input = gr.Textbox(
+                label="Raw Text Input",
+                lines=5,
+                placeholder="Paste your text here..."
+            )
+        with gr.Tab("JSON Editor"):
+            json_editor = gr.Textbox(
+                label="JSON Editor",
+                lines=20,
+                placeholder="View and edit your JSON data here...",
+                interactive=True,
+                elem_id="json-editor"  # Optional: for custom styling
+            )
+        with gr.Tab("Scratchpad"):
+            scratchpad = gr.Textbox(
+                label="Scratchpad",
+                lines=10,
+                placeholder="Quick notes or text collections...",
+                interactive=True
+            )
+        process_btn = gr.Button("Process Input", variant="primary")
+        qr_btn = gr.Button("Generate QR Code", variant="secondary")
+        output_text = gr.Textbox(label="Processing Results", interactive=False)
+        output_file = gr.File(label="Processed Output")
+        qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code
+        def process_all_inputs(urls, file, text, notes):
+            """Process the URL, file, and text inputs and write the results to a JSON file.
+
+            Returns ``(output_path, summary, json_text)``; ``output_path`` is
+            None when nothing was produced or an error occurred.
+            NOTE(review): ``notes`` (the scratchpad) is accepted but never read.
+            """
+            try:
+                processor = URLProcessor()
+                file_processor = FileProcessor()
+                results = []
+                # Process URLs
+                if urls:
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+                    for url in url_list:
+                        validation = processor.validate_url(url)
+                        # URLs that fail validation or fetching are skipped silently.
+                        if validation.get('is_valid'):
+                            content = processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+                # Process files
+                if file:
+                    results.extend(file_processor.process_file(file))
+                # Process text input
+                if text:
+                    cleaned_text = processor.advanced_text_cleaning(text)
+                    results.append({
+                        'source': 'direct_input',
+                        'content': cleaned_text,
+                        'timestamp': datetime.now().isoformat()
+                    })
+                # Generate output
+                if results:
+                    # Results are grouped in one directory per calendar day.
+                    output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
+                    output_dir.mkdir(parents=True, exist_ok=True)
+                    output_path = output_dir / f'processed_{int(time.time())}.json'
+                    with open(output_path, 'w', encoding='utf-8') as f:
+                        json.dump(results, f, ensure_ascii=False, indent=2)
+                    summary = f"Processed {len(results)} items successfully!"
+                    json_data = json.dumps(results, indent=2)  # Prepare JSON for QR code
+                    return str(output_path), summary, json_data  # Return JSON for editor
+                else:
+                    return None, "No valid content to process.", ""
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, f"Error: {str(e)}", ""
+        def generate_qr(json_data):
+            """Generate QR code from JSON data and return the file path."""
+            if json_data:
+                return generate_qr_code(json_data)
+            return None
+        # Output order must match the tuple returned by process_all_inputs.
+        process_btn.click(
+            process_all_inputs,
+            inputs=[url_input, file_input, text_input, scratchpad],
+            outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
         )
+        qr_btn.click(
+            generate_qr,
+            inputs=json_editor,
+            outputs=qr_output
         )
+        gr.Markdown("""
+        ### Usage Guidelines
+        - **URL Processing**: Enter valid HTTP/HTTPS URLs
+        - **File Input**: Upload text files or ZIP archives
+        - **Text Input**: Direct text processing
+        - **JSON Editor**: View and edit your JSON data
+        - **Scratchpad**: Quick notes or text collections
+        - Advanced cleaning and validation included
+        """)
+    return interface
 def main():
+    """Initialise the MIME type tables, build the Gradio interface, and launch it."""
+    # Configure system settings
+    mimetypes.init()
+    # Create and launch interface
+    interface = create_interface()
+    # Launch with proper configuration
+    # Binds to all network interfaces on port 7860.
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        share=False,
+        inbrowser=True,
+        debug=True
+    )
 if __name__ == "__main__":
+    main()