acecalisto3 committed
Commit c92df66 · verified · 1 Parent(s): 163699e

Update app.py

Files changed (1):
  1. app.py +110 -183
app.py CHANGED
@@ -4,16 +4,22 @@ import re
 import time
 import logging
 import mimetypes
+import concurrent.futures
+import string
 import zipfile
+import tempfile
 from datetime import datetime
 from typing import List, Dict, Optional, Union
 from pathlib import Path
+from urllib.parse import urlparse
+
 import requests
 import validators
-
 import gradio as gr
+from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
+from ratelimit import limits, sleep_and_retry
 from cleantext import clean
 
 # Setup logging with detailed configuration
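
This hunk only pulls in the new dependencies; none of the hunks below wire up Cache or the ratelimit decorators. A minimal sketch of how they are typically composed around a fetch — the cache path, expiry, and call limits here are illustrative assumptions, not values from this commit:

import requests
from diskcache import Cache
from ratelimit import limits, sleep_and_retry

cache = Cache('.url_cache')  # hypothetical on-disk cache location

@sleep_and_retry                 # block until a slot frees up instead of raising
@limits(calls=10, period=60)     # at most 10 requests per minute
def fetch_cached(url: str) -> str:
    cached = cache.get(url)      # diskcache returns None on a miss
    if cached is not None:
        return cached
    text = requests.get(url, timeout=10).text
    cache.set(url, text, expire=3600)  # keep for an hour
    return text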
@@ -80,10 +86,15 @@ class URLProcessor:
     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
+            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
+
+            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
+
+            # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
@@ -96,7 +107,7 @@ class URLProcessor:
         if not file_id:
             logger.error(f"Invalid Google Drive URL: {url}")
             return None
-
+
         direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
         response = self.session.get(direct_url, timeout=self.timeout)
         response.raise_for_status()
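
The file_id match comes from a regex applied above this hunk (not shown in the diff). A common pattern for pulling the id out of a Drive share link, written here as an assumption rather than the commit's own regex:

import re

def drive_file_id(url: str):
    # Capture group 1 holds the id, matching the file_id.group(1) usage above
    return re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)

m = drive_file_id("https://drive.google.com/file/d/abc123XYZ/view")
if m:
    direct_url = f"https://drive.google.com/uc?export=download&id={m.group(1)}"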
@@ -132,151 +143,108 @@ class URLProcessor:
 
             soup = BeautifulSoup(response.text, 'html.parser')
 
+            # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
-
+
+            # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
 
-            if main_content:
-                text_content = main_content.get_text(separator='\n', strip=True)
-                cleaned_content = self.advanced_text_cleaning(text_content)
-
-                return {
-                    'content': cleaned_content,
-                    'content_type': response.headers.get('Content-Type', ''),
-                    'timestamp': datetime.now().isoformat()
-                }
-            else:
-                logger.warning(f"No main content found for URL: {url}")
-                return None
-
+            # Clean and structure content
+            text_content = main_content.get_text(separator='\n', strip=True)
+            cleaned_content = self.advanced_text_cleaning(text_content)
+
+            return {
+                'content': cleaned_content,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
 
 class FileProcessor:
     """Class to handle file processing"""
-
-    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
+
+    def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-        self.processed_zip_count = 0
-        self.max_zip_files = 5
 
-    def is_text_file(self, file_path: str) -> bool:
-        """Check if the file is a text file based on its extension."""
-        return any(file_path.lower().endswith(ext) for ext in self.supported_text_extensions)
-
-
-    def validate_filepath(path: Path) -> bool:
-        """Validate file exists and has supported extension"""
+    def is_text_file(self, filepath: str) -> bool:
+        """Check if file is a text file"""
         try:
-            return path.exists() and path.is_file() and path.suffix.lower() in valid_extensions
-        except Exception as e:
-            logger.error(f"Validation error for {path}: {str(e)}")
+            mime_type, _ = mimetypes.guess_type(filepath)
+            return (mime_type and mime_type.startswith('text/')) or \
+                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
+        except Exception:
             return False
-
-    def process_files(base_path: str = "/app/data") -> list:
-        """Process files with validation and error handling"""
-        combined_data = []
-
-        base_dir = Path(base_path)
-        if not base_dir.exists():
-            base_dir.mkdir(parents=True, exist_ok=True)
-            logger.info(f"Created data directory at {base_dir}")
-
-        for item in base_dir.glob('**/*'):
-            try:
-                # Skip directories immediately
-                if item.is_dir():
-                    logger.debug(f"Skipping directory: {item}")
-                    continue
-
-                # Validate file using shared function
-                if not validate_filepath(item):
-                    logger.warning(f"Invalid file skipped: {item}")
-                    continue
-
-                logger.info(f"Processing valid file: {item.name}")
-
-                # Add actual processing logic here
-                file_data = process_single_file(item)  # Your processing function
-                combined_data.append(file_data)
-
-            except Exception as e:
-                logger.error(f"Failed processing {item}: {str(e)}")
-                continue
-
-        return combined_data
-
-    def process_single_file(file_path: Path) -> dict:
-        """Example processing function"""
-        # Add your actual file processing logic here
-        return {
-            'filename': file_path.name,
-            'content': "processed content",  # Replace with real content
-            'metadata': {}  # Add actual metadata
-        }
-        except Exception as e:
-            logger.error(f"File processing error: {e}")
+
+    def process_file(self, file) -> List[Dict]:
+        """Process uploaded file with enhanced error handling"""
+        if not file:
             return []
-
-    def _process_zip_file(self, zip_file_path: str) -> List[Dict]:
-        """Process a ZIP file and extract data from text files within."""
-        extracted_data = []
+
+        dataset = []
         try:
-            with zipfile.ZipFile(zip_file_path, 'r') as zf:
-                for name in zf.namelist():
-                    if self.is_text_file(name):
-                        try:
-                            file_info = zf.getinfo(name)
-                            with zf.open(name) as f:
-                                content = f.read().decode('utf-8', errors='ignore')
-
-                            # Use file_info for file size and date/time
-                            extracted_data.append({
-                                'source': 'zip',
-                                'filename': name,
-                                'file_size': file_info.file_size,  # Get file size from ZipInfo
-                                'mime_type': mimetypes.guess_type(name)[0],
-                                'created': datetime(*file_info.date_time).isoformat(),  # Get date from ZipInfo
-                                'modified': datetime(*file_info.date_time).isoformat(),
-                                'content': content,
-                                'timestamp': datetime.now().isoformat()
-                            })
-                        except Exception as e:
-                            logger.error(f"Error processing file {name} from ZIP: {e}")
-        except zipfile.BadZipFile:
-            logger.error(f"Error: {zip_file_path} is not a valid ZIP file.")
-        except Exception as e:
-            logger.error(f"Error processing ZIP file {zip_file_path}: {e}")
-        return extracted_data
-
+            file_size = os.path.getsize(file.name)
+            if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                return []
 
-class Chatbot:
-    """Simple chatbot that uses provided JSON data for responses."""
-
-    def __init__(self):
-        self.data = None
+            with tempfile.TemporaryDirectory() as temp_dir:
+                if zipfile.is_zipfile(file.name):
+                    dataset.extend(self._process_zip_file(file.name, temp_dir))
+                else:
+                    dataset.extend(self._process_single_file(file))
 
-    def load_data(self, json_data: str):
-        """Load JSON data into the chatbot."""
-        try:
-            self.data = json.loads(json_data)
-            return "Data loaded successfully!"
-        except json.JSONDecodeError:
-            return "Invalid JSON data. Please check your input."
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
 
-    def chat(self, user_input: str) -> str:
-        """Generate a response based on user input and loaded data."""
-        if not self.data:
-            return "No data loaded. Please load your JSON data first."
+        return dataset
 
-        for key, value in self.data.items():
-            if key.lower() in user_input.lower():
-                return f"{key}: {value}"
-
-        return "I don't have information on that. Please ask about something else."
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Process ZIP file contents"""
+        results = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
+
+    def _process_single_file(self, file) -> List[Dict]:
+        try:
+            file_stat = os.stat(file.name)
+            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"File processing error: {e}")
+            return []
 
 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
@@ -292,7 +260,7 @@ def create_interface():
 
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
+                label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
@@ -305,31 +273,16 @@ def create_interface():
 
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
-                label="Raw Text Input",
+                label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
 
-        with gr.Tab("Chat"):
-            json_input = gr.Textbox(
-                label="Load JSON Data",
-                placeholder="Paste your JSON data here...",
-                lines=5
-            )
-            load_btn = gr.Button("Load Data", variant="primary")
-            chat_input = gr.Textbox(
-                label="Chat with your data",
-                placeholder="Type your question here..."
-            )
-            chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
-
         process_btn = gr.Button("Process Input", variant="primary")
 
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
 
-        chatbot = Chatbot()
-
         def process_all_inputs(urls, file, text):
             """Process all input types with progress tracking"""
             try:
@@ -337,6 +290,7 @@ def create_interface():
                 file_processor = FileProcessor()
                 results = []
 
+                # Process URLs
                 if urls:
                     url_list = re.split(r'[,\n]', urls)
                     url_list = [url.strip() for url in url_list if url.strip()]
@@ -352,10 +306,12 @@ def create_interface():
                             'content': content,
                             'timestamp': datetime.now().isoformat()
                         })
-
-                if file:
-                    results.extend(file_processor.process_files(file))
 
+                # Process files
+                if file:
+                    results.extend(file_processor.process_file(file))
+
+                # Process text input
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
@@ -364,6 +320,7 @@ def create_interface():
                         'timestamp': datetime.now().isoformat()
                     })
 
+                # Generate output
                 if results:
                     output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
                     output_dir.mkdir(parents=True, exist_ok=True)
@@ -373,6 +330,7 @@ def create_interface():
                         json.dump(results, f, ensure_ascii=False, indent=2)
 
                     summary = f"Processed {len(results)} items successfully!"
+                    # Convert Path object to string here
                     return str(output_path), summary
                 else:
                     return None, "No valid content to process."
@@ -380,52 +338,23 @@ def create_interface():
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}"
-
-        def load_chat_data(json_data):
-            """Load JSON data into the chatbot."""
-            return chatbot.load_data(json_data)
-
-        def chat_with_data(user_input):
-            """Chat with the loaded data."""
-            return chatbot.chat(user_input)
-
+
         process_btn.click(
-            process_all_inputs,
-            inputs=[url_input, file_input, text_input],
+            process_all_inputs,
+            inputs=[url_input, file_input, text_input],
             outputs=[output_file, output_text]
         )
-
-        load_btn.click(
-            load_chat_data,
-            inputs=json_input,
-            outputs=chat_output
-        )
-
-        chat_input.submit(
-            chat_with_data,
-            inputs=chat_input,
-            outputs=chat_output
-        )
 
         gr.Markdown("""
         ### Usage Guidelines
         - **URL Processing**: Enter valid HTTP/HTTPS URLs
         - **File Input**: Upload text files or ZIP archives
         - **Text Input**: Direct text processing
-        - **Chat**: Load JSON data and ask questions about it
         - Advanced cleaning and validation included
         """)
 
     return interface
 
-def gradio_interface_handler(input_path: str):
-    """Example Gradio handler function"""
-    if not validate_filepath(Path(input_path)):
-        raise ValueError("Invalid file path provided")
-
-    processed_data = process_files(input_path)
-    return format_output(processed_data)
-
 def main():
     # Configure system settings
     mimetypes.init()
@@ -437,10 +366,8 @@ def main():
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        show_error=True,
         share=False,
-        inbrowser=False,  # Disable browser opening in container
-        debug=False  # Disable debug mode for production
+        inbrowser=True,
+        debug=True
     )
-
-if __name__ == "__main__":
-    main()
 
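Downstream of the interface, results are written as dated JSON under output/. A sketch of reading them back — the output/<date> directory comes from the code above, while the exact filename is built outside the hunks shown, hence the glob:

import json
from pathlib import Path
from datetime import datetime

out_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
for path in sorted(out_dir.glob('*.json')):
    with open(path, encoding='utf-8') as f:
        items = json.load(f)
    print(path.name, f"{len(items)} item(s)")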