Spaces:

acecalisto3
/

urld

Running

App Files Community

acecalisto3 committed on Mar 22

Commit

fe14e10

verified ·

1 Parent(s): b3a8443

Update app2.py

Browse files

Files changed (1) hide show

app2.py +55 -63

app2.py CHANGED Viewed

@@ -4,23 +4,23 @@ import re
 import time
 import logging
 import mimetypes
 import tempfile
 from datetime import datetime
 from pathlib import Path
 from urllib.parse import urlparse
-from typing import List, Dict, Tuple, Union, Optional
 import requests
 import validators
 import gradio as gr
 from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from cleantext import clean
-import qrcode
-if sys.version_info >= (3, 6):
-    import zipfile
-else:
-    import zipfile36 as zipfile
 # Setup logging with detailed configuration
 logging.basicConfig(
@@ -45,12 +45,13 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
         try:
             cleaned_text = clean(
                 text,
                 to_ascii=True,
                 lower=True,
                 no_line_breaks=True,
@@ -149,6 +150,14 @@ class URLProcessor:
             # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
             # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
@@ -202,15 +211,29 @@ class FileProcessor:
         return dataset
-    def process_zip_file(zip_path):
-        """Extract and process files within a ZIP archive."""
-        extraction_directory = tempfile.mkdtemp()
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(extraction_directory)
-            for extracted_file in os.listdir(extraction_directory):
-                extracted_file_path = os.path.join(extraction_directory, extracted_file)
-                process_file(extracted_file_path)
 def _process_single_file(self, file) -> List[Dict]:
     try:
         file_stat = os.stat(file.name)
@@ -247,14 +270,17 @@ def _process_single_file(self, file) -> List[Dict]:
         logger.error(f"File processing error: {e}")
         return []
-def generate_qr_code(json_data):
-    """Generate a QR code from JSON data."""
-    qr = qrcode.make(json_data)
-    qr_path = "output/qr_code.png"
-    qr.save(qr_path)
-    return qr_path
-def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
     css = """
@@ -286,31 +312,12 @@ def create_interface():
                 placeholder="Paste your text here..."
             )
-        with gr.Tab("JSON Editor"):
-            json_editor = gr.Textbox(
-                label="JSON Editor",
-                lines=20,
-                placeholder="View and edit your JSON data here...",
-                interactive=True,
-                elem_id="json-editor"  # Optional: for custom styling
-            )
-        with gr.Tab("Scratchpad"):
-            scratchpad = gr.Textbox(
-                label="Scratchpad",
-                lines=10,
-                placeholder="Quick notes or text collections...",
-                interactive=True
-            )
         process_btn = gr.Button("Process Input", variant="primary")
-        qr_btn = gr.Button("Generate QR Code", variant="secondary")
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
-        qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code
-        def process_all_inputs(urls, file, text, notes):
             """Process all input types with progress tracking"""
             try:
                 processor = URLProcessor()
@@ -357,31 +364,19 @@ def create_interface():
                         json.dump(results, f, ensure_ascii=False, indent=2)
                     summary = f"Processed {len(results)} items successfully!"
-                    json_data = json.dumps(results, indent=2)  # Prepare JSON for QR code
-                    return str(output_path), summary, json_data  # Return JSON for editor
                 else:
-                    return None, "No valid content to process.", ""
             except Exception as e:
                 logger.error(f"Processing error: {e}")
-                return None, f"Error: {str(e)}", ""
-        def generate_qr(json_data):
-            """Generate QR code from JSON data and return the file path."""
-            if json_data:
-                return generate_qr_code(json_data)
-            return None
         process_btn.click(
             process_all_inputs,
-            inputs=[url_input, file_input, text_input, scratchpad],
-            outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
-        )
-        qr_btn.click(
-            generate_qr,
-            inputs=json_editor,
-            outputs=qr_output
         )
         gr.Markdown("""
@@ -389,8 +384,6 @@ def create_interface():
         - **URL Processing**: Enter valid HTTP/HTTPS URLs
         - **File Input**: Upload text files or ZIP archives
         - **Text Input**: Direct text processing
-        - **JSON Editor**: View and edit your JSON data
-        - **Scratchpad**: Quick notes or text collections
         - Advanced cleaning and validation included
         """)
@@ -412,6 +405,5 @@ def main():
         inbrowser=True,
         debug=True
     )
 if __name__ == "__main__":
     main()

 import time
 import logging
 import mimetypes
+import concurrent.futures
+import string
+import zipfile
 import tempfile
 from datetime import datetime
+from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
 from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
+from ratelimit import limits, sleep_and_retry
 from cleantext import clean
 # Setup logging with detailed configuration
 logging.basicConfig(
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
         try:
             cleaned_text = clean(
                 text,
+                fix_unicode=True,
                 to_ascii=True,
                 lower=True,
                 no_line_breaks=True,
             # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
+            if main_content is None:
+                logger.warning(f"No main content found for URL: {url}")
+                return {
+                    'content': '',
+                    'content_type': response.headers.get('Content-Type', ''),
+                    'timestamp': datetime.now().isoformat()
+                }
             # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
         return dataset
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Process ZIP file contents"""
+        results = []
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
 def _process_single_file(self, file) -> List[Dict]:
     try:
         file_stat = os.stat(file.name)
         logger.error(f"File processing error: {e}")
         return []
+import qrcode  # Import the qrcode library
+def generate_qr(json_data):
+    """Generate QR code from JSON data and return the file path."""
+    if json_data:
+        qr = qrcode.make(json_data)
+        qr_path = f"output/qr_code_{int(time.time())}.png"
+        qr.save(qr_path)
+        return qr_path
+    return None
     """Create a comprehensive Gradio interface with advanced features"""
     css = """
                 placeholder="Paste your text here..."
             )
         process_btn = gr.Button("Process Input", variant="primary")
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
+        def process_all_inputs(urls, file, text):
             """Process all input types with progress tracking"""
             try:
                 processor = URLProcessor()
                         json.dump(results, f, ensure_ascii=False, indent=2)
                     summary = f"Processed {len(results)} items successfully!"
+                    # Convert Path object to string here
+                    return str(output_path), summary
                 else:
+                    return None, "No valid content to process."
             except Exception as e:
                 logger.error(f"Processing error: {e}")
+                return None, f"Error: {str(e)}"
         process_btn.click(
             process_all_inputs,
+            inputs=[url_input, file_input, text_input],
+            outputs=[output_file, output_text]
         )
         gr.Markdown("""
         - **URL Processing**: Enter valid HTTP/HTTPS URLs
         - **File Input**: Upload text files or ZIP archives
         - **Text Input**: Direct text processing
         - Advanced cleaning and validation included
         """)
         inbrowser=True,
         debug=True
     )
 if __name__ == "__main__":
     main()