Spaces:

acecalisto3
/

urld

Running

App Files Community

acecalisto3 committed on Mar 22

Commit

c7e50ec

verified ·

1 Parent(s): abeedee

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -93

app.py CHANGED Viewed

@@ -19,7 +19,6 @@ from fake_useragent import UserAgent
 from cleantext import clean
 import qrcode
 import zipfile
-import zipfile36 as zipfile
 # Setup logging with detailed configuration
 logging.basicConfig(
@@ -28,10 +27,10 @@ logging.basicConfig(
     handlers=[
         logging.StreamHandler(),
         logging.FileHandler('app.log', encoding='utf-8')
-    ]
-)
 logger = logging.getLogger(__name__)
 class URLProcessor:
     def __init__(self):
         self.session = requests.Session()
@@ -44,7 +43,7 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
         try:
@@ -74,7 +73,7 @@ class URLProcessor:
         try:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format'}
             response = self.session.head(url, timeout=self.timeout)
             response.raise_for_status()
             return {'is_valid': True, 'message': 'URL is valid and accessible'}
@@ -87,11 +86,11 @@ class URLProcessor:
             # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
             # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
             # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
@@ -105,11 +104,11 @@ class URLProcessor:
             if not file_id:
                 logger.error(f"Invalid Google Drive URL: {url}")
                 return None
             direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
             response = self.session.get(direct_url, timeout=self.timeout)
             response.raise_for_status()
             return {
                 'content': response.text,
                 'content_type': response.headers.get('Content-Type', ''),
@@ -138,20 +137,20 @@ class URLProcessor:
         try:
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
             # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
             # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
             # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
             return {
                 'content': cleaned_content,
                 'content_type': response.headers.get('Content-Type', ''),
@@ -160,14 +159,15 @@ class URLProcessor:
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
 class FileProcessor:
     """Class to handle file processing"""
     def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
     def is_text_file(self, filepath: str) -> bool:
         """Check if file is a text file"""
         try:
@@ -181,24 +181,20 @@ class FileProcessor:
         """Process uploaded file with enhanced error handling"""
         if not file:
             return []
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
             with tempfile.TemporaryDirectory() as temp_dir:
                 if zipfile.is_zipfile(file.name):
                     dataset.extend(self._process_zip_file(file.name, temp_dir))
                 else:
                     dataset.extend(self._process_single_file(file))
         except Exception as e:
             logger.error(f"Error processing file: {str(e)}")
             return []
         return dataset
     def _process_zip_file(self, zip_path, temp_dir):
@@ -217,29 +213,29 @@ class FileProcessor:
                             'timestamp': datetime.now().isoformat()
                         })
         return result
     def _process_single_file(self, file) -> List[Dict]:
         try:
             file_stat = os.stat(file.name)
             # For very large files, read in chunks and summarize
             if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
                 logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
                 # Read first and last 1MB for extremely large files
                 content = ""
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read(1 * 1024 * 1024)  # First 1MB
                     content += "\n...[Content truncated due to large file size]...\n"
                     # Seek to the last 1MB
                     f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
                     content += f.read()  # Last 1MB
             else:
                 # Regular file processing
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                    content =f.read()
             return [{
                 'source': 'file',
                 'filename': os.path.basename(file.name),
@@ -253,38 +249,41 @@ class FileProcessor:
         except Exception as e:
             logger.error(f"File processing error: {e}")
             return []
-    def generate_qr(json_data):
-        """Generate QR code from JSON data and return the file path."""
-        qr = qrcode.QRCode(
-            version=40,  # Force maximum version
-            error_correction=qrcode.constants.ERROR_CORRECT_L,  # Use lower error correction
-            box_size=10,
-            border=4,
-        )
-        qr.add_data(json_data)
-        qr.make(fit=True)
-        return qr.make_image(fill_color="black", back_color="white")
 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
     css = """
     .container { max-width: 1200px; margin: auto; }
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
         with gr.Tab("File Input"):
             file_input = gr.File(
                 label="Upload text file or ZIP archive",
@@ -293,11 +292,11 @@ def create_interface():
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
-                label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
         with gr.Tab("JSON Editor"):
             json_editor = gr.Textbox(
                 label="JSON Editor",
@@ -306,7 +305,7 @@ def create_interface():
                 interactive=True,
                 elem_id="json-editor"  # Optional: for custom styling
             )
         with gr.Tab("Scratchpad"):
             scratchpad = gr.Textbox(
                 label="Scratchpad",
@@ -314,26 +313,26 @@ def create_interface():
                 placeholder="Quick notes or text collections...",
                 interactive=True
             )
         process_btn = gr.Button("Process Input", variant="primary")
         qr_btn = gr.Button("Generate QR Code", variant="secondary")
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
         qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code
-def process_all_inputs(urls, file, text, notes):
             """Process all input types with progress tracking"""
             try:
                 processor = URLProcessor()
                 file_processor = FileProcessor()
                 results = []
                 # Process URLs
                 if urls:
                     url_list = re.split(r'[,\n]', urls)
                     url_list = [url.strip() for url in url_list if url.strip()]
                     for url in url_list:
                         validation = processor.validate_url(url)
                         if validation.get('is_valid'):
@@ -345,11 +344,11 @@ def process_all_inputs(urls, file, text, notes):
                                     'content': content,
                                     'timestamp': datetime.now().isoformat()
                                 })
                 # Process files
                 if file:
                     results.extend(file_processor.process_file(file))
                 # Process text input
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
@@ -358,56 +357,45 @@ def process_all_inputs(urls, file, text, notes):
                         'content': cleaned_text,
                         'timestamp': datetime.now().isoformat()
                     })
                 # Generate output
                 if results:
                     output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
                     output_dir.mkdir(parents=True, exist_ok=True)
                     output_path = output_dir / f'processed_{int(time.time())}.json'
                     with open(output_path, 'w', encoding='utf-8') as f:
                         json.dump(results, f, ensure_ascii=False, indent=2)
                     summary = f"Processed {len(results)} items successfully!"
                     json_data = json.dumps(results, indent=2)  # Prepare JSON for QR code
                     return str(output_path), summary, json_data  # Return JSON for editor
                 else:
                     return None, "No valid content to process.", ""
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}", ""
-def generate_qr(json_data):
-    """Generate QR code from JSON data and return the file path."""
-     qr = qrcode.QRCode(
-        version=40,  # Force maximum version
-        error_correction=qrcode.constants.ERROR_CORRECT_L,  # Use lower error correction
-        box_size=10,
-        border=4,
-    )
-        qr.add_data(json_data)
-        qr.make(fit=True)
-        return qr.make_image(fill_color="black", back_color="white")
             if json_data:
-                return generate_qr_code(json_data)
             return None
-    process_btn.click(
-        process_all_inputs,
-        inputs=[url_input, file_input, text_input, scratchpad],
-        outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
-    )
-    qr_btn.click(
-        generate_qr,
-        inputs=json_editor,
-        outputs=qr_output
-    )
-    gr.Markdown("""
     ### Usage Guidelines
     - **URL Processing**: Enter valid HTTP/HTTPS URLs
     - **File Input**: Upload text files or ZIP archives
@@ -416,16 +404,15 @@ def generate_qr(json_data):
     - **Scratchpad**: Quick notes or text collections
     - Advanced cleaning and validation included
     """)
-return interface
 def main():
     # Configure system settings
     mimetypes.init()
     # Create and launch interface
     interface = create_interface()
     # Launch with proper configuration
     interface.launch(
         server_name="0.0.0.0",
@@ -437,4 +424,4 @@ def main():
     )
 if __name__ == "__main__":
-    main()

 from cleantext import clean
 import qrcode
 import zipfile
 # Setup logging with detailed configuration
 logging.basicConfig(
     handlers=[
         logging.StreamHandler(),
         logging.FileHandler('app.log', encoding='utf-8')
+    ])
 logger = logging.getLogger(__name__)
 class URLProcessor:
     def __init__(self):
         self.session = requests.Session()
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
         try:
         try:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format'}
             response = self.session.head(url, timeout=self.timeout)
             response.raise_for_status()
             return {'is_valid': True, 'message': 'URL is valid and accessible'}
             # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
             # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
             # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             if not file_id:
                 logger.error(f"Invalid Google Drive URL: {url}")
                 return None
             direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
             response = self.session.get(direct_url, timeout=self.timeout)
             response.raise_for_status()
             return {
                 'content': response.text,
                 'content_type': response.headers.get('Content-Type', ''),
         try:
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
             # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
             # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
             # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
             return {
                 'content': cleaned_content,
                 'content_type': response.headers.get('Content-Type', ''),
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
 class FileProcessor:
     """Class to handle file processing"""
     def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
     def is_text_file(self, filepath: str) -> bool:
         """Check if file is a text file"""
         try:
         """Process uploaded file with enhanced error handling"""
         if not file:
             return []
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
             with tempfile.TemporaryDirectory() as temp_dir:
                 if zipfile.is_zipfile(file.name):
                     dataset.extend(self._process_zip_file(file.name, temp_dir))
                 else:
                     dataset.extend(self._process_single_file(file))
         except Exception as e:
             logger.error(f"Error processing file: {str(e)}")
             return []
         return dataset
     def _process_zip_file(self, zip_path, temp_dir):
                             'timestamp': datetime.now().isoformat()
                         })
         return result
     def _process_single_file(self, file) -> List[Dict]:
         try:
             file_stat = os.stat(file.name)
             # For very large files, read in chunks and summarize
             if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
                 logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
                 # Read first and last 1MB for extremely large files
                 content = ""
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read(1 * 1024 * 1024)  # First 1MB
                     content += "\n...[Content truncated due to large file size]...\n"
                     # Seek to the last 1MB
                     f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
                     content += f.read()  # Last 1MB
             else:
                 # Regular file processing
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
             return [{
                 'source': 'file',
                 'filename': os.path.basename(file.name),
         except Exception as e:
             logger.error(f"File processing error: {e}")
             return []
+def generate_qr(json_data):
+    """Generate QR code from JSON data and return the file path."""
+    qr = qrcode.QRCode(
+        version=40,  # Force maximum version
+        error_correction=qrcode.constants.ERROR_CORRECT_L,  # Use lower error correction
+        box_size=10,
+        border=4,
+    )
+    qr.add_data(json_data)
+    qr.make(fit=True)
+    img = qr.make_image(fill_color="black", back_color="white")
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
+    img.save(temp_file.name)
+    return temp_file.name
 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
     css = """
     .container { max-width: 1200px; margin: auto; }
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
+                label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
         with gr.Tab("File Input"):
             file_input = gr.File(
                 label="Upload text file or ZIP archive",
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
+                label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
         with gr.Tab("JSON Editor"):
             json_editor = gr.Textbox(
                 label="JSON Editor",
                 interactive=True,
                 elem_id="json-editor"  # Optional: for custom styling
             )
         with gr.Tab("Scratchpad"):
             scratchpad = gr.Textbox(
                 label="Scratchpad",
                 placeholder="Quick notes or text collections...",
                 interactive=True
             )
         process_btn = gr.Button("Process Input", variant="primary")
         qr_btn = gr.Button("Generate QR Code", variant="secondary")
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
         qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code
+        def process_all_inputs(urls, file, text, notes):
             """Process all input types with progress tracking"""
             try:
                 processor = URLProcessor()
                 file_processor = FileProcessor()
                 results = []
                 # Process URLs
                 if urls:
                     url_list = re.split(r'[,\n]', urls)
                     url_list = [url.strip() for url in url_list if url.strip()]
                     for url in url_list:
                         validation = processor.validate_url(url)
                         if validation.get('is_valid'):
                                     'content': content,
                                     'timestamp': datetime.now().isoformat()
                                 })
                 # Process files
                 if file:
                     results.extend(file_processor.process_file(file))
                 # Process text input
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                         'content': cleaned_text,
                         'timestamp': datetime.now().isoformat()
                     })
                 # Generate output
                 if results:
                     output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
                     output_dir.mkdir(parents=True, exist_ok=True)
                     output_path = output_dir / f'processed_{int(time.time())}.json'
                     with open(output_path, 'w', encoding='utf-8') as f:
                         json.dump(results, f, ensure_ascii=False, indent=2)
                     summary = f"Processed {len(results)} items successfully!"
                     json_data = json.dumps(results, indent=2)  # Prepare JSON for QR code
                     return str(output_path), summary, json_data  # Return JSON for editor
                 else:
                     return None, "No valid content to process.", ""
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}", ""
+        def generate_qr_code(json_data):
+            """Generate QR code from JSON data and return the file path."""
             if json_data:
+                return generate_qr(json_data)
             return None
+        process_btn.click(
+            process_all_inputs,
+            inputs=[url_input, file_input, text_input, scratchpad],
+            outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
+        )
+        qr_btn.click(
+            generate_qr_code,
+            inputs=json_editor,
+            outputs=qr_output
+        )
+        gr.Markdown("""
     ### Usage Guidelines
     - **URL Processing**: Enter valid HTTP/HTTPS URLs
     - **File Input**: Upload text files or ZIP archives
     - **Scratchpad**: Quick notes or text collections
     - Advanced cleaning and validation included
     """)
+    return interface
 def main():
     # Configure system settings
     mimetypes.init()
     # Create and launch interface
     interface = create_interface()
     # Launch with proper configuration
     interface.launch(
         server_name="0.0.0.0",
     )
 if __name__ == "__main__":
+    main()