Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Apr 5

Commit

c70f013

verified ·

1 Parent(s): d7b800d

Update app.py

Browse files

Files changed (1) hide show

app.py +534 -269

app.py CHANGED Viewed

@@ -1,295 +1,560 @@
-import gradio as gr
 import os
 import logging
 import zipfile
-import io
-from pypdf import PdfReader
 import tempfile
-import traceback
-logging.basicConfig(level=logging.INFO)
-class FileProcessor:
-    def __init__(self):
-        pass
-    def process_file(self, file_obj): # Modified to accept file_obj directly
-        if file_obj is None:
-            return "Error: No file uploaded."
-        file_path = file_obj.name
-        logging.info(f"Processing file: {file_path}")
-        file_extension = os.path.splitext(file_path)[1].lower()
         try:
-            if file_extension == '.pdf':
-                return self._process_pdf_file(file_path)
-            elif file_extension == '.zip':
-                return self._process_zip_file(file_path)
-            elif file_extension == '.txt':
-                return self._process_txt_file(file_path)
-            else:
-                error_message = f"Error: Unsupported file type: {file_extension}. Please upload .pdf, .txt, or .zip files."
-                logging.warning(error_message)
-                return error_message
         except Exception as e:
-            error_message = f"Fatal error processing file: {os.path.basename(file_path)}. Please try again or contact support.  Technical details logged."
-            logging.error(f"Unhandled exception processing file: {file_path} - {e}")
-            logging.error(traceback.format_exc()) # Log full traceback for debugging
-            return error_message
-        finally:
-            try:
-                if os.path.exists(file_path):
-                    os.remove(file_path)
-                    logging.info(f"Temporary file removed: {file_path}")
-            except OSError as e:
-                logging.error(f"Error removing temporary file {file_path}: {e}")
-    def _process_pdf_file(self, file_path):
-        text = ""
-        try:
-            with open(file_path, 'rb') as f: # Open in binary mode for PdfReader
-                reader = PdfReader(f)
-                if not reader.is_encrypted: # Check if PDF is encrypted before processing
-                    for page in reader.pages:
-                        text += page.extract_text()
-                    logging.info(f"Successfully processed PDF file: {file_path}")
-                    if not text.strip(): # Check if extracted text is empty
-                        return "Warning: PDF processed, but no text content found. The PDF might contain images or scanned content."
-                    return text
-                else:
-                    error_message = f"Error: Encrypted PDF file: {os.path.basename(file_path)}. Processing of encrypted PDFs is not supported."
-                    logging.warning(error_message)
-                    return error_message
-        except FileNotFoundError:
-            error_message = f"Error: PDF file not found: {os.path.basename(file_path)}. Please ensure the file was uploaded correctly."
-            logging.error(f"File not found: {file_path}")
-            return error_message
-        except PdfReader.errors.PdfStreamError as e: # Specific error for corrupted PDF streams
-            error_message = f"Error: Corrupted PDF file: {os.path.basename(file_path)}. The PDF file appears to be damaged or invalid. Error details: {e}"
-            logging.error(f"Corrupted PDF stream error: {file_path} - {e}")
-            return error_message
-        except Exception as e: # Catch-all for other PDF processing errors
-            error_message = f"Error processing PDF file: {os.path.basename(file_path)}.  It might be corrupted or use unsupported features. Error details logged."
-            logging.error(f"General PDF processing error: {file_path} - {e}")
-            logging.error(traceback.format_exc()) # Log full traceback for debugging
-            return error_message
-    def _process_zip_file(self, file_path):
-        extracted_text = ""
-        error_occurred = False
         try:
-            with zipfile.ZipFile(file_path, 'r') as zf:
-                if not zf.namelist(): # Check for empty ZIP file
-                    return "Warning: ZIP file is empty and contains no files to process."
-                for filename in zf.namelist():
-                    try:
-                        if filename.lower().endswith('.pdf'):
-                            with zf.open(filename) as pdf_file:
-                                pdf_content = pdf_file.read()
-                                text = self._process_pdf_content(io.BytesIO(pdf_content), filename=filename) # Pass filename for better error context
-                                extracted_text += f"File: {filename}\nContent:\n{text}\n\n"
-                                logging.info(f"Successfully processed PDF within ZIP: {filename}")
-                        elif filename.lower().endswith('.txt'):
-                            with zf.open(filename) as txt_file:
-                                text = txt_file.read().decode('utf-8', errors='ignore') # Handle potential encoding issues in TXT
-                                extracted_text += f"File: {filename}\nContent:\n{text}\n\n"
-                                logging.info(f"Successfully processed TXT within ZIP: {filename}")
-                        else:
-                            logging.warning(f"Skipping unsupported file type within ZIP: {filename}")
-                    except Exception as e: # Catch errors for individual files within ZIP
-                        error_message = f"Error processing file '{filename}' within ZIP: {os.path.basename(file_path)}. Error: {e}"
-                        logging.error(error_message)
-                        logging.error(traceback.format_exc()) # Log traceback for inner ZIP errors
-                        extracted_text += f"File: {filename}\nError processing file. See logs for details.\n\n" # User-friendly error in output
-                        error_occurred = True # Flag that an error occurred within the zip
-            if not error_occurred:
-                logging.info(f"Successfully processed ZIP file: {file_path}")
-            else:
-                logging.warning(f"ZIP file processed with some errors: {file_path}. Check output for details.")
-            return extracted_text
-        except zipfile.BadZipFile: # Specific error for invalid ZIP file
-            error_message = f"Error: Invalid or corrupted ZIP file: {os.path.basename(file_path)}. Please ensure it is a valid ZIP archive."
-            logging.error(f"Bad ZIP file error: {file_path}")
-            return error_message
-        except Exception as e: # Catch-all for other ZIP processing errors
-            error_message = f"Error processing ZIP file: {os.path.basename(file_path)}.  It might be corrupted or have an unexpected structure. Error details logged."
-            logging.error(f"General ZIP processing error: {file_path} - {e}")
-            logging.error(traceback.format_exc()) # Log full traceback for debugging
-            return error_message
-    def _process_pdf_content(self, pdf_content_stream, filename=""): # Added filename for context
-        text = ""
-        try:
-            reader = PdfReader(pdf_content_stream)
-            if not reader.is_encrypted:
-                for page in reader.pages:
-                    text += page.extract_text()
-                if not text.strip():
-                    logging.warning(f"PDF content processed from '{filename}', but no text found.") # Filename context
-                    return "Warning: PDF content processed, but no text content found."
-                return text
-            else:
-                error_message = f"Error: Encrypted PDF content found in '{filename}'. Processing encrypted PDFs is not supported."
-                logging.warning(error_message)
-                return error_message
-        except PdfReader.errors.PdfStreamError as e:
-            error_message = f"Error: Corrupted PDF content in '{filename}'. PDF stream error: {e}" # Filename context
-            logging.error(error_message)
-            return error_message
-        except Exception as e:
-            error_message = f"Error processing PDF content from '{filename}'. Error details logged." # Filename context
-            logging.error(f"Error processing PDF content from stream (file: {filename}) - {e}")
-            logging.error(traceback.format_exc())
-            return error_message
-    def _process_txt_file(self, file_path):
-        text = ""
-        try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: # Handle potential encoding issues
-                text = file.read()
-            logging.info(f"Successfully processed TXT file: {file_path}")
-            if not text.strip(): # Check for empty TXT
-                return "Warning: TXT file processed, but it is empty."
-            return text
-        except FileNotFoundError:
-            error_message = f"Error: TXT file not found: {os.path.basename(file_path)}. Please ensure the file was uploaded correctly."
-            logging.error(f"File not found: {file_path}")
-            return error_message
         except Exception as e:
-            error_message = f"Error processing TXT file: {os.path.basename(file_path)}. Error details logged."
-            logging.error(f"Error processing TXT file: {file_path} - {e}")
-            logging.error(traceback.format_exc())
-            return error_message
-# Initialize FileProcessor
-file_processor = FileProcessor()
-def process_file_and_respond(file_obj): # No change needed here as file_obj is now directly processed
-    return file_processor.process_file(file_obj)
-def test_functionality_enhanced():
-    temp_dir = tempfile.TemporaryDirectory()
-    test_dir = temp_dir.name
-    # --- Create test files in temporary directory ---
-    def create_test_file(filepath, content, mode='w'): # Helper function for file creation
-        with open(filepath, mode, encoding='utf-8') as f: # Default text mode
-            f.write(content)
-    def create_binary_test_file(filepath, content_binary, mode='wb'): # Helper for binary file creation
-        with open(filepath, mode) as f:
-            f.write(content_binary)
-    pdf_content = "This is a test PDF file.\nWith multiple lines."
-    txt_content = "This is a test TXT file.\nAnother line of text."
-    zip_content_pdf = "PDF content inside ZIP."
-    zip_content_txt = "TXT content inside ZIP."
-    empty_txt_content = ""
-    encrypted_pdf_content = "%PDF-1.5\n%����\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [ 3 0 R ] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /MediaBox [ 0 0 612 792 ] /Contents 4 0 R /Parent 2 0 R >>\nendobj\n4 0 obj\n<< /Length 5 >>\nstream\nBT\n/F1 12 Tf\n72 712 Td\n(This is an encrypted PDF - fake content) Tj\nET\nendstream\nendobj\n5 0 obj\n<< /Length 44 >>\nstream\n/Filter /FlateDecode\n/Length 44\nstream\nxœ+��\x0e@E\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nendstream\nendstream\nendobj\nxref\n0 6\n0000000000 65535 f\n0000000015 00000 n\n0000000062 00000 n\n0000000112 00000 n\n0000000179 00000 n\n0000000259 00000 n\ntrailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n369\n%%EOF\n" # Minimal PDF structure - not actually encrypted, but enough to test encryption check
-    pdf_file_path = os.path.join(test_dir, "test.pdf")
-    txt_file_path = os.path.join(test_dir, "test.txt")
-    zip_file_path = os.path.join(test_dir, "test.zip")
-    unsupported_file_path = os.path.join(test_dir, "test.csv")
-    corrupted_pdf_path = os.path.join(test_dir, "corrupted.pdf")
-    empty_txt_path = os.path.join(test_dir, "empty.txt")
-    empty_zip_path = os.path.join(test_dir, "empty.zip")
-    encrypted_pdf_path = os.path.join(test_dir, "encrypted.pdf")
-    create_test_file(pdf_file_path, pdf_content)
-    create_test_file(txt_file_path, txt_content)
-    create_test_file(unsupported_file_path, "test csv content")
-    create_test_file(empty_txt_path, empty_txt_content)
-    create_binary_test_file(encrypted_pdf_path, encrypted_pdf_content.encode('latin-1')) # Encrypted PDF test - use latin-1 to avoid encoding issues with PDF structure
-    # Create a "corrupted" PDF by just writing plain text to a .pdf file.
-    create_test_file(corrupted_pdf_path, "This is NOT a valid PDF file.")
-    with zipfile.ZipFile(zip_file_path, 'w') as zf:
-        zf.writestr("zip_test.pdf", zip_content_pdf)
-        zf.writestr("zip_test.txt", zip_content_txt)
-    with zipfile.ZipFile(empty_zip_path, 'w') as zf: # Create empty zip
-        pass
-    # --- Test cases ---
-    test_cases = [
-        {"name": "PDF Processing", "file_path": pdf_file_path, "expected_content": pdf_content, "expect_error": False},
-        {"name": "TXT Processing", "file_path": txt_file_path, "expected_content": txt_content, "expect_error": False},
-        {"name": "ZIP Processing (PDF & TXT)", "file_path": zip_file_path, "expected_content_in": [zip_content_pdf, zip_content_txt], "expect_error": False},
-        {"name": "Unsupported File Type", "file_path": unsupported_file_path, "expected_content": "Unsupported file type", "expect_error": True},
-        {"name": "Corrupted PDF Processing", "file_path": corrupted_pdf_path, "expected_content": "Error processing PDF file", "expect_error": True},
-        {"name": "Empty TXT File", "file_path": empty_txt_path, "expected_content": "Warning: TXT file processed, but it is empty.", "expect_error": False},
-        {"name": "Empty ZIP File", "file_path": empty_zip_path, "expected_content": "Warning: ZIP file is empty", "expect_error": False},
-        {"name": "Encrypted PDF File", "file_path": encrypted_pdf_path, "expected_content": "Error: Encrypted PDF file", "expect_error": True},
-    ]
-    all_tests_passed = True
-    for case in test_cases:
-        print(f"\n--- Test Case: {case['name']} ---")
-        result = file_processor.process_file(SimpleFileObject(case['file_path'])) # Use SimpleFileObject to simulate file upload
-        print(f"Result: {result[:100]}...") # Print first 100 chars of result
-        if case.get("expect_error"):
-            if case["expected_content"] not in result:
-                print(f"  ❌ FAIL: Expected error message containing '{case['expected_content']}', but got: {result}")
-                all_tests_passed = False
-            else:
-                print(f"  ✅ PASS: Expected error message found.")
-        elif case.get("expected_content_in"): # For cases expecting multiple contents (like ZIP)
-             all_contents_found = True
-             for expected_content in case["expected_content_in"]:
-                 if expected_content not in result:
-                     print(f"  ❌ FAIL: Expected content '{expected_content}' not found in result for {case['name']}. Got: {result[:100]}...")
-                     all_contents_found = False
-                     all_tests_passed = False
-                     break
-             if all_contents_found:
-                 print(f"  ✅ PASS: All expected contents found.")
-        elif case.get("expected_content"):
-            if case["expected_content"] not in result:
-                print(f"  ❌ FAIL: Expected content '{case['expected_content']}', but got: {result[:100]}...")
-                all_tests_passed = False
             else:
-                print(f"  ✅ PASS: Expected content found.")
-    if all_tests_passed:
-        print("\n🎉 All enhanced tests completed successfully! 🎉")
-    else:
-        print("\n⚠️ Some enhanced tests FAILED. See details above. ⚠️")
-    temp_dir.cleanup() # Clean up temporary directory and files
-class SimpleFileObject: # Mock file object for testing
-    def __init__(self, file_path):
-        self.name = file_path
-iface = gr.Interface(
-    fn=process_file_and_respond,
-    inputs=gr.File(file_types=[".pdf", ".txt", ".zip"]),
-    outputs="text",
-    title="Robust File Processing Agent",
-    description="Upload a PDF, TXT, or ZIP file to process its content. Enhanced for error handling and robustness."
-)
 if __name__ == "__main__":
-    test_functionality_enhanced() # Run enhanced tests
-    iface.launch(debug=True)

+import json
 import os
+import re
+import time
 import logging
+import mimetypes
 import zipfile
 import tempfile
+from datetime import datetime
+from typing import List, Dict, Optional, Union
+from pathlib import Path
+from urllib.parse import urlparse
+import requests
+import validators
+import gradio as gr
+from diskcache import Cache
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
+from cleantext import clean
+import qrcode
+# Setup logging
+# Runs once at import time: configures the root logger at INFO level and sends
+# every record to two handlers, a StreamHandler and a UTF-8 encoded 'app.log'
+# file. Each record shows the time, level, source file:line and the message.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('app.log', encoding='utf-8')
+    ]
+)
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Ensure output directories exist
+# The path is relative, so it is resolved against the current working
+# directory; missing parents are created and an existing directory is fine.
+Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
+class URLProcessor:
+    """Validate URLs and fetch/clean their textual content.
+
+    All requests go through one shared ``requests.Session`` that carries
+    browser-like default headers, and use ``self.timeout`` seconds as timeout.
+    Fetch methods return a dict with the keys ``content``, ``content_type``
+    and ``timestamp`` (ISO-8601 string), or ``None`` when the fetch failed.
+    """
+    def __init__(self):
+        """Create the HTTP session and install the default request headers."""
+        self.session = requests.Session()
+        self.timeout = 10  # seconds
+        self.session.headers.update({
+            # The key must be exactly 'User-Agent'. It used to be spelled
+            # 'User -Agent' (embedded space), which is a different header, so
+            # the session's default User-Agent was never replaced.
+            'User-Agent': UserAgent().random,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            # NOTE(review): 'br' is advertised unconditionally; confirm that a
+            # Brotli decoder is installed where this runs, otherwise servers
+            # may answer with a body that cannot be decoded into text.
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        })
+    def advanced_text_cleaning(self, text: str) -> str:
+        """Robust text cleaning with version compatibility
+
+        Tries ``clean(...)`` first. If that call raises for any reason, falls
+        back to a regex-based cleanup that strips control characters, drops
+        non-ASCII characters and collapses runs of whitespace.
+
+        Returns:
+            The cleaned text with leading/trailing whitespace removed.
+        """
         try:
+            cleaned_text = clean(
+                text,
+                fix_unicode=True,
+                to_ascii=True,
+                lower=True,
+                no_line_breaks=True,
+                no_urls=True,
+                no_emails=True,
+                no_phone_numbers=True,
+                no_numbers=False,
+                no_digits=False,
+                no_currency_symbols=True,
+                no_punct=False
+            ).strip()
+            return cleaned_text
         except Exception as e:
+            # Deliberate best-effort: any failure in clean() degrades to the
+            # simpler fallback below instead of aborting the fetch.
+            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
+            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
+            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
+            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+            return text.strip()
+    def validate_url(self, url: str) -> Dict:
+        """Validate URL format and accessibility
+
+        The format is checked with ``validators.url``; accessibility is
+        checked with a HEAD request whose error status raises.
+
+        Returns:
+            ``{'is_valid': bool, 'message': str}``. This method never raises;
+            every failure is reported through the returned dict.
+        """
         try:
+            if not validators.url(url):
+                return {'is_valid': False, 'message': 'Invalid URL format'}
+            response = self.session.head(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {'is_valid': True, 'message': 'URL is valid and accessible'}
         except Exception as e:
+            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        """Universal content fetcher with special case handling
+
+        Dispatches on substrings of *url*: Google Drive links, Google
+        Calendar iCal links, and everything else as a regular HTML page.
+
+        Returns:
+            The handler's result dict, or ``None`` on failure.
+        """
+        try:
+            # Google Drive document handling
+            if 'drive.google.com' in url:
+                return self._handle_google_drive(url)
+            # Google Calendar ICS handling
+            if 'calendar.google.com' in url and 'ical' in url:
+                return self._handle_google_calendar(url)
+            # Standard HTML processing
+            return self._fetch_html_content(url)
+        except Exception as e:
+            logger.error(f"Content fetch failed: {e}")
+            return None
+    def _handle_google_drive(self, url: str) -> Optional[Dict]:
+        """Process Google Drive file links
+
+        Extracts the file id from a ``/file/d/<id>`` URL and downloads it via
+        the ``uc?export=download`` endpoint.
+
+        Returns:
+            Result dict with the response text and its reported content type,
+            or ``None`` if the URL has no file id or the request fails.
+        """
+        try:
+            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
+            if not file_id:
+                logger.error(f"Invalid Google Drive URL: {url}")
+                return None
+            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+            response = self.session.get(direct_url, timeout=self.timeout)
+            response.raise_for_status()
+            return {
+                'content': response.text,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Google Drive processing failed: {e}")
+            return None
+    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+        """Process Google Calendar ICS feeds
+
+        Returns:
+            Result dict whose ``content_type`` is always ``'text/calendar'``,
+            or ``None`` if the request fails.
+        """
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {
+                'content': response.text,
+                'content_type': 'text/calendar',
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Calendar fetch failed: {e}")
+            return None
+    def _fetch_html_content(self, url: str) -> Optional[Dict]:
+        """Standard HTML content processing
+
+        Downloads the page, removes non-content elements, then takes the text
+        of the first ``<main>``, else the first ``<article>``, else ``<body>``
+        and runs it through ``advanced_text_cleaning``.
+
+        Returns:
+            Result dict (``content`` is ``''`` when no container element is
+            found), or ``None`` if the request or parsing fails.
+        """
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
+                element.decompose()
+            # Extract main content
+            main_content = soup.find('main') or soup.find('article') or soup.body
+            if main_content is None:
+                logger.warning(f"No main content found for URL: {url}")
+                return {
+                    'content': '',
+                    'content_type': response.headers.get('Content-Type', ''),
+                    'timestamp': datetime.now().isoformat()
+                }
+            # Clean and structure content
+            text_content = main_content.get_text(separator='\n', strip=True)
+            cleaned_content = self.advanced_text_cleaning(text_content)
+            return {
+                'content': cleaned_content,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"HTML processing failed: {e}")
+            return None
+class FileProcessor:
+    """Class to handle file processing
+
+    Turns an uploaded file (any object exposing the file path as ``.name``)
+    into a list of record dicts, and splits JSON payloads into chunks.
+    """
+    # Files larger than this many bytes are sampled (head + tail) instead of
+    # being read in full.
+    LARGE_FILE_THRESHOLD = 100 * 1024 * 1024  # 100MB
+    # Number of bytes kept from each end of a sampled large file.
+    PREVIEW_BYTES = 1 * 1024 * 1024  # 1MB
+    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
+        """Store the upload size limit (bytes) and the known text extensions."""
+        self.max_file_size = max_file_size
+        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+    def is_text_file(self, filepath: str) -> bool:
+        """Check if file is a text file
+
+        True when the guessed MIME type starts with ``text/`` or the file
+        extension is one of ``self.supported_text_extensions``.
+        """
+        try:
+            mime_type, _ = mimetypes.guess_type(filepath)
+            if mime_type and mime_type.startswith('text/'):
+                return True
+            return os.path.splitext(filepath)[1].lower() in self.supported_text_extensions
+        except Exception:
+            return False
+    def process_file(self, file) -> List[Dict]:
+        """Process uploaded file with enhanced error handling
+
+        Args:
+            file: Object whose ``.name`` attribute is the path of the upload.
+
+        Returns:
+            A list of record dicts; empty when *file* is falsy, the file is
+            larger than ``self.max_file_size``, or any error occurs.
+        """
+        if not file:
+            return []
+        dataset = []
+        try:
+            file_size = os.path.getsize(file.name)
+            if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                return []
+            with tempfile.TemporaryDirectory() as temp_dir:
+                if zipfile.is_zipfile(file.name):
+                    # NOTE(review): _process_zip_file is not defined in the
+                    # part of the class visible here - confirm it exists,
+                    # otherwise every ZIP upload ends in the except below.
+                    dataset.extend(self._process_zip_file(file.name, temp_dir))
+                else:
+                    dataset.extend(self._process_single_file(file))
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
+        return dataset
+    def chunk_data(self, data, max_size=2953):  # 2953 bytes is the capacity of a version 40 QR code at error correction level L
+        """Chunk data into smaller pieces if it exceeds max_size.
+
+        *data* is serialized to JSON and split so that the UTF-8 encoding of
+        every chunk is at most *max_size* bytes. Sizes are measured in bytes,
+        not characters, because the JSON is produced with
+        ``ensure_ascii=False`` and may contain multi-byte characters. A chunk
+        never ends in the middle of a character, so concatenating the chunks
+        reproduces the JSON string exactly.
+
+        Returns:
+            A list of JSON string fragments (a single element when the whole
+            payload fits).
+
+        Raises:
+            ValueError: If *max_size* is smaller than 1 (splitting could
+                never make progress).
+        """
+        if max_size < 1:
+            raise ValueError("max_size must be a positive integer")
+        json_str = json.dumps(data, ensure_ascii=False)
+        # 'surrogatepass' keeps lone surrogates (which json.dumps may emit)
+        # from raising during the round trip through bytes.
+        encoded = json_str.encode('utf-8', 'surrogatepass')
+        total = len(encoded)
+        if total <= max_size:
+            return [json_str]
+        # Split into chunks
+        chunks = []
+        start = 0
+        while start < total:
+            end = min(start + max_size, total)
+            # Bytes of the form 0b10xxxxxx continue a character: step back so
+            # the cut lands on a character boundary.
+            while start < end < total and (encoded[end] & 0xC0) == 0x80:
+                end -= 1
+            if end == start:
+                # max_size is smaller than this single character; emit the
+                # whole character rather than looping forever.
+                end = start + 1
+                while end < total and (encoded[end] & 0xC0) == 0x80:
+                    end += 1
+            chunks.append(encoded[start:end].decode('utf-8', 'surrogatepass'))
+            start = end
+        return chunks
+    def _process_single_file(self, file) -> List[Dict]:
+        """Process a single file
+
+        This is a method of the class because ``process_file`` calls it as
+        ``self._process_single_file``.
+
+        Files above ``LARGE_FILE_THRESHOLD`` bytes are sampled: the first and
+        last ``PREVIEW_BYTES`` bytes joined by a truncation marker. Smaller
+        files are read in full. Undecodable bytes are dropped either way.
+
+        Returns:
+            A one-element list holding the file's metadata and content, or an
+            empty list if anything goes wrong.
+        """
+        try:
+            file_stat = os.stat(file.name)
+            # For very large files, read in chunks and summarize
+            if file_stat.st_size > self.LARGE_FILE_THRESHOLD:
+                logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
+                # Binary mode: seeking to a computed byte offset is only
+                # well defined on binary streams, and read(n) then counts
+                # bytes rather than characters.
+                with open(file.name, 'rb') as f:
+                    head = f.read(self.PREVIEW_BYTES)
+                    f.seek(max(0, file_stat.st_size - self.PREVIEW_BYTES))
+                    tail = f.read()
+                content = (
+                    head.decode('utf-8', errors='ignore')
+                    + "\n...[Content truncated due to large file size]...\n"
+                    + tail.decode('utf-8', errors='ignore')
+                )
+            else:
+                # Regular file processing
+                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+            return [{
+                # NOTE(review): the literal 'filename' looks like a
+                # placeholder - confirm what 'source' is meant to hold.
+                'source': 'filename',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                # NOTE(review): st_ctime is platform dependent and is not the
+                # creation time on every OS - confirm this is what 'created'
+                # should report.
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"File processing error: {e}")
+            return []
+def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
+    """Normalize *data* into plain JSON-compatible Python objects.
+
+    A string is stripped of surrounding whitespace and parsed as JSON; any
+    other value is used as given. The value is then serialized and parsed
+    again, so the result contains only what JSON can represent.
+
+    Returns:
+        The normalized value, or ``None`` (after logging the error) when the
+        input cannot be parsed or serialized.
+    """
+    try:
+        parsed = json.loads(data.strip()) if isinstance(data, str) else data
+        # The dump/load round trip is what enforces valid JSON structure.
+        return json.loads(json.dumps(parsed))
+    except json.JSONDecodeError as e:
+        logger.error(f"JSON cleaning error: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error while cleaning JSON: {e}")
+        return None
+def _save_qr_image(payload, output_path: Path) -> str:
+    """Encode *payload* as JSON in a QR code, save it as PNG, return the path."""
+    qr = qrcode.QRCode(
+        version=None,  # let make(fit=True) pick the smallest version that fits
+        error_correction=qrcode.constants.ERROR_CORRECT_L,
+        box_size=10,
+        border=4,
+    )
+    qr.add_data(json.dumps(payload, ensure_ascii=False))
+    qr.make(fit=True)
+    img = qr.make_image(fill_color="black", back_color="white")
+    img.save(str(output_path))
+    return str(output_path)
+def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]:
+    """Generate QR code(s) from data
+
+    Args:
+        data: JSON text, or an already parsed value. A list is treated as a
+            sequence of items when *combined* is false.
+        combined: When true, all of *data* goes into a single QR code. When
+            false, a list yields one QR code per item and any other value
+            yields a single QR code.
+
+    Returns:
+        Paths of the PNG files written under ``output/qr_codes``. The list is
+        empty when the data cannot be cleaned or any error occurs; list items
+        that cannot be cleaned are skipped.
+    """
+    try:
+        output_dir = Path('output/qr_codes')
+        output_dir.mkdir(parents=True, exist_ok=True)
+        if combined or not isinstance(data, list):
+            # One QR code for the whole payload.
+            label = 'combined' if combined else 'single'
+            cleaned_data = clean_json(data)
+            if cleaned_data is None:  # Check if cleaning failed
+                if combined:
+                    logger.error("Failed to clean data for QR code generation.")
+                else:
+                    logger.error("Failed to clean single item for QR code generation.")
+                return []
+            # Nanosecond timestamp: calls within the same second would
+            # otherwise reuse one file name and overwrite each other.
+            output_path = output_dir / f'{label}_qr_{time.time_ns()}.png'
+            return [_save_qr_image(cleaned_data, output_path)]
+        # Generate separate QR codes for each item
+        paths = []
+        for idx, item in enumerate(data):
+            cleaned_item = clean_json(item)
+            if cleaned_item is None:  # Check if cleaning failed
+                logger.error(f"Failed to clean item {idx} for QR code generation.")
+                continue  # Skip this item
+            output_path = output_dir / f'item_{idx}_qr_{time.time_ns()}.png'
+            paths.append(_save_qr_image(cleaned_item, output_path))
+        return paths
+    except Exception as e:
+        logger.error(f"QR generation error: {e}")
+        return []
+def create_interface():
+    """Create a comprehensive Gradio interface with advanced features"""
+    css = """
+    .container { max-width: 1200px; margin: auto; }
+    .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
+    .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
+    .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
+    """
+    with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
+        gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
+        with gr.Tab("URL Processing"):
+            url_input = gr.Textbox(
+                label="Enter URLs (comma or newline separated)",
+                lines=5,
+                placeholder="https://example1.com\nhttps://example2.com",
+                value=""
+            )
+        with gr.Tab("File Input"):
+            file_input = gr.File(
+                label="Upload text file or ZIP archive",
+                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
+            )
+        with gr.Tab("Notepad"):
+            text_input = gr.TextArea(
+                label="JSON Data Input",
+                lines=15,
+                placeholder="Paste your JSON data here...",
+                value=""
+            )
+            with gr.Row():
+                example_btn = gr.Button("📝 Load Example JSON", variant="secondary")
+                clear_btn = gr.Button("🗑️ Clear Input", variant="secondary")
+        with gr.Row():
+            combine_data = gr.Checkbox(
+                label="Combine all data into single QR code",
+                value=True,
+                info="Generate one QR code for all data, or separate QR codes for each item"
+            )
+            process_btn = gr.Button("🔄 Process & Generate QR", variant="primary", scale=2)
+        output_json = gr.JSON(label="Processed JSON Data")
+        output_gallery = gr.Gallery(label="Generated QR Codes", columns=2, height=400)
+        output_text = gr.Textbox(label="Processing Status", interactive=False)
+        def load_example():
+            example_json = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Test Product",
+                        "description": "This is a test product description",
+                        "price": 29.99,
+                        "category": "electronics",
+                        "tags": ["test", "sample", "demo"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Another Product",
+                        "description": "Another test product description",
+                        "price": 49.99,
+                        "category": "accessories",
+                        "tags": ["sample", "test"]
+                    }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "1.0",
+                    "source": "example"
+                }
+            }
+            return json.dumps(example_json, indent=2)
+        def clear_input():
+            return ""
+        def process_all_inputs(urls, file, text, combine):
+            """Process all input types and generate QR codes"""
+            try:
+                results = []
+                # Process text input first (since it's direct JSON)
+                if text and text.strip():
+                    try:
+                        # Try to parse as JSON
+                        json_data = json.loads(text)
+                        if isinstance(json_data, list):
+                            results.extend(json_data)
+                        else:
+                            results.append(json_data)
+                    except json.JSONDecodeError as e:
+                        return None, [], f"❌ Invalid JSON format: {str(e)}"
+                # Process URLs if provided
+                if urls and urls.strip():
+                    processor = URLProcessor()
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+                    for url in url_list:
+                        validation = processor.validate_url(url)
+                        if validation.get('is_valid'):
+                            content = processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+                # Process files if provided
+                if file:
+                    file_processor = FileProcessor()
+                    file_results = file_processor.process_file(file)
+                    if file_results:
+                        results.extend(file_results)
+                # Generate QR codes
+                if results:
+                    if combine:
+                        # Chunk the data if necessary
+                        combined_data = []
+                        for item in results:
+                            combined_data.extend(file_processor.chunk_data(item))
+                        qr_paths = generate_qr_code(combined_data, combined=False)
+                    else:
+                        qr_paths = generate_qr_code(results, combined=combine)
+                    if qr_paths:
+                        return (
+                            results,
+                            [str(path) for path in qr_paths],
+                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR code(s)!"
+                        )
+                    else:
+                        return None, [], "❌ Failed to generate QR codes. Please check the input data."
+                else:
+                    return None, [], "⚠️ No valid content to process. Please provide some input data."
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, [], f"❌ Error: {str(e)}"
+        # Set up event handlers
+        example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input, outputs=[text_input])
+        process_btn.click(
+            process_all_inputs,
+            inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]
+        )
+        gr.Markdown("""
+        ### Features
+        - **URL Processing**: Extract content from websites
+        - **File Processing**: Handle text files and archives
+        - **Notepad**: Direct JSON data input/manipulation
+        - **JSON Cleaning**: Automatic JSON validation and formatting
+        - **QR Generation**: Generate QR codes with embedded JSON data
+        - **Flexible Output**: Choose between combined or separate QR codes
+        ### Usage Tips
+        1. Use the **Notepad** tab for direct JSON input
+        2. Click "Load Example JSON" to see a sample format
+        3. Choose whether to combine all data into a single QR code
+        4. The generated QR codes will contain the complete JSON data
+        """)
+    return interface
+def main():
+    # Configure system settings
+    mimetypes.init()
+    # Create output directories
+    Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
+    # Create and launch interface
+    interface = create_interface()
+    # Launch with proper configuration for Hugging Face
+    interface.launch(
+        share=False,
+        debug=False  # Set to False for production
+    )
 if __name__ == "__main__":
+    # Start the app only when this file is run as a script, not on import.
+    main()