Update app.py

app.py CHANGED
[Removed-file side of the diff: most deleted lines were truncated during extraction and are summarized here rather than reproduced. The recoverable fragments show this commit deleting the original URLProcessor (fixed browser headers plus Google Drive and Google Calendar ICS special-case handlers), the module-level process_all_inputs helper, the original FileProcessor (text-extension whitelist and first/last-1MB truncation for very large files), the old single-style generate_qr_code, and the previous create_interface and main. Their replacements appear in the added-file side below; unchanged context is elided at the @@ hunk markers.]
@@ -6,10 +6,11 @@ import logging
import mimetypes
import zipfile
import tempfile
import chardet
from datetime import datetime
from typing import List, Dict, Optional, Union, Tuple
from pathlib import Path
from urllib.parse import urlparse, urljoin

import requests
import validators
@@ -19,8 +20,10 @@ from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from cleantext import clean
import qrcode
from PIL import Image, ImageDraw, ImageFont
import numpy as np

# Setup enhanced logging with more detailed formatting
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
@@ -31,229 +34,164 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)

# Ensure output directories exist with modern structure
OUTPUTS_DIR = Path('output')
QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes'
TEMP_DIR = OUTPUTS_DIR / 'temp'
for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
    directory.mkdir(parents=True, exist_ok=True)
|
| 45 |
+
"""Advanced URL processing with complete content extraction"""
|
| 46 |
+
|
| 47 |
def __init__(self):
|
| 48 |
self.session = requests.Session()
|
| 49 |
+
self.timeout = 15 # Extended timeout for larger content
|
| 50 |
+
self.max_retries = 3
|
| 51 |
+
self.user_agent = UserAgent()
|
| 52 |
+
|
| 53 |
+
# Enhanced headers for better site compatibility
|
| 54 |
self.session.headers.update({
|
| 55 |
+
'User-Agent': self.user_agent.random,
|
| 56 |
+
'Accept': '*/*', # Accept all content types
|
| 57 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
| 58 |
'Accept-Encoding': 'gzip, deflate, br',
|
| 59 |
'Connection': 'keep-alive',
|
| 60 |
+
'Upgrade-Insecure-Requests': '1',
|
| 61 |
+
'Sec-Fetch-Dest': 'document',
|
| 62 |
+
'Sec-Fetch-Mode': 'navigate',
|
| 63 |
+
'Sec-Fetch-Site': 'none',
|
| 64 |
+
'Sec-Fetch-User': '?1',
|
| 65 |
+
'DNT': '1'
|
| 66 |
})
|
| 67 |
|
| 68 |
    def validate_url(self, url: str) -> Dict:
        """Enhanced URL validation with detailed feedback"""
        try:
            if not validators.url(url):
                return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}

            parsed = urlparse(url)
            if not all([parsed.scheme, parsed.netloc]):
                return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}

            # Try HEAD request first to check accessibility
            try:
                head_response = self.session.head(url, timeout=5)
                head_response.raise_for_status()
            except requests.exceptions.RequestException:
                # Some servers don't support HEAD; fall back to GET and reuse
                # the response so the success path below has headers to report
                head_response = self.session.get(url, timeout=self.timeout)
                head_response.raise_for_status()

            return {
                'is_valid': True,
                'message': 'URL is valid and accessible',
                'details': {
                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
                    'server': head_response.headers.get('Server', 'unknown'),
                    'size': head_response.headers.get('Content-Length', 'unknown')
                }
            }
        except Exception as e:
            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
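    # Illustrative return shape (editorial example, not part of the commit):
    # for a reachable URL this yields {'is_valid': True, 'message': 'URL is
    # valid and accessible', 'details': {...}}, and process_inputs further
    # down branches on result['is_valid'].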
    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
        """Enhanced content fetcher with retry mechanism and complete character extraction"""
        try:
            logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")

            # Update User-Agent randomly for each request
            self.session.headers.update({'User-Agent': self.user_agent.random})

            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            # Detect encoding
            if response.encoding is None:
                encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
            else:
                encoding = response.encoding

            # Decode content with fallback
            try:
                raw_content = response.content.decode(encoding, errors='replace')
            except (UnicodeDecodeError, LookupError):
                raw_content = response.content.decode('utf-8', errors='replace')

            # Extract metadata
            metadata = {
                'url': url,
                'timestamp': datetime.now().isoformat(),
                'encoding': encoding,
                'content_type': response.headers.get('Content-Type', ''),
                'content_length': len(response.content),
                'headers': dict(response.headers),
                'status_code': response.status_code
            }

            # Process based on content type
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' in content_type:
                processed_content = self._process_html_content(raw_content, url)
            else:
                processed_content = raw_content

            return {
                'content': processed_content,
                'raw_content': raw_content,
                'metadata': metadata
            }

        except requests.exceptions.RequestException as e:
            if retry_count < self.max_retries - 1:
                logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
                time.sleep(2 ** retry_count)  # Exponential backoff
                return self.fetch_content(url, retry_count + 1)
            logger.error(f"Failed to fetch content after {self.max_retries} attempts: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error while fetching content: {e}")
            return None
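    # Retry timing: with max_retries = 3 the backoff above sleeps 2**0 = 1s
    # after the first failed attempt and 2**1 = 2s after the second before
    # giving up; the recursive call preserves the method's return type.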
    def _process_html_content(self, content: str, base_url: str) -> str:
        """Process HTML content while preserving all characters"""
        try:
            soup = BeautifulSoup(content, 'html.parser')

            # Convert relative URLs to absolute
            for tag in soup.find_all(['a', 'img', 'link', 'script']):
                for attr in ['href', 'src']:
                    if tag.get(attr):
                        try:
                            tag[attr] = urljoin(base_url, tag[attr])
                        except Exception:
                            pass

            # Extract all text content
            text_parts = []
            for element in soup.stripped_strings:
                text_parts.append(str(element))

            return '\n'.join(text_parts)
        except Exception as e:
            logger.error(f"HTML processing error: {e}")
            return content
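    # Note: only the page's visible strings survive the extraction above; the
    # absolute URLs written into the tag attributes are discarded with the
    # markup. Callers that need the markup can use the 'raw_content' field
    # returned by fetch_content() alongside this processed text.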
class EnhancedFileProcessor:
    """Advanced file processing with complete content extraction"""

    def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
        self.max_file_size = max_file_size
        self.supported_extensions = {
            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
            '.pdf', '.doc', '.docx', '.rtf', '.odt'
        }
    def process_file(self, file) -> List[Dict]:
        """Process uploaded file with enhanced error handling and complete extraction"""
        if not file:
            return []
@@ -265,8 +203,11 @@ class FileProcessor:
            return []

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_dir_path = Path(temp_dir)

            # Handle different archive types
            if self._is_archive(file.name):
                dataset.extend(self._process_archive(file.name, temp_dir_path))
            else:
                dataset.extend(self._process_single_file(file))
@@ -276,261 +217,419 @@ class FileProcessor:

        return dataset
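        # The unchanged context elided at the hunk markers above is where
        # `dataset` is initialized and, presumably, where self.max_file_size
        # is enforced; those lines are not shown in this diff.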
    def _is_archive(self, filepath: str) -> bool:
        """Check if file is an archive"""
        return any(filepath.lower().endswith(ext) for ext in [
            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
        ])
    def _process_single_file(self, file) -> List[Dict]:
        """Process a single file with enhanced character extraction"""
        try:
            file_stat = os.stat(file.name)
            file_size = file_stat.st_size

            # Initialize content storage
            content_parts = []

            # Process file in chunks for large files
            chunk_size = 10 * 1024 * 1024  # 10MB chunks
            with open(file.name, 'rb') as f:
                while True:
                    chunk = f.read(chunk_size)
                    if not chunk:
                        break

                    # Detect encoding for each chunk
                    encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
                    try:
                        decoded_chunk = chunk.decode(encoding, errors='replace')
                        content_parts.append(decoded_chunk)
                    except (UnicodeDecodeError, LookupError):
                        decoded_chunk = chunk.decode('utf-8', errors='replace')
                        content_parts.append(decoded_chunk)

            # Combine all chunks
            complete_content = ''.join(content_parts)

            return [{
                'source': 'file',
                'filename': os.path.basename(file.name),
                'file_size': file_size,
                'mime_type': mimetypes.guess_type(file.name)[0],
                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
                'content': complete_content,
                'timestamp': datetime.now().isoformat()
            }]
        except Exception as e:
            logger.error(f"File processing error: {e}")
            return []
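    # Caveat: decoding each 10MB chunk independently can split a multi-byte
    # character at a chunk boundary. A sketch of a safer variant (editorial
    # suggestion, assuming UTF-8-dominant input) uses an incremental decoder
    # that carries partial characters across reads:
    #
    #   import codecs
    #   decoder = codecs.getincrementaldecoder('utf-8')('replace')
    #   content_parts.append(decoder.decode(chunk))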
    def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
        """Process an archive file with enhanced extraction"""
        dataset = []
        try:
            # Handle ZIP archives
            if zipfile.is_zipfile(archive_path):
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_to)
                    for file_info in zip_ref.infolist():
                        if file_info.file_size > 0 and not file_info.filename.endswith('/'):
                            extracted_path = extract_to / file_info.filename
                            if extracted_path.suffix.lower() in self.supported_extensions:
                                with open(extracted_path, 'rb') as f:
                                    dataset.extend(self._process_single_file(f))

            # TODO: Add support for other archive types (tar, 7z, etc.)

        except Exception as e:
            logger.error(f"Archive processing error: {e}")

        return dataset
    def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
        """Enhanced data chunking with sequence metadata"""
        try:
            # Convert data to JSON string
            json_str = json.dumps(data, ensure_ascii=False)
            total_length = len(json_str)

            # Calculate overhead for metadata
            metadata_template = {
                "chunk_index": 0,
                "total_chunks": 1,
                "total_length": total_length,
                "chunk_hash": "",
                "data": ""
            }
            overhead = len(json.dumps(metadata_template)) + 20  # Extra padding for safety

            # Calculate effective chunk size
            effective_chunk_size = max_size - overhead

            if total_length <= effective_chunk_size:
                # Data fits in one chunk
                chunk = {
                    "chunk_index": 0,
                    "total_chunks": 1,
                    "total_length": total_length,
                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,  # 32-bit hash
                    "data": json_str
                }
                return [chunk]

            # Calculate number of chunks needed
            num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
            chunk_size = -(-total_length // num_chunks)  # Even distribution

            chunks = []
            for i in range(num_chunks):
                start_idx = i * chunk_size
                end_idx = min(start_idx + chunk_size, total_length)
                chunk_data = json_str[start_idx:end_idx]

                chunk = {
                    "chunk_index": i,
                    "total_chunks": num_chunks,
                    "total_length": total_length,
                    "chunk_hash": hash(chunk_data) & 0xFFFFFFFF,
                    "data": chunk_data
                }
                chunks.append(chunk)

            return chunks

        except Exception as e:
            logger.error(f"Error chunking data: {e}")
            return []
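    # Two editorial notes on chunk_data:
    # - The 2953-byte default matches the binary capacity of a version-40 QR
    #   code at error-correction level L; generate_stylish_qr below uses
    #   ERROR_CORRECT_H, whose version-40 capacity is smaller (1273 bytes),
    #   so near-limit chunks can fail to fit when qr.make(fit=True) runs.
    # - Python's hash() is salted per process, so chunk_hash is not stable
    #   across runs; a reproducible alternative (sketch) would be
    #   int.from_bytes(hashlib.sha256(chunk_data.encode()).digest()[:4], 'big').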
def generate_stylish_qr(data: Union[str, Dict],
                        filename: str,
                        size: int = 10,
                        border: int = 4,
                        fill_color: str = "#000000",
                        back_color: str = "#FFFFFF") -> str:
    """Generate a stylish QR code with enhanced visual appeal"""
    try:
        qr = qrcode.QRCode(
            version=None,
            error_correction=qrcode.constants.ERROR_CORRECT_H,
            box_size=size,
            border=border
        )

        # Add data to QR code
        if isinstance(data, dict):
            qr.add_data(json.dumps(data, ensure_ascii=False))
        else:
            qr.add_data(data)

        qr.make(fit=True)

        # Create QR code image with custom colors
        qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)

        # Convert to RGBA for transparency support
        qr_image = qr_image.convert('RGBA')

        # Add subtle gradient overlay
        gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(gradient)
        for i in range(qr_image.width):
            alpha = int(255 * (1 - i / qr_image.width) * 0.1)  # 10% maximum opacity
            draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))

        # Combine images
        final_image = Image.alpha_composite(qr_image, gradient)

        # Save the image
        output_path = QR_CODES_DIR / filename
        final_image.save(output_path, quality=95)

        return str(output_path)

    except Exception as e:
        logger.error(f"QR generation error: {e}")
        return ""
def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
    """Generate QR codes with enhanced visual appeal and metadata"""
    try:
        file_processor = EnhancedFileProcessor()
        paths = []

        if combined:
            # Process combined data
            chunks = file_processor.chunk_data(data)
            for i, chunk in enumerate(chunks):
                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
                qr_path = generate_stylish_qr(
                    data=chunk,
                    filename=filename,
                    fill_color="#1a365d",  # Deep blue
                    back_color="#ffffff"
                )
                if qr_path:
                    paths.append(qr_path)
        else:
            # Process individual items
            if isinstance(data, list):
                for idx, item in enumerate(data):
                    chunks = file_processor.chunk_data(item)
                    for chunk_idx, chunk in enumerate(chunks):
                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
                        qr_path = generate_stylish_qr(
                            data=chunk,
                            filename=filename,
                            fill_color="#1a365d",  # Deep blue
                            back_color="#ffffff"
                        )
                        if qr_path:
                            paths.append(qr_path)
            else:
                chunks = file_processor.chunk_data(data)
                for i, chunk in enumerate(chunks):
                    filename = f'single_qr_{i+1}_of_{len(chunks)}_{int(time.time())}.png'
                    qr_path = generate_stylish_qr(
                        data=chunk,
                        filename=filename,
                        fill_color="#1a365d",  # Deep blue
                        back_color="#ffffff"
                    )
                    if qr_path:
                        paths.append(qr_path)

        return paths
    except Exception as e:
        logger.error(f"QR code generation error: {e}")
        return []
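# Reassembly sketch (not part of this commit): after scanning the generated
# codes, the sequence metadata lets a reader restore the original JSON:
#
#   chunks = sorted(decoded_chunks, key=lambda c: c['chunk_index'])
#   original = json.loads(''.join(c['data'] for c in chunks))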
def create_modern_interface():
    """Create a modern and visually appealing Gradio interface"""

    # Modern CSS styling
    css = """
    /* Modern color scheme */
    :root {
        --primary-color: #1a365d;
        --secondary-color: #2d3748;
        --accent-color: #4299e1;
        --background-color: #f7fafc;
        --success-color: #48bb78;
        --error-color: #f56565;
        --warning-color: #ed8936;
    }

    /* Container styling */
    .container {
        max-width: 1200px;
        margin: auto;
        padding: 2rem;
        background-color: var(--background-color);
        border-radius: 1rem;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }

    /* Component styling */
    .input-container {
        background-color: white;
        padding: 1.5rem;
        border-radius: 0.5rem;
        border: 1px solid #e2e8f0;
        margin-bottom: 1rem;
    }

    /* Button styling */
    .primary-button {
        background-color: var(--primary-color);
        color: white;
        padding: 0.75rem 1.5rem;
        border-radius: 0.375rem;
        border: none;
        cursor: pointer;
        transition: all 0.2s;
    }

    .primary-button:hover {
        background-color: var(--accent-color);
        transform: translateY(-1px);
    }

    /* Status messages */
    .status {
        padding: 1rem;
        border-radius: 0.375rem;
        margin: 1rem 0;
    }

    .status.success { background-color: #f0fff4; color: var(--success-color); }
    .status.error { background-color: #fff5f5; color: var(--error-color); }
    .status.warning { background-color: #fffaf0; color: var(--warning-color); }

    /* Gallery styling */
    .gallery {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
        gap: 1rem;
        padding: 1rem;
        background-color: white;
        border-radius: 0.5rem;
        border: 1px solid #e2e8f0;
    }

    .gallery img {
        width: 100%;
        height: auto;
        border-radius: 0.375rem;
        transition: transform 0.2s;
    }

    .gallery img:hover {
        transform: scale(1.05);
    }
    """

    # Create interface with modern design
    with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
        gr.Markdown("""
        # Advanced Data Processing & QR Code Generator

        Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
        """)

        with gr.Tab("URL Processing"):
            url_input = gr.Textbox(
                label="Enter URLs (comma or newline separated)",
                lines=5,
                placeholder="https://example1.com\nhttps://example2.com",
                value=""
            )

        with gr.Tab("File Input"):
            file_input = gr.File(
                label="Upload Files",
                file_types=["text", "zip"],
                file_count="multiple"
            )

        with gr.Tab("JSON Input"):
            text_input = gr.TextArea(
                label="Direct JSON Input",
                lines=15,
                placeholder="Paste your JSON data here...",
                value=""
            )

        with gr.Row():
            example_btn = gr.Button("Load Example", variant="secondary")
            clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Row():
            combine_data = gr.Checkbox(
                label="Combine all data into sequence",
                value=True,
                info="Generate sequential QR codes for combined data"
            )
            process_btn = gr.Button(
                "Process & Generate QR",
                variant="primary"
            )

        # Output components
        output_json = gr.JSON(label="Processed Data")
        output_gallery = gr.Gallery(
            label="Generated QR Codes",
            columns=3,
            height=400,
            show_label=True
        )
        output_text = gr.Textbox(
            label="Processing Status",
            interactive=False
        )

        # Load example data
        def load_example():
            example = {
                "type": "product_catalog",
                "items": [
                    {
                        "id": "123",
                        "name": "Premium Widget",
                        "description": "High-quality widget with advanced features",
                        "price": 299.99,
                        "category": "electronics",
                        "tags": ["premium", "featured", "new"]
                    },
                    {
                        "id": "456",
                        "name": "Basic Widget",
                        "description": "Reliable widget for everyday use",
                        "price": 149.99,
                        "category": "electronics",
                        "tags": ["basic", "popular"]
                    }
                ],
                "metadata": {
                    "timestamp": datetime.now().isoformat(),
                    "version": "2.0",
                    "source": "example"
                }
            }
            return json.dumps(example, indent=2)
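        # Serialized with indent=2 this example is well under the effective
        # chunk size computed in chunk_data, so processing it should yield a
        # single QR code.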
        def clear_input():
            return ""

        def process_inputs(urls, files, text, combine):
            """Process all inputs and generate QR codes"""
            try:
                results = []
                url_processor = EnhancedURLProcessor()
                file_processor = EnhancedFileProcessor()

                # Process JSON input
                if text and text.strip():
                    try:
                        json_data = json.loads(text)
@@ -540,17 +639,16 @@ def create_interface():
                        results.append(json_data)
                    except json.JSONDecodeError as e:
                        return None, [], f"❌ Invalid JSON format: {str(e)}"

                # Process URLs
                if urls and urls.strip():
                    url_list = re.split(r'[,\n]', urls)
                    url_list = [url.strip() for url in url_list if url.strip()]

                    for url in url_list:
                        validation = url_processor.validate_url(url)
                        if validation['is_valid']:
                            content = url_processor.fetch_content(url)
                            if content:
                                results.append({
                                    'source': 'url',
@@ -558,34 +656,28 @@ def create_interface():
                                    'content': content,
                                    'timestamp': datetime.now().isoformat()
                                })

                # Process files
                if files:
                    for file in files:
                        file_results = file_processor.process_file(file)
                        if file_results:
                            results.extend(file_results)

                # Generate QR codes
                if results:
                    qr_paths = generate_qr_codes(results, combine)
                    if qr_paths:
                        return (
                            results,
                            [str(path) for path in qr_paths],
                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
                        )
                    else:
                        return None, [], "❌ Failed to generate QR codes"
                else:
                    return None, [], "⚠️ No valid content to process"

            except Exception as e:
                logger.error(f"Processing error: {e}")
                return None, [], f"❌ Error: {str(e)}"
@@ -594,44 +686,57 @@ def create_interface():
        example_btn.click(load_example, outputs=[text_input])
        clear_btn.click(clear_input, outputs=[text_input])
        process_btn.click(
            process_inputs,
            inputs=[url_input, file_input, text_input, combine_data],
            outputs=[output_json, output_gallery, output_text]
        )

        # Add helpful documentation
        gr.Markdown("""
        ### Features

        - **Complete URL Scraping**: Extracts every character from web pages
        - **Advanced File Processing**: Full content extraction from text files and archives
        - **Smart JSON Handling**: Processes any size JSON with automatic chunking
        - **Sequential QR Codes**: Maintains data integrity across multiple codes
        - **Modern Design**: Clean, responsive interface with visual feedback

        ### Tips

        1. **URLs**: Enter multiple URLs separated by commas or newlines
        2. **Files**: Upload text files or ZIP archives containing text files
        3. **JSON**: Use the example button to see the expected format
        4. **QR Codes**: Choose whether to combine data into sequential codes
        5. **Processing**: Monitor the status for real-time feedback

        ### Output

        - Generated QR codes are saved in the `output/qr_codes` directory
        - Each QR code contains metadata for proper sequencing
        - Hover over QR codes in the gallery to see details
        """)

    return interface
def main():
    """Initialize and launch the application"""
    try:
        # Configure system settings
        mimetypes.init()

        # Create and launch interface
        interface = create_modern_interface()

        # Launch with configuration
        interface.launch(
            share=False,
            debug=False,
            show_error=True,
            show_api=False
        )
    except Exception as e:
        logger.error(f"Application startup error: {e}")
        raise

if __name__ == "__main__":
    main()