acecalisto3 committed on
Commit da7f558 · verified · 1 Parent(s): 8407f80

Update app.py

Files changed (1):
  1. app.py +129 -252
app.py CHANGED
@@ -11,6 +11,7 @@ from datetime import datetime
 from typing import List, Dict, Optional, Union, Tuple
 from pathlib import Path
 from urllib.parse import urlparse, urljoin
+
 import requests
 import validators
 import gradio as gr
@@ -21,8 +22,6 @@ from cleantext import clean
 import qrcode
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-import tarfile
-import gzip
 
 # Setup enhanced logging with more detailed formatting
 logging.basicConfig(
@@ -31,7 +30,8 @@ logging.basicConfig(
     handlers=[
         logging.StreamHandler(),
         logging.FileHandler('app.log', encoding='utf-8')
-    ])
+    ]
+)
 logger = logging.getLogger(__name__)
 
 # Ensure output directories exist with modern structure
@@ -43,13 +43,13 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
 
 class EnhancedURLProcessor:
     """Advanced URL processing with complete content extraction"""
-
+
     def __init__(self):
         self.session = requests.Session()
         self.timeout = 15  # Extended timeout for larger content
         self.max_retries = 3
         self.user_agent = UserAgent()
-
+
         # Enhanced headers for better site compatibility
         self.session.headers.update({
             'User-Agent': self.user_agent.random,
@@ -70,9 +70,11 @@ class EnhancedURLProcessor:
         try:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
+
             parsed = urlparse(url)
             if not all([parsed.scheme, parsed.netloc]):
                 return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
+
             # Try HEAD request first to check accessibility
             try:
                 head_response = self.session.head(url, timeout=5)
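
Note on the hunk above: some servers refuse HEAD outright (405 Method Not Allowed), so a validator built solely on `session.head` can report false negatives. A minimal sketch of a fallback, assuming the same `requests.Session`; the helper name is illustrative and not part of this commit:

```python
import requests

def head_or_get_status(session: requests.Session, url: str, timeout: int = 5) -> int:
    """Return the HTTP status, falling back to a streamed GET when HEAD is refused."""
    resp = session.head(url, timeout=timeout)
    if resp.status_code == 405:  # Method Not Allowed
        resp = session.get(url, timeout=timeout, stream=True)
        resp.close()  # we only need the status; skip downloading the body
    return resp.status_code
```
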
@@ -98,18 +100,19 @@
         """Enhanced content fetcher with retry mechanism and complete character extraction"""
         try:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
-
+
             # Update User-Agent randomly for each request
             self.session.headers.update({'User-Agent': self.user_agent.random})
-
+
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
-
+
             # Detect encoding
             if response.encoding is None:
                 encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
             else:
                 encoding = response.encoding
+
             # Decode content with fallback
             try:
                 raw_content = response.content.decode(encoding, errors='replace')
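
The detect-then-decode pattern in this hunk, reduced to a standalone sketch (assuming the `chardet` package): `response.encoding` is None when the Content-Type carries no charset, which is the case the branch guards against.

```python
import chardet
import requests

resp = requests.get("https://example.com", timeout=15)
encoding = resp.encoding or chardet.detect(resp.content)["encoding"] or "utf-8"
text = resp.content.decode(encoding, errors="replace")  # never raises
```
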
@@ -133,11 +136,13 @@
                 processed_content = self._process_html_content(raw_content, url)
             else:
                 processed_content = raw_content
+
             return {
                 'content': processed_content,
                 'raw_content': raw_content,
                 'metadata': metadata
             }
+
         except requests.exceptions.RequestException as e:
             if retry_count < self.max_retries - 1:
                 logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
@@ -153,7 +158,7 @@
         """Process HTML content while preserving all characters"""
         try:
             soup = BeautifulSoup(content, 'html.parser')
-
+
             # Convert relative URLs to absolute
             for tag in soup.find_all(['a', 'img', 'link', 'script']):
                 for attr in ['href', 'src']:
@@ -162,10 +167,12 @@
                             tag[attr] = urljoin(base_url, tag[attr])
                         except Exception:
                             pass
+
             # Extract all text content
             text_parts = []
             for element in soup.stripped_strings:
                 text_parts.append(str(element))
+
             return '\n'.join(text_parts)
         except Exception as e:
             logger.error(f"HTML processing error: {e}")
@@ -177,7 +184,7 @@ class EnhancedFileProcessor:
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
         self.max_file_size = max_file_size
         self.supported_extensions = {
-            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
+            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
             '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
             '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
             '.pdf', '.doc', '.docx', '.rtf', '.odt'
@@ -197,18 +204,17 @@ class EnhancedFileProcessor:
 
             with tempfile.TemporaryDirectory() as temp_dir:
                 temp_dir_path = Path(temp_dir)
-
+
                 # Handle different archive types
                 if self._is_archive(file.name):
                     dataset.extend(self._process_archive(file.name, temp_dir_path))
-                elif Path(file.name).suffix.lower() in self.supported_extensions:
-                    dataset.extend(self._process_single_file(file))
                 else:
-                    logger.warning(f"Unsupported file type: {file.name}")
+                    dataset.extend(self._process_single_file(file))
 
         except Exception as e:
             logger.error(f"Error processing file: {str(e)}")
             return []
+
         return dataset
 
     def _is_archive(self, filepath: str) -> bool:
@@ -218,14 +224,14 @@ class EnhancedFileProcessor:
         ])
 
     def _process_single_file(self, file) -> List[Dict]:
-        """Process a single file with enhanced character extraction and JSON handling"""
+        """Process a single file with enhanced character extraction"""
         try:
             file_stat = os.stat(file.name)
             file_size = file_stat.st_size
-
+
             # Initialize content storage
             content_parts = []
-
+
             # Process file in chunks for large files
             chunk_size = 10 * 1024 * 1024  # 10MB chunks
             with open(file.name, 'rb') as f:
@@ -233,7 +239,7 @@ class EnhancedFileProcessor:
                     chunk = f.read(chunk_size)
                     if not chunk:
                         break
-
+
                     # Detect encoding for each chunk
                     encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
                     try:
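
Calling `chardet.detect` on each 10MB chunk, as above, can misfire when a multi-byte sequence straddles a chunk boundary. A hedged alternative sketch using chardet's incremental detector, which accumulates evidence across chunks before deciding (not what this commit does):

```python
from chardet.universaldetector import UniversalDetector

def detect_file_encoding(path: str, chunk_size: int = 10 * 1024 * 1024) -> str:
    """Detect one encoding for the whole file instead of one per chunk."""
    detector = UniversalDetector()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    return detector.result["encoding"] or "utf-8"
```
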
@@ -246,43 +252,6 @@ class EnhancedFileProcessor:
             # Combine all chunks
             complete_content = ''.join(content_parts)
 
-            # Check if the content is valid JSON regardless of file extension
-            try:
-                if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
-                    # It's a JSON file by type or extension
-                    json_data = json.loads(complete_content)
-                    return [{
-                        'source': 'json_file',
-                        'filename': os.path.basename(file.name),
-                        'file_size': file_size,
-                        'mime_type': 'application/json',
-                        'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                        'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                        'content': json_data,  # Store the parsed JSON object
-                        'raw_content': complete_content,  # Store the original JSON string
-                        'timestamp': datetime.now().isoformat()
-                    }]
-                else:
-                    # Try to parse as JSON anyway
-                    try:
-                        json_data = json.loads(complete_content)
-                        # If we get here, it's valid JSON despite the extension
-                        return [{
-                            'source': 'json_content',
-                            'filename': os.path.basename(file.name),
-                            'file_size': file_size,
-                            'mime_type': 'application/json',
-                            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                            'content': json_data,  # Store the parsed JSON object
-                            'raw_content': complete_content,  # Store the original JSON string
-                            'timestamp': datetime.now().isoformat()
-                        }]
-                    except json.JSONDecodeError:
-                        logger.warning(f"File {file.name} is not valid JSON.")
-            except Exception as e:
-                logger.error(f"Error during JSON processing: {e}")
-
             return [{
                 'source': 'file',
                 'filename': os.path.basename(file.name),
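
The deleted branch amounts to this sniffing pattern (sketch only; `complete_content` is the variable assembled in the function above):

```python
import json

try:
    content = json.loads(complete_content)  # parsed object when the text is valid JSON
except json.JSONDecodeError:
    content = complete_content  # otherwise fall back to the raw string
```
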
@@ -311,46 +280,22 @@ class EnhancedFileProcessor:
                         if extracted_path.suffix.lower() in self.supported_extensions:
                             with open(extracted_path, 'rb') as f:
                                 dataset.extend(self._process_single_file(f))
-            # Handle TAR archives
-            elif archive_path.lower().endswith(('.tar', '.tar.gz', '.tgz')):
-                try:
-                    with tarfile.open(archive_path, 'r:*') as tar_ref:
-                        for member in tar_ref.getmembers():
-                            if member.isfile():
-                                extracted_path = extract_to / member.name
-                                tar_ref.extract(member, path=extract_to)
-                                if extracted_path.suffix.lower() in self.supported_extensions:
-                                    with open(extracted_path, 'rb') as f:
-                                        dataset.extend(self._process_single_file(f))
-                except tarfile.TarError as e:
-                    logger.error(f"Error processing TAR archive: {e}")
-            # Handle GZIP archives (single file)
-            elif archive_path.lower().endswith('.gz'):
-                extracted_path = extract_to / Path(archive_path).stem
-                try:
-                    with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
-                        outfile.write(gz_file.read())
-                    if extracted_path.suffix.lower() in self.supported_extensions:
-                        with open(extracted_path, 'rb') as f:
-                            dataset.extend(self._process_single_file(f))
-                except gzip.GzipFile as e:
-                    logger.error(f"Error processing GZIP archive: {e}")
-            # TODO: Add support for other archive types (.bz2, .7z, .rar) - may require external libraries
-            elif archive_path.lower().endswith(('.bz2', '.7z', '.rar')):
-                logger.warning(f"Support for {Path(archive_path).suffix} archives is not yet fully implemented.")
 
+            # TODO: Add support for other archive types (tar, 7z, etc.)
+
         except Exception as e:
             logger.error(f"Archive processing error: {e}")
+
         return dataset
 
     def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
-        """Enhanced data chunking with sequence metadata"""
         try:
-            # Convert data to JSON string
+            # Convert data to JSON bytes
             json_str = json.dumps(data, ensure_ascii=False)
-            total_length = len(json_str)
-
-            # Calculate overhead for metadata
+            json_bytes = json_str.encode('utf-8')
+            total_length = len(json_bytes)
+
+            # Calculate metadata overhead in bytes
             metadata_template = {
                 "chunk_index": 0,
                 "total_chunks": 1,
@@ -358,32 +303,48 @@ class EnhancedFileProcessor:
                 "chunk_hash": "",
                 "data": ""
             }
-            overhead = len(json.dumps(metadata_template)) + 20  # Extra padding for safety
-
-            # Calculate effective chunk size
-            effective_chunk_size = max_size - overhead
-
-            if total_length <= effective_chunk_size:
-                # Data fits in one chunk
+            overhead_bytes = len(json.dumps(metadata_template).encode('utf-8')) + 20  # Add padding
+
+            effective_chunk_size = max_size - overhead_bytes
+
+            if effective_chunk_size <= 0:
+                raise ValueError("Max size is too small after accounting for metadata overhead")
+
+            chunks = []
+            start = 0
+            while start < total_length:
+                end = start + effective_chunk_size
+                # Ensure valid Unicode by decoding
+                chunk_str = json_bytes[start:end].decode('utf-8', errors='replace')
                 chunk = {
-                    "chunk_index": 0,
-                    "total_chunks": 1,
+                    "chunk_index": len(chunks),
+                    "total_chunks": -1,  # To be set later
                     "total_length": total_length,
-                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,  # 32-bit hash
-                    "data": json_str
+                    "chunk_hash": hash(chunk_str) & 0xFFFFFFFF,
+                    "data": chunk_str
                 }
-                return [chunk]
-
+                chunks.append(chunk)
+                start = end
+
+            # Update total_chunks in each chunk
+            for i, chunk in enumerate(chunks):
+                chunk["total_chunks"] = len(chunks)
+
+            return chunks
+        except Exception as e:
+            logger.error(f"Error chunking data: {e}")
+            return []
+
             # Calculate number of chunks needed
             num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
             chunk_size = -(-total_length // num_chunks)  # Even distribution
-
+
             chunks = []
             for i in range(num_chunks):
                 start_idx = i * chunk_size
                 end_idx = min(start_idx + chunk_size, total_length)
                 chunk_data = json_str[start_idx:end_idx]
-
+
                 chunk = {
                     "chunk_index": i,
                     "total_chunks": num_chunks,
@@ -392,56 +353,58 @@
                     "data": chunk_data
                 }
                 chunks.append(chunk)
-
+
             return chunks
+
         except Exception as e:
             logger.error(f"Error chunking data: {e}")
             return []
 
-    def generate_stylish_qr(data: Union[str, Dict],
-                            filename: str,
-                            size: int = 10,
-                            border: int = 4,
-                            fill_color: str = "#000000",
-                            back_color: str = "#FFFFFF") -> str:
+    def generate_stylish_qr(data: Union[str, Dict],
+                            filename: str,
+                            size: int = 10,
+                            border: int = 4,
+                            fill_color: str = "#000000",
+                            back_color: str = "#FFFFFF") -> str:
         """Generate a stylish QR code with enhanced visual appeal"""
         try:
             qr = qrcode.QRCode(
                 version=None,
-                error_correction=qrcode.constants.ERROR_CORRECT_H,
+                error_correction=qrcode.constants.ERROR_CORRECT_M,
                 box_size=size,
                 border=border
             )
-
+
             # Add data to QR code
             if isinstance(data, dict):
                 qr.add_data(json.dumps(data, ensure_ascii=False))
             else:
                 qr.add_data(data)
-
+
             qr.make(fit=True)
-
+
             # Create QR code image with custom colors
             qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
-
+
             # Convert to RGBA for transparency support
             qr_image = qr_image.convert('RGBA')
-
+
             # Add subtle gradient overlay
             gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
             draw = ImageDraw.Draw(gradient)
             for i in range(qr_image.width):
                 alpha = int(255 * (1 - i/qr_image.width) * 0.1)  # 10% maximum opacity
                 draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
-
+
             # Combine images
             final_image = Image.alpha_composite(qr_image, gradient)
-
+
             # Save the image
             output_path = QR_CODES_DIR / filename
             final_image.save(output_path, quality=95)
-
+
             return str(output_path)
+
         except Exception as e:
             logger.error(f"QR generation error: {e}")
             return ""
@@ -451,7 +414,7 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
     try:
         file_processor = EnhancedFileProcessor()
         paths = []
-
+
         if combined:
             # Process combined data
             chunks = file_processor.chunk_data(data)
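
A capacity caveat on the hunks above: `max_size=2953` matches the byte capacity of a version-40 QR code at error-correction level L, but `generate_stylish_qr` now uses level M, whose version-40 byte capacity is lower (about 2331 bytes), so chunks near the default limit may overflow. A rough check with the `qrcode` library, as an assumption-laden sketch:

```python
import qrcode

qr = qrcode.QRCode(version=None, error_correction=qrcode.constants.ERROR_CORRECT_M)
qr.add_data("x" * 2300)  # byte-mode payload near the level-M limit
qr.make(fit=True)
print(qr.version)  # expect 40; much larger payloads raise DataOverflowError
```
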
@@ -492,126 +455,15 @@
             )
             if qr_path:
                 paths.append(qr_path)
-            return paths
+
+        return paths
     except Exception as e:
         logger.error(f"QR code generation error: {e}")
         return []
 
-def create_qr_visualizer(qr_paths, metadata=None):
-    """Create an interactive visualization of sequenced QR codes"""
-    if not qr_paths:
-        return None
-
-    # Extract metadata from QR codes if not provided
-    if metadata is None:
-        metadata = []
-        for path in qr_paths:
-            try:
-                img = Image.open(path)
-                qr = qrcode.QRCode()
-                data = qrcode.image.pil.PilImage.get_qr_data(img)
-                if data:
-                    metadata.append(json.loads(data))
-                else:
-                    # If can't extract, add placeholder
-                    metadata.append({"chunk_index": len(metadata), "total_chunks": len(qr_paths)})
-            except Exception as e:
-                logger.error(f"Error extracting QR metadata: {e}")
-                metadata.append({"chunk_index": len(metadata), "total_chunks": len(qr_paths)})
-
-    # Compute optimal grid size
-    total_codes = len(qr_paths)
-    grid_size = math.ceil(math.sqrt(total_codes))
-
-    # Create a composite image with placeholders for disabled QR codes
-    def create_composite(enabled_indices):
-        # Size calculations for the grid
-        qr_size = 200  # Size of each QR code in pixels
-        padding = 20  # Padding between QR codes
-
-        # Create grid for visualization
-        grid_width = grid_size * (qr_size + padding) + padding
-        grid_height = grid_size * (qr_size + padding) + padding
-
-        # Create a white background image
-        composite = Image.new('RGBA', (grid_width, grid_height), (255, 255, 255, 255))
-        draw = ImageDraw.Draw(composite)
-
-        # Load and place QR codes on the grid
-        for i, path in enumerate(qr_paths):
-            # Calculate grid position
-            row = i // grid_size
-            col = i % grid_size
-
-            # Calculate pixel position
-            x = col * (qr_size + padding) + padding
-            y = row * (qr_size + padding) + padding
-
-            if i in enabled_indices:
-                try:
-                    # Load and resize QR code
-                    qr_img = Image.open(path)
-                    qr_img = qr_img.resize((qr_size, qr_size), Image.Resampling.LANCZOS)
-
-                    # Extract metadata for this QR
-                    meta = metadata[i] if i < len(metadata) else {}
-                    chunk_index = meta.get("chunk_index", i)
-                    total_chunks = meta.get("total_chunks", len(qr_paths))
-
-                    # Add visual indicator for sequence position
-                    sequence_indicator = Image.new('RGBA', (qr_size, 30), (26, 54, 93, 200))  # Dark blue
-                    draw_indicator = ImageDraw.Draw(sequence_indicator)
-                    draw_indicator.text((10, 5), f"#{chunk_index+1} of {total_chunks}", fill=(255, 255, 255))
-
-                    # Combine QR with indicator
-                    qr_with_indicator = Image.new('RGBA', (qr_size, qr_size + 30))
-                    qr_with_indicator.paste(qr_img, (0, 0))
-                    qr_with_indicator.paste(sequence_indicator, (0, qr_size), sequence_indicator)
-
-                    # Paste onto composite
-                    composite.paste(qr_with_indicator, (x, y))
-
-                    # Draw connection lines based on sequence
-                    if i > 0:
-                        prev_x = (col - 1) * (qr_size + padding) + padding if col > 0 else x
-                        prev_y = (row * (qr_size + padding)) + padding
-                        draw.line([(prev_x + qr_size // 2, prev_y + qr_size), (x + qr_size // 2, y)], fill=(0, 0, 0, 255), width=2)
-
-        return composite
-
-    # Create a toggleable interface for enabling/disabling QR codes
-    enabled_indices = list(range(total_codes))  # Start with all enabled
-    def toggle_qr(index):
-        if index in enabled_indices:
-            enabled_indices.remove(index)
-        else:
-            enabled_indices.append(index)
-        return create_composite(enabled_indices)
-
-    # Create the initial composite image
-    initial_composite = create_composite(enabled_indices)
-
-    # Display the composite image
-    plt.figure(figsize=(10, 10))
-    plt.imshow(initial_composite)
-    plt.axis('off')
-    plt.show()
-
-    return toggle_qr
-
-# Integrate the visualizer into the main application
-def visualize_qr_codes(qr_paths):
-    """Visualize the generated QR codes with enable/disable functionality"""
-    toggle_function = create_qr_visualizer(qr_paths)
-    return toggle_function
-
-# Add a button in the Gradio interface to trigger visualization
-visualize_btn = gr.Button("🔍 Visualize QR Codes")
-visualize_btn.click(visualize_qr_codes, inputs=output_gallery, outputs=None)
-
 def create_modern_interface():
     """Create a modern and visually appealing Gradio interface"""
-
+
     # Modern CSS styling
     css = """
     /* Modern color scheme */
@@ -624,6 +476,7 @@ def create_modern_interface():
         --error-color: #f56565;
         --warning-color: #ed8936;
     }
+
     /* Container styling */
     .container {
         max-width: 1200px;
@@ -633,6 +486,7 @@ def create_modern_interface():
         border-radius: 1rem;
         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
     }
+
     /* Component styling */
     .input-container {
         background-color: white;
@@ -641,6 +495,7 @@ def create_modern_interface():
         border: 1px solid #e2e8f0;
         margin-bottom: 1rem;
     }
+
     /* Button styling */
     .primary-button {
         background-color: var(--primary-color);
@@ -651,19 +506,23 @@ def create_modern_interface():
         cursor: pointer;
         transition: all 0.2s;
     }
+
     .primary-button:hover {
         background-color: var(--accent-color);
         transform: translateY(-1px);
     }
+
     /* Status messages */
     .status {
         padding: 1rem;
         border-radius: 0.375rem;
         margin: 1rem 0;
     }
+
     .status.success { background-color: #f0fff4; color: var(--success-color); }
     .status.error { background-color: #fff5f5; color: var(--error-color); }
     .status.warning { background-color: #fffaf0; color: var(--warning-color); }
+
     /* Gallery styling */
     .gallery {
         display: grid;
@@ -674,22 +533,27 @@ def create_modern_interface():
         border-radius: 0.5rem;
         border: 1px solid #e2e8f0;
     }
+
     .gallery img {
         width: 100%;
         height: auto;
         border-radius: 0.375rem;
         transition: transform 0.2s;
     }
+
     .gallery img:hover {
         transform: scale(1.05);
     }
     """
+
     # Create interface with modern design
     with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
         gr.Markdown("""
         # 🌐 Advanced Data Processing & QR Code Generator
+
         Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
         """)
+
         with gr.Tab("📝 URL Processing"):
             url_input = gr.Textbox(
                 label="Enter URLs (comma or newline separated)",
@@ -697,12 +561,14 @@ def create_modern_interface():
                 placeholder="https://example1.com\nhttps://example2.com",
                 value=""
             )
+
         with gr.Tab("📁 File Input"):
             file_input = gr.File(
                 label="Upload Files",
-                file_types=["*"],  # Accept all file types
+                file_types=["text/*", "application/zip"],  # Allow all text files and ZIP
                 file_count="multiple"
             )
+
         with gr.Tab("📋 JSON Input"):
             text_input = gr.TextArea(
                 label="Direct JSON Input",
@@ -710,9 +576,11 @@ def create_modern_interface():
                 placeholder="Paste your JSON data here...",
                 value=""
             )
+
         with gr.Row():
             example_btn = gr.Button("📝 Load Example", variant="secondary")
             clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
         with gr.Row():
             combine_data = gr.Checkbox(
                 label="Combine all data into sequence",
@@ -723,6 +591,7 @@ def create_modern_interface():
             "🔄 Process & Generate QR",
             variant="primary"
         )
+
         # Output components
         output_json = gr.JSON(label="Processed Data")
         output_gallery = gr.Gallery(
@@ -791,6 +660,7 @@ def create_modern_interface():
             if urls and urls.strip():
                 url_list = re.split(r'[,\n]', urls)
                 url_list = [url.strip() for url in url_list if url.strip()]
+
                 for url in url_list:
                     validation = url_processor.validate_url(url)
                     if validation['is_valid']:
@@ -823,6 +693,7 @@ def create_modern_interface():
                     return None, [], "❌ Failed to generate QR codes"
             else:
                 return None, [], "⚠️ No valid content to process"
+
         except Exception as e:
             logger.error(f"Processing error: {e}")
             return None, [], f"❌ Error: {str(e)}"
@@ -839,22 +710,28 @@ def create_modern_interface():
         # Add helpful documentation
         gr.Markdown("""
         ### 🚀 Features
-        - **Complete URL Scraping**: Extracts every character from web pages
-        - **Advanced File Processing**: Full content extraction from various text-based files and common archives. Supports flexible JSON handling.
-        - **Smart JSON Handling**: Processes any size JSON with automatic chunking, either via direct input or file upload.
-        - **Sequential QR Codes**: Maintains data integrity across multiple codes
-        - **Modern Design**: Clean, responsive interface with visual feedback
-        ### 💡 Tips
-        1. **URLs**: Enter multiple URLs separated by commas or newlines
-        2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and JSON files.
-        3. **JSON**: Use the example button to see the expected format or upload a .json file. The system will also try to detect JSON content in other file types.
-        4. **QR Codes**: Choose whether to combine data into sequential codes
-        5. **Processing**: Monitor the status for real-time feedback
-        ### 🎨 Output
-        - Generated QR codes are saved in the `output/qr_codes` directory
-        - Each QR code contains metadata for proper sequencing
-        - Hover over QR codes in the gallery to see details
-        """)
+
+        - **Complete URL Scraping**: Extracts every character from web pages
+        - **Advanced File Processing**: Full content extraction from text files and archives
+        - **Smart JSON Handling**: Processes any size JSON with automatic chunking
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes
+        - **Modern Design**: Clean, responsive interface with visual feedback
+
+        ### 💡 Tips
+
+        1. **URLs**: Enter multiple URLs separated by commas or newlines
+        2. **Files**: Upload text files or ZIP archives containing text files
+        3. **JSON**: Use the example button to see the expected format
+        4. **QR Codes**: Choose whether to combine data into sequential codes
+        5. **Processing**: Monitor the status for real-time feedback
+
+        ### 🎨 Output
+
+        - Generated QR codes are saved in the `output/qr_codes` directory
+        - Each QR code contains metadata for proper sequencing
+        - Hover over QR codes in the gallery to see details
+        """)
+
         return interface
 
 def main():
@@ -865,7 +742,7 @@ def main():
 
         # Create and launch interface
         interface = create_modern_interface()
-
+
         # Launch with configuration
         interface.launch(
             share=False,
@@ -878,4 +755,4 @@ def main():
         raise
 
 if __name__ == "__main__":
-    main()
+    main()