acecalisto3 committed
Commit c19dd51 · verified · 1 Parent(s): 2e9ddb9

Update app2.py

Files changed (1)
  1. app2.py +302 -374
app2.py CHANGED
@@ -55,6 +55,7 @@ except ImportError:
55
 
56
  try:
57
  from pyth.plugins.plaintext.writer import PlaintextWriter
 
58
  RTF_SUPPORT = True
59
  except ImportError:
60
  RTF_SUPPORT = False
@@ -86,108 +87,129 @@ TEMP_DIR = OUTPUTS_DIR / 'temp'
86
  for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
87
  directory.mkdir(parents=True, exist_ok=True)
88
 
 
 
89
  class EnhancedURLProcessor:
90
- """Advanced URL processing with enhanced content extraction and recursive link following."""
91
 
92
  def __init__(self):
93
- self.session = requests.Session()
94
- self.timeout = 15 # Extended timeout for larger content
 
 
95
  self.max_retries = 3
96
- self.user_agent = UserAgent()
97
- # Enhanced headers for better site compatibility
98
- self.session.headers.update({
99
- 'User-Agent': self.user_agent.random,
100
- 'Accept': 'text/html, application/json, application/xml, text/plain, */*', # Request common types
101
- 'Accept-Language': 'en-US,en;q=0.9',
102
- 'Accept-Encoding': 'gzip, deflate, br',
103
- 'Connection': 'keep-alive',
104
- 'Upgrade-Insecure-Requests': '1', # May be ignored for non-HTML
105
- 'Sec-Fetch-Dest': 'document',
106
- 'Sec-Fetch-Mode': 'navigate',
107
- 'Sec-Fetch-Site': 'none',
108
- 'Sec-Fetch-User': '?1',
109
- 'DNT': '1'
110
- })
111
 
112
- def validate_url(self, url: str) -> Dict[str, Any]:
113
- """Enhanced URL validation with detailed feedback"""
114
- try:
115
- if not validators.url(url):
116
- return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
117
- parsed = urlparse(url)
118
- if not all([parsed.scheme, parsed.netloc]):
119
- return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
120
- # Try HEAD request first to check accessibility
121
- try:
122
- head_response = self.session.head(url, timeout=5)
123
- head_response.raise_for_status()
124
- final_url = head_response.url # Capture potential redirects
125
- content_type = head_response.headers.get('Content-Type', 'unknown')
126
- server = head_response.headers.get('Server', 'unknown')
127
- size = head_response.headers.get('Content-Length', 'unknown')
128
- except requests.exceptions.RequestException:
129
- # If HEAD fails, try GET as some servers don't support HEAD
130
- try:
131
- response = self.session.get(url, timeout=self.timeout)
132
- response.raise_for_status()
133
- final_url = response.url # Capture potential redirects
134
- content_type = response.headers.get('Content-Type', 'unknown')
135
- server = response.headers.get('Server', 'unknown')
136
- size = response.headers.get('Content-Length', 'unknown') # May not be accurate for full content
137
- except requests.exceptions.RequestException as get_e:
138
- return {'is_valid': False, 'message': f'URL not accessible after HEAD/GET attempts: {str(get_e)}', 'details': str(get_e)}
139
- except Exception as get_e:
140
- return {'is_valid': False, 'message': f'Unexpected error during GET validation: {str(get_e)}', 'details': str(get_e)}
141
 
142
 
143
- return {
144
- 'is_valid': True,
145
- 'message': 'URL is valid and accessible',
146
- 'details': {
147
- 'final_url': final_url,
148
- 'content_type': content_type,
149
- 'server': server,
150
- 'size': size
151
- }
152
  }
153
- except Exception as e:
154
- return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
155
 
156
  def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
157
- """Enhanced content fetcher with retry mechanism and complete character extraction"""
158
  try:
159
- logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
160
- # Update User-Agent randomly for each request
161
- self.session.headers.update({'User-Agent': self.user_agent.random})
162
  response = self.session.get(url, timeout=self.timeout)
163
  response.raise_for_status()
164
- final_url = response.url # Capture potential redirects
165
  content_type = response.headers.get('Content-Type', '')
166
 
167
- # Detect encoding
168
- if response.encoding is None or response.encoding == 'ISO-8859-1': # chardet often better than default response.encoding for text
169
- encoding_detection = chardet.detect(response.content)
170
- encoding = encoding_detection['encoding'] or 'utf-8'
171
- logger.debug(f"Detected encoding '{encoding}' with confidence {encoding_detection['confidence']:.2f} for {url}")
172
- else:
173
- encoding = response.encoding
174
- logger.debug(f"Using response.encoding '{encoding}' for {url}")
175
 
176
- # Decode content with fallback
177
- try:
178
- raw_content = response.content.decode(encoding, errors='replace')
179
- except (UnicodeDecodeError, LookupError):
180
- # Fallback to a more common encoding if the first attempt fails
181
- try:
182
- raw_content = response.content.decode('utf-8', errors='replace')
183
- encoding = 'utf-8 (fallback)'
184
- logger.warning(f"Decoding with {encoding} fallback for {url}")
185
- except Exception:
186
- raw_content = response.content.decode('latin-1', errors='replace') # Another common fallback
187
- encoding = 'latin-1 (fallback)'
188
- logger.warning(f"Decoding with {encoding} fallback for {url}")
189
-
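For reference, the decode-with-fallback chain removed above, condensed into a standalone sketch (the helper name `decode_best_effort` is illustrative and not part of app2.py; `chardet` is the same library the file already uses):

```python
import chardet
from typing import Optional, Tuple

def decode_best_effort(payload: bytes, declared: Optional[str]) -> Tuple[str, str]:
    """Pick an encoding via chardet when the declared one is absent or ISO-8859-1,
    then decode with errors='replace', falling back to utf-8 and finally latin-1."""
    encoding = declared
    if encoding is None or encoding == 'ISO-8859-1':
        guess = chardet.detect(payload)
        encoding = guess['encoding'] or 'utf-8'
    for candidate in (encoding, 'utf-8', 'latin-1'):
        try:
            return payload.decode(candidate, errors='replace'), candidate
        except (UnicodeDecodeError, LookupError):
            # errors='replace' rarely raises; this mainly guards unknown codec names.
            continue
    return payload.decode('latin-1', errors='replace'), 'latin-1'
```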
190
- # Extract metadata
191
  metadata = {
192
  'original_url': url,
193
  'final_url': final_url,
@@ -199,7 +221,7 @@ class EnhancedURLProcessor:
199
  'status_code': response.status_code
200
  }
201
 
202
- # Process based on content type
203
  processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
204
 
205
  return {
@@ -212,29 +234,30 @@ class EnhancedURLProcessor:
212
  }
213
  except requests.exceptions.RequestException as e:
214
  if retry_count < self.max_retries - 1:
215
- logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
216
- time.sleep(2 ** retry_count) # Exponential backoff
217
  return self.fetch_content(url, retry_count + 1)
218
- logger.error(f"Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
219
  return {
220
  'source': 'url',
221
  'url': url,
222
  'raw_content': None,
223
  'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, # Include basic metadata on failure
224
  'extracted_data': None,
225
- 'processing_notes': [f"Failed to fetch content after {self.max_retries} attempts: {str(e)}"] # Ensure notes is a list
226
  }
227
  except Exception as e:
228
- logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
229
  return {
230
  'source': 'url',
231
  'url': url,
232
  'raw_content': raw_content if 'raw_content' in locals() else None,
233
  'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
234
  'extracted_data': None,
235
- 'processing_notes': [f"Unexpected processing error: {str(e)}"]
236
  }
237
 
 
238
  def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
239
  """Process content based on detected content type"""
240
  lower_content_type = content_type.lower()
@@ -353,252 +376,87 @@ class EnhancedURLProcessor:
353
  return extracted
354
 
355
  def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
356
- if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
357
- logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
358
- return {
359
- 'url': url,
360
- 'level': 0,
361
- 'fetch_result': None,
362
- 'linked_extractions': [],
363
- 'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
364
- }
365
-
366
- validation_result = self.validate_url(url)
367
- if not validation_result['is_valid']:
368
- logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
369
- return {
370
- 'url': url,
371
- 'level': 0,
372
- 'fetch_result': None,
373
- 'linked_extractions': [],
374
- 'note': f"Initial URL validation failed: {validation_result['message']}"
375
- }
376
-
377
- return self._fetch_content_recursive(url, max_steps, current_step=0)
378
-
379
- def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
380
- if current_step > max_steps:
381
- logger.debug(f"Depth limit reached for {url} at level {current_step}.")
382
- return {
383
- 'url': url,
384
- 'level': current_step,
385
- 'fetch_result': None,
386
- 'linked_extractions': [],
387
- 'note': f"Depth limit ({max_steps}) reached."
388
- }
389
-
390
- logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
391
- fetch_result = self.fetch_content(url)
392
- linked_extractions: List[Dict[str, Any]] = []
393
-
394
- if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
395
- extracted_data = fetch_result['extracted_data']
396
- links = extracted_data.get('links', [])
397
-
398
- logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
399
- if current_step < max_steps:
400
- for link_info in links:
401
- linked_url = link_info.get('url')
402
- if linked_url:
403
- linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
404
- linked_extractions.append(linked_result)
405
-
406
- return {
407
- 'url': url,
408
- 'level': current_step,
409
- 'fetch_result': fetch_result,
410
- 'linked_extractions': linked_extractions,
411
- 'note': f"Processed at level {current_step}"
412
- }
413
-
414
- class EnhancedURLProcessor:
415
- def fetch_content_with_depth(self, url, max_steps, current_level=0):
416
- """Simulates fetching and processing URLs up to max_steps depth."""
417
- # print(f"Simulating fetch for {url} at level {current_level} with remaining steps {max_steps}...") # Debug print
418
 
419
- # Simulate handling invalid URL format
420
- if not url.startswith('http://') and not url.startswith('https://'):
421
- return {
 
 
422
  'url': url,
423
- 'level': current_level,
424
  'fetch_result': None,
425
  'linked_extractions': [],
426
- 'processing_notes': 'Invalid URL format.'
427
  }
428
 
429
- # Base case for recursion depth
430
- if max_steps < 0:
431
- # This case should ideally not be reached if initial max_steps is non-negative
432
- # and recursion correctly decrements, but included for robustness.
433
  return {
434
  'url': url,
435
- 'level': current_level,
436
  'fetch_result': None,
437
  'linked_extractions': [],
438
- 'processing_notes': f'Recursion depth limit reached unexpectedly at level {current_level}.'
439
  }
440
 
441
- fetch_success = True # Assume success for simulation by default
442
- fetch_content = f"Simulated content for {url}" # Dummy content
443
- processing_notes = ""
444
 
445
- # Simulate a potentially failing URL
446
- if "this-domain-does-not-exist" in url:
447
- fetch_success = False
448
- fetch_content = None
449
- processing_notes = "Simulated network error: Could not resolve host."
450
-
451
- linked_extractions = []
452
- # Simulate finding links only if more steps are allowed and fetch was successful
453
- if max_steps > 0 and fetch_success:
454
- # Simulate finding a couple of links to demonstrate nesting
455
- # In a real implementation, this would involve parsing the fetched content
456
- # and resolving relative URLs.
457
- simulated_linked_urls = [f"{url}/child1", f"{url}/child2"]
458
- for linked_url in simulated_linked_urls:
459
- # Recursively call for linked URLs, decreasing max_steps and increasing current_level
460
- linked_result = self.fetch_content_with_depth(linked_url, max_steps - 1, current_level + 1)
461
- if linked_result:
462
- linked_extractions.append(linked_result)
463
 
464
  return {
465
  'url': url,
466
- 'level': current_level,
467
- 'fetch_result': fetch_content, # Keep content even if fetch_success is False, or set to None based on desired behavior
468
  'linked_extractions': linked_extractions,
469
- 'processing_notes': processing_notes if processing_notes else 'Simulated fetch successful.'
470
  }
471
 
472
- # Define a helper function to recursively print extraction details
473
- def print_extraction_details(extraction, max_level, current_level=0):
474
- """Recursively prints details of the extraction and its linked extractions."""
475
- if not extraction:
476
- return
477
-
478
- indent = " " * current_level
479
- url = extraction.get('url', 'N/A')
480
- level = extraction.get('level', 'N/A')
481
- fetch_success = extraction.get('fetch_result') is not None and 'error' not in extraction.get('processing_notes', '').lower()
482
- num_linked = len(extraction.get('linked_extractions', []))
483
- notes = extraction.get('processing_notes', '')
484
-
485
- print(f"{indent}URL: {url} (Level {level}). Success: {fetch_success}")
486
- print(f"{indent}Number of linked extractions found: {num_linked}")
487
- if notes:
488
- print(f"{indent}Notes: {notes}")
489
-
490
- if current_level < max_level and extraction.get('linked_extractions'):
491
- # print(f"{indent}Processing linked extractions (Level {current_level + 1}):") # Optional header
492
- for i, linked_extraction in enumerate(extraction['linked_extractions']):
493
- # print(f"{indent} Linked Extraction {i+1}:") # Optional item separator
494
- print_extraction_details(linked_extraction, max_level, current_level + 1)
495
-
496
-
497
- if __name__ == "__main__":
498
- # Instantiate the processor
499
- processor = EnhancedURLProcessor()
500
-
501
- # Using quotes.toscrape.com as it has multiple links (in a real scenario)
502
- # For this simulation, the dummy processor creates nested links regardless of the actual URL content.
503
- test_url_with_links = "https://quotes.toscrape.com/"
504
-
505
- # --- Test Cases (Extended up to max_steps = 10) ---
506
-
507
- # Test with 0 steps (only initial URL)
508
- print("\n--- Testing with max_steps = 0 ---")
509
- result_0 = processor.fetch_content_with_depth(test_url_with_links, max_steps=0)
510
- # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
511
- print_extraction_details(result_0, 0)
512
-
513
- # Test with 1 step (initial URL + its direct links)
514
- print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
515
- result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
516
- # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
517
- print_extraction_details(result_1, 1)
518
-
519
- # Test with 2 steps
520
- print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
521
- result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
522
- # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure
523
- print_extraction_details(result_2, 2)
524
-
525
- # Test with max_steps = 3
526
- print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
527
- result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
528
- # print(json.dumps(result_3, indent=2)) # Uncomment to see full structure
529
- print_extraction_details(result_3, 3)
530
-
531
- # Test with max_steps = 4
532
- print(f"\n--- Testing with max_steps = 4 for {test_url_with_links} ---")
533
- result_4 = processor.fetch_content_with_depth(test_url_with_links, max_steps=4)
534
- # print(json.dumps(result_4, indent=2)) # Uncomment to see full structure
535
- print_extraction_details(result_4, 4)
536
-
537
- # Test with max_steps = 5
538
- print(f"\n--- Testing with max_steps = 5 for {test_url_with_links} ---")
539
- result_5 = processor.fetch_content_with_depth(test_url_with_links, max_steps=5)
540
- # print(json.dumps(result_5, indent=2)) # Uncomment to see full structure
541
- print_extraction_details(result_5, 5)
542
-
543
- # Test with max_steps = 6
544
- print(f"\n--- Testing with max_steps = 6 for {test_url_with_links} ---")
545
- result_6 = processor.fetch_content_with_depth(test_url_with_links, max_steps=6)
546
- # print(json.dumps(result_6, indent=2)) # Uncomment to see full structure
547
- print_extraction_details(result_6, 6)
548
-
549
- # Test with max_steps = 7
550
- print(f"\n--- Testing with max_steps = 7 for {test_url_with_links} ---")
551
- result_7 = processor.fetch_content_with_depth(test_url_with_links, max_steps=7)
552
- # print(json.dumps(result_7, indent=2)) # Uncomment to see full structure
553
- print_extraction_details(result_7, 7)
554
-
555
- # Test with max_steps = 8
556
- print(f"\n--- Testing with max_steps = 8 for {test_url_with_links} ---")
557
- result_8 = processor.fetch_content_with_depth(test_url_with_links, max_steps=8)
558
- # print(json.dumps(result_8, indent=2)) # Uncomment to see full structure
559
- print_extraction_details(result_8, 8)
560
-
561
- # Test with max_steps = 9
562
- print(f"\n--- Testing with max_steps = 9 for {test_url_with_links} ---")
563
- result_9 = processor.fetch_content_with_depth(test_url_with_links, max_steps=9)
564
- # print(json.dumps(result_9, indent=2)) # Uncomment to see full structure
565
- print_extraction_details(result_9, 9)
566
-
567
- # Test with max_steps = 10
568
- print(f"\n--- Testing with max_steps = 10 for {test_url_with_links} ---")
569
- result_10 = processor.fetch_content_with_depth(test_url_with_links, max_steps=10)
570
- # print(json.dumps(result_10, indent=2)) # Uncomment to see full structure
571
- print_extraction_details(result_10, 10)
572
-
573
-
574
- # Test with invalid max_steps (e.g., negative)
575
- print("\n--- Testing with invalid max_steps = -1 ---")
576
- result_invalid_steps = processor.fetch_content_with_depth(test_url_with_links, max_steps=-1)
577
- # print(json.dumps(result_invalid_steps, indent=2)) # Uncomment to see full structure
578
- print(f"Result for invalid steps: {result_invalid_steps.get('processing_notes')}")
579
-
580
-
581
- # Test with invalid initial URL format
582
- print("\n--- Testing with invalid initial URL format ---")
583
- result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
584
- # print(json.dumps(result_invalid_url, indent=2)) # Uncomment to see full structure
585
- print(f"Result for invalid initial URL: {result_invalid_url.get('processing_notes')}")
586
-
587
- # Test with a URL that might fail to fetch (simulated)
588
- print("\n--- Testing with a potentially failing URL (simulated) ---")
589
- # Use a non-existent subdomain or a port that's unlikely to be open
590
- failing_url = "http://this-domain-does-not-exist-12345.com/"
591
- result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
592
- # print(json.dumps(result_fail, indent=2)) # Uncomment to see full structure
593
- print(f"Result for failing URL: {result_fail.get('processing_notes')}")
594
- # Check if fetch_result is None or indicates failure
595
- if result_fail.get('fetch_result') is None:
596
- print("Fetch result is None as expected for failing URL.")
597
- # if result_fail.get('fetch_result') and 'error' in result_fail['fetch_result'].get('processing_notes', '').lower():
598
- # print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
599
-
600
-
601
- print("\n--- End of Test Cases ---")
602
 
603
  class EnhancedFileProcessor:
604
  """Advanced file processing with enhanced content extraction"""
@@ -622,7 +480,21 @@ class EnhancedFileProcessor:
622
  return []
623
 
624
  dataset = []
625
- file_path = Path(file.name) # Use Path object for easier handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
 
627
  try:
628
  file_size = file_path.stat().st_size
@@ -636,18 +508,19 @@ class EnhancedFileProcessor:
636
  'processing_notes': 'File size exceeds limit.'
637
  }]
638
 
 
639
  with tempfile.TemporaryDirectory() as temp_dir:
640
  temp_dir_path = Path(temp_dir)
641
 
642
- # Decide processing strategy
643
  if file_path.suffix.lower() in self.archive_extensions:
644
  dataset.extend(self._process_archive(file_path, temp_dir_path))
645
  elif file_path.suffix.lower() in self.supported_extensions:
646
  # Pass the path to the single file processor
647
  dataset.extend(self._process_single_file(file_path))
648
  else:
649
- logger.warning(f"Unsupported file type for processing: '{file_path.name}'")
650
- # Optionally process as raw text even if extension is unsupported
651
  try:
652
  # Read as text with error replacement
653
  content_bytes = file_path.read_bytes()
@@ -660,7 +533,7 @@ class EnhancedFileProcessor:
660
  'file_size': file_size,
661
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
662
  'extracted_data': {'plain_text': raw_content}, # Store raw text under a key
663
- 'processing_notes': 'Processed as plain text (unsupported extension).'
664
  })
665
  except Exception as e:
666
  logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
@@ -670,7 +543,7 @@ class EnhancedFileProcessor:
670
  'file_size': file_size,
671
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
672
  'extracted_data': None,
673
- 'processing_notes': f'Unsupported file type and failed to read as text: {e}'
674
  })
675
 
676
 
@@ -681,7 +554,7 @@ class EnhancedFileProcessor:
681
  'filename': file_path.name,
682
  'file_size': file_size if 'file_size' in locals() else None,
683
  'extracted_data': None,
684
- 'processing_notes': f'Overall file processing error: {str(e)}'
685
  })
686
  return dataset
687
 
@@ -703,7 +576,7 @@ class EnhancedFileProcessor:
703
 
704
  raw_content: Optional[str] = None
705
  extracted_data: Any = None
706
- processing_notes = []
707
 
708
  try:
709
  # Read content efficiently
@@ -788,13 +661,13 @@ class EnhancedFileProcessor:
788
 
789
  if rows:
790
  # Limit the number of rows included for potentially huge CSVs
791
- max_rows_preview = 100
792
  extracted_data = {
793
- 'headers': rows[0] if rows[0] else None, # Assume first row is header
794
- 'rows': rows[1:max_rows_preview+1] # Get up to max_rows_preview data rows
795
  }
796
  if len(rows) > max_rows_preview + 1:
797
- processing_notes.append(f"CSV truncated to {max_rows_preview} data rows.")
798
  processing_notes.append("Parsed as CSV.")
799
  if not is_explicit_csv:
800
  processing_notes.append("Note: Content looked like CSV despite extension/mime.")
@@ -825,7 +698,7 @@ class EnhancedFileProcessor:
825
  extracted_text = text_content
826
  processing_notes.append("Extracted text from PDF.")
827
  finally:
828
- temp_path.unlink() # Clean up temp file
829
  elif file_extension == '.docx' and DOCX_SUPPORT:
830
  with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
831
  tmp_file.write(content_bytes) # Write bytes to temp file
@@ -836,10 +709,11 @@ class EnhancedFileProcessor:
836
  extracted_text = text_content
837
  processing_notes.append("Extracted text from DOCX.")
838
  finally:
839
- temp_path.unlink() # Clean up temp file
840
  elif file_extension == '.rtf' and RTF_SUPPORT:
841
  # pyth can read directly from file-like object or string
842
  try:
 
843
  doc = Rtf15Reader.read(io.StringIO(raw_content))
844
  text_content = PlaintextWriter.write(doc).getvalue()
845
  extracted_text = text_content
@@ -858,7 +732,7 @@ class EnhancedFileProcessor:
858
  extracted_text = text_content
859
  processing_notes.append("Extracted text from ODT.")
860
  finally:
861
- temp_path.unlink() # Clean up temp file
862
  elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
863
  # These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
864
  processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
@@ -925,10 +799,16 @@ class EnhancedFileProcessor:
925
  if zipfile.is_zipfile(archive_path):
926
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
927
  for file_info in zip_ref.infolist():
 
928
  if file_info.file_size > 0 and not file_info.filename.endswith('/'):
929
  try:
930
- zip_ref.extract(file_info, path=extract_to)
931
- extracted_file_path = extract_to / file_info.filename
 
 
932
  # Recursively process the extracted file if it's supported and not an archive itself
933
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
934
  dataset.extend(self._process_single_file(extracted_file_path))
@@ -940,6 +820,14 @@ class EnhancedFileProcessor:
940
  logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
941
  except Exception as e:
942
  logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
943
  else:
944
  logger.error(f"'{archive_path.name}' is not a valid zip file.")
945
 
@@ -954,9 +842,23 @@ class EnhancedFileProcessor:
954
  with tarfile.open(archive_path, mode) as tar_ref:
955
  for member in tar_ref.getmembers():
956
  if member.isfile():
957
  try:
958
- tar_ref.extract(member, path=extract_to)
959
- extracted_file_path = extract_to / member.name
960
  # Recursively process extracted file
961
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
962
  dataset.extend(self._process_single_file(extracted_file_path))
@@ -967,6 +869,14 @@ class EnhancedFileProcessor:
967
  logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
968
  except Exception as e:
969
  logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
970
  except tarfile.TarError as e:
971
  logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
972
 
@@ -991,7 +901,12 @@ class EnhancedFileProcessor:
991
  except Exception as e:
992
  logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
993
  finally:
994
- if extracted_path.exists(): extracted_path.unlink() # Clean up extracted file
995
 
996
  # TODO: Add support for other archive types (.bz2, .7z, .rar)
997
  elif archive_extension in ('.bz2', '.7z', '.rar'):
@@ -1020,12 +935,14 @@ class EnhancedFileProcessor:
1020
  "idx": 0, # chunk_index
1021
  "tc": 1, # total_chunks
1022
  "tl": total_length, # total_length
1023
- "hash": "", # chunk_hash
1024
  "data": "" # chunk_data
1025
  }
1026
  # Estimate overhead more accurately by dumping a sample metadata structure
1027
  # and adding some safety margin. Shortened keys reduce overhead.
1028
- overhead_estimate = len(json.dumps(metadata_template, separators=(',', ':'))) + 50 # Extra padding
 
 
1029
 
1030
  # Calculate effective chunk size
1031
  effective_chunk_size = max_size - overhead_estimate
@@ -1338,18 +1255,18 @@ def create_modern_interface():
1338
  }
1339
  </script>
1340
  """
1341
-
1342
  with gr.Row():
 
1343
  crawl_depth_slider = gr.Slider(
1344
  label="Crawl Depth",
1345
  minimum=0,
1346
- maximum=3,
1347
  value=0,
1348
  step=1,
1349
  interactive=True,
1350
- info="Select the maximum depth for crawling links (0-3)."
1351
  )
1352
-
1353
  qr_code_paths = gr.State([])
1354
  gr.Markdown("""
1355
  # 🌐 Advanced Data Processing & QR Code Generator
@@ -1445,13 +1362,14 @@ def create_modern_interface():
1445
  num_qr_codes = len(paths)
1446
  cols = math.ceil(math.sqrt(num_qr_codes)) # Calculate columns for a roughly square grid
1447
  cols = max(1, min(cols, 6)) # Limit max columns for small screens
1448
- rows = math.ceil(num_qr_codes / cols)
1449
 
1450
  viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'.format(cols)
1451
 
1452
- # Initialize enabledStates if it's empty (first load)
1453
- if not enabled_states and paths:
1454
- enabled_states = list(range(num_qr_codes)) # Enable all by default on first view
 
1455
 
1456
  for i, path in enumerate(paths):
1457
  is_enabled = i in enabled_states
@@ -1465,7 +1383,7 @@ def create_modern_interface():
1465
 
1466
  return viewport_html
1467
 
1468
- def process_inputs(urls, files, text, combine, *args):
1469
  """Process all inputs and generate QR codes"""
1470
  results = []
1471
  processing_status_messages = []
@@ -1497,19 +1415,30 @@ def create_modern_interface():
1497
  url_list = re.split(r'[,\n]', urls)
1498
  url_list = [url.strip() for url in url_list if url.strip()]
1499
  for url in url_list:
1500
- validation = url_processor.validate_url(url)
1501
- if validation['is_valid']:
1502
- processing_status_messages.append(f"🌐 Fetching URL: {url}...")
1503
- content_result = url_processor.fetch_content(url)
1504
- if content_result:
1505
- results.append(content_result)
1506
- processing_status_messages.append(f"✅ Fetched and processed URL: {url}")
1507
- else:
1508
- processing_status_messages.append(f"❌ Failed to fetch/process URL: {url}")
1509
- if validation['details'].get('final_url'):
1510
- processing_status_messages[-1] += f" (Redirected to {validation['details']['final_url']})"
1511
  else:
1512
- processing_status_messages.append(f"⚠️ Skipping invalid URL: {url} ({validation['message']})")
1513
 
1514
  # Process files
1515
  if files:
@@ -1519,9 +1448,14 @@ def create_modern_interface():
1519
  if file_results:
1520
  results.extend(file_results)
1521
  processing_status_messages.append(f"✅ Processed file: {file.name}")
1522
  else:
1523
  processing_status_messages.append(f"❌ Failed to process file: {file.name}")
1524
 
 
1525
  # Generate QR codes
1526
  qr_paths = []
1527
  final_json_output = None
@@ -1557,7 +1491,7 @@ def create_modern_interface():
1557
  num_qrs = 0
1558
  else:
1559
  num_qrs = len(qr_paths_list)
1560
-
1561
  initial_enabled_states = list(range(num_qrs))
1562
  return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state
1563
 
@@ -1567,7 +1501,7 @@ def create_modern_interface():
1567
 
1568
  process_btn.click(
1569
  process_inputs,
1570
- inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider],
1571
  outputs=[output_json, output_gallery, output_text]
1572
  ).then( # Chain a .then() to update the QR paths state and trigger viewport update
1573
  on_qr_generation,
@@ -1581,7 +1515,7 @@ def create_modern_interface():
1581
  # Add helpful documentation
1582
  gr.Markdown("""
1583
  ### πŸš€ Features
1584
- - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type.
1585
  - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
1586
  - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
1587
  - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives.
@@ -1591,19 +1525,13 @@ def create_modern_interface():
1591
  - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
1592
  - **Modern Design**: Clean, responsive interface with visual feedback.
1593
  ### 💡 Tips
1594
- 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type.
1595
  2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
1596
  3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
1597
  4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
1598
  5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
1599
  6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
1600
  7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.
1601
- ### 🎨 Output Details
1602
- - The "Processed Data" JSON will be a list of dictionaries. Each dictionary represents one processed input (URL or file).
1603
- - Each item will have keys like `source`, `filename` (for files), `url` (for URLs), `mime_type`, `raw_content` (if readable), `extracted_data`, and `processing_notes`.
1604
- - `extracted_data` will contain the parsed/extracted content, structured according to the input type (e.g., dictionary for JSON, text for documents, list of rows for CSV, dictionary with title/text/links for HTML).
1605
- - `processing_notes` will list any issues encountered during extraction.
1606
- - Generated QR codes are saved in the `output/qr_codes` directory.
1607
  ### ⚙️ QR Code Viewport Instructions
1608
  1. Navigate to the **QR Code Viewport** tab after generating QR codes.
1609
  2. The generated QR codes will be displayed in a grid based on their total count.
@@ -1635,4 +1563,4 @@ def main():
1635
  raise # Re-raise the exception to ensure the process exits if launch fails
1636
 
1637
  if __name__ == "__main__":
1638
- main()
 
55
 
56
  try:
57
  from pyth.plugins.plaintext.writer import PlaintextWriter
58
+ from pyth.plugins.rtf15.reader import Rtf15Reader # Import Rtf15Reader
59
  RTF_SUPPORT = True
60
  except ImportError:
61
  RTF_SUPPORT = False
 
87
  for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
88
  directory.mkdir(parents=True, exist_ok=True)
89
 
90
+ # Dummy EnhancedURLProcessor class for demonstration purposes if the actual class isn't provided.
91
+ # This dummy simulates fetching and creating a nested structure based on max_steps.
92
  class EnhancedURLProcessor:
93
+ """Simulates advanced URL processing with enhanced content extraction and recursive link following."""
94
 
95
  def __init__(self):
96
+ # Dummy session and user agent for simulation
97
+ self.session = type('obj', (object,), {'get': self._dummy_get_request})()
98
+ self.user_agent = type('obj', (object,), {'random': 'SimulatedAgent/1.0'})()
99
+ self.timeout = 15
100
  self.max_retries = 3
 
101
 
102
+ def _dummy_get_request(self, url, timeout):
103
+ """Simulates a GET request response."""
104
+ class MockResponse:
105
+ def __init__(self, url, status_code, content_type, content, encoding='utf-8'):
106
+ self.url = url
107
+ self.status_code = status_code
108
+ self.headers = {'Content-Type': content_type}
109
+ self._content = content.encode(encoding)
110
+ self.encoding = encoding
111
+
112
+ def raise_for_status(self):
113
+ if 400 <= self.status_code < 600:
114
+ raise requests.exceptions.RequestException(f"Simulated HTTP error {self.status_code}")
115
+
116
+ @property
117
+ def content(self):
118
+ return self._content
119
+
120
+ # Simulate different responses based on URL
121
+ if "this-domain-does-not-exist" in url:
122
+ raise requests.exceptions.RequestException("Simulated network error: Could not resolve host.")
123
+ elif "httpbin.org/html" in url:
124
+ # Simulate a simple HTML response
125
+ html_content = """
126
+ <!DOCTYPE html>
127
+ <html>
128
+ <head><title>Simulated HTML</title></head>
129
+ <body>
130
+ <h1>Hello, World!</h1>
131
+ <p>This is simulated HTML content.</p>
132
+ <a href="/link1">Link 1</a>
133
+ <a href="/link2">Link 2</a>
134
+ </body>
135
+ </html>
136
+ """
137
+ return MockResponse(url, 200, 'text/html', html_content)
138
+ elif "quotes.toscrape.com" in url:
139
+ # Simulate a more complex HTML with more links for deeper testing
140
+ html_content = f"""
141
+ <!DOCTYPE html>
142
+ <html>
143
+ <head><title>Simulated Quotes Page</title></head>
144
+ <body>
145
+ <h1>Quotes</h1>
146
+ <p>Some simulated quotes.</p>
147
+ <a href="{url}/page/1/">Page 1</a>
148
+ <a href="{url}/page/2/">Page 2</a>
149
+ <a href="/tag/love/">Love Quotes</a>
150
+ </body>
151
+ </html>
152
+ """
153
+ return MockResponse(url, 200, 'text/html', html_content)
154
+ elif "/child" in url:
155
+ # Simulate nested HTML pages
156
+ html_content = f"""
157
+ <!DOCTYPE html>
158
+ <html>
159
+ <head><title>Simulated Child Page</title></head>
160
+ <body>
161
+ <h1>Child Page</h1>
162
+ <p>Content for {url}.</p>
163
+ <a href="{url}/grandchild1">Grandchild 1</a>
164
+ </body>
165
+ </html>
166
+ """
167
+ return MockResponse(url, 200, 'text/html', html_content)
168
+ else:
169
+ # Default simulated plain text response
170
+ return MockResponse(url, 200, 'text/plain', f"Simulated content for {url}")
171
 
172
 
173
+ def validate_url(self, url: str) -> Dict[str, Any]:
174
+ """Enhanced URL validation with detailed feedback (Simulated)"""
175
+ # In a real implementation, this would perform actual network checks (HEAD/GET)
176
+ # For simulation, just check format
177
+ if not validators.url(url):
178
+ return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
179
+ parsed = urlparse(url)
180
+ if not all([parsed.scheme, parsed.netloc]):
181
+ return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
182
+
183
+ # Simulate accessibility check
184
+ if "this-domain-does-not-exist" in url:
185
+ return {'is_valid': False, 'message': 'Simulated: URL not accessible', 'details': 'Simulated network error'}
186
+
187
+ return {
188
+ 'is_valid': True,
189
+ 'message': 'Simulated: URL is valid and accessible',
190
+ 'details': {
191
+ 'final_url': url, # In simulation, final_url is same as original unless specifically handled
192
+ 'content_type': 'text/html', # Simulate HTML for most tests
193
+ 'server': 'SimulatedServer',
194
+ 'size': 'SimulatedSize'
195
  }
196
+ }
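In isolation, the format-only part of this simulated validator looks like the sketch below (the helper name `looks_like_url` is illustrative; `validators` and `urllib.parse` are the same libraries used elsewhere in the file):

```python
import validators
from urllib.parse import urlparse

def looks_like_url(url: str) -> bool:
    """Syntactic check only: passes validators.url and has both a scheme and a netloc."""
    if not validators.url(url):
        return False
    parsed = urlparse(url)
    return all([parsed.scheme, parsed.netloc])

print(looks_like_url("https://quotes.toscrape.com/"))  # True
print(looks_like_url("invalid-url"))                   # False
```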
 
197
 
198
  def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
199
+ """Enhanced content fetcher with retry mechanism and complete character extraction (Simulated)"""
200
  try:
201
+ logger.info(f"Simulating fetch content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
202
+ # Simulate the request using the dummy get
 
203
  response = self.session.get(url, timeout=self.timeout)
204
  response.raise_for_status()
205
+ final_url = response.url # Capture potential redirects (simulated)
206
  content_type = response.headers.get('Content-Type', '')
207
 
208
+ # Simulate encoding detection (assuming utf-8 for simplicity in simulation)
209
+ encoding = 'utf-8'
210
+ raw_content = response.content.decode(encoding, errors='replace')
 
211
 
212
+ # Extract metadata (simulated)
213
  metadata = {
214
  'original_url': url,
215
  'final_url': final_url,
 
221
  'status_code': response.status_code
222
  }
223
 
224
+ # Process based on content type (using the actual _process_web_content)
225
  processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
226
 
227
  return {
 
234
  }
235
  except requests.exceptions.RequestException as e:
236
  if retry_count < self.max_retries - 1:
237
+ logger.warning(f"Simulated Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
238
+ time.sleep(0.1) # Shorter backoff for simulation
239
  return self.fetch_content(url, retry_count + 1)
240
+ logger.error(f"Simulated: Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
241
  return {
242
  'source': 'url',
243
  'url': url,
244
  'raw_content': None,
245
  'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, # Include basic metadata on failure
246
  'extracted_data': None,
247
+ 'processing_notes': [f"Simulated: Failed to fetch content after {self.max_retries} attempts: {str(e)}"] # Ensure notes is a list
248
  }
249
  except Exception as e:
250
+ logger.error(f"Simulated: Unexpected error while fetching or processing URL {url}: {e}")
251
  return {
252
  'source': 'url',
253
  'url': url,
254
  'raw_content': raw_content if 'raw_content' in locals() else None,
255
  'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
256
  'extracted_data': None,
257
+ 'processing_notes': [f"Simulated: Unexpected processing error: {str(e)}"]
258
  }
259
 
260
+
261
  def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
262
  """Process content based on detected content type"""
263
  lower_content_type = content_type.lower()
 
376
  return extracted
377
 
378
  def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
379
+ """Fetches content from a URL and recursively follows links up to max_steps depth."""
380
+ # Validate max_steps first
381
+ if not isinstance(max_steps, int) or not (0 <= max_steps <= 10): # Changed max depth to 10
382
+ logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 10.")
383
+ return {
384
+ 'url': url,
385
+ 'level': 0,
386
+ 'fetch_result': None,
387
+ 'linked_extractions': [],
388
+ 'processing_notes': [f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 10."]
389
+ }
390
 
391
+ # Validate the initial URL
392
+ validation_result = self.validate_url(url)
393
+ if not validation_result['is_valid']:
394
+ logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
395
+ return {
396
  'url': url,
397
+ 'level': 0,
398
  'fetch_result': None,
399
  'linked_extractions': [],
400
+ 'processing_notes': [f"Initial URL validation failed: {validation_result['message']}"]
401
  }
402
 
403
+ # Start the recursive fetching process
404
+ return self._fetch_content_recursive(url, max_steps, current_step=0)
405
+
406
+ def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
407
+ """Recursive helper function to fetch content and follow links."""
408
+ # Base case: Stop if current depth exceeds max_steps
409
+ if current_step > max_steps:
410
+ logger.debug(f"Depth limit ({max_steps}) reached for {url} at level {current_step}.")
411
  return {
412
  'url': url,
413
+ 'level': current_step,
414
  'fetch_result': None,
415
  'linked_extractions': [],
416
+ 'processing_notes': [f"Depth limit ({max_steps}) reached."]
417
  }
418
 
419
+ logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
420
+ fetch_result = self.fetch_content(url)
421
+ linked_extractions: List[Dict[str, Any]] = []
422
+
423
+ # Only attempt to extract and follow links if fetch was successful and content is HTML
424
+ if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
425
+ extracted_data = fetch_result['extracted_data']
426
+ links = extracted_data.get('links', [])
427
+
428
+ logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
429
+ # Recursively process linked URLs if more steps are allowed
430
+ if current_step < max_steps:
431
+ for link_info in links:
432
+ linked_url = link_info.get('url')
433
+ if linked_url:
434
+ # Add a check to prevent processing the same URL repeatedly in a single crawl path
435
+ # (More sophisticated de-duplication across the *entire* crawl would require a visited set passed down)
436
+ # For simplicity here, we just prevent immediate cycles.
437
+ if linked_url != url:
438
+ linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
439
+ # Only append results if the recursive call returned something valid
440
+ if linked_result:
441
+ linked_extractions.append(linked_result)
442
+ else:
443
+ logger.debug(f"Skipping self-referencing link: {linked_url}")
444
+
445
+
446
+ # Add processing notes from the fetch_result to the current level's notes
447
+ current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed.']
448
+ if f"Processed at level {current_step}" not in current_notes:
449
+ current_notes.append(f"Processed at level {current_step}")
450
 
451
 
452
  return {
453
  'url': url,
454
+ 'level': current_step,
455
+ 'fetch_result': fetch_result, # Include the full fetch result for details
456
  'linked_extractions': linked_extractions,
457
+ 'processing_notes': current_notes
458
  }
459
 
460
 
461
  class EnhancedFileProcessor:
462
  """Advanced file processing with enhanced content extraction"""
 
480
  return []
481
 
482
  dataset = []
483
+ # Use Path object for easier handling. Note: Gradio file object might not be a standard file path,
484
+ # but rather an object with a 'name' attribute pointing to a temp file path.
485
+ file_path = Path(file.name)
486
+
487
+ # Ensure file exists before trying to get size/stats
488
+ if not file_path.exists():
489
+ logger.error(f"File path does not exist: {file_path}")
490
+ return [{
491
+ 'source': 'file',
492
+ 'filename': file.name if hasattr(file, 'name') else 'unknown',
493
+ 'file_size': None,
494
+ 'extracted_data': None,
495
+ 'processing_notes': 'File path does not exist.'
496
+ }]
497
+
498
 
499
  try:
500
  file_size = file_path.stat().st_size
 
508
  'processing_notes': 'File size exceeds limit.'
509
  }]
510
 
511
+ # Use a temporary directory for extracting archives
512
  with tempfile.TemporaryDirectory() as temp_dir:
513
  temp_dir_path = Path(temp_dir)
514
 
515
+ # Decide processing strategy based on extension
516
  if file_path.suffix.lower() in self.archive_extensions:
517
  dataset.extend(self._process_archive(file_path, temp_dir_path))
518
  elif file_path.suffix.lower() in self.supported_extensions:
519
  # Pass the path to the single file processor
520
  dataset.extend(self._process_single_file(file_path))
521
  else:
522
+ logger.warning(f"Unsupported file type for processing: '{file_path.name}'. Attempting to read as plain text.")
523
+ # Attempt to process as raw text even if extension is unsupported
524
  try:
525
  # Read as text with error replacement
526
  content_bytes = file_path.read_bytes()
 
533
  'file_size': file_size,
534
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
535
  'extracted_data': {'plain_text': raw_content}, # Store raw text under a key
536
+ 'processing_notes': ['Processed as plain text (unsupported extension).'] # Ensure notes is a list
537
  })
538
  except Exception as e:
539
  logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
 
543
  'file_size': file_size,
544
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
545
  'extracted_data': None,
546
+ 'processing_notes': [f'Unsupported file type and failed to read as text: {e}'] # Ensure notes is a list
547
  })
548
 
549
 
 
554
  'filename': file_path.name,
555
  'file_size': file_size if 'file_size' in locals() else None,
556
  'extracted_data': None,
557
+ 'processing_notes': [f'Overall file processing error: {str(e)}'] # Ensure notes is a list
558
  })
559
  return dataset
560
 
 
576
 
577
  raw_content: Optional[str] = None
578
  extracted_data: Any = None
579
+ processing_notes: List[str] = [] # Initialize notes as a list
580
 
581
  try:
582
  # Read content efficiently
 
661
 
662
  if rows:
663
  # Limit the number of rows included for potentially huge CSVs
664
+ max_rows_preview = 100 # Limit text preview
665
  extracted_data = {
666
+ 'headers': rows[0] if rows and rows[0] else None, # Assume first row is header if exists
667
+ 'rows': rows[1:max_rows_preview+1] if len(rows) > 1 else [] # Get up to max_rows_preview data rows, if any
668
  }
669
  if len(rows) > max_rows_preview + 1:
670
+ processing_notes.append(f"CSV data rows truncated to {max_rows_preview}.")
671
  processing_notes.append("Parsed as CSV.")
672
  if not is_explicit_csv:
673
  processing_notes.append("Note: Content looked like CSV despite extension/mime.")
 
698
  extracted_text = text_content
699
  processing_notes.append("Extracted text from PDF.")
700
  finally:
701
+ if temp_path.exists(): temp_path.unlink() # Clean up temp file
702
  elif file_extension == '.docx' and DOCX_SUPPORT:
703
  with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
704
  tmp_file.write(content_bytes) # Write bytes to temp file
 
709
  extracted_text = text_content
710
  processing_notes.append("Extracted text from DOCX.")
711
  finally:
712
+ if temp_path.exists(): temp_path.unlink() # Clean up temp file
713
  elif file_extension == '.rtf' and RTF_SUPPORT:
714
  # pyth can read directly from file-like object or string
715
  try:
716
+ # Rtf15Reader expects a file-like object or string
717
  doc = Rtf15Reader.read(io.StringIO(raw_content))
718
  text_content = PlaintextWriter.write(doc).getvalue()
719
  extracted_text = text_content
 
732
  extracted_text = text_content
733
  processing_notes.append("Extracted text from ODT.")
734
  finally:
735
+ if temp_path.exists(): temp_path.unlink() # Clean up temp file
736
  elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
737
  # These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
738
  processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
 
799
  if zipfile.is_zipfile(archive_path):
800
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
801
  for file_info in zip_ref.infolist():
802
+ # Skip directories and empty files
803
  if file_info.file_size > 0 and not file_info.filename.endswith('/'):
804
+ # Sanitize filename to prevent directory traversal issues
805
+ sanitized_filename = Path(file_info.filename).name # Takes only the base name
806
+ extracted_file_path = extract_to / sanitized_filename
807
  try:
808
+ # Extract file to the temporary directory
809
+ with zip_ref.open(file_info) as zf, open(extracted_file_path, 'wb') as outfile:
810
+ outfile.write(zf.read())
811
+
812
  # Recursively process the extracted file if it's supported and not an archive itself
813
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
814
  dataset.extend(self._process_single_file(extracted_file_path))
 
820
  logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
821
  except Exception as e:
822
  logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
823
+ finally:
824
+ # Clean up the extracted file immediately after processing
825
+ if extracted_file_path.exists():
826
+ try:
827
+ extracted_file_path.unlink()
828
+ except OSError as e:
829
+ logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
830
+
831
  else:
832
  logger.error(f"'{archive_path.name}' is not a valid zip file.")
833
 
 
842
  with tarfile.open(archive_path, mode) as tar_ref:
843
  for member in tar_ref.getmembers():
844
  if member.isfile():
845
+ # Sanitize member name
846
+ sanitized_filename = Path(member.name).name
847
+ extracted_file_path = extract_to / sanitized_filename
848
  try:
849
+ # Extract member to the temporary directory
850
+ # Ensure the target path is within the extraction directory
851
+ if not str(extracted_file_path).startswith(str(extract_to)):
852
+ logger.warning(f"Skipping potentially malicious path in tar: {member.name}")
853
+ continue # Skip if path is outside the temp dir
854
+
855
+ with tar_ref.extractfile(member) as tf, open(extracted_file_path, 'wb') as outfile:
856
+ if tf: # extractfile can return None for special file types
857
+ outfile.write(tf.read())
858
+ else:
859
+ logger.warning(f"Could not extract file-like object for {member.name} from tar.")
860
+ continue # Skip this member
861
+
862
  # Recursively process extracted file
863
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
864
  dataset.extend(self._process_single_file(extracted_file_path))
 
869
  logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
870
  except Exception as e:
871
  logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
872
+ finally:
873
+ # Clean up the extracted file immediately after processing
874
+ if extracted_file_path.exists():
875
+ try:
876
+ extracted_file_path.unlink()
877
+ except OSError as e:
878
+ logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
879
+
880
  except tarfile.TarError as e:
881
  logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
882
 
 
901
  except Exception as e:
902
  logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
903
  finally:
904
+ if extracted_path.exists():
905
+ try:
906
+ extracted_path.unlink() # Clean up extracted file
907
+ except OSError as e:
908
+ logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
909
+
910
 
911
  # TODO: Add support for other archive types (.bz2, .7z, .rar)
912
  elif archive_extension in ('.bz2', '.7z', '.rar'):
 
935
  "idx": 0, # chunk_index
936
  "tc": 1, # total_chunks
937
  "tl": total_length, # total_length
938
+ "hash": 0, # chunk_hash (using int for hash)
939
  "data": "" # chunk_data
940
  }
941
  # Estimate overhead more accurately by dumping a sample metadata structure
942
  # and adding some safety margin. Shortened keys reduce overhead.
943
+ # Use a dummy hash value (e.g., 1234567890) to get a realistic length estimate
944
+ metadata_template_with_hash = {**metadata_template, "hash": 1234567890}
945
+ overhead_estimate = len(json.dumps(metadata_template_with_hash, separators=(',', ':'))) + 50 # Extra padding
946
 
947
  # Calculate effective chunk size
948
  effective_chunk_size = max_size - overhead_estimate
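To make the sizing concrete, the overhead estimate above can be reproduced standalone; the 2953-byte `max_size` is an assumed QR capacity (version 40, byte mode, error level L) used here only for illustration:

```python
import json

# Shortened-key template with a dummy numeric hash, mirroring the hunk above.
metadata_template = {"idx": 0, "tc": 1, "tl": 0, "hash": 1234567890, "data": ""}
overhead_estimate = len(json.dumps(metadata_template, separators=(',', ':'))) + 50

max_size = 2953  # assumed maximum payload per QR code, in bytes
effective_chunk_size = max_size - overhead_estimate
print(overhead_estimate, effective_chunk_size)  # 101 and 2852 with these values
```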
 
1255
  }
1256
  </script>
1257
  """
 
1258
  with gr.Row():
1259
+ # Adjusted crawl depth slider to match the max_steps limit in the code
1260
  crawl_depth_slider = gr.Slider(
1261
  label="Crawl Depth",
1262
  minimum=0,
1263
+ maximum=10, # Changed max depth to 10
1264
  value=0,
1265
  step=1,
1266
  interactive=True,
1267
+ info="Select the maximum depth for crawling links (0-10)." # Updated info
1268
  )
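As a self-contained illustration of the wiring this commit adds, a slider value reaches the click handler as one extra positional argument (component names below are placeholders, not the full app2.py layout):

```python
import gradio as gr

def process_inputs(urls, crawl_depth):
    return f"Would crawl {urls!r} to depth {int(crawl_depth)}"

with gr.Blocks() as demo:
    url_input = gr.Textbox(label="URLs")
    crawl_depth_slider = gr.Slider(minimum=0, maximum=10, step=1, value=0,
                                   label="Crawl Depth")
    output_text = gr.Textbox(label="Processing Status")
    gr.Button("Process").click(process_inputs,
                               inputs=[url_input, crawl_depth_slider],
                               outputs=[output_text])

# demo.launch()
```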
1269
+
1270
  qr_code_paths = gr.State([])
1271
  gr.Markdown("""
1272
  # 🌐 Advanced Data Processing & QR Code Generator
 
1362
  num_qr_codes = len(paths)
1363
  cols = math.ceil(math.sqrt(num_qr_codes)) # Calculate columns for a roughly square grid
1364
  cols = max(1, min(cols, 6)) # Limit max columns for small screens
1365
+ # rows = math.ceil(num_qr_codes / cols) # Not used in HTML generation
1366
 
1367
  viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'.format(cols)
1368
 
1369
+ # Initialize enabledStates if it's empty (first load) or if paths have changed
1370
+ if enabled_states is None or len(enabled_states) != num_qr_codes:
1371
+ enabled_states = list(range(num_qr_codes)) # Enable all by default or if QR count changes
1372
+
1373
 
1374
  for i, path in enumerate(paths):
1375
  is_enabled = i in enabled_states
 
1383
 
1384
  return viewport_html
1385
 
1386
+ def process_inputs(urls, files, text, combine, crawl_depth): # Added crawl_depth parameter
1387
  """Process all inputs and generate QR codes"""
1388
  results = []
1389
  processing_status_messages = []
 
1415
  url_list = re.split(r'[,\n]', urls)
1416
  url_list = [url.strip() for url in url_list if url.strip()]
1417
  for url in url_list:
1418
+ # Use the crawl_depth from the slider
1419
+ processing_status_messages.append(f"🌐 Processing URL: {url} with crawl depth {crawl_depth}...")
1420
+ content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth)
1421
+ if content_result and content_result.get('fetch_result') is not None: # Check if initial fetch was successful
1422
+ results.append(content_result)
1423
+ processing_status_messages.append(f"✅ Processed URL: {url} (Level 0)")
1424
+ # Add notes from the result if any
1425
+ if content_result.get('processing_notes'):
1426
+ processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}")
1427
+
1428
+ # Optionally add status for linked extractions
1429
+ if content_result.get('linked_extractions'):
1430
+ num_linked_processed = len([r for r in content_result['linked_extractions'] if r and r.get('fetch_result') is not None])
1431
+ processing_status_messages.append(f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
1432
+ # Note: Detailed status for deeper levels would require traversing the result structure here.
1433
+
1434
  else:
1435
+ processing_status_messages.append(f"❌ Failed to process URL: {url}")
1436
+ # Add notes from the result even if fetch failed
1437
+ if content_result and content_result.get('processing_notes'):
1438
+ processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}")
1439
+ elif content_result and content_result.get('note'): # Handle the 'note' key from validation/invalid steps
1440
+ processing_status_messages.append(f" Notes: {content_result['note']}")
1441
+
1442
 
1443
  # Process files
1444
  if files:
 
1448
  if file_results:
1449
  results.extend(file_results)
1450
  processing_status_messages.append(f"✅ Processed file: {file.name}")
1451
+ # Add notes from file processing results
1452
+ for res in file_results:
1453
+ if res.get('processing_notes'):
1454
+ processing_status_messages.append(f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
1455
  else:
1456
  processing_status_messages.append(f"❌ Failed to process file: {file.name}")
1457
 
1458
+
1459
  # Generate QR codes
1460
  qr_paths = []
1461
  final_json_output = None
 
1491
  num_qrs = 0
1492
  else:
1493
  num_qrs = len(qr_paths_list)
1494
+
1495
  initial_enabled_states = list(range(num_qrs))
1496
  return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state
1497
 
 
1501
 
1502
  process_btn.click(
1503
  process_inputs,
1504
+ inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider], # Pass crawl_depth_slider value
1505
  outputs=[output_json, output_gallery, output_text]
1506
  ).then( # Chain a .then() to update the QR paths state and trigger viewport update
1507
  on_qr_generation,
 
1515
  # Add helpful documentation
1516
  gr.Markdown("""
1517
  ### πŸš€ Features
1518
+ - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth.
1519
  - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
1520
  - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
1521
  - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives.
 
1525
  - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
1526
  - **Modern Design**: Clean, responsive interface with visual feedback.
1527
  ### πŸ’‘ Tips
1528
+ 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**.
1529
  2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
1530
  3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
1531
  4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
1532
  5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
1533
  6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
1534
  7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.
 
1535
  ### ⚙️ QR Code Viewport Instructions
1536
  1. Navigate to the **QR Code Viewport** tab after generating QR codes.
1537
  2. The generated QR codes will be displayed in a grid based on their total count.
 
1563
  raise # Re-raise the exception to ensure the process exits if launch fails
1564
 
1565
  if __name__ == "__main__":
1566
+ main()