diff --git "a/app2.py" "b/app2.py" --- "a/app2.py" +++ "b/app2.py" @@ -26,6 +26,10 @@ from PIL import Image, ImageDraw, ImageFont # ImageFont may require pillow[extra import numpy as np # Unused in provided code, kept for completeness import tarfile import gzip +import bz2 # Added for bz2 support +import lzma # Added for xz support +# import py7zr # Requires external dependency, not standard library +# import rarfile # Requires external dependency, not standard library import math import random import pandas as pd @@ -101,24 +105,34 @@ class EnhancedURLProcessor: def validate_url(self, url: str) -> Dict[str, Any]: """Enhanced URL validation with accessibility check.""" - if not validators.url(url): + if not isinstance(url, str) or not url.strip(): + return {'is_valid': False, 'message': 'URL input is empty or invalid type', 'details': 'Input must be a non-empty string'} + + cleaned_url = url.strip() + + if not validators.url(cleaned_url): return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'} - parsed = urlparse(url) + parsed = urlparse(cleaned_url) if not all([parsed.scheme, parsed.netloc]): return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'} + # Basic check for scheme + if parsed.scheme.lower() not in ['http', 'https']: + return {'is_valid': False, 'message': 'Unsupported scheme', 'details': 'Only http and https are supported'} + try: # Use a HEAD request to check accessibility without downloading full content headers = {'User-Agent': self.user_agent.random} - response = self.session.head(url, timeout=self.timeout, headers=headers, allow_redirects=True) + response = self.session.head(cleaned_url, timeout=self.timeout, headers=headers, allow_redirects=True) response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) # Check content type if available in HEAD response content_type = response.headers.get('Content-Type', '').split(';')[0].strip() - if not content_type or not (content_type.startswith('text/') or 'json' in content_type or 'xml' in content_type): - # Basic check if content type seems relevant for text extraction - logger.warning(f"URL {url} returned potentially irrelevant content type: {content_type}") + # Basic check if content type seems relevant for text extraction + # Allow text, json, xml, and potentially others that might contain text + if not content_type or not (content_type.startswith('text/') or 'json' in content_type or 'xml' in content_type or 'application/octet-stream' in content_type): + logger.warning(f"URL {cleaned_url} returned potentially irrelevant content type: {content_type}") # Decide if this should invalidate the URL or just add a note # For now, we'll allow fetching but add a note. 
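# A minimal standalone sketch of the validation flow above (non-empty string, http/https
# scheme check, then a HEAD request), assuming `validators` and `requests` are installed.
# `quick_validate_url` is a hypothetical helper name, not something defined in app2.py;
# the real method also inspects Content-Type and reuses a session with a random User-Agent.
import requests
import validators
from urllib.parse import urlparse

def quick_validate_url(url: str, timeout: int = 10) -> dict:
    if not isinstance(url, str) or not url.strip():
        return {'is_valid': False, 'message': 'URL input is empty or invalid type'}
    url = url.strip()
    if not validators.url(url):
        return {'is_valid': False, 'message': 'Invalid URL format'}
    if urlparse(url).scheme.lower() not in ('http', 'https'):
        return {'is_valid': False, 'message': 'Only http and https are supported'}
    try:
        # HEAD checks reachability without downloading the body; some servers reject
        # HEAD, so a production version might fall back to a streamed GET.
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        return {'is_valid': True, 'message': f'Accessible (status {response.status_code})'}
    except requests.exceptions.RequestException as exc:
        return {'is_valid': False, 'message': f'URL not accessible: {exc}'}

# Example: quick_validate_url('https://example.com') -> {'is_valid': True, ...}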
@@ -133,14 +147,21 @@ class EnhancedURLProcessor: } } except requests.exceptions.RequestException as e: - return {'is_valid': False, 'message': 'URL not accessible', 'details': str(e)} + logger.error(f"URL validation failed for {cleaned_url}: {e}") + # Capture status code from response if available + status_code = getattr(e.response, 'status_code', None) + return {'is_valid': False, 'message': 'URL not accessible', 'details': f"{str(e)} (Status: {status_code})"} except Exception as e: - logger.error(f"Unexpected error during URL validation for {url}: {e}") + logger.error(f"Unexpected error during URL validation for {cleaned_url}: {e}") return {'is_valid': False, 'message': 'Unexpected validation error', 'details': str(e)} def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]: """Enhanced content fetcher with retry mechanism and complete character extraction.""" + raw_content: Optional[str] = None + metadata: Dict[str, Any] = {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None} + processing_notes: List[str] = [] + try: logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1})") headers = {'User-Agent': self.user_agent.random} @@ -152,32 +173,54 @@ class EnhancedURLProcessor: # Attempt to detect encoding if not specified in headers encoding = response.encoding # requests attempts to guess encoding - if encoding is None or encoding == 'ISO-8859-1': # Fallback if requests guess is default/uncertain + # Fallback if requests guess is default/uncertain or explicitly wrong + # Use chardet if requests' guess is default or if content is large enough for reliable detection + if encoding is None or encoding == 'ISO-8859-1' or len(response.content) > 1000: # Re-check with chardet when the guess is a default or the body is large enough for reliable detection try: encoding_detection = chardet.detect(response.content) - encoding = encoding_detection['encoding'] or 'utf-8' - logger.debug(f"Chardet detected encoding: {encoding} for {url}") + # Use detected encoding if confidence is high, otherwise default to utf-8 + if encoding_detection and encoding_detection['confidence'] > 0.8: + encoding = encoding_detection['encoding'] + logger.debug(f"Chardet detected encoding with high confidence: {encoding} for {url}") + else: + encoding = 'utf-8' + logger.debug(f"Chardet detection confidence too low or failed for {url}. Falling back to utf-8. Detection result: {encoding_detection}") except Exception as e: logger.warning(f"Chardet detection failed for {url}: {e}. Falling back to utf-8.") encoding = 'utf-8' + else: + logger.debug(f"Requests detected encoding: {encoding} for {url}") + + + # Decode content using the determined encoding + try: + raw_content = response.content.decode(encoding, errors='replace') + if encoding != 'utf-8': + processing_notes.append(f"Decoded using detected encoding: {encoding}") + if '\ufffd' in raw_content: # Check for the Unicode replacement character inserted by errors='replace' + processing_notes.append("Note: Character replacement occurred during decoding.") + except Exception as e: + logger.warning(f"Failed to decode content with encoding {encoding} for {url}: {e}. 
Trying utf-8 with ignore.") + raw_content = response.content.decode('utf-8', errors='ignore') + processing_notes.append(f"Decoding with {encoding} failed, used utf-8 with ignore: {e}") - raw_content = response.content.decode(encoding, errors='replace') - # Extract metadata - metadata = { - 'original_url': url, + # Update metadata with successful fetch details + metadata.update({ 'final_url': final_url, - 'timestamp': datetime.now().isoformat(), 'detected_encoding': encoding, 'content_type': content_type, 'content_length': len(response.content), 'headers': dict(response.headers), 'status_code': response.status_code - } + }) # Process based on content type processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url) + # Merge processing notes from decoding and content processing + processing_notes.extend(processed_extraction['notes']) + return { 'source': 'url', @@ -185,27 +228,32 @@ class EnhancedURLProcessor: 'raw_content': raw_content, 'metadata': metadata, 'extracted_data': processed_extraction['data'], - 'processing_notes': processed_extraction['notes'] + 'processing_notes': processing_notes } except requests.exceptions.RequestException as e: logger.error(f"Failed to fetch content from {url}: {e}") + # Capture status code from response if available + status_code = getattr(e.response, 'status_code', None) + metadata['status_code'] = status_code + processing_notes.append(f"Failed to fetch content: {str(e)}") return { 'source': 'url', 'url': url, - 'raw_content': None, - 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': getattr(e.response, 'status_code', None)}, + 'raw_content': raw_content, # raw_content might be partially decoded + 'metadata': metadata, 'extracted_data': None, - 'processing_notes': [f"Failed to fetch content: {str(e)}"] + 'processing_notes': processing_notes } except Exception as e: logger.error(f"Unexpected error while fetching or processing URL {url}: {e}") + processing_notes.append(f"Unexpected processing error: {str(e)}") return { 'source': 'url', 'url': url, - 'raw_content': raw_content if 'raw_content' in locals() else None, - 'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, + 'raw_content': raw_content, + 'metadata': metadata, 'extracted_data': None, - 'processing_notes': [f"Unexpected processing error: {str(e)}"] + 'processing_notes': processing_notes } @@ -225,12 +273,12 @@ class EnhancedURLProcessor: extracted_data = json.loads(content) notes.append("Parsed as JSON") except json.JSONDecodeError as e: - extracted_data = content - notes.append(f"Failed to parse as JSON: {e}") + extracted_data = content # Store raw content if parsing fails + notes.append(f"Failed to parse as JSON: {e}. Stored raw text.") logger.warning(f"Failed to parse JSON from {base_url}: {e}") except Exception as e: - extracted_data = content - notes.append(f"Error processing JSON: {e}") + extracted_data = content # Store raw content on other errors + notes.append(f"Error processing JSON: {e}. 
Stored raw text.") logger.error(f"Error processing JSON from {base_url}: {e}") elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'): logger.debug(f"Processing XML content from {base_url}") @@ -240,12 +288,12 @@ class EnhancedURLProcessor: extracted_data = xml_text notes.append("Parsed as XML (text representation)") except ET.ParseError as e: - extracted_data = content - notes.append(f"Failed to parse as XML: {e}") + extracted_data = content # Store raw content if parsing fails + notes.append(f"Failed to parse as XML: {e}. Stored raw text.") logger.warning(f"Failed to parse XML from {base_url}: {e}") except Exception as e: - extracted_data = content - notes.append(f"Error processing XML: {e}") + extracted_data = content # Store raw content on other errors + notes.append(f"Error processing XML: {e}. Stored raw text.") logger.error(f"Error processing XML from {base_url}: {e}") elif 'text/plain' in lower_content_type or 'text/' in lower_content_type: logger.debug(f"Processing Plain Text content from {base_url}") @@ -257,7 +305,7 @@ class EnhancedURLProcessor: notes.append(f"Unknown content type '{content_type}'. Stored raw text.") except Exception as e: logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}") - extracted_data = content + extracted_data = content # Ensure raw content is stored on unexpected error notes.append(f"Unexpected processing error: {e}. Stored raw text.") return {'data': extracted_data, 'notes': notes} @@ -285,17 +333,17 @@ class EnhancedURLProcessor: if href and not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')): text = a_tag.get_text().strip() try: + # Use urljoin to create absolute URL absolute_url = urljoin(base_url, href) - if absolute_url not in unique_links: + # Basic check if the joined URL is valid before adding + if validators.url(absolute_url) and absolute_url not in unique_links: extracted['links'].append({'text': text, 'url': absolute_url}) unique_links.add(absolute_url) - except Exception: - if validators.url(href) and href not in unique_links: - extracted['links'].append({'text': text, 'url': href}) - unique_links.add(href) - elif urlparse(href).netloc and href not in unique_links: - extracted['links'].append({'text': text, 'url': href}) - unique_links.add(href) + elif not validators.url(absolute_url): + logger.debug(f"Skipping invalid joined URL: {absolute_url}") + except Exception as e: + logger.debug(f"Error joining URL {href} with base {base_url}: {e}. Skipping link.") + soup_copy = BeautifulSoup(content, 'html.parser') for script_or_style in soup_copy(["script", "style"]): @@ -307,10 +355,16 @@ class EnhancedURLProcessor: except Exception as e: logger.error(f"Enhanced HTML processing error for {base_url}: {e}") - soup_copy = BeautifulSoup(content, 'html.parser') - for script_or_style in soup_copy(["script", "style"]): - script_or_style.extract() - extracted['full_text'] = soup_copy.get_text(separator='\n').strip() + # Fallback to basic text extraction on error + try: + soup_copy = BeautifulSoup(content, 'html.parser') + for script_or_style in soup_copy(["script", "style"]): + script_or_style.extract() + extracted['full_text'] = soup_copy.get_text(separator='\n').strip() + except Exception as soup_e: + logger.error(f"Fallback HTML text extraction failed for {base_url}: {soup_e}") + extracted['full_text'] = "Failed to extract text." 
# Indicate total failure + extracted['processing_error'] = f"Enhanced HTML processing failed: {e}" return extracted @@ -335,7 +389,7 @@ class EnhancedURLProcessor: 'level': 0, 'fetch_result': None, 'linked_extractions': [], - 'processing_notes': [f"Initial URL validation failed: {validation_result['message']}"] + 'processing_notes': [f"Initial URL validation failed: {validation_result['message']}. Details: {validation_result['details']}"] } # Use a set to keep track of visited URLs during the crawl to avoid infinite loops @@ -344,6 +398,18 @@ class EnhancedURLProcessor: def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int, visited_urls: set) -> Dict[str, Any]: """Recursive helper function to fetch content and follow links.""" + # Basic check for URL length to prevent excessively long URLs causing issues + if len(url) > 2000: # Arbitrary limit to prevent extremely long URLs + logger.warning(f"Skipping excessively long URL: {url[:100]}... at level {current_step}.") + return { + 'url': url, + 'level': current_step, + 'fetch_result': None, + 'linked_extractions': [], + 'processing_notes': ["URL is excessively long."] + } + + if current_step > max_steps: logger.debug(f"Depth limit ({max_steps}) reached for {url} at level {current_step}.") return { @@ -355,10 +421,26 @@ class EnhancedURLProcessor: } # Normalize URL before checking visited set - normalized_url = url.rstrip('/') # Simple normalization + # Simple normalization: lowercase scheme and netloc, remove trailing slash + try: + parsed_url = urlparse(url) + # Normalize scheme and netloc to handle http vs https and www vs non-www consistently + normalized_netloc = parsed_url.netloc.lower() + # Optional: remove 'www.' if present, but this might be too aggressive depending on site structure + # if normalized_netloc.startswith('www.'): + # normalized_netloc = normalized_netloc[4:] + + normalized_url = parsed_url._replace( + scheme=parsed_url.scheme.lower(), + netloc=normalized_netloc + ).geturl().rstrip('/') + except Exception as e: + logger.warning(f"Failed to parse/normalize URL {url}: {e}. Using original URL for visited check.") + normalized_url = url.rstrip('/') + if normalized_url in visited_urls: - logger.debug(f"Skipping already visited URL: {url} at level {current_step}.") + logger.debug(f"Skipping already visited URL: {url} (normalized: {normalized_url}) at level {current_step}.") return { 'url': url, 'level': current_step, @@ -373,22 +455,39 @@ class EnhancedURLProcessor: fetch_result = self.fetch_content(url) linked_extractions: List[Dict[str, Any]] = [] - if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower(): + # Only attempt to extract links and recurse if the fetch was successful and it was HTML + # Check if fetch_result is not None and has a successful status code + if fetch_result and fetch_result.get('metadata', {}).get('status_code') is not None and 200 <= fetch_result['metadata']['status_code'] < 300 and \ + fetch_result.get('extracted_data') and isinstance(fetch_result['extracted_data'], dict) and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower(): + extracted_data = fetch_result['extracted_data'] links = extracted_data.get('links', []) logger.info(f"Found {len(links)} potential links on {url} at level {current_step}. 
Proceeding to depth {current_step + 1}.") if current_step < max_steps: - for link_info in links: + # Limit the number of links followed per page to prevent excessive crawling + max_links_per_page = 20 # Arbitrary limit + for link_info in links[:max_links_per_page]: linked_url = link_info.get('url') if linked_url: - # Ensure linked URL is absolute and potentially within the same domain # Simple same-domain check (can be made more sophisticated) try: - base_domain = urlparse(url).netloc - linked_domain = urlparse(linked_url).netloc + base_domain = urlparse(url).netloc.lower() + linked_domain = urlparse(linked_url).netloc.lower() + base_scheme = urlparse(url).scheme.lower() + linked_scheme = urlparse(linked_url).scheme.lower() + + # Only follow http/https links + if linked_scheme not in ['http', 'https']: + logger.debug(f"Skipping non-http/https link: {linked_url}") + continue + # Allow processing if domains match OR if linked_domain is empty (relative link) - if linked_domain and linked_domain != base_domain: + # Also allow if linked_domain is a subdomain of base_domain + is_same_domain = linked_domain == base_domain + is_subdomain = linked_domain.endswith('.' + base_domain) if base_domain else False + + if linked_domain and not is_same_domain and not is_subdomain: logger.debug(f"Skipping external link: {linked_url}") continue # Skip external links @@ -398,12 +497,14 @@ class EnhancedURLProcessor: linked_extractions.append(linked_result) except Exception as e: logger.warning(f"Error processing linked URL {linked_url} from {url}: {e}") + if len(links) > max_links_per_page: + logger.info(f"Truncated link following to {max_links_per_page} links on {url}.") + + current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed or skipped.'] + # Add a note indicating the level processed + current_notes.append(f"Processed at level {current_step}") - current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed.'] - if fetch_result and fetch_result.get('fetch_result') is not None: # Only add level note if fetch was attempted - if f"Processed at level {current_step}" not in current_notes: - current_notes.append(f"Processed at level {current_step}") return { 'url': url, @@ -416,15 +517,15 @@ class EnhancedURLProcessor: class EnhancedFileProcessor: """Advanced file processing with enhanced content extraction""" - def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default + def __init__(self, max_file_size: int = 500 * 1024 * 1024): # 500MB default, 5GB might be too large for typical web apps self.max_file_size = max_file_size self.supported_extensions = { '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.pdf', '.doc', '.docx', '.rtf', '.odt', - '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', + '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.xz', '.tgz', '.tar.gz', '.tar.bz2', '.tar.xz', } - self.archive_extensions = {'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'} + self.archive_extensions = {'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.xz', '.tgz', '.tar.gz', '.tar.bz2', '.tar.xz'} def process_file(self, file) -> List[Dict]: """Process uploaded file with enhanced error handling and complete extraction""" @@ -458,14 +559,15 @@ class EnhancedFileProcessor: 'processing_notes': ['File size exceeds limit.'] }] - # Use a temporary directory for archive extraction - with tempfile.TemporaryDirectory() as temp_dir: - temp_dir_path = Path(temp_dir) + # Use a temporary directory 
for archive extraction and document processing + with tempfile.TemporaryDirectory() as temp_dir_str: + temp_dir_path = Path(temp_dir_str) + resolved_temp_dir_path = temp_dir_path.resolve() # Resolve temp dir once if file_path.suffix.lower() in self.archive_extensions: - dataset.extend(self._process_archive(file_path, temp_dir_path)) + dataset.extend(self._process_archive(file_path, temp_dir_path, resolved_temp_dir_path)) elif file_path.suffix.lower() in self.supported_extensions: - dataset.extend(self._process_single_file(file_path)) + dataset.extend(self._process_single_file(file_path, temp_dir_path, resolved_temp_dir_path)) else: logger.warning(f"Unsupported file type for processing: '{file_path.name}'. Attempting to read as plain text.") try: @@ -508,7 +610,7 @@ class EnhancedFileProcessor: p = Path(filepath) if isinstance(filepath, str) else filepath return p.suffix.lower() in self.archive_extensions - def _process_single_file(self, file_path: Path) -> List[Dict]: + def _process_single_file(self, file_path: Path, temp_dir_path: Path, resolved_temp_dir_path: Path) -> List[Dict]: """Process a single file with enhanced character extraction and format-specific handling""" dataset_entries = [] filename = file_path.name @@ -530,7 +632,9 @@ class EnhancedFileProcessor: raw_content = content_bytes.decode(encoding, errors='replace') is_explicit_json = mime_type == 'application/json' or file_extension == '.json' - looks_like_json = raw_content.strip().startswith('{') or raw_content.strip().startswith('[') + # Check if it looks like JSON only if not already explicitly identified + looks_like_json = not is_explicit_json and raw_content.strip().startswith(('{', '[')) and raw_content.strip().endswith(('}', ']')) + if is_explicit_json or looks_like_json: try: @@ -539,17 +643,20 @@ class EnhancedFileProcessor: if not is_explicit_json: processing_notes.append("Note: Content looked like JSON despite extension/mime.") logger.warning(f"File '{filename}' identified as JSON content despite extension/mime.") - mime_type = 'application/json' + if 'json' not in mime_type: mime_type = 'application/json' # Update mime type if detected except json.JSONDecodeError as e: - processing_notes.append(f"Failed to parse as JSON: {e}.") + extracted_data = raw_content # Store raw content if parsing fails + processing_notes.append(f"Failed to parse as JSON: {e}. Stored raw text.") if is_explicit_json: logger.error(f"Explicit JSON file '{filename}' has invalid format: {e}") else: logger.warning(f"Content of '{filename}' looks like JSON but failed to parse: {e}") except Exception as e: - processing_notes.append(f"Error processing JSON: {e}.") + extracted_data = raw_content # Store raw content on other errors + processing_notes.append(f"Error processing JSON: {e}. 
Stored raw text.") logger.error(f"Error processing JSON in '{filename}': {e}") + # Check if it looks like XML only if not already processed as JSON looks_like_xml = extracted_data is None and raw_content.strip().startswith('<') and raw_content.strip().endswith('>') is_explicit_xml = extracted_data is None and (mime_type in ('application/xml', 'text/xml') or mime_type.endswith('+xml') or file_extension in ('.xml', '.xsd')) @@ -560,37 +667,50 @@ class EnhancedFileProcessor: processing_notes.append("Parsed as XML (text representation).") if not is_explicit_xml: processing_notes.append("Note: Content looked like XML despite extension/mime.") - if 'xml' not in mime_type: mime_type = 'application/xml' + if 'xml' not in mime_type: mime_type = 'application/xml' # Update mime type if detected except ET.ParseError as e: - processing_notes.append(f"Failed to parse as XML: {e}.") + extracted_data = raw_content # Store raw content if parsing fails + processing_notes.append(f"Failed to parse as XML: {e}. Stored raw text.") if is_explicit_xml: logger.error(f"Explicit XML file '{filename}' has invalid format: {e}") else: logger.warning(f"Content of '{filename}' looks like XML but failed to parse: {e}") except Exception as e: - processing_notes.append(f"Error processing XML: {e}.") + extracted_data = raw_content # Store raw content on other errors + processing_notes.append(f"Error processing XML: {e}. Stored raw text.") logger.error(f"Error processing XML in '{filename}': {e}") + # Check if it looks like CSV only if not already processed as JSON or XML is_explicit_csv = extracted_data is None and (mime_type == 'text/csv' or file_extension == '.csv') + # Basic heuristic for looks_like_csv: contains comma/semicolon AND multiple lines looks_like_csv = extracted_data is None and (',' in raw_content or ';' in raw_content) and ('\n' in raw_content or len(raw_content.splitlines()) > 1) if extracted_data is None and (is_explicit_csv or looks_like_csv): try: - dialect = 'excel' + dialect = 'excel' # Default dialect + # Use csv.Sniffer to detect dialect if possible try: - sample = '\n'.join(raw_content.splitlines()[:10]) + # Sniffer needs a sample with multiple lines if possible + sample_lines = raw_content.splitlines() + sample = '\n'.join(sample_lines[:10]) # Use up to 10 lines for sample if sample: + # Sniffer can raise csv.Error if sample is not CSV-like dialect = csv.Sniffer().sniff(sample).name logger.debug(f"Sniffer detected CSV dialect: {dialect} for '{filename}'") except csv.Error: logger.debug(f"Sniffer failed to detect dialect for '{filename}', using 'excel'.") dialect = 'excel' + except Exception as e: + logger.warning(f"Unexpected error during CSV dialect sniffing for '{filename}': {e}. 
Using 'excel'.") + dialect = 'excel' + + # Read CSV content csv_reader = csv.reader(io.StringIO(raw_content), dialect=dialect) rows = list(csv_reader) if rows: - max_rows_preview = 100 + max_rows_preview = 100 # Limit preview rows extracted_data = { 'headers': rows[0] if rows and rows[0] else None, 'rows': rows[1:max_rows_preview+1] if len(rows) > 1 else [] @@ -600,7 +720,7 @@ class EnhancedFileProcessor: processing_notes.append("Parsed as CSV.") if not is_explicit_csv: processing_notes.append("Note: Content looked like CSV despite extension/mime.") - mime_type = 'text/csv' + if 'csv' not in mime_type: mime_type = 'text/csv' # Update mime type if detected else: extracted_data = "Empty CSV" @@ -609,25 +729,38 @@ class EnhancedFileProcessor: processing_notes.append("Note: Content looked like CSV but was empty.") except Exception as e: - processing_notes.append(f"Failed to parse as CSV: {e}.") + extracted_data = raw_content # Store raw content if parsing fails + processing_notes.append(f"Failed to parse as CSV: {e}. Stored raw text.") logger.warning(f"Failed to parse CSV from '{filename}': {e}") + # Attempt document specific extraction if not already processed as structured data if extracted_data is None: try: extracted_text = None + # Need to save bytes to a temporary file for libraries that expect a file path + temp_file_suffix = file_extension # Use original extension for temp file + temp_path = None # Initialize temp_path + if file_extension == '.pdf' and PDF_SUPPORT: - with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file: + # Create temp file within the designated temp directory + with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_suffix, dir=temp_dir_path) as tmp_file: tmp_file.write(content_bytes) temp_path = Path(tmp_file.name) try: reader = PdfReader(temp_path) + # Concatenate text from all pages text_content = "".join(page.extract_text() or "" for page in reader.pages) extracted_text = text_content processing_notes.append("Extracted text from PDF.") + except Exception as e: + processing_notes.append(f"PDF extraction error: {e}") + logger.warning(f"Failed to extract PDF text from '{filename}': {e}") finally: - if temp_path.exists(): temp_path.unlink() + if temp_path and temp_path.exists(): temp_path.unlink() # Clean up temp file + elif file_extension == '.docx' and DOCX_SUPPORT: - with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file: + # Create temp file within the designated temp directory + with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_suffix, dir=temp_dir_path) as tmp_file: tmp_file.write(content_bytes) temp_path = Path(tmp_file.name) try: @@ -635,11 +768,16 @@ class EnhancedFileProcessor: text_content = "\n".join(paragraph.text for paragraph in document.paragraphs) extracted_text = text_content processing_notes.append("Extracted text from DOCX.") + except Exception as e: + processing_notes.append(f"DOCX extraction error: {e}") + logger.warning(f"Failed to extract DOCX text from '{filename}': {e}") finally: - if temp_path.exists(): temp_path.unlink() + if temp_path and temp_path.exists(): temp_path.unlink() # Clean up temp file + elif file_extension == '.rtf' and RTF_SUPPORT: try: # Need to read RTF content as text, not bytes, for pyth's Rtf15Reader + # Assuming raw_content is already decoded text doc = Rtf15Reader.read(io.StringIO(raw_content)) text_content = PlaintextWriter.write(doc).getvalue() extracted_text = text_content @@ -647,35 +785,49 @@ class EnhancedFileProcessor: except Exception as e: 
processing_notes.append(f"RTF extraction error: {e}") logger.warning(f"Failed to extract RTF text from '{filename}': {e}") + elif file_extension == '.odt' and ODT_SUPPORT: - with tempfile.NamedTemporaryFile(delete=False, suffix='.odt') as tmp_file: + # Create temp file within the designated temp directory + with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_suffix, dir=temp_dir_path) as tmp_file: tmp_file.write(content_bytes) temp_path = Path(tmp_file.name) try: text_doc = OpenDocumentText(temp_path) paragraphs = text_doc.getElementsByType(odftext.P) + # Iterate through paragraphs and their child nodes to get text text_content = "\n".join("".join(node.text for node in p.childNodes) for p in paragraphs) extracted_text = text_content processing_notes.append("Extracted text from ODT.") + except Exception as e: + processing_notes.append(f"ODT extraction error: {e}") + logger.warning(f"Failed to extract ODT text from '{filename}': {e}") finally: - if temp_path.exists(): temp_path.unlink() + if temp_path and temp_path.exists(): temp_path.unlink() # Clean up temp file + elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']: + # Note: These require external libraries like python-pptx, openpyxl, or potentially platform-specific tools processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.") logger.warning(f"Automatic text extraction for {file_extension.upper()} not fully implemented for '{filename}'.") if extracted_text is not None: - max_extracted_text_size = 10000 + max_extracted_text_size = 10000 # Limit extracted text size for display/QR extracted_data = {'text': extracted_text[:max_extracted_text_size]} if len(extracted_text) > max_extracted_text_size: extracted_data['text'] += "..." processing_notes.append("Extracted text truncated.") + if mime_type in ['unknown/unknown', 'application/octet-stream']: + # Guess a more specific mime type if text was extracted + guessed_text_mime, _ = mimetypes.guess_type('dummy.txt') + if guessed_text_mime: mime_type = guessed_text_mime + except ImportError as e: - processing_notes.append(f"Missing dependency for document type ({e}). Cannot extract text.") + processing_notes.append(f"Missing dependency for document type ({e.name if hasattr(e, 'name') else str(e)}). 
Cannot extract text.") except Exception as e: processing_notes.append(f"Error during document text extraction: {e}") logger.warning(f"Error during document text extraction for '{filename}': {e}") + # If no specific extraction worked, store the raw decoded content as plain text if extracted_data is None: extracted_data = {'plain_text': raw_content} processing_notes.append("Stored as plain text.") @@ -686,8 +838,8 @@ class EnhancedFileProcessor: except Exception as e: logger.error(f"Fatal error processing single file '{filename}': {e}") processing_notes.append(f"Fatal processing error: {e}") - raw_content = None - extracted_data = None + raw_content = raw_content if 'raw_content' in locals() else None # Preserve raw content if decoded before error + extracted_data = None # Clear extracted data on fatal error entry = { 'source': 'file', @@ -704,8 +856,8 @@ class EnhancedFileProcessor: dataset_entries.append(entry) return dataset_entries - def _process_archive(self, archive_path: Path, extract_to: Path) -> List[Dict]: - """Process an archive file with enhanced extraction""" + def _process_archive(self, archive_path: Path, extract_to: Path, resolved_extract_to: Path) -> List[Dict]: + """Process an archive file with enhanced extraction and security""" dataset = [] archive_extension = archive_path.suffix.lower() logger.info(f"Processing archive: '{archive_path.name}'") @@ -715,62 +867,97 @@ class EnhancedFileProcessor: if zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path, 'r') as zip_ref: for file_info in zip_ref.infolist(): - # Prevent Zip Slip vulnerability - sanitized_filename = Path(file_info.filename).name # Takes only the base name - extracted_file_path = extract_to / sanitized_filename - - if file_info.file_size > 0 and not file_info.filename.endswith('/'): + if file_info.file_size > 0 and not file_info.is_dir(): + # Calculate the intended extraction path within the temp_dir + # Use pathlib's / operator which is safer than os.path.join + extracted_file_path = extract_to / file_info.filename try: - # Use extract method with path to temp_dir for safety + # Resolve the potential extraction path to check against the resolved temp dir + # This is the most robust check against Zip Slip + resolved_extracted_file_path = extracted_file_path.resolve() + + # Check if the resolved extracted path is actually inside the resolved temp directory + if not resolved_extracted_file_path.is_relative_to(resolved_extract_to): + logger.warning(f"Skipping potentially malicious path in zip: '{file_info.filename}' (resolved to {resolved_extracted_file_path})") + continue # Skip this member + + # If the check passes, proceed with extraction + # Note: zipfile.extract() is generally safe in recent Python versions (>=3.6) zip_ref.extract(file_info, path=extract_to) - extracted_file_path = extract_to / file_info.filename # Get the actual extracted path - if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path): - dataset.extend(self._process_single_file(extracted_file_path)) - elif extracted_file_path.suffix.lower() in self.archive_extensions: - logger.info(f"Found nested archive '{file_info.filename}', processing recursively.") - dataset.extend(self._process_archive(extracted_file_path, extract_to)) + # Process the extracted file + if extracted_file_path.suffix.lower() in self.supported_extensions: + if self._is_archive(extracted_file_path): + logger.info(f"Found nested archive '{file_info.filename}', processing recursively.") + # Recursively 
call, passing a *new* temp dir path for nested extraction + with tempfile.TemporaryDirectory() as nested_temp_dir_str: + nested_temp_dir_path = Path(nested_temp_dir_str) + dataset.extend(self._process_archive(extracted_file_path, nested_temp_dir_path, nested_temp_dir_path.resolve())) + else: + # Pass the nested temp dir info down for processing the single file + dataset.extend(self._process_single_file(extracted_file_path, extract_to, resolved_extract_to)) else: - logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'") + logger.debug(f"Skipping unsupported file in zip: '{file_info.filename}'") except Exception as e: logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}") finally: # Clean up the extracted file immediately if extracted_file_path.exists(): try: - extracted_file_path.unlink() + # Re-check resolved path before unlinking for safety + if extracted_file_path.resolve().is_relative_to(resolved_extract_to): + extracted_file_path.unlink() + else: + logger.warning(f"Skipping cleanup of path outside temp dir: {extracted_file_path}") except OSError as e: logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}") else: logger.error(f"'{archive_path.name}' is not a valid zip file.") - elif archive_extension in ('.tar', '.gz', '.tgz'): # .tgz is often tar.gz + elif archive_extension in ('.tar', '.gz', '.tgz', '.tar.gz', '.bz2', '.tar.bz2', '.xz', '.tar.xz'): # Added more tar/compression extensions try: mode = 'r' - if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz' # Handle .tar.gz and .tgz + if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz' + elif archive_extension == '.tar.bz2': mode = 'r:bz2' + elif archive_extension == '.tar.xz': mode = 'r:xz' + elif archive_extension == '.gz': mode = 'r:gz' # Handle standalone .gz as tar.gz with single member + elif archive_extension == '.bz2': mode = 'r:bz2' # Handle standalone .bz2 as tar.bz2 with single member + elif archive_extension == '.xz': mode = 'r:xz' # Handle standalone .xz as tar.xz with single member + # Note: standalone .gz, .bz2, .xz are typically single files, and tarfile can read them. 
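# Note: tarfile.open(path, 'r:gz') succeeds only when the decompressed payload is itself a
# tar archive; a bare single-file .gz/.bz2/.xz raises tarfile.ReadError. A minimal fallback
# sketch for such standalone members, using only gzip and the bz2/lzma modules added at the
# top of this patch (`decompress_single_file` is a hypothetical helper name):
import bz2
import gzip
import lzma
from pathlib import Path

_OPENERS = {'.gz': gzip.open, '.bz2': bz2.open, '.xz': lzma.open}

def decompress_single_file(src: Path, dest_dir: Path) -> Path:
    """Decompress a standalone .gz/.bz2/.xz file into dest_dir and return the new path."""
    opener = _OPENERS[src.suffix.lower()]
    target = dest_dir / src.stem  # e.g. 'report.txt.gz' -> 'report.txt'
    with opener(src, 'rb') as compressed, open(target, 'wb') as out:
        # Stream in 1 MiB blocks so large files do not need to fit in memory.
        for block in iter(lambda: compressed.read(1024 * 1024), b''):
            out.write(block)
    return target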
with tarfile.open(archive_path, mode) as tar_ref: for member in tar_ref.getmembers(): if member.isfile(): - # Prevent Tar Slip vulnerability - sanitized_filename = Path(member.name).name # Takes only the base name - extracted_file_path = extract_to / sanitized_filename + # Calculate the intended extraction path within the temp_dir + extracted_file_path = extract_to / member.name # This is the path *inside* the temp dir try: - # Use extractfile method and write manually for better control/safety - member_file = tar_ref.extractfile(member) - if member_file: - with open(extracted_file_path, 'wb') as outfile: - outfile.write(member_file.read()) - member_file.close() # Close the BytesIO object - - if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path): - dataset.extend(self._process_single_file(extracted_file_path)) - elif extracted_file_path.suffix.lower() in self.archive_extensions: - logger.info(f"Found nested archive '{member.name}', processing recursively.") - dataset.extend(self._process_archive(extracted_file_path, extract_to)) + # Resolve the potential extraction path + resolved_extracted_file_path = extracted_file_path.resolve() + + # Check if the resolved extracted path is actually inside the resolved temp directory + # This is the most robust check against Tar Slip + if not resolved_extracted_file_path.is_relative_to(resolved_extract_to): + logger.warning(f"Skipping potentially malicious path in tar: '{member.name}' (resolved to {resolved_extracted_file_path})") + continue # Skip this member + + # If the check passes, proceed with extraction + # Note: tarfile.extract() is generally safe in recent Python versions (>=3.6) + tar_ref.extract(member, path=extract_to) + + # Process the extracted file + if extracted_file_path.suffix.lower() in self.supported_extensions: + if self._is_archive(extracted_file_path): + logger.info(f"Found nested archive '{member.name}', processing recursively.") + # Recursively call, passing a *new* temp dir path for nested extraction + with tempfile.TemporaryDirectory() as nested_temp_dir_str: + nested_temp_dir_path = Path(nested_temp_dir_str) + dataset.extend(self._process_archive(extracted_file_path, nested_temp_dir_path, nested_temp_dir_path.resolve())) + else: + # Pass the nested temp dir info down for processing the single file + dataset.extend(self._process_single_file(extracted_file_path, extract_to, resolved_extract_to)) else: - logger.warning(f"Could not get file-like object for {member.name} from tar.") + logger.debug(f"Skipping unsupported file in tar: '{member.name}'") except Exception as e: logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}") @@ -778,48 +965,28 @@ class EnhancedFileProcessor: # Clean up the extracted file immediately if extracted_file_path.exists(): try: - extracted_file_path.unlink() + # Re-check resolved path before unlinking for safety + if extracted_file_path.resolve().is_relative_to(resolved_extract_to): + extracted_file_path.unlink() + else: + logger.warning(f"Skipping cleanup of path outside temp dir: {extracted_file_path}") except OSError as e: logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}") except tarfile.TarError as e: logger.error(f"Error processing TAR archive '{archive_path.name}': {e}") + except Exception as e: + logger.error(f"Unexpected error processing TAR archive '{archive_path.name}': {e}") - elif archive_extension == '.gz': # Handle standalone .gz (single file compression) 
- extracted_name = archive_path.stem # Get filename without .gz - extracted_path = extract_to / extracted_name - try: - with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile: - outfile.write(gz_file.read()) - # Process the extracted file - if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_path): - dataset.extend(self._process_single_file(extracted_path)) - elif extracted_path.suffix.lower() in self.archive_extensions: - logger.info(f"Found nested archive '{extracted_name}', processing recursively.") - dataset.extend(self._process_archive(extracted_path, extract_to)) - else: - logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'") - - except gzip.BadGzipFile as e: - logger.error(f"Error processing GZIP file '{archive_path.name}': {e}") - except Exception as e: - logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}") - finally: - # Clean up the extracted file immediately - if extracted_path.exists(): - try: - extracted_path.unlink() - except OSError as e: - logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}") - - elif archive_extension in ('.bz2', '.7z', '.rar'): - logger.warning(f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.") + elif archive_extension in ('.7z', '.rar'): + logger.warning(f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries like py7zr or rarfile.") except Exception as e: logger.error(f"Overall archive processing error for '{archive_path.name}': {e}") return dataset + def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]: """Enhanced data chunking with sequence metadata""" try: @@ -831,64 +998,122 @@ class EnhancedFileProcessor: data_list = data # JSON dump the entire list first + # Use compact separators for maximum data density json_str = json.dumps(data_list, ensure_ascii=False, separators=(',', ':')) total_length = len(json_str) # Estimate overhead for metadata + some buffer # Example metadata: {"idx":0,"tc":1,"tl":1000,"hash":1234567890,"data":"..."} - # A rough estimate of the metadata string length - # Assuming max 5 digits for idx/tc, 10 for tl, 10 for hash, plus keys, colons, commas, quotes - # {"idx":NNNNN,"tc":NNNNN,"tl":NNNNNNNNNN,"hash":NNNNNNNNNN,"data":""} - # ~ 7 + 5 + 6 + 5 + 6 + 10 + 7 + 10 + 7 + 0 + 2 + 4*3 (commas/colons) + 2*2 (quotes) = ~ 80-100 characters - # Let's use a slightly safer estimate - overhead_estimate = len(json.dumps({"idx": 99999, "tc": 99999, "tl": 9999999999, "hash": 9999999999, "data": ""}, separators=(',', ':'))) + 50 # Add buffer - # Max QR code capacity for alphanumeric is higher than byte/binary. - # Max size 2953 is for bytes. For alphanumeric, it's 4296. - # We are encoding JSON (mostly alphanumeric, but can contain non-ASCII). - # Using byte capacity (2953) is safer. Let's stick to 2953 as the max_size input. - - effective_chunk_size = max_size - overhead_estimate - - if effective_chunk_size <= 0: - logger.error(f"Max QR size ({max_size}) is too small for metadata overhead ({overhead_estimate}). 
Cannot chunk.") + # Max possible values for idx, tc (up to num_chunks, which can be large), tl (up to total_length), hash (32-bit int) + # Let's assume max 6 digits for idx/tc (up to 999,999 chunks), 10 for tl, 10 for hash + # {"idx":999999,"tc":999999,"tl":9999999999,"hash":4294967295,"data":""} + # Length of keys + colons + commas + quotes + max value lengths: + # "idx": (5 + 1 + 6) = 12 + # "tc": (4 + 1 + 6) = 11 + # "tl": (4 + 1 + 10) = 15 + # "hash": (6 + 1 + 10) = 17 + # "data": (6 + 1 + 2) = 9 (for quotes around empty string) + # Commas: 3 + # Total estimate: 12 + 11 + 15 + 17 + 9 + 3 = 67 + # Add a buffer for safety (e.g., 30-50 chars) + overhead_estimate = 100 # Safe estimate for metadata overhead + + # Max QR code capacity for bytes (Version 40, Level M) is 2953. + # We are encoding JSON string, which is mostly alphanumeric but can contain non-ASCII characters. + # Using byte capacity is the safest approach. + max_qr_byte_capacity = 2953 # Max bytes for QR Version 40, Error Correction M + + effective_chunk_size_bytes = max_qr_byte_capacity - overhead_estimate + + if effective_chunk_size_bytes <= 0: + logger.error(f"Max QR byte capacity ({max_qr_byte_capacity}) is too small for metadata overhead ({overhead_estimate}). Cannot chunk.") return [] - if total_length <= effective_chunk_size: + # Convert the JSON string to bytes to get the actual byte length for chunking + json_bytes = json_str.encode('utf-8') + total_byte_length = len(json_bytes) + + if total_byte_length <= effective_chunk_size_bytes: # Single chunk case - chunk_data = json_str + chunk_data = json_str # Store the string data in the chunk chunk = { "idx": 0, "tc": 1, - "tl": total_length, - "hash": hash(chunk_data) & 0xFFFFFFFF, # Use a simple hash - "data": chunk_data + "tl": total_byte_length, # Total length is in bytes + "hash": hash(json_str) & 0xFFFFFFFF, # Use hash of the whole string, truncated to 32-bit + "data": chunk_data # The data payload is the full JSON string } return [chunk] # Multi-chunk case - num_chunks = math.ceil(total_length / effective_chunk_size) chunks = [] - current_pos = 0 - for i in range(num_chunks): - end_pos = min(current_pos + effective_chunk_size, total_length) - chunk_data_str = json_str[current_pos:end_pos] + current_byte_pos = 0 + + # Iterate and split the bytes, then decode each chunk + # This is safer than splitting the string directly by character index + for i in range(math.ceil(total_byte_length / effective_chunk_size_bytes)): + end_byte_pos = min(current_byte_pos + effective_chunk_size_bytes, total_byte_length) + + # Find the nearest character boundary before end_byte_pos + # Decode a small buffer around the end position to find a character boundary + buffer_size = 10 # Look back a few bytes + safe_end_byte_pos = end_byte_pos + if safe_end_byte_pos < total_byte_length: + # Decode a small slice ending at end_byte_pos + try: + # Attempt to decode from a few bytes before the end to the end + test_slice = json_bytes[max(0, end_byte_pos - buffer_size) : end_byte_pos] + test_slice.decode('utf-8', errors='strict') # Try strict decoding + # If strict decoding works, end_byte_pos is a character boundary + safe_end_byte_pos = end_byte_pos + except UnicodeDecodeError as e: + # If decoding fails, it's not a character boundary. Adjust to the start of the invalid sequence. 
+ # e.start gives the index within the slice, add current_byte_pos to get index in full bytes + safe_end_byte_pos = max(0, end_byte_pos - buffer_size) + e.start + logger.debug(f"Adjusted chunk boundary from {end_byte_pos} to {safe_end_byte_pos} due to character boundary.") + except Exception as e: + logger.warning(f"Unexpected error finding character boundary near byte {end_byte_pos}: {e}. Using unverified boundary.") + safe_end_byte_pos = end_byte_pos # Fallback + + + # Get the byte slice for the chunk + chunk_bytes = json_bytes[current_byte_pos:safe_end_byte_pos] + + # Decode the chunk bytes into a string + try: + chunk_data_str = chunk_bytes.decode('utf-8', errors='strict') + except UnicodeDecodeError as e: + logger.error(f"Failed to strictly decode chunk {i} bytes ({current_byte_pos}-{safe_end_byte_pos}): {e}. Using 'replace' error handling.") + chunk_data_str = chunk_bytes.decode('utf-8', errors='replace') + except Exception as e: + logger.error(f"Unexpected error decoding chunk {i} bytes ({current_byte_pos}-{safe_end_byte_pos}): {e}. Using 'replace' error handling.") + chunk_data_str = chunk_bytes.decode('utf-8', errors='replace') + + + # Re-check the byte length of the decoded string (should be <= effective_chunk_size_bytes) + actual_chunk_byte_length = len(chunk_data_str.encode('utf-8')) + if actual_chunk_byte_length > effective_chunk_size_bytes: + logger.error(f"Chunk {i} byte length ({actual_chunk_byte_length}) exceeds effective size ({effective_chunk_size_bytes}) after decoding/re-encoding. This indicates a logic error or extreme edge case.") + return [] # Indicate failure + num_chunks = math.ceil(total_byte_length / effective_chunk_size_bytes) # Total number of chunks, needed for the "tc" field and the final log message chunk = { "idx": i, "tc": num_chunks, + "tl": total_byte_length, # Total length is in bytes + "hash": hash(chunk_data_str) & 0xFFFFFFFF, # Hash each chunk string "data": chunk_data_str } chunks.append(chunk) - current_pos = end_pos + current_byte_pos = safe_end_byte_pos # Move to the end of the current chunk's bytes + + # Final check to ensure all data was included + if current_byte_pos < total_byte_length: + logger.error(f"Chunking logic error: Only processed {current_byte_pos} of {total_byte_length} bytes.") + # As a safeguard, return the chunks generated so far, but log the error + pass # Allow returning partial chunks with a warning for now - if current_pos < total_length: - logger.error(f"Chunking logic error: Only processed {current_pos} of {total_length} characters.") - # This should not happen with ceil and min, but as a safeguard - return [] # Indicate failure logger.info(f"Chunked data into {num_chunks} chunks for QR codes. 
Total byte length: {total_byte_length}") return chunks except Exception as e: @@ -910,20 +1135,18 @@ def generate_stylish_qr(data: Union[str, Dict], border=border ) - # Data to encode should be a string, typically the JSON chunk + # Data to encode should be a string, typically the JSON chunk dictionary dumped to string if isinstance(data, dict): # Ensure it's dumped to a string if it's a dict chunk data_to_encode = json.dumps(data, ensure_ascii=False, separators=(',', ':')) else: - # Assume it's already the string data chunk - data_to_encode = str(data) + # Assume it's already the string data chunk payload + data_to_encode = str(data) # Ensure it's a string qr.add_data(data_to_encode) qr.make(fit=True) # Fit the QR code size to the data qr_image = qr.make_image(fill_color=fill_color, back_color=back_color) - # qr_image = qr_image.convert('RGBA') # Conversion might not be needed for simple fill/back colors - # Optional: Add a simple gradient overlay for style (can be resource intensive) # try: # gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0)) @@ -941,7 +1164,7 @@ def generate_stylish_qr(data: Union[str, Dict], final_image = qr_image output_path = QR_CODES_DIR / filename - # Use PNG for lossless quality, 90 quality is for JPEGs but harmless here + # Use PNG for lossless quality final_image.save(output_path, format='PNG') return str(output_path) @@ -951,13 +1174,14 @@ def generate_stylish_qr(data: Union[str, Dict], def generate_qr_codes(data: List[Dict], combined: bool = True) -> List[str]: """Generate QR codes with enhanced visual appeal and metadata""" - # Ensure data is a list of dictionaries as expected + # Ensure data is a list as expected if not isinstance(data, list): logger.error("generate_qr_codes received data that is not a list.") return [] - if not all(isinstance(item, dict) for item in data): - logger.error("generate_qr_codes received a list containing non-dictionary items.") - return [] + # Allow list of potentially non-dict items, as flatten_item handles wrapping + # if not all(isinstance(item, dict) for item in data): + # logger.warning("generate_qr_codes received a list containing non-dictionary items.") + try: file_processor = EnhancedFileProcessor() # Use the processor for chunking @@ -987,7 +1211,8 @@ def generate_qr_codes(data: List[Dict], combined: bool = True) -> List[str]: if data: for idx, item in enumerate(data): # Chunk the single item (wrapped in a list for chunk_data consistency) - chunks = file_processor.chunk_data([item]) # Pass item as a list + # chunk_data expects List[Dict], so wrap item in list + chunks = file_processor.chunk_data([item]) if not chunks: logger.warning(f"No chunks generated for item {idx+1}.") continue @@ -1027,26 +1252,38 @@ def respond_to_chat( # Initialize chat_history if it's None (Gradio might pass None initially) if chat_history is None: chat_history = [] - + if chatbot_data is None or not chatbot_data: - chat_history.append((message, "Please process some data first using the other tabs before chatting.")) + # Append user message and then the bot response + # Ensure chat_history format is list of lists/tuples + if not chat_history or chat_history[-1][0] != message: # Avoid appending the same message twice if Gradio resends + chat_history.append([message, None]) # Append user message with None placeholder + # Update the placeholder with the response + if chat_history and chat_history[-1][1] is None: + chat_history[-1][1] = "Please process some data first using the other tabs before chatting." 
+ else: # Fallback if somehow no placeholder exists + chat_history.append([message, "Please process some data first using the other tabs before chatting."]) + return chat_history, chatbot_data, current_filtered_df_state # Return existing state - + # Append user message to history immediately - chat_history.append((message, None)) # Use None as a placeholder for the assistant's response - + # Gradio's chatbot type='messages' expects [[user, bot], [user, bot], ...] + # So we append a new entry with user message and None for bot response + if not chat_history or chat_history[-1][0] != message: # Avoid appending the same message twice + chat_history.append([message, None]) + response = "" lower_message = message.lower().strip() - + # Initialize new_filtered_df_state with the current state to preserve it unless a filter changes it new_filtered_df_state = current_filtered_df_state - + df = None - try: + try: # Try block 1: DataFrame Creation # Attempt to create a DataFrame from the full chatbot_data for analysis # This flattens the structure for easier querying with pandas flat_data = [] - + def flatten_item(d, parent_key='', sep='_'): items = {} if isinstance(d, dict): @@ -1067,7 +1304,7 @@ def respond_to_chat( # If d is a primitive (int, str, bool, None), it won't add anything here, which is fine # as primitives are handled in the dict/list branches. return items - + # Process each top-level item in chatbot_data for i, item in enumerate(chatbot_data): if isinstance(item, dict): @@ -1077,133 +1314,130 @@ def respond_to_chat( # If chatbot_data contains non-dict top-level items, flatten them too elif isinstance(item, (list, str, int, float, bool, type(None))): flat_data.append({'item_value': item}) # Wrap primitives in a dict - - except Exception as e: - # Handle exceptions that may occur during processing - response = f"An error occurred: {str(e)}" - chat_history.append((message, response)) # Append error message to chat history + else: + # Handle potentially unflattenable types gracefully + logger.warning(f"Skipping unflattenable item type {type(item)} at index {i} for DataFrame conversion.") + # Optionally add a placeholder indicating the skipped item + # flat_data.append({'unsupported_item_type': str(type(item))}) + if flat_data: - try: - # Create DataFrame. Use errors='ignore' for columns with mixed types that can't be coerced + try: # Inner try block for pd.DataFrame creation specifically + # Handle potential issues with inconsistent columns after flattening + # Fill missing columns with NaN to create a rectangular DataFrame df = pd.DataFrame(flat_data) + # Convert object columns to string type explicitly to avoid future warnings/errors + # This also helps handle mixed types in columns after flattening for col in df.columns: if df[col].dtype == 'object': + # Use errors='ignore' in astype if needed, but str conversion is usually safe df[col] = df[col].astype(str) logger.debug(f"Created DataFrame with shape: {df.shape}") logger.debug(f"DataFrame columns: {list(df.columns)}") except Exception as e: logger.warning(f"Could not create pandas DataFrame from processed data: {e}. Falling back to manual processing.") - df = None + df = None # Ensure df is None on error else: logger.warning("Flattened data is empty. 
Cannot create DataFrame.") - df = None + df = None # Ensure df is None if flat_data is empty - except Exception as e: + except Exception as e: # Catch errors during the flattening loop itself (Try block 1 catch) logger.error(f"Error during DataFrame creation from chatbot_data: {e}") - df = None - response = f"An error occurred while preparing data for analysis: {e}" - - - # --- Complex Queries and Analysis --- - # These operations should primarily act on the FULL dataframe 'df' - # unless the user explicitly asks about the 'filtered' data. - # The filter command itself updates `new_filtered_df_state`. - - if df is not None and not response: # Proceed with analysis if DataFrame exists and no error yet - # List available columns (from the full DataFrame) - if "what columns are available" in lower_message or "list columns" in lower_message: - response = f"The available columns in the full dataset are: {', '.join(df.columns)}" - - # Describe a specific column (from the full DataFrame) - match = re.search(r'describe column (\w+)', lower_message) - if match: - column_name = match.group(1) - if column_name in df.columns: - # Handle non-numeric describe gracefully - try: - description = df[column_name].describe().to_string() - response = f"Description for column '{column_name}':\n```\n{description}\n```" - except Exception as e: - response = f"Could not generate description for column '{column_name}': {e}" - logger.warning(f"Error describing column '{column_name}': {e}") - else: - response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" - - # How many unique values in a column? (from the full DataFrame) - match = re.search(r'how many unique values in (\w+)', lower_message) - if match: - column_name = match.group(1) - if column_name in df.columns: - try: - unique_count = df[column_name].nunique() - response = f"There are {unique_count} unique values in the '{column_name}' column (in the full dataset)." - except Exception as e: - response = f"Could not count unique values for column '{column_name}': {e}" - logger.warning(f"Error counting unique values for column '{column_name}': {e}") - else: - response = f"I couldn't find a column named '{column_name}' in the data. Available columns are: {', '.join(df.columns)}" + df = None # Ensure df is None on error + # Do NOT set response here, let the main logic handle it if df is None - # What is the average/sum/min/max of a numeric column? (from the full DataFrame) - match = re.search(r'what is the (average|sum|min|max) of (\w+)', lower_message) - if match: - operation, column_name = match.groups() - if column_name in df.columns: - try: - # Attempt to convert to numeric, coercing errors to NaN, then drop NaNs - numeric_col = pd.to_numeric(df[column_name], errors='coerce').dropna() - - if not numeric_col.empty: - if operation == 'average': - result = numeric_col.mean() - response = f"The average of '{column_name}' is {result:.2f}." - elif operation == 'sum': - result = numeric_col.sum() - response = f"The sum of '{column_name}' is {result:.2f}." - elif operation == 'min': - result = numeric_col.min() - response = f"The minimum of '{column_name}' is {result}." - elif operation == 'max': - result = numeric_col.max() - response = f"The maximum of '{column_name}' is {result}." + + # --- Main Chatbot Logic --- + # This block contains the core query processing and response generation. + # Wrap it in a try/except to catch errors during analysis. 
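# The flatten-then-DataFrame step above can also be illustrated with pandas' built-in
# json_normalize, which flattens nested dicts into separator-joined column names (its
# handling of nested lists differs from the custom flatten_item helper, so this is only
# a sketch of the idea, not a drop-in replacement):
import pandas as pd

records = [
    {'source': 'url', 'metadata': {'status_code': 200, 'content_type': 'text/html'}},
    {'source': 'file', 'metadata': {'status_code': None, 'content_type': 'application/json'}},
]
flat_df = pd.json_normalize(records, sep='_')
print(list(flat_df.columns))  # ['source', 'metadata_status_code', 'metadata_content_type']
print(flat_df.shape)          # (2, 3)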
+ try: + if df is not None: # Proceed with analysis if DataFrame exists + # List available columns (from the full DataFrame) + if "what columns are available" in lower_message or "list columns" in lower_message or "show columns" in lower_message: + response = f"The available columns in the full dataset are: {', '.join(df.columns)}" + + # Describe a specific column (from the full DataFrame) + match = re.search(r'describe column (\w+)', lower_message) + if match: + column_name = match.group(1) + if column_name in df.columns: + # Handle non-numeric describe gracefully + try: + description = df[column_name].describe().to_string() + response = f"Description for column '{column_name}':\n```\n{description}\n```" + except Exception as e: + response = f"Could not generate description for column '{column_name}': {e}" + logger.warning(f"Error describing column '{column_name}': {e}") + else: + response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" + + # How many unique values in a column? (from the full DataFrame) + match = re.search(r'how many unique values in (\w+)', lower_message) + if match: + column_name = match.group(1) + if column_name in df.columns: + try: + unique_count = df[column_name].nunique() + response = f"There are {unique_count} unique values in the '{column_name}' column (in the full dataset)." + except Exception as e: + response = f"Could not count unique values for column '{column_name}': {e}" + logger.warning(f"Error counting unique values for column '{column_name}': {e}") + else: + response = f"I couldn't find a column named '{column_name}' in the data. Available columns are: {', '.join(df.columns)}" + + # What is the average/sum/min/max of a numeric column? (from the full DataFrame) + match = re.search(r'what is the (average|sum|min|max) of (\w+)', lower_message) + if match: + operation, column_name = match.groups() + if column_name in df.columns: + try: + # Attempt to convert to numeric, coercing errors to NaN, then drop NaNs + numeric_col = pd.to_numeric(df[column_name], errors='coerce').dropna() + + if not numeric_col.empty: + if operation == 'average': + result = numeric_col.mean() + response = f"The average of '{column_name}' is {result:.2f}." + elif operation == 'sum': + result = numeric_col.sum() + response = f"The sum of '{column_name}' is {result:.2f}." + elif operation == 'min': + result = numeric_col.min() + response = f"The minimum of '{column_name}' is {result}." + elif operation == 'max': + result = numeric_col.max() + response = f"The maximum of '{column_name}' is {result}." + else: + response = "I can calculate average, sum, min, or max." # Should not reach here due to regex else: - response = "I can calculate average, sum, min, or max." # Should not reach here due to regex - else: - response = f"The column '{column_name}' does not contain numeric values that I can analyze." - except Exception as e: - response = f"An error occurred while calculating the {operation} of '{column_name}': {e}" - logger.error(f"Error calculating {operation} for column '{column_name}': {e}") - else: - response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" - - # Enhanced Filter data based on more complex conditions - # This section *updates* `new_filtered_df_state` based on the filter command. - # It should filter from the *full* dataframe (`df`). 
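The average/sum/min/max branch above leans on `pd.to_numeric(..., errors='coerce')` so that mixed-type columns do not raise; non-numeric entries simply drop out before the statistic is computed. A small sketch of that pattern, with made-up data:

```python
import pandas as pd

df = pd.DataFrame({"price": ["10.5", "n/a", "7", None, "3.25"]})

# Coerce non-numeric entries to NaN, then drop them before aggregating
numeric_col = pd.to_numeric(df["price"], errors="coerce").dropna()

if not numeric_col.empty:
    print(f"average: {numeric_col.mean():.2f}")  # average: 6.92
    print(f"sum: {numeric_col.sum():.2f}")       # sum: 20.75
    print(f"min/max: {numeric_col.min()} / {numeric_col.max()}")
else:
    print("Column has no numeric values to analyze.")
```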
- filter_match = re.search( - r'(?:filter|show items|show me items|find entries|select items|get items)\s+' # Optional action phrases - r'(?:where|by|for|with|if)\s+' # Keyword indicating condition - r'(\w+)\s+' # Column name - r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+' # Operator - r'([\'"]?[\w\s.-]+[\'"]?)', # Value (allows spaces, dots, hyphens if quoted, or single words) - lower_message - ) + response = f"The column '{column_name}' does not contain numeric values that I can analyze." + except Exception as e: + response = f"An error occurred while calculating the {operation} of '{column_name}': {e}" + logger.error(f"Error calculating {operation} for column '{column_name}': {e}") + else: + response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" + + # Enhanced Filter data based on more complex conditions + # This section *updates* `new_filtered_df_state` based on the filter command. + # It should filter from the *full* dataframe (`df`). + filter_match = re.search( + r'(?:filter|show items|show me items|find entries|select items|get items)\s+' # Optional action phrases + r'(?:where|by|for|with|if)\s+' # Keyword indicating condition + r'(\w+)\s+' # Column name + r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+' # Operator + r'([\'"]?[\w\s.-]+[\'"]?)', # Value (allows spaces, dots, hyphens if quoted, or single words) + lower_message + ) - if filter_match: - column_name, operator, value_str = filter_match.groups() - column_name = column_name.strip() - operator = operator.strip().lower() - value_str = value_str.strip().strip("'\"") + if filter_match: + column_name, operator, value_str = filter_match.groups() + column_name = column_name.strip() + operator = operator.strip().lower() + value_str = value_str.strip().strip("'\"") - logger.info(f"Filter request: Column='{column_name}', Operator='{operator}', Value='{value_str}'") + logger.info(f"Filter request: Column='{column_name}', Operator='{operator}', Value='{value_str}'") - if df is None: - response = "No data available to filter. Please process data first." - new_filtered_df_state = None # Ensure state is None if no data - elif column_name not in df.columns: - response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" - new_filtered_df_state = None # Clear previous filter if column not found - else: # Always filter from the original full dataframe 'df' active_df_to_filter = df.copy() col_series_original = active_df_to_filter[column_name] # Use original series for type checks @@ -1221,6 +1455,7 @@ def respond_to_chat( target_value = float(value_str) # Apply numeric condition only where conversion was successful (not NaN) + # Use .loc to ensure alignment after potential dropna if needed, but here we use fillna(False) if operator == '==': condition = col_series_numeric == target_value elif operator == '!=': condition = col_series_numeric != target_value elif operator == '>': condition = col_series_numeric > target_value @@ -1259,7 +1494,10 @@ def respond_to_chat( # Handle boolean comparisons (if column type is bool or value looks like bool) elif operator in ['is', 'equals', '==', '!='] and (pd.api.types.is_bool_dtype(col_series_original) or value_str.lower() in ['true', 'false']): try: - col_series_bool = col_series_original.astype(bool) # Attempt to convert column to bool + # Attempt to convert column to bool, coercing errors + # Note: astype(bool) can be aggressive. 
pd.to_numeric(..., errors='coerce').astype(bool) might be safer + # but let's try direct astype first. + col_series_bool = col_series_original.astype(bool) target_value = value_str.lower() == 'true' # Convert value string to bool if operator in ['is', 'equals', '==']: @@ -1273,6 +1511,11 @@ def respond_to_chat( response = f"For boolean comparison on column '{column_name}', '{value_str}' is not a valid boolean value (true/false)." target_value = None condition = None + except Exception as e: # Catch other potential errors during bool conversion + response = f"Error converting column '{column_name}' to boolean for comparison: {e}" + target_value = None + condition = None + else: # If none of the above types matched, the operator is likely invalid for the column type @@ -1300,156 +1543,147 @@ def respond_to_chat( new_filtered_df_state = pd.DataFrame() # Store empty DF for "no results" response = f"No items found where '{column_name}' {operator} '{value_str}'." # If condition is None (e.g. bad operator or type mismatch error) and response not already set, set generic invalid op message. - elif not response: # Avoid overwriting specific error from type check - response = f"Unsupported operator '{operator}' for column '{column_name}'. Please check column type or operator." - new_filtered_df_state = None + # This check is now implicitly handled by the initial `if response:` check below this block. except ValueError as ve: # Specifically catch ValueError for target_value conversion response = f"Invalid value '{value_str}' for comparison on column '{column_name}'. {ve}" - new_filtered_df_state = None # Clear on value error + new_filtered_df_state = pd.DataFrame() # Clear on value error (use empty DF) logger.warning(f"ValueError during filter: {ve}") except Exception as e: - new_filtered_df_state = None # Clear on other errors + new_filtered_df_state = pd.DataFrame() # Clear on other errors (use empty DF) response = f"An error occurred while applying the filter: {e}" logger.error(f"Error applying filter (column='{column_name}', op='{operator}', val='{value_str}'): {e}") - # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results) - - # --- End of Enhanced Filter Logic --- - - # If `response` is still empty, it means no filter query was matched by the filter_match regex. - # In this case, new_filtered_df_state (initialized from current_filtered_df_state) remains unchanged. - - - # Request structured output (e.g., as CSV or simplified JSON) - # This section should act on the *original* df unless specifically asked for filtered data export. - # The new download buttons handle filtered data export separately. - # Let's assume for now it acts on the original df, and a separate command would be needed for "export filtered data" - # If no filter query matched, and no other specific df query matched, - # then `response` might still be empty. `new_filtered_df_state` will be the same as `current_filtered_df_state`. - # The general queries below should not reset `new_filtered_df_state` unless it's a "clear" command. - - elif "output as csv" in lower_message or "export as csv" in lower_message: - if df is not None and not df.empty: - csv_output = df.to_csv(index=False) - response = f"Here is the data in CSV format:\n```csv\n{csv_output[:1000]}...\n```\n(Output truncated for chat display)" - else: - response = "There is no data available to output as CSV." 
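For context on the filter handling above: once the regex yields a (column, operator, value) triple, the comparison is built as a pandas boolean mask against the full DataFrame. The condensed sketch below shows the same idea for a handful of operators; the sample DataFrame and message are hypothetical, and the real handler covers more cases.

```python
import re
import pandas as pd

df = pd.DataFrame({"category": ["book", "toy", "book"], "price": [12.0, 30.0, 8.5]})
message = "show items where price > 10"

match = re.search(r"where\s+(\w+)\s+(is|==|!=|>=|<=|>|<|contains)\s+(.+)", message.lower())
if match:
    column, operator, value_str = (g.strip().strip("'\"") for g in match.groups())
    series = df[column]
    if operator in (">", "<", ">=", "<=", "==", "!="):
        # Numeric comparison: coerce the column, rows that fail coercion compare as False
        numeric = pd.to_numeric(series, errors="coerce")
        target = float(value_str)
        ops = {">": numeric > target, "<": numeric < target,
               ">=": numeric >= target, "<=": numeric <= target,
               "==": numeric == target, "!=": numeric != target}
        condition = ops[operator].fillna(False)
    elif operator == "contains":
        condition = series.astype(str).str.contains(value_str, case=False, na=False)
    else:  # "is" treated as case-insensitive string equality
        condition = series.astype(str).str.lower() == value_str.lower()
    print(df[condition])
```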
- elif "output as json" in lower_message or "export as json" in lower_message: # Note: "export as json" is different from download buttons - if df is not None and not df.empty: - json_output = df.to_json(orient='records', indent=2) - response = f"Here is the data in JSON format:\n```json\n{json_output[:1000]}...\n```\n(Output truncated for chat display)" - else: - response = "There is no data available to output as JSON." - - # --- General Queries (if no DataFrame or specific query matched AND no filter was applied in this turn) --- - # These should not clear new_filtered_df_state unless it's a "clear chat" - if not response: # Only enter if no response has been generated by DataFrame/filter logic - if "how many items" in lower_message or "number of items" in lower_message: - # Check filtered state first, then full df, then raw chatbot_data list - if new_filtered_df_state is not None and not new_filtered_df_state.empty: - response = f"The currently filtered dataset has {len(new_filtered_df_state)} items." - if df is not None: - response += f" The original dataset has {len(df)} items." - elif df is not None: # Check df from original chatbot_data - response = f"There are {len(df)} items in the processed data." - elif isinstance(chatbot_data, list): # Fallback if df creation failed but chatbot_data is list - response = f"There are {len(chatbot_data)} top-level items in the processed data (not in DataFrame)." - elif isinstance(chatbot_data, dict): - response = "The processed data is a single dictionary, not a list of items." - else: - response = "The processed data is not a standard list or dictionary structure." - - elif "what is the structure" in lower_message or "tell me about the data" in lower_message: - # Describe filtered data structure if available, otherwise full data structure - if new_filtered_df_state is not None and not new_filtered_df_state.empty: - response = f"The filtered data is a table with {len(new_filtered_df_state)} rows and columns: {', '.join(new_filtered_df_state.columns)}. " - if df is not None: - response += f"The original data has columns: {', '.join(df.columns)}." - else: - response += "Original data structure is not tabular." - elif df is not None: - response = f"The data is a table with {len(df)} rows and columns: {', '.join(df.columns)}." - elif isinstance(chatbot_data, list) and chatbot_data: - sample_item = chatbot_data[0] - response = f"The data is a list containing {len(chatbot_data)} items. The first item has the following top-level keys: {list(sample_item.keys())}." - elif isinstance(chatbot_data, dict): - response = f"The data is a dictionary with the following top-level keys: {list(chatbot_data.keys())}." - else: - response = "The processed data is not a standard list or dictionary structure that I can easily describe." - - # "show me" without a filter condition might be ambiguous. - # Let's assume it refers to the original data or provide guidance. - elif "show me" in lower_message or "get me" in lower_message or "extract" in lower_message: - # This specific 'show me' without 'where' should not trigger a filter or clear existing filter state. - # It's a general request for data, which is too broad. Guide the user. - response = "If you want to filter the data, please use a phrase like 'show items where column_name is value'. If you want to see the raw data, consider using the download buttons." - - # --- Speculation about Modifications --- - # These responses are purely informative and do not modify data or state. 
- elif "how can i modify" in lower_message or "how to change" in lower_message or "can i add" in lower_message or "can i remove" in lower_message: - response = "I cannot directly modify the data here, but I can tell you how you *could* modify it programmatically. What kind of change are you considering (e.g., adding an item, changing a value, removing a field)?" - elif "add a field" in lower_message or "add a column" in lower_message: - response = "To add a field (or column if the data is tabular), you would typically iterate through each item (or row) in the data and add the new key-value pair. For example, adding a 'status' field with a default value." - elif "change a value" in lower_message or "update a field" in lower_message: - response = "To change a value, you would need to identify the specific item(s) and the field you want to update. You could use a condition (like filtering) to find the right items and then assign a new value to the field." - elif "remove a field" in lower_message or "delete a column" in lower_message: - response = "To remove a field, you would iterate through each item and delete the specified key. Be careful, as this is irreversible." - elif "restructure" in lower_message or "change the format" in lower_message: - response = "Restructuring data involves transforming it into a different shape. This could mean flattening nested objects, grouping items, or pivoting data. This often requires writing custom code to map the old structure to the new one." - elif "what if i" in lower_message or "if i changed" in lower_message: - response = "Tell me what specific change you're contemplating, and I can speculate on the potential impact or how you might approach it programmatically." - - # --- General Conversation / Fallback --- - elif "hello" in lower_message or "hi" in lower_message: - response = random.choice(["Hello! How can I help you understand the processed data?", "Hi there! What's on your mind about this data?", "Hey! Ask me anything about the data you've loaded."]) - elif "thank you" in lower_message or "thanks" in lower_message: - response = random.choice(["You're welcome!", "Glad I could help.", "No problem! Let me know if you have more questions about the data."]) - elif "clear chat" in lower_message: # This should be caught by button, but as text too - # Gradio handles clearing the chatbot component state via the button action. - # We just need to clear the filtered data state here. - response = "Chat history cleared." # Respond that chat is cleared - new_filtered_df_state = None # Also clear filtered data on "clear chat" command by text - elif not response: # Fallback if nothing else matched - response = random.choice([ - "I can analyze the data you've processed. What would you like to know? Try asking to filter data, e.g., 'show items where status is active'.", - "Ask me about the number of items, the structure, or values of specific fields. You can also filter data.", - "I can perform basic analysis or filter the data. For example: 'filter by price > 100'.", - "Tell me what you want to extract or filter from the data. Use phrases like 'show items where ...'.", - "I'm equipped to filter your data. Try 'find entries where name contains widget'." 
- ]) - - # --- End of main try block --- - except Exception, e: + # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results) + + # --- End of Enhanced Filter Logic --- + + # If `response` is still empty, check for other dataframe-based queries + if not response: + # Request structured output (e.g., as CSV or simplified JSON) + # This section should act on the *original* df unless specifically asked for filtered data export. + # The new download buttons handle filtered data export separately. + + if "output as csv" in lower_message or "export as csv" in lower_message: + if not df.empty: + csv_output = df.to_csv(index=False) + response = f"Here is the data in CSV format:\n```csv\n{csv_output[:1000]}...\n```\n(Output truncated for chat display)" + else: + response = "There is no data available to output as CSV." + elif "output as json" in lower_message or "export as json" in lower_message: # Note: "export as json" is different from download buttons + if not df.empty: + json_output = df.to_json(orient='records', indent=2) + response = f"Here is the data in JSON format:\n```json\n{json_output[:1000]}...\n```\n(Output truncated for chat display)" + else: + response = "There is no data available to output as JSON." + + # --- General Queries (if no specific query matched AND no filter was applied in this turn) --- + # These should not clear new_filtered_df_state unless it's a "clear chat" + # This block runs if df is None or if no specific df query matched. + if not response: # Only enter if no response has been generated by DataFrame/filter logic + if "how many items" in lower_message or "number of items" in lower_message: + # Check filtered state first, then full df, then raw chatbot_data list + if new_filtered_df_state is not None and not new_filtered_df_state.empty: + response = f"The currently filtered dataset has {len(new_filtered_df_state)} items." + if df is not None: + response += f" The original dataset has {len(df)} items." + elif df is not None: # Check df from original chatbot_data + response = f"There are {len(df)} items in the processed data." + elif isinstance(chatbot_data, list): # Fallback if df creation failed but chatbot_data is list + response = f"There are {len(chatbot_data)} top-level items in the processed data (not in DataFrame)." + elif isinstance(chatbot_data, dict): + response = "The processed data is a single dictionary, not a list of items." + else: + response = "The processed data is not a standard list or dictionary structure." + + elif "what is the structure" in lower_message or "tell me about the data" in lower_message: + # Describe filtered data structure if available, otherwise full data structure + if new_filtered_df_state is not None and not new_filtered_df_state.empty: + response = f"The filtered data is a table with {len(new_filtered_df_state)} rows and columns: {', '.join(new_filtered_df_state.columns)}. " + if df is not None: + response += f"The original data has columns: {', '.join(df.columns)}." + else: + response += "Original data structure is not tabular." + elif df is not None: + response = f"The data is a table with {len(df)} rows and columns: {', '.join(df.columns)}." + elif isinstance(chatbot_data, list) and chatbot_data: + # Provide structure of the first item if it's a dictionary + sample_item = chatbot_data[0] + if isinstance(sample_item, dict): + response = f"The data is a list containing {len(chatbot_data)} items. The first item has the following top-level keys: {list(sample_item.keys())}." 
+ else: + response = f"The data is a list containing {len(chatbot_data)} items. The first item is of type: {type(sample_item).__name__}." + elif isinstance(chatbot_data, dict): + response = f"The data is a dictionary with the following top-level keys: {list(chatbot_data.keys())}." + else: + response = "The processed data is not a standard list or dictionary structure that I can easily describe." + + # "show me" without a filter condition might be ambiguous. + # Let's assume it refers to the original data or provide guidance. + elif "show me" in lower_message or "get me" in lower_message or "extract" in lower_message: + # This specific 'show me' without 'where' should not trigger a filter or clear existing filter state. + # It's a general request for data, which is too broad. Guide the user. + response = "If you want to filter the data, please use a phrase like 'show items where column_name is value'. If you want to see the raw data, consider using the download buttons." + + # --- Speculation about Modifications --- + # These responses are purely informative and do not modify data or state. + elif "how can i modify" in lower_message or "how to change" in lower_message or "can i add" in lower_message or "can i remove" in lower_message: + response = "I cannot directly modify the data here, but I can tell you how you *could* modify it programmatically. What kind of change are you considering (e.g., adding an item, changing a value, removing a field)?" + elif "add a field" in lower_message or "add a column" in lower_message: + response = "To add a field (or column if the data is tabular), you would typically iterate through each item (or row) in the data and add the new key-value pair. For example, adding a 'status' field with a default value." + elif "change a value" in lower_message or "update a field" in lower_message: + response = "To change a value, you would need to identify the specific item(s) and the field you want to update. You could use a condition (like filtering) to find the right items and then assign a new value to the field." + elif "remove a field" in lower_message or "delete a column" in lower_message: + response = "To remove a field, you would iterate through each item and delete the specified key. Be careful, as this is irreversible." + elif "restructure" in lower_message or "change the format" in lower_message: + response = "Restructuring data involves transforming it into a different shape. This could mean flattening nested objects, grouping items, or pivoting data. This often requires writing custom code to map the old structure to the new one." + elif "what if i" in lower_message or "if i changed" in lower_message: + response = "Tell me what specific change you're contemplating, and I can speculate on the potential impact or how you might approach it programmatically." + + # --- General Conversation / Fallback --- + elif "hello" in lower_message or "hi" in lower_message: + response = random.choice(["Hello! How can I help you understand the processed data?", "Hi there! What's on your mind about this data?", "Hey! Ask me anything about the data you've loaded."]) + elif "thank you" in lower_message or "thanks" in lower_message: + response = random.choice(["You're welcome!", "Glad I could help.", "No problem! Let me know if you have more questions about the data."]) + elif "clear chat" in lower_message: # This should be caught by button, but as text too + # Gradio handles clearing the chatbot component state via the button action. 
+ # We just need to clear the filtered data state here. + response = "Chat history cleared." # Respond that chat is cleared + new_filtered_df_state = pd.DataFrame() # Clear filtered data on "clear chat" command by text (use empty DF) + elif not response: # Fallback if nothing else matched + response = random.choice([ + "I can analyze the data you've processed. What would you like to know? Try asking to filter data, e.g., 'show items where status is active'.", + "Ask me about the number of items, the structure, or values of specific fields. You can also filter data.", + "I can perform basic analysis or filter the data. For example: 'filter by price > 100'.", + "Tell me what you want to extract or filter from the data. Use phrases like 'show items where ...'.", + "I'm equipped to filter your data. Try 'find entries where name contains widget'." + ]) + + except Exception as e: # Catch errors during main chatbot logic (Try block 2 catch) logger.error(f"Chatbot runtime error: {e}") response = f"An internal error occurred while processing your request: {e}" response += "\nPlease try rephrasing your question or clear the chat history." - # On unexpected error, preserve the current_filtered_df_state rather than clearing or modifying it. - # new_filtered_df_state = current_filtered_df_state # This line is effectively already done by initialization - - # --- Finally block (optional, but good practice if cleanup is needed) --- - # finally: - # # Any cleanup code can go here - # pass + # new_filtered_df_state is already initialized to current_filtered_df_state, so it's preserved on error. + # --- Final Response Handling --- if not response: # Final safety net for response, if it's somehow still empty response = "I'm not sure how to respond to that. Please try rephrasing or ask for help on available commands." # Update the last message in chat history with the generated response # Find the last entry where the assistant's response is None - for i in reversed(range(len(chat_history))): - if chat_history[i][1] is None: - chat_history[i] = (chat_history[i][0], response) - break - # If no None placeholder was found (shouldn't happen with current logic), append as new entry - # else: - # chat_history.append((message, response)) - + # This logic assumes the user message was just appended as the last item with None + if chat_history and chat_history[-1][1] is None: + chat_history[-1][1] = response # Update the second element of the last tuple/list + else: + # Fallback just in case - append as a new entry + chat_history.append([message, response]) # Use original message here # Ensure chat_history is in the format Gradio expects for type='messages' # It should be a list of lists: [[user_msg, bot_msg], [user_msg, bot_msg], ...] # The current format List[Tuple[str, str]] works with type='messages' as tuples are treated like lists. + # However, modifying tuples is not possible, so ensure we are using lists if we modify in place. + # The initial append uses [message, None], so modifying chat_history[-1][1] is correct if it's a list. 
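One caveat on the history format discussed above: depending on the installed Gradio version, `gr.Chatbot(type='messages')` may expect role/content dictionaries rather than `[user, bot]` pairs. If that applies, a small adapter along these lines could convert the pair-style `chat_history` before it reaches the component; `pairs_to_messages` is a hypothetical helper, not part of app2.py.

```python
from typing import List, Optional

def pairs_to_messages(pairs: List[List[Optional[str]]]) -> List[dict]:
    """Convert [[user, bot], ...] history into role/content message dicts."""
    messages = []
    for user_msg, bot_msg in pairs:
        if user_msg is not None:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg is not None:
            messages.append({"role": "assistant", "content": bot_msg})
    return messages

history = [["how many items?", "There are 12 items in the processed data."],
           ["list columns", None]]  # bot reply still pending
print(pairs_to_messages(history))
```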
return chat_history, chatbot_data, new_filtered_df_state @@ -1552,10 +1786,60 @@ def create_modern_interface(): } """ with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface: - interface.head += """ - - """ + console.log("Updated Enabled QR Code Indices:", enabledStates); + + // Update visual style immediately + const itemDiv = checkbox.closest('.viewport-item'); + if (itemDiv) { + const img = itemDiv.querySelector('img'); + if (img) { + if (checkbox.checked) { + img.style.border = "2px solid green"; + img.style.opacity = "1.0"; + } else { + img.style.border = "2px solid lightgray"; + img.style.opacity = "0.5"; + } + } + } + }; + + // Initial setup might be needed if the viewport tab is the default or first shown + // setupViewportCheckboxes(); // This might run too early before elements exist. + // Rely on the tab.select event and _js parameter instead. + + ''') + with gr.Row(): crawl_depth_slider = gr.Slider( label="Crawl Depth", @@ -1584,8 +1892,8 @@ def create_modern_interface(): info="Select the maximum depth for crawling links (0-10)." ) - qr_code_paths = gr.State([]) - chatbot_data = gr.State(None) + qr_code_paths = gr.State([]) # Stores list of QR code file paths + chatbot_data = gr.State(None) # Stores the full processed data (List[Dict]) gr.Markdown(""" # 🌐 Advanced Data Processing & QR Code Generator Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor. @@ -1612,7 +1920,8 @@ def create_modern_interface(): ) with gr.Row(): example_btn = gr.Button("📝 Load Example", variant="secondary") - clear_btn = gr.Button("🗑️ Clear", variant="secondary") + clear_input_btn = gr.Button("🗑️ Clear Input", variant="secondary") # Renamed to avoid conflict + with gr.Row(): combine_data = gr.Checkbox( label="Combine all data into sequence", @@ -1641,11 +1950,16 @@ def create_modern_interface(): ) with gr.Tab("🖼️ QR Code Viewport") as viewport_tab: + # Hidden component for JS to update the state. Use elem_id for reliable targeting. 
+ enabled_qr_codes_hidden_state = gr.Textbox(value="[]", visible=False, label="__enabled_qr_codes_state__", elem_id="enabled_qr_codes_hidden_state_id") + enabled_qr_codes = gr.State([]) # The actual state variable updated by Python from the hidden textbox + + # The viewport_output will return HTML and trigger JS execution via _js viewport_output = gr.HTML(label="QR Code Sequence Viewport") - enabled_qr_codes = gr.State([]) + with gr.Tab("🤖 Chat with Data") as chat_tab: - chat_history = gr.State([]) + chat_history = gr.State([]) # Stores the chat history (List[List[str, str]]) chatbot = gr.Chatbot(label="Data Chatbot", type='messages') # Set type to 'messages' filtered_chatbot_df_state = gr.State(None) # To store the filtered DataFrame @@ -1655,9 +1969,11 @@ def create_modern_interface(): with gr.Row(): download_full_json_btn = gr.Button("Download Full JSON") download_filtered_json_btn = gr.Button("Download Filtered JSON") - download_file_output = gr.File(label="Download Data", interactive=False) # For triggering download + # A dummy File component to trigger downloads + download_file_output = gr.File(label="Download Data", interactive=False) clear_chat_btn = gr.Button("Clear Chat History") + # Event handlers must be defined within the Blocks context def load_example(): @@ -1689,13 +2005,23 @@ def create_modern_interface(): } return json.dumps(example, indent=2) - def clear_input(): + def clear_inputs(): # Clear all input fields and the chatbot data state - return "", None, "", None + # Also clear QR related states and chat history + return "", None, "", None, [], "[]", [], None # url, file, text, chatbot_data, qr_code_paths, enabled_qr_codes_hidden_state, chat_history, filtered_chatbot_df_state - def update_viewport(paths, enabled_states): + def update_viewport(paths, enabled_states_json_str): + """Updates the HTML viewport based on QR paths and enabled state.""" if not paths: - return "
No QR codes generated yet."
+            # Return HTML and an empty JS script
+            return "No QR codes generated yet.
", "" + + try: + enabled_states = json.loads(enabled_states_json_str) + if not isinstance(enabled_states, list): enabled_states = [] + except (json.JSONDecodeError, TypeError): + logger.warning("Failed to decode enabled_states JSON string. Resetting to empty list.") + enabled_states = [] num_qr_codes = len(paths) # Determine grid columns based on the number of QRs, aiming for a roughly square layout @@ -1704,26 +2030,30 @@ def create_modern_interface(): viewport_html = f'
' - # Ensure enabled_states is a list of indices if it's None or doesn't match current paths - if enabled_states is None or len(enabled_states) != num_qr_codes: - enabled_states = list(range(num_qr_codes)) + # Ensure enabled_states only contains valid indices + valid_enabled_states = [idx for idx in enabled_states if 0 <= idx < num_qr_codes] for i, path in enumerate(paths): - is_enabled = i in enabled_states - border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;" - opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;" + is_enabled = i in valid_enabled_states + # Initial border/opacity are set by JS on load/update based on checkbox state + # This makes the HTML simpler. + # border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;" + # opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;" # Use /file= prefix for Gradio to serve local files + # Add data-index and onchange handler to the checkbox viewport_html += f'
' - viewport_html += f'QR Code {i+1}' + viewport_html += f'QR Code {i+1}' # Style applied by JS # Add checkbox with data-index for JS to identify which QR it controls viewport_html += f'' viewport_html += '
' viewport_html += '
' - return viewport_html + # Return HTML and an empty JS script. The setupViewportCheckboxes() is called via _js on the select event. + return viewport_html, "" + def on_qr_generation(qr_paths_list): - """Handler to initialize enabled_qr_codes state after QR generation.""" + """Handler to initialize qr_code_paths and enabled_qr_codes states after QR generation.""" if qr_paths_list is None: num_qrs = 0 else: @@ -1731,8 +2061,10 @@ def create_modern_interface(): # Initially enable all generated QR codes initial_enabled_states = list(range(num_qrs)) - # Return the paths list and the initial enabled states - return qr_paths_list, initial_enabled_states + initial_enabled_states_json = json.dumps(initial_enabled_states) + + # Return the paths list and the initial enabled states (both list and JSON string) + return qr_paths_list, initial_enabled_states, initial_enabled_states_json def process_inputs(urls, files, text, combine, crawl_depth, generate_qr_enabled): """Process all inputs and generate QR codes based on toggle""" @@ -1746,13 +2078,18 @@ def create_modern_interface(): if text and text.strip(): try: json_data = json.loads(text) - results.append({ - 'source': 'json_input', - 'extracted_data': json_data, - 'timestamp': datetime.now().isoformat(), - 'processing_notes': ['Parsed from direct JSON input.'] - }) - processing_status_messages.append("✅ Successfully parsed direct JSON input.") + # Wrap single JSON objects in a list for consistency with file/url output + if not isinstance(json_data, list): + json_data = [json_data] + + for item in json_data: + results.append({ + 'source': 'json_input', + 'extracted_data': item, # Store each item separately + 'timestamp': datetime.now().isoformat(), + 'processing_notes': ['Parsed from direct JSON input.'] + }) + processing_status_messages.append(f"✅ Successfully parsed {len(json_data)} item(s) from direct JSON input.") except json.JSONDecodeError as e: processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}") logger.error(f"Invalid JSON format in text input: {e}") @@ -1768,38 +2105,63 @@ def create_modern_interface(): # Call fetch_content_with_depth which handles recursion content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth) - # The result from fetch_content_with_depth is already structured - # It includes the main fetch_result and linked_extractions + # The result from fetch_content_with_depth is a nested structure. + # We need to flatten this structure into a list of items for the main results list. 
+ def flatten_crawl_results(crawl_result_item): + flat_list = [] + # Add the current item's fetch result if it exists and was successful + fetch_res = crawl_result_item.get('fetch_result') + if fetch_res and fetch_res.get('metadata', {}).get('status_code') is not None and 200 <= fetch_res['metadata']['status_code'] < 300: + # Copy the item and add level info to the top level + item_copy = fetch_res.copy() + item_copy['source'] = 'url_crawl' + item_copy['original_request_url'] = crawl_result_item.get('url') + item_copy['crawl_level'] = crawl_result_item.get('level') + # Combine notes from the recursive call structure and the fetch_result structure + notes_from_crawl_structure = crawl_result_item.get('processing_notes', []) + notes_from_fetch_result = item_copy.get('processing_notes', []) + # Avoid duplicating notes already present in fetch_result if they came from the recursive call + combined_notes = list(set(notes_from_crawl_structure + notes_from_fetch_result)) + item_copy['processing_notes'] = combined_notes + flat_list.append(item_copy) + elif crawl_result_item: + # Add a placeholder for items that failed to fetch or were skipped + flat_list.append({ + 'source': 'url_crawl_skipped', + 'url': crawl_result_item.get('url'), + 'crawl_level': crawl_result_item.get('level'), + 'processing_notes': crawl_result_item.get('processing_notes', []) + }) + + # Recursively add linked extractions + for linked_result in crawl_result_item.get('linked_extractions', []): + flat_list.extend(flatten_crawl_results(linked_result)) + return flat_list + if content_result: # Check if a result dictionary was returned - results.append(content_result) - # Provide status based on the fetch_result within the recursive structure - main_fetch_status = content_result.get('fetch_result', {}).get('status_code') - if main_fetch_status is not None and 200 <= main_fetch_status < 300: - processing_status_messages.append(f"✅ Processed URL: {url} (Level 0, Status: {main_fetch_status})") - if content_result.get('processing_notes'): - processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}") - - # Count successfully processed linked pages - def count_successful_fetches(crawl_result): - count = 0 - if crawl_result and crawl_result.get('fetch_result') is not None: - status = crawl_result['fetch_result'].get('status_code') - if status is not None and 200 <= status < 300: - count += 1 - for linked_result in crawl_result.get('linked_extractions', []): - count += count_successful_fetches(linked_result) - return count - - total_attempted_links = len(content_result.get('linked_extractions', [])) - total_successful_linked = count_successful_fetches({'linked_extractions': content_result.get('linked_extractions', [])}) # Wrap to match expected structure - - if total_attempted_links > 0: - processing_status_messages.append(f" Processed {total_successful_linked}/{total_attempted_links} linked pages up to depth {crawl_depth}.") + flat_crawl_data = flatten_crawl_results(content_result) + results.extend(flat_crawl_data) + + # Provide status based on the root URL fetch result + root_fetch_result = content_result.get('fetch_result') + root_fetch_status = root_fetch_result.get('metadata', {}).get('status_code') if root_fetch_result else None + + if root_fetch_result and root_fetch_status is not None and 200 <= root_fetch_status < 300: + processing_status_messages.append(f"✅ Processed root URL: {url} (Level 0, Status: {root_fetch_status})") + # Count successful fetches within the flattened results + 
successful_fetches_count = sum(1 for item in flat_crawl_data if item.get('source') == 'url_crawl' and item.get('metadata', {}).get('status_code') is not None and 200 <= item['metadata']['status_code'] < 300) + total_nodes_count = len(flat_crawl_data) # Count all items in the flattened list + + if total_nodes_count > 1: # Only report linked pages if depth > 0 and links were found + processing_status_messages.append(f" Fetched {successful_fetches_count}/{total_nodes_count} pages in crawl up to depth {crawl_depth}.") else: - processing_status_messages.append(f"❌ Failed to fetch or process URL: {url} (Status: {main_fetch_status})") - if content_result.get('processing_notes'): - processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}") + processing_status_messages.append(f"❌ Failed to fetch or process root URL: {url} (Status: {root_fetch_status if root_fetch_status is not None else 'N/A'})") + + # Add notes from the root processing + if content_result.get('processing_notes'): + processing_status_messages.append(f" Notes for {url} (root): {'; '.join(content_result['processing_notes'])}") + else: processing_status_messages.append(f"❌ Failed to process URL: {url} (No result returned)") @@ -1807,6 +2169,8 @@ def create_modern_interface(): if files: for file in files: processing_status_messages.append(f"📁 Processing file: {file.name}...") + # Pass temp_dir info from the outer scope + # Note: _process_file creates its *own* temp dir context internally now file_results = file_processor.process_file(file) if file_results: results.extend(file_results) @@ -1840,6 +2204,8 @@ def create_modern_interface(): else: processing_status_messages.append("⚠️ No valid content collected from inputs.") final_json_output = [] # Ensure output_json is cleared if no results + qr_paths = [] # Ensure qr_paths is cleared if no results + except Exception as e: logger.error(f"Overall processing error in process_inputs: {e}") @@ -1848,11 +2214,12 @@ def create_modern_interface(): qr_paths = [] # Clear qrs on unexpected error # Return the processed data, QR paths, status messages, and update chatbot_data state + # The qr_code_paths state and enabled_qr_codes state are updated in the .then() block return ( - final_json_output, - [str(path) for path in qr_paths], # Return paths as strings for Gradio Gallery - "\n".join(processing_status_messages), - final_json_output # Update chatbot_data state + final_json_output, # output_json + [str(path) for path in qr_paths], # output_gallery (paths as strings) + "\n".join(processing_status_messages), # output_text + final_json_output # chatbot_data state ) # --- Download Logic --- @@ -1863,6 +2230,7 @@ def create_modern_interface(): return None try: # Convert DataFrame to list of dictionaries + # Use orient='records' for list of dicts format data_list = data_df.to_dict(orient='records') json_str = json.dumps(data_list, indent=2, ensure_ascii=False) @@ -1916,6 +2284,9 @@ def create_modern_interface(): # Handle cases where top-level items might not be dicts, wrap them elif isinstance(item, (list, str, int, float, bool, type(None))): flat_data.append({'item_value': item}) + else: + # Include a note for unsupported types in the flat data + flat_data.append({'unsupported_item_type': str(type(item))}) if not flat_data: @@ -1945,10 +2316,24 @@ def create_modern_interface(): # Pass the DataFrame directly to the generic download function return download_json_data(current_filtered_df_state, "filtered_data") + # Handler to update the enabled_qr_codes State 
from the hidden Textbox updated by JS + def update_enabled_qr_codes_state(enabled_states_json_str): + try: + enabled_states = json.loads(enabled_states_json_str) + if isinstance(enabled_states, list): + return enabled_states + else: + logger.warning("Received non-list data for enabled_qr_codes state.") + return [] # Reset to empty list on invalid data + except (json.JSONDecodeError, TypeError): + logger.warning("Failed to decode enabled_states JSON string. Resetting to empty list.") + return [] # Reset to empty list on decode error + # Connect event handlers within the Blocks context example_btn.click(load_example, inputs=[], outputs=text_input) - clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data]) + # Clear inputs button now clears all input components and relevant states + clear_input_btn.click(clear_inputs, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data, qr_code_paths, enabled_qr_codes_hidden_state, chat_history, filtered_chatbot_df_state]) process_btn.click( process_inputs, @@ -1956,13 +2341,30 @@ def create_modern_interface(): outputs=[output_json, output_gallery, output_text, chatbot_data] ).then( # This .then() is triggered after process_inputs completes and updates output_gallery + # It initializes the qr_code_paths state and the enabled_qr_codes state (both list and hidden JSON string) on_qr_generation, inputs=[output_gallery], # Pass the list of QR paths from the gallery output - outputs=[qr_code_paths, enabled_qr_codes] # Update the state variables + outputs=[qr_code_paths, enabled_qr_codes, enabled_qr_codes_hidden_state] # Update the state variables + ) + + # When the hidden enabled_qr_codes_hidden_state Textbox is updated by JS, + # update the actual enabled_qr_codes State variable. + enabled_qr_codes_hidden_state.change( + update_enabled_qr_codes_state, + inputs=[enabled_qr_codes_hidden_state], + outputs=[enabled_qr_codes] ) - # When the viewport tab is selected, update the viewport HTML - viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output]) + + # When the viewport tab is selected, update the viewport HTML and trigger JS setup + # Use the hidden JSON string state as input because the JS updates it directly + viewport_tab.select( + update_viewport, + inputs=[qr_code_paths, enabled_qr_codes_hidden_state], + outputs=[viewport_output], + # The _js parameter allows running JS after the output is updated + _js="setupViewportCheckboxes();" + ) # Chatbot send button and text input submit events send_msg_btn.click( @@ -1989,10 +2391,10 @@ def create_modern_interface(): # Clear chat history button clear_chat_btn.click( - # Clear chat history component and the filtered data state - lambda: ([], None), + # Clear chat history component, the filtered data state, and the chat_history state variable + lambda: ([], None, []), # Clear chatbot component, filtered_df state, and chat_history state inputs=None, - outputs=[chatbot, filtered_chatbot_df_state] + outputs=[chatbot, filtered_chatbot_df_state, chat_history] ) # Download buttons @@ -2012,23 +2414,23 @@ def create_modern_interface(): ### 🚀 Features - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth. 
**(Now performs real fetching)** - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*). **(Now performs real file processing)** - - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs. - - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives. **(Now performs real extraction)** + - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs. Handles single JSON objects or lists. + - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz, .bz2, .xz archives. Includes basic Zip Slip/Tar Slip prevention. **(Now performs real extraction)** - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification. - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item. - - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data. + - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data. Uses a more robust chunking method based on byte length. - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing. - **Modern Design**: Clean, responsive interface with visual feedback. - - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information. + - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information. Supports filtering data based on column values. ### 💡 Tips - 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**. - 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats. + 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**. Crawling is limited to the same domain or subdomains for safety and relevance. + 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats. Supported archive types include .zip, .tar, .gz, .tgz, .tar.gz, .bz2, .tar.bz2, .xz, .tar.xz. Support for .7z and .rar requires external libraries. 3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure. - 4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries (`PyPDF2`, `python-docx`, `pyth`, `odfpy`). Check the console logs for warnings if a library is missing. - 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. + 4. 
**Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries (`PyPDF2`, `python-docx`, `pyth`, `odfpy`). Check the console logs for warnings if a library is missing. `pillow[extra]` might be needed for some image features (though not strictly used for QR generation itself here). + 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. QR code generation must be explicitly enabled. 6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps. 7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images. - 8. **Chatbot**: After processing data, go to the "Chat with Data" tab to ask questions about the JSON output. + 8. **Chatbot**: After processing data, go to the "Chat with Data" tab to ask questions about the JSON output. You can filter data using phrases like 'show items where price > 100' or 'filter by category is electronics'. ### ⚙️ QR Code Viewport Instructions 1. Navigate to the **QR Code Viewport** tab after generating QR codes. 2. The generated QR codes will be displayed in a grid based on their total count. @@ -2051,8 +2453,11 @@ def main(): except Exception as e: logger.error(f"Application startup error: {e}") print(f"\nFatal Error: {e}\nCheck the logs for details.") - raise + # Optionally log traceback for more detailed error info + # import traceback + # logger.error(traceback.format_exc()) + raise # Re-raise the exception after logging if __name__ == "__main__": # Ensure the script is run directly (not imported) - main() \ No newline at end of file + main()
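As a footnote to the archive-support bullet above, which mentions basic Zip Slip/Tar Slip prevention: the essence of that check is to resolve each member's destination path and refuse anything that would escape the extraction directory. A minimal sketch under that assumption; `safe_extract_zip` is an illustrative helper name, not the function used in app2.py.

```python
import os
import zipfile

def safe_extract_zip(archive_path: str, dest_dir: str) -> list:
    """Extract a zip archive, skipping members that would escape dest_dir."""
    extracted = []
    dest_root = os.path.realpath(dest_dir)
    with zipfile.ZipFile(archive_path) as zf:
        for member in zf.namelist():
            target = os.path.realpath(os.path.join(dest_root, member))
            # Zip Slip check: the resolved target must stay inside dest_dir
            if not target.startswith(dest_root + os.sep):
                continue  # skip entries like "../../etc/passwd"
            zf.extract(member, dest_root)
            extracted.append(target)
    return extracted
```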