diff --git "a/app2.py" "b/app2.py" --- "a/app2.py" +++ "b/app2.py" @@ -26,6 +26,10 @@ from PIL import Image, ImageDraw, ImageFont # ImageFont may require pillow[extra import numpy as np # Unused in provided code, kept for completeness import tarfile import gzip +import bz2 # Added for bz2 support +import lzma # Added for xz support +# import py7zr # Requires external dependency, not standard library +# import rarfile # Requires external dependency, not standard library import math import random import pandas as pd @@ -101,24 +105,34 @@ class EnhancedURLProcessor: def validate_url(self, url: str) -> Dict[str, Any]: """Enhanced URL validation with accessibility check.""" - if not validators.url(url): + if not isinstance(url, str) or not url.strip(): + return {'is_valid': False, 'message': 'URL input is empty or invalid type', 'details': 'Input must be a non-empty string'} + + cleaned_url = url.strip() + + if not validators.url(cleaned_url): return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'} - parsed = urlparse(url) + parsed = urlparse(cleaned_url) if not all([parsed.scheme, parsed.netloc]): return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'} + # Basic check for scheme + if parsed.scheme.lower() not in ['http', 'https']: + return {'is_valid': False, 'message': 'Unsupported scheme', 'details': 'Only http and https are supported'} + try: # Use a HEAD request to check accessibility without downloading full content headers = {'User-Agent': self.user_agent.random} - response = self.session.head(url, timeout=self.timeout, headers=headers, allow_redirects=True) + response = self.session.head(cleaned_url, timeout=self.timeout, headers=headers, allow_redirects=True) response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) # Check content type if available in HEAD response content_type = response.headers.get('Content-Type', '').split(';')[0].strip() - if not content_type or not (content_type.startswith('text/') or 'json' in content_type or 'xml' in content_type): - # Basic check if content type seems relevant for text extraction - logger.warning(f"URL {url} returned potentially irrelevant content type: {content_type}") + # Basic check if content type seems relevant for text extraction + # Allow text, json, xml, and potentially others that might contain text + if not content_type or not (content_type.startswith('text/') or 'json' in content_type or 'xml' in content_type or 'application/octet-stream' in content_type): + logger.warning(f"URL {cleaned_url} returned potentially irrelevant content type: {content_type}") # Decide if this should invalidate the URL or just add a note # For now, we'll allow fetching but add a note. 
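# A minimal standalone sketch of the validation flow above (non-empty string, http/https
# scheme check, then a HEAD request), assuming `validators` and `requests` are installed.
# `quick_validate_url` is a hypothetical helper name, not something defined in app2.py;
# the real method also inspects Content-Type and reuses a session with a random User-Agent.
import requests
import validators
from urllib.parse import urlparse

def quick_validate_url(url: str, timeout: int = 10) -> dict:
    if not isinstance(url, str) or not url.strip():
        return {'is_valid': False, 'message': 'URL input is empty or invalid type'}
    url = url.strip()
    if not validators.url(url):
        return {'is_valid': False, 'message': 'Invalid URL format'}
    if urlparse(url).scheme.lower() not in ('http', 'https'):
        return {'is_valid': False, 'message': 'Only http and https are supported'}
    try:
        # HEAD checks reachability without downloading the body; some servers reject
        # HEAD, so a production version might fall back to a streamed GET.
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        return {'is_valid': True, 'message': f'Accessible (status {response.status_code})'}
    except requests.exceptions.RequestException as exc:
        return {'is_valid': False, 'message': f'URL not accessible: {exc}'}

# Example: quick_validate_url('https://example.com') -> {'is_valid': True, ...}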
@@ -133,14 +147,21 @@ class EnhancedURLProcessor: } } except requests.exceptions.RequestException as e: - return {'is_valid': False, 'message': 'URL not accessible', 'details': str(e)} + logger.error(f"URL validation failed for {cleaned_url}: {e}") + # Capture status code from response if available + status_code = getattr(e.response, 'status_code', None) + return {'is_valid': False, 'message': 'URL not accessible', 'details': f"{str(e)} (Status: {status_code})"} except Exception as e: - logger.error(f"Unexpected error during URL validation for {url}: {e}") + logger.error(f"Unexpected error during URL validation for {cleaned_url}: {e}") return {'is_valid': False, 'message': 'Unexpected validation error', 'details': str(e)} def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]: """Enhanced content fetcher with retry mechanism and complete character extraction.""" + raw_content: Optional[str] = None + metadata: Dict[str, Any] = {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None} + processing_notes: List[str] = [] + try: logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1})") headers = {'User-Agent': self.user_agent.random} @@ -152,32 +173,54 @@ class EnhancedURLProcessor: # Attempt to detect encoding if not specified in headers encoding = response.encoding # requests attempts to guess encoding - if encoding is None or encoding == 'ISO-8859-1': # Fallback if requests guess is default/uncertain + # Fallback if requests guess is default/uncertain or explicitly wrong + # Use chardet if requests' guess is default or if content is large enough for reliable detection + if encoding is None or encoding == 'ISO-8859-1' or len(response.content) > 1000: # Re-check with chardet when the guess is a default or the body is large enough for reliable detection try: encoding_detection = chardet.detect(response.content) - encoding = encoding_detection['encoding'] or 'utf-8' - logger.debug(f"Chardet detected encoding: {encoding} for {url}") + # Use detected encoding if confidence is high, otherwise default to utf-8 + if encoding_detection and encoding_detection['confidence'] > 0.8: + encoding = encoding_detection['encoding'] + logger.debug(f"Chardet detected encoding with high confidence: {encoding} for {url}") + else: + encoding = 'utf-8' + logger.debug(f"Chardet detection confidence too low or failed for {url}. Falling back to utf-8. Detection result: {encoding_detection}") except Exception as e: logger.warning(f"Chardet detection failed for {url}: {e}. Falling back to utf-8.") encoding = 'utf-8' + else: + logger.debug(f"Requests detected encoding: {encoding} for {url}") + + + # Decode content using the determined encoding + try: + raw_content = response.content.decode(encoding, errors='replace') + if encoding != 'utf-8': + processing_notes.append(f"Decoded using detected encoding: {encoding}") + if '\ufffd' in raw_content: # Check for the Unicode replacement character inserted by errors='replace' + processing_notes.append("Note: Character replacement occurred during decoding.") + except Exception as e: + logger.warning(f"Failed to decode content with encoding {encoding} for {url}: {e}. 
Trying utf-8 with ignore.") + raw_content = response.content.decode('utf-8', errors='ignore') + processing_notes.append(f"Decoding with {encoding} failed, used utf-8 with ignore: {e}") - raw_content = response.content.decode(encoding, errors='replace') - # Extract metadata - metadata = { - 'original_url': url, + # Update metadata with successful fetch details + metadata.update({ 'final_url': final_url, - 'timestamp': datetime.now().isoformat(), 'detected_encoding': encoding, 'content_type': content_type, 'content_length': len(response.content), 'headers': dict(response.headers), 'status_code': response.status_code - } + }) # Process based on content type processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url) + # Merge processing notes from decoding and content processing + processing_notes.extend(processed_extraction['notes']) + return { 'source': 'url', @@ -185,27 +228,32 @@ class EnhancedURLProcessor: 'raw_content': raw_content, 'metadata': metadata, 'extracted_data': processed_extraction['data'], - 'processing_notes': processed_extraction['notes'] + 'processing_notes': processing_notes } except requests.exceptions.RequestException as e: logger.error(f"Failed to fetch content from {url}: {e}") + # Capture status code from response if available + status_code = getattr(e.response, 'status_code', None) + metadata['status_code'] = status_code + processing_notes.append(f"Failed to fetch content: {str(e)}") return { 'source': 'url', 'url': url, - 'raw_content': None, - 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': getattr(e.response, 'status_code', None)}, + 'raw_content': raw_content, # raw_content might be partially decoded + 'metadata': metadata, 'extracted_data': None, - 'processing_notes': [f"Failed to fetch content: {str(e)}"] + 'processing_notes': processing_notes } except Exception as e: logger.error(f"Unexpected error while fetching or processing URL {url}: {e}") + processing_notes.append(f"Unexpected processing error: {str(e)}") return { 'source': 'url', 'url': url, - 'raw_content': raw_content if 'raw_content' in locals() else None, - 'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, + 'raw_content': raw_content, + 'metadata': metadata, 'extracted_data': None, - 'processing_notes': [f"Unexpected processing error: {str(e)}"] + 'processing_notes': processing_notes } @@ -225,12 +273,12 @@ class EnhancedURLProcessor: extracted_data = json.loads(content) notes.append("Parsed as JSON") except json.JSONDecodeError as e: - extracted_data = content - notes.append(f"Failed to parse as JSON: {e}") + extracted_data = content # Store raw content if parsing fails + notes.append(f"Failed to parse as JSON: {e}. Stored raw text.") logger.warning(f"Failed to parse JSON from {base_url}: {e}") except Exception as e: - extracted_data = content - notes.append(f"Error processing JSON: {e}") + extracted_data = content # Store raw content on other errors + notes.append(f"Error processing JSON: {e}. 
Stored raw text.") logger.error(f"Error processing JSON from {base_url}: {e}") elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'): logger.debug(f"Processing XML content from {base_url}") @@ -240,12 +288,12 @@ class EnhancedURLProcessor: extracted_data = xml_text notes.append("Parsed as XML (text representation)") except ET.ParseError as e: - extracted_data = content - notes.append(f"Failed to parse as XML: {e}") + extracted_data = content # Store raw content if parsing fails + notes.append(f"Failed to parse as XML: {e}. Stored raw text.") logger.warning(f"Failed to parse XML from {base_url}: {e}") except Exception as e: - extracted_data = content - notes.append(f"Error processing XML: {e}") + extracted_data = content # Store raw content on other errors + notes.append(f"Error processing XML: {e}. Stored raw text.") logger.error(f"Error processing XML from {base_url}: {e}") elif 'text/plain' in lower_content_type or 'text/' in lower_content_type: logger.debug(f"Processing Plain Text content from {base_url}") @@ -257,7 +305,7 @@ class EnhancedURLProcessor: notes.append(f"Unknown content type '{content_type}'. Stored raw text.") except Exception as e: logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}") - extracted_data = content + extracted_data = content # Ensure raw content is stored on unexpected error notes.append(f"Unexpected processing error: {e}. Stored raw text.") return {'data': extracted_data, 'notes': notes} @@ -285,17 +333,17 @@ class EnhancedURLProcessor: if href and not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')): text = a_tag.get_text().strip() try: + # Use urljoin to create absolute URL absolute_url = urljoin(base_url, href) - if absolute_url not in unique_links: + # Basic check if the joined URL is valid before adding + if validators.url(absolute_url) and absolute_url not in unique_links: extracted['links'].append({'text': text, 'url': absolute_url}) unique_links.add(absolute_url) - except Exception: - if validators.url(href) and href not in unique_links: - extracted['links'].append({'text': text, 'url': href}) - unique_links.add(href) - elif urlparse(href).netloc and href not in unique_links: - extracted['links'].append({'text': text, 'url': href}) - unique_links.add(href) + elif not validators.url(absolute_url): + logger.debug(f"Skipping invalid joined URL: {absolute_url}") + except Exception as e: + logger.debug(f"Error joining URL {href} with base {base_url}: {e}. Skipping link.") + soup_copy = BeautifulSoup(content, 'html.parser') for script_or_style in soup_copy(["script", "style"]): @@ -307,10 +355,16 @@ class EnhancedURLProcessor: except Exception as e: logger.error(f"Enhanced HTML processing error for {base_url}: {e}") - soup_copy = BeautifulSoup(content, 'html.parser') - for script_or_style in soup_copy(["script", "style"]): - script_or_style.extract() - extracted['full_text'] = soup_copy.get_text(separator='\n').strip() + # Fallback to basic text extraction on error + try: + soup_copy = BeautifulSoup(content, 'html.parser') + for script_or_style in soup_copy(["script", "style"]): + script_or_style.extract() + extracted['full_text'] = soup_copy.get_text(separator='\n').strip() + except Exception as soup_e: + logger.error(f"Fallback HTML text extraction failed for {base_url}: {soup_e}") + extracted['full_text'] = "Failed to extract text." 
# Indicate total failure + extracted['processing_error'] = f"Enhanced HTML processing failed: {e}" return extracted @@ -335,7 +389,7 @@ class EnhancedURLProcessor: 'level': 0, 'fetch_result': None, 'linked_extractions': [], - 'processing_notes': [f"Initial URL validation failed: {validation_result['message']}"] + 'processing_notes': [f"Initial URL validation failed: {validation_result['message']}. Details: {validation_result['details']}"] } # Use a set to keep track of visited URLs during the crawl to avoid infinite loops @@ -344,6 +398,18 @@ class EnhancedURLProcessor: def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int, visited_urls: set) -> Dict[str, Any]: """Recursive helper function to fetch content and follow links.""" + # Basic check for URL length to prevent excessively long URLs causing issues + if len(url) > 2000: # Arbitrary limit to prevent extremely long URLs + logger.warning(f"Skipping excessively long URL: {url[:100]}... at level {current_step}.") + return { + 'url': url, + 'level': current_step, + 'fetch_result': None, + 'linked_extractions': [], + 'processing_notes': ["URL is excessively long."] + } + + if current_step > max_steps: logger.debug(f"Depth limit ({max_steps}) reached for {url} at level {current_step}.") return { @@ -355,10 +421,26 @@ class EnhancedURLProcessor: } # Normalize URL before checking visited set - normalized_url = url.rstrip('/') # Simple normalization + # Simple normalization: lowercase scheme and netloc, remove trailing slash + try: + parsed_url = urlparse(url) + # Normalize scheme and netloc to handle http vs https and www vs non-www consistently + normalized_netloc = parsed_url.netloc.lower() + # Optional: remove 'www.' if present, but this might be too aggressive depending on site structure + # if normalized_netloc.startswith('www.'): + # normalized_netloc = normalized_netloc[4:] + + normalized_url = parsed_url._replace( + scheme=parsed_url.scheme.lower(), + netloc=normalized_netloc + ).geturl().rstrip('/') + except Exception as e: + logger.warning(f"Failed to parse/normalize URL {url}: {e}. Using original URL for visited check.") + normalized_url = url.rstrip('/') + if normalized_url in visited_urls: - logger.debug(f"Skipping already visited URL: {url} at level {current_step}.") + logger.debug(f"Skipping already visited URL: {url} (normalized: {normalized_url}) at level {current_step}.") return { 'url': url, 'level': current_step, @@ -373,22 +455,39 @@ class EnhancedURLProcessor: fetch_result = self.fetch_content(url) linked_extractions: List[Dict[str, Any]] = [] - if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower(): + # Only attempt to extract links and recurse if the fetch was successful and it was HTML + # Check if fetch_result is not None and has a successful status code + if fetch_result and fetch_result.get('metadata', {}).get('status_code') is not None and 200 <= fetch_result['metadata']['status_code'] < 300 and \ + fetch_result.get('extracted_data') and isinstance(fetch_result['extracted_data'], dict) and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower(): + extracted_data = fetch_result['extracted_data'] links = extracted_data.get('links', []) logger.info(f"Found {len(links)} potential links on {url} at level {current_step}. 
Proceeding to depth {current_step + 1}.") if current_step < max_steps: - for link_info in links: + # Limit the number of links followed per page to prevent excessive crawling + max_links_per_page = 20 # Arbitrary limit + for link_info in links[:max_links_per_page]: linked_url = link_info.get('url') if linked_url: - # Ensure linked URL is absolute and potentially within the same domain # Simple same-domain check (can be made more sophisticated) try: - base_domain = urlparse(url).netloc - linked_domain = urlparse(linked_url).netloc + base_domain = urlparse(url).netloc.lower() + linked_domain = urlparse(linked_url).netloc.lower() + base_scheme = urlparse(url).scheme.lower() + linked_scheme = urlparse(linked_url).scheme.lower() + + # Only follow http/https links + if linked_scheme not in ['http', 'https']: + logger.debug(f"Skipping non-http/https link: {linked_url}") + continue + # Allow processing if domains match OR if linked_domain is empty (relative link) - if linked_domain and linked_domain != base_domain: + # Also allow if linked_domain is a subdomain of base_domain + is_same_domain = linked_domain == base_domain + is_subdomain = linked_domain.endswith('.' + base_domain) if base_domain else False + + if linked_domain and not is_same_domain and not is_subdomain: logger.debug(f"Skipping external link: {linked_url}") continue # Skip external links @@ -398,12 +497,14 @@ class EnhancedURLProcessor: linked_extractions.append(linked_result) except Exception as e: logger.warning(f"Error processing linked URL {linked_url} from {url}: {e}") + if len(links) > max_links_per_page: + logger.info(f"Truncated link following to {max_links_per_page} links on {url}.") + + current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed or skipped.'] + # Add a note indicating the level processed + current_notes.append(f"Processed at level {current_step}") - current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed.'] - if fetch_result and fetch_result.get('fetch_result') is not None: # Only add level note if fetch was attempted - if f"Processed at level {current_step}" not in current_notes: - current_notes.append(f"Processed at level {current_step}") return { 'url': url, @@ -416,15 +517,15 @@ class EnhancedURLProcessor: class EnhancedFileProcessor: """Advanced file processing with enhanced content extraction""" - def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default + def __init__(self, max_file_size: int = 500 * 1024 * 1024): # 500MB default, 5GB might be too large for typical web apps self.max_file_size = max_file_size self.supported_extensions = { '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.pdf', '.doc', '.docx', '.rtf', '.odt', - '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', + '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.xz', '.tgz', '.tar.gz', '.tar.bz2', '.tar.xz', } - self.archive_extensions = {'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'} + self.archive_extensions = {'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.xz', '.tgz', '.tar.gz', '.tar.bz2', '.tar.xz'} def process_file(self, file) -> List[Dict]: """Process uploaded file with enhanced error handling and complete extraction""" @@ -458,14 +559,15 @@ class EnhancedFileProcessor: 'processing_notes': ['File size exceeds limit.'] }] - # Use a temporary directory for archive extraction - with tempfile.TemporaryDirectory() as temp_dir: - temp_dir_path = Path(temp_dir) + # Use a temporary directory 
for archive extraction and document processing + with tempfile.TemporaryDirectory() as temp_dir_str: + temp_dir_path = Path(temp_dir_str) + resolved_temp_dir_path = temp_dir_path.resolve() # Resolve temp dir once if file_path.suffix.lower() in self.archive_extensions: - dataset.extend(self._process_archive(file_path, temp_dir_path)) + dataset.extend(self._process_archive(file_path, temp_dir_path, resolved_temp_dir_path)) elif file_path.suffix.lower() in self.supported_extensions: - dataset.extend(self._process_single_file(file_path)) + dataset.extend(self._process_single_file(file_path, temp_dir_path, resolved_temp_dir_path)) else: logger.warning(f"Unsupported file type for processing: '{file_path.name}'. Attempting to read as plain text.") try: @@ -508,7 +610,7 @@ class EnhancedFileProcessor: p = Path(filepath) if isinstance(filepath, str) else filepath return p.suffix.lower() in self.archive_extensions - def _process_single_file(self, file_path: Path) -> List[Dict]: + def _process_single_file(self, file_path: Path, temp_dir_path: Path, resolved_temp_dir_path: Path) -> List[Dict]: """Process a single file with enhanced character extraction and format-specific handling""" dataset_entries = [] filename = file_path.name @@ -530,7 +632,9 @@ class EnhancedFileProcessor: raw_content = content_bytes.decode(encoding, errors='replace') is_explicit_json = mime_type == 'application/json' or file_extension == '.json' - looks_like_json = raw_content.strip().startswith('{') or raw_content.strip().startswith('[') + # Check if it looks like JSON only if not already explicitly identified + looks_like_json = not is_explicit_json and raw_content.strip().startswith(('{', '[')) and raw_content.strip().endswith(('}', ']')) + if is_explicit_json or looks_like_json: try: @@ -539,17 +643,20 @@ class EnhancedFileProcessor: if not is_explicit_json: processing_notes.append("Note: Content looked like JSON despite extension/mime.") logger.warning(f"File '{filename}' identified as JSON content despite extension/mime.") - mime_type = 'application/json' + if 'json' not in mime_type: mime_type = 'application/json' # Update mime type if detected except json.JSONDecodeError as e: - processing_notes.append(f"Failed to parse as JSON: {e}.") + extracted_data = raw_content # Store raw content if parsing fails + processing_notes.append(f"Failed to parse as JSON: {e}. Stored raw text.") if is_explicit_json: logger.error(f"Explicit JSON file '{filename}' has invalid format: {e}") else: logger.warning(f"Content of '{filename}' looks like JSON but failed to parse: {e}") except Exception as e: - processing_notes.append(f"Error processing JSON: {e}.") + extracted_data = raw_content # Store raw content on other errors + processing_notes.append(f"Error processing JSON: {e}. 
Stored raw text.") logger.error(f"Error processing JSON in '{filename}': {e}") + # Check if it looks like XML only if not already processed as JSON looks_like_xml = extracted_data is None and raw_content.strip().startswith('<') and raw_content.strip().endswith('>') is_explicit_xml = extracted_data is None and (mime_type in ('application/xml', 'text/xml') or mime_type.endswith('+xml') or file_extension in ('.xml', '.xsd')) @@ -560,37 +667,50 @@ class EnhancedFileProcessor: processing_notes.append("Parsed as XML (text representation).") if not is_explicit_xml: processing_notes.append("Note: Content looked like XML despite extension/mime.") - if 'xml' not in mime_type: mime_type = 'application/xml' + if 'xml' not in mime_type: mime_type = 'application/xml' # Update mime type if detected except ET.ParseError as e: - processing_notes.append(f"Failed to parse as XML: {e}.") + extracted_data = raw_content # Store raw content if parsing fails + processing_notes.append(f"Failed to parse as XML: {e}. Stored raw text.") if is_explicit_xml: logger.error(f"Explicit XML file '{filename}' has invalid format: {e}") else: logger.warning(f"Content of '{filename}' looks like XML but failed to parse: {e}") except Exception as e: - processing_notes.append(f"Error processing XML: {e}.") + extracted_data = raw_content # Store raw content on other errors + processing_notes.append(f"Error processing XML: {e}. Stored raw text.") logger.error(f"Error processing XML in '{filename}': {e}") + # Check if it looks like CSV only if not already processed as JSON or XML is_explicit_csv = extracted_data is None and (mime_type == 'text/csv' or file_extension == '.csv') + # Basic heuristic for looks_like_csv: contains comma/semicolon AND multiple lines looks_like_csv = extracted_data is None and (',' in raw_content or ';' in raw_content) and ('\n' in raw_content or len(raw_content.splitlines()) > 1) if extracted_data is None and (is_explicit_csv or looks_like_csv): try: - dialect = 'excel' + dialect = 'excel' # Default dialect + # Use csv.Sniffer to detect dialect if possible try: - sample = '\n'.join(raw_content.splitlines()[:10]) + # Sniffer needs a sample with multiple lines if possible + sample_lines = raw_content.splitlines() + sample = '\n'.join(sample_lines[:10]) # Use up to 10 lines for sample if sample: + # Sniffer can raise csv.Error if sample is not CSV-like dialect = csv.Sniffer().sniff(sample).name logger.debug(f"Sniffer detected CSV dialect: {dialect} for '{filename}'") except csv.Error: logger.debug(f"Sniffer failed to detect dialect for '{filename}', using 'excel'.") dialect = 'excel' + except Exception as e: + logger.warning(f"Unexpected error during CSV dialect sniffing for '{filename}': {e}. 
Using 'excel'.") + dialect = 'excel' + + # Read CSV content csv_reader = csv.reader(io.StringIO(raw_content), dialect=dialect) rows = list(csv_reader) if rows: - max_rows_preview = 100 + max_rows_preview = 100 # Limit preview rows extracted_data = { 'headers': rows[0] if rows and rows[0] else None, 'rows': rows[1:max_rows_preview+1] if len(rows) > 1 else [] @@ -600,7 +720,7 @@ class EnhancedFileProcessor: processing_notes.append("Parsed as CSV.") if not is_explicit_csv: processing_notes.append("Note: Content looked like CSV despite extension/mime.") - mime_type = 'text/csv' + if 'csv' not in mime_type: mime_type = 'text/csv' # Update mime type if detected else: extracted_data = "Empty CSV" @@ -609,25 +729,38 @@ class EnhancedFileProcessor: processing_notes.append("Note: Content looked like CSV but was empty.") except Exception as e: - processing_notes.append(f"Failed to parse as CSV: {e}.") + extracted_data = raw_content # Store raw content if parsing fails + processing_notes.append(f"Failed to parse as CSV: {e}. Stored raw text.") logger.warning(f"Failed to parse CSV from '{filename}': {e}") + # Attempt document specific extraction if not already processed as structured data if extracted_data is None: try: extracted_text = None + # Need to save bytes to a temporary file for libraries that expect a file path + temp_file_suffix = file_extension # Use original extension for temp file + temp_path = None # Initialize temp_path + if file_extension == '.pdf' and PDF_SUPPORT: - with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file: + # Create temp file within the designated temp directory + with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_suffix, dir=temp_dir_path) as tmp_file: tmp_file.write(content_bytes) temp_path = Path(tmp_file.name) try: reader = PdfReader(temp_path) + # Concatenate text from all pages text_content = "".join(page.extract_text() or "" for page in reader.pages) extracted_text = text_content processing_notes.append("Extracted text from PDF.") + except Exception as e: + processing_notes.append(f"PDF extraction error: {e}") + logger.warning(f"Failed to extract PDF text from '{filename}': {e}") finally: - if temp_path.exists(): temp_path.unlink() + if temp_path and temp_path.exists(): temp_path.unlink() # Clean up temp file + elif file_extension == '.docx' and DOCX_SUPPORT: - with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file: + # Create temp file within the designated temp directory + with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_suffix, dir=temp_dir_path) as tmp_file: tmp_file.write(content_bytes) temp_path = Path(tmp_file.name) try: @@ -635,11 +768,16 @@ class EnhancedFileProcessor: text_content = "\n".join(paragraph.text for paragraph in document.paragraphs) extracted_text = text_content processing_notes.append("Extracted text from DOCX.") + except Exception as e: + processing_notes.append(f"DOCX extraction error: {e}") + logger.warning(f"Failed to extract DOCX text from '{filename}': {e}") finally: - if temp_path.exists(): temp_path.unlink() + if temp_path and temp_path.exists(): temp_path.unlink() # Clean up temp file + elif file_extension == '.rtf' and RTF_SUPPORT: try: # Need to read RTF content as text, not bytes, for pyth's Rtf15Reader + # Assuming raw_content is already decoded text doc = Rtf15Reader.read(io.StringIO(raw_content)) text_content = PlaintextWriter.write(doc).getvalue() extracted_text = text_content @@ -647,35 +785,49 @@ class EnhancedFileProcessor: except Exception as e: 
processing_notes.append(f"RTF extraction error: {e}") logger.warning(f"Failed to extract RTF text from '{filename}': {e}") + elif file_extension == '.odt' and ODT_SUPPORT: - with tempfile.NamedTemporaryFile(delete=False, suffix='.odt') as tmp_file: + # Create temp file within the designated temp directory + with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_suffix, dir=temp_dir_path) as tmp_file: tmp_file.write(content_bytes) temp_path = Path(tmp_file.name) try: text_doc = OpenDocumentText(temp_path) paragraphs = text_doc.getElementsByType(odftext.P) + # Iterate through paragraphs and their child nodes to get text text_content = "\n".join("".join(node.text for node in p.childNodes) for p in paragraphs) extracted_text = text_content processing_notes.append("Extracted text from ODT.") + except Exception as e: + processing_notes.append(f"ODT extraction error: {e}") + logger.warning(f"Failed to extract ODT text from '{filename}': {e}") finally: - if temp_path.exists(): temp_path.unlink() + if temp_path and temp_path.exists(): temp_path.unlink() # Clean up temp file + elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']: + # Note: These require external libraries like python-pptx, openpyxl, or potentially platform-specific tools processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.") logger.warning(f"Automatic text extraction for {file_extension.upper()} not fully implemented for '{filename}'.") if extracted_text is not None: - max_extracted_text_size = 10000 + max_extracted_text_size = 10000 # Limit extracted text size for display/QR extracted_data = {'text': extracted_text[:max_extracted_text_size]} if len(extracted_text) > max_extracted_text_size: extracted_data['text'] += "..." processing_notes.append("Extracted text truncated.") + if mime_type in ['unknown/unknown', 'application/octet-stream']: + # Guess a more specific mime type if text was extracted + guessed_text_mime, _ = mimetypes.guess_type('dummy.txt') + if guessed_text_mime: mime_type = guessed_text_mime + except ImportError as e: - processing_notes.append(f"Missing dependency for document type ({e}). Cannot extract text.") + processing_notes.append(f"Missing dependency for document type ({e.name if hasattr(e, 'name') else str(e)}). 
Cannot extract text.") except Exception as e: processing_notes.append(f"Error during document text extraction: {e}") logger.warning(f"Error during document text extraction for '{filename}': {e}") + # If no specific extraction worked, store the raw decoded content as plain text if extracted_data is None: extracted_data = {'plain_text': raw_content} processing_notes.append("Stored as plain text.") @@ -686,8 +838,8 @@ class EnhancedFileProcessor: except Exception as e: logger.error(f"Fatal error processing single file '{filename}': {e}") processing_notes.append(f"Fatal processing error: {e}") - raw_content = None - extracted_data = None + raw_content = raw_content if 'raw_content' in locals() else None # Preserve raw content if decoded before error + extracted_data = None # Clear extracted data on fatal error entry = { 'source': 'file', @@ -704,8 +856,8 @@ class EnhancedFileProcessor: dataset_entries.append(entry) return dataset_entries - def _process_archive(self, archive_path: Path, extract_to: Path) -> List[Dict]: - """Process an archive file with enhanced extraction""" + def _process_archive(self, archive_path: Path, extract_to: Path, resolved_extract_to: Path) -> List[Dict]: + """Process an archive file with enhanced extraction and security""" dataset = [] archive_extension = archive_path.suffix.lower() logger.info(f"Processing archive: '{archive_path.name}'") @@ -715,62 +867,97 @@ class EnhancedFileProcessor: if zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path, 'r') as zip_ref: for file_info in zip_ref.infolist(): - # Prevent Zip Slip vulnerability - sanitized_filename = Path(file_info.filename).name # Takes only the base name - extracted_file_path = extract_to / sanitized_filename - - if file_info.file_size > 0 and not file_info.filename.endswith('/'): + if file_info.file_size > 0 and not file_info.is_dir(): + # Calculate the intended extraction path within the temp_dir + # Use pathlib's / operator which is safer than os.path.join + extracted_file_path = extract_to / file_info.filename try: - # Use extract method with path to temp_dir for safety + # Resolve the potential extraction path to check against the resolved temp dir + # This is the most robust check against Zip Slip + resolved_extracted_file_path = extracted_file_path.resolve() + + # Check if the resolved extracted path is actually inside the resolved temp directory + if not resolved_extracted_file_path.is_relative_to(resolved_extract_to): + logger.warning(f"Skipping potentially malicious path in zip: '{file_info.filename}' (resolved to {resolved_extracted_file_path})") + continue # Skip this member + + # If the check passes, proceed with extraction + # Note: zipfile.extract() is generally safe in recent Python versions (>=3.6) zip_ref.extract(file_info, path=extract_to) - extracted_file_path = extract_to / file_info.filename # Get the actual extracted path - if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path): - dataset.extend(self._process_single_file(extracted_file_path)) - elif extracted_file_path.suffix.lower() in self.archive_extensions: - logger.info(f"Found nested archive '{file_info.filename}', processing recursively.") - dataset.extend(self._process_archive(extracted_file_path, extract_to)) + # Process the extracted file + if extracted_file_path.suffix.lower() in self.supported_extensions: + if self._is_archive(extracted_file_path): + logger.info(f"Found nested archive '{file_info.filename}', processing recursively.") + # Recursively 
call, passing a *new* temp dir path for nested extraction + with tempfile.TemporaryDirectory() as nested_temp_dir_str: + nested_temp_dir_path = Path(nested_temp_dir_str) + dataset.extend(self._process_archive(extracted_file_path, nested_temp_dir_path, nested_temp_dir_path.resolve())) + else: + # Pass the nested temp dir info down for processing the single file + dataset.extend(self._process_single_file(extracted_file_path, extract_to, resolved_extract_to)) else: - logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'") + logger.debug(f"Skipping unsupported file in zip: '{file_info.filename}'") except Exception as e: logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}") finally: # Clean up the extracted file immediately if extracted_file_path.exists(): try: - extracted_file_path.unlink() + # Re-check resolved path before unlinking for safety + if extracted_file_path.resolve().is_relative_to(resolved_extract_to): + extracted_file_path.unlink() + else: + logger.warning(f"Skipping cleanup of path outside temp dir: {extracted_file_path}") except OSError as e: logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}") else: logger.error(f"'{archive_path.name}' is not a valid zip file.") - elif archive_extension in ('.tar', '.gz', '.tgz'): # .tgz is often tar.gz + elif archive_extension in ('.tar', '.gz', '.tgz', '.tar.gz', '.bz2', '.tar.bz2', '.xz', '.tar.xz'): # Added more tar/compression extensions try: mode = 'r' - if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz' # Handle .tar.gz and .tgz + if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz' + elif archive_extension == '.tar.bz2': mode = 'r:bz2' + elif archive_extension == '.tar.xz': mode = 'r:xz' + elif archive_extension == '.gz': mode = 'r:gz' # Handle standalone .gz as tar.gz with single member + elif archive_extension == '.bz2': mode = 'r:bz2' # Handle standalone .bz2 as tar.bz2 with single member + elif archive_extension == '.xz': mode = 'r:xz' # Handle standalone .xz as tar.xz with single member + # Note: standalone .gz, .bz2, .xz are typically single files, and tarfile can read them. 
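# Note: tarfile.open(path, 'r:gz') succeeds only when the decompressed payload is itself a
# tar archive; a bare single-file .gz/.bz2/.xz raises tarfile.ReadError. A minimal fallback
# sketch for such standalone members, using only gzip and the bz2/lzma modules added at the
# top of this patch (`decompress_single_file` is a hypothetical helper name):
import bz2
import gzip
import lzma
from pathlib import Path

_OPENERS = {'.gz': gzip.open, '.bz2': bz2.open, '.xz': lzma.open}

def decompress_single_file(src: Path, dest_dir: Path) -> Path:
    """Decompress a standalone .gz/.bz2/.xz file into dest_dir and return the new path."""
    opener = _OPENERS[src.suffix.lower()]
    target = dest_dir / src.stem  # e.g. 'report.txt.gz' -> 'report.txt'
    with opener(src, 'rb') as compressed, open(target, 'wb') as out:
        # Stream in 1 MiB blocks so large files do not need to fit in memory.
        for block in iter(lambda: compressed.read(1024 * 1024), b''):
            out.write(block)
    return target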
with tarfile.open(archive_path, mode) as tar_ref: for member in tar_ref.getmembers(): if member.isfile(): - # Prevent Tar Slip vulnerability - sanitized_filename = Path(member.name).name # Takes only the base name - extracted_file_path = extract_to / sanitized_filename + # Calculate the intended extraction path within the temp_dir + extracted_file_path = extract_to / member.name # This is the path *inside* the temp dir try: - # Use extractfile method and write manually for better control/safety - member_file = tar_ref.extractfile(member) - if member_file: - with open(extracted_file_path, 'wb') as outfile: - outfile.write(member_file.read()) - member_file.close() # Close the BytesIO object - - if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path): - dataset.extend(self._process_single_file(extracted_file_path)) - elif extracted_file_path.suffix.lower() in self.archive_extensions: - logger.info(f"Found nested archive '{member.name}', processing recursively.") - dataset.extend(self._process_archive(extracted_file_path, extract_to)) + # Resolve the potential extraction path + resolved_extracted_file_path = extracted_file_path.resolve() + + # Check if the resolved extracted path is actually inside the resolved temp directory + # This is the most robust check against Tar Slip + if not resolved_extracted_file_path.is_relative_to(resolved_extract_to): + logger.warning(f"Skipping potentially malicious path in tar: '{member.name}' (resolved to {resolved_extracted_file_path})") + continue # Skip this member + + # If the check passes, proceed with extraction + # Note: tarfile.extract() is generally safe in recent Python versions (>=3.6) + tar_ref.extract(member, path=extract_to) + + # Process the extracted file + if extracted_file_path.suffix.lower() in self.supported_extensions: + if self._is_archive(extracted_file_path): + logger.info(f"Found nested archive '{member.name}', processing recursively.") + # Recursively call, passing a *new* temp dir path for nested extraction + with tempfile.TemporaryDirectory() as nested_temp_dir_str: + nested_temp_dir_path = Path(nested_temp_dir_str) + dataset.extend(self._process_archive(extracted_file_path, nested_temp_dir_path, nested_temp_dir_path.resolve())) + else: + # Pass the nested temp dir info down for processing the single file + dataset.extend(self._process_single_file(extracted_file_path, extract_to, resolved_extract_to)) else: - logger.warning(f"Could not get file-like object for {member.name} from tar.") + logger.debug(f"Skipping unsupported file in tar: '{member.name}'") except Exception as e: logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}") @@ -778,48 +965,28 @@ class EnhancedFileProcessor: # Clean up the extracted file immediately if extracted_file_path.exists(): try: - extracted_file_path.unlink() + # Re-check resolved path before unlinking for safety + if extracted_file_path.resolve().is_relative_to(resolved_extract_to): + extracted_file_path.unlink() + else: + logger.warning(f"Skipping cleanup of path outside temp dir: {extracted_file_path}") except OSError as e: logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}") except tarfile.TarError as e: logger.error(f"Error processing TAR archive '{archive_path.name}': {e}") + except Exception as e: + logger.error(f"Unexpected error processing TAR archive '{archive_path.name}': {e}") - elif archive_extension == '.gz': # Handle standalone .gz (single file compression) 
- extracted_name = archive_path.stem # Get filename without .gz - extracted_path = extract_to / extracted_name - try: - with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile: - outfile.write(gz_file.read()) - # Process the extracted file - if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_path): - dataset.extend(self._process_single_file(extracted_path)) - elif extracted_path.suffix.lower() in self.archive_extensions: - logger.info(f"Found nested archive '{extracted_name}', processing recursively.") - dataset.extend(self._process_archive(extracted_path, extract_to)) - else: - logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'") - - except gzip.BadGzipFile as e: - logger.error(f"Error processing GZIP file '{archive_path.name}': {e}") - except Exception as e: - logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}") - finally: - # Clean up the extracted file immediately - if extracted_path.exists(): - try: - extracted_path.unlink() - except OSError as e: - logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}") - - elif archive_extension in ('.bz2', '.7z', '.rar'): - logger.warning(f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.") + elif archive_extension in ('.7z', '.rar'): + logger.warning(f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries like py7zr or rarfile.") except Exception as e: logger.error(f"Overall archive processing error for '{archive_path.name}': {e}") return dataset + def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]: """Enhanced data chunking with sequence metadata""" try: @@ -831,64 +998,122 @@ class EnhancedFileProcessor: data_list = data # JSON dump the entire list first + # Use compact separators for maximum data density json_str = json.dumps(data_list, ensure_ascii=False, separators=(',', ':')) total_length = len(json_str) # Estimate overhead for metadata + some buffer # Example metadata: {"idx":0,"tc":1,"tl":1000,"hash":1234567890,"data":"..."} - # A rough estimate of the metadata string length - # Assuming max 5 digits for idx/tc, 10 for tl, 10 for hash, plus keys, colons, commas, quotes - # {"idx":NNNNN,"tc":NNNNN,"tl":NNNNNNNNNN,"hash":NNNNNNNNNN,"data":""} - # ~ 7 + 5 + 6 + 5 + 6 + 10 + 7 + 10 + 7 + 0 + 2 + 4*3 (commas/colons) + 2*2 (quotes) = ~ 80-100 characters - # Let's use a slightly safer estimate - overhead_estimate = len(json.dumps({"idx": 99999, "tc": 99999, "tl": 9999999999, "hash": 9999999999, "data": ""}, separators=(',', ':'))) + 50 # Add buffer - # Max QR code capacity for alphanumeric is higher than byte/binary. - # Max size 2953 is for bytes. For alphanumeric, it's 4296. - # We are encoding JSON (mostly alphanumeric, but can contain non-ASCII). - # Using byte capacity (2953) is safer. Let's stick to 2953 as the max_size input. - - effective_chunk_size = max_size - overhead_estimate - - if effective_chunk_size <= 0: - logger.error(f"Max QR size ({max_size}) is too small for metadata overhead ({overhead_estimate}). 
Cannot chunk.") + # Max possible values for idx, tc (up to num_chunks, which can be large), tl (up to total_length), hash (32-bit int) + # Let's assume max 6 digits for idx/tc (up to 999,999 chunks), 10 for tl, 10 for hash + # {"idx":999999,"tc":999999,"tl":9999999999,"hash":4294967295,"data":""} + # Length of keys + colons + commas + quotes + max value lengths: + # "idx": (5 + 1 + 6) = 12 + # "tc": (4 + 1 + 6) = 11 + # "tl": (4 + 1 + 10) = 15 + # "hash": (6 + 1 + 10) = 17 + # "data": (6 + 1 + 2) = 9 (for quotes around empty string) + # Commas: 3 + # Total estimate: 12 + 11 + 15 + 17 + 9 + 3 = 67 + # Add a buffer for safety (e.g., 30-50 chars) + overhead_estimate = 100 # Safe estimate for metadata overhead + + # Max QR code capacity for bytes (Version 40, Level M) is 2953. + # We are encoding JSON string, which is mostly alphanumeric but can contain non-ASCII characters. + # Using byte capacity is the safest approach. + max_qr_byte_capacity = 2953 # Max bytes for QR Version 40, Error Correction M + + effective_chunk_size_bytes = max_qr_byte_capacity - overhead_estimate + + if effective_chunk_size_bytes <= 0: + logger.error(f"Max QR byte capacity ({max_qr_byte_capacity}) is too small for metadata overhead ({overhead_estimate}). Cannot chunk.") return [] - if total_length <= effective_chunk_size: + # Convert the JSON string to bytes to get the actual byte length for chunking + json_bytes = json_str.encode('utf-8') + total_byte_length = len(json_bytes) + + if total_byte_length <= effective_chunk_size_bytes: # Single chunk case - chunk_data = json_str + chunk_data = json_str # Store the string data in the chunk chunk = { "idx": 0, "tc": 1, - "tl": total_length, - "hash": hash(chunk_data) & 0xFFFFFFFF, # Use a simple hash - "data": chunk_data + "tl": total_byte_length, # Total length is in bytes + "hash": hash(json_str) & 0xFFFFFFFF, # Use hash of the whole string, truncated to 32-bit + "data": chunk_data # The data payload is the full JSON string } return [chunk] # Multi-chunk case - num_chunks = math.ceil(total_length / effective_chunk_size) chunks = [] - current_pos = 0 - for i in range(num_chunks): - end_pos = min(current_pos + effective_chunk_size, total_length) - chunk_data_str = json_str[current_pos:end_pos] + current_byte_pos = 0 + + # Iterate and split the bytes, then decode each chunk + # This is safer than splitting the string directly by character index + for i in range(math.ceil(total_byte_length / effective_chunk_size_bytes)): + end_byte_pos = min(current_byte_pos + effective_chunk_size_bytes, total_byte_length) + + # Find the nearest character boundary before end_byte_pos + # Decode a small buffer around the end position to find a character boundary + buffer_size = 10 # Look back a few bytes + safe_end_byte_pos = end_byte_pos + if safe_end_byte_pos < total_byte_length: + # Decode a small slice ending at end_byte_pos + try: + # Attempt to decode from a few bytes before the end to the end + test_slice = json_bytes[max(0, end_byte_pos - buffer_size) : end_byte_pos] + test_slice.decode('utf-8', errors='strict') # Try strict decoding + # If strict decoding works, end_byte_pos is a character boundary + safe_end_byte_pos = end_byte_pos + except UnicodeDecodeError as e: + # If decoding fails, it's not a character boundary. Adjust to the start of the invalid sequence. 
+ # e.start gives the index within the slice, add current_byte_pos to get index in full bytes + safe_end_byte_pos = max(0, end_byte_pos - buffer_size) + e.start + logger.debug(f"Adjusted chunk boundary from {end_byte_pos} to {safe_end_byte_pos} due to character boundary.") + except Exception as e: + logger.warning(f"Unexpected error finding character boundary near byte {end_byte_pos}: {e}. Using unverified boundary.") + safe_end_byte_pos = end_byte_pos # Fallback + + + # Get the byte slice for the chunk + chunk_bytes = json_bytes[current_byte_pos:safe_end_byte_pos] + + # Decode the chunk bytes into a string + try: + chunk_data_str = chunk_bytes.decode('utf-8', errors='strict') + except UnicodeDecodeError as e: + logger.error(f"Failed to strictly decode chunk {i} bytes ({current_byte_pos}-{safe_end_byte_pos}): {e}. Using 'replace' error handling.") + chunk_data_str = chunk_bytes.decode('utf-8', errors='replace') + except Exception as e: + logger.error(f"Unexpected error decoding chunk {i} bytes ({current_byte_pos}-{safe_end_byte_pos}): {e}. Using 'replace' error handling.") + chunk_data_str = chunk_bytes.decode('utf-8', errors='replace') + + + # Re-check the byte length of the decoded string (should be <= effective_chunk_size_bytes) + actual_chunk_byte_length = len(chunk_data_str.encode('utf-8')) + if actual_chunk_byte_length > effective_chunk_size_bytes: + logger.error(f"Chunk {i} byte length ({actual_chunk_byte_length}) exceeds effective size ({effective_chunk_size_bytes}) after decoding/re-encoding. This indicates a logic error or extreme edge case.") + return [] # Indicate failure + num_chunks = math.ceil(total_byte_length / effective_chunk_size_bytes) # Total number of chunks, needed for the "tc" field and the final log message chunk = { "idx": i, "tc": num_chunks, + "tl": total_byte_length, # Total length is in bytes + "hash": hash(chunk_data_str) & 0xFFFFFFFF, # Hash each chunk string "data": chunk_data_str } chunks.append(chunk) - current_pos = end_pos + current_byte_pos = safe_end_byte_pos # Move to the end of the current chunk's bytes + + # Final check to ensure all data was included + if current_byte_pos < total_byte_length: + logger.error(f"Chunking logic error: Only processed {current_byte_pos} of {total_byte_length} bytes.") + # As a safeguard, return the chunks generated so far, but log the error + pass # Allow returning partial chunks with a warning for now - if current_pos < total_length: - logger.error(f"Chunking logic error: Only processed {current_pos} of {total_length} characters.") - # This should not happen with ceil and min, but as a safeguard - return [] # Indicate failure logger.info(f"Chunked data into {num_chunks} chunks for QR codes. 
Total byte length: {total_byte_length}") return chunks except Exception as e: @@ -910,20 +1135,18 @@ def generate_stylish_qr(data: Union[str, Dict], border=border ) - # Data to encode should be a string, typically the JSON chunk + # Data to encode should be a string, typically the JSON chunk dictionary dumped to string if isinstance(data, dict): # Ensure it's dumped to a string if it's a dict chunk data_to_encode = json.dumps(data, ensure_ascii=False, separators=(',', ':')) else: - # Assume it's already the string data chunk - data_to_encode = str(data) + # Assume it's already the string data chunk payload + data_to_encode = str(data) # Ensure it's a string qr.add_data(data_to_encode) qr.make(fit=True) # Fit the QR code size to the data qr_image = qr.make_image(fill_color=fill_color, back_color=back_color) - # qr_image = qr_image.convert('RGBA') # Conversion might not be needed for simple fill/back colors - # Optional: Add a simple gradient overlay for style (can be resource intensive) # try: # gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0)) @@ -941,7 +1164,7 @@ def generate_stylish_qr(data: Union[str, Dict], final_image = qr_image output_path = QR_CODES_DIR / filename - # Use PNG for lossless quality, 90 quality is for JPEGs but harmless here + # Use PNG for lossless quality final_image.save(output_path, format='PNG') return str(output_path) @@ -951,13 +1174,14 @@ def generate_stylish_qr(data: Union[str, Dict], def generate_qr_codes(data: List[Dict], combined: bool = True) -> List[str]: """Generate QR codes with enhanced visual appeal and metadata""" - # Ensure data is a list of dictionaries as expected + # Ensure data is a list as expected if not isinstance(data, list): logger.error("generate_qr_codes received data that is not a list.") return [] - if not all(isinstance(item, dict) for item in data): - logger.error("generate_qr_codes received a list containing non-dictionary items.") - return [] + # Allow list of potentially non-dict items, as flatten_item handles wrapping + # if not all(isinstance(item, dict) for item in data): + # logger.warning("generate_qr_codes received a list containing non-dictionary items.") + try: file_processor = EnhancedFileProcessor() # Use the processor for chunking @@ -987,7 +1211,8 @@ def generate_qr_codes(data: List[Dict], combined: bool = True) -> List[str]: if data: for idx, item in enumerate(data): # Chunk the single item (wrapped in a list for chunk_data consistency) - chunks = file_processor.chunk_data([item]) # Pass item as a list + # chunk_data expects List[Dict], so wrap item in list + chunks = file_processor.chunk_data([item]) if not chunks: logger.warning(f"No chunks generated for item {idx+1}.") continue @@ -1027,26 +1252,38 @@ def respond_to_chat( # Initialize chat_history if it's None (Gradio might pass None initially) if chat_history is None: chat_history = [] - + if chatbot_data is None or not chatbot_data: - chat_history.append((message, "Please process some data first using the other tabs before chatting.")) + # Append user message and then the bot response + # Ensure chat_history format is list of lists/tuples + if not chat_history or chat_history[-1][0] != message: # Avoid appending the same message twice if Gradio resends + chat_history.append([message, None]) # Append user message with None placeholder + # Update the placeholder with the response + if chat_history and chat_history[-1][1] is None: + chat_history[-1][1] = "Please process some data first using the other tabs before chatting." 
+ else: # Fallback if somehow no placeholder exists + chat_history.append([message, "Please process some data first using the other tabs before chatting."]) + return chat_history, chatbot_data, current_filtered_df_state # Return existing state - + # Append user message to history immediately - chat_history.append((message, None)) # Use None as a placeholder for the assistant's response - + # Gradio's chatbot type='messages' expects [[user, bot], [user, bot], ...] + # So we append a new entry with user message and None for bot response + if not chat_history or chat_history[-1][0] != message: # Avoid appending the same message twice + chat_history.append([message, None]) + response = "" lower_message = message.lower().strip() - + # Initialize new_filtered_df_state with the current state to preserve it unless a filter changes it new_filtered_df_state = current_filtered_df_state - + df = None - try: + try: # Try block 1: DataFrame Creation # Attempt to create a DataFrame from the full chatbot_data for analysis # This flattens the structure for easier querying with pandas flat_data = [] - + def flatten_item(d, parent_key='', sep='_'): items = {} if isinstance(d, dict): @@ -1067,7 +1304,7 @@ def respond_to_chat( # If d is a primitive (int, str, bool, None), it won't add anything here, which is fine # as primitives are handled in the dict/list branches. return items - + # Process each top-level item in chatbot_data for i, item in enumerate(chatbot_data): if isinstance(item, dict): @@ -1077,133 +1314,130 @@ def respond_to_chat( # If chatbot_data contains non-dict top-level items, flatten them too elif isinstance(item, (list, str, int, float, bool, type(None))): flat_data.append({'item_value': item}) # Wrap primitives in a dict - - except Exception as e: - # Handle exceptions that may occur during processing - response = f"An error occurred: {str(e)}" - chat_history.append((message, response)) # Append error message to chat history + else: + # Handle potentially unflattenable types gracefully + logger.warning(f"Skipping unflattenable item type {type(item)} at index {i} for DataFrame conversion.") + # Optionally add a placeholder indicating the skipped item + # flat_data.append({'unsupported_item_type': str(type(item))}) + if flat_data: - try: - # Create DataFrame. Use errors='ignore' for columns with mixed types that can't be coerced + try: # Inner try block for pd.DataFrame creation specifically + # Handle potential issues with inconsistent columns after flattening + # Fill missing columns with NaN to create a rectangular DataFrame df = pd.DataFrame(flat_data) + # Convert object columns to string type explicitly to avoid future warnings/errors + # This also helps handle mixed types in columns after flattening for col in df.columns: if df[col].dtype == 'object': + # Use errors='ignore' in astype if needed, but str conversion is usually safe df[col] = df[col].astype(str) logger.debug(f"Created DataFrame with shape: {df.shape}") logger.debug(f"DataFrame columns: {list(df.columns)}") except Exception as e: logger.warning(f"Could not create pandas DataFrame from processed data: {e}. Falling back to manual processing.") - df = None + df = None # Ensure df is None on error else: logger.warning("Flattened data is empty. 
Cannot create DataFrame.") - df = None + df = None # Ensure df is None if flat_data is empty - except Exception as e: + except Exception as e: # Catch errors during the flattening loop itself (Try block 1 catch) logger.error(f"Error during DataFrame creation from chatbot_data: {e}") - df = None - response = f"An error occurred while preparing data for analysis: {e}" - - - # --- Complex Queries and Analysis --- - # These operations should primarily act on the FULL dataframe 'df' - # unless the user explicitly asks about the 'filtered' data. - # The filter command itself updates `new_filtered_df_state`. - - if df is not None and not response: # Proceed with analysis if DataFrame exists and no error yet - # List available columns (from the full DataFrame) - if "what columns are available" in lower_message or "list columns" in lower_message: - response = f"The available columns in the full dataset are: {', '.join(df.columns)}" - - # Describe a specific column (from the full DataFrame) - match = re.search(r'describe column (\w+)', lower_message) - if match: - column_name = match.group(1) - if column_name in df.columns: - # Handle non-numeric describe gracefully - try: - description = df[column_name].describe().to_string() - response = f"Description for column '{column_name}':\n```\n{description}\n```" - except Exception as e: - response = f"Could not generate description for column '{column_name}': {e}" - logger.warning(f"Error describing column '{column_name}': {e}") - else: - response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" - - # How many unique values in a column? (from the full DataFrame) - match = re.search(r'how many unique values in (\w+)', lower_message) - if match: - column_name = match.group(1) - if column_name in df.columns: - try: - unique_count = df[column_name].nunique() - response = f"There are {unique_count} unique values in the '{column_name}' column (in the full dataset)." - except Exception as e: - response = f"Could not count unique values for column '{column_name}': {e}" - logger.warning(f"Error counting unique values for column '{column_name}': {e}") - else: - response = f"I couldn't find a column named '{column_name}' in the data. Available columns are: {', '.join(df.columns)}" + df = None # Ensure df is None on error + # Do NOT set response here, let the main logic handle it if df is None - # What is the average/sum/min/max of a numeric column? (from the full DataFrame) - match = re.search(r'what is the (average|sum|min|max) of (\w+)', lower_message) - if match: - operation, column_name = match.groups() - if column_name in df.columns: - try: - # Attempt to convert to numeric, coercing errors to NaN, then drop NaNs - numeric_col = pd.to_numeric(df[column_name], errors='coerce').dropna() - - if not numeric_col.empty: - if operation == 'average': - result = numeric_col.mean() - response = f"The average of '{column_name}' is {result:.2f}." - elif operation == 'sum': - result = numeric_col.sum() - response = f"The sum of '{column_name}' is {result:.2f}." - elif operation == 'min': - result = numeric_col.min() - response = f"The minimum of '{column_name}' is {result}." - elif operation == 'max': - result = numeric_col.max() - response = f"The maximum of '{column_name}' is {result}." + + # --- Main Chatbot Logic --- + # This block contains the core query processing and response generation. + # Wrap it in a try/except to catch errors during analysis. 
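# The flatten-then-DataFrame step above can also be illustrated with pandas' built-in
# json_normalize, which flattens nested dicts into separator-joined column names (its
# handling of nested lists differs from the custom flatten_item helper, so this is only
# a sketch of the idea, not a drop-in replacement):
import pandas as pd

records = [
    {'source': 'url', 'metadata': {'status_code': 200, 'content_type': 'text/html'}},
    {'source': 'file', 'metadata': {'status_code': None, 'content_type': 'application/json'}},
]
flat_df = pd.json_normalize(records, sep='_')
print(list(flat_df.columns))  # ['source', 'metadata_status_code', 'metadata_content_type']
print(flat_df.shape)          # (2, 3)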
+ try: + if df is not None: # Proceed with analysis if DataFrame exists + # List available columns (from the full DataFrame) + if "what columns are available" in lower_message or "list columns" in lower_message or "show columns" in lower_message: + response = f"The available columns in the full dataset are: {', '.join(df.columns)}" + + # Describe a specific column (from the full DataFrame) + match = re.search(r'describe column (\w+)', lower_message) + if match: + column_name = match.group(1) + if column_name in df.columns: + # Handle non-numeric describe gracefully + try: + description = df[column_name].describe().to_string() + response = f"Description for column '{column_name}':\n```\n{description}\n```" + except Exception as e: + response = f"Could not generate description for column '{column_name}': {e}" + logger.warning(f"Error describing column '{column_name}': {e}") + else: + response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" + + # How many unique values in a column? (from the full DataFrame) + match = re.search(r'how many unique values in (\w+)', lower_message) + if match: + column_name = match.group(1) + if column_name in df.columns: + try: + unique_count = df[column_name].nunique() + response = f"There are {unique_count} unique values in the '{column_name}' column (in the full dataset)." + except Exception as e: + response = f"Could not count unique values for column '{column_name}': {e}" + logger.warning(f"Error counting unique values for column '{column_name}': {e}") + else: + response = f"I couldn't find a column named '{column_name}' in the data. Available columns are: {', '.join(df.columns)}" + + # What is the average/sum/min/max of a numeric column? (from the full DataFrame) + match = re.search(r'what is the (average|sum|min|max) of (\w+)', lower_message) + if match: + operation, column_name = match.groups() + if column_name in df.columns: + try: + # Attempt to convert to numeric, coercing errors to NaN, then drop NaNs + numeric_col = pd.to_numeric(df[column_name], errors='coerce').dropna() + + if not numeric_col.empty: + if operation == 'average': + result = numeric_col.mean() + response = f"The average of '{column_name}' is {result:.2f}." + elif operation == 'sum': + result = numeric_col.sum() + response = f"The sum of '{column_name}' is {result:.2f}." + elif operation == 'min': + result = numeric_col.min() + response = f"The minimum of '{column_name}' is {result}." + elif operation == 'max': + result = numeric_col.max() + response = f"The maximum of '{column_name}' is {result}." + else: + response = "I can calculate average, sum, min, or max." # Should not reach here due to regex else: - response = "I can calculate average, sum, min, or max." # Should not reach here due to regex - else: - response = f"The column '{column_name}' does not contain numeric values that I can analyze." - except Exception as e: - response = f"An error occurred while calculating the {operation} of '{column_name}': {e}" - logger.error(f"Error calculating {operation} for column '{column_name}': {e}") - else: - response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" - - # Enhanced Filter data based on more complex conditions - # This section *updates* `new_filtered_df_state` based on the filter command. - # It should filter from the *full* dataframe (`df`). 
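The average/sum/min/max branch above leans on `pd.to_numeric(..., errors='coerce')` so that mixed-type columns do not raise; non-numeric entries simply drop out before the statistic is computed. A small sketch of that pattern, with made-up data:

```python
import pandas as pd

df = pd.DataFrame({"price": ["10.5", "n/a", "7", None, "3.25"]})

# Coerce non-numeric entries to NaN, then drop them before aggregating
numeric_col = pd.to_numeric(df["price"], errors="coerce").dropna()

if not numeric_col.empty:
    print(f"average: {numeric_col.mean():.2f}")  # average: 6.92
    print(f"sum: {numeric_col.sum():.2f}")       # sum: 20.75
    print(f"min/max: {numeric_col.min()} / {numeric_col.max()}")
else:
    print("Column has no numeric values to analyze.")
```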
- filter_match = re.search( - r'(?:filter|show items|show me items|find entries|select items|get items)\s+' # Optional action phrases - r'(?:where|by|for|with|if)\s+' # Keyword indicating condition - r'(\w+)\s+' # Column name - r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+' # Operator - r'([\'"]?[\w\s.-]+[\'"]?)', # Value (allows spaces, dots, hyphens if quoted, or single words) - lower_message - ) + response = f"The column '{column_name}' does not contain numeric values that I can analyze." + except Exception as e: + response = f"An error occurred while calculating the {operation} of '{column_name}': {e}" + logger.error(f"Error calculating {operation} for column '{column_name}': {e}") + else: + response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" + + # Enhanced Filter data based on more complex conditions + # This section *updates* `new_filtered_df_state` based on the filter command. + # It should filter from the *full* dataframe (`df`). + filter_match = re.search( + r'(?:filter|show items|show me items|find entries|select items|get items)\s+' # Optional action phrases + r'(?:where|by|for|with|if)\s+' # Keyword indicating condition + r'(\w+)\s+' # Column name + r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+' # Operator + r'([\'"]?[\w\s.-]+[\'"]?)', # Value (allows spaces, dots, hyphens if quoted, or single words) + lower_message + ) - if filter_match: - column_name, operator, value_str = filter_match.groups() - column_name = column_name.strip() - operator = operator.strip().lower() - value_str = value_str.strip().strip("'\"") + if filter_match: + column_name, operator, value_str = filter_match.groups() + column_name = column_name.strip() + operator = operator.strip().lower() + value_str = value_str.strip().strip("'\"") - logger.info(f"Filter request: Column='{column_name}', Operator='{operator}', Value='{value_str}'") + logger.info(f"Filter request: Column='{column_name}', Operator='{operator}', Value='{value_str}'") - if df is None: - response = "No data available to filter. Please process data first." - new_filtered_df_state = None # Ensure state is None if no data - elif column_name not in df.columns: - response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}" - new_filtered_df_state = None # Clear previous filter if column not found - else: # Always filter from the original full dataframe 'df' active_df_to_filter = df.copy() col_series_original = active_df_to_filter[column_name] # Use original series for type checks @@ -1221,6 +1455,7 @@ def respond_to_chat( target_value = float(value_str) # Apply numeric condition only where conversion was successful (not NaN) + # Use .loc to ensure alignment after potential dropna if needed, but here we use fillna(False) if operator == '==': condition = col_series_numeric == target_value elif operator == '!=': condition = col_series_numeric != target_value elif operator == '>': condition = col_series_numeric > target_value @@ -1259,7 +1494,10 @@ def respond_to_chat( # Handle boolean comparisons (if column type is bool or value looks like bool) elif operator in ['is', 'equals', '==', '!='] and (pd.api.types.is_bool_dtype(col_series_original) or value_str.lower() in ['true', 'false']): try: - col_series_bool = col_series_original.astype(bool) # Attempt to convert column to bool + # Attempt to convert column to bool, coercing errors + # Note: astype(bool) can be aggressive. 
pd.to_numeric(..., errors='coerce').astype(bool) might be safer + # but let's try direct astype first. + col_series_bool = col_series_original.astype(bool) target_value = value_str.lower() == 'true' # Convert value string to bool if operator in ['is', 'equals', '==']: @@ -1273,6 +1511,11 @@ def respond_to_chat( response = f"For boolean comparison on column '{column_name}', '{value_str}' is not a valid boolean value (true/false)." target_value = None condition = None + except Exception as e: # Catch other potential errors during bool conversion + response = f"Error converting column '{column_name}' to boolean for comparison: {e}" + target_value = None + condition = None + else: # If none of the above types matched, the operator is likely invalid for the column type @@ -1300,156 +1543,147 @@ def respond_to_chat( new_filtered_df_state = pd.DataFrame() # Store empty DF for "no results" response = f"No items found where '{column_name}' {operator} '{value_str}'." # If condition is None (e.g. bad operator or type mismatch error) and response not already set, set generic invalid op message. - elif not response: # Avoid overwriting specific error from type check - response = f"Unsupported operator '{operator}' for column '{column_name}'. Please check column type or operator." - new_filtered_df_state = None + # This check is now implicitly handled by the initial `if response:` check below this block. except ValueError as ve: # Specifically catch ValueError for target_value conversion response = f"Invalid value '{value_str}' for comparison on column '{column_name}'. {ve}" - new_filtered_df_state = None # Clear on value error + new_filtered_df_state = pd.DataFrame() # Clear on value error (use empty DF) logger.warning(f"ValueError during filter: {ve}") except Exception as e: - new_filtered_df_state = None # Clear on other errors + new_filtered_df_state = pd.DataFrame() # Clear on other errors (use empty DF) response = f"An error occurred while applying the filter: {e}" logger.error(f"Error applying filter (column='{column_name}', op='{operator}', val='{value_str}'): {e}") - # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results) - - # --- End of Enhanced Filter Logic --- - - # If `response` is still empty, it means no filter query was matched by the filter_match regex. - # In this case, new_filtered_df_state (initialized from current_filtered_df_state) remains unchanged. - - - # Request structured output (e.g., as CSV or simplified JSON) - # This section should act on the *original* df unless specifically asked for filtered data export. - # The new download buttons handle filtered data export separately. - # Let's assume for now it acts on the original df, and a separate command would be needed for "export filtered data" - # If no filter query matched, and no other specific df query matched, - # then `response` might still be empty. `new_filtered_df_state` will be the same as `current_filtered_df_state`. - # The general queries below should not reset `new_filtered_df_state` unless it's a "clear" command. - - elif "output as csv" in lower_message or "export as csv" in lower_message: - if df is not None and not df.empty: - csv_output = df.to_csv(index=False) - response = f"Here is the data in CSV format:\n```csv\n{csv_output[:1000]}...\n```\n(Output truncated for chat display)" - else: - response = "There is no data available to output as CSV." 
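For context on the filter handling above: once the regex yields a (column, operator, value) triple, the comparison is built as a pandas boolean mask against the full DataFrame. The condensed sketch below shows the same idea for a handful of operators; the sample DataFrame and message are hypothetical, and the real handler covers more cases.

```python
import re
import pandas as pd

df = pd.DataFrame({"category": ["book", "toy", "book"], "price": [12.0, 30.0, 8.5]})
message = "show items where price > 10"

match = re.search(r"where\s+(\w+)\s+(is|==|!=|>=|<=|>|<|contains)\s+(.+)", message.lower())
if match:
    column, operator, value_str = (g.strip().strip("'\"") for g in match.groups())
    series = df[column]
    if operator in (">", "<", ">=", "<=", "==", "!="):
        # Numeric comparison: coerce the column, rows that fail coercion compare as False
        numeric = pd.to_numeric(series, errors="coerce")
        target = float(value_str)
        ops = {">": numeric > target, "<": numeric < target,
               ">=": numeric >= target, "<=": numeric <= target,
               "==": numeric == target, "!=": numeric != target}
        condition = ops[operator].fillna(False)
    elif operator == "contains":
        condition = series.astype(str).str.contains(value_str, case=False, na=False)
    else:  # "is" treated as case-insensitive string equality
        condition = series.astype(str).str.lower() == value_str.lower()
    print(df[condition])
```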
- elif "output as json" in lower_message or "export as json" in lower_message: # Note: "export as json" is different from download buttons - if df is not None and not df.empty: - json_output = df.to_json(orient='records', indent=2) - response = f"Here is the data in JSON format:\n```json\n{json_output[:1000]}...\n```\n(Output truncated for chat display)" - else: - response = "There is no data available to output as JSON." - - # --- General Queries (if no DataFrame or specific query matched AND no filter was applied in this turn) --- - # These should not clear new_filtered_df_state unless it's a "clear chat" - if not response: # Only enter if no response has been generated by DataFrame/filter logic - if "how many items" in lower_message or "number of items" in lower_message: - # Check filtered state first, then full df, then raw chatbot_data list - if new_filtered_df_state is not None and not new_filtered_df_state.empty: - response = f"The currently filtered dataset has {len(new_filtered_df_state)} items." - if df is not None: - response += f" The original dataset has {len(df)} items." - elif df is not None: # Check df from original chatbot_data - response = f"There are {len(df)} items in the processed data." - elif isinstance(chatbot_data, list): # Fallback if df creation failed but chatbot_data is list - response = f"There are {len(chatbot_data)} top-level items in the processed data (not in DataFrame)." - elif isinstance(chatbot_data, dict): - response = "The processed data is a single dictionary, not a list of items." - else: - response = "The processed data is not a standard list or dictionary structure." - - elif "what is the structure" in lower_message or "tell me about the data" in lower_message: - # Describe filtered data structure if available, otherwise full data structure - if new_filtered_df_state is not None and not new_filtered_df_state.empty: - response = f"The filtered data is a table with {len(new_filtered_df_state)} rows and columns: {', '.join(new_filtered_df_state.columns)}. " - if df is not None: - response += f"The original data has columns: {', '.join(df.columns)}." - else: - response += "Original data structure is not tabular." - elif df is not None: - response = f"The data is a table with {len(df)} rows and columns: {', '.join(df.columns)}." - elif isinstance(chatbot_data, list) and chatbot_data: - sample_item = chatbot_data[0] - response = f"The data is a list containing {len(chatbot_data)} items. The first item has the following top-level keys: {list(sample_item.keys())}." - elif isinstance(chatbot_data, dict): - response = f"The data is a dictionary with the following top-level keys: {list(chatbot_data.keys())}." - else: - response = "The processed data is not a standard list or dictionary structure that I can easily describe." - - # "show me" without a filter condition might be ambiguous. - # Let's assume it refers to the original data or provide guidance. - elif "show me" in lower_message or "get me" in lower_message or "extract" in lower_message: - # This specific 'show me' without 'where' should not trigger a filter or clear existing filter state. - # It's a general request for data, which is too broad. Guide the user. - response = "If you want to filter the data, please use a phrase like 'show items where column_name is value'. If you want to see the raw data, consider using the download buttons." - - # --- Speculation about Modifications --- - # These responses are purely informative and do not modify data or state. 
- elif "how can i modify" in lower_message or "how to change" in lower_message or "can i add" in lower_message or "can i remove" in lower_message: - response = "I cannot directly modify the data here, but I can tell you how you *could* modify it programmatically. What kind of change are you considering (e.g., adding an item, changing a value, removing a field)?" - elif "add a field" in lower_message or "add a column" in lower_message: - response = "To add a field (or column if the data is tabular), you would typically iterate through each item (or row) in the data and add the new key-value pair. For example, adding a 'status' field with a default value." - elif "change a value" in lower_message or "update a field" in lower_message: - response = "To change a value, you would need to identify the specific item(s) and the field you want to update. You could use a condition (like filtering) to find the right items and then assign a new value to the field." - elif "remove a field" in lower_message or "delete a column" in lower_message: - response = "To remove a field, you would iterate through each item and delete the specified key. Be careful, as this is irreversible." - elif "restructure" in lower_message or "change the format" in lower_message: - response = "Restructuring data involves transforming it into a different shape. This could mean flattening nested objects, grouping items, or pivoting data. This often requires writing custom code to map the old structure to the new one." - elif "what if i" in lower_message or "if i changed" in lower_message: - response = "Tell me what specific change you're contemplating, and I can speculate on the potential impact or how you might approach it programmatically." - - # --- General Conversation / Fallback --- - elif "hello" in lower_message or "hi" in lower_message: - response = random.choice(["Hello! How can I help you understand the processed data?", "Hi there! What's on your mind about this data?", "Hey! Ask me anything about the data you've loaded."]) - elif "thank you" in lower_message or "thanks" in lower_message: - response = random.choice(["You're welcome!", "Glad I could help.", "No problem! Let me know if you have more questions about the data."]) - elif "clear chat" in lower_message: # This should be caught by button, but as text too - # Gradio handles clearing the chatbot component state via the button action. - # We just need to clear the filtered data state here. - response = "Chat history cleared." # Respond that chat is cleared - new_filtered_df_state = None # Also clear filtered data on "clear chat" command by text - elif not response: # Fallback if nothing else matched - response = random.choice([ - "I can analyze the data you've processed. What would you like to know? Try asking to filter data, e.g., 'show items where status is active'.", - "Ask me about the number of items, the structure, or values of specific fields. You can also filter data.", - "I can perform basic analysis or filter the data. For example: 'filter by price > 100'.", - "Tell me what you want to extract or filter from the data. Use phrases like 'show items where ...'.", - "I'm equipped to filter your data. Try 'find entries where name contains widget'." 
- ]) - - # --- End of main try block --- - except Exception, e: + # If the message was a filter, new_filtered_df_state is now set (or None/empty if error/no results) + + # --- End of Enhanced Filter Logic --- + + # If `response` is still empty, check for other dataframe-based queries + if not response: + # Request structured output (e.g., as CSV or simplified JSON) + # This section should act on the *original* df unless specifically asked for filtered data export. + # The new download buttons handle filtered data export separately. + + if "output as csv" in lower_message or "export as csv" in lower_message: + if not df.empty: + csv_output = df.to_csv(index=False) + response = f"Here is the data in CSV format:\n```csv\n{csv_output[:1000]}...\n```\n(Output truncated for chat display)" + else: + response = "There is no data available to output as CSV." + elif "output as json" in lower_message or "export as json" in lower_message: # Note: "export as json" is different from download buttons + if not df.empty: + json_output = df.to_json(orient='records', indent=2) + response = f"Here is the data in JSON format:\n```json\n{json_output[:1000]}...\n```\n(Output truncated for chat display)" + else: + response = "There is no data available to output as JSON." + + # --- General Queries (if no specific query matched AND no filter was applied in this turn) --- + # These should not clear new_filtered_df_state unless it's a "clear chat" + # This block runs if df is None or if no specific df query matched. + if not response: # Only enter if no response has been generated by DataFrame/filter logic + if "how many items" in lower_message or "number of items" in lower_message: + # Check filtered state first, then full df, then raw chatbot_data list + if new_filtered_df_state is not None and not new_filtered_df_state.empty: + response = f"The currently filtered dataset has {len(new_filtered_df_state)} items." + if df is not None: + response += f" The original dataset has {len(df)} items." + elif df is not None: # Check df from original chatbot_data + response = f"There are {len(df)} items in the processed data." + elif isinstance(chatbot_data, list): # Fallback if df creation failed but chatbot_data is list + response = f"There are {len(chatbot_data)} top-level items in the processed data (not in DataFrame)." + elif isinstance(chatbot_data, dict): + response = "The processed data is a single dictionary, not a list of items." + else: + response = "The processed data is not a standard list or dictionary structure." + + elif "what is the structure" in lower_message or "tell me about the data" in lower_message: + # Describe filtered data structure if available, otherwise full data structure + if new_filtered_df_state is not None and not new_filtered_df_state.empty: + response = f"The filtered data is a table with {len(new_filtered_df_state)} rows and columns: {', '.join(new_filtered_df_state.columns)}. " + if df is not None: + response += f"The original data has columns: {', '.join(df.columns)}." + else: + response += "Original data structure is not tabular." + elif df is not None: + response = f"The data is a table with {len(df)} rows and columns: {', '.join(df.columns)}." + elif isinstance(chatbot_data, list) and chatbot_data: + # Provide structure of the first item if it's a dictionary + sample_item = chatbot_data[0] + if isinstance(sample_item, dict): + response = f"The data is a list containing {len(chatbot_data)} items. The first item has the following top-level keys: {list(sample_item.keys())}." 
+ else: + response = f"The data is a list containing {len(chatbot_data)} items. The first item is of type: {type(sample_item).__name__}." + elif isinstance(chatbot_data, dict): + response = f"The data is a dictionary with the following top-level keys: {list(chatbot_data.keys())}." + else: + response = "The processed data is not a standard list or dictionary structure that I can easily describe." + + # "show me" without a filter condition might be ambiguous. + # Let's assume it refers to the original data or provide guidance. + elif "show me" in lower_message or "get me" in lower_message or "extract" in lower_message: + # This specific 'show me' without 'where' should not trigger a filter or clear existing filter state. + # It's a general request for data, which is too broad. Guide the user. + response = "If you want to filter the data, please use a phrase like 'show items where column_name is value'. If you want to see the raw data, consider using the download buttons." + + # --- Speculation about Modifications --- + # These responses are purely informative and do not modify data or state. + elif "how can i modify" in lower_message or "how to change" in lower_message or "can i add" in lower_message or "can i remove" in lower_message: + response = "I cannot directly modify the data here, but I can tell you how you *could* modify it programmatically. What kind of change are you considering (e.g., adding an item, changing a value, removing a field)?" + elif "add a field" in lower_message or "add a column" in lower_message: + response = "To add a field (or column if the data is tabular), you would typically iterate through each item (or row) in the data and add the new key-value pair. For example, adding a 'status' field with a default value." + elif "change a value" in lower_message or "update a field" in lower_message: + response = "To change a value, you would need to identify the specific item(s) and the field you want to update. You could use a condition (like filtering) to find the right items and then assign a new value to the field." + elif "remove a field" in lower_message or "delete a column" in lower_message: + response = "To remove a field, you would iterate through each item and delete the specified key. Be careful, as this is irreversible." + elif "restructure" in lower_message or "change the format" in lower_message: + response = "Restructuring data involves transforming it into a different shape. This could mean flattening nested objects, grouping items, or pivoting data. This often requires writing custom code to map the old structure to the new one." + elif "what if i" in lower_message or "if i changed" in lower_message: + response = "Tell me what specific change you're contemplating, and I can speculate on the potential impact or how you might approach it programmatically." + + # --- General Conversation / Fallback --- + elif "hello" in lower_message or "hi" in lower_message: + response = random.choice(["Hello! How can I help you understand the processed data?", "Hi there! What's on your mind about this data?", "Hey! Ask me anything about the data you've loaded."]) + elif "thank you" in lower_message or "thanks" in lower_message: + response = random.choice(["You're welcome!", "Glad I could help.", "No problem! Let me know if you have more questions about the data."]) + elif "clear chat" in lower_message: # This should be caught by button, but as text too + # Gradio handles clearing the chatbot component state via the button action. 
+ # We just need to clear the filtered data state here. + response = "Chat history cleared." # Respond that chat is cleared + new_filtered_df_state = pd.DataFrame() # Clear filtered data on "clear chat" command by text (use empty DF) + elif not response: # Fallback if nothing else matched + response = random.choice([ + "I can analyze the data you've processed. What would you like to know? Try asking to filter data, e.g., 'show items where status is active'.", + "Ask me about the number of items, the structure, or values of specific fields. You can also filter data.", + "I can perform basic analysis or filter the data. For example: 'filter by price > 100'.", + "Tell me what you want to extract or filter from the data. Use phrases like 'show items where ...'.", + "I'm equipped to filter your data. Try 'find entries where name contains widget'." + ]) + + except Exception as e: # Catch errors during main chatbot logic (Try block 2 catch) logger.error(f"Chatbot runtime error: {e}") response = f"An internal error occurred while processing your request: {e}" response += "\nPlease try rephrasing your question or clear the chat history." - # On unexpected error, preserve the current_filtered_df_state rather than clearing or modifying it. - # new_filtered_df_state = current_filtered_df_state # This line is effectively already done by initialization - - # --- Finally block (optional, but good practice if cleanup is needed) --- - # finally: - # # Any cleanup code can go here - # pass + # new_filtered_df_state is already initialized to current_filtered_df_state, so it's preserved on error. + # --- Final Response Handling --- if not response: # Final safety net for response, if it's somehow still empty response = "I'm not sure how to respond to that. Please try rephrasing or ask for help on available commands." # Update the last message in chat history with the generated response # Find the last entry where the assistant's response is None - for i in reversed(range(len(chat_history))): - if chat_history[i][1] is None: - chat_history[i] = (chat_history[i][0], response) - break - # If no None placeholder was found (shouldn't happen with current logic), append as new entry - # else: - # chat_history.append((message, response)) - + # This logic assumes the user message was just appended as the last item with None + if chat_history and chat_history[-1][1] is None: + chat_history[-1][1] = response # Update the second element of the last tuple/list + else: + # Fallback just in case - append as a new entry + chat_history.append([message, response]) # Use original message here # Ensure chat_history is in the format Gradio expects for type='messages' # It should be a list of lists: [[user_msg, bot_msg], [user_msg, bot_msg], ...] # The current format List[Tuple[str, str]] works with type='messages' as tuples are treated like lists. + # However, modifying tuples is not possible, so ensure we are using lists if we modify in place. + # The initial append uses [message, None], so modifying chat_history[-1][1] is correct if it's a list. 
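One caveat on the history format discussed above: depending on the installed Gradio version, `gr.Chatbot(type='messages')` may expect role/content dictionaries rather than `[user, bot]` pairs. If that applies, a small adapter along these lines could convert the pair-style `chat_history` before it reaches the component; `pairs_to_messages` is a hypothetical helper, not part of app2.py.

```python
from typing import List, Optional

def pairs_to_messages(pairs: List[List[Optional[str]]]) -> List[dict]:
    """Convert [[user, bot], ...] history into role/content message dicts."""
    messages = []
    for user_msg, bot_msg in pairs:
        if user_msg is not None:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg is not None:
            messages.append({"role": "assistant", "content": bot_msg})
    return messages

history = [["how many items?", "There are 12 items in the processed data."],
           ["list columns", None]]  # bot reply still pending
print(pairs_to_messages(history))
```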
return chat_history, chatbot_data, new_filtered_df_state @@ -1552,10 +1786,60 @@ def create_modern_interface(): } """ with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface: - interface.head += """ - - """ + console.log("Updated Enabled QR Code Indices:", enabledStates); + + // Update visual style immediately + const itemDiv = checkbox.closest('.viewport-item'); + if (itemDiv) { + const img = itemDiv.querySelector('img'); + if (img) { + if (checkbox.checked) { + img.style.border = "2px solid green"; + img.style.opacity = "1.0"; + } else { + img.style.border = "2px solid lightgray"; + img.style.opacity = "0.5"; + } + } + } + }; + + // Initial setup might be needed if the viewport tab is the default or first shown + // setupViewportCheckboxes(); // This might run too early before elements exist. + // Rely on the tab.select event and _js parameter instead. + + ''') + with gr.Row(): crawl_depth_slider = gr.Slider( label="Crawl Depth", @@ -1584,8 +1892,8 @@ def create_modern_interface(): info="Select the maximum depth for crawling links (0-10)." ) - qr_code_paths = gr.State([]) - chatbot_data = gr.State(None) + qr_code_paths = gr.State([]) # Stores list of QR code file paths + chatbot_data = gr.State(None) # Stores the full processed data (List[Dict]) gr.Markdown(""" # 🌐 Advanced Data Processing & QR Code Generator Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor. @@ -1612,7 +1920,8 @@ def create_modern_interface(): ) with gr.Row(): example_btn = gr.Button("📝 Load Example", variant="secondary") - clear_btn = gr.Button("🗑️ Clear", variant="secondary") + clear_input_btn = gr.Button("🗑️ Clear Input", variant="secondary") # Renamed to avoid conflict + with gr.Row(): combine_data = gr.Checkbox( label="Combine all data into sequence", @@ -1641,11 +1950,16 @@ def create_modern_interface(): ) with gr.Tab("🖼️ QR Code Viewport") as viewport_tab: + # Hidden component for JS to update the state. Use elem_id for reliable targeting. 
+ enabled_qr_codes_hidden_state = gr.Textbox(value="[]", visible=False, label="__enabled_qr_codes_state__", elem_id="enabled_qr_codes_hidden_state_id") + enabled_qr_codes = gr.State([]) # The actual state variable updated by Python from the hidden textbox + + # The viewport_output will return HTML and trigger JS execution via _js viewport_output = gr.HTML(label="QR Code Sequence Viewport") - enabled_qr_codes = gr.State([]) + with gr.Tab("🤖 Chat with Data") as chat_tab: - chat_history = gr.State([]) + chat_history = gr.State([]) # Stores the chat history (List[List[str, str]]) chatbot = gr.Chatbot(label="Data Chatbot", type='messages') # Set type to 'messages' filtered_chatbot_df_state = gr.State(None) # To store the filtered DataFrame @@ -1655,9 +1969,11 @@ def create_modern_interface(): with gr.Row(): download_full_json_btn = gr.Button("Download Full JSON") download_filtered_json_btn = gr.Button("Download Filtered JSON") - download_file_output = gr.File(label="Download Data", interactive=False) # For triggering download + # A dummy File component to trigger downloads + download_file_output = gr.File(label="Download Data", interactive=False) clear_chat_btn = gr.Button("Clear Chat History") + # Event handlers must be defined within the Blocks context def load_example(): @@ -1689,13 +2005,23 @@ def create_modern_interface(): } return json.dumps(example, indent=2) - def clear_input(): + def clear_inputs(): # Clear all input fields and the chatbot data state - return "", None, "", None + # Also clear QR related states and chat history + return "", None, "", None, [], "[]", [], None # url, file, text, chatbot_data, qr_code_paths, enabled_qr_codes_hidden_state, chat_history, filtered_chatbot_df_state - def update_viewport(paths, enabled_states): + def update_viewport(paths, enabled_states_json_str): + """Updates the HTML viewport based on QR paths and enabled state.""" if not paths: - return "
No QR codes generated yet."
+            # Return HTML and an empty JS script
+            return "No QR codes generated yet.
", "" + + try: + enabled_states = json.loads(enabled_states_json_str) + if not isinstance(enabled_states, list): enabled_states = [] + except (json.JSONDecodeError, TypeError): + logger.warning("Failed to decode enabled_states JSON string. Resetting to empty list.") + enabled_states = [] num_qr_codes = len(paths) # Determine grid columns based on the number of QRs, aiming for a roughly square layout @@ -1704,26 +2030,30 @@ def create_modern_interface(): viewport_html = f'
' - # Ensure enabled_states is a list of indices if it's None or doesn't match current paths - if enabled_states is None or len(enabled_states) != num_qr_codes: - enabled_states = list(range(num_qr_codes)) + # Ensure enabled_states only contains valid indices + valid_enabled_states = [idx for idx in enabled_states if 0 <= idx < num_qr_codes] for i, path in enumerate(paths): - is_enabled = i in enabled_states - border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;" - opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;" + is_enabled = i in valid_enabled_states + # Initial border/opacity are set by JS on load/update based on checkbox state + # This makes the HTML simpler. + # border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;" + # opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;" # Use /file= prefix for Gradio to serve local files + # Add data-index and onchange handler to the checkbox viewport_html += f'
' - viewport_html += f'QR Code {i+1}' + viewport_html += f'QR Code {i+1}' # Style applied by JS # Add checkbox with data-index for JS to identify which QR it controls viewport_html += f'' viewport_html += '
' viewport_html += '
' - return viewport_html + # Return HTML and an empty JS script. The setupViewportCheckboxes() is called via _js on the select event. + return viewport_html, "" + def on_qr_generation(qr_paths_list): - """Handler to initialize enabled_qr_codes state after QR generation.""" + """Handler to initialize qr_code_paths and enabled_qr_codes states after QR generation.""" if qr_paths_list is None: num_qrs = 0 else: @@ -1731,8 +2061,10 @@ def create_modern_interface(): # Initially enable all generated QR codes initial_enabled_states = list(range(num_qrs)) - # Return the paths list and the initial enabled states - return qr_paths_list, initial_enabled_states + initial_enabled_states_json = json.dumps(initial_enabled_states) + + # Return the paths list and the initial enabled states (both list and JSON string) + return qr_paths_list, initial_enabled_states, initial_enabled_states_json def process_inputs(urls, files, text, combine, crawl_depth, generate_qr_enabled): """Process all inputs and generate QR codes based on toggle""" @@ -1746,13 +2078,18 @@ def create_modern_interface(): if text and text.strip(): try: json_data = json.loads(text) - results.append({ - 'source': 'json_input', - 'extracted_data': json_data, - 'timestamp': datetime.now().isoformat(), - 'processing_notes': ['Parsed from direct JSON input.'] - }) - processing_status_messages.append("✅ Successfully parsed direct JSON input.") + # Wrap single JSON objects in a list for consistency with file/url output + if not isinstance(json_data, list): + json_data = [json_data] + + for item in json_data: + results.append({ + 'source': 'json_input', + 'extracted_data': item, # Store each item separately + 'timestamp': datetime.now().isoformat(), + 'processing_notes': ['Parsed from direct JSON input.'] + }) + processing_status_messages.append(f"✅ Successfully parsed {len(json_data)} item(s) from direct JSON input.") except json.JSONDecodeError as e: processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}") logger.error(f"Invalid JSON format in text input: {e}") @@ -1768,38 +2105,63 @@ def create_modern_interface(): # Call fetch_content_with_depth which handles recursion content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth) - # The result from fetch_content_with_depth is already structured - # It includes the main fetch_result and linked_extractions + # The result from fetch_content_with_depth is a nested structure. + # We need to flatten this structure into a list of items for the main results list. 
+ def flatten_crawl_results(crawl_result_item): + flat_list = [] + # Add the current item's fetch result if it exists and was successful + fetch_res = crawl_result_item.get('fetch_result') + if fetch_res and fetch_res.get('metadata', {}).get('status_code') is not None and 200 <= fetch_res['metadata']['status_code'] < 300: + # Copy the item and add level info to the top level + item_copy = fetch_res.copy() + item_copy['source'] = 'url_crawl' + item_copy['original_request_url'] = crawl_result_item.get('url') + item_copy['crawl_level'] = crawl_result_item.get('level') + # Combine notes from the recursive call structure and the fetch_result structure + notes_from_crawl_structure = crawl_result_item.get('processing_notes', []) + notes_from_fetch_result = item_copy.get('processing_notes', []) + # Avoid duplicating notes already present in fetch_result if they came from the recursive call + combined_notes = list(set(notes_from_crawl_structure + notes_from_fetch_result)) + item_copy['processing_notes'] = combined_notes + flat_list.append(item_copy) + elif crawl_result_item: + # Add a placeholder for items that failed to fetch or were skipped + flat_list.append({ + 'source': 'url_crawl_skipped', + 'url': crawl_result_item.get('url'), + 'crawl_level': crawl_result_item.get('level'), + 'processing_notes': crawl_result_item.get('processing_notes', []) + }) + + # Recursively add linked extractions + for linked_result in crawl_result_item.get('linked_extractions', []): + flat_list.extend(flatten_crawl_results(linked_result)) + return flat_list + if content_result: # Check if a result dictionary was returned - results.append(content_result) - # Provide status based on the fetch_result within the recursive structure - main_fetch_status = content_result.get('fetch_result', {}).get('status_code') - if main_fetch_status is not None and 200 <= main_fetch_status < 300: - processing_status_messages.append(f"✅ Processed URL: {url} (Level 0, Status: {main_fetch_status})") - if content_result.get('processing_notes'): - processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}") - - # Count successfully processed linked pages - def count_successful_fetches(crawl_result): - count = 0 - if crawl_result and crawl_result.get('fetch_result') is not None: - status = crawl_result['fetch_result'].get('status_code') - if status is not None and 200 <= status < 300: - count += 1 - for linked_result in crawl_result.get('linked_extractions', []): - count += count_successful_fetches(linked_result) - return count - - total_attempted_links = len(content_result.get('linked_extractions', [])) - total_successful_linked = count_successful_fetches({'linked_extractions': content_result.get('linked_extractions', [])}) # Wrap to match expected structure - - if total_attempted_links > 0: - processing_status_messages.append(f" Processed {total_successful_linked}/{total_attempted_links} linked pages up to depth {crawl_depth}.") + flat_crawl_data = flatten_crawl_results(content_result) + results.extend(flat_crawl_data) + + # Provide status based on the root URL fetch result + root_fetch_result = content_result.get('fetch_result') + root_fetch_status = root_fetch_result.get('metadata', {}).get('status_code') if root_fetch_result else None + + if root_fetch_result and root_fetch_status is not None and 200 <= root_fetch_status < 300: + processing_status_messages.append(f"✅ Processed root URL: {url} (Level 0, Status: {root_fetch_status})") + # Count successful fetches within the flattened results + 
successful_fetches_count = sum(1 for item in flat_crawl_data if item.get('source') == 'url_crawl' and item.get('metadata', {}).get('status_code') is not None and 200 <= item['metadata']['status_code'] < 300) + total_nodes_count = len(flat_crawl_data) # Count all items in the flattened list + + if total_nodes_count > 1: # Only report linked pages if depth > 0 and links were found + processing_status_messages.append(f" Fetched {successful_fetches_count}/{total_nodes_count} pages in crawl up to depth {crawl_depth}.") else: - processing_status_messages.append(f"❌ Failed to fetch or process URL: {url} (Status: {main_fetch_status})") - if content_result.get('processing_notes'): - processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}") + processing_status_messages.append(f"❌ Failed to fetch or process root URL: {url} (Status: {root_fetch_status if root_fetch_status is not None else 'N/A'})") + + # Add notes from the root processing + if content_result.get('processing_notes'): + processing_status_messages.append(f" Notes for {url} (root): {'; '.join(content_result['processing_notes'])}") + else: processing_status_messages.append(f"❌ Failed to process URL: {url} (No result returned)") @@ -1807,6 +2169,8 @@ def create_modern_interface(): if files: for file in files: processing_status_messages.append(f"📁 Processing file: {file.name}...") + # Pass temp_dir info from the outer scope + # Note: _process_file creates its *own* temp dir context internally now file_results = file_processor.process_file(file) if file_results: results.extend(file_results) @@ -1840,6 +2204,8 @@ def create_modern_interface(): else: processing_status_messages.append("⚠️ No valid content collected from inputs.") final_json_output = [] # Ensure output_json is cleared if no results + qr_paths = [] # Ensure qr_paths is cleared if no results + except Exception as e: logger.error(f"Overall processing error in process_inputs: {e}") @@ -1848,11 +2214,12 @@ def create_modern_interface(): qr_paths = [] # Clear qrs on unexpected error # Return the processed data, QR paths, status messages, and update chatbot_data state + # The qr_code_paths state and enabled_qr_codes state are updated in the .then() block return ( - final_json_output, - [str(path) for path in qr_paths], # Return paths as strings for Gradio Gallery - "\n".join(processing_status_messages), - final_json_output # Update chatbot_data state + final_json_output, # output_json + [str(path) for path in qr_paths], # output_gallery (paths as strings) + "\n".join(processing_status_messages), # output_text + final_json_output # chatbot_data state ) # --- Download Logic --- @@ -1863,6 +2230,7 @@ def create_modern_interface(): return None try: # Convert DataFrame to list of dictionaries + # Use orient='records' for list of dicts format data_list = data_df.to_dict(orient='records') json_str = json.dumps(data_list, indent=2, ensure_ascii=False) @@ -1916,6 +2284,9 @@ def create_modern_interface(): # Handle cases where top-level items might not be dicts, wrap them elif isinstance(item, (list, str, int, float, bool, type(None))): flat_data.append({'item_value': item}) + else: + # Include a note for unsupported types in the flat data + flat_data.append({'unsupported_item_type': str(type(item))}) if not flat_data: @@ -1945,10 +2316,24 @@ def create_modern_interface(): # Pass the DataFrame directly to the generic download function return download_json_data(current_filtered_df_state, "filtered_data") + # Handler to update the enabled_qr_codes State 
from the hidden Textbox updated by JS + def update_enabled_qr_codes_state(enabled_states_json_str): + try: + enabled_states = json.loads(enabled_states_json_str) + if isinstance(enabled_states, list): + return enabled_states + else: + logger.warning("Received non-list data for enabled_qr_codes state.") + return [] # Reset to empty list on invalid data + except (json.JSONDecodeError, TypeError): + logger.warning("Failed to decode enabled_states JSON string. Resetting to empty list.") + return [] # Reset to empty list on decode error + # Connect event handlers within the Blocks context example_btn.click(load_example, inputs=[], outputs=text_input) - clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data]) + # Clear inputs button now clears all input components and relevant states + clear_input_btn.click(clear_inputs, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data, qr_code_paths, enabled_qr_codes_hidden_state, chat_history, filtered_chatbot_df_state]) process_btn.click( process_inputs, @@ -1956,13 +2341,30 @@ def create_modern_interface(): outputs=[output_json, output_gallery, output_text, chatbot_data] ).then( # This .then() is triggered after process_inputs completes and updates output_gallery + # It initializes the qr_code_paths state and the enabled_qr_codes state (both list and hidden JSON string) on_qr_generation, inputs=[output_gallery], # Pass the list of QR paths from the gallery output - outputs=[qr_code_paths, enabled_qr_codes] # Update the state variables + outputs=[qr_code_paths, enabled_qr_codes, enabled_qr_codes_hidden_state] # Update the state variables + ) + + # When the hidden enabled_qr_codes_hidden_state Textbox is updated by JS, + # update the actual enabled_qr_codes State variable. + enabled_qr_codes_hidden_state.change( + update_enabled_qr_codes_state, + inputs=[enabled_qr_codes_hidden_state], + outputs=[enabled_qr_codes] ) - # When the viewport tab is selected, update the viewport HTML - viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output]) + + # When the viewport tab is selected, update the viewport HTML and trigger JS setup + # Use the hidden JSON string state as input because the JS updates it directly + viewport_tab.select( + update_viewport, + inputs=[qr_code_paths, enabled_qr_codes_hidden_state], + outputs=[viewport_output], + # The _js parameter allows running JS after the output is updated + _js="setupViewportCheckboxes();" + ) # Chatbot send button and text input submit events send_msg_btn.click( @@ -1989,10 +2391,10 @@ def create_modern_interface(): # Clear chat history button clear_chat_btn.click( - # Clear chat history component and the filtered data state - lambda: ([], None), + # Clear chat history component, the filtered data state, and the chat_history state variable + lambda: ([], None, []), # Clear chatbot component, filtered_df state, and chat_history state inputs=None, - outputs=[chatbot, filtered_chatbot_df_state] + outputs=[chatbot, filtered_chatbot_df_state, chat_history] ) # Download buttons @@ -2012,23 +2414,23 @@ def create_modern_interface(): ### 🚀 Features - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth. 
**(Now performs real fetching)** - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*). **(Now performs real file processing)** - - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs. - - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives. **(Now performs real extraction)** + - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs. Handles single JSON objects or lists. + - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz, .bz2, .xz archives. Includes basic Zip Slip/Tar Slip prevention. **(Now performs real extraction)** - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification. - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item. - - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data. + - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data. Uses a more robust chunking method based on byte length. - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing. - **Modern Design**: Clean, responsive interface with visual feedback. - - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information. + - **Data Chatbot**: Interact conversationally with the processed JSON data to ask questions about its structure, content, or request specific information. Supports filtering data based on column values. ### 💡 Tips - 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**. - 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats. + 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**. Crawling is limited to the same domain or subdomains for safety and relevance. + 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats. Supported archive types include .zip, .tar, .gz, .tgz, .tar.gz, .bz2, .tar.bz2, .xz, .tar.xz. Support for .7z and .rar requires external libraries. 3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure. - 4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries (`PyPDF2`, `python-docx`, `pyth`, `odfpy`). Check the console logs for warnings if a library is missing. - 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. + 4. 
**Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries (`PyPDF2`, `python-docx`, `pyth`, `odfpy`). Check the console logs for warnings if a library is missing. `pillow[extra]` might be needed for some image features (though not strictly used for QR generation itself here). + 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. QR code generation must be explicitly enabled. 6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps. 7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images. - 8. **Chatbot**: After processing data, go to the "Chat with Data" tab to ask questions about the JSON output. + 8. **Chatbot**: After processing data, go to the "Chat with Data" tab to ask questions about the JSON output. You can filter data using phrases like 'show items where price > 100' or 'filter by category is electronics'. ### ⚙️ QR Code Viewport Instructions 1. Navigate to the **QR Code Viewport** tab after generating QR codes. 2. The generated QR codes will be displayed in a grid based on their total count. @@ -2051,8 +2453,11 @@ def main(): except Exception as e: logger.error(f"Application startup error: {e}") print(f"\nFatal Error: {e}\nCheck the logs for details.") - raise + # Optionally log traceback for more detailed error info + # import traceback + # logger.error(traceback.format_exc()) + raise # Re-raise the exception after logging if __name__ == "__main__": # Ensure the script is run directly (not imported) - main() \ No newline at end of file + main()
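As a footnote to the archive-support bullet above, which mentions basic Zip Slip/Tar Slip prevention: the essence of that check is to resolve each member's destination path and refuse anything that would escape the extraction directory. A minimal sketch under that assumption; `safe_extract_zip` is an illustrative helper name, not the function used in app2.py.

```python
import os
import zipfile

def safe_extract_zip(archive_path: str, dest_dir: str) -> list:
    """Extract a zip archive, skipping members that would escape dest_dir."""
    extracted = []
    dest_root = os.path.realpath(dest_dir)
    with zipfile.ZipFile(archive_path) as zf:
        for member in zf.namelist():
            target = os.path.realpath(os.path.join(dest_root, member))
            # Zip Slip check: the resolved target must stay inside dest_dir
            if not target.startswith(dest_root + os.sep):
                continue  # skip entries like "../../etc/passwd"
            zf.extract(member, dest_root)
            extracted.append(target)
    return extracted
```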