acecalisto3 committed
Commit 87efc94 · verified · 1 parent: 3a36f7c

Update app.py

Files changed (1):
  app.py +491 -1053
app.py CHANGED
@@ -23,11 +23,6 @@ from PIL import Image, ImageDraw, ImageFont
23
  import numpy as np
24
  import tarfile
25
  import gzip
26
- import networkx as nx
27
- import matplotlib.pyplot as plt
28
- from matplotlib.colors import to_rgba
29
- import io
30
- import math
31
 
32
  # Setup enhanced logging with more detailed formatting
33
  logging.basicConfig(
@@ -48,6 +43,7 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
48
 
49
  class EnhancedURLProcessor:
50
  """Advanced URL processing with complete content extraction"""
+
51
  def __init__(self):
52
  self.session = requests.Session()
53
  self.timeout = 15 # Extended timeout for larger content
@@ -56,7 +52,7 @@ class EnhancedURLProcessor:
56
 
57
  # Enhanced headers for better site compatibility
58
  self.session.headers.update({
59
- 'User-Agent': self.user_agent.random, # Corrected spacing
+ 'User-Agent': self.user_agent.random,
60
  'Accept': '*/*', # Accept all content types
61
  'Accept-Language': 'en-US,en;q=0.9',
62
  'Accept-Encoding': 'gzip, deflate, br',
@@ -65,7 +61,7 @@ class EnhancedURLProcessor:
65
  'Sec-Fetch-Dest': 'document',
66
  'Sec-Fetch-Mode': 'navigate',
67
  'Sec-Fetch-Site': 'none',
68
- 'Sec-Fetch-User': '?1', # Corrected spacing
69
  'DNT': '1'
70
  })
71
 
@@ -78,32 +74,22 @@ class EnhancedURLProcessor:
78
  if not all([parsed.scheme, parsed.netloc]):
79
  return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
80
  # Try HEAD request first to check accessibility
81
- head_response = None # Initialize head_response
82
  try:
83
  head_response = self.session.head(url, timeout=5)
84
  head_response.raise_for_status()
85
- # Need details from head_response if successful
86
- details = {
87
- 'content_type': head_response.headers.get('Content-Type', 'unknown'),
88
- 'server': head_response.headers.get('Server', 'unknown'),
89
- 'size': head_response.headers.get('Content-Length', 'unknown')
90
- }
91
  except requests.exceptions.RequestException:
92
  # If HEAD fails, try GET as some servers don't support HEAD
93
- logger.info(f"HEAD request failed for {url}, trying GET.")
94
  response = self.session.get(url, timeout=self.timeout)
95
  response.raise_for_status()
96
- # Use details from GET response if HEAD failed
97
- details = {
98
- 'content_type': response.headers.get('Content-Type', 'unknown'),
99
- 'server': response.headers.get('Server', 'unknown'),
100
- 'size': response.headers.get('Content-Length', 'unknown') # Might not be accurate for GET stream
101
- }
102
 
103
  return {
104
  'is_valid': True,
105
  'message': 'URL is valid and accessible',
106
- 'details': details
107
  }
108
  except Exception as e:
109
  return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
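For reference, the HEAD-then-GET fallback above is the crux of validate_url and can be exercised on its own. A minimal sketch (check_url_accessible is a hypothetical name; the real class reuses its configured session and headers):

import requests

def check_url_accessible(url: str, timeout: int = 5) -> dict:
    session = requests.Session()
    try:
        # Try HEAD first; it is cheap but some servers reject it.
        resp = session.head(url, timeout=timeout)
        resp.raise_for_status()
    except requests.exceptions.RequestException:
        # Fall back to GET for servers that do not support HEAD.
        resp = session.get(url, timeout=timeout)
        resp.raise_for_status()
    return {
        'content_type': resp.headers.get('Content-Type', 'unknown'),
        'server': resp.headers.get('Server', 'unknown'),
        'size': resp.headers.get('Content-Length', 'unknown'),  # may be missing on chunked responses
    }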
@@ -114,7 +100,7 @@ class EnhancedURLProcessor:
114
  logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
115
 
116
  # Update User-Agent randomly for each request
117
- self.session.headers.update({'User-Agent': self.user_agent.random}) # Corrected spacing
118
 
119
  response = self.session.get(url, timeout=self.timeout)
120
  response.raise_for_status()
@@ -124,11 +110,10 @@ class EnhancedURLProcessor:
124
  encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
125
  else:
126
  encoding = response.encoding
127
-
128
  # Decode content with fallback
129
  try:
130
  raw_content = response.content.decode(encoding, errors='replace')
131
- except (UnicodeDecodeError, LookupError): # Corrected error type
132
  raw_content = response.content.decode('utf-8', errors='replace')
133
 
134
  # Extract metadata
@@ -147,10 +132,10 @@ class EnhancedURLProcessor:
147
  if 'text/html' in content_type:
148
  processed_content = self._process_html_content(raw_content, url)
149
  else:
150
- processed_content = raw_content # Store raw non-html content as processed
151
  return {
152
  'content': processed_content,
153
- 'raw_content': raw_content, # Keep raw bytes if needed elsewhere
154
  'metadata': metadata
155
  }
156
  except requests.exceptions.RequestException as e:
@@ -174,890 +159,460 @@ class EnhancedURLProcessor:
174
  for attr in ['href', 'src']:
175
  if tag.get(attr):
176
  try:
177
- # Handle potential base tag
178
- base = soup.find('base')
179
- current_base_url = base['href'] if base and base.get('href') else base_url
180
- tag[attr] = urljoin(current_base_url, tag[attr])
181
- except Exception as url_e:
182
- # logger.warning(f"Could not absolutize URL {tag.get(attr)} in {base_url}: {url_e}")
183
- pass # Keep original if conversion fails
184
-
185
- # Extract all text content more cleanly
186
- text_parts = [element for element in soup.stripped_strings]
187
- # text_content = ' '.join(text_parts) # Join with space instead of newline? Depends on use case.
188
- # Or keep newlines for structure:
189
- text_content = '\n'.join(text_parts)
190
-
191
- # Alternative: Get all text including scripts/styles if needed
192
- # text_content = soup.get_text(separator='\n', strip=True)
193
-
194
- return text_content
195
  except Exception as e:
196
  logger.error(f"HTML processing error: {e}")
197
- # Return original content if parsing fails
198
  return content
199
 
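The HTML pass above condenses to: honor any <base href>, absolutize href/src attributes, and join the visible strings. A sketch assuming beautifulsoup4 (html_to_text is a hypothetical name):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def html_to_text(html: str, base_url: str) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    base_tag = soup.find('base')
    root = base_tag['href'] if base_tag and base_tag.get('href') else base_url
    for tag in soup.find_all(True):          # every tag in the document
        for attr in ('href', 'src'):
            if tag.get(attr):
                tag[attr] = urljoin(root, tag[attr])
    return '\n'.join(soup.stripped_strings)  # visible text, whitespace-trimmed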
200
  class EnhancedFileProcessor:
201
  """Advanced file processing with complete content extraction"""
 
202
  def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default
203
  self.max_file_size = max_file_size
204
- # Added more potential text/data formats
205
  self.supported_extensions = {
206
- '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
207
- '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h', # Code files
208
- '.zip', '.tar', '.gz', '.bz2', # No .7z, .rar without external libs
209
- # '.pdf', '.doc', '.docx', '.rtf', '.odt' # These require more specific libraries (PyPDF2, python-docx etc.) - keep commented unless implemented
210
- }
211
- # Define extensions that should be treated primarily as text
212
- self.text_extensions = {
213
- '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
214
- '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h'
215
  }
216
 
217
-
218
  def process_file(self, file) -> List[Dict]:
219
  """Process uploaded file with enhanced error handling and complete extraction"""
220
- if not file or not hasattr(file, 'name'):
221
- logger.warning("Invalid file object received in process_file.")
222
- return []
223
 
224
  dataset = []
225
- file_path_obj = Path(file.name)
226
-
227
  try:
228
- # Use Gradio's temp file path directly
229
- file_path = file_path_obj.resolve()
230
- if not file_path.exists():
231
- logger.error(f"File path does not exist: {file_path}")
232
- return []
233
-
234
- file_size = file_path.stat().st_size
235
  if file_size > self.max_file_size:
236
- logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size ({self.max_file_size} bytes) for {file_path.name}")
237
- # Optionally return a specific error message entry
238
- # return [{'error': 'File too large', 'filename': file_path.name}]
239
  return []
240
 
241
- file_suffix = file_path.suffix.lower()
242
-
243
- # Check if supported at all
244
- # if file_suffix not in self.supported_extensions and not self._is_archive(str(file_path)):
245
- # logger.warning(f"Unsupported file type based on extension: {file_path.name}")
246
- # # Decide if you want to try processing anyway or return
247
- # # return [{'error': 'Unsupported file type', 'filename': file_path.name}]
248
- # # Let's try processing anyway, _process_single_file will handle text reading
249
- # pass # Continue to attempt processing
250
-
251
- # Use a persistent temp directory if needed across calls, otherwise TemporaryDirectory is fine
252
- with tempfile.TemporaryDirectory(dir=TEMP_DIR) as temp_dir: # Use configured temp dir
253
  temp_dir_path = Path(temp_dir)
254
 
255
- # Handle archives first
256
- if self._is_archive(str(file_path)):
257
- logger.info(f"Processing archive file: {file_path.name}")
258
- dataset.extend(self._process_archive(str(file_path), temp_dir_path))
 
259
  else:
260
- # Process as single file (might be text or something else)
261
- logger.info(f"Processing single file: {file_path.name}")
262
- # Pass the path string or Path object to _process_single_file
263
- dataset.extend(self._process_single_file(file_path))
264
-
265
 
266
  except Exception as e:
267
- logger.error(f"Error processing file '{file_path_obj.name}': {str(e)}", exc_info=True) # Log stack trace
268
- # Optionally return error entry
269
- # dataset.append({'error': f'Processing failed: {str(e)}', 'filename': file_path_obj.name})
270
- return [] # Return empty list on error for now
271
  return dataset
272
 
273
  def _is_archive(self, filepath: str) -> bool:
274
- """Check if file is a supported archive type"""
275
- # Only include archive types we can handle
276
- return filepath.lower().endswith(('.zip', '.tar', '.tar.gz', '.tgz', '.gz', '.bz2')) # Added bz2 if bz2 lib is imported
 
277
 
278
- def _process_single_file(self, file_path: Union[str, Path]) -> List[Dict]:
279
  """Process a single file with enhanced character extraction and JSON handling"""
280
- # Ensure file_path is a Path object
281
- file_path = Path(file_path)
282
- file_name = file_path.name
283
- file_suffix = file_path.suffix.lower()
284
-
285
  try:
286
- file_stat = file_path.stat()
287
  file_size = file_stat.st_size
288
- mime_type, _ = mimetypes.guess_type(file_path)
289
- mime_type = mime_type or 'application/octet-stream' # Default if guess fails
290
 
291
  # Initialize content storage
292
- complete_content = None
293
- is_json_like = file_suffix == '.json' or 'json' in mime_type
294
 
295
- # Try reading as text first if it's a text-like extension or potentially text mime type
296
- # Increased chunk size for efficiency on larger text files
297
  chunk_size = 10 * 1024 * 1024 # 10MB chunks
298
- if file_suffix in self.text_extensions or (mime_type and mime_type.startswith('text/')):
299
- content_parts = []
300
- detected_encoding = 'utf-8' # Default
301
- try:
302
- with open(file_path, 'rb') as f:
303
- # Detect encoding from the first chunk for better accuracy
304
- first_chunk = f.read(chunk_size)
305
- if first_chunk:
306
- detected_encoding = chardet.detect(first_chunk)['encoding'] or 'utf-8'
307
- logger.info(f"Detected encoding for {file_name}: {detected_encoding}")
308
- # Rewind or reopen might be cleaner if needed, but let's decode first chunk
309
- try:
310
- decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
311
- content_parts.append(decoded_chunk)
312
- except (UnicodeDecodeError, LookupError):
313
- logger.warning(f"Failed to decode first chunk with {detected_encoding}, falling back to utf-8 for {file_name}")
314
- detected_encoding = 'utf-8' # Fallback for subsequent reads
315
- decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
316
- content_parts.append(decoded_chunk)
317
-
318
- # Read remaining chunks
319
- while True:
320
- chunk = f.read(chunk_size)
321
- if not chunk:
322
- break
323
- try:
324
- decoded_chunk = chunk.decode(detected_encoding, errors='replace')
325
- content_parts.append(decoded_chunk)
326
- except (UnicodeDecodeError, LookupError):
327
- # Should not happen if fallback already occurred, but good practice
328
- logger.warning(f"Decoding error in subsequent chunk for {file_name}, using replace.")
329
- decoded_chunk = chunk.decode(detected_encoding, errors='replace')
330
- content_parts.append(decoded_chunk)
331
-
332
- complete_content = ''.join(content_parts)
333
- logger.info(f"Successfully read text content from {file_name}")
334
-
335
- except IOError as e:
336
- logger.error(f"IOError reading file {file_name}: {e}")
337
- return [] # Cannot process if read fails
338
- except Exception as e:
339
- logger.error(f"Error reading text file {file_name}: {e}", exc_info=True)
340
- # Decide if we should return or try other methods
341
- return []
342
-
343
-
344
- # Now, check if the read text content IS valid JSON
345
- json_data = None
346
- raw_json_content = None # Store the raw string if it was JSON
347
- if complete_content is not None:
348
- try:
349
  json_data = json.loads(complete_content)
350
- # It is JSON! Update metadata
351
- raw_json_content = complete_content # Keep the original string
352
- complete_content = json_data # Now content holds the parsed object
353
- mime_type = 'application/json' # Correct mime type
354
- source = 'json_content_detected'
355
- if file_suffix == '.json':
356
- source = 'json_file'
357
- logger.info(f"Successfully parsed JSON content from {file_name}")
358
-
359
- except json.JSONDecodeError:
360
- # It looked like text, but wasn't valid JSON
361
- if is_json_like:
362
- logger.warning(f"File {file_name} has JSON extension/mime but failed to parse.")
363
- # Keep complete_content as the string it was read as
364
- source = 'text_file'
365
- except Exception as e:
366
- logger.error(f"Unexpected error during JSON parsing check for {file_name}: {e}")
367
- # Keep complete_content as string, mark as text file
368
- source = 'text_file'
369
- else:
370
- # File wasn't identified as text or failed to read
371
- # Could attempt binary read here if needed, or just mark as non-text
372
- logger.warning(f"Could not read {file_name} as text. Storing metadata only or treating as binary.")
373
- source = 'binary_file' # Or 'unreadable_file'
374
- complete_content = f"Binary or unreadable content ({file_size} bytes)" # Placeholder
375
-
376
 
377
- # Structure the output
378
- result = {
379
- 'source': source,
380
- 'filename': file_name,
381
  'file_size': file_size,
382
- 'mime_type': mime_type,
383
  'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
384
  'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
385
- 'content': complete_content, # This is parsed JSON if successful, or text string, or placeholder
386
  'timestamp': datetime.now().isoformat()
387
- }
388
- if raw_json_content:
389
- result['raw_content'] = raw_json_content # Add raw string if it was JSON
390
-
391
- return [result]
392
-
393
- except FileNotFoundError:
394
- logger.error(f"File not found during processing: {file_path}")
395
- return []
396
  except Exception as e:
397
- logger.error(f"File processing error for {file_path.name}: {e}", exc_info=True)
398
  return []
399
 
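_process_single_file above detects the encoding once, on the first chunk, then reuses it for the rest of the file. A compact sketch of that pattern (read_text_file is hypothetical; assumes chardet; note that a chunk boundary can split a multi-byte sequence, which errors='replace' papers over):

import chardet

def read_text_file(path: str, chunk_size: int = 10 * 1024 * 1024) -> str:
    parts = []
    with open(path, 'rb') as f:
        first = f.read(chunk_size)
        if not first:
            return ''
        # Detect once on the first chunk; fall back to UTF-8 if that fails.
        encoding = chardet.detect(first)['encoding'] or 'utf-8'
        try:
            parts.append(first.decode(encoding, errors='replace'))
        except (UnicodeDecodeError, LookupError):
            encoding = 'utf-8'
            parts.append(first.decode(encoding, errors='replace'))
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            parts.append(chunk.decode(encoding, errors='replace'))
    return ''.join(parts)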
400
  def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
401
  """Process an archive file with enhanced extraction"""
402
  dataset = []
403
- archive_path_obj = Path(archive_path)
404
- logger.info(f"Attempting to extract archive: {archive_path_obj.name}")
405
-
406
  try:
407
  # Handle ZIP archives
408
- if archive_path.lower().endswith('.zip') and zipfile.is_zipfile(archive_path):
409
- logger.debug(f"Processing ZIP file: {archive_path_obj.name}")
410
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
411
- # Check for zip bomb potential (optional, basic check)
412
- total_uncompressed_size = sum(file.file_size for file in zip_ref.infolist())
413
- # Add a limit, e.g., 10x the archive size or an absolute limit like 10GB
414
- if total_uncompressed_size > self.max_file_size * 10: # Example limit
415
- logger.warning(f"Potential zip bomb detected: {archive_path_obj.name}, uncompressed size {total_uncompressed_size}")
416
- return [{'error': 'Archive potential bomb', 'filename': archive_path_obj.name}]
417
-
418
  for file_info in zip_ref.infolist():
419
- # Avoid directory entries and potential path traversal issues
420
- if not file_info.is_dir() and file_info.filename and not file_info.filename.startswith('/') and '..' not in file_info.filename:
421
- try:
422
- extracted_path = extract_to / file_info.filename
423
- # Ensure parent directory exists
424
- extracted_path.parent.mkdir(parents=True, exist_ok=True)
425
-
426
- # Extract individual file safely
427
- with zip_ref.open(file_info.filename) as source, open(extracted_path, "wb") as target:
428
- target.write(source.read())
429
-
430
- logger.debug(f"Extracted {file_info.filename} from zip.")
431
- # Now process the extracted file
432
- dataset.extend(self._process_single_file(extracted_path))
433
- except Exception as extract_err:
434
- logger.error(f"Failed to extract/process file {file_info.filename} from zip {archive_path_obj.name}: {extract_err}")
435
-
436
- # Handle TAR archives (covers .tar, .tar.gz, .tgz, .tar.bz2)
437
- # Need to import bz2 if supporting .bz2
438
- elif tarfile.is_tarfile(archive_path):
439
- logger.debug(f"Processing TAR file: {archive_path_obj.name}")
440
- # Mode 'r:*' auto-detects compression (gz, bz2, xz if libs available)
441
- with tarfile.open(archive_path, 'r:*') as tar_ref:
442
- # Add security checks for tar extraction if needed (e.g., checking paths)
443
- for member in tar_ref.getmembers():
444
- if member.isfile() and member.name and not member.name.startswith('/') and '..' not in member.name:
445
- try:
446
- # Construct safe path
447
- extracted_path = extract_to / member.name
448
- extracted_path.parent.mkdir(parents=True, exist_ok=True)
449
- # Extract safely
450
- with tar_ref.extractfile(member) as source, open(extracted_path, "wb") as target:
451
- target.write(source.read())
452
-
453
- logger.debug(f"Extracted {member.name} from tar.")
454
- dataset.extend(self._process_single_file(extracted_path))
455
- except Exception as extract_err:
456
- logger.error(f"Failed to extract/process member {member.name} from tar {archive_path_obj.name}: {extract_err}")
457
-
458
- # Handle GZIP archives (single file compression) - check it's not a tar.gz
459
- elif archive_path.lower().endswith('.gz') and not archive_path.lower().endswith('.tar.gz'):
460
- logger.debug(f"Processing GZIP file: {archive_path_obj.name}")
461
- # Need to determine the output filename (remove .gz)
462
- extracted_filename = archive_path_obj.stem
463
- # Handle cases like '.txt.gz' -> '.txt'
464
- if '.' in extracted_filename:
465
- extracted_path = extract_to / extracted_filename
466
- else:
467
- # If no inner extension (e.g., 'myfile.gz'), maybe add a default like '.bin' or leave as is?
468
- extracted_path = extract_to / (extracted_filename + ".bin") # Example
469
-
470
- try:
471
- extracted_path.parent.mkdir(parents=True, exist_ok=True)
472
  with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
473
  outfile.write(gz_file.read())
474
- logger.debug(f"Extracted {extracted_path.name} from gzip.")
475
- dataset.extend(self._process_single_file(extracted_path))
476
- except gzip.BadGzipFile as e:
477
- logger.error(f"Error processing GZIP archive {archive_path_obj.name}: Bad Gzip File - {e}")
478
- except Exception as extract_err:
479
- logger.error(f"Failed to extract/process gzip file {archive_path_obj.name}: {extract_err}")
480
-
481
- # Add BZ2 single file support (requires bz2 import)
482
- elif archive_path.lower().endswith('.bz2') and not archive_path.lower().endswith('.tar.bz2'):
483
- logger.debug(f"Processing BZ2 file: {archive_path_obj.name}")
484
- try:
485
- import bz2
486
- extracted_filename = archive_path_obj.stem
487
- extracted_path = extract_to / extracted_filename
488
- if '.' not in extracted_filename:
489
- extracted_path = extract_to / (extracted_filename + ".bin")
490
-
491
- extracted_path.parent.mkdir(parents=True, exist_ok=True)
492
- with bz2.open(archive_path, 'rb') as bz2_file, open(extracted_path, 'wb') as outfile:
493
- outfile.write(bz2_file.read())
494
- logger.debug(f"Extracted {extracted_path.name} from bz2.")
495
- dataset.extend(self._process_single_file(extracted_path))
496
-
497
- except ImportError:
498
- logger.warning("bz2 library not available, cannot process .bz2 files.")
499
- except Exception as extract_err:
500
- logger.error(f"Failed to extract/process bz2 file {archive_path_obj.name}: {extract_err}")
501
-
502
-
503
- # Placeholder for other types or if no specific handler matched
504
- else:
505
- logger.warning(f"Archive type not explicitly handled or not a recognized archive: {archive_path_obj.name}")
506
-
507
 
508
- except FileNotFoundError:
509
- logger.error(f"Archive file not found: {archive_path}")
510
- except (zipfile.BadZipFile, tarfile.TarError, gzip.BadGzipFile) as archive_err:
511
- logger.error(f"Invalid or corrupted archive file {archive_path_obj.name}: {archive_err}")
512
- dataset.append({'error': f'Corrupted archive: {archive_err}', 'filename': archive_path_obj.name})
513
  except Exception as e:
514
- logger.error(f"General archive processing error for {archive_path_obj.name}: {e}", exc_info=True)
515
- dataset.append({'error': f'Archive processing failed: {e}', 'filename': archive_path_obj.name})
516
  return dataset
517
 
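The filename checks above (no leading '/', no '..') are a zip-slip guard. The same guard can be written with resolved paths; a sketch (safe_extract_zip is hypothetical, not the app's exact code):

import zipfile
from pathlib import Path

def safe_extract_zip(archive: str, dest: Path) -> list[Path]:
    dest = dest.resolve()
    extracted = []
    with zipfile.ZipFile(archive) as zf:
        for info in zf.infolist():
            if info.is_dir():
                continue
            target = (dest / info.filename).resolve()
            if dest not in target.parents:   # path escapes dest: skip it
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            with zf.open(info) as src, open(target, 'wb') as out:
                out.write(src.read())
            extracted.append(target)
    return extracted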
518
- # Adjusted chunk_data with recommended max_size for QR codes
519
- def chunk_data(self, data: Union[Dict, List, str], max_size: int = 1800) -> List[Dict]:
520
- """Enhanced data chunking with sequence metadata, sized for QR codes."""
521
  try:
522
- if not isinstance(data, str):
523
- # Convert complex data to JSON string first
524
- # Use separators=(',', ':') for compact JSON
525
- json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
526
- else:
527
- json_str = data # Assume input string is already the data payload
528
-
529
- # Data here is the raw string (or JSON string) payload for the QR code
530
- total_length = len(json_str.encode('utf-8')) # Use byte length for QR capacity
531
- logger.debug(f"Chunking data of total byte length: {total_length}")
 
 
 
532
 
 
 
533
 
534
- # Simplified: If the data fits within max_size (bytes), return one chunk object
535
- # The chunk object itself adds metadata, but the 'data' field is what matters for QR limit.
536
- if total_length <= max_size:
537
- chunk_meta = {
538
  "chunk_index": 0,
539
  "total_chunks": 1,
540
- "total_length": total_length, # Store byte length
541
- "chunk_hash": hash(json_str) & 0xFFFFFFFF,
542
- "data": json_str # The actual string payload
543
  }
544
- logger.debug(f"Data fits in one chunk (payload size {total_length} bytes)")
545
- return [chunk_meta]
546
-
547
- # If data exceeds max_size, split the string payload
548
- # We need to split the *string* representation carefully
549
- # Aim for byte size chunks, which is tricky with UTF-8 variable char width
550
- # Simple approach: estimate character chunk size based on bytes
551
-
552
- # Estimate average bytes per character (crude but simple)
553
- avg_bytes_per_char = total_length / len(json_str) if len(json_str) > 0 else 1
554
- # Calculate target character chunk size based on byte limit
555
- target_char_chunk_size = int(max_size / avg_bytes_per_char)
556
-
557
- if target_char_chunk_size < 1: target_char_chunk_size = 1 # Avoid zero chunk size
558
 
559
- # Calculate number of chunks based on estimated character size
560
- num_chunks = math.ceil(len(json_str) / target_char_chunk_size)
 
561
 
562
  chunks = []
563
- start_char_idx = 0
564
  for i in range(num_chunks):
565
- # Calculate end index, ensuring we don't overshoot
566
- end_char_idx = min(start_char_idx + target_char_chunk_size, len(json_str))
567
-
568
- # Extract the character chunk
569
- chunk_payload_str = json_str[start_char_idx:end_char_idx]
570
-
571
- # Recalculate actual byte length for this specific chunk
572
- current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
573
-
574
- # Adjust end_char_idx if current chunk exceeds max_size (rare if estimate is decent)
575
- while current_chunk_byte_length > max_size and end_char_idx > start_char_idx:
576
- end_char_idx -= 1 # Reduce characters
577
- chunk_payload_str = json_str[start_char_idx:end_char_idx]
578
- current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
579
-
580
- if not chunk_payload_str and start_char_idx < len(json_str):
581
- # This should not happen with the logic above, but as a safeguard
582
- logger.error("Chunking resulted in empty payload string unexpectedly.")
583
- # Handle error: skip, break, or adjust logic
584
- break # Avoid infinite loop
585
 
586
- chunk_meta = {
587
  "chunk_index": i,
588
  "total_chunks": num_chunks,
589
- "total_length": total_length, # Original total byte length
590
- "chunk_byte_length": current_chunk_byte_length, # Actual byte length of this chunk's payload
591
- "chunk_hash": hash(chunk_payload_str) & 0xFFFFFFFF,
592
- "data": chunk_payload_str # The string payload for this chunk
593
  }
594
- chunks.append(chunk_meta)
595
- logger.debug(f"Created chunk {i+1}/{num_chunks}, payload byte size: {current_chunk_byte_length}")
596
-
597
- # Move to the next starting point
598
- start_char_idx = end_char_idx
599
-
600
- # Safety break if start index doesn't advance
601
- if start_char_idx == len(json_str) and i + 1 < num_chunks:
602
- logger.warning(f"Chunking finished early at index {i+1} of {num_chunks}. Check logic.")
603
- # Adjust total_chunks if ending early?
604
- for ch in chunks: ch['total_chunks'] = len(chunks)
605
- break
606
-
607
-
608
- # Final check if total chunks changed
609
- if chunks and chunks[0]['total_chunks'] != len(chunks):
610
- logger.warning(f"Adjusting total_chunks from {chunks[0]['total_chunks']} to {len(chunks)}")
611
- final_num_chunks = len(chunks)
612
- for i, chunk in enumerate(chunks):
613
- chunk['total_chunks'] = final_num_chunks
614
- chunk['chunk_index'] = i # Re-index just in case
615
-
616
 
617
  return chunks
618
  except Exception as e:
619
- logger.error(f"Error chunking data: {e}", exc_info=True)
620
  return []
621
 
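chunk_data above sizes chunks from an estimated bytes-per-character ratio and then shrinks overruns. An alternative that walks characters and never splits a code point over the byte budget (split_by_utf8_bytes is a hypothetical helper, not the app's code):

def split_by_utf8_bytes(s: str, max_size: int = 1800) -> list[str]:
    chunks, buf, used = [], [], 0
    for ch in s:
        n = len(ch.encode('utf-8'))          # 1-4 bytes per character
        if used + n > max_size and buf:
            chunks.append(''.join(buf))
            buf, used = [], 0
        buf.append(ch)
        used += n
    if buf:
        chunks.append(''.join(buf))
    return chunks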
622
-
623
- def generate_stylish_qr(data: str, # Expecting string data from chunking
624
  filename: str,
625
  size: int = 10,
626
  border: int = 4,
627
  fill_color: str = "#000000",
628
- back_color: str = "#FFFFFF",
629
- error_correction_level=qrcode.constants.ERROR_CORRECT_H) -> str: # Added param
630
  """Generate a stylish QR code with enhanced visual appeal"""
631
  try:
632
  qr = qrcode.QRCode(
633
- version=None, # Auto-detect version
634
- error_correction=error_correction_level, # Use parameter
635
  box_size=size,
636
  border=border
637
  )
638
 
639
- # Add string data directly (should be from chunker)
640
- qr.add_data(data)
641
 
642
- # Let the library figure out the best version and mode
643
  qr.make(fit=True)
644
 
645
- logger.info(f"Generating QR code version {qr.version} for {filename} (Payload size: {len(data.encode('utf-8'))} bytes)")
646
-
647
-
648
  # Create QR code image with custom colors
649
  qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
650
 
651
- # Convert to RGBA for transparency support (optional gradient)
652
  qr_image = qr_image.convert('RGBA')
653
 
654
- # --- Optional: Add subtle gradient overlay ---
655
- # gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
656
- # draw = ImageDraw.Draw(gradient)
657
- # for i in range(qr_image.width):
658
- # alpha = int(255 * (1 - i/qr_image.width) * 0.1) # 10% maximum opacity
659
- # draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
660
- # final_image = Image.alpha_composite(qr_image, gradient)
661
- # --- End Optional Gradient ---
662
 
663
- final_image = qr_image # Use this line if gradient is commented out
 
664
 
665
  # Save the image
666
  output_path = QR_CODES_DIR / filename
667
- # Ensure directory exists just before saving
668
- output_path.parent.mkdir(parents=True, exist_ok=True)
669
-
670
- final_image.save(output_path, quality=95) # PNG quality is lossless, but ok
671
 
672
  return str(output_path)
673
- # Catch specific data overflow error
674
- except qrcode.exceptions.DataOverflowError as doe:
675
- logger.error(f"QR DataOverflowError for {filename}: {doe}. Data length (bytes): {len(data.encode('utf-8'))}. Max capacity likely exceeded for ErrorLevel {error_correction_level}.")
676
- return "" # Return empty string on failure
677
  except Exception as e:
678
- logger.error(f"QR generation error for {filename}: {e}", exc_info=True)
679
  return ""
680
 
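For sizing intuition: a version-40 QR code in binary mode holds on the order of 2953 bytes at error level L, 2331 at M, 1663 at Q, and 1273 at H (standard figures, quoted from memory, so verify before relying on them). Under level H the 1800-byte default chunk from chunk_data cannot fit, which is exactly what the DataOverflowError handler above catches. A hypothetical pre-flight check:

import qrcode

V40_BINARY_CAPACITY = {  # approximate byte capacities, assumed from the QR spec
    qrcode.constants.ERROR_CORRECT_L: 2953,
    qrcode.constants.ERROR_CORRECT_M: 2331,
    qrcode.constants.ERROR_CORRECT_Q: 1663,
    qrcode.constants.ERROR_CORRECT_H: 1273,
}

def payload_fits(payload: str, level=qrcode.constants.ERROR_CORRECT_H) -> bool:
    return len(payload.encode('utf-8')) <= V40_BINARY_CAPACITY[level]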
681
-
682
- def generate_qr_codes(data_to_encode: Union[str, Dict, List], combine_sources: bool = True) -> List[str]:
683
- """Generate QR codes, chunking data appropriately."""
684
  try:
685
- file_processor = EnhancedFileProcessor() # Get chunking method
686
- all_qr_paths = []
687
- qr_fill = "#1a365d" # Deep blue
688
- qr_back = "#ffffff"
689
- # Decide on error correction level - H is default, M or L allow more data
690
- error_level = qrcode.constants.ERROR_CORRECT_H # Max correction, lowest capacity
691
- # error_level = qrcode.constants.ERROR_CORRECT_M # Medium correction, medium capacity
692
- # error_level = qrcode.constants.ERROR_CORRECT_L # Low correction, max capacity
693
-
694
-
695
- if combine_sources:
696
- logger.info("Combining all input sources into a single QR sequence.")
697
- # Combine all data into one large structure (e.g., a list) before chunking
698
- # This assumes `data_to_encode` is already the combined list/dict from process_inputs
699
- if not data_to_encode:
700
- logger.warning("No data provided to generate combined QR codes.")
701
- return []
702
-
703
- # Chunk the combined data structure
704
- chunks = file_processor.chunk_data(data_to_encode) # Chunker expects dict/list/str
705
- if not chunks:
706
- logger.error("Chunking the combined data failed.")
707
- return []
708
-
709
- num_chunks = len(chunks)
710
- logger.info(f"Generating {num_chunks} QR codes for combined data.")
711
- for i, chunk_info in enumerate(chunks):
712
- # chunk_info contains {'chunk_index', 'total_chunks', 'data', etc.}
713
- filename = f'combined_qr_{int(time.time())}_{i+1}_of_{num_chunks}.png'
714
- # Pass the actual payload string to the generator
715
- qr_payload = chunk_info['data']
716
  qr_path = generate_stylish_qr(
717
- data=qr_payload,
718
  filename=filename,
719
- fill_color=qr_fill,
720
- back_color=qr_back,
721
- error_correction_level=error_level # Pass level
722
  )
723
  if qr_path:
724
- all_qr_paths.append(qr_path)
725
- else:
726
- logger.error(f"Failed to generate QR code for combined chunk {i+1}")
727
- # Optionally stop or continue?
728
-
729
  else:
730
- # Process each item in the input list individually
731
- logger.info("Generating separate QR code sequences for each input source.")
732
- if not isinstance(data_to_encode, list):
733
- logger.error("Input data must be a list when combine_sources is False.")
734
- # Maybe wrap it?
735
- if data_to_encode:
736
- data_to_encode = [data_to_encode]
737
- else:
738
- return []
739
-
740
-
741
- total_items = len(data_to_encode)
742
- for item_idx, item in enumerate(data_to_encode):
743
- item_source_info = f"item {item_idx+1}/{total_items}"
744
- # Try to get a better name (e.g., from filename if available)
745
- if isinstance(item, dict) and 'filename' in item:
746
- item_source_info = item['filename']
747
- elif isinstance(item, dict) and 'url' in item:
748
- item_source_info = Path(urlparse(item['url']).path).name or f"url_item_{item_idx+1}"
749
-
750
- logger.info(f"Processing source: {item_source_info}")
751
-
752
- # Chunk the individual item
753
- chunks = file_processor.chunk_data(item)
754
- if not chunks:
755
- logger.error(f"Chunking failed for item {item_idx+1} ({item_source_info})")
756
- continue # Skip to next item
757
-
758
- num_chunks = len(chunks)
759
- logger.info(f"Generating {num_chunks} QR codes for {item_source_info}.")
760
- for chunk_idx, chunk_info in enumerate(chunks):
761
- # Sanitize source info for filename
762
- safe_source_name = re.sub(r'[^\w\-]+', '_', item_source_info)
763
- filename = f'{safe_source_name}_chunk_{chunk_idx+1}_of_{num_chunks}_{int(time.time())}.png'
764
- qr_payload = chunk_info['data']
765
  qr_path = generate_stylish_qr(
766
- data=qr_payload,
767
  filename=filename,
768
- fill_color=qr_fill,
769
- back_color=qr_back,
770
- error_correction_level=error_level # Pass level
771
  )
772
  if qr_path:
773
- all_qr_paths.append(qr_path)
774
- else:
775
- logger.error(f"Failed to generate QR code for {item_source_info} chunk {chunk_idx+1}")
776
-
777
-
778
- logger.info(f"Generated a total of {len(all_qr_paths)} QR codes.")
779
- return all_qr_paths
780
  except Exception as e:
781
- logger.error(f"General QR code generation process error: {e}", exc_info=True)
782
  return []
783
 
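The read-back side is the mirror image of the generator above: decode each image, sort by chunk_index, and concatenate the payloads. A sketch (reassemble is hypothetical), assuming each QR carried a full chunk dict as JSON, which is the format the visualizer below expects:

import json

def reassemble(chunks: list[dict]):
    ordered = sorted(chunks, key=lambda c: c['chunk_index'])
    payload = ''.join(c['data'] for c in ordered)
    try:
        return json.loads(payload)   # combined dict/list inputs round-trip
    except json.JSONDecodeError:
        return payload               # plain-text payloads come back verbatim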
784
- def _generate_sequence_visualization_image(qr_paths: List[str], qr_data: List[Dict], title: str = "QR Code Sequence") -> Optional[io.BytesIO]:
785
- """
786
- Generates a visual representation of the QR code sequence using NetworkX and Matplotlib.
787
-
788
- Args:
789
- qr_paths: List of file paths to the QR code images.
790
- qr_data: List of decoded data dictionaries, hopefully containing 'chunk_index'.
791
- title: The title for the visualization plot.
792
-
793
- Returns:
794
- A BytesIO buffer containing the PNG image of the visualization, or None if error.
795
- """
796
- if not qr_paths or not qr_data or len(qr_paths) != len(qr_data):
797
- logger.warning("Mismatch or empty data for visualization.")
798
- return None
799
-
800
- logger.info(f"Generating visualization for {len(qr_paths)} QR codes.")
801
- try:
802
- G = nx.DiGraph()
803
- node_labels = {}
804
- node_colors = []
805
- node_sizes = []
806
-
807
- # Assume data is pre-sorted by chunk_index during loading
808
- num_nodes = len(qr_paths)
809
- total_chunks_from_meta = qr_data[0].get('total_chunks', num_nodes) if qr_data else num_nodes
810
-
811
- for i in range(num_nodes):
812
- node_id = i
813
- # Use chunk_index from metadata if possible, otherwise use list index
814
- chunk_idx = qr_data[i].get('chunk_index', i)
815
- label = f"{chunk_idx + 1}/{total_chunks_from_meta}"
816
- node_labels[node_id] = label
817
- G.add_node(node_id, path=qr_paths[i], data=qr_data[i])
818
-
819
- # Add edges between consecutive nodes
820
- if i > 0:
821
- G.add_edge(i - 1, i)
822
-
823
- # Simple coloring/sizing (can be customized further)
824
- node_colors.append('#4299e1') # Default blue color
825
- node_sizes.append(1500)
826
-
827
- if not G.nodes:
828
- logger.warning("No nodes to visualize.")
829
- return None
830
-
831
- # --- Layout and Drawing ---
832
- plt.figure(figsize=(max(10, num_nodes * 1.5), 5)) # Adjust figure size based on number of nodes
833
-
834
- # Simple linear layout for sequences is often clearest
835
- pos = {i: (i * 2, 0) for i in range(num_nodes)} # Horizontal layout
836
-
837
- # For more complex graphs, consider other layouts:
838
- # pos = nx.spring_layout(G, k=0.5, iterations=50)
839
- # pos = nx.kamada_kawai_layout(G)
840
-
841
- nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors, alpha=0.9)
842
- nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=20, edge_color='gray', alpha=0.6)
843
- nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=10, font_color='white')
844
-
845
- plt.title(title, fontsize=16)
846
- plt.xlabel("Sequence Index", fontsize=12)
847
- plt.yticks([]) # Hide Y-axis ticks for linear layout
848
- plt.xticks(range(0, num_nodes * 2, 2), [f"{i+1}" for i in range(num_nodes)]) # Label X-axis ticks
849
- plt.box(False) # Remove frame box
850
- plt.tight_layout()
851
-
852
- # Save plot to a BytesIO buffer
853
- buf = io.BytesIO()
854
- plt.savefig(buf, format='png', bbox_inches='tight', dpi=100)
855
- plt.close() # Close the plot figure to free memory
856
- buf.seek(0)
857
- logger.info("Successfully generated visualization image buffer.")
858
- return buf
859
-
860
- except Exception as e:
861
- logger.error(f"Error generating visualization image: {e}", exc_info=True)
862
- plt.close() # Ensure plot is closed even on error
863
  return None
864
-
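Hypothetical usage of the helper above, saving the rendered sequence map to disk:

buf = _generate_sequence_visualization_image(qr_paths, qr_data, title="QR Sequence")
if buf is not None:
    with open("sequence_map.png", "wb") as f:
        f.write(buf.getvalue())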
865
- # --- Gradio Interface Section ---
866
-
867
- def create_qr_sequence_visualizer(output_gallery_ref): # Pass a reference if needed later
868
- """Add QR sequence visualization capabilities to the application"""
869
- with gr.Tab("🔄 QR Sequence Visualizer"):
870
- gr.Markdown("""
871
- ## QR Code Sequence Visualizer
872
- Upload a sequence of QR codes (e.g., those generated by this app) to decode them and visualize their order.
873
- """)
874
-
875
- # Store data globally within this tab's scope (alternative to Gradio State)
876
- # This is simpler but not ideal for complex state management
877
- shared_data = {'qr_paths': [], 'qr_data': []}
878
-
879
- with gr.Row():
880
- with gr.Column(scale=1):
881
- qr_input = gr.File(
882
- label="Upload QR Code Images",
883
- file_types=["image/png", "image/jpeg", ".png", ".jpg", ".jpeg"], # Be explicit
884
- file_count="multiple"
885
- )
886
- visualize_btn = gr.Button("👁️ Decode & Visualize Sequence", variant="primary")
887
- reset_btn = gr.Button("🗑️ Reset Visualizer", variant="secondary")
888
- visualization_status = gr.Textbox(label="Status", interactive=False, lines=3)
889
- # Placeholder for interactive elements (future improvement)
890
- # qr_toggles_container = gr.HTML(label="QR Code Controls (Future)")
891
-
892
- with gr.Column(scale=2):
893
- qr_visualization = gr.Image(label="QR Code Sequence Map", type="pil", height=400) # Use PIL type
894
- qr_preview = gr.Gallery(label="Uploaded QR Codes (Sorted)", columns=4, height=400, object_fit="contain", preview=True)
895
-
896
-
897
- def process_qr_codes_and_visualize(files):
898
- """Decodes QR files, sorts them, updates gallery, and generates visualization."""
899
- if not files:
900
- shared_data['qr_paths'] = []
901
- shared_data['qr_data'] = []
902
- return "Please upload QR code images.", None, None, "⚠️ No QR codes uploaded."
903
-
904
- logger.info(f"Processing {len(files)} uploaded QR files for visualization.")
905
- qr_data_list = []
906
- qr_path_list = []
907
- decode_errors = 0
908
-
909
- # Use OpenCV detector via qrcode library
910
  try:
911
- detector = qrcode.QRCodeDetector()
912
- except AttributeError:
913
- logger.error("qrcode.QRCodeDetector not found. Ensure correct library version or dependencies.")
914
- return "Error initializing QR detector.", None, None, "❌ Library Error"
915
- except Exception as init_e:
916
- logger.error(f"Error initializing QR detector: {init_e}")
917
- return f"Error initializing QR detector: {init_e}", None, None, "❌ Detector Init Error"
918
-
919
-
920
- for file in files:
921
  try:
922
- img_path = file.name # Gradio File object path
923
- img = Image.open(img_path)
924
- img_np = np.array(img.convert('RGB')) # Detector often prefers RGB
925
 
926
- # Try to decode QR code
927
- data, bbox, straight_qrcode = detector.detectAndDecode(img_np)
928
 
929
- if data:
930
- logger.debug(f"Decoded data from {os.path.basename(img_path)}: {data[:50]}...")
931
- # Try parsing the decoded data as JSON (expected format from generator)
932
- try:
933
- qr_metadata = json.loads(data)
934
- # Check if it looks like our chunk format
935
- if isinstance(qr_metadata, dict) and 'chunk_index' in qr_metadata and 'total_chunks' in qr_metadata:
936
- qr_data_list.append(qr_metadata)
937
- qr_path_list.append(img_path)
938
- else:
939
- # Valid JSON, but not the expected chunk structure
940
- logger.warning(f"Decoded valid JSON, but not expected format from {os.path.basename(img_path)}")
941
- qr_data_list.append({"data": qr_metadata, "chunk_index": -1}) # Assign default index
942
- qr_path_list.append(img_path)
943
-
944
- except json.JSONDecodeError:
945
- # Data decoded, but not JSON - store raw data
946
- logger.warning(f"Could not decode JSON from QR data in {os.path.basename(img_path)}. Storing raw.")
947
- qr_data_list.append({"data": data, "chunk_index": -1}) # Assign default index
948
- qr_path_list.append(img_path)
949
- except Exception as json_e:
950
- logger.error(f"Error processing decoded JSON from {os.path.basename(img_path)}: {json_e}")
951
- qr_data_list.append({"data": f"Error: {json_e}", "chunk_index": -1})
952
- qr_path_list.append(img_path)
953
- decode_errors += 1
954
- else:
955
- # QR code detected, but no data decoded (or detection failed)
956
- logger.warning(f"Could not decode data from QR image: {os.path.basename(img_path)}")
957
- qr_data_list.append({"data": "[DECODE FAILED]", "chunk_index": -1})
958
- qr_path_list.append(img_path)
959
- decode_errors += 1
960
-
961
- except Exception as e:
962
- logger.error(f"Error processing QR image file {os.path.basename(getattr(file, 'name', 'N/A'))}: {e}", exc_info=True)
963
- # Optionally add placeholder for failed file?
964
- decode_errors += 1
965
-
966
- if not qr_path_list:
967
- shared_data['qr_paths'] = []
968
- shared_data['qr_data'] = []
969
- return "No valid QR codes could be processed or decoded.", None, None, "❌ Failed to process/decode QR codes"
970
-
971
- # Attempt to sort by chunk_index (handle missing index gracefully)
972
- try:
973
- # Create tuples (index, data, path) for sorting
974
- indexed_items = []
975
- for i, (data, path) in enumerate(zip(qr_data_list, qr_path_list)):
976
- # Use provided chunk_index, fallback to list index if missing or invalid (-1)
977
- sort_key = data.get('chunk_index', i)
978
- if not isinstance(sort_key, int) or sort_key < 0:
979
- sort_key = i # Fallback to original order for this item
980
- indexed_items.append((sort_key, data, path))
981
-
982
- # Sort based on the index key
983
- indexed_items.sort(key=lambda x: x[0])
984
-
985
- # Unpack sorted lists
986
- sorted_qr_data = [item[1] for item in indexed_items]
987
- sorted_qr_paths = [item[2] for item in indexed_items]
988
-
989
- # Update shared data
990
- shared_data['qr_paths'] = sorted_qr_paths
991
- shared_data['qr_data'] = sorted_qr_data
992
- logger.info("Successfully sorted QR data based on chunk_index.")
993
 
994
- except Exception as e:
995
- logger.error(f"Error sorting QR data: {e}. Using original order.")
996
- # Use original order if sorting fails
997
- shared_data['qr_paths'] = qr_path_list
998
- shared_data['qr_data'] = qr_data_list
999
-
1000
- # Generate the visualization image using the helper function
1001
- # Use the sorted data stored in shared_data
1002
- visualization_image_buffer = _generate_sequence_visualization_image(
1003
- shared_data['qr_paths'],
1004
- shared_data['qr_data'],
1005
- title=f"Visualized Sequence ({len(shared_data['qr_paths'])} Codes)"
1006
- )
1007
-
1008
- # Convert buffer to PIL Image for Gradio output if necessary
1009
- vis_image_pil = None
1010
- if visualization_image_buffer:
1011
- try:
1012
- vis_image_pil = Image.open(visualization_image_buffer)
1013
- except Exception as img_e:
1014
- logger.error(f"Failed to load visualization buffer into PIL Image: {img_e}")
1015
-
1016
-
1017
- status_message = f"Processed {len(shared_data['qr_paths'])} QR codes."
1018
- if decode_errors > 0:
1019
- status_message += f" ({decode_errors} decode errors)"
1020
- status_message += "\nSequence visualized." if vis_image_pil else "\nVisualization generation failed."
1021
- final_status = "✅ Done" if vis_image_pil else "⚠️ Errors Occurred"
1022
-
1023
-
1024
- # Update outputs: Gallery with sorted paths, Image with visualization, Status text
1025
- # The gallery expects a list of image paths or PIL images
1026
- gallery_output = shared_data['qr_paths']
1027
-
1028
- return gallery_output, vis_image_pil, status_message, final_status
1029
-
1030
-
1031
- def reset_visualizer_state():
1032
- shared_data['qr_paths'] = []
1033
- shared_data['qr_data'] = []
1034
- logger.info("Resetting QR visualizer state.")
1035
- return None, None, None, "⚪ Visualizer Reset. Upload new QR codes."
1036
-
1037
- # Event handlers
1038
- visualize_btn.click(
1039
- process_qr_codes_and_visualize,
1040
- inputs=[qr_input],
1041
- outputs=[qr_preview, qr_visualization, visualization_status, visualization_status] # Update gallery, image, and status twice? Let's map correctly.
1042
- # Correct mapping:
1043
- # outputs=[qr_preview (Gallery), qr_visualization (Image), visualization_status (Textbox), visualization_status (Textbox again - maybe just need 3 outputs?)]
1044
- # Let's try mapping to the 4 defined outputs:
1045
- # outputs=[qr_preview, qr_visualization, visualization_status, visualization_status] # Seems redundant, but matches function signature needs. Let's adjust function signature later if needed.
1046
- ).then(
1047
- lambda: logger.info("Visualization process complete."), inputs=None, outputs=None
1048
- )
1049
 
1050
 
1051
- reset_btn.click(
1052
- reset_visualizer_state,
1053
- inputs=[],
1054
- outputs=[qr_preview, qr_visualization, qr_input, visualization_status] # Clear gallery, image, file input, status
1055
- )
1056
 
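One caveat in the decoding path above: qrcode.QRCodeDetector is not part of the qrcode package's documented API (the code defensively catches AttributeError for exactly this case); a detector of that name lives in OpenCV. A sketch using cv2 directly (assumes opencv-python is installed; decode_qr_image is hypothetical):

import cv2
import numpy as np
from PIL import Image

def decode_qr_image(path: str) -> str:
    img = np.array(Image.open(path).convert('RGB'))
    data, points, _ = cv2.QRCodeDetector().detectAndDecode(img)
    return data  # empty string when nothing could be decoded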
1057
  def create_modern_interface():
1058
  """Create a modern and visually appealing Gradio interface"""
1059
 
1060
- # Modern CSS styling (Seems intact)
1061
  css = """
1062
  /* Modern color scheme */
1063
  :root {
@@ -1112,25 +667,21 @@ def create_modern_interface():
1112
  /* Gallery styling */
1113
  .gallery {
1114
  display: grid;
1115
- grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); /* Adjust minmax */
1116
  gap: 1rem;
1117
  padding: 1rem;
1118
  background-color: white;
1119
  border-radius: 0.5rem;
1120
  border: 1px solid #e2e8f0;
1121
- min-height: 150px; /* Ensure gallery has some height */
1122
  }
1123
  .gallery img {
1124
  width: 100%;
1125
  height: auto;
1126
- object-fit: contain; /* Use contain to avoid stretching */
1127
  border-radius: 0.375rem;
1128
  transition: transform 0.2s;
1129
- border: 1px solid #eee; /* Add subtle border */
1130
  }
1131
  .gallery img:hover {
1132
  transform: scale(1.05);
1133
- box-shadow: 0 2px 4px rgba(0,0,0,0.1); /* Add hover shadow */
1134
  }
1135
  """
1136
  # Create interface with modern design
@@ -1139,305 +690,192 @@ def create_modern_interface():
1139
  # 🌐 Advanced Data Processing & QR Code Generator
1140
  Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
1141
  """)
1142
-
1143
  with gr.Row():
1144
- with gr.Column(scale=2):
1145
- # Input Tabs
1146
- with gr.Tabs():
1147
- with gr.TabItem("📝 URL Input"):
1148
- url_input = gr.Textbox(
1149
- label="Enter URLs (one per line or comma-separated)",
1150
- lines=5,
1151
- placeholder="https://example1.com\nhttps://example2.com",
1152
- elem_id="url-input"
1153
- )
1154
- with gr.TabItem("📁 File Input"):
1155
- file_input = gr.File(
1156
- label="Upload Files (Text, JSON, Archives: zip, tar, gz, bz2)",
1157
- file_count="multiple",
1158
- # Removed file_types="*" to rely on backend logic, or specify supported ones:
1159
- # file_types=[".txt", ".json", ".csv", ".md", ".xml", ".html", ".zip", ".tar", ".gz", ".bz2"]
1160
- elem_id="file-input"
1161
- )
1162
- with gr.TabItem("📋 Direct Input / JSON"):
1163
- text_input = gr.TextArea(
1164
- label="Direct Text/JSON Input",
1165
- lines=10,
1166
- placeholder="Paste your text or JSON data here...",
1167
- elem_id="text-input"
1168
- )
1169
- with gr.Row():
1170
- example_btn = gr.Button("📝 Load JSON Example")
1171
- clear_btn = gr.Button("🗑️ Clear Input")
1172
-
1173
- # Processing Options & Button
1174
- with gr.Row():
1175
- combine_data = gr.Checkbox(
1176
- label="Combine all inputs into one sequence",
1177
- value=True, # Default to combined
1178
- info="If unchecked, each URL/File/Input generates its own QR sequence."
1179
- )
1180
- process_btn = gr.Button(
1181
- "πŸ”„ Process & Generate QR Codes",
1182
- variant="primary",
1183
- elem_id="process-button"
1184
- )
1185
-
1186
- # Status Output
1187
- output_text = gr.Textbox(
1188
- label="Processing Status",
1189
- interactive=False,
1190
- lines=2,
1191
- elem_id="status-output"
1192
- )
1193
-
1194
-
1195
- with gr.Column(scale=3):
1196
- # Output Area
1197
- gr.Markdown("### Results")
1198
- with gr.Tabs():
1199
- with gr.TabItem("🖼️ QR Codes"):
1200
- output_gallery = gr.Gallery(
1201
- label="Generated QR Codes",
1202
- columns=4, # Adjust columns as needed
1203
- height=500, # Adjust height
1204
- object_fit="contain",
1205
- preview=True, # Enable preview click
1206
- elem_id="qr-gallery"
1207
- )
1208
- with gr.TabItem("📄 Processed Data (JSON)"):
1209
- output_json = gr.JSON(
1210
- label="Processed Data Structure",
1211
- elem_id="json-output"
1212
- )
1213
 
1214
  # Load example data
1215
  def load_example():
1216
  example = {
1217
- "project": "Data Transfer Example",
1218
- "version": 1.1,
1219
  "items": [
1220
- {"id": "A001", "name": "Item One", "value": 123.45, "tags": ["tag1", "tag2"]},
1221
- {"id": "B002", "name": "Item Two", "value": 67.89, "enabled": True}
1222
  ],
1223
- "timestamp": datetime.now().isoformat()
1224
  }
1225
  return json.dumps(example, indent=2)
1226
 
1227
- def clear_input_area():
1228
- # Clear only the direct text input area
1229
  return ""
1230
 
1231
- # --- Main Processing Function ---
1232
- def process_inputs_and_generate_qrs(urls, files, text, combine):
1233
- """Process all inputs, combine if requested, and generate QR codes."""
1234
- start_time = time.time()
1235
- logger.info("Starting data processing...")
1236
- status_updates = []
1237
- all_processed_data = [] # List to hold results from all sources
1238
-
1239
- url_processor = EnhancedURLProcessor()
1240
- file_processor = EnhancedFileProcessor()
1241
-
1242
- # 1. Process URLs
1243
- if urls and urls.strip():
1244
- url_list = re.split(r'[,\n]+', urls) # Split by comma or newline, handle multiple newlines
1245
- url_list = [u.strip() for u in url_list if u.strip()] # Clean up
1246
- status_updates.append(f"Processing {len(url_list)} URLs...")
1247
- logger.info(f"Processing URLs: {url_list}")
1248
- for i, url in enumerate(url_list):
1249
- logger.info(f"Processing URL {i+1}/{len(url_list)}: {url}")
1250
- # Basic validation before fetching
1251
- if not validators.url(url):
1252
- logger.warning(f"Skipping invalid URL format: {url}")
1253
- status_updates.append(f"⚠️ Skipped invalid URL: {url[:50]}...")
1254
- all_processed_data.append({'error': 'Invalid URL format', 'url': url})
1255
- continue
1256
-
1257
- content_data = url_processor.fetch_content(url)
1258
- if content_data and 'content' in content_data:
1259
- logger.info(f"Successfully fetched content from {url} ({len(content_data.get('raw_content',''))} bytes)")
1260
- # Structure the result similarly to file processing output
1261
- processed_url_data = {
1262
- 'source': 'url',
1263
- 'url': url,
1264
- 'content': content_data['content'], # Processed text content
1265
- 'raw_content': content_data['raw_content'], # Raw response body
1266
- 'metadata': content_data['metadata'], # Headers, status, etc.
1267
- 'timestamp': datetime.now().isoformat()
1268
- }
1269
- all_processed_data.append(processed_url_data)
1270
- status_updates.append(f"✓ Fetched: {url[:60]}...")
1271
- else:
1272
- logger.error(f"Failed to fetch content from URL: {url}")
1273
- status_updates.append(f"❌ Failed fetch: {url[:60]}...")
1274
- all_processed_data.append({'error': 'Failed to fetch content', 'url': url})
1275
-
1276
- # 2. Process Files
1277
- if files:
1278
- status_updates.append(f"Processing {len(files)} uploaded files...")
1279
- logger.info(f"Processing {len(files)} files.")
1280
- for i, file_obj in enumerate(files):
1281
- logger.info(f"Processing file {i+1}/{len(files)}: {getattr(file_obj, 'name', 'N/A')}")
1282
- try:
1283
- # Pass the Gradio file object directly to process_file
1284
- file_results = file_processor.process_file(file_obj)
1285
- if file_results:
1286
- all_processed_data.extend(file_results)
1287
- # Get filename safely from results (might be multiple from archive)
1288
- processed_filenames = [res.get('filename', 'N/A') for res in file_results]
1289
- status_updates.append(f"✓ Processed file(s): {', '.join(processed_filenames)}")
1290
- logger.info(f"Successfully processed file(s): {', '.join(processed_filenames)}")
1291
- else:
1292
- status_updates.append(f"⚠️ No data extracted from file: {getattr(file_obj, 'name', 'N/A')}")
1293
- logger.warning(f"No data extracted from file: {getattr(file_obj, 'name', 'N/A')}")
1294
- # Add placeholder error if desired
1295
- # all_processed_data.append({'error': 'No data extracted', 'filename': getattr(file_obj, 'name', 'N/A')})
1296
-
1297
- except Exception as file_proc_err:
1298
- file_name = getattr(file_obj, 'name', 'N/A')
1299
- logger.error(f"Error processing file {file_name}: {file_proc_err}", exc_info=True)
1300
- status_updates.append(f"❌ Error processing file: {file_name}")
1301
- all_processed_data.append({'error': f'File processing error: {file_proc_err}', 'filename': file_name})
1302
-
1303
-
1304
- # 3. Process Direct Text/JSON Input
1305
- if text and text.strip():
1306
- status_updates.append("Processing direct input...")
1307
- logger.info("Processing direct text/JSON input.")
1308
- # Attempt to parse as JSON first
1309
- try:
1310
- json_data = json.loads(text)
1311
- logger.info("Direct input parsed as JSON.")
1312
- processed_text_data = {
1313
- 'source': 'direct_json',
1314
- 'content': json_data, # Parsed JSON object/list
1315
- 'raw_content': text, # Original string
1316
- 'timestamp': datetime.now().isoformat()
1317
- }
1318
- all_processed_data.append(processed_text_data)
1319
- status_updates.append("✓ Processed direct input as JSON.")
1320
- except json.JSONDecodeError:
1321
- # If not JSON, treat as plain text
1322
- logger.info("Direct input treated as plain text.")
1323
- processed_text_data = {
1324
- 'source': 'direct_text',
1325
- 'content': text, # Store as plain text
1326
- 'timestamp': datetime.now().isoformat()
1327
- }
1328
- all_processed_data.append(processed_text_data)
1329
- status_updates.append("✓ Processed direct input as Text.")
1330
- except Exception as direct_input_err:
1331
- logger.error(f"Error processing direct input: {direct_input_err}", exc_info=True)
1332
- status_updates.append(f"❌ Error processing direct input.")
1333
- all_processed_data.append({'error': f'Direct input error: {direct_input_err}', 'source': 'direct_input'})
1334
-
1335
-
1336
- # 4. Check if any data was processed
1337
- if not all_processed_data:
1338
- logger.warning("No valid data sources found or processed.")
1339
- status_updates.append("⚠️ No data to process. Please provide input.")
1340
- final_status = "\n".join(status_updates)
1341
- return None, [], final_status # Return empty results
1342
-
1343
- logger.info(f"Total processed data items: {len(all_processed_data)}")
1344
- status_updates.append(f"Data processed ({len(all_processed_data)} items). Generating QR codes...")
1345
-
1346
- # 5. Generate QR Codes
1347
- qr_paths = []
1348
  try:
1349
- # Pass the list of processed data items
1350
- qr_paths = generate_qr_codes(all_processed_data, combine)
1351
- if qr_paths:
1352
- status_updates.append(f"✓ Generated {len(qr_paths)} QR codes.")
1353
- logger.info(f"Successfully generated {len(qr_paths)} QR codes.")
1354
- else:
1355
- status_updates.append("❌ QR code generation failed or produced no codes.")
1356
- logger.error("QR code generation returned no paths.")
1357
- # Keep processed data, but gallery will be empty
1358
-
1359
- except Exception as qr_gen_err:
1360
- logger.error(f"Error during QR code generation step: {qr_gen_err}", exc_info=True)
1361
- status_updates.append(f"❌ Error generating QR codes: {qr_gen_err}")
1362
- # Keep processed data, gallery will be empty
1363
-
1364
-
1365
- # 6. Finalize and Return
1366
- end_time = time.time()
1367
- processing_time = end_time - start_time
1368
- status_updates.append(f"Total processing time: {processing_time:.2f} seconds.")
1369
- final_status = "\n".join(status_updates)
1370
-
1371
- # Return processed data (for JSON view), QR paths (for Gallery), and status string
1372
- # Ensure qr_paths is a list of strings
1373
- qr_paths_str = [str(p) for p in qr_paths] if qr_paths else []
1374
-
1375
- # Return data for JSON output, gallery paths, and status text
1376
- return all_processed_data, qr_paths_str, final_status
1377
-
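The URL handling at the top of process_inputs_and_generate_qrs reduces to split, strip, and validate. As a standalone sketch (parse_urls is hypothetical; validators is the package already used above):

import re
import validators

def parse_urls(raw: str) -> list[str]:
    candidates = [u.strip() for u in re.split(r'[,\n]+', raw or '') if u.strip()]
    return [u for u in candidates if validators.url(u)]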
1378
 
1379
- # --- Event Handlers ---
1380
  example_btn.click(load_example, outputs=[text_input])
1381
- clear_btn.click(clear_input_area, outputs=[text_input])
1382
-
1383
  process_btn.click(
1384
- process_inputs_and_generate_qrs,
1385
  inputs=[url_input, file_input, text_input, combine_data],
1386
- outputs=[output_json, output_gallery, output_text] # Match function return order
1387
  )
1388
 
1389
- # Add helpful documentation (Seems intact)
1390
  gr.Markdown("""
1391
  ### πŸš€ Features
1392
- - **Complete URL Scraping**: Extracts text content from web pages.
1393
- - **Advanced File Processing**: Handles text, JSON, and archives (.zip, .tar.*, .gz, .bz2). Attempts intelligent JSON detection.
1394
- - **Direct Input**: Paste text or JSON directly.
1395
- - **Sequential QR Codes**: Chunks large data and embeds sequencing info. Option to combine inputs.
1396
- - **Modern Design**: Clean, responsive interface.
1397
- ### πŸ’‘ Tips
1398
- 1. **Inputs**: Use any combination of URL, File, or Direct Input tabs.
1399
- 2. **Combine**: Check 'Combine all inputs' to create one QR sequence from all sources. Uncheck to get separate QR sequences for each source.
1400
- 3. **Files**: Upload text-based files, JSON, or supported archives. Content from archives is extracted and processed.
1401
- 4. **JSON**: Use the example button or upload a `.json` file. The app also tries to parse `.txt` or other files as JSON if they contain valid JSON structure.
1402
- 5. **Status**: Monitor the Processing Status box for feedback.
1403
- ### 🎨 Output
1404
- - Generated QR codes appear in the 'QR Codes' tab and are saved in the `output/qr_codes` directory.
1405
- - The structured data processed from all inputs is shown in the 'Processed Data (JSON)' tab.
1406
- - Hover over or click QR codes in the gallery for a larger preview.
1407
- """)
1408
- return interface
1409
 
1410
  def main():
1411
  """Initialize and launch the application"""
1412
  try:
1413
- # Configure system settings if needed
1414
- mimetypes.init() # Ensure mime types are loaded
1415
 
1416
- logger.info("Starting Gradio application...")
1417
  # Create and launch interface
1418
  interface = create_modern_interface()
1419
 
1420
- # Add the QR sequence visualizer tab (if function is defined and needed)
1421
- # with interface:
1422
- # create_qr_sequence_visualizer(None) # Pass relevant components if needed
1423
-
1424
  # Launch with configuration
1425
  interface.launch(
1426
- share=False, # Set to True for public link (use with caution)
1427
- debug=False, # Set to True for more verbose Gradio errors
1428
- show_error=True, # Show Python errors in browser console
1429
- # server_name="0.0.0.0", # Bind to all interfaces if needed for Docker/network access
1430
- # server_port=7860, # Specify port if needed
1431
- show_api=False # Disable default Gradio API endpoint unless needed
1432
  )
1433
- logger.info("Gradio application stopped.")
1434
  except Exception as e:
1435
- logger.error(f"Application startup or runtime error: {e}", exc_info=True)
1436
  raise
1437
 
1438
  if __name__ == "__main__":
1439
- # Ensure output directories exist before starting
1440
- OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
1441
- QR_CODES_DIR.mkdir(parents=True, exist_ok=True)
1442
- TEMP_DIR.mkdir(parents=True, exist_ok=True)
1443
- main()
 
 import numpy as np
 import tarfile
 import gzip

 # Setup enhanced logging with more detailed formatting
 logging.basicConfig(

 class EnhancedURLProcessor:
     """Advanced URL processing with complete content extraction"""
+
     def __init__(self):
         self.session = requests.Session()
         self.timeout = 15  # Extended timeout for larger content

         # Enhanced headers for better site compatibility
         self.session.headers.update({
+            'User-Agent': self.user_agent.random,
             'Accept': '*/*',  # Accept all content types
             'Accept-Language': 'en-US,en;q=0.9',
             'Accept-Encoding': 'gzip, deflate, br',

             'Sec-Fetch-Dest': 'document',
             'Sec-Fetch-Mode': 'navigate',
             'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-User': '?1',
             'DNT': '1'
         })

             if not all([parsed.scheme, parsed.netloc]):
                 return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
             # Try HEAD request first to check accessibility
             try:
                 head_response = self.session.head(url, timeout=5)
                 head_response.raise_for_status()
             except requests.exceptions.RequestException:
                 # If HEAD fails, try GET as some servers don't support HEAD
                 response = self.session.get(url, timeout=self.timeout)
                 response.raise_for_status()
+                head_response = response  # Reuse the GET response so the header lookups below never hit an unbound name

             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
+                'details': {
+                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
+                    'server': head_response.headers.get('Server', 'unknown'),
+                    'size': head_response.headers.get('Content-Length', 'unknown')
+                }
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
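+    # Shape of the dict returned on success (values vary per site):
+    #   {'is_valid': True, 'message': 'URL is valid and accessible',
+    #    'details': {'content_type': 'text/html', 'server': 'nginx', 'size': '1024'}}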
 
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")

             # Update User-Agent randomly for each request
+            self.session.headers.update({'User-Agent': self.user_agent.random})

             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()

                 encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
             else:
                 encoding = response.encoding
             # Decode content with fallback
             try:
                 raw_content = response.content.decode(encoding, errors='replace')
+            except (UnicodeDecodeError, LookupError):
                 raw_content = response.content.decode('utf-8', errors='replace')

             # Extract metadata

             if 'text/html' in content_type:
                 processed_content = self._process_html_content(raw_content, url)
             else:
+                processed_content = raw_content
             return {
                 'content': processed_content,
+                'raw_content': raw_content,
                 'metadata': metadata
             }
         except requests.exceptions.RequestException as e:
 
                 for attr in ['href', 'src']:
                     if tag.get(attr):
                         try:
+                            tag[attr] = urljoin(base_url, tag[attr])
+                        except Exception:
+                            pass
+            # Extract all text content
+            text_parts = []
+            for element in soup.stripped_strings:
+                text_parts.append(str(element))
+            return '\n'.join(text_parts)
         except Exception as e:
             logger.error(f"HTML processing error: {e}")
             return content

 class EnhancedFileProcessor:
     """Advanced file processing with complete content extraction"""
+
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
         self.max_file_size = max_file_size
         self.supported_extensions = {
+            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
+            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
+            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
+            # The binary formats below are accepted but currently decoded as raw
+            # text; clean extraction would need dedicated parsers.
+            '.pdf', '.doc', '.docx', '.rtf', '.odt'
         }

     def process_file(self, file) -> List[Dict]:
         """Process uploaded file with enhanced error handling and complete extraction"""
+        if not file:
+            return []

         dataset = []
         try:
+            file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []

+            with tempfile.TemporaryDirectory() as temp_dir:
                 temp_dir_path = Path(temp_dir)

+                # Handle different archive types
+                if self._is_archive(file.name):
+                    dataset.extend(self._process_archive(file.name, temp_dir_path))
+                elif Path(file.name).suffix.lower() in self.supported_extensions:
+                    dataset.extend(self._process_single_file(file))
                 else:
+                    logger.warning(f"Unsupported file type: {file.name}")

         except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
         return dataset

     def _is_archive(self, filepath: str) -> bool:
+        """Check if file is an archive"""
+        return any(filepath.lower().endswith(ext) for ext in [
+            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
+        ])

+    def _process_single_file(self, file) -> List[Dict]:
         """Process a single file with enhanced character extraction and JSON handling"""
         try:
+            file_stat = os.stat(file.name)
             file_size = file_stat.st_size

             # Initialize content storage
+            content_parts = []

+            # Process file in chunks for large files
             chunk_size = 10 * 1024 * 1024  # 10MB chunks
+            with open(file.name, 'rb') as f:
+                while True:
+                    chunk = f.read(chunk_size)
+                    if not chunk:
+                        break
+
+                    # Detect encoding for each chunk (a multi-byte character split
+                    # across a chunk boundary is replaced rather than recombined)
+                    encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
+                    try:
+                        decoded_chunk = chunk.decode(encoding, errors='replace')
+                        content_parts.append(decoded_chunk)
+                    except (UnicodeDecodeError, LookupError):
+                        decoded_chunk = chunk.decode('utf-8', errors='replace')
+                        content_parts.append(decoded_chunk)
+
+            # Combine all chunks
+            complete_content = ''.join(content_parts)
+
+            # Check if the content is valid JSON regardless of file extension
+            try:
+                if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
+                    # It's a JSON file by type or extension
                     json_data = json.loads(complete_content)
+                    return [{
+                        'source': 'json_file',
+                        'filename': os.path.basename(file.name),
+                        'file_size': file_size,
+                        'mime_type': 'application/json',
+                        'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                        'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                        'content': json_data,  # Store the parsed JSON object
+                        'raw_content': complete_content,  # Store the original JSON string
+                        'timestamp': datetime.now().isoformat()
+                    }]
+                else:
+                    # Try to parse as JSON anyway
+                    try:
+                        json_data = json.loads(complete_content)
+                        # If we get here, it's valid JSON despite the extension
+                        return [{
+                            'source': 'json_content',
+                            'filename': os.path.basename(file.name),
+                            'file_size': file_size,
+                            'mime_type': 'application/json',
+                            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                            'content': json_data,  # Store the parsed JSON object
+                            'raw_content': complete_content,  # Store the original JSON string
+                            'timestamp': datetime.now().isoformat()
+                        }]
+                    except json.JSONDecodeError:
+                        logger.warning(f"File {file.name} is not valid JSON.")
+            except Exception as e:
+                logger.error(f"Error during JSON processing: {e}")

+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
                 'file_size': file_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': complete_content,
                 'timestamp': datetime.now().isoformat()
+            }]
         except Exception as e:
             logger.error(f"File processing error: {e}")
             return []
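+    # For instance, an uploaded notes.txt whose entire body is '{"a": 1}' comes
+    # back with source 'json_content' and a parsed dict, while ordinary prose
+    # falls through to the plain 'file' record above.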

     def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
         """Process an archive file with enhanced extraction"""
         dataset = []
         try:
             # Handle ZIP archives
+            if zipfile.is_zipfile(archive_path):
                 with zipfile.ZipFile(archive_path, 'r') as zip_ref:
+                    zip_ref.extractall(extract_to)
                     for file_info in zip_ref.infolist():
+                        if file_info.file_size > 0 and not file_info.filename.endswith('/'):
+                            extracted_path = extract_to / file_info.filename
+                            if extracted_path.suffix.lower() in self.supported_extensions:
+                                with open(extracted_path, 'rb') as f:
+                                    dataset.extend(self._process_single_file(f))
+            # Handle TAR archives
+            elif archive_path.lower().endswith(('.tar', '.tar.gz', '.tgz')):
+                try:
+                    with tarfile.open(archive_path, 'r:*') as tar_ref:
+                        for member in tar_ref.getmembers():
+                            if member.isfile():
+                                extracted_path = extract_to / member.name
+                                tar_ref.extract(member, path=extract_to)
+                                if extracted_path.suffix.lower() in self.supported_extensions:
+                                    with open(extracted_path, 'rb') as f:
+                                        dataset.extend(self._process_single_file(f))
+                except tarfile.TarError as e:
+                    logger.error(f"Error processing TAR archive: {e}")
+            # Handle GZIP archives (single file)
+            elif archive_path.lower().endswith('.gz'):
+                extracted_path = extract_to / Path(archive_path).stem
+                try:
                     with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
                         outfile.write(gz_file.read())
+                    if extracted_path.suffix.lower() in self.supported_extensions:
+                        with open(extracted_path, 'rb') as f:
+                            dataset.extend(self._process_single_file(f))
+                except (OSError, EOFError) as e:  # gzip errors surface as OSError/EOFError; gzip.GzipFile is a class, not an exception
+                    logger.error(f"Error processing GZIP archive: {e}")
+            # TODO: Add support for other archive types (.bz2, .7z, .rar) - may require external libraries
+            elif archive_path.lower().endswith(('.bz2', '.7z', '.rar')):
+                logger.warning(f"Support for {Path(archive_path).suffix} archives is not yet fully implemented.")
         except Exception as e:
             logger.error(f"Archive processing error: {e}")
         return dataset
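+    # A minimal sketch of how the missing .bz2 branch could look, mirroring the
+    # GZIP handling above. It assumes the stdlib 'bz2' module is imported at the
+    # top of the file (it is not currently), so it is left commented out.
+    # def _process_bz2(self, archive_path: str, extract_to: Path) -> List[Dict]:
+    #     dataset = []
+    #     extracted_path = extract_to / Path(archive_path).stem
+    #     with bz2.open(archive_path, 'rb') as bz_file, open(extracted_path, 'wb') as outfile:
+    #         outfile.write(bz_file.read())
+    #     if extracted_path.suffix.lower() in self.supported_extensions:
+    #         with open(extracted_path, 'rb') as f:
+    #             dataset.extend(self._process_single_file(f))
+    #     return dataset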

+    def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
+        """Enhanced data chunking with sequence metadata.
+
+        The 2953-byte default matches the byte-mode capacity of a version-40
+        QR code at error-correction level L.
+        """
         try:
+            # Convert data to JSON string
+            json_str = json.dumps(data, ensure_ascii=False)
+            total_length = len(json_str)
+
+            # Calculate overhead for metadata
+            metadata_template = {
+                "chunk_index": 0,
+                "total_chunks": 1,
+                "total_length": total_length,
+                "chunk_hash": "",
+                "data": ""
+            }
+            overhead = len(json.dumps(metadata_template)) + 20  # Extra padding for safety

+            # Calculate effective chunk size
+            effective_chunk_size = max_size - overhead

+            if total_length <= effective_chunk_size:
+                # Data fits in one chunk
+                chunk = {
                     "chunk_index": 0,
                     "total_chunks": 1,
+                    "total_length": total_length,
+                    # Caveat: hash() is salted per Python process, so these values
+                    # cannot be re-verified by a separate decoder; a stable digest
+                    # (e.g. zlib.crc32) would be needed for that.
+                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,  # 32-bit hash
+                    "data": json_str
                 }
+                return [chunk]

+            # Calculate number of chunks needed
+            num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
+            chunk_size = -(-total_length // num_chunks)  # Even distribution

             chunks = []
             for i in range(num_chunks):
+                start_idx = i * chunk_size
+                end_idx = min(start_idx + chunk_size, total_length)
+                chunk_data = json_str[start_idx:end_idx]

+                chunk = {
                     "chunk_index": i,
                     "total_chunks": num_chunks,
+                    "total_length": total_length,
+                    "chunk_hash": hash(chunk_data) & 0xFFFFFFFF,
+                    "data": chunk_data
                 }
+                chunks.append(chunk)

             return chunks
         except Exception as e:
             logger.error(f"Error chunking data: {e}")
             return []
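+    # A minimal decoder-side sketch showing how the chunks above reassemble.
+    # The 'decoded_payloads' argument is hypothetical (one JSON string per
+    # scanned QR code); nothing in this file produces it.
+    @staticmethod
+    def reassemble_chunks(decoded_payloads: List[str]) -> Union[Dict, List, None]:
+        """Rebuild the original object from scanned chunk payloads."""
+        chunks = sorted((json.loads(p) for p in decoded_payloads),
+                        key=lambda c: c["chunk_index"])
+        if not chunks or len(chunks) != chunks[0]["total_chunks"]:
+            return None  # incomplete scan
+        return json.loads(''.join(c["data"] for c in chunks))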

+def generate_stylish_qr(data: Union[str, Dict],
                         filename: str,
                         size: int = 10,
                         border: int = 4,
                         fill_color: str = "#000000",
+                        back_color: str = "#FFFFFF") -> str:
     """Generate a stylish QR code with enhanced visual appeal"""
     try:
         qr = qrcode.QRCode(
+            version=None,
+            # Level L keeps the per-code byte capacity at 2953 (version 40), in
+            # step with chunk_data's default; higher correction levels hold less
+            # data and would overflow on full-size chunks.
+            error_correction=qrcode.constants.ERROR_CORRECT_L,
             box_size=size,
             border=border
         )

+        # Add data to QR code
+        if isinstance(data, dict):
+            qr.add_data(json.dumps(data, ensure_ascii=False))
+        else:
+            qr.add_data(data)
         qr.make(fit=True)

         # Create QR code image with custom colors
         qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)

+        # Convert to RGBA for transparency support
         qr_image = qr_image.convert('RGBA')

+        # Add subtle gradient overlay
+        gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
+        draw = ImageDraw.Draw(gradient)
+        for i in range(qr_image.width):
+            alpha = int(255 * (1 - i / qr_image.width) * 0.1)  # 10% maximum opacity
+            draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))

+        # Combine images
+        final_image = Image.alpha_composite(qr_image, gradient)

         # Save the image
         output_path = QR_CODES_DIR / filename
+        final_image.save(output_path, quality=95)
         return str(output_path)
     except Exception as e:
         logger.error(f"QR generation error: {e}")
         return ""

+def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
+    """Generate QR codes with enhanced visual appeal and metadata"""
     try:
+        file_processor = EnhancedFileProcessor()
+        paths = []
+
+        if combined:
+            # Process combined data
+            chunks = file_processor.chunk_data(data)
+            for i, chunk in enumerate(chunks):
+                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
                 qr_path = generate_stylish_qr(
+                    data=chunk,
                     filename=filename,
+                    fill_color="#1a365d",  # Deep blue
+                    back_color="#ffffff"
                 )
                 if qr_path:
+                    paths.append(qr_path)
         else:
+            # Process individual items
+            if isinstance(data, list):
+                for idx, item in enumerate(data):
+                    chunks = file_processor.chunk_data(item)
+                    for chunk_idx, chunk in enumerate(chunks):
+                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
+                        qr_path = generate_stylish_qr(
+                            data=chunk,
+                            filename=filename,
+                            fill_color="#1a365d",  # Deep blue
+                            back_color="#ffffff"
+                        )
+                        if qr_path:
+                            paths.append(qr_path)
+            else:
+                chunks = file_processor.chunk_data(data)
+                for i, chunk in enumerate(chunks):
+                    filename = f'single_qr_{i+1}_of_{len(chunks)}_{int(time.time())}.png'
                     qr_path = generate_stylish_qr(
+                        data=chunk,
                         filename=filename,
+                        fill_color="#1a365d",  # Deep blue
+                        back_color="#ffffff"
                     )
                     if qr_path:
+                        paths.append(qr_path)
+        return paths
     except Exception as e:
         logger.error(f"QR code generation error: {e}")
         return []
 
500
+ def create_qr_visualizer(qr_paths, metadata=None):
501
+ """Create an interactive visualization of sequenced QR codes"""
502
+ if not qr_paths:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  return None
504
+
505
+ # Extract metadata from QR codes if not provided
506
+ if metadata is None:
507
+ metadata = []
508
+ for path in qr_paths:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  try:
510
+ img = Image.open(path)
511
+ qr = qrcode.QRCode()
512
+ data = qrcode.image.pil.PilImage.get_qr_data(img)
513
+ if data:
514
+ metadata.append(json.loads(data))
515
+ else:
516
+ # If can't extract, add placeholder
517
+ metadata.append({"chunk_index": len(metadata), "total_chunks": len(qr_paths)})
518
+ except Exception as e:
519
+ logger.error(f"Error extracting QR metadata: {e}")
520
+ metadata.append({"chunk_index": len(metadata), "total_chunks": len(qr_paths)})
521
+
522
+ # Compute optimal grid size
523
+ total_codes = len(qr_paths)
524
+ grid_size = math.ceil(math.sqrt(total_codes))
525
+
526
+ # Create a composite image with placeholders for disabled QR codes
527
+ def create_composite(enabled_indices):
528
+ # Size calculations for the grid
529
+ qr_size = 200 # Size of each QR code in pixels
530
+ padding = 20 # Padding between QR codes
531
+
532
+ # Create grid for visualization
533
+ grid_width = grid_size * (qr_size + padding) + padding
534
+ grid_height = grid_size * (qr_size + padding) + padding
535
+
536
+ # Create a white background image
537
+ composite = Image.new('RGBA', (grid_width, grid_height), (255, 255, 255, 255))
538
+ draw = ImageDraw.Draw(composite)
539
+
540
+ # Load and place QR codes on the grid
541
+ for i, path in enumerate(qr_paths):
542
+ # Calculate grid position
543
+ row = i // grid_size
544
+ col = i % grid_size
545
+
546
+ # Calculate pixel position
547
+ x = col * (qr_size + padding) + padding
548
+ y = row * (qr_size + padding) + padding
549
+
550
+ if i in enabled_indices:
551
  try:
552
+ # Load and resize QR code
553
+ qr_img = Image.open(path)
554
+ qr_img = qr_img.resize((qr_size, qr_size), Image.Resampling.LANCZOS)
555
+
556
+ # Extract metadata for this QR
557
+ meta = metadata[i] if i < len(metadata) else {}
558
+ chunk_index = meta.get("chunk_index", i)
559
+ total_chunks = meta.get("total_chunks", len(qr_paths))
560
+
561
+ # Add visual indicator for sequence position
562
+ sequence_indicator = Image.new('RGBA', (qr_size, 30), (26, 54, 93, 200)) # Dark blue
563
+ draw_indicator = ImageDraw.Draw(sequence_indicator)
564
+ draw_indicator.text((10, 5), f"#{chunk_index+1} of {total_chunks}", fill=(255, 255, 255))
565
+
566
+ # Combine QR with indicator
567
+ qr_with_indicator = Image.new('RGBA', (qr_size, qr_size + 30))
568
+ qr_with_indicator.paste(qr_img, (0, 0))
569
+ qr_with_indicator.paste(sequence_indicator, (0, qr_size), sequence_indicator)
570
+
571
+ # Paste onto composite
572
+ composite.paste(qr_with_indicator, (x, y))
573
+
574
+ # Draw connection lines based on sequence
575
+ if i > 0:
576
+ prev_x = (col - 1) * (qr_size + padding) + padding if col > 0 else x
577
+ prev_y = (row * (qr_size + padding)) + padding
578
+ draw.line([(prev_x + qr_size // 2, prev_y + qr_size), (x + qr_size // 2, y)], fill=(0, 0, 0, 255), width=2)
579
+
580
+ return composite
581
+
582
+ # Create a toggleable interface for enabling/disabling QR codes
583
+ enabled_indices = list(range(total_codes)) # Start with all enabled
584
+ def toggle_qr(index):
585
+ if index in enabled_indices:
586
+ enabled_indices.remove(index)
587
+ else:
588
+ enabled_indices.append(index)
589
+ return create_composite(enabled_indices)
590
 
591
+ # Create the initial composite image
592
+ initial_composite = create_composite(enabled_indices)
593
 
594
+ # Display the composite image
595
+ plt.figure(figsize=(10, 10))
596
+ plt.imshow(initial_composite)
597
+ plt.axis('off')
598
+ plt.show()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
 
600
+ return toggle_qr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
 
602
+ # Integrate the visualizer into the main application
603
+ def visualize_qr_codes(qr_paths):
604
+ """Visualize the generated QR codes with enable/disable functionality"""
605
+ toggle_function = create_qr_visualizer(qr_paths)
606
+ return toggle_function
607
 
608
+ # Add a button in the Gradio interface to trigger visualization
609
+ visualize_btn = gr.Button("πŸ” Visualize QR Codes")
610
+ visualize_btn.click(visualize_qr_codes, inputs=output_gallery, outputs=None)
 
 

 def create_modern_interface():
     """Create a modern and visually appealing Gradio interface"""

+    # Modern CSS styling
     css = """
     /* Modern color scheme */
     :root {

     /* Gallery styling */
     .gallery {
         display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
         gap: 1rem;
         padding: 1rem;
         background-color: white;
         border-radius: 0.5rem;
         border: 1px solid #e2e8f0;
     }
     .gallery img {
         width: 100%;
         height: auto;
         border-radius: 0.375rem;
         transition: transform 0.2s;
     }
     .gallery img:hover {
         transform: scale(1.05);
     }
     """
     # Create interface with modern design
 
         # 🌐 Advanced Data Processing & QR Code Generator
         Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
         """)
+        with gr.Tab("πŸ“ URL Processing"):
+            url_input = gr.Textbox(
+                label="Enter URLs (comma or newline separated)",
+                lines=5,
+                placeholder="https://example1.com\nhttps://example2.com",
+                value=""
+            )
+        with gr.Tab("πŸ“ File Input"):
+            file_input = gr.File(
+                label="Upload Files",
+                file_types=["*"],  # Accept all file types
+                file_count="multiple"
+            )
+        with gr.Tab("πŸ“‹ JSON Input"):
+            text_input = gr.TextArea(
+                label="Direct JSON Input",
+                lines=15,
+                placeholder="Paste your JSON data here...",
+                value=""
+            )
+        with gr.Row():
+            example_btn = gr.Button("πŸ“ Load Example", variant="secondary")
+            clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
         with gr.Row():
+            combine_data = gr.Checkbox(
+                label="Combine all data into sequence",
+                value=True,
+                info="Generate sequential QR codes for combined data"
+            )
+        process_btn = gr.Button(
+            "πŸ”„ Process & Generate QR",
+            variant="primary"
+        )
+        # Output components
+        output_json = gr.JSON(label="Processed Data")
+        output_gallery = gr.Gallery(
+            label="Generated QR Codes",
+            columns=3,
+            height=400,
+            show_label=True
+        )
+        output_text = gr.Textbox(
+            label="Processing Status",
+            interactive=False
+        )
 
         # Load example data
         def load_example():
             example = {
+                "type": "product_catalog",
                 "items": [
+                    {
+                        "id": "123",
+                        "name": "Premium Widget",
+                        "description": "High-quality widget with advanced features",
+                        "price": 299.99,
+                        "category": "electronics",
+                        "tags": ["premium", "featured", "new"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Basic Widget",
+                        "description": "Reliable widget for everyday use",
+                        "price": 149.99,
+                        "category": "electronics",
+                        "tags": ["basic", "popular"]
+                    }
                 ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "2.0",
+                    "source": "example"
+                }
             }
             return json.dumps(example, indent=2)

+        def clear_input():
             return ""
+        def process_inputs(urls, files, text, combine):
+            """Process all inputs and generate QR codes"""
            try:
+                results = []
+                url_processor = EnhancedURLProcessor()
+                file_processor = EnhancedFileProcessor()
+
+                # Process JSON input
+                if text and text.strip():
+                    try:
+                        json_data = json.loads(text)
+                        if isinstance(json_data, list):
+                            results.extend(json_data)
+                        else:
+                            results.append(json_data)
+                    except json.JSONDecodeError as e:
+                        return None, [], f"❌ Invalid JSON format: {str(e)}"
+
+                # Process URLs
+                if urls and urls.strip():
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+                    for url in url_list:
+                        validation = url_processor.validate_url(url)
+                        if validation['is_valid']:
+                            content = url_processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+
+                # Process files
+                if files:
+                    for file in files:
+                        file_results = file_processor.process_file(file)
+                        if file_results:
+                            results.extend(file_results)
+
+                # Generate QR codes
+                if results:
+                    qr_paths = generate_qr_codes(results, combine)
+                    if qr_paths:
+                        return (
+                            results,
+                            [str(path) for path in qr_paths],
+                            f"βœ… Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
+                        )
+                    else:
+                        return None, [], "❌ Failed to generate QR codes"
+                else:
+                    return None, [], "⚠️ No valid content to process"
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, [], f"❌ Error: {str(e)}"
 
+        # Set up event handlers
         example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input, outputs=[text_input])
         process_btn.click(
+            process_inputs,
             inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]
         )

+        # Add helpful documentation
         gr.Markdown("""
         ### πŸš€ Features
+        - **Complete URL Scraping**: Extracts the full text content of web pages
+        - **Advanced File Processing**: Complete content extraction from text-based files and common archives, with flexible JSON handling
+        - **Smart JSON Handling**: Processes JSON of any size with automatic chunking, via direct input or file upload
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes
+        - **Modern Design**: Clean, responsive interface with visual feedback
+        ### πŸ’‘ Tips
+        1. **URLs**: Enter multiple URLs separated by commas or newlines
+        2. **Files**: Upload any type of file. The processor handles supported text-based files, archives (.zip, .tar, .gz), and JSON.
+        3. **JSON**: Use the example button to see the expected format, or upload a .json file. The system also tries to detect JSON content in other file types.
+        4. **QR Codes**: Choose whether to combine all inputs into one sequential set of codes
+        5. **Processing**: Monitor the status box for real-time feedback
+        ### 🎨 Output
+        - Generated QR codes are saved in the `output/qr_codes` directory
+        - Each QR code carries metadata for proper sequencing
+        - Hover over QR codes in the gallery to see details
+        """)
+    return interface

 def main():
     """Initialize and launch the application"""
     try:
+        # Configure system settings
+        mimetypes.init()

         # Create and launch interface
         interface = create_modern_interface()

         # Launch with configuration
         interface.launch(
+            share=False,
+            debug=False,
+            show_error=True,
+            show_api=False
         )
     except Exception as e:
+        logger.error(f"Application startup error: {e}")
         raise

 if __name__ == "__main__":
+    main()