acecalisto3 committed · verified
Commit 3a36f7c · Parent(s): da4162d

Update app.py

Files changed (1):
  1. app.py +1049 -510

app.py CHANGED
@@ -56,7 +56,7 @@ class EnhancedURLProcessor:
 
         # Enhanced headers for better site compatibility
         self.session.headers.update({
-            'User-Agent': self.user_agent.random,
             'Accept': '*/*',  # Accept all content types
             'Accept-Language': 'en-US,en;q=0.9',
             'Accept-Encoding': 'gzip, deflate, br',
@@ -65,7 +65,7 @@ class EnhancedURLProcessor:
             'Sec-Fetch-Dest': 'document',
             'Sec-Fetch-Mode': 'navigate',
             'Sec-Fetch-Site': 'none',
-            'Sec-Fetch-User': '?1',
             'DNT': '1'
         })
 
@@ -78,22 +78,32 @@ class EnhancedURLProcessor:
             if not all([parsed.scheme, parsed.netloc]):
                 return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
             # Try HEAD request first to check accessibility
             try:
                 head_response = self.session.head(url, timeout=5)
                 head_response.raise_for_status()
             except requests.exceptions.RequestException:
                 # If HEAD fails, try GET as some servers don't support HEAD
                 response = self.session.get(url, timeout=self.timeout)
                 response.raise_for_status()
 
             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
-                'details': {
-                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
-                    'server': head_response.headers.get('Server', 'unknown'),
-                    'size': head_response.headers.get('Content-Length', 'unknown')
-                }
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
@@ -104,7 +114,7 @@ class EnhancedURLProcessor:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
 
             # Update User-Agent randomly for each request
-            self.session.headers.update({'User-Agent': self.user_agent.random})
 
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
@@ -118,7 +128,7 @@ class EnhancedURLProcessor:
             # Decode content with fallback
             try:
                 raw_content = response.content.decode(encoding, errors='replace')
-            except (UnicodeDecodeError, LookupError):
                 raw_content = response.content.decode('utf-8', errors='replace')
 
             # Extract metadata
@@ -137,10 +147,10 @@ class EnhancedURLProcessor:
             if 'text/html' in content_type:
                 processed_content = self._process_html_content(raw_content, url)
             else:
-                processed_content = raw_content
             return {
                 'content': processed_content,
-                'raw_content': raw_content,
                 'metadata': metadata
             }
         except requests.exceptions.RequestException as e:
@@ -164,470 +174,890 @@ class EnhancedURLProcessor:
             for attr in ['href', 'src']:
                 if tag.get(attr):
                     try:
-                        tag[attr] = urljoin(base_url, tag[attr])
-                    except Exception:
-                        pass
-            # Extract all text content
-            text_parts = []
-            for element in soup.stripped_strings:
-                text_parts.append(str(element))
-            return '\n'.join(text_parts)
         except Exception as e:
             logger.error(f"HTML processing error: {e}")
             return content
 
 class EnhancedFileProcessor:
     """Advanced file processing with complete content extraction"""
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
         self.max_file_size = max_file_size
         self.supported_extensions = {
-            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
-            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
-            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
-            '.pdf', '.doc', '.docx', '.rtf', '.odt'
         }
 
     def process_file(self, file) -> List[Dict]:
         """Process uploaded file with enhanced error handling and complete extraction"""
-        if not file:
-            return []
 
         dataset = []
         try:
-            file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
-                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
 
-            with tempfile.TemporaryDirectory() as temp_dir:
                 temp_dir_path = Path(temp_dir)
 
-                # Handle different archive types
-                if self._is_archive(file.name):
-                    dataset.extend(self._process_archive(file.name, temp_dir_path))
-                elif Path(file.name).suffix.lower() in self.supported_extensions:
-                    dataset.extend(self._process_single_file(file))
                 else:
-                    logger.warning(f"Unsupported file type: {file.name}")
 
         except Exception as e:
-            logger.error(f"Error processing file: {str(e)}")
-            return []
         return dataset
 
     def _is_archive(self, filepath: str) -> bool:
-        """Check if file is an archive"""
-        return any(filepath.lower().endswith(ext) for ext in [
-            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
-        ])
 
-    def _process_single_file(self, file) -> List[Dict]:
         """Process a single file with enhanced character extraction and JSON handling"""
         try:
-            file_stat = os.stat(file.name)
             file_size = file_stat.st_size
             # Initialize content storage
-            content_parts = []
-            # Process file in chunks for large files
             chunk_size = 10 * 1024 * 1024  # 10MB chunks
-            with open(file.name, 'rb') as f:
-                while True:
-                    chunk = f.read(chunk_size)
-                    if not chunk:
-                        break
-                    # Detect encoding for each chunk
-                    encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
-                    try:
-                        decoded_chunk = chunk.decode(encoding, errors='replace')
-                        content_parts.append(decoded_chunk)
-                    except (UnicodeDecodeError, LookupError):
-                        decoded_chunk = chunk.decode('utf-8', errors='replace')
-                        content_parts.append(decoded_chunk)
-            # Combine all chunks
-            complete_content = ''.join(content_parts)
-            # Check if the content is valid JSON regardless of file extension
-            try:
-                if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
-                    # It's a JSON file by type or extension
                     json_data = json.loads(complete_content)
-                    return [{
-                        'source': 'json_file',
-                        'filename': os.path.basename(file.name),
-                        'file_size': file_size,
-                        'mime_type': 'application/json',
-                        'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                        'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                        'content': json_data,  # Store the parsed JSON object
-                        'raw_content': complete_content,  # Store the original JSON string
-                        'timestamp': datetime.now().isoformat()
-                    }]
-                else:
-                    # Try to parse as JSON anyway
-                    try:
-                        json_data = json.loads(complete_content)
-                        # If we get here, it's valid JSON despite the extension
-                        return [{
-                            'source': 'json_content',
-                            'filename': os.path.basename(file.name),
-                            'file_size': file_size,
-                            'mime_type': 'application/json',
-                            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                            'content': json_data,  # Store the parsed JSON object
-                            'raw_content': complete_content,  # Store the original JSON string
-                            'timestamp': datetime.now().isoformat()
-                        }]
-                    except json.JSONDecodeError:
-                        logger.warning(f"File {file.name} is not valid JSON.")
-            except Exception as e:
-                logger.error(f"Error during JSON processing: {e}")
 
-            return [{
-                'source': 'file',
-                'filename': os.path.basename(file.name),
                 'file_size': file_size,
-                'mime_type': mimetypes.guess_type(file.name)[0],
                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                'content': complete_content,
                 'timestamp': datetime.now().isoformat()
-            }]
         except Exception as e:
-            logger.error(f"File processing error: {e}")
             return []
 
     def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
         """Process an archive file with enhanced extraction"""
         dataset = []
         try:
             # Handle ZIP archives
-            if zipfile.is_zipfile(archive_path):
                 with zipfile.ZipFile(archive_path, 'r') as zip_ref:
-                    zip_ref.extractall(extract_to)
                     for file_info in zip_ref.infolist():
-                        if file_info.file_size > 0 and not file_info.filename.endswith('/'):
-                            extracted_path = extract_to / file_info.filename
-                            if extracted_path.suffix.lower() in self.supported_extensions:
-                                try:
-                                    with open(extracted_path, 'rb') as f:
-                                        dataset.extend(self._process_single_file(f))
-                                except Exception as e:
-                                    logger.error(f"Error processing extracted file {extracted_path}: {e}")
-            # Handle TAR archives
-            elif archive_path.lower().endswith(('.tar', '.tar.gz', '.tgz')):
-                try:
-                    with tarfile.open(archive_path, 'r:*') as tar_ref:
-                        for member in tar_ref.getmembers():
-                            if member.isfile():
-                                extracted_path = extract_to / member.name
-                                try:
-                                    tar_ref.extract(member, path=extract_to)
-                                    if extracted_path.suffix.lower() in self.supported_extensions:
-                                        with open(extracted_path, 'rb') as f:
-                                            dataset.extend(self._process_single_file(f))
-                                except Exception as e:
-                                    logger.error(f"Error extracting or processing TAR member {member.name}: {e}")
-                except tarfile.TarError as e:
-                    logger.error(f"Error processing TAR archive: {e}")
-            # Handle GZIP archives (single file)
-            elif archive_path.lower().endswith('.gz'):
-                extracted_path = extract_to / Path(archive_path).stem
-                try:
                     with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
                         outfile.write(gz_file.read())
-                    if extracted_path.suffix.lower() in self.supported_extensions:
-                        with open(extracted_path, 'rb') as f:
-                            dataset.extend(self._process_single_file(f))
-                except gzip.GzipFile as e:
-                    logger.error(f"Error processing GZIP archive: {e}")
-            # TODO: Add support for other archive types (.bz2, .7z, .rar) - may require external libraries
-            elif archive_path.lower().endswith(('.bz2', '.7z', '.rar')):
-                logger.warning(f"Support for {Path(archive_path).suffix} archives is not yet fully implemented.")
 
         except Exception as e:
-            logger.error(f"Archive processing error: {e}")
         return dataset
 
-    def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
-        """Enhanced data chunking with sequence metadata"""
         try:
-            # Convert data to JSON string
-            json_str = json.dumps(data, ensure_ascii=False)
-            total_length = len(json_str)
-
-            # Calculate overhead for metadata
-            metadata_template = {
-                "chunk_index": 0,
-                "total_chunks": 1,
-                "total_length": total_length,
-                "chunk_hash": "",
-                "data": ""
-            }
-            overhead = len(json.dumps(metadata_template)) + 20  # Extra padding for safety
 
-            # Calculate effective chunk size
-            effective_chunk_size = max_size - overhead
 
-            if total_length <= effective_chunk_size:
-                # Data fits in one chunk
-                chunk = {
                     "chunk_index": 0,
                     "total_chunks": 1,
-                    "total_length": total_length,
-                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,  # 32-bit hash
-                    "data": json_str
                 }
-                return [chunk]
 
-            # Calculate number of chunks needed
-            num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
-            chunk_size = -(-total_length // num_chunks)  # Even distribution
 
             chunks = []
             for i in range(num_chunks):
-                start_idx = i * chunk_size
-                end_idx = min(start_idx + chunk_size, total_length)
-                chunk_data = json_str[start_idx:end_idx]
 
-                chunk = {
                     "chunk_index": i,
                     "total_chunks": num_chunks,
-                    "total_length": total_length,
-                    "chunk_hash": hash(chunk_data) & 0xFFFFFFFF,
-                    "data": chunk_data
                 }
-                chunks.append(chunk)
 
             return chunks
         except Exception as e:
-            logger.error(f"Error chunking data: {e}")
             return []
 
-def generate_stylish_qr(data: Union[str, Dict],
                         filename: str,
                         size: int = 10,
                         border: int = 4,
                         fill_color: str = "#000000",
-                        back_color: str = "#FFFFFF") -> str:
     """Generate a stylish QR code with enhanced visual appeal"""
     try:
         qr = qrcode.QRCode(
-            version=None,
-            error_correction=qrcode.constants.ERROR_CORRECT_H,
             box_size=size,
             border=border
         )
 
-        # Add data to QR code
-        if isinstance(data, dict):
-            qr.add_data(json.dumps(data, ensure_ascii=False))
-        else:
-            qr.add_data(data)
 
         qr.make(fit=True)
 
         # Create QR code image with custom colors
         qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
 
-        # Convert to RGBA for transparency support
         qr_image = qr_image.convert('RGBA')
 
-        # Add subtle gradient overlay
-        gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
-        draw = ImageDraw.Draw(gradient)
-        for i in range(qr_image.width):
-            alpha = int(255 * (1 - i/qr_image.width) * 0.1)  # 10% maximum opacity
-            draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
 
-        # Combine images
-        final_image = Image.alpha_composite(qr_image, gradient)
 
         # Save the image
         output_path = QR_CODES_DIR / filename
-        final_image.save(output_path, quality=95)
 
         return str(output_path)
     except Exception as e:
-        logger.error(f"QR generation error: {e}")
         return ""
 
-def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
-    """Generate QR codes with enhanced visual appeal and metadata"""
     try:
-        file_processor = EnhancedFileProcessor()
-        paths = []
-
-        if combined:
-            # Process combined data
-            chunks = file_processor.chunk_data(data)
-            for i, chunk in enumerate(chunks):
-                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
                 qr_path = generate_stylish_qr(
-                    data=chunk,
                     filename=filename,
-                    fill_color="#1a365d",  # Deep blue
-                    back_color="#ffffff"
                 )
                 if qr_path:
-                    paths.append(qr_path)
         else:
-            # Process individual items
-            if isinstance(data, list):
-                for idx, item in enumerate(data):
-                    chunks = file_processor.chunk_data(item)
-                    for chunk_idx, chunk in enumerate(chunks):
-                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
-                        qr_path = generate_stylish_qr(
-                            data=chunk,
-                            filename=filename,
-                            fill_color="#1a365d",  # Deep blue
-                            back_color="#ffffff"
-                        )
-                        if qr_path:
-                            paths.append(qr_path)
-            else:
-                chunks = file_processor.chunk_data(data)
-                for i, chunk in enumerate(chunks):
-                    filename = f'single_qr_{i+1}_of_{len(chunks)}_{int(time.time())}.png'
                     qr_path = generate_stylish_qr(
-                        data=chunk,
                         filename=filename,
-                        fill_color="#1a365d",  # Deep blue
-                        back_color="#ffffff"
                     )
                     if qr_path:
-                        paths.append(qr_path)
-        return paths
     except Exception as e:
-        logger.error(f"QR code generation error: {e}")
         return []
 
-def create_qr_sequence_visualizer(output_gallery):
     """Add QR sequence visualization capabilities to the application"""
-    # Create a new tab for the QR code sequence visualization
     with gr.Tab("🔄 QR Sequence Visualizer"):
         gr.Markdown("""
         ## QR Code Sequence Visualizer
-        Arrange and visualize your QR code sequences. Enable or disable individual QR codes to see how they connect.
         """)
 
-        # Inputs for the visualizer
-        with gr.Row():
-            qr_input = gr.File(
-                label="Upload QR Codes",
-                file_types=["image/png", "image/jpeg"],
-                file_count="multiple"
-            )
 
-            with gr.Column():
-                visualize_btn = gr.Button("🔄 Generate Visualization", variant="primary")
-                reset_btn = gr.Button("🗑️ Reset", variant="secondary")
-        # Container for QR code toggles
-        qr_toggles_container = gr.HTML(label="QR Code Controls")
-
-        # Output visualization
         with gr.Row():
-            qr_visualization = gr.Image(label="QR Code Sequence Map", height=600)
-            qr_preview = gr.Gallery(label="Selected QR Codes", columns=2, height=600)
 
-        # Status output
-        visualization_status = gr.Textbox(label="Visualization Status", interactive=False)
 
-        # Function to process uploaded QR codes
-        def process_qr_codes(files):
             if not files:
-                return "Please upload QR code images.", None, None, "⚠️ No QR codes uploaded"
 
             try:
-                # Load QR codes and extract metadata
-                qr_data = []
-                qr_paths = []
 
-                for file in files:
-                    try:
-                        img = Image.open(file.name)
 
-                        # Try to decode QR code
                         try:
-                            detector = qrcode.QRCodeDetector()
-                            data, bbox, _ = detector.detectAndDecode(np.array(img))
-                            if data:
-                                try:
-                                    qr_json = json.loads(data)
-                                    qr_data.append(qr_json)
-                                    qr_paths.append(file.name)
-                                except json.JSONDecodeError:
-                                    logger.warning(f"Could not decode JSON from QR: {data}")
-                                    qr_data.append({"data": data})  # Store raw data if JSON fails
-                                    qr_paths.append(file.name)
                             else:
-                                qr_data.append({"data": "Empty QR"})
-                                qr_paths.append(file.name)
-                        except Exception as e:
-                            logger.warning(f"Could not decode QR: {e}")
-                            # Add with default metadata
-                            qr_data.append({
-                                "chunk_index": len(qr_data),
-                                "total_chunks": len(files),
-                                "data": "Unknown"
-                            })
-                            qr_paths.append(file.name)
-                    except Exception as e:
-                        logger.error(f"Error processing QR image {file.name}: {e}")
-
-                if not qr_data:
-                    return "No valid QR codes found.", None, None, "❌ Failed to process QR codes"
-
-                # Sort by chunk_index if available
-                try:
-                    sorted_data = sorted(zip(qr_data, qr_paths), key=lambda x: x[0].get("chunk_index", 0))
-                    qr_data = [d[0] for d in sorted_data]
-                    qr_paths = [d[1] for d in sorted_data]
-                except Exception as e:
-                    logger.error(f"Error sorting QR data: {e}")
 
-                # Generate toggle controls HTML
-                toggle_html = '<div style="max-height: 500px; overflow-y: auto; padding: 10px;">'
-                toggle_html += '<h3>Enable/Disable QR Codes:</h3>'
-                for i, path in enumerate(qr_paths):
-                    toggle_html += f'<div><input type="checkbox" id="qr_toggle_{i}" checked> <label for="qr_toggle_{i}">{os.path.basename(path)}</label></div>'
-                toggle_html += '</div>'
 
-                # Update the toggles container
-                qr_toggles_container.update(value=toggle_html)
 
-                # Create initial visualization (replace with actual visualization logic)
-                initial_visualization = "Visualization will appear here."  # Replace with your composite image generation
-                qr_visualization.update(value=initial_visualization)
 
-                return "QR codes processed successfully.", qr_paths, qr_data, "✅ Visualization ready!"
             except Exception as e:
-                logger.error(f"Error processing QR codes: {e}")
-                return "An error occurred while processing QR codes.", None, None, "❌ Error"
 
-        # Function to generate visualization (replace with actual logic)
-        def generate_visualization(qr_paths):
-            enabled_indices = [i for i in range(len(qr_paths))]  # Start with all enabled
-            composite_image = "Updated visualization will appear here."  # Replace with your composite image generation based on enabled_indices
-            qr_visualization.update(value=composite_image)
 
         # Event handlers
-        visualize_btn.click(process_qr_codes, inputs=qr_input, outputs=[visualization_status, qr_paths, qr_preview])
-        reset_btn.click(lambda: (None, None, None, "⚠️ Visualization reset."), outputs=[visualization_status, qr_visualization, qr_preview])
-
-# Integrate the visualizer into the main application
-def visualize_qr_codes(qr_paths):
-    """Visualize the generated QR codes with enable/disable functionality"""
-    # This function currently receives the output gallery content (list of file paths)
-    # You might need to adapt this based on how you want to visualize.
-    # For now, let's just log the paths.
-    logger.info(f"Visualizing QR codes: {qr_paths}")
-    return "Visualization placeholder"  # Replace with actual visualization logic
 
 def create_modern_interface():
     """Create a modern and visually appealing Gradio interface"""
 
-    # Modern CSS styling
     css = """
     /* Modern color scheme */
     :root {
@@ -682,21 +1112,25 @@ def create_modern_interface():
     /* Gallery styling */
     .gallery {
         display: grid;
-        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
         gap: 1rem;
         padding: 1rem;
         background-color: white;
         border-radius: 0.5rem;
         border: 1px solid #e2e8f0;
     }
     .gallery img {
         width: 100%;
         height: auto;
         border-radius: 0.375rem;
         transition: transform 0.2s;
     }
     .gallery img:hover {
         transform: scale(1.05);
     }
     """
     # Create interface with modern design
@@ -705,200 +1139,305 @@ def create_modern_interface():
         # 🌐 Advanced Data Processing & QR Code Generator
         Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
         """)
-        with gr.Tab("📝 URL Processing"):
-            url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
-                lines=5,
-                placeholder="https://example1.com\nhttps://example2.com",
-                value=""
-            )
-        with gr.Tab("📁 File Input"):
-            file_input = gr.File(
-                label="Upload Files",
-                file_types=["*"],  # Accept all file types
-                file_count="multiple"
-            )
-        with gr.Tab("📋 JSON Input"):
-            text_input = gr.TextArea(
-                label="Direct JSON Input",
-                lines=15,
-                placeholder="Paste your JSON data here...",
-                value=""
-            )
-        with gr.Row():
-            example_btn = gr.Button("📝 Load Example", variant="secondary")
-            clear_btn = gr.Button("🗑️ Clear", variant="secondary")
         with gr.Row():
-            combine_data = gr.Checkbox(
-                label="Combine all data into sequence",
-                value=True,
-                info="Generate sequential QR codes for combined data"
-            )
-            process_btn = gr.Button(
-                "🔄 Process & Generate QR",
-                variant="primary"
-            )
-        # Output components
-        output_json = gr.JSON(label="Processed Data")
-        output_gallery = gr.Gallery(
-            label="Generated QR Codes",
-            columns=3,
-            height=400,
-            show_label=True
-        )
-        output_text = gr.Textbox(
-            label="Processing Status",
-            interactive=False
-        )
 
         # Load example data
         def load_example():
             example = {
-                "type": "product_catalog",
                 "items": [
-                    {
-                        "id": "123",
-                        "name": "Premium Widget",
-                        "description": "High-quality widget with advanced features",
-                        "price": 299.99,
-                        "category": "electronics",
-                        "tags": ["premium", "featured", "new"]
-                    },
-                    {
-                        "id": "456",
-                        "name": "Basic Widget",
-                        "description": "Reliable widget for everyday use",
-                        "price": 149.99,
-                        "category": "electronics",
-                        "tags": ["basic", "popular"]
-                    }
                 ],
-                "metadata": {
-                    "timestamp": datetime.now().isoformat(),
-                    "version": "2.0",
-                    "source": "example"
-                }
             }
             return json.dumps(example, indent=2)
 
-        def clear_input():
             return ""
 
-        def process_inputs(urls, files, text, combine):
-            """Process all inputs and generate QR codes"""
-            try:
-                results = []
-                url_processor = EnhancedURLProcessor()
-                file_processor = EnhancedFileProcessor()
-
-                # Process JSON input
                 if text and text.strip():
                     try:
                         json_data = json.loads(text)
-                        if isinstance(json_data, list):
-                            results.extend(json_data)
-                        else:
-                            results.append(json_data)
-                    except json.JSONDecodeError as e:
-                        return None, [], f"❌ Invalid JSON format: {str(e)}"
 
-                # Process URLs
-                if urls and urls.strip():
-                    url_list = re.split(r'[,\n]', urls)
-                    url_list = [url.strip() for url in url_list if url.strip()]
-                    for url in url_list:
-                        validation = url_processor.validate_url(url)
-                        if validation['is_valid']:
-                            content = url_processor.fetch_content(url)
-                            if content:
-                                results.append({
-                                    'source': 'url',
-                                    'url': url,
-                                    'content': content,
-                                    'timestamp': datetime.now().isoformat()
-                                })
-
-                # Process files
-                if files:
-                    for file in files:
-                        file_results = file_processor.process_file(file)
-                        if file_results:
-                            results.extend(file_results)
-
-                # Generate QR codes
-                if results:
-                    qr_paths = generate_qr_codes(results, combine)
-                    if qr_paths:
-                        return (
-                            results,
-                            [str(path) for path in qr_paths],
-                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
-                        )
-                    else:
-                        return None, [], "❌ Failed to generate QR codes"
-                else:
-                    return None, [], "⚠️ No valid content to process"
-            except Exception as e:
-                logger.error(f"Processing error: {e}")
-                return None, [], f"❌ Error: {str(e)}"
-
-        # Set up event handlers
-        example_btn.click(load_example, outputs=[text_input])
-        clear_btn.click(clear_input, outputs=[text_input])
-        process_btn.click(
-            process_inputs,
-            inputs=[url_input, file_input, text_input, combine_data],
-            outputs=[output_json, output_gallery, output_text]
-        )
-
-        # Add the visualization button and its click event within the interface scope
-        #visualize_btn = gr.Button("🔍 Visualize QR Codes")
-        #visualize_btn.click(visualize_qr_codes, inputs=output_gallery, outputs=None)
-
-        # Add helpful documentation
-        gr.Markdown("""
-        ### 🚀 Features
-        - **Complete URL Scraping**: Extracts every character from web pages
-        - **Advanced File Processing**: Full content extraction from various text-based files and common archives. Supports flexible JSON handling.
-        - **Smart JSON Handling**: Processes any size JSON with automatic chunking, either via direct input or file upload.
-        - **Sequential QR Codes**: Maintains data integrity across multiple codes
-        - **Modern Design**: Clean, responsive interface with visual feedback
-        ### 💡 Tips
-        1. **URLs**: Enter multiple URLs separated by commas or newlines
-        2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and JSON files.
-        3. **JSON**: Use the example button to see the expected format or upload a .json file. The system will also try to detect JSON content in other file types.
-        4. **QR Codes**: Choose whether to combine data into sequential codes
-        5. **Processing**: Monitor the status for real-time feedback
-        ### 🎨 Output
-        - Generated QR codes are saved in the `output/qr_codes` directory
-        - Each QR code contains metadata for proper sequencing
-        - Hover over QR codes in the gallery to see details
-        """)
-    return interface
 def main():
-    """Initialize and launch the application"""
-    try:
-        # Configure system settings
-        mimetypes.init()
-
-        # Create and launch interface
-        interface = create_modern_interface()
-
-        # Add the QR sequence visualizer tab
-        #with interface:
-        create_qr_sequence_visualizer(None)  # output_gallery might not be relevant here
-
-        # Launch with configuration
-        interface.launch(
-            share=False,
-            debug=False,
-            show_error=True,
-            show_api=False
-        )
-    except Exception as e:
-        logger.error(f"Application startup error: {e}")
-        raise
-if __name__ == "__main__":
-    main()
@@ -56,7 +56,7 @@ class EnhancedURLProcessor:
 
         # Enhanced headers for better site compatibility
        self.session.headers.update({
+            'User-Agent': self.user_agent.random,  # Corrected spacing
             'Accept': '*/*',  # Accept all content types
             'Accept-Language': 'en-US,en;q=0.9',
             'Accept-Encoding': 'gzip, deflate, br',
@@ -65,7 +65,7 @@ class EnhancedURLProcessor:
             'Sec-Fetch-Dest': 'document',
             'Sec-Fetch-Mode': 'navigate',
             'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-User': '?1',  # Corrected spacing
             'DNT': '1'
         })
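Note: `self.user_agent.random` points at the fake_useragent package; a minimal sketch of the assumed session setup (the import and constructor are assumptions, not shown in this diff):

    import requests
    from fake_useragent import UserAgent  # assumed dependency

    session = requests.Session()
    session.headers.update({'User-Agent': UserAgent().random})  # new random UA each access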
@@ -78,22 +78,32 @@ class EnhancedURLProcessor:
             if not all([parsed.scheme, parsed.netloc]):
                 return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
             # Try HEAD request first to check accessibility
+            head_response = None  # Initialize head_response
             try:
                 head_response = self.session.head(url, timeout=5)
                 head_response.raise_for_status()
+                # Need details from head_response if successful
+                details = {
+                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
+                    'server': head_response.headers.get('Server', 'unknown'),
+                    'size': head_response.headers.get('Content-Length', 'unknown')
+                }
             except requests.exceptions.RequestException:
                 # If HEAD fails, try GET as some servers don't support HEAD
+                logger.info(f"HEAD request failed for {url}, trying GET.")
                 response = self.session.get(url, timeout=self.timeout)
                 response.raise_for_status()
+                # Use details from GET response if HEAD failed
+                details = {
+                    'content_type': response.headers.get('Content-Type', 'unknown'),
+                    'server': response.headers.get('Server', 'unknown'),
+                    'size': response.headers.get('Content-Length', 'unknown')  # Might not be accurate for GET stream
+                }
 
             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
+                'details': details
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
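For orientation, a minimal usage sketch of the updated validate_url (names taken from the class above; the URL is illustrative):

    processor = EnhancedURLProcessor()
    result = processor.validate_url("https://example.com")
    if result['is_valid']:
        # 'details' is now filled from the HEAD response, or from the GET fallback
        print(result['details'].get('content_type'))
    else:
        print(result['message'])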
 
@@ -104,7 +114,7 @@ class EnhancedURLProcessor:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
 
             # Update User-Agent randomly for each request
+            self.session.headers.update({'User-Agent': self.user_agent.random})  # Corrected spacing
 
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
@@ -118,7 +128,7 @@ class EnhancedURLProcessor:
             # Decode content with fallback
             try:
                 raw_content = response.content.decode(encoding, errors='replace')
+            except (UnicodeDecodeError, LookupError):  # Corrected error type
                 raw_content = response.content.decode('utf-8', errors='replace')
 
             # Extract metadata
@@ -137,10 +147,10 @@ class EnhancedURLProcessor:
             if 'text/html' in content_type:
                 processed_content = self._process_html_content(raw_content, url)
             else:
+                processed_content = raw_content  # Store raw non-html content as processed
             return {
                 'content': processed_content,
+                'raw_content': raw_content,  # Keep raw bytes if needed elsewhere
                 'metadata': metadata
             }
         except requests.exceptions.RequestException as e:
@@ -164,470 +174,890 @@ class EnhancedURLProcessor:
             for attr in ['href', 'src']:
                 if tag.get(attr):
                     try:
+                        # Handle potential base tag
+                        base = soup.find('base')
+                        current_base_url = base['href'] if base and base.get('href') else base_url
+                        tag[attr] = urljoin(current_base_url, tag[attr])
+                    except Exception as url_e:
+                        # logger.warning(f"Could not absolutize URL {tag.get(attr)} in {base_url}: {url_e}")
+                        pass  # Keep original if conversion fails
+
+            # Extract all text content more cleanly
+            text_parts = [element for element in soup.stripped_strings]
+            # text_content = ' '.join(text_parts)  # Join with space instead of newline? Depends on use case.
+            # Or keep newlines for structure:
+            text_content = '\n'.join(text_parts)
+
+            # Alternative: Get all text including scripts/styles if needed
+            # text_content = soup.get_text(separator='\n', strip=True)
+
+            return text_content
         except Exception as e:
             logger.error(f"HTML processing error: {e}")
+            # Return original content if parsing fails
             return content
 
 class EnhancedFileProcessor:
     """Advanced file processing with complete content extraction"""
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
         self.max_file_size = max_file_size
+        # Added more potential text/data formats
         self.supported_extensions = {
+            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
+            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h',  # Code files
+            '.zip', '.tar', '.gz', '.bz2',  # No .7z, .rar without external libs
+            # '.pdf', '.doc', '.docx', '.rtf', '.odt'  # These require more specific libraries (PyPDF2, python-docx etc.) - keep commented unless implemented
+        }
+        # Define extensions that should be treated primarily as text
+        self.text_extensions = {
+            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
+            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h'
         }
 
     def process_file(self, file) -> List[Dict]:
         """Process uploaded file with enhanced error handling and complete extraction"""
+        if not file or not hasattr(file, 'name'):
+            logger.warning("Invalid file object received in process_file.")
+            return []
 
         dataset = []
+        file_path_obj = Path(file.name)
+
         try:
+            # Use Gradio's temp file path directly
+            file_path = file_path_obj.resolve()
+            if not file_path.exists():
+                logger.error(f"File path does not exist: {file_path}")
+                return []
+
+            file_size = file_path.stat().st_size
             if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size ({self.max_file_size} bytes) for {file_path.name}")
+                # Optionally return a specific error message entry
+                # return [{'error': 'File too large', 'filename': file_path.name}]
                 return []
 
+            file_suffix = file_path.suffix.lower()
+
+            # Check if supported at all
+            # if file_suffix not in self.supported_extensions and not self._is_archive(str(file_path)):
+            #     logger.warning(f"Unsupported file type based on extension: {file_path.name}")
+            #     # Decide if you want to try processing anyway or return
+            #     # return [{'error': 'Unsupported file type', 'filename': file_path.name}]
+            #     # Let's try processing anyway, _process_single_file will handle text reading
+            #     pass  # Continue to attempt processing
+
+            # Use a persistent temp directory if needed across calls, otherwise TemporaryDirectory is fine
+            with tempfile.TemporaryDirectory(dir=TEMP_DIR) as temp_dir:  # Use configured temp dir
                 temp_dir_path = Path(temp_dir)
 
+                # Handle archives first
+                if self._is_archive(str(file_path)):
+                    logger.info(f"Processing archive file: {file_path.name}")
+                    dataset.extend(self._process_archive(str(file_path), temp_dir_path))
                 else:
+                    # Process as single file (might be text or something else)
+                    logger.info(f"Processing single file: {file_path.name}")
+                    # Pass the path string or Path object to _process_single_file
+                    dataset.extend(self._process_single_file(file_path))
 
         except Exception as e:
+            logger.error(f"Error processing file '{file_path_obj.name}': {str(e)}", exc_info=True)  # Log stack trace
+            # Optionally return error entry
+            # dataset.append({'error': f'Processing failed: {str(e)}', 'filename': file_path_obj.name})
+            return []  # Return empty list on error for now
         return dataset
 
     def _is_archive(self, filepath: str) -> bool:
+        """Check if file is a supported archive type"""
+        # Only include archive types we can handle
+        return filepath.lower().endswith(('.zip', '.tar', '.tar.gz', '.tgz', '.gz', '.bz2'))  # Added bz2 if bz2 lib is imported
 
+    def _process_single_file(self, file_path: Union[str, Path]) -> List[Dict]:
         """Process a single file with enhanced character extraction and JSON handling"""
+        # Ensure file_path is a Path object
+        file_path = Path(file_path)
+        file_name = file_path.name
+        file_suffix = file_path.suffix.lower()
+
         try:
+            file_stat = file_path.stat()
             file_size = file_stat.st_size
+            mime_type, _ = mimetypes.guess_type(file_path)
+            mime_type = mime_type or 'application/octet-stream'  # Default if guess fails
+
             # Initialize content storage
+            complete_content = None
+            is_json_like = file_suffix == '.json' or 'json' in mime_type
+
+            # Try reading as text first if it's a text-like extension or potentially text mime type
+            # Increased chunk size for efficiency on larger text files
             chunk_size = 10 * 1024 * 1024  # 10MB chunks
+            if file_suffix in self.text_extensions or (mime_type and mime_type.startswith('text/')):
+                content_parts = []
+                detected_encoding = 'utf-8'  # Default
+                try:
+                    with open(file_path, 'rb') as f:
+                        # Detect encoding from the first chunk for better accuracy
+                        first_chunk = f.read(chunk_size)
+                        if first_chunk:
+                            detected_encoding = chardet.detect(first_chunk)['encoding'] or 'utf-8'
+                            logger.info(f"Detected encoding for {file_name}: {detected_encoding}")
+                            # Rewind or reopen might be cleaner if needed, but let's decode first chunk
+                            try:
+                                decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+                            except (UnicodeDecodeError, LookupError):
+                                logger.warning(f"Failed to decode first chunk with {detected_encoding}, falling back to utf-8 for {file_name}")
+                                detected_encoding = 'utf-8'  # Fallback for subsequent reads
+                                decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+
+                        # Read remaining chunks
+                        while True:
+                            chunk = f.read(chunk_size)
+                            if not chunk:
+                                break
+                            try:
+                                decoded_chunk = chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+                            except (UnicodeDecodeError, LookupError):
+                                # Should not happen if fallback already occurred, but good practice
+                                logger.warning(f"Decoding error in subsequent chunk for {file_name}, using replace.")
+                                decoded_chunk = chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+
+                    complete_content = ''.join(content_parts)
+                    logger.info(f"Successfully read text content from {file_name}")
+
+                except IOError as e:
+                    logger.error(f"IOError reading file {file_name}: {e}")
+                    return []  # Cannot process if read fails
+                except Exception as e:
+                    logger.error(f"Error reading text file {file_name}: {e}", exc_info=True)
+                    # Decide if we should return or try other methods
+                    return []
+
+            # Now, check if the read text content IS valid JSON
+            json_data = None
+            raw_json_content = None  # Store the raw string if it was JSON
+            if complete_content is not None:
+                try:
                     json_data = json.loads(complete_content)
+                    # It is JSON! Update metadata
+                    raw_json_content = complete_content  # Keep the original string
+                    complete_content = json_data  # Now content holds the parsed object
+                    mime_type = 'application/json'  # Correct mime type
+                    source = 'json_content_detected'
+                    if file_suffix == '.json':
+                        source = 'json_file'
+                    logger.info(f"Successfully parsed JSON content from {file_name}")
+
+                except json.JSONDecodeError:
+                    # It looked like text, but wasn't valid JSON
+                    if is_json_like:
+                        logger.warning(f"File {file_name} has JSON extension/mime but failed to parse.")
+                    # Keep complete_content as the string it was read as
+                    source = 'text_file'
+                except Exception as e:
+                    logger.error(f"Unexpected error during JSON parsing check for {file_name}: {e}")
+                    # Keep complete_content as string, mark as text file
+                    source = 'text_file'
+            else:
+                # File wasn't identified as text or failed to read
+                # Could attempt binary read here if needed, or just mark as non-text
+                logger.warning(f"Could not read {file_name} as text. Storing metadata only or treating as binary.")
+                source = 'binary_file'  # Or 'unreadable_file'
+                complete_content = f"Binary or unreadable content ({file_size} bytes)"  # Placeholder
 
+            # Structure the output
+            result = {
+                'source': source,
+                'filename': file_name,
                 'file_size': file_size,
+                'mime_type': mime_type,
                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': complete_content,  # This is parsed JSON if successful, or text string, or placeholder
                 'timestamp': datetime.now().isoformat()
+            }
+            if raw_json_content:
+                result['raw_content'] = raw_json_content  # Add raw string if it was JSON
+
+            return [result]
+
+        except FileNotFoundError:
+            logger.error(f"File not found during processing: {file_path}")
+            return []
         except Exception as e:
+            logger.error(f"File processing error for {file_path.name}: {e}", exc_info=True)
             return []
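A quick round-trip sketch for _process_single_file, showing the record shape the method returns (field names are taken from the code above; the temp file is illustrative):

    import json
    import tempfile
    from pathlib import Path

    processor = EnhancedFileProcessor()
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
        tmp.write(json.dumps({"hello": "world"}))
    records = processor._process_single_file(Path(tmp.name))
    # Expected keys: source, filename, file_size, mime_type, created, modified,
    # content (the parsed object for JSON input), timestamp, plus raw_content
    # when the text parsed as JSON.
    assert records[0]['source'] == 'json_file'
    assert records[0]['content'] == {"hello": "world"}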
 
     def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
         """Process an archive file with enhanced extraction"""
         dataset = []
+        archive_path_obj = Path(archive_path)
+        logger.info(f"Attempting to extract archive: {archive_path_obj.name}")
+
         try:
             # Handle ZIP archives
+            if archive_path.lower().endswith('.zip') and zipfile.is_zipfile(archive_path):
+                logger.debug(f"Processing ZIP file: {archive_path_obj.name}")
                 with zipfile.ZipFile(archive_path, 'r') as zip_ref:
+                    # Check for zip bomb potential (optional, basic check)
+                    total_uncompressed_size = sum(file.file_size for file in zip_ref.infolist())
+                    # Add a limit, e.g., 10x the archive size or an absolute limit like 10GB
+                    if total_uncompressed_size > self.max_file_size * 10:  # Example limit
+                        logger.warning(f"Potential zip bomb detected: {archive_path_obj.name}, uncompressed size {total_uncompressed_size}")
+                        return [{'error': 'Archive potential bomb', 'filename': archive_path_obj.name}]
+
                     for file_info in zip_ref.infolist():
+                        # Avoid directory entries and potential path traversal issues
+                        if not file_info.is_dir() and file_info.filename and not file_info.filename.startswith('/') and '..' not in file_info.filename:
+                            try:
+                                extracted_path = extract_to / file_info.filename
+                                # Ensure parent directory exists
+                                extracted_path.parent.mkdir(parents=True, exist_ok=True)
+
+                                # Extract individual file safely
+                                with zip_ref.open(file_info.filename) as source, open(extracted_path, "wb") as target:
+                                    target.write(source.read())
+
+                                logger.debug(f"Extracted {file_info.filename} from zip.")
+                                # Now process the extracted file
+                                dataset.extend(self._process_single_file(extracted_path))
+                            except Exception as extract_err:
+                                logger.error(f"Failed to extract/process file {file_info.filename} from zip {archive_path_obj.name}: {extract_err}")
+
+            # Handle TAR archives (covers .tar, .tar.gz, .tgz, .tar.bz2)
+            # Need to import bz2 if supporting .bz2
+            elif tarfile.is_tarfile(archive_path):
+                logger.debug(f"Processing TAR file: {archive_path_obj.name}")
+                # Mode 'r:*' auto-detects compression (gz, bz2, xz if libs available)
+                with tarfile.open(archive_path, 'r:*') as tar_ref:
+                    # Add security checks for tar extraction if needed (e.g., checking paths)
+                    for member in tar_ref.getmembers():
+                        if member.isfile() and member.name and not member.name.startswith('/') and '..' not in member.name:
+                            try:
+                                # Construct safe path
+                                extracted_path = extract_to / member.name
+                                extracted_path.parent.mkdir(parents=True, exist_ok=True)
+                                # Extract safely
+                                with tar_ref.extractfile(member) as source, open(extracted_path, "wb") as target:
+                                    target.write(source.read())
+
+                                logger.debug(f"Extracted {member.name} from tar.")
+                                dataset.extend(self._process_single_file(extracted_path))
+                            except Exception as extract_err:
+                                logger.error(f"Failed to extract/process member {member.name} from tar {archive_path_obj.name}: {extract_err}")
+
+            # Handle GZIP archives (single file compression) - check it's not a tar.gz
+            elif archive_path.lower().endswith('.gz') and not archive_path.lower().endswith('.tar.gz'):
+                logger.debug(f"Processing GZIP file: {archive_path_obj.name}")
+                # Need to determine the output filename (remove .gz)
+                extracted_filename = archive_path_obj.stem
+                # Handle cases like '.txt.gz' -> '.txt'
+                if '.' in extracted_filename:
+                    extracted_path = extract_to / extracted_filename
+                else:
+                    # If no inner extension (e.g., 'myfile.gz'), maybe add a default like '.bin' or leave as is?
+                    extracted_path = extract_to / (extracted_filename + ".bin")  # Example
+
+                try:
+                    extracted_path.parent.mkdir(parents=True, exist_ok=True)
                     with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
                         outfile.write(gz_file.read())
+                    logger.debug(f"Extracted {extracted_path.name} from gzip.")
+                    dataset.extend(self._process_single_file(extracted_path))
+                except gzip.BadGzipFile as e:
+                    logger.error(f"Error processing GZIP archive {archive_path_obj.name}: Bad Gzip File - {e}")
+                except Exception as extract_err:
+                    logger.error(f"Failed to extract/process gzip file {archive_path_obj.name}: {extract_err}")
+
+            # Add BZ2 single file support (requires bz2 import)
+            elif archive_path.lower().endswith('.bz2') and not archive_path.lower().endswith('.tar.bz2'):
+                logger.debug(f"Processing BZ2 file: {archive_path_obj.name}")
+                try:
+                    import bz2
+                    extracted_filename = archive_path_obj.stem
+                    extracted_path = extract_to / extracted_filename
+                    if '.' not in extracted_filename:
+                        extracted_path = extract_to / (extracted_filename + ".bin")
+
+                    extracted_path.parent.mkdir(parents=True, exist_ok=True)
+                    with bz2.open(archive_path, 'rb') as bz2_file, open(extracted_path, 'wb') as outfile:
+                        outfile.write(bz2_file.read())
+                    logger.debug(f"Extracted {extracted_path.name} from bz2.")
+                    dataset.extend(self._process_single_file(extracted_path))
+
+                except ImportError:
+                    logger.warning("bz2 library not available, cannot process .bz2 files.")
+                except Exception as extract_err:
+                    logger.error(f"Failed to extract/process bz2 file {archive_path_obj.name}: {extract_err}")
+
+            # Placeholder for other types or if no specific handler matched
+            else:
+                logger.warning(f"Archive type not explicitly handled or not a recognized archive: {archive_path_obj.name}")
 
+        except FileNotFoundError:
+            logger.error(f"Archive file not found: {archive_path}")
+        except (zipfile.BadZipFile, tarfile.TarError, gzip.BadGzipFile) as archive_err:
+            logger.error(f"Invalid or corrupted archive file {archive_path_obj.name}: {archive_err}")
+            dataset.append({'error': f'Corrupted archive: {archive_err}', 'filename': archive_path_obj.name})
         except Exception as e:
+            logger.error(f"General archive processing error for {archive_path_obj.name}: {e}", exc_info=True)
+            dataset.append({'error': f'Archive processing failed: {e}', 'filename': archive_path_obj.name})
         return dataset
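The startswith('/') and '..' substring tests above cover the common traversal cases; a stricter, resolution-based containment check would look like this sketch (an alternative, not what the diff implements):

    from pathlib import Path

    def is_safely_contained(extract_to: Path, candidate: Path) -> bool:
        """True only if candidate resolves to a location inside extract_to."""
        try:
            candidate.resolve().relative_to(extract_to.resolve())
            return True
        except ValueError:
            return False

    # e.g. guard each extraction with:
    # if is_safely_contained(extract_to, extract_to / member.name): ...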
 
+    # Adjusted chunk_data with recommended max_size for QR codes
+    def chunk_data(self, data: Union[Dict, List, str], max_size: int = 1800) -> List[Dict]:
+        """Enhanced data chunking with sequence metadata, sized for QR codes."""
         try:
+            if not isinstance(data, str):
+                # Convert complex data to JSON string first
+                # Use separators=(',', ':') for compact JSON
+                json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
+            else:
+                json_str = data  # Assume input string is already the data payload
+
+            # Data here is the raw string (or JSON string) payload for the QR code
+            total_length = len(json_str.encode('utf-8'))  # Use byte length for QR capacity
+            logger.debug(f"Chunking data of total byte length: {total_length}")
 
+            # Simplified: If the data fits within max_size (bytes), return one chunk object
+            # The chunk object itself adds metadata, but the 'data' field is what matters for QR limit.
+            if total_length <= max_size:
+                chunk_meta = {
                     "chunk_index": 0,
                     "total_chunks": 1,
+                    "total_length": total_length,  # Store byte length
+                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,
+                    "data": json_str  # The actual string payload
                 }
+                logger.debug(f"Data fits in one chunk (payload size {total_length} bytes)")
+                return [chunk_meta]
 
+            # If data exceeds max_size, split the string payload
+            # We need to split the *string* representation carefully
+            # Aim for byte size chunks, which is tricky with UTF-8 variable char width
+            # Simple approach: estimate character chunk size based on bytes
+
+            # Estimate average bytes per character (crude but simple)
+            avg_bytes_per_char = total_length / len(json_str) if len(json_str) > 0 else 1
+            # Calculate target character chunk size based on byte limit
+            target_char_chunk_size = int(max_size / avg_bytes_per_char)
+
+            if target_char_chunk_size < 1: target_char_chunk_size = 1  # Avoid zero chunk size
+
+            # Calculate number of chunks based on estimated character size
+            num_chunks = math.ceil(len(json_str) / target_char_chunk_size)
 
             chunks = []
+            start_char_idx = 0
             for i in range(num_chunks):
+                # Calculate end index, ensuring we don't overshoot
+                end_char_idx = min(start_char_idx + target_char_chunk_size, len(json_str))
+
+                # Extract the character chunk
+                chunk_payload_str = json_str[start_char_idx:end_char_idx]
+
+                # Recalculate actual byte length for this specific chunk
+                current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
 
+                # Adjust end_char_idx if current chunk exceeds max_size (rare if estimate is decent)
+                while current_chunk_byte_length > max_size and end_char_idx > start_char_idx:
+                    end_char_idx -= 1  # Reduce characters
+                    chunk_payload_str = json_str[start_char_idx:end_char_idx]
+                    current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
+
+                if not chunk_payload_str and start_char_idx < len(json_str):
+                    # This should not happen with the logic above, but as a safeguard
+                    logger.error("Chunking resulted in empty payload string unexpectedly.")
+                    # Handle error: skip, break, or adjust logic
+                    break  # Avoid infinite loop
+
+                chunk_meta = {
                     "chunk_index": i,
                     "total_chunks": num_chunks,
+                    "total_length": total_length,  # Original total byte length
+                    "chunk_byte_length": current_chunk_byte_length,  # Actual byte length of this chunk's payload
+                    "chunk_hash": hash(chunk_payload_str) & 0xFFFFFFFF,
+                    "data": chunk_payload_str  # The string payload for this chunk
                 }
+                chunks.append(chunk_meta)
+                logger.debug(f"Created chunk {i+1}/{num_chunks}, payload byte size: {current_chunk_byte_length}")
+
+                # Move to the next starting point
+                start_char_idx = end_char_idx
+
+                # Safety break if start index doesn't advance
+                if start_char_idx == len(json_str) and i + 1 < num_chunks:
+                    logger.warning(f"Chunking finished early at index {i+1} of {num_chunks}. Check logic.")
+                    # Adjust total_chunks if ending early?
+                    for ch in chunks: ch['total_chunks'] = len(chunks)
+                    break
+
+            # Final check if total chunks changed
+            if chunks and chunks[0]['total_chunks'] != len(chunks):
+                logger.warning(f"Adjusting total_chunks from {chunks[0]['total_chunks']} to {len(chunks)}")
+                final_num_chunks = len(chunks)
+                for i, chunk in enumerate(chunks):
+                    chunk['total_chunks'] = final_num_chunks
+                    chunk['chunk_index'] = i  # Re-index just in case
 
             return chunks
         except Exception as e:
+            logger.error(f"Error chunking data: {e}", exc_info=True)
             return []
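The chunk metadata above implies a decoder loop roughly like the sketch below: sort by chunk_index, concatenate the data payloads, then parse. Note that chunk_hash uses Python's built-in hash(), which is salted per process for strings, so it only verifies integrity within the same interpreter session:

    import json

    def reassemble_chunks(chunks: list) -> str:
        # Every chunk should agree on total_chunks and appear exactly once.
        chunks = sorted(chunks, key=lambda c: c['chunk_index'])
        assert len(chunks) == chunks[0]['total_chunks'], "missing or duplicate chunks"
        return ''.join(c['data'] for c in chunks)

    # For dict/list inputs the payload was produced by json.dumps, so:
    # original = json.loads(reassemble_chunks(scanned_chunks))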
 
+def generate_stylish_qr(data: str,  # Expecting string data from chunking
                         filename: str,
                         size: int = 10,
                         border: int = 4,
                         fill_color: str = "#000000",
+                        back_color: str = "#FFFFFF",
+                        error_correction_level=qrcode.constants.ERROR_CORRECT_H) -> str:  # Added param
     """Generate a stylish QR code with enhanced visual appeal"""
     try:
         qr = qrcode.QRCode(
+            version=None,  # Auto-detect version
+            error_correction=error_correction_level,  # Use parameter
             box_size=size,
             border=border
         )
 
+        # Add string data directly (should be from chunker)
+        qr.add_data(data)
 
+        # Let the library figure out the best version and mode
         qr.make(fit=True)
 
+        logger.info(f"Generating QR code version {qr.version} for {filename} (Payload size: {len(data.encode('utf-8'))} bytes)")
 
         # Create QR code image with custom colors
         qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
 
+        # Convert to RGBA for transparency support (optional gradient)
         qr_image = qr_image.convert('RGBA')
 
+        # --- Optional: Add subtle gradient overlay ---
+        # gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
+        # draw = ImageDraw.Draw(gradient)
+        # for i in range(qr_image.width):
+        #     alpha = int(255 * (1 - i/qr_image.width) * 0.1)  # 10% maximum opacity
+        #     draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
+        # final_image = Image.alpha_composite(qr_image, gradient)
+        # --- End Optional Gradient ---
 
+        final_image = qr_image  # Use this line if gradient is commented out
 
         # Save the image
         output_path = QR_CODES_DIR / filename
+        # Ensure directory exists just before saving
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        final_image.save(output_path, quality=95)  # PNG quality is lossless, but ok
 
         return str(output_path)
+    # Catch specific data overflow error
+    except qrcode.exceptions.DataOverflowError as doe:
+        logger.error(f"QR DataOverflowError for {filename}: {doe}. Data length (bytes): {len(data.encode('utf-8'))}. Max capacity likely exceeded for ErrorLevel {error_correction_level}.")
+        return ""  # Return empty string on failure
     except Exception as e:
+        logger.error(f"QR generation error for {filename}: {e}", exc_info=True)
         return ""
680
 
681
+
682
+def generate_qr_codes(data_to_encode: Union[str, Dict, List], combine_sources: bool = True) -> List[str]:
+    """Generate QR codes, chunking data appropriately."""
     try:
+        file_processor = EnhancedFileProcessor()  # Provides the chunking method
+        all_qr_paths = []
+        qr_fill = "#1a365d"  # Deep blue
+        qr_back = "#ffffff"
+        # Choose the error correction level: H maximizes correction, M or L allow more data
+        error_level = qrcode.constants.ERROR_CORRECT_H  # Max correction, lowest capacity
+        # error_level = qrcode.constants.ERROR_CORRECT_M  # Medium correction, medium capacity
+        # error_level = qrcode.constants.ERROR_CORRECT_L  # Low correction, max capacity
+
+        if combine_sources:
+            logger.info("Combining all input sources into a single QR sequence.")
+            # `data_to_encode` is assumed to already be the combined list/dict from process_inputs
+            if not data_to_encode:
+                logger.warning("No data provided to generate combined QR codes.")
+                return []
+
+            # Chunk the combined data structure (the chunker accepts dict/list/str)
+            chunks = file_processor.chunk_data(data_to_encode)
+            if not chunks:
+                logger.error("Chunking the combined data failed.")
+                return []
+
+            num_chunks = len(chunks)
+            logger.info(f"Generating {num_chunks} QR codes for combined data.")
+            for i, chunk_info in enumerate(chunks):
+                # chunk_info contains {'chunk_index', 'total_chunks', 'data', ...}
+                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{num_chunks}.png'
+                # Pass the actual payload string to the generator
+                qr_payload = chunk_info['data']
                 qr_path = generate_stylish_qr(
+                    data=qr_payload,
                     filename=filename,
+                    fill_color=qr_fill,
+                    back_color=qr_back,
+                    error_correction_level=error_level
                 )
                 if qr_path:
+                    all_qr_paths.append(qr_path)
+                else:
+                    # Log the failure but continue with the remaining chunks
+                    logger.error(f"Failed to generate QR code for combined chunk {i+1}")
+
         else:
+            # Process each item in the input list individually
+            logger.info("Generating separate QR code sequences for each input source.")
+            if not isinstance(data_to_encode, list):
+                logger.error("Input data must be a list when combine_sources is False.")
+                # Wrap a single item in a list so processing can continue
+                if data_to_encode:
+                    data_to_encode = [data_to_encode]
+                else:
+                    return []
+
+            total_items = len(data_to_encode)
+            for item_idx, item in enumerate(data_to_encode):
+                item_source_info = f"item {item_idx+1}/{total_items}"
+                # Try to derive a better name (e.g., from a filename or URL if available)
+                if isinstance(item, dict) and 'filename' in item:
+                    item_source_info = item['filename']
+                elif isinstance(item, dict) and 'url' in item:
+                    item_source_info = Path(urlparse(item['url']).path).name or f"url_item_{item_idx+1}"
+
+                logger.info(f"Processing source: {item_source_info}")
+
+                # Chunk the individual item
+                chunks = file_processor.chunk_data(item)
+                if not chunks:
+                    logger.error(f"Chunking failed for item {item_idx+1} ({item_source_info})")
+                    continue  # Skip to the next item
+
+                num_chunks = len(chunks)
+                logger.info(f"Generating {num_chunks} QR codes for {item_source_info}.")
+                for chunk_idx, chunk_info in enumerate(chunks):
+                    # Sanitize the source info for use in a filename
+                    safe_source_name = re.sub(r'[^\w\-]+', '_', item_source_info)
+                    filename = f'{safe_source_name}_chunk_{chunk_idx+1}_of_{num_chunks}_{int(time.time())}.png'
+                    qr_payload = chunk_info['data']
                     qr_path = generate_stylish_qr(
+                        data=qr_payload,
                         filename=filename,
+                        fill_color=qr_fill,
+                        back_color=qr_back,
+                        error_correction_level=error_level
                     )
                     if qr_path:
+                        all_qr_paths.append(qr_path)
+                    else:
+                        logger.error(f"Failed to generate QR code for {item_source_info} chunk {chunk_idx+1}")
+
+        logger.info(f"Generated a total of {len(all_qr_paths)} QR codes.")
+        return all_qr_paths
     except Exception as e:
+        logger.error(f"General QR code generation process error: {e}", exc_info=True)
         return []
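A minimal usage sketch for the function above (the item dicts are illustrative placeholders; real items come from the processing pipeline):

    items = [
        {'source': 'direct_text', 'content': 'hello world'},
        {'filename': 'notes.txt', 'content': 'file contents here'},
    ]
    combined_paths = generate_qr_codes(items, combine_sources=True)   # One sequence for everything
    per_item_paths = generate_qr_codes(items, combine_sources=False)  # One sequence per item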
 
+def _generate_sequence_visualization_image(qr_paths: List[str], qr_data: List[Dict], title: str = "QR Code Sequence") -> Optional[io.BytesIO]:
+    """
+    Generates a visual representation of the QR code sequence using NetworkX and Matplotlib.
+
+    Args:
+        qr_paths: List of file paths to the QR code images.
+        qr_data: List of decoded data dictionaries, expected to contain 'chunk_index'.
+        title: The title for the visualization plot.
+
+    Returns:
+        A BytesIO buffer containing the PNG image of the visualization, or None on error.
+    """
+    if not qr_paths or not qr_data or len(qr_paths) != len(qr_data):
+        logger.warning("Mismatched or empty data for visualization.")
+        return None
+
+    logger.info(f"Generating visualization for {len(qr_paths)} QR codes.")
+    try:
+        G = nx.DiGraph()
+        node_labels = {}
+        node_colors = []
+        node_sizes = []
+
+        # The data is assumed to be pre-sorted by chunk_index during loading
+        num_nodes = len(qr_paths)
+        total_chunks_from_meta = qr_data[0].get('total_chunks', num_nodes) if qr_data else num_nodes
+
+        for i in range(num_nodes):
+            node_id = i
+            # Use chunk_index from metadata when present; otherwise fall back to the list index
+            chunk_idx = qr_data[i].get('chunk_index', i)
+            label = f"{chunk_idx + 1}/{total_chunks_from_meta}"
+            node_labels[node_id] = label
+            G.add_node(node_id, path=qr_paths[i], data=qr_data[i])
+
+            # Add edges between consecutive nodes
+            if i > 0:
+                G.add_edge(i - 1, i)
+
+            # Simple coloring/sizing (can be customized further)
+            node_colors.append('#4299e1')  # Default blue
+            node_sizes.append(1500)
+
+        if not G.nodes:
+            logger.warning("No nodes to visualize.")
+            return None
+
+        # --- Layout and drawing ---
+        plt.figure(figsize=(max(10, num_nodes * 1.5), 5))  # Scale figure width with the node count
+
+        # A simple linear layout is usually clearest for sequences
+        pos = {i: (i * 2, 0) for i in range(num_nodes)}  # Horizontal layout
+
+        # For more complex graphs, consider other layouts:
+        # pos = nx.spring_layout(G, k=0.5, iterations=50)
+        # pos = nx.kamada_kawai_layout(G)
+
+        nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors, alpha=0.9)
+        nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=20, edge_color='gray', alpha=0.6)
+        nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=10, font_color='white')
+
+        plt.title(title, fontsize=16)
+        plt.xlabel("Sequence Index", fontsize=12)
+        plt.yticks([])  # Hide Y-axis ticks for the linear layout
+        plt.xticks(range(0, num_nodes * 2, 2), [f"{i+1}" for i in range(num_nodes)])  # Label X-axis ticks
+        plt.box(False)  # Remove the frame box
+        plt.tight_layout()
+
+        # Save the plot to a BytesIO buffer
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png', bbox_inches='tight', dpi=100)
+        plt.close()  # Close the figure to free memory
+        buf.seek(0)
+        logger.info("Successfully generated visualization image buffer.")
+        return buf
+
+    except Exception as e:
+        logger.error(f"Error generating visualization image: {e}", exc_info=True)
+        plt.close()  # Ensure the figure is closed even on error
+        return None
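The helper returns a BytesIO buffer rather than writing a file, leaving consumption to the caller. A short usage sketch (assuming qr_paths and qr_data as documented above):

    buf = _generate_sequence_visualization_image(qr_paths, qr_data, title="Demo Sequence")
    if buf is not None:
        Image.open(buf).save("sequence_map.png")  # Persist to disk,
        # or pass Image.open(buf) straight to a gr.Image(type="pil") component.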
+# --- Gradio Interface Section ---
+
+def create_qr_sequence_visualizer(output_gallery_ref):  # Reference reserved for future wiring
     """Add QR sequence visualization capabilities to the application"""
     with gr.Tab("πŸ”„ QR Sequence Visualizer"):
         gr.Markdown("""
         ## QR Code Sequence Visualizer
+        Upload a sequence of QR codes (e.g., those generated by this app) to decode them and visualize their order.
         """)
 
+        # Store data in a dict scoped to this tab (a simple alternative to gr.State;
+        # adequate here, but not ideal for complex state management)
+        shared_data = {'qr_paths': [], 'qr_data': []}
 
         with gr.Row():
+            with gr.Column(scale=1):
+                qr_input = gr.File(
+                    label="Upload QR Code Images",
+                    file_types=["image/png", "image/jpeg", ".png", ".jpg", ".jpeg"],  # Be explicit
+                    file_count="multiple"
+                )
+                visualize_btn = gr.Button("πŸ‘οΈ Decode & Visualize Sequence", variant="primary")
+                reset_btn = gr.Button("πŸ—‘οΈ Reset Visualizer", variant="secondary")
+                visualization_status = gr.Textbox(label="Status", interactive=False, lines=3)
+                # Placeholder for per-code interactive controls (future improvement)
+                # qr_toggles_container = gr.HTML(label="QR Code Controls (Future)")
 
+            with gr.Column(scale=2):
+                qr_visualization = gr.Image(label="QR Code Sequence Map", type="pil", height=400)
+                qr_preview = gr.Gallery(label="Uploaded QR Codes (Sorted)", columns=4, height=400, object_fit="contain", preview=True)
 
+        def process_qr_codes_and_visualize(files):
+            """Decode QR files, sort them, update the gallery, and generate the visualization."""
             if not files:
+                shared_data['qr_paths'] = []
+                shared_data['qr_data'] = []
+                return None, None, "⚠️ No QR codes uploaded. Please upload QR code images."
+
+            logger.info(f"Processing {len(files)} uploaded QR files for visualization.")
+            qr_data_list = []
+            qr_path_list = []
+            decode_errors = 0
 
+            # Use OpenCV's QR detector (cv2 from opencv-python is assumed to be imported at module level)
             try:
+                detector = cv2.QRCodeDetector()
+            except AttributeError:
+                logger.error("cv2.QRCodeDetector not found. Ensure opencv-python is installed.")
+                return None, None, "❌ Library error: QR detector unavailable."
+            except Exception as init_e:
+                logger.error(f"Error initializing QR detector: {init_e}")
+                return None, None, f"❌ Detector init error: {init_e}"
+
+            for file in files:
+                try:
+                    img_path = file.name  # Gradio File object path
+                    img = Image.open(img_path)
+                    img_np = np.array(img.convert('RGB'))  # The detector expects an RGB array
 
+                    # Try to decode the QR code
+                    data, bbox, straight_qrcode = detector.detectAndDecode(img_np)
 
+                    if data:
+                        logger.debug(f"Decoded data from {os.path.basename(img_path)}: {data[:50]}...")
+                        # Try parsing the decoded data as JSON (the format produced by the generator)
                         try:
+                            qr_metadata = json.loads(data)
+                            # Check whether it matches our chunk format
+                            if isinstance(qr_metadata, dict) and 'chunk_index' in qr_metadata and 'total_chunks' in qr_metadata:
+                                qr_data_list.append(qr_metadata)
+                                qr_path_list.append(img_path)
                             else:
+                                # Valid JSON, but not the expected chunk structure
+                                logger.warning(f"Decoded valid JSON, but not in the expected format, from {os.path.basename(img_path)}")
+                                qr_data_list.append({"data": qr_metadata, "chunk_index": -1})  # Default index
+                                qr_path_list.append(img_path)
+
+                        except json.JSONDecodeError:
+                            # Data decoded, but not JSON - store the raw string
+                            logger.warning(f"Could not decode JSON from QR data in {os.path.basename(img_path)}. Storing raw.")
+                            qr_data_list.append({"data": data, "chunk_index": -1})  # Default index
+                            qr_path_list.append(img_path)
+                        except Exception as json_e:
+                            logger.error(f"Error processing decoded JSON from {os.path.basename(img_path)}: {json_e}")
+                            qr_data_list.append({"data": f"Error: {json_e}", "chunk_index": -1})
+                            qr_path_list.append(img_path)
+                            decode_errors += 1
+                    else:
+                        # A QR image was provided, but no data was decoded (or detection failed)
+                        logger.warning(f"Could not decode data from QR image: {os.path.basename(img_path)}")
+                        qr_data_list.append({"data": "[DECODE FAILED]", "chunk_index": -1})
+                        qr_path_list.append(img_path)
+                        decode_errors += 1
 
+                except Exception as e:
+                    logger.error(f"Error processing QR image file {os.path.basename(getattr(file, 'name', 'N/A'))}: {e}", exc_info=True)
+                    decode_errors += 1
 
+            if not qr_path_list:
+                shared_data['qr_paths'] = []
+                shared_data['qr_data'] = []
+                return None, None, "❌ No valid QR codes could be processed or decoded."
 
+            # Sort by chunk_index, handling missing indices gracefully
+            try:
+                # Build (sort_key, data, path) tuples for sorting
+                indexed_items = []
+                for i, (data, path) in enumerate(zip(qr_data_list, qr_path_list)):
+                    # Use the provided chunk_index; fall back to the list index if missing or invalid (-1)
+                    sort_key = data.get('chunk_index', i)
+                    if not isinstance(sort_key, int) or sort_key < 0:
+                        sort_key = i  # Fall back to the original order for this item
+                    indexed_items.append((sort_key, data, path))
+
+                # Sort on the index key
+                indexed_items.sort(key=lambda x: x[0])
+
+                # Unpack the sorted lists
+                sorted_qr_data = [item[1] for item in indexed_items]
+                sorted_qr_paths = [item[2] for item in indexed_items]
+
+                # Update the shared data
+                shared_data['qr_paths'] = sorted_qr_paths
+                shared_data['qr_data'] = sorted_qr_data
+                logger.info("Successfully sorted QR data based on chunk_index.")
 
             except Exception as e:
+                logger.error(f"Error sorting QR data: {e}. Using original order.")
+                shared_data['qr_paths'] = qr_path_list
+                shared_data['qr_data'] = qr_data_list
+
+            # Generate the visualization image from the sorted data
+            visualization_image_buffer = _generate_sequence_visualization_image(
+                shared_data['qr_paths'],
+                shared_data['qr_data'],
+                title=f"Visualized Sequence ({len(shared_data['qr_paths'])} Codes)"
+            )
+
+            # Convert the buffer to a PIL Image for the Gradio Image output
+            vis_image_pil = None
+            if visualization_image_buffer:
+                try:
+                    vis_image_pil = Image.open(visualization_image_buffer)
+                except Exception as img_e:
+                    logger.error(f"Failed to load visualization buffer into a PIL Image: {img_e}")
+
+            status_message = f"Processed {len(shared_data['qr_paths'])} QR codes."
+            if decode_errors > 0:
+                status_message += f" ({decode_errors} decode errors)"
+            status_message += "\nSequence visualized. βœ… Done" if vis_image_pil else "\nVisualization generation failed. ⚠️ Errors occurred"
+
+            # Outputs: gallery (sorted paths), visualization image, status text
+            return shared_data['qr_paths'], vis_image_pil, status_message
 
+        def reset_visualizer_state():
+            shared_data['qr_paths'] = []
+            shared_data['qr_data'] = []
+            logger.info("Resetting QR visualizer state.")
+            return None, None, None, "βšͺ Visualizer reset. Upload new QR codes."
 
         # Event handlers
+        visualize_btn.click(
+            process_qr_codes_and_visualize,
+            inputs=[qr_input],
+            outputs=[qr_preview, qr_visualization, visualization_status]  # Gallery, sequence map, status
+        ).then(
+            lambda: logger.info("Visualization process complete."), inputs=None, outputs=None
+        )
+
+        reset_btn.click(
+            reset_visualizer_state,
+            inputs=[],
+            outputs=[qr_preview, qr_visualization, qr_input, visualization_status]  # Clear gallery, image, file input, status
+        )
 
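The decode step above can also be exercised outside Gradio. A minimal standalone sketch using OpenCV's detector (assumes opencv-python, Pillow, and numpy are installed):

    import cv2
    import numpy as np
    from PIL import Image

    def decode_qr_file(path: str) -> str:
        # Load the image as an RGB array and run OpenCV's QR detector on it
        img = np.array(Image.open(path).convert('RGB'))
        data, bbox, _ = cv2.QRCodeDetector().detectAndDecode(img)
        return data  # Empty string if nothing was decoded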
 def create_modern_interface():
     """Create a modern and visually appealing Gradio interface"""
 
+    # Modern CSS styling
     css = """
     /* Modern color scheme */
     :root {
[... unchanged CSS lines elided in the diff view ...]
     /* Gallery styling */
     .gallery {
         display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); /* Adjust minmax as needed */
         gap: 1rem;
         padding: 1rem;
         background-color: white;
         border-radius: 0.5rem;
         border: 1px solid #e2e8f0;
+        min-height: 150px; /* Ensure the gallery has some height */
     }
     .gallery img {
         width: 100%;
         height: auto;
+        object-fit: contain; /* Use contain to avoid stretching */
         border-radius: 0.375rem;
         transition: transform 0.2s;
+        border: 1px solid #eee; /* Add a subtle border */
     }
     .gallery img:hover {
         transform: scale(1.05);
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1); /* Add a hover shadow */
     }
     """
     # Create interface with modern design
[... unchanged lines elided in the diff view ...]
         # 🌐 Advanced Data Processing & QR Code Generator
         Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
         """)
+
         with gr.Row():
+            with gr.Column(scale=2):
+                # Input tabs
+                with gr.Tabs():
+                    with gr.TabItem("πŸ“ URL Input"):
+                        url_input = gr.Textbox(
+                            label="Enter URLs (one per line or comma-separated)",
+                            lines=5,
+                            placeholder="https://example1.com\nhttps://example2.com",
+                            elem_id="url-input"
+                        )
+                    with gr.TabItem("πŸ“ File Input"):
+                        file_input = gr.File(
+                            label="Upload Files (Text, JSON, Archives: zip, tar, gz, bz2)",
+                            file_count="multiple",
+                            # file_types="*" removed to rely on backend logic; alternatively specify:
+                            # file_types=[".txt", ".json", ".csv", ".md", ".xml", ".html", ".zip", ".tar", ".gz", ".bz2"]
+                            elem_id="file-input"
+                        )
+                    with gr.TabItem("πŸ“‹ Direct Input / JSON"):
+                        text_input = gr.TextArea(
+                            label="Direct Text/JSON Input",
+                            lines=10,
+                            placeholder="Paste your text or JSON data here...",
+                            elem_id="text-input"
+                        )
+                        with gr.Row():
+                            example_btn = gr.Button("πŸ“ Load JSON Example")
+                            clear_btn = gr.Button("πŸ—‘οΈ Clear Input")
+
+                # Processing options & button
+                with gr.Row():
+                    combine_data = gr.Checkbox(
+                        label="Combine all inputs into one sequence",
+                        value=True,  # Default to combined
+                        info="If unchecked, each URL/File/Input generates its own QR sequence."
+                    )
+                    process_btn = gr.Button(
+                        "πŸ”„ Process & Generate QR Codes",
+                        variant="primary",
+                        elem_id="process-button"
+                    )
+
+                # Status output
+                output_text = gr.Textbox(
+                    label="Processing Status",
+                    interactive=False,
+                    lines=2,
+                    elem_id="status-output"
+                )
+
+            with gr.Column(scale=3):
+                # Output area
+                gr.Markdown("### Results")
+                with gr.Tabs():
+                    with gr.TabItem("πŸ–ΌοΈ QR Codes"):
+                        output_gallery = gr.Gallery(
+                            label="Generated QR Codes",
+                            columns=4,  # Adjust columns as needed
+                            height=500,  # Adjust height as needed
+                            object_fit="contain",
+                            preview=True,  # Enable click-to-preview
+                            elem_id="qr-gallery"
+                        )
+                    with gr.TabItem("πŸ“„ Processed Data (JSON)"):
+                        output_json = gr.JSON(
+                            label="Processed Data Structure",
+                            elem_id="json-output"
+                        )
 
  # Load example data
1215
  def load_example():
1216
  example = {
1217
+ "project": "Data Transfer Example",
1218
+ "version": 1.1,
1219
  "items": [
1220
+ {"id": "A001", "name": "Item One", "value": 123.45, "tags": ["tag1", "tag2"]},
1221
+ {"id": "B002", "name": "Item Two", "value": 67.89, "enabled": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1222
  ],
1223
+ "timestamp": datetime.now().isoformat()
 
 
 
 
1224
  }
1225
  return json.dumps(example, indent=2)
1226
 
1227
+ def clear_input_area():
1228
+ # Clear only the direct text input area
1229
  return ""
1230
 
1231
+        # --- Main processing function ---
+        def process_inputs_and_generate_qrs(urls, files, text, combine):
+            """Process all inputs, combine them if requested, and generate QR codes."""
+            start_time = time.time()
+            logger.info("Starting data processing...")
+            status_updates = []
+            all_processed_data = []  # Holds results from all sources
 
+            url_processor = EnhancedURLProcessor()
+            file_processor = EnhancedFileProcessor()
 
+            # 1. Process URLs
+            if urls and urls.strip():
+                url_list = re.split(r'[,\n]+', urls)  # Split on commas or newlines (collapsing repeats)
+                url_list = [u.strip() for u in url_list if u.strip()]
+                status_updates.append(f"Processing {len(url_list)} URLs...")
+                logger.info(f"Processing URLs: {url_list}")
+                for i, url in enumerate(url_list):
+                    logger.info(f"Processing URL {i+1}/{len(url_list)}: {url}")
+                    # Basic validation before fetching
+                    if not validators.url(url):
+                        logger.warning(f"Skipping invalid URL format: {url}")
+                        status_updates.append(f"⚠️ Skipped invalid URL: {url[:50]}...")
+                        all_processed_data.append({'error': 'Invalid URL format', 'url': url})
+                        continue
+
+                    content_data = url_processor.fetch_content(url)
+                    if content_data and 'content' in content_data:
+                        logger.info(f"Successfully fetched content from {url} ({len(content_data.get('raw_content', ''))} bytes)")
+                        # Structure the result to match the file-processing output
+                        processed_url_data = {
+                            'source': 'url',
+                            'url': url,
+                            'content': content_data['content'],  # Processed text content
+                            'raw_content': content_data['raw_content'],  # Raw response body
+                            'metadata': content_data['metadata'],  # Headers, status, etc.
+                            'timestamp': datetime.now().isoformat()
+                        }
+                        all_processed_data.append(processed_url_data)
+                        status_updates.append(f"βœ“ Fetched: {url[:60]}...")
+                    else:
+                        logger.error(f"Failed to fetch content from URL: {url}")
+                        status_updates.append(f"❌ Failed fetch: {url[:60]}...")
+                        all_processed_data.append({'error': 'Failed to fetch content', 'url': url})
+
+            # 2. Process files
+            if files:
+                status_updates.append(f"Processing {len(files)} uploaded files...")
+                logger.info(f"Processing {len(files)} files.")
+                for i, file_obj in enumerate(files):
+                    logger.info(f"Processing file {i+1}/{len(files)}: {getattr(file_obj, 'name', 'N/A')}")
+                    try:
+                        # Pass the Gradio file object directly to process_file
+                        file_results = file_processor.process_file(file_obj)
+                        if file_results:
+                            all_processed_data.extend(file_results)
+                            # Collect filenames safely (archives may yield several results)
+                            processed_filenames = [res.get('filename', 'N/A') for res in file_results]
+                            status_updates.append(f"βœ“ Processed file(s): {', '.join(processed_filenames)}")
+                            logger.info(f"Successfully processed file(s): {', '.join(processed_filenames)}")
+                        else:
+                            status_updates.append(f"⚠️ No data extracted from file: {getattr(file_obj, 'name', 'N/A')}")
+                            logger.warning(f"No data extracted from file: {getattr(file_obj, 'name', 'N/A')}")
+                            # A placeholder error entry could be appended here if desired
+
+                    except Exception as file_proc_err:
+                        file_name = getattr(file_obj, 'name', 'N/A')
+                        logger.error(f"Error processing file {file_name}: {file_proc_err}", exc_info=True)
+                        status_updates.append(f"❌ Error processing file: {file_name}")
+                        all_processed_data.append({'error': f'File processing error: {file_proc_err}', 'filename': file_name})
+
+            # 3. Process direct text/JSON input
             if text and text.strip():
+                status_updates.append("Processing direct input...")
+                logger.info("Processing direct text/JSON input.")
+                # Attempt to parse as JSON first
                 try:
                     json_data = json.loads(text)
+                    logger.info("Direct input parsed as JSON.")
+                    processed_text_data = {
+                        'source': 'direct_json',
+                        'content': json_data,  # Parsed JSON object/list
+                        'raw_content': text,  # Original string
+                        'timestamp': datetime.now().isoformat()
+                    }
+                    all_processed_data.append(processed_text_data)
+                    status_updates.append("βœ“ Processed direct input as JSON.")
+                except json.JSONDecodeError:
+                    # Not JSON, so treat it as plain text
+                    logger.info("Direct input treated as plain text.")
+                    processed_text_data = {
+                        'source': 'direct_text',
+                        'content': text,
+                        'timestamp': datetime.now().isoformat()
+                    }
+                    all_processed_data.append(processed_text_data)
+                    status_updates.append("βœ“ Processed direct input as text.")
+                except Exception as direct_input_err:
+                    logger.error(f"Error processing direct input: {direct_input_err}", exc_info=True)
+                    status_updates.append("❌ Error processing direct input.")
+                    all_processed_data.append({'error': f'Direct input error: {direct_input_err}', 'source': 'direct_input'})
+
+            # 4. Check whether any data was processed
+            if not all_processed_data:
+                logger.warning("No valid data sources found or processed.")
+                status_updates.append("⚠️ No data to process. Please provide input.")
+                final_status = "\n".join(status_updates)
+                return None, [], final_status  # Empty results
+
+            logger.info(f"Total processed data items: {len(all_processed_data)}")
+            status_updates.append(f"Data processed ({len(all_processed_data)} items). Generating QR codes...")
+
+            # 5. Generate QR codes
+            qr_paths = []
+            try:
+                # Pass the list of processed data items
+                qr_paths = generate_qr_codes(all_processed_data, combine)
+                if qr_paths:
+                    status_updates.append(f"βœ“ Generated {len(qr_paths)} QR codes.")
+                    logger.info(f"Successfully generated {len(qr_paths)} QR codes.")
+                else:
+                    status_updates.append("❌ QR code generation failed or produced no codes.")
+                    logger.error("QR code generation returned no paths.")
+                    # Keep the processed data; the gallery will simply be empty
+
+            except Exception as qr_gen_err:
+                logger.error(f"Error during QR code generation step: {qr_gen_err}", exc_info=True)
+                status_updates.append(f"❌ Error generating QR codes: {qr_gen_err}")
+                # Keep the processed data; the gallery will be empty
+
+            # 6. Finalize and return
+            processing_time = time.time() - start_time
+            status_updates.append(f"Total processing time: {processing_time:.2f} seconds.")
+            final_status = "\n".join(status_updates)
+
+            # Ensure qr_paths is a list of strings for the gallery
+            qr_paths_str = [str(p) for p in qr_paths] if qr_paths else []
+
+            # Return data for the JSON view, gallery paths, and status text
+            return all_processed_data, qr_paths_str, final_status
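For reference, each element appended to all_processed_data follows one of a few dictionary shapes; the URL variant looks roughly like this (field values illustrative only):

    {
        'source': 'url',
        'url': 'https://example.com',
        'content': '...extracted text...',
        'raw_content': '...raw response body...',
        'metadata': {'content_type': 'text/html', 'status_code': 200},
        'timestamp': '2024-01-01T12:00:00'
    }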
+        # --- Event handlers ---
+        example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input_area, outputs=[text_input])
+
+        process_btn.click(
+            process_inputs_and_generate_qrs,
+            inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]  # Matches the function's return order
+        )
+
+        # Helpful documentation
+        gr.Markdown("""
+        ### πŸš€ Features
+        - **Complete URL Scraping**: Extracts text content from web pages.
+        - **Advanced File Processing**: Handles text, JSON, and archives (.zip, .tar.*, .gz, .bz2). Attempts intelligent JSON detection.
+        - **Direct Input**: Paste text or JSON directly.
+        - **Sequential QR Codes**: Chunks large data and embeds sequencing info. Option to combine inputs.
+        - **Modern Design**: Clean, responsive interface.
+        ### πŸ’‘ Tips
+        1. **Inputs**: Use any combination of the URL, File, or Direct Input tabs.
+        2. **Combine**: Check 'Combine all inputs' to create one QR sequence from all sources. Uncheck it to get a separate QR sequence per source.
+        3. **Files**: Upload text-based files, JSON, or supported archives. Content from archives is extracted and processed.
+        4. **JSON**: Use the example button or upload a `.json` file. The app also tries to parse `.txt` and other files as JSON when they contain a valid JSON structure.
+        5. **Status**: Monitor the Processing Status box for feedback.
+        ### 🎨 Output
+        - Generated QR codes appear in the 'QR Codes' tab and are saved in the `output/qr_codes` directory.
+        - The structured data processed from all inputs is shown in the 'Processed Data (JSON)' tab.
+        - Hover over or click QR codes in the gallery for a larger preview.
+        """)
+    return interface
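If the visualizer tab should ship with the main interface, the commented-out lines in main() below hint at re-entering the Blocks context; a sketch of that wiring (an untested assumption, with the gallery reference left unwired):

    interface = create_modern_interface()
    with interface:  # Re-enter the Blocks context to append the extra tab
        create_qr_sequence_visualizer(None)
    interface.launch()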
 def main():
+    """Initialize and launch the application"""
+    try:
+        # Configure system settings if needed
+        mimetypes.init()  # Ensure MIME types are loaded
+
+        logger.info("Starting Gradio application...")
+        # Create the interface
+        interface = create_modern_interface()
+
+        # Optionally add the QR sequence visualizer tab
+        # with interface:
+        #     create_qr_sequence_visualizer(None)  # Pass relevant components if needed
+
+        # Launch with configuration
+        interface.launch(
+            share=False,  # Set to True for a public link (use with caution)
+            debug=False,  # Set to True for more verbose Gradio errors
+            show_error=True,  # Show Python errors in the browser console
+            # server_name="0.0.0.0",  # Bind to all interfaces for Docker/network access
+            # server_port=7860,  # Specify a port if needed
+            show_api=False  # Disable the default Gradio API endpoint unless needed
+        )
+        logger.info("Gradio application stopped.")
+    except Exception as e:
+        logger.error(f"Application startup or runtime error: {e}", exc_info=True)
+        raise
+
+if __name__ == "__main__":
+    # Ensure output directories exist before starting
+    OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
+    QR_CODES_DIR.mkdir(parents=True, exist_ok=True)
+    TEMP_DIR.mkdir(parents=True, exist_ok=True)
+    main()
+ main()