acecalisto3 committed
Commit a523c40 · verified · 1 Parent(s): a3d3ab4

Update app.py

Files changed (1):
  app.py +1026 -490

app.py CHANGED
@@ -56,7 +56,7 @@ class EnhancedURLProcessor:
 
         # Enhanced headers for better site compatibility
         self.session.headers.update({
-            'User-Agent': self.user_agent.random,
+            'User-Agent': self.user_agent.random,  # Corrected spacing
             'Accept': '*/*',  # Accept all content types
             'Accept-Language': 'en-US,en;q=0.9',
             'Accept-Encoding': 'gzip, deflate, br',
@@ -65,7 +65,7 @@ class EnhancedURLProcessor:
             'Sec-Fetch-Dest': 'document',
             'Sec-Fetch-Mode': 'navigate',
             'Sec-Fetch-Site': 'none',
-            'Sec-Fetch-User': '?1',
+            'Sec-Fetch-User': '?1',  # Corrected spacing
             'DNT': '1'
         })
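The two hunks above only fix spacing around the rotating User-Agent header. For context, a minimal sketch of how such a session is typically assembled (assuming the `fake_useragent` package, which exposes the `.random` property used as `self.user_agent.random` here):

```python
import requests
from fake_useragent import UserAgent  # assumed dependency providing .random

session = requests.Session()
ua = UserAgent()
session.headers.update({
    'User-Agent': ua.random,   # rotate per session (or per request)
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.9',
})
resp = session.get('https://example.com', timeout=10)
```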
 
@@ -78,22 +78,32 @@ class EnhancedURLProcessor:
             if not all([parsed.scheme, parsed.netloc]):
                 return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
             # Try HEAD request first to check accessibility
+            head_response = None  # Initialize head_response
             try:
                 head_response = self.session.head(url, timeout=5)
                 head_response.raise_for_status()
+                # Need details from head_response if successful
+                details = {
+                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
+                    'server': head_response.headers.get('Server', 'unknown'),
+                    'size': head_response.headers.get('Content-Length', 'unknown')
+                }
             except requests.exceptions.RequestException:
                 # If HEAD fails, try GET as some servers don't support HEAD
+                logger.info(f"HEAD request failed for {url}, trying GET.")
                 response = self.session.get(url, timeout=self.timeout)
                 response.raise_for_status()
+                # Use details from GET response if HEAD failed
+                details = {
+                    'content_type': response.headers.get('Content-Type', 'unknown'),
+                    'server': response.headers.get('Server', 'unknown'),
+                    'size': response.headers.get('Content-Length', 'unknown')  # Might not be accurate for GET stream
+                }
 
             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
-                'details': {
-                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
-                    'server': head_response.headers.get('Server', 'unknown'),
-                    'size': head_response.headers.get('Content-Length', 'unknown')
-                }
+                'details': details
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
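The HEAD-then-GET probe this hunk fixes can be exercised on its own. A minimal sketch using only the standard `requests` API (the `check_url` helper name is hypothetical):

```python
import requests

def check_url(url: str, timeout: int = 10) -> dict:
    """Probe a URL with HEAD, falling back to GET for servers that reject HEAD."""
    session = requests.Session()
    try:
        resp = session.head(url, timeout=5)
        resp.raise_for_status()
    except requests.exceptions.RequestException:
        resp = session.get(url, timeout=timeout)  # some servers return 405 for HEAD
        resp.raise_for_status()
    return {
        'content_type': resp.headers.get('Content-Type', 'unknown'),
        'size': resp.headers.get('Content-Length', 'unknown'),
    }
```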
@@ -104,7 +114,7 @@ class EnhancedURLProcessor:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
 
             # Update User-Agent randomly for each request
-            self.session.headers.update({'User-Agent': self.user_agent.random})
+            self.session.headers.update({'User-Agent': self.user_agent.random})  # Corrected spacing
 
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
@@ -118,7 +128,7 @@ class EnhancedURLProcessor:
             # Decode content with fallback
             try:
                 raw_content = response.content.decode(encoding, errors='replace')
-            except (UnicodeDecodeError, LookupError):
+            except (UnicodeDecodeError, LookupError):  # Corrected error type
                 raw_content = response.content.decode('utf-8', errors='replace')
 
             # Extract metadata
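The detect-then-fallback decode used above (and again in the file processor below) looks like this in isolation (standard `chardet` API; the sample bytes are made up):

```python
import chardet

raw = 'Héllo, wörld'.encode('latin-1')           # sample bytes of unknown encoding
encoding = chardet.detect(raw)['encoding'] or 'utf-8'
try:
    text = raw.decode(encoding, errors='replace')
except (UnicodeDecodeError, LookupError):         # LookupError: chardet named an unknown codec
    text = raw.decode('utf-8', errors='replace')
```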
@@ -137,10 +147,10 @@ class EnhancedURLProcessor:
             if 'text/html' in content_type:
                 processed_content = self._process_html_content(raw_content, url)
             else:
-                processed_content = raw_content
+                processed_content = raw_content  # Store raw non-html content as processed
             return {
                 'content': processed_content,
-                'raw_content': raw_content,
+                'raw_content': raw_content,  # Keep raw bytes if needed elsewhere
                 'metadata': metadata
             }
         except requests.exceptions.RequestException as e:
@@ -164,468 +174,890 @@ class EnhancedURLProcessor:
             for attr in ['href', 'src']:
                 if tag.get(attr):
                     try:
-                        tag[attr] = urljoin(base_url, tag[attr])
-                    except Exception:
-                        pass
-        # Extract all text content
-        text_parts = []
-        for element in soup.stripped_strings:
-            text_parts.append(str(element))
-        return '\n'.join(text_parts)
+                        # Handle potential base tag
+                        base = soup.find('base')
+                        current_base_url = base['href'] if base and base.get('href') else base_url
+                        tag[attr] = urljoin(current_base_url, tag[attr])
+                    except Exception as url_e:
+                        # logger.warning(f"Could not absolutize URL {tag.get(attr)} in {base_url}: {url_e}")
+                        pass  # Keep original if conversion fails
+
+        # Extract all text content more cleanly
+        text_parts = [element for element in soup.stripped_strings]
+        # text_content = ' '.join(text_parts)  # Join with space instead of newline? Depends on use case.
+        # Or keep newlines for structure:
+        text_content = '\n'.join(text_parts)
+
+        # Alternative: Get all text including scripts/styles if needed
+        # text_content = soup.get_text(separator='\n', strip=True)
+
+        return text_content
     except Exception as e:
         logger.error(f"HTML processing error: {e}")
+        # Return original content if parsing fails
         return content
 
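A self-contained version of the text-extraction approach the new code settles on (standard BeautifulSoup API; the HTML string and base URL are stand-ins):

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = '<html><body><a href="/page">Link</a><p>Hello</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all(href=True):
    tag['href'] = urljoin('https://example.com', tag['href'])  # absolutize links
text = '\n'.join(soup.stripped_strings)   # visible text only, whitespace-trimmed
```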
 class EnhancedFileProcessor:
     """Advanced file processing with complete content extraction"""
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
         self.max_file_size = max_file_size
+        # Added more potential text/data formats
         self.supported_extensions = {
-            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
-            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
-            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
-            '.pdf', '.doc', '.docx', '.rtf', '.odt'
+            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
+            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h',  # Code files
+            '.zip', '.tar', '.gz', '.bz2',  # No .7z, .rar without external libs
+            # '.pdf', '.doc', '.docx', '.rtf', '.odt'  # These require more specific libraries (PyPDF2, python-docx etc.) - keep commented unless implemented
         }
+        # Define extensions that should be treated primarily as text
+        self.text_extensions = {
+            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
+            '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h'
+        }
+
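The split between `supported_extensions` and `text_extensions` drives a simple gate: read as text when the suffix or MIME type says text, otherwise fall through to archive/binary handling. A minimal sketch of that decision (stdlib only; the extension set is abbreviated):

```python
import mimetypes
from pathlib import Path

TEXT_EXTENSIONS = {'.txt', '.md', '.csv', '.json', '.py'}   # abbreviated

def looks_like_text(path: str) -> bool:
    suffix = Path(path).suffix.lower()
    mime, _ = mimetypes.guess_type(path)
    return suffix in TEXT_EXTENSIONS or (mime or '').startswith('text/')
```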
 
     def process_file(self, file) -> List[Dict]:
         """Process uploaded file with enhanced error handling and complete extraction"""
-        if not file:
-            return []
+        if not file or not hasattr(file, 'name'):
+            logger.warning("Invalid file object received in process_file.")
+            return []
 
         dataset = []
+        file_path_obj = Path(file.name)
+
         try:
-            file_size = os.path.getsize(file.name)
+            # Use Gradio's temp file path directly
+            file_path = file_path_obj.resolve()
+            if not file_path.exists():
+                logger.error(f"File path does not exist: {file_path}")
+                return []
+
+            file_size = file_path.stat().st_size
             if file_size > self.max_file_size:
-                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size ({self.max_file_size} bytes) for {file_path.name}")
+                # Optionally return a specific error message entry
+                # return [{'error': 'File too large', 'filename': file_path.name}]
                 return []
 
-            with tempfile.TemporaryDirectory() as temp_dir:
+            file_suffix = file_path.suffix.lower()
+
+            # Check if supported at all
+            # if file_suffix not in self.supported_extensions and not self._is_archive(str(file_path)):
+            #     logger.warning(f"Unsupported file type based on extension: {file_path.name}")
+            #     # Decide if you want to try processing anyway or return
+            #     # return [{'error': 'Unsupported file type', 'filename': file_path.name}]
+            #     # Let's try processing anyway, _process_single_file will handle text reading
+            #     pass  # Continue to attempt processing
+
+            # Use a persistent temp directory if needed across calls, otherwise TemporaryDirectory is fine
+            with tempfile.TemporaryDirectory(dir=TEMP_DIR) as temp_dir:  # Use configured temp dir
                 temp_dir_path = Path(temp_dir)
 
-                # Handle different archive types
-                if self._is_archive(file.name):
-                    dataset.extend(self._process_archive(file.name, temp_dir_path))
-                elif Path(file.name).suffix.lower() in self.supported_extensions:
-                    dataset.extend(self._process_single_file(file))
+                # Handle archives first
+                if self._is_archive(str(file_path)):
+                    logger.info(f"Processing archive file: {file_path.name}")
+                    dataset.extend(self._process_archive(str(file_path), temp_dir_path))
                 else:
-                    logger.warning(f"Unsupported file type: {file.name}")
+                    # Process as single file (might be text or something else)
+                    logger.info(f"Processing single file: {file_path.name}")
+                    # Pass the path string or Path object to _process_single_file
+                    dataset.extend(self._process_single_file(file_path))
+
 
         except Exception as e:
-            logger.error(f"Error processing file: {str(e)}")
-            return []
+            logger.error(f"Error processing file '{file_path_obj.name}': {str(e)}", exc_info=True)  # Log stack trace
+            # Optionally return error entry
+            # dataset.append({'error': f'Processing failed: {str(e)}', 'filename': file_path_obj.name})
+            return []  # Return empty list on error for now
         return dataset
 
     def _is_archive(self, filepath: str) -> bool:
-        """Check if file is an archive"""
-        return any(filepath.lower().endswith(ext) for ext in [
-            '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
-        ])
+        """Check if file is a supported archive type"""
+        # Only include archive types we can handle
+        return filepath.lower().endswith(('.zip', '.tar', '.tar.gz', '.tgz', '.gz', '.bz2'))  # Added bz2 if bz2 lib is imported
 
-    def _process_single_file(self, file) -> List[Dict]:
+    def _process_single_file(self, file_path: Union[str, Path]) -> List[Dict]:
         """Process a single file with enhanced character extraction and JSON handling"""
+        # Ensure file_path is a Path object
+        file_path = Path(file_path)
+        file_name = file_path.name
+        file_suffix = file_path.suffix.lower()
+
         try:
-            file_stat = os.stat(file.name)
+            file_stat = file_path.stat()
             file_size = file_stat.st_size
+            mime_type, _ = mimetypes.guess_type(file_path)
+            mime_type = mime_type or 'application/octet-stream'  # Default if guess fails
+
             # Initialize content storage
-            content_parts = []
-            # Process file in chunks for large files
+            complete_content = None
+            is_json_like = file_suffix == '.json' or 'json' in mime_type
+
+            # Try reading as text first if it's a text-like extension or potentially text mime type
+            # Increased chunk size for efficiency on larger text files
             chunk_size = 10 * 1024 * 1024  # 10MB chunks
-            with open(file.name, 'rb') as f:
-                while True:
-                    chunk = f.read(chunk_size)
-                    if not chunk:
-                        break
-                    # Detect encoding for each chunk
-                    encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
-                    try:
-                        decoded_chunk = chunk.decode(encoding, errors='replace')
-                        content_parts.append(decoded_chunk)
-                    except (UnicodeDecodeError, LookupError):
-                        decoded_chunk = chunk.decode('utf-8', errors='replace')
-                        content_parts.append(decoded_chunk)
-            # Combine all chunks
-            complete_content = ''.join(content_parts)
-            # Check if the content is valid JSON regardless of file extension
-            try:
-                if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
-                    # It's a JSON file by type or extension
+            if file_suffix in self.text_extensions or (mime_type and mime_type.startswith('text/')):
+                content_parts = []
+                detected_encoding = 'utf-8'  # Default
+                try:
+                    with open(file_path, 'rb') as f:
+                        # Detect encoding from the first chunk for better accuracy
+                        first_chunk = f.read(chunk_size)
+                        if first_chunk:
+                            detected_encoding = chardet.detect(first_chunk)['encoding'] or 'utf-8'
+                            logger.info(f"Detected encoding for {file_name}: {detected_encoding}")
+                            # Rewind or reopen might be cleaner if needed, but let's decode first chunk
+                            try:
+                                decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+                            except (UnicodeDecodeError, LookupError):
+                                logger.warning(f"Failed to decode first chunk with {detected_encoding}, falling back to utf-8 for {file_name}")
+                                detected_encoding = 'utf-8'  # Fallback for subsequent reads
+                                decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+
+                        # Read remaining chunks
+                        while True:
+                            chunk = f.read(chunk_size)
+                            if not chunk:
+                                break
+                            try:
+                                decoded_chunk = chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+                            except (UnicodeDecodeError, LookupError):
+                                # Should not happen if fallback already occurred, but good practice
+                                logger.warning(f"Decoding error in subsequent chunk for {file_name}, using replace.")
+                                decoded_chunk = chunk.decode(detected_encoding, errors='replace')
+                                content_parts.append(decoded_chunk)
+
+                    complete_content = ''.join(content_parts)
+                    logger.info(f"Successfully read text content from {file_name}")
+
+                except IOError as e:
+                    logger.error(f"IOError reading file {file_name}: {e}")
+                    return []  # Cannot process if read fails
+                except Exception as e:
+                    logger.error(f"Error reading text file {file_name}: {e}", exc_info=True)
+                    # Decide if we should return or try other methods
+                    return []
+
+
+            # Now, check if the read text content IS valid JSON
+            json_data = None
+            raw_json_content = None  # Store the raw string if it was JSON
+            if complete_content is not None:
+                try:
                     json_data = json.loads(complete_content)
-                    return [{
-                        'source': 'json_file',
-                        'filename': os.path.basename(file.name),
-                        'file_size': file_size,
-                        'mime_type': 'application/json',
-                        'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                        'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                        'content': json_data,  # Store the parsed JSON object
-                        'raw_content': complete_content,  # Store the original JSON string
-                        'timestamp': datetime.now().isoformat()
-                    }]
-                else:
-                    # Try to parse as JSON anyway
-                    try:
-                        json_data = json.loads(complete_content)
-                        # If we get here, it's valid JSON despite the extension
-                        return [{
-                            'source': 'json_content',
-                            'filename': os.path.basename(file.name),
-                            'file_size': file_size,
-                            'mime_type': 'application/json',
-                            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                            'content': json_data,  # Store the parsed JSON object
-                            'raw_content': complete_content,  # Store the original JSON string
-                            'timestamp': datetime.now().isoformat()
-                        }]
-                    except json.JSONDecodeError:
-                        logger.warning(f"File {file.name} is not valid JSON.")
-            except Exception as e:
-                logger.error(f"Error during JSON processing: {e}")
+                    # It is JSON! Update metadata
+                    raw_json_content = complete_content  # Keep the original string
+                    complete_content = json_data  # Now content holds the parsed object
+                    mime_type = 'application/json'  # Correct mime type
+                    source = 'json_content_detected'
+                    if file_suffix == '.json':
+                        source = 'json_file'
+                    logger.info(f"Successfully parsed JSON content from {file_name}")
+
+                except json.JSONDecodeError:
+                    # It looked like text, but wasn't valid JSON
+                    if is_json_like:
+                        logger.warning(f"File {file_name} has JSON extension/mime but failed to parse.")
+                    # Keep complete_content as the string it was read as
+                    source = 'text_file'
+                except Exception as e:
+                    logger.error(f"Unexpected error during JSON parsing check for {file_name}: {e}")
+                    # Keep complete_content as string, mark as text file
+                    source = 'text_file'
+            else:
+                # File wasn't identified as text or failed to read
+                # Could attempt binary read here if needed, or just mark as non-text
+                logger.warning(f"Could not read {file_name} as text. Storing metadata only or treating as binary.")
+                source = 'binary_file'  # Or 'unreadable_file'
+                complete_content = f"Binary or unreadable content ({file_size} bytes)"  # Placeholder
+
 
-            return [{
-                'source': 'file',
-                'filename': os.path.basename(file.name),
+            # Structure the output
+            result = {
+                'source': source,
+                'filename': file_name,
                 'file_size': file_size,
-                'mime_type': mimetypes.guess_type(file.name)[0],
+                'mime_type': mime_type,
                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                'content': complete_content,
+                'content': complete_content,  # This is parsed JSON if successful, or text string, or placeholder
                 'timestamp': datetime.now().isoformat()
-            }]
+            }
+            if raw_json_content:
+                result['raw_content'] = raw_json_content  # Add raw string if it was JSON
+
+            return [result]
+
+        except FileNotFoundError:
+            logger.error(f"File not found during processing: {file_path}")
+            return []
         except Exception as e:
-            logger.error(f"File processing error: {e}")
+            logger.error(f"File processing error for {file_path.name}: {e}", exc_info=True)
             return []
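The parse-then-fallback JSON sniff above works on any text, not just `.json` files; reduced to its core (stdlib only, hypothetical `sniff_json` helper):

```python
import json

def sniff_json(text: str):
    """Return (parsed, is_json); falls back to the raw string on parse failure."""
    try:
        return json.loads(text), True
    except json.JSONDecodeError:
        return text, False

content, is_json = sniff_json('{"a": 1}')   # -> ({'a': 1}, True)
content, is_json = sniff_json('not json')   # -> ('not json', False)
```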
 
     def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
         """Process an archive file with enhanced extraction"""
         dataset = []
+        archive_path_obj = Path(archive_path)
+        logger.info(f"Attempting to extract archive: {archive_path_obj.name}")
+
         try:
             # Handle ZIP archives
-            if zipfile.is_zipfile(archive_path):
+            if archive_path.lower().endswith('.zip') and zipfile.is_zipfile(archive_path):
+                logger.debug(f"Processing ZIP file: {archive_path_obj.name}")
                 with zipfile.ZipFile(archive_path, 'r') as zip_ref:
-                    zip_ref.extractall(extract_to)
+                    # Check for zip bomb potential (optional, basic check)
+                    total_uncompressed_size = sum(file.file_size for file in zip_ref.infolist())
+                    # Add a limit, e.g., 10x the archive size or an absolute limit like 10GB
+                    if total_uncompressed_size > self.max_file_size * 10:  # Example limit
+                        logger.warning(f"Potential zip bomb detected: {archive_path_obj.name}, uncompressed size {total_uncompressed_size}")
+                        return [{'error': 'Archive potential bomb', 'filename': archive_path_obj.name}]
+
                     for file_info in zip_ref.infolist():
-                        if file_info.file_size > 0 and not file_info.filename.endswith('/'):
-                            extracted_path = extract_to / file_info.filename
-                            if extracted_path.suffix.lower() in self.supported_extensions:
-                                with open(extracted_path, 'rb') as f:
-                                    dataset.extend(self._process_single_file(f))
-            # Handle TAR archives
-            elif archive_path.lower().endswith(('.tar', '.tar.gz', '.tgz')):
-                try:
-                    with tarfile.open(archive_path, 'r:*') as tar_ref:
-                        for member in tar_ref.getmembers():
-                            if member.isfile():
-                                extracted_path = extract_to / member.name
-                                tar_ref.extract(member, path=extract_to)
-                                if extracted_path.suffix.lower() in self.supported_extensions:
-                                    with open(extracted_path, 'rb') as f:
-                                        dataset.extend(self._process_single_file(f))
-                except tarfile.TarError as e:
-                    logger.error(f"Error processing TAR archive: {e}")
-            # Handle GZIP archives (single file)
-            elif archive_path.lower().endswith('.gz'):
-                extracted_path = extract_to / Path(archive_path).stem
-                try:
+                        # Avoid directory entries and potential path traversal issues
+                        if not file_info.is_dir() and file_info.filename and not file_info.filename.startswith('/') and '..' not in file_info.filename:
+                            try:
+                                extracted_path = extract_to / file_info.filename
+                                # Ensure parent directory exists
+                                extracted_path.parent.mkdir(parents=True, exist_ok=True)
+
+                                # Extract individual file safely
+                                with zip_ref.open(file_info.filename) as source, open(extracted_path, "wb") as target:
+                                    target.write(source.read())
+
+                                logger.debug(f"Extracted {file_info.filename} from zip.")
+                                # Now process the extracted file
+                                dataset.extend(self._process_single_file(extracted_path))
+                            except Exception as extract_err:
+                                logger.error(f"Failed to extract/process file {file_info.filename} from zip {archive_path_obj.name}: {extract_err}")
+
+            # Handle TAR archives (covers .tar, .tar.gz, .tgz, .tar.bz2)
+            # Need to import bz2 if supporting .bz2
+            elif tarfile.is_tarfile(archive_path):
+                logger.debug(f"Processing TAR file: {archive_path_obj.name}")
+                # Mode 'r:*' auto-detects compression (gz, bz2, xz if libs available)
+                with tarfile.open(archive_path, 'r:*') as tar_ref:
+                    # Add security checks for tar extraction if needed (e.g., checking paths)
+                    for member in tar_ref.getmembers():
+                        if member.isfile() and member.name and not member.name.startswith('/') and '..' not in member.name:
+                            try:
+                                # Construct safe path
+                                extracted_path = extract_to / member.name
+                                extracted_path.parent.mkdir(parents=True, exist_ok=True)
+                                # Extract safely
+                                with tar_ref.extractfile(member) as source, open(extracted_path, "wb") as target:
+                                    target.write(source.read())
+
+                                logger.debug(f"Extracted {member.name} from tar.")
+                                dataset.extend(self._process_single_file(extracted_path))
+                            except Exception as extract_err:
+                                logger.error(f"Failed to extract/process member {member.name} from tar {archive_path_obj.name}: {extract_err}")
+
+            # Handle GZIP archives (single file compression) - check it's not a tar.gz
+            elif archive_path.lower().endswith('.gz') and not archive_path.lower().endswith('.tar.gz'):
+                logger.debug(f"Processing GZIP file: {archive_path_obj.name}")
+                # Need to determine the output filename (remove .gz)
+                extracted_filename = archive_path_obj.stem
+                # Handle cases like '.txt.gz' -> '.txt'
+                if '.' in extracted_filename:
+                    extracted_path = extract_to / extracted_filename
+                else:
+                    # If no inner extension (e.g., 'myfile.gz'), maybe add a default like '.bin' or leave as is?
+                    extracted_path = extract_to / (extracted_filename + ".bin")  # Example
+
+                try:
+                    extracted_path.parent.mkdir(parents=True, exist_ok=True)
                     with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
                         outfile.write(gz_file.read())
-                    if extracted_path.suffix.lower() in self.supported_extensions:
-                        with open(extracted_path, 'rb') as f:
-                            dataset.extend(self._process_single_file(f))
-                except gzip.GzipFile as e:
-                    logger.error(f"Error processing GZIP archive: {e}")
-            # TODO: Add support for other archive types (.bz2, .7z, .rar) - may require external libraries
-            elif archive_path.lower().endswith(('.bz2', '.7z', '.rar')):
-                logger.warning(f"Support for {Path(archive_path).suffix} archives is not yet fully implemented.")
+                    logger.debug(f"Extracted {extracted_path.name} from gzip.")
+                    dataset.extend(self._process_single_file(extracted_path))
+                except gzip.BadGzipFile as e:
+                    logger.error(f"Error processing GZIP archive {archive_path_obj.name}: Bad Gzip File - {e}")
+                except Exception as extract_err:
+                    logger.error(f"Failed to extract/process gzip file {archive_path_obj.name}: {extract_err}")
+
+            # Add BZ2 single file support (requires bz2 import)
+            elif archive_path.lower().endswith('.bz2') and not archive_path.lower().endswith('.tar.bz2'):
+                logger.debug(f"Processing BZ2 file: {archive_path_obj.name}")
+                try:
+                    import bz2
+                    extracted_filename = archive_path_obj.stem
+                    extracted_path = extract_to / extracted_filename
+                    if '.' not in extracted_filename:
+                        extracted_path = extract_to / (extracted_filename + ".bin")
+
+                    extracted_path.parent.mkdir(parents=True, exist_ok=True)
+                    with bz2.open(archive_path, 'rb') as bz2_file, open(extracted_path, 'wb') as outfile:
+                        outfile.write(bz2_file.read())
+                    logger.debug(f"Extracted {extracted_path.name} from bz2.")
+                    dataset.extend(self._process_single_file(extracted_path))
+
+                except ImportError:
+                    logger.warning("bz2 library not available, cannot process .bz2 files.")
+                except Exception as extract_err:
+                    logger.error(f"Failed to extract/process bz2 file {archive_path_obj.name}: {extract_err}")
+
+
+            # Placeholder for other types or if no specific handler matched
+            else:
+                logger.warning(f"Archive type not explicitly handled or not a recognized archive: {archive_path_obj.name}")
 
+
+        except FileNotFoundError:
+            logger.error(f"Archive file not found: {archive_path}")
+        except (zipfile.BadZipFile, tarfile.TarError, gzip.BadGzipFile) as archive_err:
+            logger.error(f"Invalid or corrupted archive file {archive_path_obj.name}: {archive_err}")
+            dataset.append({'error': f'Corrupted archive: {archive_err}', 'filename': archive_path_obj.name})
         except Exception as e:
-            logger.error(f"Archive processing error: {e}")
+            logger.error(f"General archive processing error for {archive_path_obj.name}: {e}", exc_info=True)
+            dataset.append({'error': f'Archive processing failed: {e}', 'filename': archive_path_obj.name})
         return dataset
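The per-member zip handling above guards against path traversal and decompression bombs; the same idea in standalone form (stdlib `zipfile` only; the size limit is illustrative):

```python
import zipfile
from pathlib import Path

def safe_extract_zip(archive: str, dest: Path, max_total: int = 1 << 30) -> list:
    out = []
    with zipfile.ZipFile(archive) as zf:
        if sum(i.file_size for i in zf.infolist()) > max_total:
            raise ValueError("archive expands beyond limit (possible zip bomb)")
        for info in zf.infolist():
            if info.is_dir() or info.filename.startswith('/') or '..' in info.filename:
                continue  # skip directories and traversal attempts
            target = dest / info.filename
            target.parent.mkdir(parents=True, exist_ok=True)
            with zf.open(info) as src, open(target, 'wb') as dst:
                dst.write(src.read())
            out.append(target)
    return out
```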
 
-    def chunk_data(self, data: Union[Dict, List, str], max_size: int = 2953) -> List[Dict]:
-        """Enhanced data chunking with sequence metadata"""
+    # Adjusted chunk_data with recommended max_size for QR codes
+    def chunk_data(self, data: Union[Dict, List, str], max_size: int = 1800) -> List[Dict]:
+        """Enhanced data chunking with sequence metadata, sized for QR codes."""
         try:
             if not isinstance(data, str):
-                # Convert data to JSON string
-                json_str = json.dumps(data, ensure_ascii=False)
+                # Convert complex data to JSON string first
+                # Use separators=(',', ':') for compact JSON
+                json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
             else:
-                json_str = data
-            total_length = len(json_str)
-
-            # Calculate overhead for metadata
-            metadata_template = {
-                "chunk_index": 0,
-                "total_chunks": 1,
-                "total_length": total_length,
-                "chunk_hash": "",
-                "data": ""
-            }
-            overhead = len(json.dumps(metadata_template)) + 20  # Extra padding for safety
+                json_str = data  # Assume input string is already the data payload
 
-            # Calculate effective chunk size
-            effective_chunk_size = max_size - overhead
+            # Data here is the raw string (or JSON string) payload for the QR code
+            total_length = len(json_str.encode('utf-8'))  # Use byte length for QR capacity
+            logger.debug(f"Chunking data of total byte length: {total_length}")
 
-            if total_length <= effective_chunk_size:
-                # Data fits in one chunk
-                chunk = {
+
+            # Simplified: If the data fits within max_size (bytes), return one chunk object
+            # The chunk object itself adds metadata, but the 'data' field is what matters for QR limit.
+            if total_length <= max_size:
+                chunk_meta = {
                     "chunk_index": 0,
                     "total_chunks": 1,
-                    "total_length": total_length,
-                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,  # 32-bit hash
-                    "data": json_str
+                    "total_length": total_length,  # Store byte length
+                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,
+                    "data": json_str  # The actual string payload
                 }
-                return [chunk]
+                logger.debug(f"Data fits in one chunk (payload size {total_length} bytes)")
+                return [chunk_meta]
+
+            # If data exceeds max_size, split the string payload
+            # We need to split the *string* representation carefully
+            # Aim for byte size chunks, which is tricky with UTF-8 variable char width
+            # Simple approach: estimate character chunk size based on bytes
 
-            # Calculate number of chunks needed
-            num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
-            chunk_size = -(-total_length // num_chunks)  # Even distribution
+            # Estimate average bytes per character (crude but simple)
+            avg_bytes_per_char = total_length / len(json_str) if len(json_str) > 0 else 1
+            # Calculate target character chunk size based on byte limit
+            target_char_chunk_size = int(max_size / avg_bytes_per_char)
 
+            if target_char_chunk_size < 1: target_char_chunk_size = 1  # Avoid zero chunk size
+
+            # Calculate number of chunks based on estimated character size
+            num_chunks = math.ceil(len(json_str) / target_char_chunk_size)
 
             chunks = []
+            start_char_idx = 0
             for i in range(num_chunks):
-                start_idx = i * chunk_size
-                end_idx = min(start_idx + chunk_size, total_length)
-                chunk_data = json_str[start_idx:end_idx]
+                # Calculate end index, ensuring we don't overshoot
+                end_char_idx = min(start_char_idx + target_char_chunk_size, len(json_str))
+
+                # Extract the character chunk
+                chunk_payload_str = json_str[start_char_idx:end_char_idx]
 
-                chunk = {
+                # Recalculate actual byte length for this specific chunk
+                current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
+
+                # Adjust end_char_idx if current chunk exceeds max_size (rare if estimate is decent)
+                while current_chunk_byte_length > max_size and end_char_idx > start_char_idx:
+                    end_char_idx -= 1  # Reduce characters
+                    chunk_payload_str = json_str[start_char_idx:end_char_idx]
+                    current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
+
+                if not chunk_payload_str and start_char_idx < len(json_str):
+                    # This should not happen with the logic above, but as a safeguard
+                    logger.error("Chunking resulted in empty payload string unexpectedly.")
+                    # Handle error: skip, break, or adjust logic
+                    break  # Avoid infinite loop
+
+                chunk_meta = {
                     "chunk_index": i,
                     "total_chunks": num_chunks,
-                    "total_length": total_length,
-                    "chunk_hash": hash(chunk_data) & 0xFFFFFFFF,
-                    "data": chunk_data
+                    "total_length": total_length,  # Original total byte length
+                    "chunk_byte_length": current_chunk_byte_length,  # Actual byte length of this chunk's payload
+                    "chunk_hash": hash(chunk_payload_str) & 0xFFFFFFFF,
+                    "data": chunk_payload_str  # The string payload for this chunk
                 }
-                chunks.append(chunk)
+                chunks.append(chunk_meta)
+                logger.debug(f"Created chunk {i+1}/{num_chunks}, payload byte size: {current_chunk_byte_length}")
+
+                # Move to the next starting point
+                start_char_idx = end_char_idx
+
+                # Safety break if start index doesn't advance
+                if start_char_idx == len(json_str) and i + 1 < num_chunks:
+                    logger.warning(f"Chunking finished early at index {i+1} of {num_chunks}. Check logic.")
+                    # Adjust total_chunks if ending early?
+                    for ch in chunks: ch['total_chunks'] = len(chunks)
+                    break
+
+
+            # Final check if total chunks changed
+            if chunks and chunks[0]['total_chunks'] != len(chunks):
+                logger.warning(f"Adjusting total_chunks from {chunks[0]['total_chunks']} to {len(chunks)}")
+                final_num_chunks = len(chunks)
+                for i, chunk in enumerate(chunks):
+                    chunk['total_chunks'] = final_num_chunks
+                    chunk['chunk_index'] = i  # Re-index just in case
+
 
             return chunks
         except Exception as e:
-            logger.error(f"Error chunking data: {e}")
+            logger.error(f"Error chunking data: {e}", exc_info=True)
             return []
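Note that the new `chunk_data` calls `math.ceil`, so app.py needs `import math` somewhere above. To sanity-check the byte-oriented sizing: a 5,000-byte ASCII payload with `max_size=1800` needs ceil(5000/1800) = 3 chunks of roughly 1,667 characters each (illustrative numbers):

```python
import math

payload = 'x' * 5000                   # ASCII: 1 byte per character
max_size = 1800
total_bytes = len(payload.encode('utf-8'))
num_chunks = math.ceil(total_bytes / max_size)    # -> 3
per_chunk = math.ceil(len(payload) / num_chunks)  # -> 1667 chars, under the byte limit
assert per_chunk * num_chunks >= len(payload)
```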
 
-def generate_stylish_qr(data: Union[str, Dict],
+
+def generate_stylish_qr(data: str,  # Expecting string data from chunking
                         filename: str,
                         size: int = 10,
                         border: int = 4,
                         fill_color: str = "#000000",
-                        back_color: str = "#FFFFFF") -> str:
+                        back_color: str = "#FFFFFF",
+                        error_correction_level=qrcode.constants.ERROR_CORRECT_H) -> str:  # Added param
    """Generate a stylish QR code with enhanced visual appeal"""
    try:
        qr = qrcode.QRCode(
-            version=None,
-            error_correction=qrcode.constants.ERROR_CORRECT_H,
+            version=None,  # Auto-detect version
+            error_correction=error_correction_level,  # Use parameter
            box_size=size,
            border=border
        )
 
-        # Add data to QR code
-        if isinstance(data, dict):
-            qr.add_data(json.dumps(data, ensure_ascii=False))
-        else:
-            qr.add_data(data)
+        # Add string data directly (should be from chunker)
+        qr.add_data(data)
 
+        # Let the library figure out the best version and mode
        qr.make(fit=True)
 
+        logger.info(f"Generating QR code version {qr.version} for {filename} (Payload size: {len(data.encode('utf-8'))} bytes)")
+
+
        # Create QR code image with custom colors
        qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
 
-        # Convert to RGBA for transparency support
+        # Convert to RGBA for transparency support (optional gradient)
        qr_image = qr_image.convert('RGBA')
 
-        # Add subtle gradient overlay
-        gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
-        draw = ImageDraw.Draw(gradient)
-        for i in range(qr_image.width):
-            alpha = int(255 * (1 - i/qr_image.width) * 0.1)  # 10% maximum opacity
-            draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
-
-        # Combine images
-        final_image = Image.alpha_composite(qr_image, gradient)
+        # --- Optional: Add subtle gradient overlay ---
+        # gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
+        # draw = ImageDraw.Draw(gradient)
+        # for i in range(qr_image.width):
+        #     alpha = int(255 * (1 - i/qr_image.width) * 0.1)  # 10% maximum opacity
+        #     draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
+        # final_image = Image.alpha_composite(qr_image, gradient)
+        # --- End Optional Gradient ---
 
+        final_image = qr_image  # Use this line if gradient is commented out
 
        # Save the image
        output_path = QR_CODES_DIR / filename
-        final_image.save(output_path, quality=95)
+        # Ensure directory exists just before saving
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        final_image.save(output_path, quality=95)  # PNG quality is lossless, but ok
 
        return str(output_path)
+    # Catch specific data overflow error
+    except qrcode.exceptions.DataOverflowError as doe:
+        logger.error(f"QR DataOverflowError for {filename}: {doe}. Data length (bytes): {len(data.encode('utf-8'))}. Max capacity likely exceeded for ErrorLevel {error_correction_level}.")
+        return ""  # Return empty string on failure
    except Exception as e:
-        logger.error(f"QR generation error: {e}")
+        logger.error(f"QR generation error for {filename}: {e}", exc_info=True)
        return ""
 
-def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
-    """Generate QR codes with enhanced visual appeal and metadata"""
+
+def generate_qr_codes(data_to_encode: Union[str, Dict, List], combine_sources: bool = True) -> List[str]:
+    """Generate QR codes, chunking data appropriately."""
    try:
-        file_processor = EnhancedFileProcessor()
-        paths = []
-
-        if combined:
-            # Process combined data
-            chunks = file_processor.chunk_data(data)
-            for i, chunk in enumerate(chunks):
-                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
+        file_processor = EnhancedFileProcessor()  # Get chunking method
+        all_qr_paths = []
+        qr_fill = "#1a365d"  # Deep blue
+        qr_back = "#ffffff"
+        # Decide on error correction level - H is default, M or L allow more data
+        error_level = qrcode.constants.ERROR_CORRECT_H  # Max correction, lowest capacity
+        # error_level = qrcode.constants.ERROR_CORRECT_M  # Medium correction, medium capacity
+        # error_level = qrcode.constants.ERROR_CORRECT_L  # Low correction, max capacity
+
+
+        if combine_sources:
+            logger.info("Combining all input sources into a single QR sequence.")
+            # Combine all data into one large structure (e.g., a list) before chunking
+            # This assumes `data_to_encode` is already the combined list/dict from process_inputs
+            if not data_to_encode:
+                logger.warning("No data provided to generate combined QR codes.")
+                return []
+
+            # Chunk the combined data structure
+            chunks = file_processor.chunk_data(data_to_encode)  # Chunker expects dict/list/str
+            if not chunks:
+                logger.error("Chunking the combined data failed.")
+                return []
+
+            num_chunks = len(chunks)
+            logger.info(f"Generating {num_chunks} QR codes for combined data.")
+            for i, chunk_info in enumerate(chunks):
+                # chunk_info contains {'chunk_index', 'total_chunks', 'data', etc.}
+                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{num_chunks}.png'
+                # Pass the actual payload string to the generator
+                qr_payload = chunk_info['data']
                qr_path = generate_stylish_qr(
-                    data=chunk['data'],  # Use the 'data' part of the chunk
+                    data=qr_payload,
                    filename=filename,
-                    fill_color="#1a365d",  # Deep blue
-                    back_color="#ffffff"
+                    fill_color=qr_fill,
+                    back_color=qr_back,
+                    error_correction_level=error_level  # Pass level
                )
                if qr_path:
-                    paths.append(qr_path)
+                    all_qr_paths.append(qr_path)
+                else:
+                    logger.error(f"Failed to generate QR code for combined chunk {i+1}")
+                    # Optionally stop or continue?
+
        else:
-            # Process individual items
-            if isinstance(data, list):
-                for idx, item in enumerate(data):
-                    chunks = file_processor.chunk_data(item)
-                    for chunk_idx, chunk in enumerate(chunks):
-                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
-                        qr_path = generate_stylish_qr(
-                            data=chunk['data'],  # Use the 'data' part of the chunk
-                            filename=filename,
-                            fill_color="#1a365d",  # Deep blue
-                            back_color="#ffffff"
-                        )
-                        if qr_path:
-                            paths.append(qr_path)
-            else:
-                chunks = file_processor.chunk_data(data)
-                for i, chunk in enumerate(chunks):
-                    filename = f'single_qr_{i+1}_of_{len(chunks)}_{int(time.time())}.png'
+            # Process each item in the input list individually
+            logger.info("Generating separate QR code sequences for each input source.")
+            if not isinstance(data_to_encode, list):
+                logger.error("Input data must be a list when combine_sources is False.")
+                # Maybe wrap it?
+                if data_to_encode:
+                    data_to_encode = [data_to_encode]
+                else:
+                    return []
+
+
+            total_items = len(data_to_encode)
+            for item_idx, item in enumerate(data_to_encode):
+                item_source_info = f"item {item_idx+1}/{total_items}"
+                # Try to get a better name (e.g., from filename if available)
+                if isinstance(item, dict) and 'filename' in item:
+                    item_source_info = item['filename']
+                elif isinstance(item, dict) and 'url' in item:
+                    item_source_info = Path(urlparse(item['url']).path).name or f"url_item_{item_idx+1}"
+
+                logger.info(f"Processing source: {item_source_info}")
+
+                # Chunk the individual item
+                chunks = file_processor.chunk_data(item)
+                if not chunks:
+                    logger.error(f"Chunking failed for item {item_idx+1} ({item_source_info})")
+                    continue  # Skip to next item
+
+                num_chunks = len(chunks)
+                logger.info(f"Generating {num_chunks} QR codes for {item_source_info}.")
+                for chunk_idx, chunk_info in enumerate(chunks):
+                    # Sanitize source info for filename
+                    safe_source_name = re.sub(r'[^\w\-]+', '_', item_source_info)
+                    filename = f'{safe_source_name}_chunk_{chunk_idx+1}_of_{num_chunks}_{int(time.time())}.png'
+                    qr_payload = chunk_info['data']
                    qr_path = generate_stylish_qr(
-                        data=chunk['data'],  # Use the 'data' part of the chunk
                        filename=filename,
-                        fill_color="#1a365d",  # Deep blue
-                        back_color="#ffffff"
                    )
                    if qr_path:
-                        paths.append(qr_path)
-        return paths
    except Exception as e:
-        logger.error(f"QR code generation error: {e}")
        return []
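Two notes on the new branch logic. First, the URL-derived naming calls `urlparse`, so `from urllib.parse import urlparse` must be present at the top of app.py. Second, a typical call mirrors how `process_inputs` (further below) invokes it; the sample payload here is made up, and the printed path follows the `output/qr_codes` convention documented at the end of this file:

```python
items = [{'source': 'file', 'filename': 'notes.txt', 'content': 'hello world'}]
paths = generate_qr_codes(items, combine_sources=True)   # one sequenced set of PNGs
print(paths)   # e.g. ['output/qr_codes/combined_qr_1700000000_1_of_1.png']
```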
 
-def create_qr_sequence_visualizer(output_gallery):
    """Add QR sequence visualization capabilities to the application"""
-    # Create a new tab for the QR code sequence visualization
    with gr.Tab("🔄 QR Sequence Visualizer"):
        gr.Markdown("""
        ## QR Code Sequence Visualizer
-
-        Arrange and visualize your QR code sequences. Enable or disable individual QR codes to see how they connect.
        """)
 
-        # Inputs for the visualizer
-        with gr.Row():
-            qr_input = gr.File(
-                label="Upload QR Codes",
-                file_types=["image/png", "image/jpeg"],
-                file_count="multiple"
-            )
-
-            with gr.Column():
-                visualize_btn = gr.Button("🔄 Generate Visualization", variant="primary")
-                reset_btn = gr.Button("🗑️ Reset", variant="secondary")
-            # Container for QR code toggles
-            qr_toggles_container = gr.HTML(label="QR Code Controls")
 
-        # Output visualization
        with gr.Row():
-            qr_visualization = gr.Image(label="QR Code Sequence Map", height=600)
-            qr_preview = gr.Gallery(label="Selected QR Codes", columns=2, height=600)
 
-        # Status output
-        visualization_status = gr.Textbox(label="Visualization Status", interactive=False)
 
-        # Function to process uploaded QR codes
-        def process_qr_codes(files):
            if not files:
-                return "Please upload QR code images.", None, None, "⚠️ No QR codes uploaded"
 
            try:
-                # Load QR codes and extract metadata
-                qr_data = []
-                qr_paths = []
 
-                for file in files:
-                    try:
-                        img = Image.open(file.name)
 
-                        # Try to decode QR code
                        try:
-                            detector = qrcode.QRCodeDetector()
-                            data, bbox, _ = detector.detectAndDecode(np.array(img))
-                            if data:
-                                try:
-                                    qr_json = json.loads(data)
-                                    qr_data.append(qr_json)
-                                    qr_paths.append(file.name)
-                                except json.JSONDecodeError:
-                                    logger.warning(f"Could not decode JSON from QR: {data}")
-                                    qr_data.append({"data": data})  # Store raw data if JSON fails
-                                    qr_paths.append(file.name)
                            else:
-                                qr_data.append({"data": "Empty QR"})
-                                qr_paths.append(file.name)
-                        except Exception as e:
-                            logger.warning(f"Could not decode QR: {e}")
-                            # Add with default metadata
-                            qr_data.append({
-                                "chunk_index": len(qr_data),
-                                "total_chunks": len(files),
-                                "data": "Unknown"
-                            })
-                            qr_paths.append(file.name)
-                    except Exception as e:
-                        logger.error(f"Error processing QR image {file.name}: {e}")
-
-                if not qr_data:
-                    return "No valid QR codes found.", None, None, "❌ Failed to process QR codes"
-
-                # Sort by chunk_index if available
-                try:
-                    sorted_data = sorted(zip(qr_data, qr_paths), key=lambda x: x[0].get("chunk_index", 0))
-                    qr_data = [d[0] for d in sorted_data]
-                    qr_paths = [d[1] for d in sorted_data]
-                except Exception as e:
-                    logger.error(f"Error sorting QR data: {e}")
 
-                # Generate toggle controls HTML
-                toggle_html = '<div style="max-height: 500px; overflow-y: auto; padding: 10px;">'
-                toggle_html += '<h3>Enable/Disable QR Codes:</h3>'
-                for i, path in enumerate(qr_paths):
-                    toggle_html += f'<div><input type="checkbox" id="qr_toggle_{i}" checked> <label for="qr_toggle_{i}">{os.path.basename(path)}</label></div>'
-                toggle_html += '</div>'
 
-                # Update the toggles container
-                qr_toggles_container.update(value=toggle_html)
 
-                # Create initial visualization (replace with actual visualization logic)
-                initial_visualization = "Visualization will appear here."  # Replace with your composite image generation
-                qr_visualization.update(value=initial_visualization)
 
-                return "QR codes processed successfully.", qr_paths, qr_data, "✅ Visualization ready!"
            except Exception as e:
-                logger.error(f"Error processing QR codes: {e}")
-                return "An error occurred while processing QR codes.", None, None, "❌ Error"
 
-        # Function to generate visualization (replace with actual logic)
-        def generate_visualization(qr_paths):
-            enabled_indices = [i for i in range(len(qr_paths))]  # Start with all enabled
-            composite_image = "Updated visualization will appear here."  # Replace with your composite image generation based on enabled_indices
-            qr_visualization.update(value=composite_image)
 
        # Event handlers
-        visualize_btn.click(process_qr_codes, inputs=qr_input, outputs=[visualization_status, qr_visualization, qr_preview])
-        reset_btn.click(lambda: (None, None, None, "⚠️ Visualization reset."), outputs=[visualization_status, qr_visualization, qr_preview])
-
-# Integrate the visualizer into the main application
-def visualize_qr_codes(qr_paths):
-    """Visualize the generated QR codes with enable/disable functionality"""
-    # This function currently receives the output gallery content (list of file paths)
-    # You might need to adapt this based on how you want to visualize.
-    # For now, let's just log the paths.
-    logger.info(f"Visualizing QR codes: {qr_paths}")
-    return "Visualization placeholder"  # Replace with actual visualization logic
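One caveat worth flagging in the removed visualizer: `qrcode.QRCodeDetector()` is not part of the `qrcode` package; detection and decoding normally come from OpenCV, which is presumably where this call pattern was borrowed from. A minimal decoding sketch (assuming `opencv-python`, numpy, and Pillow are installed):

```python
import cv2
import numpy as np
from PIL import Image

img = np.array(Image.open('example_qr.png').convert('RGB'))
detector = cv2.QRCodeDetector()             # OpenCV, not the qrcode package
data, bbox, _ = detector.detectAndDecode(img)
if data:
    print('decoded payload:', data)
```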
 
 def create_modern_interface():
     """Create a modern and visually appealing Gradio interface"""
 
-    # Modern CSS styling
     css = """
     /* Modern color scheme */
     :root {
@@ -680,21 +1112,25 @@ def create_modern_interface():
     /* Gallery styling */
     .gallery {
         display: grid;
-        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
         gap: 1rem;
         padding: 1rem;
         background-color: white;
         border-radius: 0.5rem;
         border: 1px solid #e2e8f0;
     }
     .gallery img {
         width: 100%;
         height: auto;
         border-radius: 0.375rem;
         transition: transform 0.2s;
     }
     .gallery img:hover {
         transform: scale(1.05);
     }
     """
     # Create interface with modern design
@@ -703,205 +1139,305 @@ def create_modern_interface():
     # 🌐 Advanced Data Processing & QR Code Generator
     Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
     """)
-    with gr.Tab("📝 URL Processing"):
-        url_input = gr.Textbox(
-            label="Enter URLs (comma or newline separated)",
-            lines=5,
-            placeholder="https://example1.com\nhttps://example2.com",
-            value=""
-        )
-    with gr.Tab("📁 File Input"):
-        file_input = gr.File(
-            label="Upload Files",
-            file_types=["*"],  # Accept all file types
-            file_count="multiple"
-        )
-    with gr.Tab("📋 JSON Input"):
-        text_input = gr.TextArea(
-            label="Direct JSON Input",
-            lines=15,
-            placeholder="Paste your JSON data here...",
-            value=""
-        )
-    with gr.Row():
-        example_btn = gr.Button("📝 Load Example", variant="secondary")
-        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
     with gr.Row():
-        combine_data = gr.Checkbox(
-            label="Combine all data into sequence ",
-            value=True,
-            info="Generate sequential QR codes for combined data"
-        )
-        process_btn = gr.Button(
-            "🔄 Process & Generate QR",
-            variant="primary"
-        )
-    # Output components
-    output_json = gr.JSON(label="Processed Data")
-    output_gallery = gr.Gallery(
-        label="Generated QR Codes",
-        columns=3,
-        height=400,
-        show_label=True
-    )
-    output_text = gr.Textbox(
-        label="Processing Status",
-        interactive=False
-    )
 
     # Load example data
     def load_example():
         example = {
-            "type": "product_catalog",
             "items": [
-                {
-                    "id": "123",
-                    "name": "Premium Widget",
-                    "description": "High-quality widget with advanced features",
-                    "price": 299.99,
-                    "category": "electronics",
-                    "tags": ["premium", "featured", "new"]
-                },
-                {
-                    "id": "456",
-                    "name": "Basic Widget",
-                    "description": "Reliable widget for everyday use",
-                    "price": 149.99,
-                    "category": "electronics",
-                    "tags": ["basic", "popular"]
-                }
             ],
-            "metadata": {
-                "timestamp": datetime.now().isoformat(),
-                "version": "2.0",
-                "source": "example"
-            }
         }
         return json.dumps(example, indent=2)
 
-    def clear_input():
         return ""
 
-    def process_inputs(urls, files, text, combine):
-        """Process all inputs and generate QR codes"""
        try:
-            results = []
-            url_processor = EnhancedURLProcessor()
-            file_processor = EnhancedFileProcessor()
-
-            # Process JSON input
-            if text and text.strip():
-                try:
-                    json_data = json.loads(text)
-                    if isinstance(json_data, list):
-                        results.extend(json_data)
-                    else:
-                        results.append(json_data)
-                except json.JSONDecodeError as e:
-                    return None, [], f"❌ Invalid JSON format: {str(e)}"
-
-            # Process URLs
-            if urls and urls.strip():
-                url_list = re.split(r'[,\n]', urls)
-                url_list = [url.strip() for url in url_list if url.strip()]
-                for url in url_list:
-                    validation = url_processor.validate_url(url)
-                    if validation['is_valid']:
-                        content_data = url_processor.fetch_content(url)
-                        if content_data and 'content' in content_data:
-                            # Chunk the content of each URL
-                            chunks = file_processor.chunk_data(content_data['content'])
-                            for i, chunk in enumerate(chunks):
-                                results.append({
-                                    'source': 'url',
-                                    'url': url,
-                                    'chunk_index': i + 1,
-                                    'total_chunks': len(chunks),
-                                    'content': chunk['data'],  # Store the chunked data
-                                    'timestamp': datetime.now().isoformat()
-                                })
-
-            # Process files
-            if files:
-                for file in files:
-                    file_results = file_processor.process_file(file)
-                    if file_results:
-                        results.extend(file_results)
-
-            # Generate QR codes
-            if results:
-                qr_paths = generate_qr_codes(results, combine)
-                if qr_paths:
-                    return (
-                        results,
-                        [str(path) for path in qr_paths],
-                        f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
-                    )
-                else:
-                    return None, [], "❌ Failed to generate QR codes"
-            else:
-                return None, [], "⚠️ No valid content to process"
-        except Exception as e:
-            logger.error(f"Processing error: {e}")
-            return None, [], f"❌ Error: {str(e)}"
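The URL handling above splits the textbox on commas and newlines before validating each entry; that one-liner behaves like this in isolation (sample input made up):

```python
import re

raw = "https://a.example.com, https://b.example.com\nhttps://c.example.com"
urls = [u.strip() for u in re.split(r'[,\n]', raw) if u.strip()]
# -> ['https://a.example.com', 'https://b.example.com', 'https://c.example.com']
```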
 
-    # Set up event handlers
     example_btn.click(load_example, outputs=[text_input])
-    clear_btn.click(clear_input, outputs=[text_input])
     process_btn.click(
-        process_inputs,
         inputs=[url_input, file_input, text_input, combine_data],
-        outputs=[output_json, output_gallery, output_text]
     )
 
-    # Add the visualization button and its click event within the interface scope
-    visualize_btn = gr.Button("🔍 Visualize QR Codes")
-    visualize_btn.click(visualize_qr_codes, inputs=output_gallery, outputs=None)
-
-    # Add helpful documentation
     gr.Markdown("""
     ### 🚀 Features
-    - **Complete URL Scraping**: Extracts every character from web pages
-    - **Advanced File Processing**: Full content extraction from various text-based files and common archives. Supports flexible JSON handling.
-    - **Smart JSON Handling**: Processes any size JSON with automatic chunking, either via direct input or file upload.
-    - **Sequential QR Codes**: Maintains data integrity across multiple codes
-    - **Modern Design**: Clean, responsive interface with visual feedback
     ### 💡 Tips
-    1. **URLs**: Enter multiple URLs separated by commas or newlines
-    2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and JSON files.
-    3. **JSON**: Use the example button to see the expected format or upload a .json file. The system will also try to detect JSON content in other file types.
-    4. **QR Codes**: Choose whether to combine data into sequential codes
-    5. **Processing**: Monitor the status for real-time feedback
     ### 🎨 Output
-    - Generated QR codes are saved in the `output/qr_codes` directory
-    - Each QR code contains metadata for proper sequencing
-    - Hover over QR codes in the gallery to see details
-    """)
-    return interface
 
 def main():
     """Initialize and launch the application"""
     try:
-        # Configure system settings
-        mimetypes.init()
 
         # Create and launch interface
         interface = create_modern_interface()
 
-        # Add the QR sequence visualizer tab
-        with interface:
-            create_qr_sequence_visualizer(None)  # output_gallery might not be relevant here
 
         # Launch with configuration
         interface.launch(
-            share=False,
-            debug=False,
-            show_error=True,
-            show_api=False
         )
     except Exception as e:
-        logger.error(f"Application startup error: {e}")
         raise
 
 if __name__ == "__main__":
     main()
56
 
57
  # Enhanced headers for better site compatibility
58
  self.session.headers.update({
59
+ 'User-Agent': self.user_agent.random, # Corrected spacing
60
  'Accept': '*/*', # Accept all content types
61
  'Accept-Language': 'en-US,en;q=0.9',
62
  'Accept-Encoding': 'gzip, deflate, br',
 
65
  'Sec-Fetch-Dest': 'document',
66
  'Sec-Fetch-Mode': 'navigate',
67
  'Sec-Fetch-Site': 'none',
68
+ 'Sec-Fetch-User': '?1', # Corrected spacing
69
  'DNT': '1'
70
  })
71
 
 
78
  if not all([parsed.scheme, parsed.netloc]):
79
  return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
80
  # Try HEAD request first to check accessibility
81
+ head_response = None # Initialize head_response
82
  try:
83
  head_response = self.session.head(url, timeout=5)
84
  head_response.raise_for_status()
85
+ # Need details from head_response if successful
86
+ details = {
87
+ 'content_type': head_response.headers.get('Content-Type', 'unknown'),
88
+ 'server': head_response.headers.get('Server', 'unknown'),
89
+ 'size': head_response.headers.get('Content-Length', 'unknown')
90
+ }
91
  except requests.exceptions.RequestException:
92
  # If HEAD fails, try GET as some servers don't support HEAD
93
+ logger.info(f"HEAD request failed for {url}, trying GET.")
94
  response = self.session.get(url, timeout=self.timeout)
95
  response.raise_for_status()
96
+ # Use details from GET response if HEAD failed
97
+ details = {
98
+ 'content_type': response.headers.get('Content-Type', 'unknown'),
99
+ 'server': response.headers.get('Server', 'unknown'),
100
+ 'size': response.headers.get('Content-Length', 'unknown') # Might not be accurate for GET stream
101
+ }
102
 
103
  return {
104
  'is_valid': True,
105
  'message': 'URL is valid and accessible',
106
+ 'details': details
 
 
 
 
107
  }
108
  except Exception as e:
109
  return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
 
114
  logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
115
 
116
  # Update User-Agent randomly for each request
117
+ self.session.headers.update({'User-Agent': self.user_agent.random}) # Corrected spacing
118
 
119
  response = self.session.get(url, timeout=self.timeout)
120
  response.raise_for_status()
 
128
  # Decode content with fallback
129
  try:
130
  raw_content = response.content.decode(encoding, errors='replace')
131
+ except (UnicodeDecodeError, LookupError): # Corrected error type
132
  raw_content = response.content.decode('utf-8', errors='replace')
133
 
134
  # Extract metadata
 
147
  if 'text/html' in content_type:
148
  processed_content = self._process_html_content(raw_content, url)
149
  else:
150
+ processed_content = raw_content # Store raw non-html content as processed
151
  return {
152
  'content': processed_content,
153
+ 'raw_content': raw_content, # Keep raw bytes if needed elsewhere
154
  'metadata': metadata
155
  }
156
  except requests.exceptions.RequestException as e:
 
174
  for attr in ['href', 'src']:
175
  if tag.get(attr):
176
  try:
177
+ # Handle potential base tag
178
+ base = soup.find('base')
179
+ current_base_url = base['href'] if base and base.get('href') else base_url
180
+ tag[attr] = urljoin(current_base_url, tag[attr])
181
+ except Exception as url_e:
182
+ # logger.warning(f"Could not absolutize URL {tag.get(attr)} in {base_url}: {url_e}")
183
+ pass # Keep original if conversion fails
184
+
185
+ # Extract all text content more cleanly
186
+ text_parts = [element for element in soup.stripped_strings]
187
+ # text_content = ' '.join(text_parts) # Join with space instead of newline? Depends on use case.
188
+ # Or keep newlines for structure:
189
+ text_content = '\n'.join(text_parts)
190
+
191
+ # Alternative: Get all text including scripts/styles if needed
192
+ # text_content = soup.get_text(separator='\n', strip=True)
193
+
194
+ return text_content
195
  except Exception as e:
196
  logger.error(f"HTML processing error: {e}")
197
+ # Return original content if parsing fails
198
  return content
199
 
200
  class EnhancedFileProcessor:
201
  """Advanced file processing with complete content extraction"""
202
  def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default
203
  self.max_file_size = max_file_size
204
+ # Added more potential text/data formats
205
  self.supported_extensions = {
206
+ '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
207
+ '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h', # Code files
208
+ '.zip', '.tar', '.gz', '.bz2', # No .7z, .rar without external libs
209
+ # '.pdf', '.doc', '.docx', '.rtf', '.odt' # These require more specific libraries (PyPDF2, python-docx etc.) - keep commented unless implemented
210
  }
211
+ # Define extensions that should be treated primarily as text
212
+ self.text_extensions = {
213
+ '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.css', '.js',
214
+ '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg', '.toml', '.sql', '.py', '.java', '.c', '.cpp', '.h'
215
+ }
216
+
217
 
218
  def process_file(self, file) -> List[Dict]:
219
  """Process uploaded file with enhanced error handling and complete extraction"""
220
+ if not file or not hasattr(file, 'name'):
221
+ logger.warning("Invalid file object received in process_file.")
222
+ return []
223
 
224
  dataset = []
225
+ file_path_obj = Path(file.name)
226
+
227
  try:
228
+ # Use Gradio's temp file path directly
229
+ file_path = file_path_obj.resolve()
230
+ if not file_path.exists():
231
+ logger.error(f"File path does not exist: {file_path}")
232
+ return []
233
+
234
+ file_size = file_path.stat().st_size
235
  if file_size > self.max_file_size:
236
+ logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size ({self.max_file_size} bytes) for {file_path.name}")
237
+ # Optionally return a specific error message entry
238
+ # return [{'error': 'File too large', 'filename': file_path.name}]
239
  return []
240
 
241
+ file_suffix = file_path.suffix.lower()
242
+
243
+ # Check if supported at all
244
+ # if file_suffix not in self.supported_extensions and not self._is_archive(str(file_path)):
245
+ # logger.warning(f"Unsupported file type based on extension: {file_path.name}")
246
+ # # Decide if you want to try processing anyway or return
247
+ # # return [{'error': 'Unsupported file type', 'filename': file_path.name}]
248
+ # # Let's try processing anyway, _process_single_file will handle text reading
249
+ # pass # Continue to attempt processing
250
+
251
+ # Use a persistent temp directory if needed across calls, otherwise TemporaryDirectory is fine
252
+ with tempfile.TemporaryDirectory(dir=TEMP_DIR) as temp_dir: # Use configured temp dir
253
  temp_dir_path = Path(temp_dir)
254
 
255
+ # Handle archives first
256
+ if self._is_archive(str(file_path)):
257
+ logger.info(f"Processing archive file: {file_path.name}")
258
+ dataset.extend(self._process_archive(str(file_path), temp_dir_path))
 
259
  else:
260
+ # Process as single file (might be text or something else)
261
+ logger.info(f"Processing single file: {file_path.name}")
262
+ # Pass the path string or Path object to _process_single_file
263
+ dataset.extend(self._process_single_file(file_path))
264
+
265
 
266
  except Exception as e:
267
+ logger.error(f"Error processing file '{file_path_obj.name}': {str(e)}", exc_info=True) # Log stack trace
268
+ # Optionally return error entry
269
+ # dataset.append({'error': f'Processing failed: {str(e)}', 'filename': file_path_obj.name})
270
+ return [] # Return empty list on error for now
271
  return dataset
272
 
273
  def _is_archive(self, filepath: str) -> bool:
274
+ """Check if file is a supported archive type"""
275
+ # Only include archive types we can handle
276
+ return filepath.lower().endswith(('.zip', '.tar', '.tar.gz', '.tgz', '.gz', '.bz2')) # Added bz2 if bz2 lib is imported
 
277
 
278
+ def _process_single_file(self, file_path: Union[str, Path]) -> List[Dict]:
279
  """Process a single file with enhanced character extraction and JSON handling"""
280
+ # Ensure file_path is a Path object
281
+ file_path = Path(file_path)
282
+ file_name = file_path.name
283
+ file_suffix = file_path.suffix.lower()
284
+
285
  try:
286
+ file_stat = file_path.stat()
287
  file_size = file_stat.st_size
288
+ mime_type, _ = mimetypes.guess_type(file_path)
289
+ mime_type = mime_type or 'application/octet-stream' # Default if guess fails
290
+
291
  # Initialize content storage
292
+ complete_content = None
293
+ is_json_like = file_suffix == '.json' or 'json' in mime_type
294
+
295
+ # Try reading as text first if it's a text-like extension or potentially text mime type
296
+ # Increased chunk size for efficiency on larger text files
297
  chunk_size = 10 * 1024 * 1024 # 10MB chunks
298
+ if file_suffix in self.text_extensions or (mime_type and mime_type.startswith('text/')):
299
+ content_parts = []
300
+ detected_encoding = 'utf-8' # Default
301
+ try:
302
+ with open(file_path, 'rb') as f:
303
+ # Detect encoding from the first chunk for better accuracy
304
+ first_chunk = f.read(chunk_size)
305
+ if first_chunk:
306
+ detected_encoding = chardet.detect(first_chunk)['encoding'] or 'utf-8'
307
+ logger.info(f"Detected encoding for {file_name}: {detected_encoding}")
308
+ # Rewind or reopen might be cleaner if needed, but let's decode first chunk
309
+ try:
310
+ decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
311
+ content_parts.append(decoded_chunk)
312
+ except (UnicodeDecodeError, LookupError):
313
+ logger.warning(f"Failed to decode first chunk with {detected_encoding}, falling back to utf-8 for {file_name}")
314
+ detected_encoding = 'utf-8' # Fallback for subsequent reads
315
+ decoded_chunk = first_chunk.decode(detected_encoding, errors='replace')
316
+ content_parts.append(decoded_chunk)
317
+
318
+ # Read remaining chunks
319
+ while True:
320
+ chunk = f.read(chunk_size)
321
+ if not chunk:
322
+ break
323
+ try:
324
+ decoded_chunk = chunk.decode(detected_encoding, errors='replace')
325
+ content_parts.append(decoded_chunk)
326
+ except (UnicodeDecodeError, LookupError):
327
+ # Should not happen if fallback already occurred, but good practice
328
+ logger.warning(f"Decoding error in subsequent chunk for {file_name}, using replace.")
329
+ decoded_chunk = chunk.decode(detected_encoding, errors='replace')
330
+ content_parts.append(decoded_chunk)
331
+
332
+ complete_content = ''.join(content_parts)
333
+ logger.info(f"Successfully read text content from {file_name}")
334
+
335
+ except IOError as e:
336
+ logger.error(f"IOError reading file {file_name}: {e}")
337
+ return [] # Cannot process if read fails
338
+ except Exception as e:
339
+ logger.error(f"Error reading text file {file_name}: {e}", exc_info=True)
340
+ # Decide if we should return or try other methods
341
+ return []
342
+
343
+
344
+ # Now, check if the read text content IS valid JSON
345
+ json_data = None
346
+ raw_json_content = None # Store the raw string if it was JSON
347
+ if complete_content is not None:
348
+ try:
349
  json_data = json.loads(complete_content)
350
+ # It is JSON! Update metadata
351
+ raw_json_content = complete_content # Keep the original string
352
+ complete_content = json_data # Now content holds the parsed object
353
+ mime_type = 'application/json' # Correct mime type
354
+ source = 'json_content_detected'
355
+ if file_suffix == '.json':
356
+ source = 'json_file'
357
+ logger.info(f"Successfully parsed JSON content from {file_name}")
358
+
359
+ except json.JSONDecodeError:
360
+ # It looked like text, but wasn't valid JSON
361
+ if is_json_like:
362
+ logger.warning(f"File {file_name} has JSON extension/mime but failed to parse.")
363
+ # Keep complete_content as the string it was read as
364
+ source = 'text_file'
365
+ except Exception as e:
366
+ logger.error(f"Unexpected error during JSON parsing check for {file_name}: {e}")
367
+ # Keep complete_content as string, mark as text file
368
+ source = 'text_file'
369
+ else:
370
+ # File wasn't identified as text or failed to read
371
+ # Could attempt binary read here if needed, or just mark as non-text
372
+ logger.warning(f"Could not read {file_name} as text. Storing metadata only or treating as binary.")
373
+ source = 'binary_file' # Or 'unreadable_file'
374
+ complete_content = f"Binary or unreadable content ({file_size} bytes)" # Placeholder
375
+
 
 
 
 
 
376
 
377
+ # Structure the output
378
+ result = {
379
+ 'source': source,
380
+ 'filename': file_name,
381
  'file_size': file_size,
382
+ 'mime_type': mime_type,
383
  'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
384
  'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
385
+ 'content': complete_content, # This is parsed JSON if successful, or text string, or placeholder
386
  'timestamp': datetime.now().isoformat()
387
+ }
388
+ if raw_json_content:
389
+ result['raw_content'] = raw_json_content # Add raw string if it was JSON
390
+
391
+ return [result]
392
+
393
+ except FileNotFoundError:
394
+ logger.error(f"File not found during processing: {file_path}")
395
+ return []
396
  except Exception as e:
397
+ logger.error(f"File processing error for {file_path.name}: {e}", exc_info=True)
398
  return []
399
 
400
  def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
401
  """Process an archive file with enhanced extraction"""
402
  dataset = []
403
+ archive_path_obj = Path(archive_path)
404
+ logger.info(f"Attempting to extract archive: {archive_path_obj.name}")
405
+
406
  try:
407
  # Handle ZIP archives
408
+ if archive_path.lower().endswith('.zip') and zipfile.is_zipfile(archive_path):
409
+ logger.debug(f"Processing ZIP file: {archive_path_obj.name}")
410
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
411
+ # Check for zip bomb potential (optional, basic check)
412
+ total_uncompressed_size = sum(file.file_size for file in zip_ref.infolist())
413
+ # Add a limit, e.g., 10x the archive size or an absolute limit like 10GB
414
+ if total_uncompressed_size > self.max_file_size * 10: # Example limit
415
+ logger.warning(f"Potential zip bomb detected: {archive_path_obj.name}, uncompressed size {total_uncompressed_size}")
416
+ return [{'error': 'Archive potential bomb', 'filename': archive_path_obj.name}]
417
+
418
  for file_info in zip_ref.infolist():
419
+ # Avoid directory entries and potential path traversal issues
420
+ if not file_info.is_dir() and file_info.filename and not file_info.filename.startswith('/') and '..' not in file_info.filename:
421
+ try:
422
+ extracted_path = extract_to / file_info.filename
423
+ # Ensure parent directory exists
424
+ extracted_path.parent.mkdir(parents=True, exist_ok=True)
425
+
426
+ # Extract individual file safely
427
+ with zip_ref.open(file_info.filename) as source, open(extracted_path, "wb") as target:
428
+ target.write(source.read())
429
+
430
+ logger.debug(f"Extracted {file_info.filename} from zip.")
431
+ # Now process the extracted file
432
+ dataset.extend(self._process_single_file(extracted_path))
433
+ except Exception as extract_err:
434
+ logger.error(f"Failed to extract/process file {file_info.filename} from zip {archive_path_obj.name}: {extract_err}")
435
+
436
+ # Handle TAR archives (covers .tar, .tar.gz, .tgz, .tar.bz2)
437
+ # Need to import bz2 if supporting .bz2
438
+ elif tarfile.is_tarfile(archive_path):
439
+ logger.debug(f"Processing TAR file: {archive_path_obj.name}")
440
+ # Mode 'r:*' auto-detects compression (gz, bz2, xz if libs available)
441
+ with tarfile.open(archive_path, 'r:*') as tar_ref:
442
+ # Add security checks for tar extraction if needed (e.g., checking paths)
443
+ for member in tar_ref.getmembers():
444
+ if member.isfile() and member.name and not member.name.startswith('/') and '..' not in member.name:
445
+ try:
446
+ # Construct safe path
447
+ extracted_path = extract_to / member.name
448
+ extracted_path.parent.mkdir(parents=True, exist_ok=True)
449
+ # Extract safely
450
+ with tar_ref.extractfile(member) as source, open(extracted_path, "wb") as target:
451
+ target.write(source.read())
452
+
453
+ logger.debug(f"Extracted {member.name} from tar.")
454
+ dataset.extend(self._process_single_file(extracted_path))
455
+ except Exception as extract_err:
456
+ logger.error(f"Failed to extract/process member {member.name} from tar {archive_path_obj.name}: {extract_err}")
457
+
458
+ # Handle GZIP archives (single file compression) - check it's not a tar.gz
459
+ elif archive_path.lower().endswith('.gz') and not archive_path.lower().endswith('.tar.gz'):
460
+ logger.debug(f"Processing GZIP file: {archive_path_obj.name}")
461
+ # Need to determine the output filename (remove .gz)
462
+ extracted_filename = archive_path_obj.stem
463
+ # Handle cases like '.txt.gz' -> '.txt'
464
+ if '.' in extracted_filename:
465
+ extracted_path = extract_to / extracted_filename
466
+ else:
467
+ # If no inner extension (e.g., 'myfile.gz'), maybe add a default like '.bin' or leave as is?
468
+ extracted_path = extract_to / (extracted_filename + ".bin") # Example
469
+
470
+ try:
471
+ extracted_path.parent.mkdir(parents=True, exist_ok=True)
472
  with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
473
  outfile.write(gz_file.read())
474
+ logger.debug(f"Extracted {extracted_path.name} from gzip.")
475
+ dataset.extend(self._process_single_file(extracted_path))
476
+ except gzip.BadGzipFile as e:
477
+ logger.error(f"Error processing GZIP archive {archive_path_obj.name}: Bad Gzip File - {e}")
478
+ except Exception as extract_err:
479
+ logger.error(f"Failed to extract/process gzip file {archive_path_obj.name}: {extract_err}")
480
+
481
+ # Add BZ2 single file support (requires bz2 import)
482
+ elif archive_path.lower().endswith('.bz2') and not archive_path.lower().endswith('.tar.bz2'):
483
+ logger.debug(f"Processing BZ2 file: {archive_path_obj.name}")
484
+ try:
485
+ import bz2
486
+ extracted_filename = archive_path_obj.stem
487
+ extracted_path = extract_to / extracted_filename
488
+ if '.' not in extracted_filename:
489
+ extracted_path = extract_to / (extracted_filename + ".bin")
490
+
491
+ extracted_path.parent.mkdir(parents=True, exist_ok=True)
492
+ with bz2.open(archive_path, 'rb') as bz2_file, open(extracted_path, 'wb') as outfile:
493
+ outfile.write(bz2_file.read())
494
+ logger.debug(f"Extracted {extracted_path.name} from bz2.")
495
+ dataset.extend(self._process_single_file(extracted_path))
496
+
497
+ except ImportError:
498
+ logger.warning("bz2 library not available, cannot process .bz2 files.")
499
+ except Exception as extract_err:
500
+ logger.error(f"Failed to extract/process bz2 file {archive_path_obj.name}: {extract_err}")
501
+
502
+
503
+ # Placeholder for other types or if no specific handler matched
504
+ else:
505
+ logger.warning(f"Archive type not explicitly handled or not a recognized archive: {archive_path_obj.name}")
506
 
507
+
508
+ except FileNotFoundError:
509
+ logger.error(f"Archive file not found: {archive_path}")
510
+ except (zipfile.BadZipFile, tarfile.TarError, gzip.BadGzipFile) as archive_err:
511
+ logger.error(f"Invalid or corrupted archive file {archive_path_obj.name}: {archive_err}")
512
+ dataset.append({'error': f'Corrupted archive: {archive_err}', 'filename': archive_path_obj.name})
513
  except Exception as e:
514
+ logger.error(f"General archive processing error for {archive_path_obj.name}: {e}", exc_info=True)
515
+ dataset.append({'error': f'Archive processing failed: {e}', 'filename': archive_path_obj.name})
516
  return dataset
517
 
518
+ # Adjusted chunk_data with recommended max_size for QR codes
519
+ def chunk_data(self, data: Union[Dict, List, str], max_size: int = 1800) -> List[Dict]:
520
+ """Enhanced data chunking with sequence metadata, sized for QR codes."""
521
  try:
522
  if not isinstance(data, str):
523
+ # Convert complex data to JSON string first
524
+ # Use separators=(',', ':') for compact JSON
525
+ json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
526
  else:
527
+ json_str = data # Assume input string is already the data payload
 
 
 
 
 
 
 
 
 
 
 
528
 
529
+ # Data here is the raw string (or JSON string) payload for the QR code
530
+ total_length = len(json_str.encode('utf-8')) # Use byte length for QR capacity
531
+ logger.debug(f"Chunking data of total byte length: {total_length}")
532
 
533
+
534
+ # Simplified: If the data fits within max_size (bytes), return one chunk object
535
+ # The chunk object itself adds metadata, but the 'data' field is what matters for QR limit.
536
+ if total_length <= max_size:
537
+ chunk_meta = {
538
  "chunk_index": 0,
539
  "total_chunks": 1,
540
+ "total_length": total_length, # Store byte length
541
+ "chunk_hash": hash(json_str) & 0xFFFFFFFF,
542
+ "data": json_str # The actual string payload
543
  }
544
+ logger.debug(f"Data fits in one chunk (payload size {total_length} bytes)")
545
+ return [chunk_meta]
546
+
547
+ # If data exceeds max_size, split the string payload
548
+ # We need to split the *string* representation carefully
549
+ # Aim for byte size chunks, which is tricky with UTF-8 variable char width
550
+ # Simple approach: estimate character chunk size based on bytes
551
+
552
+ # Estimate average bytes per character (crude but simple)
553
+ avg_bytes_per_char = total_length / len(json_str) if len(json_str) > 0 else 1
554
+ # Calculate target character chunk size based on byte limit
555
+ target_char_chunk_size = int(max_size / avg_bytes_per_char)
556
 
557
+ if target_char_chunk_size < 1: target_char_chunk_size = 1 # Avoid zero chunk size
558
+
559
+ # Calculate number of chunks based on estimated character size
560
+ num_chunks = math.ceil(len(json_str) / target_char_chunk_size)
561
 
562
  chunks = []
563
+ start_char_idx = 0
564
  for i in range(num_chunks):
565
+ # Calculate end index, ensuring we don't overshoot
566
+ end_char_idx = min(start_char_idx + target_char_chunk_size, len(json_str))
567
+
568
+ # Extract the character chunk
569
+ chunk_payload_str = json_str[start_char_idx:end_char_idx]
570
 
571
+ # Recalculate actual byte length for this specific chunk
572
+ current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
573
+
574
+ # Adjust end_char_idx if current chunk exceeds max_size (rare if estimate is decent)
575
+ while current_chunk_byte_length > max_size and end_char_idx > start_char_idx:
576
+ end_char_idx -= 1 # Reduce characters
577
+ chunk_payload_str = json_str[start_char_idx:end_char_idx]
578
+ current_chunk_byte_length = len(chunk_payload_str.encode('utf-8'))
579
+
580
+ if not chunk_payload_str and start_char_idx < len(json_str):
581
+ # This should not happen with the logic above, but as a safeguard
582
+ logger.error("Chunking resulted in empty payload string unexpectedly.")
583
+ # Handle error: skip, break, or adjust logic
584
+ break # Avoid infinite loop
585
+
586
+ chunk_meta = {
587
  "chunk_index": i,
588
  "total_chunks": num_chunks,
589
+ "total_length": total_length, # Original total byte length
590
+ "chunk_byte_length": current_chunk_byte_length, # Actual byte length of this chunk's payload
591
+ "chunk_hash": hash(chunk_payload_str) & 0xFFFFFFFF,
592
+ "data": chunk_payload_str # The string payload for this chunk
593
  }
594
+ chunks.append(chunk_meta)
595
+ logger.debug(f"Created chunk {i+1}/{num_chunks}, payload byte size: {current_chunk_byte_length}")
596
+
597
+ # Move to the next starting point
598
+ start_char_idx = end_char_idx
599
+
600
+ # Safety break if start index doesn't advance
601
+ if start_char_idx == len(json_str) and i + 1 < num_chunks:
602
+ logger.warning(f"Chunking finished early at index {i+1} of {num_chunks}. Check logic.")
603
+ # Adjust total_chunks if ending early?
604
+ for ch in chunks: ch['total_chunks'] = len(chunks)
605
+ break
606
+
607
+
608
+ # Final check if total chunks changed
609
+ if chunks and chunks[0]['total_chunks'] != len(chunks):
610
+ logger.warning(f"Adjusting total_chunks from {chunks[0]['total_chunks']} to {len(chunks)}")
611
+ final_num_chunks = len(chunks)
612
+ for i, chunk in enumerate(chunks):
613
+ chunk['total_chunks'] = final_num_chunks
614
+ chunk['chunk_index'] = i # Re-index just in case
615
+
616
 
617
  return chunks
618
  except Exception as e:
619
+ logger.error(f"Error chunking data: {e}", exc_info=True)
620
  return []
621
 
622
+
623
+ def generate_stylish_qr(data: str, # Expecting string data from chunking
624
  filename: str,
625
  size: int = 10,
626
  border: int = 4,
627
  fill_color: str = "#000000",
628
+ back_color: str = "#FFFFFF",
629
+ error_correction_level=qrcode.constants.ERROR_CORRECT_H) -> str: # Added param
630
  """Generate a stylish QR code with enhanced visual appeal"""
631
  try:
632
  qr = qrcode.QRCode(
633
+ version=None, # Auto-detect version
634
+ error_correction=error_correction_level, # Use parameter
635
  box_size=size,
636
  border=border
637
  )
638
 
639
+ # Add string data directly (should be from chunker)
640
+ qr.add_data(data)
 
 
 
641
 
642
+ # Let the library figure out the best version and mode
643
  qr.make(fit=True)
644
 
645
+ logger.info(f"Generating QR code version {qr.version} for {filename} (Payload size: {len(data.encode('utf-8'))} bytes)")
646
+
647
+
648
  # Create QR code image with custom colors
649
  qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
650
 
651
+ # Convert to RGBA for transparency support (optional gradient)
652
  qr_image = qr_image.convert('RGBA')
653
 
654
+ # --- Optional: Add subtle gradient overlay ---
655
+ # gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
656
+ # draw = ImageDraw.Draw(gradient)
657
+ # for i in range(qr_image.width):
658
+ # alpha = int(255 * (1 - i/qr_image.width) * 0.1) # 10% maximum opacity
659
+ # draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
660
+ # final_image = Image.alpha_composite(qr_image, gradient)
661
+ # --- End Optional Gradient ---
662
 
663
+ final_image = qr_image # Use this line if gradient is commented out
 
664
 
665
  # Save the image
666
  output_path = QR_CODES_DIR / filename
667
+ # Ensure directory exists just before saving
668
+ output_path.parent.mkdir(parents=True, exist_ok=True)
669
+
670
+ final_image.save(output_path, quality=95) # PNG quality is lossless, but ok
671
 
672
  return str(output_path)
673
+ # Catch specific data overflow error
674
+ except qrcode.exceptions.DataOverflowError as doe:
675
+ logger.error(f"QR DataOverflowError for {filename}: {doe}. Data length (bytes): {len(data.encode('utf-8'))}. Max capacity likely exceeded for ErrorLevel {error_correction_level}.")
676
+ return "" # Return empty string on failure
677
  except Exception as e:
678
+ logger.error(f"QR generation error for {filename}: {e}", exc_info=True)
679
  return ""
680
 
681
+
682
+ def generate_qr_codes(data_to_encode: Union[str, Dict, List], combine_sources: bool = True) -> List[str]:
683
+ """Generate QR codes, chunking data appropriately."""
684
  try:
685
+ file_processor = EnhancedFileProcessor() # Provides the chunk_data() helper
686
+ all_qr_paths = []
687
+ qr_fill = "#1a365d" # Deep blue
688
+ qr_back = "#ffffff"
689
+ # Decide on error correction level - H is default, M or L allow more data
690
+ error_level = qrcode.constants.ERROR_CORRECT_H # Max correction, lowest capacity
691
+ # error_level = qrcode.constants.ERROR_CORRECT_M # Medium correction, medium capacity
692
+ # error_level = qrcode.constants.ERROR_CORRECT_L # Low correction, max capacity
693
+
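+ # Capacity reference (binary mode, version 40): L β‰ˆ 2953 bytes, M β‰ˆ 2331 bytes,
+ # Q β‰ˆ 1663 bytes, H β‰ˆ 1273 bytes. The chunker's payload size must stay below
+ # the capacity of whichever level is chosen here.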
694
+
695
+ if combine_sources:
696
+ logger.info("Combining all input sources into a single QR sequence.")
697
+ # Combine all data into one large structure (e.g., a list) before chunking
698
+ # This assumes `data_to_encode` is already the combined list/dict from process_inputs
699
+ if not data_to_encode:
700
+ logger.warning("No data provided to generate combined QR codes.")
701
+ return []
702
+
703
+ # Chunk the combined data structure
704
+ chunks = file_processor.chunk_data(data_to_encode) # Chunker expects dict/list/str
705
+ if not chunks:
706
+ logger.error("Chunking the combined data failed.")
707
+ return []
708
+
709
+ num_chunks = len(chunks)
710
+ logger.info(f"Generating {num_chunks} QR codes for combined data.")
711
+ for i, chunk_info in enumerate(chunks):
712
+ # chunk_info contains {'chunk_index', 'total_chunks', 'data', etc.}
713
+ filename = f'combined_qr_{int(time.time())}_{i+1}_of_{num_chunks}.png'
714
+ # Pass the actual payload string to the generator
715
+ qr_payload = chunk_info['data']
716
  qr_path = generate_stylish_qr(
717
+ data=qr_payload,
718
  filename=filename,
719
+ fill_color=qr_fill,
720
+ back_color=qr_back,
721
+ error_correction_level=error_level # Pass level
722
  )
723
  if qr_path:
724
+ all_qr_paths.append(qr_path)
725
+ else:
726
+ logger.error(f"Failed to generate QR code for combined chunk {i+1}")
727
+ # Continue with the remaining chunks; a partial sequence may still be usable
728
+
729
  else:
730
+ # Process each item in the input list individually
731
+ logger.info("Generating separate QR code sequences for each input source.")
732
+ if not isinstance(data_to_encode, list):
733
+ logger.error("Input data must be a list when combine_sources is False.")
734
+ # Maybe wrap it?
735
+ if data_to_encode:
736
+ data_to_encode = [data_to_encode]
737
+ else:
738
+ return []
739
+
740
+
741
+ total_items = len(data_to_encode)
742
+ for item_idx, item in enumerate(data_to_encode):
743
+ item_source_info = f"item {item_idx+1}/{total_items}"
744
+ # Try to get a better name (e.g., from filename if available)
745
+ if isinstance(item, dict) and 'filename' in item:
746
+ item_source_info = item['filename']
747
+ elif isinstance(item, dict) and 'url' in item:
748
+ item_source_info = Path(urlparse(item['url']).path).name or f"url_item_{item_idx+1}"
749
+
750
+ logger.info(f"Processing source: {item_source_info}")
751
+
752
+ # Chunk the individual item
753
+ chunks = file_processor.chunk_data(item)
754
+ if not chunks:
755
+ logger.error(f"Chunking failed for item {item_idx+1} ({item_source_info})")
756
+ continue # Skip to next item
757
+
758
+ num_chunks = len(chunks)
759
+ logger.info(f"Generating {num_chunks} QR codes for {item_source_info}.")
760
+ for chunk_idx, chunk_info in enumerate(chunks):
761
+ # Sanitize source info for filename
762
+ safe_source_name = re.sub(r'[^\w\-]+', '_', item_source_info)
763
+ filename = f'{safe_source_name}_chunk_{chunk_idx+1}_of_{num_chunks}_{int(time.time())}.png'
764
+ qr_payload = chunk_info['data']
765
  qr_path = generate_stylish_qr(
766
+ data=qr_payload,
767
  filename=filename,
768
+ fill_color=qr_fill,
769
+ back_color=qr_back,
770
+ error_correction_level=error_level # Pass level
771
  )
772
  if qr_path:
773
+ all_qr_paths.append(qr_path)
774
+ else:
775
+ logger.error(f"Failed to generate QR code for {item_source_info} chunk {chunk_idx+1}")
776
+
777
+
778
+ logger.info(f"Generated a total of {len(all_qr_paths)} QR codes.")
779
+ return all_qr_paths
780
  except Exception as e:
781
+ logger.error(f"General QR code generation process error: {e}", exc_info=True)
782
  return []
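+ # Example usage (illustrative): two small items combined into one QR sequence.
+ # paths = generate_qr_codes([{"a": 1}, {"b": 2}], combine_sources=True)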
783
 
784
+ def _generate_sequence_visualization_image(qr_paths: List[str], qr_data: List[Dict], title: str = "QR Code Sequence") -> Optional[io.BytesIO]:
785
+ """
786
+ Generates a visual representation of the QR code sequence using NetworkX and Matplotlib.
787
+
788
+ Args:
789
+ qr_paths: List of file paths to the QR code images.
790
+ qr_data: List of decoded data dictionaries, each ideally containing 'chunk_index'.
791
+ title: The title for the visualization plot.
792
+
793
+ Returns:
794
+ A BytesIO buffer containing the PNG image of the visualization, or None if error.
795
+ """
796
+ if not qr_paths or not qr_data or len(qr_paths) != len(qr_data):
797
+ logger.warning("Mismatch or empty data for visualization.")
798
+ return None
799
+
800
+ logger.info(f"Generating visualization for {len(qr_paths)} QR codes.")
801
+ try:
802
+ G = nx.DiGraph()
803
+ node_labels = {}
804
+ node_colors = []
805
+ node_sizes = []
806
+
807
+ # Assume data is pre-sorted by chunk_index during loading
808
+ num_nodes = len(qr_paths)
809
+ total_chunks_from_meta = qr_data[0].get('total_chunks', num_nodes) if qr_data else num_nodes
810
+
811
+ for i in range(num_nodes):
812
+ node_id = i
813
+ # Use chunk_index from metadata if possible, otherwise use list index
814
+ chunk_idx = qr_data[i].get('chunk_index', i)
815
+ label = f"{chunk_idx + 1}/{total_chunks_from_meta}"
816
+ node_labels[node_id] = label
817
+ G.add_node(node_id, path=qr_paths[i], data=qr_data[i])
818
+
819
+ # Add edges between consecutive nodes
820
+ if i > 0:
821
+ G.add_edge(i - 1, i)
822
+
823
+ # Simple coloring/sizing (can be customized further)
824
+ node_colors.append('#4299e1') # Default blue color
825
+ node_sizes.append(1500)
826
+
827
+ if not G.nodes:
828
+ logger.warning("No nodes to visualize.")
829
+ return None
830
+
831
+ # --- Layout and Drawing ---
832
+ plt.figure(figsize=(max(10, num_nodes * 1.5), 5)) # Adjust figure size based on number of nodes
833
+
834
+ # Simple linear layout for sequences is often clearest
835
+ pos = {i: (i * 2, 0) for i in range(num_nodes)} # Horizontal layout
836
+
837
+ # For more complex graphs, consider other layouts:
838
+ # pos = nx.spring_layout(G, k=0.5, iterations=50)
839
+ # pos = nx.kamada_kawai_layout(G)
840
+
841
+ nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors, alpha=0.9)
842
+ nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=20, edge_color='gray', alpha=0.6)
843
+ nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=10, font_color='white')
844
+
845
+ plt.title(title, fontsize=16)
846
+ plt.xlabel("Sequence Index", fontsize=12)
847
+ plt.yticks([]) # Hide Y-axis ticks for linear layout
848
+ plt.xticks(range(0, num_nodes * 2, 2), [f"{i+1}" for i in range(num_nodes)]) # Label X-axis ticks
849
+ plt.box(False) # Remove frame box
850
+ plt.tight_layout()
851
+
852
+ # Save plot to a BytesIO buffer
853
+ buf = io.BytesIO()
854
+ plt.savefig(buf, format='png', bbox_inches='tight', dpi=100)
855
+ plt.close() # Close the plot figure to free memory
856
+ buf.seek(0)
857
+ logger.info("Successfully generated visualization image buffer.")
858
+ return buf
859
+
860
+ except Exception as e:
861
+ logger.error(f"Error generating visualization image: {e}", exc_info=True)
862
+ plt.close() # Ensure plot is closed even on error
863
+ return None
864
+
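+ # Example usage (illustrative sketch, assuming three decoded chunks on disk):
+ # buf = _generate_sequence_visualization_image(
+ # ["q1.png", "q2.png", "q3.png"],
+ # [{"chunk_index": i, "total_chunks": 3} for i in range(3)],
+ # title="Demo sequence")
+ # if buf:
+ # Path("sequence_map.png").write_bytes(buf.getvalue())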
865
+ # --- Gradio Interface Section ---
866
+
867
+ def create_qr_sequence_visualizer(output_gallery_ref): # Pass a reference if needed later
868
  """Add QR sequence visualization capabilities to the application"""
 
869
  with gr.Tab("πŸ”„ QR Sequence Visualizer"):
870
  gr.Markdown("""
871
  ## QR Code Sequence Visualizer
872
+ Upload a sequence of QR codes (e.g., those generated by this app) to decode them and visualize their order.
 
873
  """)
874
 
875
+ # Store data globally within this tab's scope (alternative to Gradio State)
876
+ # This is simpler but not ideal for complex state management
877
+ shared_data = {'qr_paths': [], 'qr_data': []}
878
 
 
879
  with gr.Row():
880
+ with gr.Column(scale=1):
881
+ qr_input = gr.File(
882
+ label="Upload QR Code Images",
883
+ file_types=["image/png", "image/jpeg", ".png", ".jpg", ".jpeg"], # Be explicit
884
+ file_count="multiple"
885
+ )
886
+ visualize_btn = gr.Button("πŸ‘οΈ Decode & Visualize Sequence", variant="primary")
887
+ reset_btn = gr.Button("πŸ—‘οΈ Reset Visualizer", variant="secondary")
888
+ visualization_status = gr.Textbox(label="Status", interactive=False, lines=3)
889
+ # Placeholder for interactive elements (future improvement)
890
+ # qr_toggles_container = gr.HTML(label="QR Code Controls (Future)")
891
 
892
+ with gr.Column(scale=2):
893
+ qr_visualization = gr.Image(label="QR Code Sequence Map", type="pil", height=400) # Use PIL type
894
+ qr_preview = gr.Gallery(label="Uploaded QR Codes (Sorted)", columns=4, height=400, object_fit="contain", preview=True)
895
 
896
+
897
+ def process_qr_codes_and_visualize(files):
898
+ """Decodes QR files, sorts them, updates gallery, and generates visualization."""
899
  if not files:
900
+ shared_data['qr_paths'] = []
901
+ shared_data['qr_data'] = []
902
+ return "Please upload QR code images.", None, None, "⚠️ No QR codes uploaded."
903
+
904
+ logger.info(f"Processing {len(files)} uploaded QR files for visualization.")
905
+ qr_data_list = []
906
+ qr_path_list = []
907
+ decode_errors = 0
908
 
909
+ # Use OpenCV's QR detector (assumes `import cv2` / opencv-python at module level)
910
  try:
911
+ detector = cv2.QRCodeDetector()
912
+ except NameError:
913
+ logger.error("cv2 (opencv-python) is not available; it is required for QR decoding.")
914
+ return None, None, "❌ Library Error: OpenCV (cv2) is required to decode QR codes."
915
+ except Exception as init_e:
916
+ logger.error(f"Error initializing QR detector: {init_e}")
917
+ return None, None, f"❌ Detector init error: {init_e}"
918
 
919
 
920
+ for file in files:
921
+ try:
922
+ img_path = file.name # Gradio File object path
923
+ img = Image.open(img_path)
924
+ img_np = np.array(img.convert('RGB')) # Detector often prefers RGB
925
+
926
+ # Try to decode QR code
927
+ data, bbox, straight_qrcode = detector.detectAndDecode(img_np)
928
+
929
+ if data:
930
+ logger.debug(f"Decoded data from {os.path.basename(img_path)}: {data[:50]}...")
931
+ # Try parsing the decoded data as JSON (expected format from generator)
932
  try:
933
+ qr_metadata = json.loads(data)
934
+ # Check if it looks like our chunk format
935
+ if isinstance(qr_metadata, dict) and 'chunk_index' in qr_metadata and 'total_chunks' in qr_metadata:
936
+ qr_data_list.append(qr_metadata)
937
+ qr_path_list.append(img_path)
938
  else:
939
+ # Valid JSON, but not the expected chunk structure
940
+ logger.warning(f"Decoded valid JSON, but not expected format from {os.path.basename(img_path)}")
941
+ qr_data_list.append({"data": qr_metadata, "chunk_index": -1}) # Assign default index
942
+ qr_path_list.append(img_path)
943
+
944
+ except json.JSONDecodeError:
945
+ # Data decoded, but not JSON - store raw data
946
+ logger.warning(f"Could not decode JSON from QR data in {os.path.basename(img_path)}. Storing raw.")
947
+ qr_data_list.append({"data": data, "chunk_index": -1}) # Assign default index
948
+ qr_path_list.append(img_path)
949
+ except Exception as json_e:
950
+ logger.error(f"Error processing decoded JSON from {os.path.basename(img_path)}: {json_e}")
951
+ qr_data_list.append({"data": f"Error: {json_e}", "chunk_index": -1})
952
+ qr_path_list.append(img_path)
953
+ decode_errors += 1
954
+ else:
955
+ # QR code detected, but no data decoded (or detection failed)
956
+ logger.warning(f"Could not decode data from QR image: {os.path.basename(img_path)}")
957
+ qr_data_list.append({"data": "[DECODE FAILED]", "chunk_index": -1})
958
+ qr_path_list.append(img_path)
959
+ decode_errors += 1
960
 
961
+ except Exception as e:
962
+ logger.error(f"Error processing QR image file {os.path.basename(getattr(file, 'name', 'N/A'))}: {e}", exc_info=True)
963
+ # Count the failure; a placeholder entry could be appended here if desired
964
+ decode_errors += 1
965
 
966
+ if not qr_path_list:
967
+ shared_data['qr_paths'] = []
968
+ shared_data['qr_data'] = []
969
+ return "No valid QR codes could be processed or decoded.", None, None, "❌ Failed to process/decode QR codes"
970
 
971
+ # Attempt to sort by chunk_index (handle missing index gracefully)
972
+ try:
973
+ # Create tuples (index, data, path) for sorting
974
+ indexed_items = []
975
+ for i, (data, path) in enumerate(zip(qr_data_list, qr_path_list)):
976
+ # Use provided chunk_index, fallback to list index if missing or invalid (-1)
977
+ sort_key = data.get('chunk_index', i)
978
+ if not isinstance(sort_key, int) or sort_key < 0:
979
+ sort_key = i # Fallback to original order for this item
980
+ indexed_items.append((sort_key, data, path))
981
+
982
+ # Sort based on the index key
983
+ indexed_items.sort(key=lambda x: x[0])
984
+
985
+ # Unpack sorted lists
986
+ sorted_qr_data = [item[1] for item in indexed_items]
987
+ sorted_qr_paths = [item[2] for item in indexed_items]
988
+
989
+ # Update shared data
990
+ shared_data['qr_paths'] = sorted_qr_paths
991
+ shared_data['qr_data'] = sorted_qr_data
992
+ logger.info("Successfully sorted QR data based on chunk_index.")
993
 
 
994
  except Exception as e:
995
+ logger.error(f"Error sorting QR data: {e}. Using original order.")
996
+ # Use original order if sorting fails
997
+ shared_data['qr_paths'] = qr_path_list
998
+ shared_data['qr_data'] = qr_data_list
999
+
1000
+ # Generate the visualization image using the helper function
1001
+ # Use the sorted data stored in shared_data
1002
+ visualization_image_buffer = _generate_sequence_visualization_image(
1003
+ shared_data['qr_paths'],
1004
+ shared_data['qr_data'],
1005
+ title=f"Visualized Sequence ({len(shared_data['qr_paths'])} Codes)"
1006
+ )
1007
+
1008
+ # Convert buffer to PIL Image for Gradio output if necessary
1009
+ vis_image_pil = None
1010
+ if visualization_image_buffer:
1011
+ try:
1012
+ vis_image_pil = Image.open(visualization_image_buffer)
1013
+ except Exception as img_e:
1014
+ logger.error(f"Failed to load visualization buffer into PIL Image: {img_e}")
1015
+
1016
 
1017
+ status_message = f"Processed {len(shared_data['qr_paths'])} QR codes."
1018
+ if decode_errors > 0:
1019
+ status_message += f" ({decode_errors} decode errors)"
1020
+ status_message += "\nSequence visualized." if vis_image_pil else "\nVisualization generation failed."
1021
+ final_status = "βœ… Done" if vis_image_pil else "⚠️ Errors Occurred"
1022
+
1023
+
1024
+ # Update outputs: Gallery with sorted paths, Image with visualization, Status text
1025
+ # The gallery expects a list of image paths or PIL images
1026
+ gallery_output = shared_data['qr_paths']
1027
+
1028
+ return gallery_output, vis_image_pil, status_message
1029
+
1030
+
1031
+ def reset_visualizer_state():
1032
+ shared_data['qr_paths'] = []
1033
+ shared_data['qr_data'] = []
1034
+ logger.info("Resetting QR visualizer state.")
1035
+ return None, None, None, "βšͺ Visualizer Reset. Upload new QR codes."
1036
 
1037
  # Event handlers
1038
+ visualize_btn.click(
1039
+ process_qr_codes_and_visualize,
1040
+ inputs=[qr_input],
1041
+ outputs=[qr_preview, qr_visualization, visualization_status] # Gallery, sequence map image, status text
1046
+ ).then(
1047
+ lambda: logger.info("Visualization process complete."), inputs=None, outputs=None
1048
+ )
1049
+
1050
+
1051
+ reset_btn.click(
1052
+ reset_visualizer_state,
1053
+ inputs=[],
1054
+ outputs=[qr_preview, qr_visualization, qr_input, visualization_status] # Clear gallery, image, file input, status
1055
+ )
1056
 
1057
  def create_modern_interface():
1058
  """Create a modern and visually appealing Gradio interface"""
1059
 
1060
+ # Modern CSS styling
1061
  css = """
1062
  /* Modern color scheme */
1063
  :root {
 
1112
  /* Gallery styling */
1113
  .gallery {
1114
  display: grid;
1115
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); /* responsive tiles, at least 150px wide */
1116
  gap: 1rem;
1117
  padding: 1rem;
1118
  background-color: white;
1119
  border-radius: 0.5rem;
1120
  border: 1px solid #e2e8f0;
1121
+ min-height: 150px; /* Ensure gallery has some height */
1122
  }
1123
  .gallery img {
1124
  width: 100%;
1125
  height: auto;
1126
+ object-fit: contain; /* Use contain to avoid stretching */
1127
  border-radius: 0.375rem;
1128
  transition: transform 0.2s;
1129
+ border: 1px solid #eee; /* Add subtle border */
1130
  }
1131
  .gallery img:hover {
1132
  transform: scale(1.05);
1133
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1); /* Add hover shadow */
1134
  }
1135
  """
1136
  # Create interface with modern design
 
1139
  # 🌐 Advanced Data Processing & QR Code Generator
1140
  Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
1141
  """)
1142
+
1143
  with gr.Row():
1144
+ with gr.Column(scale=2):
1145
+ # Input Tabs
1146
+ with gr.Tabs():
1147
+ with gr.TabItem("πŸ“ URL Input"):
1148
+ url_input = gr.Textbox(
1149
+ label="Enter URLs (one per line or comma-separated)",
1150
+ lines=5,
1151
+ placeholder="https://example1.com\nhttps://example2.com",
1152
+ elem_id="url-input"
1153
+ )
1154
+ with gr.TabItem("πŸ“ File Input"):
1155
+ file_input = gr.File(
1156
+ label="Upload Files (Text, JSON, Archives: zip, tar, gz, bz2)",
1157
+ file_count="multiple",
1158
+ # No file_types restriction here; backend logic validates. To restrict uploads, use e.g.:
1159
+ # file_types=[".txt", ".json", ".csv", ".md", ".xml", ".html", ".zip", ".tar", ".gz", ".bz2"]
1160
+ elem_id="file-input"
1161
+ )
1162
+ with gr.TabItem("πŸ“‹ Direct Input / JSON"):
1163
+ text_input = gr.TextArea(
1164
+ label="Direct Text/JSON Input",
1165
+ lines=10,
1166
+ placeholder="Paste your text or JSON data here...",
1167
+ elem_id="text-input"
1168
+ )
1169
+ with gr.Row():
1170
+ example_btn = gr.Button("πŸ“ Load JSON Example")
1171
+ clear_btn = gr.Button("πŸ—‘οΈ Clear Input")
1172
+
1173
+ # Processing Options & Button
1174
+ with gr.Row():
1175
+ combine_data = gr.Checkbox(
1176
+ label="Combine all inputs into one sequence",
1177
+ value=True, # Default to combined
1178
+ info="If unchecked, each URL/File/Input generates its own QR sequence."
1179
+ )
1180
+ process_btn = gr.Button(
1181
+ "πŸ”„ Process & Generate QR Codes",
1182
+ variant="primary",
1183
+ elem_id="process-button"
1184
+ )
1185
+
1186
+ # Status Output
1187
+ output_text = gr.Textbox(
1188
+ label="Processing Status",
1189
+ interactive=False,
1190
+ lines=2,
1191
+ elem_id="status-output"
1192
+ )
1193
+
1194
+
1195
+ with gr.Column(scale=3):
1196
+ # Output Area
1197
+ gr.Markdown("### Results")
1198
+ with gr.Tabs():
1199
+ with gr.TabItem("πŸ–ΌοΈ QR Codes"):
1200
+ output_gallery = gr.Gallery(
1201
+ label="Generated QR Codes",
1202
+ columns=4, # Adjust columns as needed
1203
+ height=500, # Adjust height
1204
+ object_fit="contain",
1205
+ preview=True, # Enable preview click
1206
+ elem_id="qr-gallery"
1207
+ )
1208
+ with gr.TabItem("πŸ“„ Processed Data (JSON)"):
1209
+ output_json = gr.JSON(
1210
+ label="Processed Data Structure",
1211
+ elem_id="json-output"
1212
+ )
1213
 
1214
  # Load example data
1215
  def load_example():
1216
  example = {
1217
+ "project": "Data Transfer Example",
1218
+ "version": 1.1,
1219
  "items": [
1220
+ {"id": "A001", "name": "Item One", "value": 123.45, "tags": ["tag1", "tag2"]},
1221
+ {"id": "B002", "name": "Item Two", "value": 67.89, "enabled": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1222
  ],
1223
+ "timestamp": datetime.now().isoformat()
 
 
 
 
1224
  }
1225
  return json.dumps(example, indent=2)
1226
 
1227
+ def clear_input_area():
1228
+ # Clear only the direct text input area
1229
  return ""
1230
 
1231
+ # --- Main Processing Function ---
1232
+ def process_inputs_and_generate_qrs(urls, files, text, combine):
1233
+ """Process all inputs, combine if requested, and generate QR codes."""
1234
+ start_time = time.time()
1235
+ logger.info("Starting data processing...")
1236
+ status_updates = []
1237
+ all_processed_data = [] # List to hold results from all sources
1238
+
1239
+ url_processor = EnhancedURLProcessor()
1240
+ file_processor = EnhancedFileProcessor()
1241
+
1242
+ # 1. Process URLs
1243
+ if urls and urls.strip():
1244
+ url_list = re.split(r'[,\n]+', urls) # Split by comma or newline, handle multiple newlines
1245
+ url_list = [u.strip() for u in url_list if u.strip()] # Clean up
1246
+ status_updates.append(f"Processing {len(url_list)} URLs...")
1247
+ logger.info(f"Processing URLs: {url_list}")
1248
+ for i, url in enumerate(url_list):
1249
+ logger.info(f"Processing URL {i+1}/{len(url_list)}: {url}")
1250
+ # Basic validation before fetching
1251
+ if not validators.url(url):
1252
+ logger.warning(f"Skipping invalid URL format: {url}")
1253
+ status_updates.append(f"⚠️ Skipped invalid URL: {url[:50]}...")
1254
+ all_processed_data.append({'error': 'Invalid URL format', 'url': url})
1255
+ continue
1256
+
1257
+ content_data = url_processor.fetch_content(url)
1258
+ if content_data and 'content' in content_data:
1259
+ logger.info(f"Successfully fetched content from {url} ({len(content_data.get('raw_content',''))} bytes)")
1260
+ # Structure the result similarly to file processing output
1261
+ processed_url_data = {
1262
+ 'source': 'url',
1263
+ 'url': url,
1264
+ 'content': content_data['content'], # Processed text content
1265
+ 'raw_content': content_data['raw_content'], # Raw response body
1266
+ 'metadata': content_data['metadata'], # Headers, status, etc.
1267
+ 'timestamp': datetime.now().isoformat()
1268
+ }
1269
+ all_processed_data.append(processed_url_data)
1270
+ status_updates.append(f"βœ“ Fetched: {url[:60]}...")
1271
+ else:
1272
+ logger.error(f"Failed to fetch content from URL: {url}")
1273
+ status_updates.append(f"❌ Failed fetch: {url[:60]}...")
1274
+ all_processed_data.append({'error': 'Failed to fetch content', 'url': url})
1275
+
1276
+ # 2. Process Files
1277
+ if files:
1278
+ status_updates.append(f"Processing {len(files)} uploaded files...")
1279
+ logger.info(f"Processing {len(files)} files.")
1280
+ for i, file_obj in enumerate(files):
1281
+ logger.info(f"Processing file {i+1}/{len(files)}: {getattr(file_obj, 'name', 'N/A')}")
1282
+ try:
1283
+ # Pass the Gradio file object directly to process_file
1284
+ file_results = file_processor.process_file(file_obj)
1285
+ if file_results:
1286
+ all_processed_data.extend(file_results)
1287
+ # Get filename safely from results (might be multiple from archive)
1288
+ processed_filenames = [res.get('filename', 'N/A') for res in file_results]
1289
+ status_updates.append(f"βœ“ Processed file(s): {', '.join(processed_filenames)}")
1290
+ logger.info(f"Successfully processed file(s): {', '.join(processed_filenames)}")
1291
+ else:
1292
+ status_updates.append(f"⚠️ No data extracted from file: {getattr(file_obj, 'name', 'N/A')}")
1293
+ logger.warning(f"No data extracted from file: {getattr(file_obj, 'name', 'N/A')}")
1294
+ # Add placeholder error if desired
1295
+ # all_processed_data.append({'error': 'No data extracted', 'filename': getattr(file_obj, 'name', 'N/A')})
1296
+
1297
+ except Exception as file_proc_err:
1298
+ file_name = getattr(file_obj, 'name', 'N/A')
1299
+ logger.error(f"Error processing file {file_name}: {file_proc_err}", exc_info=True)
1300
+ status_updates.append(f"❌ Error processing file: {file_name}")
1301
+ all_processed_data.append({'error': f'File processing error: {file_proc_err}', 'filename': file_name})
1302
+
1303
+
1304
+ # 3. Process Direct Text/JSON Input
1305
+ if text and text.strip():
1306
+ status_updates.append("Processing direct input...")
1307
+ logger.info("Processing direct text/JSON input.")
1308
+ # Attempt to parse as JSON first
1309
+ try:
1310
+ json_data = json.loads(text)
1311
+ logger.info("Direct input parsed as JSON.")
1312
+ processed_text_data = {
1313
+ 'source': 'direct_json',
1314
+ 'content': json_data, # Parsed JSON object/list
1315
+ 'raw_content': text, # Original string
1316
+ 'timestamp': datetime.now().isoformat()
1317
+ }
1318
+ all_processed_data.append(processed_text_data)
1319
+ status_updates.append("βœ“ Processed direct input as JSON.")
1320
+ except json.JSONDecodeError:
1321
+ # If not JSON, treat as plain text
1322
+ logger.info("Direct input treated as plain text.")
1323
+ processed_text_data = {
1324
+ 'source': 'direct_text',
1325
+ 'content': text, # Store as plain text
1326
+ 'timestamp': datetime.now().isoformat()
1327
+ }
1328
+ all_processed_data.append(processed_text_data)
1329
+ status_updates.append("βœ“ Processed direct input as Text.")
1330
+ except Exception as direct_input_err:
1331
+ logger.error(f"Error processing direct input: {direct_input_err}", exc_info=True)
1332
+ status_updates.append(f"❌ Error processing direct input.")
1333
+ all_processed_data.append({'error': f'Direct input error: {direct_input_err}', 'source': 'direct_input'})
1334
+
1335
+
1336
+ # 4. Check if any data was processed
1337
+ if not all_processed_data:
1338
+ logger.warning("No valid data sources found or processed.")
1339
+ status_updates.append("⚠️ No data to process. Please provide input.")
1340
+ final_status = "\n".join(status_updates)
1341
+ return None, [], final_status # Return empty results
1342
+
1343
+ logger.info(f"Total processed data items: {len(all_processed_data)}")
1344
+ status_updates.append(f"Data processed ({len(all_processed_data)} items). Generating QR codes...")
1345
+
1346
+ # 5. Generate QR Codes
1347
+ qr_paths = []
1348
  try:
1349
+ # Pass the list of processed data items
1350
+ qr_paths = generate_qr_codes(all_processed_data, combine)
1351
+ if qr_paths:
1352
+ status_updates.append(f"βœ“ Generated {len(qr_paths)} QR codes.")
1353
+ logger.info(f"Successfully generated {len(qr_paths)} QR codes.")
1354
+ else:
1355
+ status_updates.append("❌ QR code generation failed or produced no codes.")
1356
+ logger.error("QR code generation returned no paths.")
1357
+ # Keep processed data, but gallery will be empty
1358
 
1359
+ except Exception as qr_gen_err:
1360
+ logger.error(f"Error during QR code generation step: {qr_gen_err}", exc_info=True)
1361
+ status_updates.append(f"❌ Error generating QR codes: {qr_gen_err}")
1362
+ # Keep processed data, gallery will be empty
1363
+
1364
+
1365
+ # 6. Finalize and Return
1366
+ end_time = time.time()
1367
+ processing_time = end_time - start_time
1368
+ status_updates.append(f"Total processing time: {processing_time:.2f} seconds.")
1369
+ final_status = "\n".join(status_updates)
1370
+
1371
+ # Return processed data (for JSON view), QR paths (for Gallery), and status string
1372
+ # Ensure qr_paths is a list of strings
1373
+ qr_paths_str = [str(p) for p in qr_paths] if qr_paths else []
1374
+
1375
+ # Return data for JSON output, gallery paths, and status text
1376
+ return all_processed_data, qr_paths_str, final_status
1377
+
1378
+
1379
+ # --- Event Handlers ---
1380
  example_btn.click(load_example, outputs=[text_input])
1381
+ clear_btn.click(clear_input_area, outputs=[text_input])
1382
+
1383
  process_btn.click(
1384
+ process_inputs_and_generate_qrs,
1385
  inputs=[url_input, file_input, text_input, combine_data],
1386
+ outputs=[output_json, output_gallery, output_text] # Match function return order
1387
  )
1388
 
1389
+ # Add helpful documentation
1390
  gr.Markdown("""
1391
  ### πŸš€ Features
1392
+ - **Complete URL Scraping**: Extracts text content from web pages.
1393
+ - **Advanced File Processing**: Handles text, JSON, and archives (.zip, .tar.*, .gz, .bz2). Attempts intelligent JSON detection.
1394
+ - **Direct Input**: Paste text or JSON directly.
1395
+ - **Sequential QR Codes**: Chunks large data and embeds sequencing info. Option to combine inputs.
1396
+ - **Modern Design**: Clean, responsive interface.
1397
  ### πŸ’‘ Tips
1398
+ 1. **Inputs**: Use any combination of URL, File, or Direct Input tabs.
1399
+ 2. **Combine**: Check 'Combine all inputs' to create one QR sequence from all sources. Uncheck to get separate QR sequences for each source.
1400
+ 3. **Files**: Upload text-based files, JSON, or supported archives. Content from archives is extracted and processed.
1401
+ 4. **JSON**: Use the example button or upload a `.json` file. The app also tries to parse `.txt` or other files as JSON if they contain valid JSON structure.
1402
+ 5. **Status**: Monitor the Processing Status box for feedback.
1403
  ### 🎨 Output
1404
+ - Generated QR codes appear in the 'QR Codes' tab and are saved in the `output/qr_codes` directory.
1405
+ - The structured data processed from all inputs is shown in the 'Processed Data (JSON)' tab.
1406
+ - Hover over or click QR codes in the gallery for a larger preview.
1407
+ """)
1408
+ return interface
1409
 
1410
  def main():
1411
  """Initialize and launch the application"""
1412
  try:
1413
+ # Configure system settings if needed
1414
+ mimetypes.init() # Ensure mime types are loaded
1415
 
1416
+ logger.info("Starting Gradio application...")
1417
  # Create and launch interface
1418
  interface = create_modern_interface()
1419
 
1420
+ # Add the QR sequence visualizer tab (if function is defined and needed)
1421
+ # with interface:
1422
+ # create_qr_sequence_visualizer(None) # Pass relevant components if needed
1423
 
1424
  # Launch with configuration
1425
  interface.launch(
1426
+ share=False, # Set to True for public link (use with caution)
1427
+ debug=False, # Set to True for more verbose Gradio errors
1428
+ show_error=True, # Show Python errors in browser console
1429
+ # server_name="0.0.0.0", # Bind to all interfaces if needed for Docker/network access
1430
+ # server_port=7860, # Specify port if needed
1431
+ show_api=False # Disable default Gradio API endpoint unless needed
1432
  )
1433
+ logger.info("Gradio application stopped.")
1434
  except Exception as e:
1435
+ logger.error(f"Application startup or runtime error: {e}", exc_info=True)
1436
  raise
1437
 
1438
  if __name__ == "__main__":
1439
+ # Ensure output directories exist before starting
1440
+ OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
1441
+ QR_CODES_DIR.mkdir(parents=True, exist_ok=True)
1442
+ TEMP_DIR.mkdir(parents=True, exist_ok=True)
1443
  main()