acecalisto3 committed on
Commit
f29606a
·
verified ·
1 Parent(s): 0fd3092

Update app2.py

Files changed (1)
  1. app2.py +785 -283
app2.py CHANGED
@@ -7,8 +7,11 @@ import mimetypes
7
  import zipfile
8
  import tempfile
9
  import chardet
10
  from datetime import datetime
11
- from typing import List, Dict, Optional, Union, Tuple
12
  from pathlib import Path
13
  from urllib.parse import urlparse, urljoin
14
  import requests
@@ -25,6 +28,38 @@ import tarfile
25
  import gzip
26
  import math
27
 
28
  # Setup enhanced logging with more detailed formatting
29
  logging.basicConfig(
30
  level=logging.INFO,
@@ -43,7 +78,7 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
43
  directory.mkdir(parents=True, exist_ok=True)
44
 
45
  class EnhancedURLProcessor:
46
- """Advanced URL processing with complete content extraction"""
47
  def __init__(self):
48
  self.session = requests.Session()
49
  self.timeout = 15 # Extended timeout for larger content
@@ -53,11 +88,11 @@ class EnhancedURLProcessor:
53
  # Enhanced headers for better site compatibility
54
  self.session.headers.update({
55
  'User-Agent': self.user_agent.random,
56
- 'Accept': '*/*', # Accept all content types
57
  'Accept-Language': 'en-US,en;q=0.9',
58
  'Accept-Encoding': 'gzip, deflate, br',
59
  'Connection': 'keep-alive',
60
- 'Upgrade-Insecure-Requests': '1',
61
  'Sec-Fetch-Dest': 'document',
62
  'Sec-Fetch-Mode': 'navigate',
63
  'Sec-Fetch-Site': 'none',
@@ -77,15 +112,18 @@ class EnhancedURLProcessor:
77
  try:
78
  head_response = self.session.head(url, timeout=5)
79
  head_response.raise_for_status()
 
80
  except requests.exceptions.RequestException:
81
- # If HEAD fails, try GET as some servers don't support HEAD
82
  response = self.session.get(url, timeout=self.timeout)
83
  response.raise_for_status()
 
84
 
85
  return {
86
  'is_valid': True,
87
  'message': 'URL is valid and accessible',
88
  'details': {
 
89
  'content_type': head_response.headers.get('Content-Type', 'unknown'),
90
  'server': head_response.headers.get('Server', 'unknown'),
91
  'size': head_response.headers.get('Content-Length', 'unknown')
@@ -104,23 +142,38 @@ class EnhancedURLProcessor:
104
 
105
  response = self.session.get(url, timeout=self.timeout)
106
  response.raise_for_status()
 
107
 
108
  # Detect encoding
109
- if response.encoding is None:
110
- encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
 
 
111
  else:
112
  encoding = response.encoding
 
 
113
  # Decode content with fallback
114
  try:
115
  raw_content = response.content.decode(encoding, errors='replace')
116
  except (UnicodeDecodeError, LookupError):
117
- raw_content = response.content.decode('utf-8', errors='replace')
 
 
118
 
119
  # Extract metadata
120
  metadata = {
121
- 'url': url,
 
122
  'timestamp': datetime.now().isoformat(),
123
- 'encoding': encoding,
124
  'content_type': response.headers.get('Content-Type', ''),
125
  'content_length': len(response.content),
126
  'headers': dict(response.headers),
@@ -128,271 +181,636 @@ class EnhancedURLProcessor:
128
  }
129
 
130
  # Process based on content type
131
- content_type = response.headers.get('Content-Type', '').lower()
132
- if 'text/html' in content_type:
133
- processed_content = self._process_html_content(raw_content, url)
134
- else:
135
- processed_content = raw_content
136
  return {
137
- 'content': processed_content,
 
138
  'raw_content': raw_content,
139
- 'metadata': metadata
 
 
140
  }
141
  except requests.exceptions.RequestException as e:
142
  if retry_count < self.max_retries - 1:
143
  logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
144
  time.sleep(2 ** retry_count) # Exponential backoff
145
  return self.fetch_content(url, retry_count + 1)
146
- logger.error(f"Failed to fetch content after {self.max_retries} attempts: {e}")
147
- return None
 
 
148
  except Exception as e:
149
- logger.error(f"Unexpected error while fetching content: {e}")
150
- return None
 
 
 
151
 
152
- def _process_html_content(self, content: str, base_url: str) -> str:
153
- """Process HTML content while preserving all characters"""
 
 
154
  try:
155
  soup = BeautifulSoup(content, 'html.parser')
156
 
157
- # Convert relative URLs to absolute
158
- for tag in soup.find_all(['a', 'img', 'link', 'script']):
159
- for attr in ['href', 'src']:
160
- if tag.get(attr):
161
- try:
162
- tag[attr] = urljoin(base_url, tag[attr])
163
- except Exception:
164
- pass
165
- # Extract all text content
 
 
166
  text_parts = []
167
- for element in soup.stripped_strings:
168
- text_parts.append(str(element))
169
- return '\n'.join(text_parts)
 
 
170
  except Exception as e:
171
- logger.error(f"HTML processing error: {e}")
172
- return content
 
 
 
173
 
174
  class EnhancedFileProcessor:
175
- """Advanced file processing with complete content extraction"""
176
  def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default
177
  self.max_file_size = max_file_size
 
178
  self.supported_extensions = {
179
  '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
180
  '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
 
 
181
  '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
182
- '.pdf', '.doc', '.docx', '.rtf', '.odt'
183
  }
 
 
184
 
185
  def process_file(self, file) -> List[Dict]:
186
  """Process uploaded file with enhanced error handling and complete extraction"""
187
- if not file:
 
188
  return []
189
 
190
  dataset = []
 
 
191
  try:
192
- file_size = os.path.getsize(file.name)
193
  if file_size > self.max_file_size:
194
- logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
195
- return []
 
 
196
 
197
  with tempfile.TemporaryDirectory() as temp_dir:
198
  temp_dir_path = Path(temp_dir)
199
 
200
- # Handle different archive types
201
- if self._is_archive(file.name):
202
- dataset.extend(self._process_archive(file.name, temp_dir_path))
203
- elif Path(file.name).suffix.lower() in self.supported_extensions:
204
- dataset.extend(self._process_single_file(file))
 
205
  else:
206
- logger.warning(f"Unsupported file type: {file.name}")
 
 
207
 
208
  except Exception as e:
209
- logger.error(f"Error processing file: {str(e)}")
210
- return []
 
 
211
  return dataset
212
 
213
- def _is_archive(self, filepath: str) -> bool:
214
  """Check if file is an archive"""
215
- return any(filepath.lower().endswith(ext) for ext in [
216
- '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
217
- ])
 
 
218
 
219
- def _process_single_file(self, file) -> List[Dict]:
220
- """Process a single file with enhanced character extraction and JSON handling"""
221
  try:
222
- file_stat = os.stat(file.name)
223
- file_size = file_stat.st_size
224
-
225
- # Initialize content storage
226
- content_parts = []
227
-
228
- # Process file in chunks for large files
229
- chunk_size = 10 * 1024 * 1024 # 10MB chunks
230
- with open(file.name, 'rb') as f:
231
- while True:
232
- chunk = f.read(chunk_size)
233
- if not chunk:
234
- break
235
-
236
- # Detect encoding for each chunk
237
- encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
238
- try:
239
- decoded_chunk = chunk.decode(encoding, errors='replace')
240
- content_parts.append(decoded_chunk)
241
- except (UnicodeDecodeError, LookupError):
242
- decoded_chunk = chunk.decode('utf-8', errors='replace')
243
- content_parts.append(decoded_chunk)
244
 
245
- # Combine all chunks
246
- complete_content = ''.join(content_parts)
 
 
247
 
248
- # Check if the content is valid JSON regardless of file extension
249
- try:
250
- if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
251
- # It's a JSON file by type or extension
252
- json_data = json.loads(complete_content)
253
- return [{
254
- 'source': 'json_file',
255
- 'filename': os.path.basename(file.name),
256
- 'file_size': file_size,
257
- 'mime_type': 'application/json',
258
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
259
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
260
- 'content': json_data, # Store the parsed JSON object
261
- 'raw_content': complete_content, # Store the original JSON string
262
- 'timestamp': datetime.now().isoformat()
263
- }]
264
- else:
265
- # Try to parse as JSON anyway
266
- try:
267
- json_data = json.loads(complete_content)
268
- # If we get here, it's valid JSON despite the extension
269
- return [{
270
- 'source': 'json_content',
271
- 'filename': os.path.basename(file.name),
272
- 'file_size': file_size,
273
- 'mime_type': 'application/json',
274
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
275
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
276
- 'content': json_data, # Store the parsed JSON object
277
- 'raw_content': complete_content, # Store the original JSON string
278
- 'timestamp': datetime.now().isoformat()
279
- }]
280
- except json.JSONDecodeError:
281
- logger.warning(f"File {file.name} is not valid JSON.")
282
- except Exception as e:
283
- logger.error(f"Error during JSON processing: {e}")
284
 
285
- return [{
286
- 'source': 'file',
287
- 'filename': os.path.basename(file.name),
288
- 'file_size': file_size,
289
- 'mime_type': mimetypes.guess_type(file.name)[0],
290
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
291
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
292
- 'content': complete_content,
293
- 'timestamp': datetime.now().isoformat()
294
- }]
295
  except Exception as e:
296
- logger.error(f"File processing error: {e}")
297
- return []
 
 
298
 
299
- def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
 
300
  """Process an archive file with enhanced extraction"""
301
  dataset = []
 
 
 
302
  try:
303
- # Handle ZIP archives
304
- if zipfile.is_zipfile(archive_path):
305
- with zipfile.ZipFile(archive_path, 'r') as zip_ref:
306
- zip_ref.extractall(extract_to)
307
- for file_info in zip_ref.infolist():
308
- if file_info.file_size > 0 and not file_info.filename.endswith('/'):
309
- extracted_path = extract_to / file_info.filename
310
- if extracted_path.suffix.lower() in self.supported_extensions:
311
- with open(extracted_path, 'rb') as f:
312
- dataset.extend(self._process_single_file(f))
313
- # Handle TAR archives
314
- elif archive_path.lower().endswith(('.tar', '.tar.gz', '.tgz')):
 
 
315
  try:
316
- with tarfile.open(archive_path, 'r:*') as tar_ref:
 
 
317
  for member in tar_ref.getmembers():
318
  if member.isfile():
319
- extracted_path = extract_to / member.name
320
- tar_ref.extract(member, path=extract_to)
321
- if extracted_path.suffix.lower() in self.supported_extensions:
322
- with open(extracted_path, 'rb') as f:
323
- dataset.extend(self._process_single_file(f))
 
 
324
  except tarfile.TarError as e:
325
- logger.error(f"Error processing TAR archive: {e}")
326
- # Handle GZIP archives (single file)
327
- elif archive_path.lower().endswith('.gz'):
328
- extracted_path = extract_to / Path(archive_path).stem
329
- try:
330
- with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
331
- outfile.write(gz_file.read())
332
- if extracted_path.suffix.lower() in self.supported_extensions:
333
- with open(extracted_path, 'rb') as f:
334
- dataset.extend(self._process_single_file(f))
335
- except gzip.GzipFile as e:
336
- logger.error(f"Error processing GZIP archive: {e}")
337
- # TODO: Add support for other archive types (.bz2, .7z, .rar) - may require external libraries
338
- elif archive_path.lower().endswith(('.bz2', '.7z', '.rar')):
339
- logger.warning(f"Support for {Path(archive_path).suffix} archives is not yet fully implemented.")
 
 
340
 
341
  except Exception as e:
342
- logger.error(f"Archive processing error: {e}")
 
 
343
  return dataset
344
 
345
  def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
346
  """Enhanced data chunking with sequence metadata"""
347
  try:
348
  # Convert data to JSON string
349
- json_str = json.dumps(data, ensure_ascii=False)
 
350
  total_length = len(json_str)
351
 
352
  # Calculate overhead for metadata
 
353
  metadata_template = {
354
- "chunk_index": 0,
355
- "total_chunks": 1,
356
- "total_length": total_length,
357
- "chunk_hash": "",
358
- "data": ""
359
  }
360
- overhead = len(json.dumps(metadata_template)) + 20 # Extra padding for safety
 
 
361
 
362
  # Calculate effective chunk size
363
- effective_chunk_size = max_size - overhead
 
 
364
 
365
  if total_length <= effective_chunk_size:
366
  # Data fits in one chunk
 
 
367
  chunk = {
368
- "chunk_index": 0,
369
- "total_chunks": 1,
370
- "total_length": total_length,
371
- "chunk_hash": hash(json_str) & 0xFFFFFFFF, # 32-bit hash
372
- "data": json_str
373
  }
374
  return [chunk]
375
 
376
  # Calculate number of chunks needed
377
  num_chunks = -(-total_length // effective_chunk_size) # Ceiling division
378
- chunk_size = -(-total_length // num_chunks) # Even distribution
 
379
 
380
  chunks = []
 
381
  for i in range(num_chunks):
382
- start_idx = i * chunk_size
383
- end_idx = min(start_idx + chunk_size, total_length)
384
- chunk_data = json_str[start_idx:end_idx]
 
 
385
 
386
  chunk = {
387
- "chunk_index": i,
388
- "total_chunks": num_chunks,
389
- "total_length": total_length,
390
- "chunk_hash": hash(chunk_data) & 0xFFFFFFFF,
391
- "data": chunk_data
392
  }
393
  chunks.append(chunk)
 
 
394
 
 
 
395
  return chunks
 
396
  except Exception as e:
397
  logger.error(f"Error chunking data: {e}")
398
  return []
@@ -407,38 +825,51 @@ def generate_stylish_qr(data: Union[str, Dict],
407
  try:
408
  qr = qrcode.QRCode(
409
  version=None,
410
- error_correction=qrcode.constants.ERROR_CORRECT_M,
411
  box_size=size,
412
  border=border
413
  )
414
 
415
  # Add data to QR code
416
  if isinstance(data, dict):
417
- qr.add_data(json.dumps(data, ensure_ascii=False))
 
418
  else:
419
- qr.add_data(data)
420
 
421
  qr.make(fit=True)
422
 
423
  # Create QR code image with custom colors
424
  qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
425
 
426
- # Convert to RGBA for transparency support
427
  qr_image = qr_image.convert('RGBA')
428
 
429
- # Add subtle gradient overlay
430
- gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
431
- draw = ImageDraw.Draw(gradient)
432
- for i in range(qr_image.width):
433
- alpha = int(255 * (1 - i/qr_image.width) * 0.1) # 10% maximum opacity
434
- draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
 
 
435
 
436
- # Combine images
437
- final_image = Image.alpha_composite(qr_image, gradient)
438
 
439
  # Save the image
440
  output_path = QR_CODES_DIR / filename
441
- final_image.save(output_path, quality=95)
442
 
443
  return str(output_path)
444
  except Exception as e:
@@ -447,55 +878,68 @@ def generate_stylish_qr(data: Union[str, Dict],
447
 
448
  def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
449
  """Generate QR codes with enhanced visual appeal and metadata"""
 
 
450
  try:
451
- file_processor = EnhancedFileProcessor()
452
  paths = []
453
 
454
  if combined:
455
  # Process combined data
456
- chunks = file_processor.chunk_data(data)
 
 
 
457
  for i, chunk in enumerate(chunks):
458
  filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
459
  qr_path = generate_stylish_qr(
460
- data=chunk,
461
  filename=filename,
462
  fill_color="#1a365d", # Deep blue
463
  back_color="#ffffff"
464
  )
465
  if qr_path:
466
  paths.append(qr_path)
 
 
467
  else:
468
- # Process individual items
469
- if isinstance(data, list):
470
  for idx, item in enumerate(data):
471
- chunks = file_processor.chunk_data(item)
 
 
 
472
  for chunk_idx, chunk in enumerate(chunks):
473
  filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
474
  qr_path = generate_stylish_qr(
475
- data=chunk,
476
  filename=filename,
477
  fill_color="#1a365d", # Deep blue
478
  back_color="#ffffff"
479
  )
480
  if qr_path:
481
  paths.append(qr_path)
 
 
482
  else:
483
- chunks = file_processor.chunk_data(data)
484
- for i, chunk in enumerate(chunks):
485
- filename = f'single_qr_{i+1}_of_{len(chunks)}_{int(time.time())}.png'
486
- qr_path = generate_stylish_qr(
487
- data=chunk,
488
- filename=filename,
489
- fill_color="#1a365d", # Deep blue
490
- back_color="#ffffff"
491
- )
492
- if qr_path:
493
- paths.append(qr_path)
494
- return paths
495
  except Exception as e:
496
  logger.error(f"QR code generation error: {e}")
497
  return []
498
 
 
499
  def create_modern_interface():
500
  """Create a modern and visually appealing Gradio interface"""
501
 
@@ -599,7 +1043,6 @@ def create_modern_interface():
599
  interface.head += """
600
  <script>
601
  let enabledStates = [];
602
-
603
  function updateEnabledStates(checkbox) {
604
  const index = parseInt(checkbox.dataset.index);
605
  if (checkbox.checked) {
@@ -623,7 +1066,6 @@ def create_modern_interface():
623
  qr_code_paths = gr.State([])
624
  gr.Markdown("""
625
  # 🌐 Advanced Data Processing & QR Code Generator
626
-
627
  Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
628
  """)
629
  with gr.Tab("πŸ“ URL Processing"):
@@ -707,24 +1149,30 @@ def create_modern_interface():
707
  return json.dumps(example, indent=2)
708
 
709
  def clear_input():
710
- return ""
711
 
712
  def update_viewport(paths, enabled_states):
713
  if not paths:
714
  return "<p>No QR codes generated yet.</p>"
715
 
716
  num_qr_codes = len(paths)
717
- cols = math.ceil(math.sqrt(num_qr_codes))
 
718
  rows = math.ceil(num_qr_codes / cols)
719
 
720
- viewport_html = '<div class="viewport-container" style="grid-template-columns: repeat({}, 1fr);">'.format(cols)
 
 
721
 
722
  for i, path in enumerate(paths):
723
  is_enabled = i in enabled_states
724
  border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;"
 
725
  viewport_html += f'<div class="viewport-item" id="qr_item_{i}">'
726
- viewport_html += f'<img src="{path}" style="{border}" alt="QR Code {i+1}">'
727
- viewport_html += f'<input type="checkbox" id="enable_qr_{i}" data-index="{i}" {"checked" if is_enabled else ""} onchange="updateEnabledStates(this)"> Enable'
728
  viewport_html += '</div>'
729
  viewport_html += '</div>'
730
 
@@ -732,21 +1180,30 @@ def create_modern_interface():
732
 
733
  def process_inputs(urls, files, text, combine):
734
  """Process all inputs and generate QR codes"""
735
- try:
736
- results = []
737
- url_processor = EnhancedURLProcessor()
738
- file_processor = EnhancedFileProcessor()
 
739
 
 
740
  # Process JSON input
741
  if text and text.strip():
742
  try:
743
  json_data = json.loads(text)
744
- if isinstance(json_data, list):
745
- results.extend(json_data)
746
- else:
747
- results.append(json_data)
 
748
  except json.JSONDecodeError as e:
749
- return None, [], f"❌ Invalid JSON format: {str(e)}"
 
 
 
750
 
751
  # Process URLs
752
  if urls and urls.strip():
@@ -755,79 +1212,122 @@ def create_modern_interface():
755
  for url in url_list:
756
  validation = url_processor.validate_url(url)
757
  if validation['is_valid']:
758
- content = url_processor.fetch_content(url)
759
- if content:
760
- results.append({
761
- 'source': 'url',
762
- 'url': url,
763
- 'content': content,
764
- 'timestamp': datetime.now().isoformat()
765
- })
 
 
 
766
 
767
  # Process files
768
  if files:
769
  for file in files:
 
770
  file_results = file_processor.process_file(file)
771
  if file_results:
772
- results.extend(file_results)
 
 
 
773
 
774
  # Generate QR codes
 
 
 
775
  if results:
 
776
  qr_paths = generate_qr_codes(results, combine)
 
 
777
  if qr_paths:
778
- return (
779
- results,
780
- [str(path) for path in qr_paths],
781
- f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
782
- )
783
  else:
784
- return None, [], "❌ Failed to generate QR codes"
 
785
  else:
786
- return None, [], "⚠️ No valid content to process"
 
 
787
  except Exception as e:
788
- logger.error(f"Processing error: {e}")
789
- return None, [], f"❌ Error: {str(e)}"
 
 
790
 
791
- def on_qr_generation(results, qr_paths):
792
- return qr_paths, qr_paths # Update state with generated paths
 
 
793
 
794
  process_btn.click(
795
  process_inputs,
796
  inputs=[url_input, file_input, text_input, combine_data],
797
  outputs=[output_json, output_gallery, output_text]
798
- ).then(on_qr_generation, inputs=[output_json, output_gallery], outputs=[qr_code_paths, viewport_output])
 
 
799
 
 
800
  viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
801
 
802
  # Add helpful documentation
803
  gr.Markdown("""
804
  ### 🚀 Features
805
- - **Complete URL Scraping**: Extracts every character from web pages
806
- - **Advanced File Processing**: Full content extraction from various text-based files and common archives. Supports flexible JSON handling.
807
- - **Smart JSON Handling**: Processes any size JSON with automatic chunking, either via direct input or file upload.
808
- - **Sequential QR Codes**: Maintains data integrity across multiple codes
809
- - **QR Code Viewport**: Visualize generated QR codes in a sequenced square, with options to enable/disable individual codes.
810
- - **Modern Design**: Clean, responsive interface with visual feedback
811
-
812
- ### 💡 Tips
813
- 1. **URLs**: Enter multiple URLs separated by commas or newlines
814
- 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and JSON files.
815
- 3. **JSON**: Use the example button to see the expected format or upload a .json file. The system will also try to detect JSON content in other file types.
816
- 4. **QR Codes**: Choose whether to combine data into sequential codes
817
- 5. **Processing**: Monitor the status for real-time feedback
818
-
819
- ### 🎨 Output
820
- - Generated QR codes are saved in the `output/qr_codes` directory
821
- - Each QR code contains metadata for proper sequencing
822
- - Hover over QR codes in the gallery to see details
823
- - The **QR Code Viewport** tab displays the generated QR codes in a grid.
824
-
825
- ### ⚙️ QR Code Viewport Instructions
826
- 1. Navigate to the **QR Code Viewport** tab after generating QR codes.
827
- 2. The generated QR codes will be displayed in a square arrangement.
828
- 3. Use the checkboxes below each QR code to enable or disable it.
829
- 4. The visualization will update to reflect the enabled/disabled state (currently by a green border).
830
- """)
 
831
  return interface
832
 
833
  def main():
@@ -842,13 +1342,15 @@ def main():
842
  # Launch with configuration
843
  interface.launch(
844
  share=False,
845
- debug=False,
846
  show_error=True,
847
  show_api=False
848
  )
849
  except Exception as e:
850
  logger.error(f"Application startup error: {e}")
851
- raise
 
 
852
 
853
  if __name__ == "__main__":
854
  main()
 
7
  import zipfile
8
  import tempfile
9
  import chardet
10
+ import io # Needed for processing CSV from string
11
+ import csv # Needed for CSV
12
+ import xml.etree.ElementTree as ET # Needed for XML
13
  from datetime import datetime
14
+ from typing import List, Dict, Optional, Union, Tuple, Any # Added Any for extracted_data
15
  from pathlib import Path
16
  from urllib.parse import urlparse, urljoin
17
  import requests
 
28
  import gzip
29
  import math
30
 
31
+ # Conditional imports for document processing
32
+ try:
33
+ from PyPDF2 import PdfReader
34
+ PDF_SUPPORT = True
35
+ except ImportError:
36
+ PDF_SUPPORT = False
37
+ logging.warning("PyPDF2 not installed. PDF file processing will be limited.") # logger is defined further down; use the logging module directly at import time
38
+
39
+ try:
40
+ from docx import Document
41
+ DOCX_SUPPORT = True
42
+ except ImportError:
43
+ DOCX_SUPPORT = False
44
+ logging.warning("python-docx not installed. DOCX file processing will be limited.")
45
+
46
+ try:
47
+ from pyth.plugins.rtf15.reader import Rtf15Reader
48
+ from pyth.plugins.plaintext.writer import PlaintextWriter
49
+ RTF_SUPPORT = True
50
+ except ImportError:
51
+ RTF_SUPPORT = False
52
+ logging.warning("pyth not installed. RTF file processing will be limited.")
53
+
54
+ try:
55
+ from odf.opendocument import OpenDocumentText
56
+ from odf import text as odftext
57
+ ODT_SUPPORT = True
58
+ except ImportError:
59
+ ODT_SUPPORT = False
60
+ logging.warning("odfpy not installed. ODT file processing will be limited.")
61
+
62
+
63
  # Setup enhanced logging with more detailed formatting
64
  logging.basicConfig(
65
  level=logging.INFO,
 
78
  directory.mkdir(parents=True, exist_ok=True)
79
 
80
  class EnhancedURLProcessor:
81
+ """Advanced URL processing with enhanced content extraction"""
82
  def __init__(self):
83
  self.session = requests.Session()
84
  self.timeout = 15 # Extended timeout for larger content
 
88
  # Enhanced headers for better site compatibility
89
  self.session.headers.update({
90
  'User-Agent': self.user_agent.random,
91
+ 'Accept': 'text/html, application/json, application/xml, text/plain, */*', # Request common types
92
  'Accept-Language': 'en-US,en;q=0.9',
93
  'Accept-Encoding': 'gzip, deflate, br',
94
  'Connection': 'keep-alive',
95
+ 'Upgrade-Insecure-Requests': '1', # May be ignored for non-HTML
96
  'Sec-Fetch-Dest': 'document',
97
  'Sec-Fetch-Mode': 'navigate',
98
  'Sec-Fetch-Site': 'none',
 
112
  try:
113
  head_response = self.session.head(url, timeout=5)
114
  head_response.raise_for_status()
115
+ final_url = head_response.url # Capture potential redirects
116
  except requests.exceptions.RequestException:
117
+ # If HEAD fails, try GET as some servers don't support HEAD
118
  response = self.session.get(url, timeout=self.timeout)
119
  response.raise_for_status()
120
+ final_url = response.url # Capture potential redirects
121
 
122
  return {
123
  'is_valid': True,
124
  'message': 'URL is valid and accessible',
125
  'details': {
126
+ 'final_url': final_url,
127
  'content_type': head_response.headers.get('Content-Type', 'unknown'),
128
  'server': head_response.headers.get('Server', 'unknown'),
129
  'size': head_response.headers.get('Content-Length', 'unknown')
 
142
 
143
  response = self.session.get(url, timeout=self.timeout)
144
  response.raise_for_status()
145
+ final_url = response.url # Capture potential redirects
146
 
147
  # Detect encoding
148
+ if response.encoding is None or response.encoding == 'ISO-8859-1': # chardet often better than default response.encoding for text
149
+ encoding_detection = chardet.detect(response.content)
150
+ encoding = encoding_detection['encoding'] or 'utf-8'
151
+ logger.debug(f"Detected encoding '{encoding}' with confidence {encoding_detection['confidence']:.2f} for {url}")
152
  else:
153
  encoding = response.encoding
154
+ logger.debug(f"Using response.encoding '{encoding}' for {url}")
155
+
156
  # Decode content with fallback
157
  try:
158
  raw_content = response.content.decode(encoding, errors='replace')
159
  except (UnicodeDecodeError, LookupError):
160
+ # Fallback to a more common encoding if the first attempt fails
161
+ try:
162
+ raw_content = response.content.decode('utf-8', errors='replace')
163
+ encoding = 'utf-8 (fallback)'
164
+ logger.warning(f"Decoding with {encoding} fallback for {url}")
165
+ except Exception:
166
+ raw_content = response.content.decode('latin-1', errors='replace') # Another common fallback
167
+ encoding = 'latin-1 (fallback)'
168
+ logger.warning(f"Decoding with {encoding} fallback for {url}")
169
+
170
 
171
  # Extract metadata
172
  metadata = {
173
+ 'original_url': url,
174
+ 'final_url': final_url,
175
  'timestamp': datetime.now().isoformat(),
176
+ 'detected_encoding': encoding,
177
  'content_type': response.headers.get('Content-Type', ''),
178
  'content_length': len(response.content),
179
  'headers': dict(response.headers),
 
181
  }
182
 
183
  # Process based on content type
184
+ processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
185
+
 
 
 
186
  return {
187
+ 'source': 'url',
188
+ 'url': url, # Keep original URL as identifier
189
  'raw_content': raw_content,
190
+ 'metadata': metadata,
191
+ 'extracted_data': processed_extraction['data'],
192
+ 'processing_notes': processed_extraction['notes']
193
  }
194
  except requests.exceptions.RequestException as e:
195
  if retry_count < self.max_retries - 1:
196
  logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
197
  time.sleep(2 ** retry_count) # Exponential backoff
198
  return self.fetch_content(url, retry_count + 1)
199
+ logger.error(f"Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
200
+ return {
201
+ 'source': 'url',
202
+ 'url': url,
203
+ 'raw_content': None,
204
+ 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat()},
205
+ 'extracted_data': None,
206
+ 'processing_notes': f"Failed to fetch content: {str(e)}"
207
+ }
208
+ except Exception as e:
209
+ logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
210
+ return {
211
+ 'source': 'url',
212
+ 'url': url,
213
+ 'raw_content': raw_content if 'raw_content' in locals() else None,
214
+ 'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat()},
215
+ 'extracted_data': None,
216
+ 'processing_notes': f"Unexpected processing error: {str(e)}"
217
+ }
218
+
219
+ def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
220
+ """Process content based on detected content type"""
221
+ lower_content_type = content_type.lower()
222
+ notes = []
223
+ extracted_data: Any = None # Use Any to allow different types
224
+
225
+ try:
226
+ if 'text/html' in lower_content_type:
227
+ logger.debug(f"Processing HTML content from {base_url}")
228
+ extracted_data = self._process_html_content_enhanced(content, base_url)
229
+ notes.append("Processed as HTML")
230
+ elif 'application/json' in lower_content_type or 'text/json' in lower_content_type:
231
+ logger.debug(f"Processing JSON content from {base_url}")
232
+ try:
233
+ extracted_data = json.loads(content)
234
+ notes.append("Parsed as JSON")
235
+ except json.JSONDecodeError as e:
236
+ extracted_data = content # Keep raw text if invalid JSON
237
+ notes.append(f"Failed to parse as JSON: {e}")
238
+ logger.warning(f"Failed to parse JSON from {base_url}: {e}")
239
+ except Exception as e:
240
+ extracted_data = content
241
+ notes.append(f"Error processing JSON: {e}")
242
+ logger.error(f"Error processing JSON from {base_url}: {e}")
243
+ elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'):
244
+ logger.debug(f"Processing XML content from {base_url}")
245
+ try:
246
+ # Try parsing XML. Convert to a string or a dict representation if needed.
247
+ # For simplicity, we'll convert to a readable string representation of the tree.
248
+ root = ET.fromstring(content)
249
+ # A simple way to represent XML as text
250
+ xml_text = ET.tostring(root, encoding='unicode', method='xml')
251
+ extracted_data = xml_text # Store as string for now
252
+ notes.append("Parsed as XML (text representation)")
253
+ except ET.ParseError as e:
254
+ extracted_data = content
255
+ notes.append(f"Failed to parse as XML: {e}")
256
+ logger.warning(f"Failed to parse XML from {base_url}: {e}")
257
+ except Exception as e:
258
+ extracted_data = content
259
+ notes.append(f"Error processing XML: {e}")
260
+ logger.error(f"Error processing XML from {base_url}: {e}")
261
+ elif 'text/plain' in lower_content_type or 'text/' in lower_content_type: # Catch other text types
262
+ logger.debug(f"Processing Plain Text content from {base_url}")
263
+ extracted_data = content
264
+ notes.append("Processed as Plain Text")
265
+ else:
266
+ logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.")
267
+ extracted_data = content # Store raw content for unknown types
268
+ notes.append(f"Unknown content type '{content_type}'. Stored raw text.")
269
+
270
  except Exception as e:
271
+ logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}")
272
+ extracted_data = content # Fallback to raw content on error
273
+ notes.append(f"Unexpected processing error: {e}. Stored raw text.")
274
+
275
+ return {'data': extracted_data, 'notes': notes}
276
 
277
+
278
+ def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]:
279
+ """Process HTML content, preserving text, and extracting metadata."""
280
+ extracted: Dict[str, Any] = {
281
+ 'title': None,
282
+ 'meta_description': None, # Add extraction for meta description
283
+ 'full_text': "",
284
+ 'links': [] # Add extraction for links
285
+ }
286
  try:
287
  soup = BeautifulSoup(content, 'html.parser')
288
 
289
+ # Extract Title
290
+ if soup.title and soup.title.string:
291
+ extracted['title'] = soup.title.string.strip()
292
+
293
+ # Extract Meta Description
294
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
295
+ if meta_desc and meta_desc.get('content'):
296
+ extracted['meta_description'] = meta_desc['content'].strip()
297
+
298
+ # Extract and process links (convert relative to absolute)
299
+ for a_tag in soup.find_all('a', href=True):
300
+ href = a_tag['href']
301
+ text = a_tag.get_text().strip()
302
+ try:
303
+ absolute_url = urljoin(base_url, href)
304
+ extracted['links'].append({'text': text, 'url': absolute_url})
305
+ except Exception:
306
+ extracted['links'].append({'text': text, 'url': href}) # Keep relative if join fails
307
+
308
+
309
+ # Extract all text content (similar to stripped_strings but ensures order)
310
  text_parts = []
311
+ # Use a more robust way to get visible text, including handling script/style tags
312
+ for script_or_style in soup(["script", "style"]):
313
+ script_or_style.extract() # Remove script and style tags
314
+ text = soup.get_text(separator='\n') # Get text with newlines
315
+
316
+ # Clean up whitespace and empty lines
317
+ lines = text.splitlines()
318
+ cleaned_lines = [line.strip() for line in lines if line.strip()]
319
+ extracted['full_text'] = '\n'.join(cleaned_lines)
320
+
321
  except Exception as e:
322
+ logger.error(f"Enhanced HTML processing error for {base_url}: {e}")
323
+ extracted['full_text'] = content # Fallback to raw content
324
+ extracted['processing_error'] = f"Enhanced HTML processing failed: {e}"
325
+
326
+ return extracted
327
 
328
  class EnhancedFileProcessor:
329
+ """Advanced file processing with enhanced content extraction"""
330
  def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default
331
  self.max_file_size = max_file_size
332
+ # Expanded supported extensions to include common docs and structured formats
333
  self.supported_extensions = {
334
  '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
335
  '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
336
+ '.pdf', '.doc', '.docx', '.rtf', '.odt',
337
+ # Archives are handled separately but listed for context
338
  '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
 
339
  }
340
+ self.archive_extensions = {'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'}
341
+
342
 
343
  def process_file(self, file) -> List[Dict]:
344
  """Process uploaded file with enhanced error handling and complete extraction"""
345
+ if not file or not hasattr(file, 'name'):
346
+ logger.warning("Received invalid file object.")
347
  return []
348
 
349
  dataset = []
350
+ file_path = Path(file.name) # Use Path object for easier handling
351
+
352
  try:
353
+ file_size = file_path.stat().st_size
354
  if file_size > self.max_file_size:
355
+ logger.warning(f"File '{file_path.name}' size ({file_size} bytes) exceeds maximum allowed size ({self.max_file_size} bytes).")
356
+ return [{
357
+ 'source': 'file',
358
+ 'filename': file_path.name,
359
+ 'file_size': file_size,
360
+ 'extracted_data': None,
361
+ 'processing_notes': 'File size exceeds limit.'
362
+ }]
363
 
364
  with tempfile.TemporaryDirectory() as temp_dir:
365
  temp_dir_path = Path(temp_dir)
366
 
367
+ # Decide processing strategy
368
+ if file_path.suffix.lower() in self.archive_extensions:
369
+ dataset.extend(self._process_archive(file_path, temp_dir_path))
370
+ elif file_path.suffix.lower() in self.supported_extensions:
371
+ # Pass the path to the single file processor
372
+ dataset.extend(self._process_single_file(file_path))
373
  else:
374
+ logger.warning(f"Unsupported file type for processing: '{file_path.name}'")
375
+ # Optionally process as raw text even if extension is unsupported
376
+ try:
377
+ # Read as text with error replacement
378
+ content_bytes = file_path.read_bytes()
379
+ encoding_detection = chardet.detect(content_bytes)
380
+ encoding = encoding_detection['encoding'] or 'utf-8'
381
+ raw_content = content_bytes.decode(encoding, errors='replace')
382
+ dataset.append({
383
+ 'source': 'file',
384
+ 'filename': file_path.name,
385
+ 'file_size': file_size,
386
+ 'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
387
+ 'extracted_data': {'plain_text': raw_content}, # Store raw text under a key
388
+ 'processing_notes': 'Processed as plain text (unsupported extension).'
389
+ })
390
+ except Exception as e:
391
+ logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
392
+ dataset.append({
393
+ 'source': 'file',
394
+ 'filename': file_path.name,
395
+ 'file_size': file_size,
396
+ 'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
397
+ 'extracted_data': None,
398
+ 'processing_notes': f'Unsupported file type and failed to read as text: {e}'
399
+ })
400
+
401
 
402
  except Exception as e:
403
+ logger.error(f"Error processing file '{file_path.name}': {str(e)}")
404
+ dataset.append({
405
+ 'source': 'file',
406
+ 'filename': file_path.name,
407
+ 'file_size': file_size if 'file_size' in locals() else None,
408
+ 'extracted_data': None,
409
+ 'processing_notes': f'Overall file processing error: {str(e)}'
410
+ })
411
  return dataset
412
 
413
+ def _is_archive(self, filepath: Union[str, Path]) -> bool:
414
  """Check if file is an archive"""
415
+ p = Path(filepath) if isinstance(filepath, str) else filepath
416
+ return p.suffix.lower() in self.archive_extensions
417
+
418
+ def _process_single_file(self, file_path: Path) -> List[Dict]:
419
+ """Process a single file with enhanced character extraction and format-specific handling"""
420
+ dataset_entries = []
421
+ filename = file_path.name
422
+ file_size = file_path.stat().st_size
423
+ mime_type, _ = mimetypes.guess_type(file_path)
424
+ mime_type = mime_type or 'unknown/unknown'
425
+ file_extension = file_path.suffix.lower()
426
+
427
+ logger.info(f"Processing single file: '{filename}' ({mime_type}, {file_size} bytes)")
428
+
429
+ raw_content: Optional[str] = None
430
+ extracted_data: Any = None
431
+ processing_notes = []
432
 
 
 
433
  try:
434
+ # Read content efficiently
435
+ content_bytes = file_path.read_bytes()
436
+ encoding_detection = chardet.detect(content_bytes)
437
+ encoding = encoding_detection['encoding'] or 'utf-8'
438
+ raw_content = content_bytes.decode(encoding, errors='replace')
 
 
 
439
 
440
+ # --- Attempt format-specific parsing ---
441
+
442
+ # 1. Attempt JSON parsing (explicit .json or application/json, OR if content looks like JSON)
443
+ is_explicit_json = mime_type == 'application/json' or file_extension == '.json'
444
+ looks_like_json = raw_content.strip().startswith('{') or raw_content.strip().startswith('[')
445
+
446
+ if is_explicit_json or looks_like_json:
447
+ try:
448
+ extracted_data = json.loads(raw_content)
449
+ processing_notes.append("Parsed as JSON.")
450
+ if not is_explicit_json:
451
+ processing_notes.append("Note: Content looked like JSON despite extension/mime.")
452
+ logger.warning(f"File '{filename}' identified as JSON content despite extension/mime.")
453
+ mime_type = 'application/json' # Update mime_type if successfully parsed as JSON
454
+ except json.JSONDecodeError as e:
455
+ processing_notes.append(f"Failed to parse as JSON: {e}.")
456
+ if is_explicit_json:
457
+ logger.error(f"Explicit JSON file '{filename}' has invalid format: {e}")
458
+ else:
459
+ logger.warning(f"Content of '{filename}' looks like JSON but failed to parse: {e}")
460
+ except Exception as e:
461
+ processing_notes.append(f"Error processing JSON: {e}.")
462
+ logger.error(f"Error processing JSON in '{filename}': {e}")
463
+
464
+ # 2. Attempt XML parsing (if not already parsed as JSON, and looks like XML)
465
+ # Add check if extracted_data is still None (meaning JSON parsing failed or wasn't attempted/relevant)
466
+ looks_like_xml = extracted_data is None and raw_content.strip().startswith('<') and raw_content.strip().endswith('>') # Simple heuristic
467
+ is_explicit_xml = extracted_data is None and (mime_type in ('application/xml', 'text/xml') or mime_type.endswith('+xml') or file_extension in ('.xml', '.xsd'))
468
+
469
+ if extracted_data is None and (is_explicit_xml or looks_like_xml):
470
+ try:
471
+ root = ET.fromstring(raw_content)
472
+ # Convert XML element tree to a structured dictionary or string
473
+ # Simple string representation for QR code suitability
474
+ extracted_data = ET.tostring(root, encoding='unicode', method='xml')
475
+ processing_notes.append("Parsed as XML (text representation).")
476
+ if not is_explicit_xml:
477
+ processing_notes.append("Note: Content looked like XML despite extension/mime.")
478
+ # Update mime_type if successfully parsed as XML
479
+ if 'xml' not in mime_type: mime_type = 'application/xml'
480
+ except ET.ParseError as e:
481
+ processing_notes.append(f"Failed to parse as XML: {e}.")
482
+ if is_explicit_xml:
483
+ logger.error(f"Explicit XML file '{filename}' has invalid format: {e}")
484
+ else:
485
+ logger.warning(f"Content of '{filename}' looks like XML but failed to parse: {e}")
486
+ except Exception as e:
487
+ processing_notes.append(f"Error processing XML: {e}.")
488
+ logger.error(f"Error processing XML in '{filename}': {e}")
489
+
490
+
491
+ # 3. Attempt CSV parsing (if not already parsed, and looks like CSV or is explicit CSV)
492
+ is_explicit_csv = extracted_data is None and (mime_type == 'text/csv' or file_extension == '.csv')
493
+ # Heuristic: check for commas/semicolons and multiple lines
494
+ looks_like_csv = extracted_data is None and (',' in raw_content or ';' in raw_content) and ('\n' in raw_content or len(raw_content.splitlines()) > 1)
495
+
496
+ if extracted_data is None and (is_explicit_csv or looks_like_csv):
497
+ try:
498
+ # Use Sniffer to guess dialect for better compatibility
499
+ dialect = 'excel' # Default dialect
500
+ try:
501
+ # Look at first few lines to guess dialect
502
+ sample = '\n'.join(raw_content.splitlines()[:10])
503
+ if sample:
504
+ dialect = csv.Sniffer().sniff(sample).name
505
+ logger.debug(f"Sniffer detected CSV dialect: {dialect} for '{filename}'")
506
+ except csv.Error:
507
+ logger.debug(f"Sniffer failed to detect dialect for '{filename}', using 'excel'.")
508
+ dialect = 'excel' # Fallback
509
+
510
+ # Read using the guessed or default dialect
511
+ csv_reader = csv.reader(io.StringIO(raw_content), dialect=dialect)
512
+ rows = list(csv_reader)
513
+
514
+ if rows:
515
+ # Limit the number of rows included for potentially huge CSVs
516
+ max_rows_preview = 100
517
+ extracted_data = {
518
+ 'headers': rows[0] if rows[0] else None, # Assume first row is header
519
+ 'rows': rows[1:max_rows_preview+1] # Get up to max_rows_preview data rows
520
+ }
521
+ if len(rows) > max_rows_preview + 1:
522
+ processing_notes.append(f"CSV truncated to {max_rows_preview} data rows.")
523
+ processing_notes.append("Parsed as CSV.")
524
+ if not is_explicit_csv:
525
+ processing_notes.append("Note: Content looked like CSV despite extension/mime.")
526
+ mime_type = 'text/csv' # Update mime_type
527
+
528
+ else:
529
+ extracted_data = "Empty CSV"
530
+ processing_notes.append("Parsed as empty CSV.")
531
+ if not is_explicit_csv:
532
+ processing_notes.append("Note: Content looked like CSV but was empty.")
533
+
534
+ except Exception as e:
535
+ processing_notes.append(f"Failed to parse as CSV: {e}.")
536
+ logger.warning(f"Failed to parse CSV from '{filename}': {e}")
537
+
538
+
539
+ # 4. Attempt Document Text Extraction (if not already parsed)
540
+ if extracted_data is None:
541
+ try:
542
+ extracted_text = None
543
+ if file_extension == '.pdf' and PDF_SUPPORT:
544
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
545
+ tmp_file.write(content_bytes) # Write bytes to temp file
546
+ temp_path = Path(tmp_file.name)
547
+ try:
548
+ reader = PdfReader(temp_path)
549
+ text_content = "".join(page.extract_text() or "" for page in reader.pages)
550
+ extracted_text = text_content
551
+ processing_notes.append("Extracted text from PDF.")
552
+ finally:
553
+ temp_path.unlink() # Clean up temp file
554
+ elif file_extension == '.docx' and DOCX_SUPPORT:
555
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
556
+ tmp_file.write(content_bytes) # Write bytes to temp file
557
+ temp_path = Path(tmp_file.name)
558
+ try:
559
+ document = Document(temp_path)
560
+ text_content = "\n".join(paragraph.text for paragraph in document.paragraphs)
561
+ extracted_text = text_content
562
+ processing_notes.append("Extracted text from DOCX.")
563
+ finally:
564
+ temp_path.unlink() # Clean up temp file
565
+ elif file_extension == '.rtf' and RTF_SUPPORT:
566
+ # pyth can read directly from file-like object or string
567
+ try:
568
+ doc = Rtf15Reader.read(io.StringIO(raw_content))
569
+ text_content = PlaintextWriter.write(doc).getvalue()
570
+ extracted_text = text_content
571
+ processing_notes.append("Extracted text from RTF.")
572
+ except Exception as e:
573
+ processing_notes.append(f"RTF extraction error: {e}")
574
+ logger.warning(f"Failed to extract RTF text from '{filename}': {e}")
575
+ elif file_extension == '.odt' and ODT_SUPPORT:
576
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.odt') as tmp_file:
577
+ tmp_file.write(content_bytes) # Write bytes to temp file
578
+ temp_path = Path(tmp_file.name)
579
+ try:
580
+ text_doc = OpenDocumentText(temp_path)
581
+ paragraphs = text_doc.getElementsByType(odftext.P)
582
+ text_content = "\n".join("".join(node.text for node in p.childNodes) for p in paragraphs)
583
+ extracted_text = text_content
584
+ processing_notes.append("Extracted text from ODT.")
585
+ finally:
586
+ temp_path.unlink() # Clean up temp file
587
+ elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
588
+ # These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
589
+ processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
590
+ logger.warning(f"Automatic text extraction for {file_extension.upper()} not fully implemented for '{filename}'.")
591
+
592
+ if extracted_text is not None:
593
+ # Limit extracted text size
594
+ max_extracted_text_size = 10000 # Limit text preview
595
+ extracted_data = {'text': extracted_text[:max_extracted_text_size]}
596
+ if len(extracted_text) > max_extracted_text_size:
597
+ extracted_data['text'] += "..."
598
+ processing_notes.append("Extracted text truncated.")
599
+
600
+ except ImportError as e:
601
+ processing_notes.append(f"Missing dependency for document type ({e}). Cannot extract text.")
602
+ except Exception as e:
603
+ processing_notes.append(f"Error during document text extraction: {e}")
604
+ logger.warning(f"Error during document text extraction for '{filename}': {e}")
605
+
606
+
607
+ # 5. Fallback to Plain Text (if no specific extraction succeeded)
608
+ if extracted_data is None:
609
+ extracted_data = {'plain_text': raw_content}
610
+ processing_notes.append("Stored as plain text.")
611
+ # Re-guess mime type if it was something specific like application/octet-stream and we just got text
612
+ if mime_type in ['unknown/unknown', 'application/octet-stream']:
613
+ guessed_text_mime, _ = mimetypes.guess_type('dummy.txt') # Use a dummy file name to guess plain text
614
+ if guessed_text_mime: mime_type = guessed_text_mime
615
 
 
616
 
 
617
  except Exception as e:
618
+ # Catch errors during initial read or other unexpected issues
619
+ logger.error(f"Fatal error processing single file '{filename}': {e}")
620
+ processing_notes.append(f"Fatal processing error: {e}")
621
+ raw_content = None # Ensure raw_content is None if reading failed
622
+ extracted_data = None
623
+
624
+
625
+ # Add file info to the entry
626
+ entry = {
627
+ 'source': 'file',
628
+ 'filename': filename,
629
+ 'file_size': file_size,
630
+ 'mime_type': mime_type,
631
+ 'created': datetime.fromtimestamp(file_path.stat().st_ctime).isoformat() if file_path.exists() else None,
632
+ 'modified': datetime.fromtimestamp(file_path.stat().st_mtime).isoformat() if file_path.exists() else None,
633
+ 'raw_content': raw_content, # Always include raw content if readable
634
+ 'extracted_data': extracted_data, # Include the structured/extracted data
635
+ 'processing_notes': processing_notes # Include any notes/errors encountered
636
+ }
637
+
638
+ dataset_entries.append(entry)
639
+ return dataset_entries
640
 
641
+
642
+ def _process_archive(self, archive_path: Path, extract_to: Path) -> List[Dict]:
643
  """Process an archive file with enhanced extraction"""
644
  dataset = []
645
+ archive_extension = archive_path.suffix.lower()
646
+ logger.info(f"Processing archive: '{archive_path.name}'")
647
+
648
  try:
649
+ if archive_extension == '.zip':
650
+ if zipfile.is_zipfile(archive_path):
651
+ with zipfile.ZipFile(archive_path, 'r') as zip_ref:
652
+ for file_info in zip_ref.infolist():
653
+ if file_info.file_size > 0 and not file_info.filename.endswith('/'):
654
+ try:
655
+ zip_ref.extract(file_info, path=extract_to)
656
+ extracted_file_path = extract_to / file_info.filename
657
+ # Recursively process the extracted file if it's supported and not an archive itself
658
+ if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
659
+ dataset.extend(self._process_single_file(extracted_file_path))
660
+ elif extracted_file_path.suffix.lower() in self.archive_extensions:
661
+ # Recursively process nested archives (careful with depth!)
662
+ logger.info(f"Found nested archive '{file_info.filename}', processing recursively.")
663
+ dataset.extend(self._process_archive(extracted_file_path, extract_to))
664
+ else:
665
+ logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
666
+ except Exception as e:
667
+ logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
668
+ else:
669
+ logger.error(f"'{archive_path.name}' is not a valid zip file.")
670
+
671
+ elif archive_path.name.lower().endswith(('.tar', '.tar.gz', '.tgz')): # the bare suffix is only '.gz' for '.tar.gz'
672
  try:
673
+ # Determine mode: 'r' for tar, 'r:gz' for tar.gz, 'r:bz2' for tar.bz2 (bz2 not fully supported yet)
674
+ mode = 'r'
675
+ if archive_path.name.lower().endswith(('.tar.gz', '.tgz')): mode = 'r:gz'
676
+ # elif archive_extension == '.tar.bz2': mode = 'r:bz2' # Needs bz2 support
677
+ # Note: 'r:*' attempts to guess compression, safer to be explicit
678
+
679
+ with tarfile.open(archive_path, mode) as tar_ref:
680
  for member in tar_ref.getmembers():
681
  if member.isfile():
682
+ try:
683
+ tar_ref.extract(member, path=extract_to)
684
+ extracted_file_path = extract_to / member.name
685
+ # Recursively process extracted file
686
+ if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
687
+ dataset.extend(self._process_single_file(extracted_file_path))
688
+ elif extracted_file_path.suffix.lower() in self.archive_extensions:
689
+ logger.info(f"Found nested archive '{member.name}', processing recursively.")
690
+ dataset.extend(self._process_archive(extracted_file_path, extract_to))
691
+ else:
692
+ logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
693
+ except Exception as e:
694
+ logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
695
  except tarfile.TarError as e:
696
+ logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
697
+
698
+ elif archive_extension == '.gz':
699
+ # GZIP archives typically contain a single file. Extract it and process.
700
+ extracted_name = archive_path.stem # Get name without .gz
701
+ extracted_path = extract_to / extracted_name
702
+ try:
703
+ with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
704
+ outfile.write(gz_file.read())
705
+ # Process the extracted file if supported
706
+ if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_path):
707
+ dataset.extend(self._process_single_file(extracted_path))
708
+ elif extracted_path.suffix.lower() in self.archive_extensions:
709
+ logger.info(f"Found nested archive '{extracted_name}', processing recursively.")
710
+ dataset.extend(self._process_archive(extracted_path, extract_to))
711
+ else:
712
+ logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'")
713
+
714
+ except OSError as e: # gzip errors surface as OSError (BadGzipFile); gzip.GzipFile is a class, not an exception
715
+ logger.error(f"Error processing GZIP file '{archive_path.name}': {e}")
716
+ except Exception as e:
717
+ logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
718
+ finally:
719
+ if extracted_path.exists(): extracted_path.unlink() # Clean up extracted file
720
+
721
+ # TODO: Add support for other archive types (.bz2, .7z, .rar)
722
+ elif archive_extension in ('.bz2', '.7z', '.rar'):
723
+ logger.warning(f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.")
724
 
725
  except Exception as e:
726
+ logger.error(f"Overall archive processing error for '{archive_path.name}': {e}")
727
+
728
+ # Clean up extracted files in temp_dir after processing
729
+ # Handled by context manager 'with tempfile.TemporaryDirectory()'
730
+
731
+
732
  return dataset
733
 
734
  def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
735
  """Enhanced data chunking with sequence metadata"""
736
  try:
737
  # Convert data to JSON string
738
+ # Use separators=(',', ':') to remove unnecessary whitespace for maximum data density in QR code
739
+ json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
740
  total_length = len(json_str)
741
 
742
  # Calculate overhead for metadata
743
+ # Metadata structure: {"idx":0,"tc":1,"tl":XXX,"hash":"YYYY","data":"..."}, shortened keys
744
  metadata_template = {
745
+ "idx": 0, # chunk_index
746
+ "tc": 1, # total_chunks
747
+ "tl": total_length, # total_length
748
+ "hash": "", # chunk_hash
749
+ "data": "" # chunk_data
750
  }
751
+ # Estimate overhead more accurately by dumping a sample metadata structure
752
+ # and adding some safety margin. Shortened keys reduce overhead.
753
+ overhead_estimate = len(json.dumps(metadata_template, separators=(',', ':'))) + 50 # Extra padding
754
 
755
  # Calculate effective chunk size
756
+ effective_chunk_size = max_size - overhead_estimate
757
+
758
+ if effective_chunk_size <= 0:
759
+ logger.error(f"Max QR size ({max_size}) is too small for metadata overhead ({overhead_estimate}). Cannot chunk.")
760
+ return []
761
 
762
  if total_length <= effective_chunk_size:
763
  # Data fits in one chunk
764
+ chunk_data = json_str # Use the full string
765
+
766
  chunk = {
767
+ "idx": 0,
768
+ "tc": 1,
769
+ "tl": total_length,
770
+ "hash": hash(chunk_data) & 0xFFFFFFFF, # 32-bit hash
771
+ "data": chunk_data
772
  }
773
  return [chunk]
774
 
775
  # Calculate number of chunks needed
776
  num_chunks = -(-total_length // effective_chunk_size) # Ceiling division
777
+ # Adjust chunk_size slightly to distribute evenly, maybe not strictly necessary
778
+ # chunk_size = -(-total_length // num_chunks) # Use this if perfect distribution is needed
779
 
780
  chunks = []
781
+ current_pos = 0
782
  for i in range(num_chunks):
783
+ # Find the end of the current chunk. Avoid splitting in the middle of escaped characters or surrogate pairs if possible,
784
+ # but simple slicing is usually okay for standard text that's already been errors='replace'.
785
+ # We'll use basic slicing for simplicity, as the JSON string is just text.
786
+ end_pos = min(current_pos + effective_chunk_size, total_length)
787
+
788
+ # Basic attempt to not break in the middle of a UTF-8 character if slicing bytes,
789
+ # but since we are slicing a *decoded string*, this is less of an issue.
790
+ # However, slicing in the middle of JSON structure is bad.
791
+ # For simplicity and robustness with arbitrary JSON structures, slicing the raw string is the easiest.
792
+ chunk_data_str = json_str[current_pos:end_pos]
793
 
794
  chunk = {
795
+ "idx": i,
796
+ "tc": num_chunks,
797
+ "tl": total_length,
798
+ "hash": hash(chunk_data_str) & 0xFFFFFFFF,
799
+ "data": chunk_data_str
800
  }
801
  chunks.append(chunk)
802
+ current_pos = end_pos
803
+
804
+ # Final check: Ensure all data was chunked
805
+ if current_pos < total_length:
806
+ # This shouldn't happen with correct ceiling division and min()
807
+ logger.error(f"Chunking logic error: Only processed {current_pos} of {total_length} characters.")
808
+ return [] # Indicate failure
809
 
810
+
811
+ logger.info(f"Chunked data into {num_chunks} chunks for QR codes.")
812
  return chunks
813
+
814
  except Exception as e:
815
  logger.error(f"Error chunking data: {e}")
816
  return []
 
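# --- Editor's illustrative sketch, not part of the app2.py commit ---
# Reassembling the original payload from the chunk dictionaries produced by
# chunk_data() above: sort by "idx", concatenate "data", verify "tl", then parse.
# Per-chunk hashes are only comparable within the same process because the built-in
# hash() is salted; the function name below is the editor's, not part of the app.
import json
from typing import Any, Dict, List

def reassemble_chunks(chunks: List[Dict]) -> Any:
    ordered = sorted(chunks, key=lambda c: c["idx"])
    if len(ordered) != ordered[0]["tc"]:
        raise ValueError(f"Expected {ordered[0]['tc']} chunks, got {len(ordered)}")
    json_str = "".join(c["data"] for c in ordered)
    if len(json_str) != ordered[0]["tl"]:
        raise ValueError("Reassembled length does not match the 'tl' metadata")
    return json.loads(json_str)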
825
  try:
826
  qr = qrcode.QRCode(
827
  version=None,
828
+ error_correction=qrcode.constants.ERROR_CORRECT_M, # Increased error correction (note: a version-40 code holds only 2331 bytes at level M, below the 2953-character chunk default)
829
  box_size=size,
830
  border=border
831
  )
832
 
833
  # Add data to QR code
834
  if isinstance(data, dict):
835
+ # Use compact JSON representation
836
+ qr.add_data(json.dumps(data, ensure_ascii=False, separators=(',', ':')))
837
  else:
838
+ qr.add_data(str(data)) # Ensure it's a string
839
 
840
  qr.make(fit=True)
841
 
842
  # Create QR code image with custom colors
843
  qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
844
 
845
+ # Convert to RGBA for transparency support and potential overlays
846
  qr_image = qr_image.convert('RGBA')
847
 
848
+ # Optional: Add a small logo or icon in the center (requires design)
849
+ # logo = Image.open("logo.png").convert("RGBA")
850
+ # logo = logo.resize((logo.width // 4, logo.height // 4)) # Resize logo
851
+ # logo_pos = ((qr_image.width - logo.width) // 2, (qr_image.height - logo.height) // 2)
852
+ # qr_image.paste(logo, logo_pos, logo)
853
+
854
+ # Add subtle gradient overlay (optional visual enhancement)
855
+ try:
856
+ gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
857
+ draw = ImageDraw.Draw(gradient)
858
+ # Horizontal gradient for subtle effect
859
+ for i in range(qr_image.width):
860
+ # Fades from left (alpha=0) to right (max_alpha)
861
+ alpha = int(255 * (i/qr_image.width) * 0.05) # e.g., 5% maximum opacity fade-in
862
+ draw.line([(i, 0), (i, qr_image.height)], fill=(0, 0, 0, alpha))
863
+ # Combine images
864
+ final_image = Image.alpha_composite(qr_image, gradient)
865
+ except Exception as e:
866
+ logger.warning(f"Failed to add gradient overlay to QR code: {e}. Using plain QR.")
867
+ final_image = qr_image
868
 
 
 
869
 
870
  # Save the image
871
  output_path = QR_CODES_DIR / filename
872
+ final_image.save(output_path, optimize=True) # PNG is lossless, so a 'quality' option is ignored; optimize=True slightly reduces file size
873
 
874
  return str(output_path)
875
  except Exception as e:
 
878
 
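# --- Editor's illustrative sketch, not part of the app2.py commit ---
# Checking that a chunk actually fits a QR code at the error-correction level used in
# generate_stylish_qr(). A version-40 symbol in byte mode holds 2953 bytes at level L
# but only 2331 at level M, so chunks near the 2953-character default in chunk_data()
# can overflow; the helper name below is the editor's own.
import json
import qrcode
from qrcode.exceptions import DataOverflowError

def fits_single_qr(chunk: dict) -> bool:
    payload = json.dumps(chunk, ensure_ascii=False, separators=(',', ':'))
    qr = qrcode.QRCode(version=None, error_correction=qrcode.constants.ERROR_CORRECT_M)
    qr.add_data(payload)
    try:
        qr.make(fit=True)  # raises DataOverflowError if no version up to 40 can hold the payload
        return True
    except DataOverflowError:
        return False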
879
  def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
880
  """Generate QR codes with enhanced visual appeal and metadata"""
881
+ # Assume 'data' here is the list of dictionaries produced by process_inputs
882
+ if not isinstance(data, list):
883
+ logger.error("generate_qr_codes received data that is not a list.")
884
+ return []
885
+
886
  try:
887
+ file_processor = EnhancedFileProcessor() # Use the enhanced processor for chunking
888
  paths = []
889
 
890
  if combined:
891
  # Process combined data
892
+ chunks = file_processor.chunk_data(data) # chunk_data works on the list of dicts
893
+ if not chunks:
894
+ logger.warning("No chunks generated for combined data.")
895
+ return []
896
  for i, chunk in enumerate(chunks):
897
  filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
898
  qr_path = generate_stylish_qr(
899
+ data=chunk, # Pass the chunk dictionary
900
  filename=filename,
901
  fill_color="#1a365d", # Deep blue
902
  back_color="#ffffff"
903
  )
904
  if qr_path:
905
  paths.append(qr_path)
906
+ else:
907
+ logger.warning(f"Failed to generate QR for chunk {i+1}/{len(chunks)}.")
908
  else:
909
+ # Process individual items (each dictionary in the list)
910
+ if data: # Ensure data is not empty
911
  for idx, item in enumerate(data):
912
+ chunks = file_processor.chunk_data(item) # chunk_data works on individual dict
913
+ if not chunks:
914
+ logger.warning(f"No chunks generated for item {idx+1}.")
915
+ continue
916
  for chunk_idx, chunk in enumerate(chunks):
917
  filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
918
  qr_path = generate_stylish_qr(
919
+ data=chunk, # Pass the chunk dictionary
920
  filename=filename,
921
  fill_color="#1a365d", # Deep blue
922
  back_color="#ffffff"
923
  )
924
  if qr_path:
925
  paths.append(qr_path)
926
+ else:
927
+ logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(chunks)}.")
928
  else:
929
+ logger.warning("No items in data list to process individually.")
930
+
931
+ logger.info(f"Generated {len(paths)} QR codes.")
932
+ return paths
933
+
 
 
 
 
 
 
 
934
  except Exception as e:
935
  logger.error(f"QR code generation error: {e}")
936
  return []
937
 
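# --- Editor's illustrative sketch, not part of the app2.py commit ---
# Reading generated QR images back into chunk dictionaries. pyzbar is not a
# dependency of this app; it is used here only to illustrate the round trip from
# generate_qr_codes() output back to the chunk format that a reassembly step consumes.
import json
from PIL import Image
from pyzbar.pyzbar import decode  # external library, assumed to be installed

def read_chunks(qr_paths: list) -> list:
    chunks = []
    for path in qr_paths:
        for symbol in decode(Image.open(path)):
            chunks.append(json.loads(symbol.data.decode('utf-8')))
    return chunks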
938
+ # Keep the Gradio UI definition and main function as they are,
939
+ # as the changes are internal to the processing classes and the
940
+ # process_inputs function already handles calling them and getting
941
+ # the combined list of results.
942
+
943
  def create_modern_interface():
944
  """Create a modern and visually appealing Gradio interface"""
945
 
 
1043
  interface.head += """
1044
  <script>
1045
  let enabledStates = [];
 
1046
  function updateEnabledStates(checkbox) {
1047
  const index = parseInt(checkbox.dataset.index);
1048
  if (checkbox.checked) {
 
1066
  qr_code_paths = gr.State([])
1067
  gr.Markdown("""
1068
  # 🌐 Advanced Data Processing & QR Code Generator
 
1069
  Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
1070
  """)
1071
  with gr.Tab("πŸ“ URL Processing"):
 
1149
  return json.dumps(example, indent=2)
1150
 
1151
  def clear_input():
1152
+ return "", None, "" # Clear url, files, text
1153
 
1154
  def update_viewport(paths, enabled_states):
1155
  if not paths:
1156
  return "<p>No QR codes generated yet.</p>"
1157
 
1158
  num_qr_codes = len(paths)
1159
+ cols = math.ceil(math.sqrt(num_qr_codes)) # Calculate columns for a roughly square grid
1160
+ cols = max(1, min(cols, 6)) # Limit max columns for small screens
1161
  rows = math.ceil(num_qr_codes / cols)
1162
 
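# Editor's worked example (not part of the commit): with 10 QR codes,
# cols = max(1, min(ceil(sqrt(10)), 6)) = 4 and rows = ceil(10 / 4) = 3.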
1163
+ viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">' # f-string already substitutes cols; no .format() needed
1164
+
1165
+ # Initialize enabledStates if it's empty (first load)
1166
+ if not enabled_states and paths:
1167
+ enabled_states = list(range(num_qr_codes)) # Enable all by default on first view
1168
 
1169
  for i, path in enumerate(paths):
1170
  is_enabled = i in enabled_states
1171
  border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;"
1172
+ opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;"
1173
  viewport_html += f'<div class="viewport-item" id="qr_item_{i}">'
1174
+ viewport_html += f'<img src="/file={path}" style="{border} {opacity}" alt="QR Code {i+1}">' # Use /file= for Gradio to serve static files
1175
+ viewport_html += f'<label><input type="checkbox" data-index="{i}" {"checked" if is_enabled else ""} onchange="updateEnabledStates(this)"> Enable</label>'
1176
  viewport_html += '</div>'
1177
  viewport_html += '</div>'
1178
 
 
1180
 
1181
  def process_inputs(urls, files, text, combine):
1182
  """Process all inputs and generate QR codes"""
1183
+ results = []
1184
+ processing_status_messages = []
1185
+
1186
+ url_processor = EnhancedURLProcessor()
1187
+ file_processor = EnhancedFileProcessor()
1188
 
1189
+ try:
1190
  # Process JSON input
1191
  if text and text.strip():
1192
  try:
1193
  json_data = json.loads(text)
1194
+ # Wrap direct JSON input in a dictionary for consistency with file/URL output structure
1195
+ results.append({
1196
+ 'source': 'json_input',
1197
+ 'extracted_data': json_data,
1198
+ 'timestamp': datetime.now().isoformat(),
1199
+ 'processing_notes': ['Parsed from direct JSON input.']
1200
+ })
1201
+ processing_status_messages.append("βœ… Successfully parsed direct JSON input.")
1202
  except json.JSONDecodeError as e:
1203
+ processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}")
1204
+ except Exception as e:
1205
+ processing_status_messages.append(f"❌ Error processing direct JSON input: {str(e)}")
1206
+
1207
 
1208
  # Process URLs
1209
  if urls and urls.strip():
 
1212
  for url in url_list:
1213
  validation = url_processor.validate_url(url)
1214
  if validation['is_valid']:
1215
+ processing_status_messages.append(f"🌐 Fetching URL: {url}...")
1216
+ content_result = url_processor.fetch_content(url)
1217
+ if content_result:
1218
+ results.append(content_result)
1219
+ processing_status_messages.append(f"βœ… Fetched and processed URL: {url}")
1220
+ else:
1221
+ processing_status_messages.append(f"❌ Failed to fetch/process URL: {url}")
1222
+ if validation['details'].get('final_url'):
1223
+ processing_status_messages[-1] += f" (Redirected to {validation['details']['final_url']})"
1224
+ else:
1225
+ processing_status_messages.append(f"⚠️ Skipping invalid URL: {url} ({validation['message']})")
1226
 
1227
  # Process files
1228
  if files:
1229
  for file in files:
1230
+ processing_status_messages.append(f"πŸ“ Processing file: {file.name}...")
1231
  file_results = file_processor.process_file(file)
1232
  if file_results:
1233
+ results.extend(file_results)
1234
+ processing_status_messages.append(f"βœ… Processed file: {file.name}")
1235
+ else:
1236
+ processing_status_messages.append(f"❌ Failed to process file: {file.name}")
1237
 
1238
  # Generate QR codes
1239
+ qr_paths = []
1240
+ final_json_output = None
1241
+
1242
  if results:
1243
+ # Use the collected results (list of dicts) for QR code generation
1244
  qr_paths = generate_qr_codes(results, combine)
1245
+ final_json_output = results # Show the structured data in the JSON output box
1246
+
1247
  if qr_paths:
1248
+ processing_status_messages.append(f"βœ… Successfully generated {len(qr_paths)} QR codes.")
 
 
 
 
1249
  else:
1250
+ processing_status_messages.append("❌ Failed to generate QR codes.")
1251
+
1252
  else:
1253
+ processing_status_messages.append("⚠️ No valid content collected from inputs.")
1254
+
1255
+
1256
  except Exception as e:
1257
+ logger.error(f"Overall processing error in process_inputs: {e}")
1258
+ processing_status_messages.append(f"❌ An unexpected error occurred during processing: {str(e)}")
1259
+
1260
+ return (
1261
+ final_json_output,
1262
+ [str(path) for path in qr_paths], # Gradio Gallery expects list of paths (strings)
1263
+ "\n".join(processing_status_messages) # Join status messages
1264
+ )
1265
 
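# Editor's note (illustrative, not part of the commit): one element of
# final_json_output roughly follows this shape; the keys mirror those assembled
# above and described in the docs below, the values here are hypothetical.
# {
#   "source": "url",
#   "url": "https://example.com/data.json",
#   "mime_type": "application/json",
#   "extracted_data": {"hello": "world"},
#   "processing_notes": []
# }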
1266
+ def on_qr_generation(qr_paths_list):
1267
+ # When QR codes are generated, update the state with the list of paths
1268
+ # and initialize the enabled_qr_codes state with all indices enabled
1269
+ num_qrs = len(qr_paths_list)
1270
+ initial_enabled_states = list(range(num_qrs))
1271
+ return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state
1272
+
1273
+
1274
+ # Link events
1275
+ example_btn.click(load_example, inputs=[], outputs=text_input)
1276
+ clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input]) # Clear all inputs
1277
 
1278
  process_btn.click(
1279
  process_inputs,
1280
  inputs=[url_input, file_input, text_input, combine_data],
1281
  outputs=[output_json, output_gallery, output_text]
1282
+ ).then( # Chain a .then() to update the QR paths state and trigger viewport update
1283
+ on_qr_generation,
1284
+ inputs=[output_gallery], # Get the list of paths from the gallery output
1285
+ outputs=[qr_code_paths, enabled_qr_codes] # Update the state variables
1286
+ )
1287
 
1288
+ # The viewport tab's select event will trigger update_viewport to render the grid
1289
  viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
1290
 
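# --- Editor's illustrative sketch, not part of the app2.py commit ---
# The click()/.then() chaining used above, in isolation: the first handler fills the
# gallery, then the chained handler reads that value and stores it in gr.State. All
# component and function names in this sketch are the editor's own.
import gradio as gr

with gr.Blocks() as demo:
    paths_state = gr.State([])
    gallery = gr.Gallery()
    run_btn = gr.Button("Run")

    def produce_images():
        return ["qr_1.png", "qr_2.png"]  # hypothetical file paths

    def remember(paths):
        return paths

    run_btn.click(produce_images, inputs=None, outputs=gallery).then(
        remember, inputs=gallery, outputs=paths_state
    )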
1291
  # Add helpful documentation
1292
  gr.Markdown("""
1293
  ### πŸš€ Features
1294
+
1295
+ - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type.
1296
+ - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log, etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
1297
+ - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
1298
+ - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives.
1299
+ - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification.
1300
+ - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item.
1301
+ - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data.
1302
+ - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
1303
+ - **Modern Design**: Clean, responsive interface with visual feedback.
1304
+
1305
+ ### πŸ’‘ Tips
1306
+
1307
+ 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type.
1308
+ 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
1309
+ 3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
1310
+ 4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
1311
+ 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
1312
+ 6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
1313
+ 7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.
1314
+
1315
+ ### 🎨 Output Details
1316
+
1317
+ - The "Processed Data" JSON will be a list of dictionaries. Each dictionary represents one processed input (URL or file).
1318
+ - Each item will have keys like `source`, `filename` (for files), `url` (for URLs), `mime_type`, `raw_content` (if readable), `extracted_data`, and `processing_notes`.
1319
+ - `extracted_data` will contain the parsed/extracted content, structured according to the input type (e.g., dictionary for JSON, text for documents, list of rows for CSV, dictionary with title/text/links for HTML).
1320
+ - `processing_notes` will list any issues encountered during extraction.
1321
+ - Generated QR codes are saved in the `output/qr_codes` directory.
1322
+
1323
+ ### βš™οΈ QR Code Viewport Instructions
1324
+
1325
+ 1. Navigate to the **QR Code Viewport** tab after generating QR codes.
1326
+ 2. The generated QR codes will be displayed in a grid based on their total count.
1327
+ 3. Use the checkboxes below each QR code to enable or disable it for visual selection. Enabled codes have a green border and full opacity.
1328
+ 4. This viewport is currently for visualization and selection *within the UI*; it doesn't change the generated files themselves. You would manually select which physical QR codes to scan based on this view.
1329
+
1330
+ """)
1331
  return interface
1332
 
1333
  def main():
 
1342
  # Launch with configuration
1343
  interface.launch(
1344
  share=False,
1345
+ debug=False, # Set to True for more verbose Gradio logging
1346
  show_error=True,
1347
  show_api=False
1348
  )
1349
  except Exception as e:
1350
  logger.error(f"Application startup error: {e}")
1351
+ # Optionally print a user-friendly message before exiting
1352
+ print(f"\nFatal Error: {e}\nCheck the logs for details.")
1353
+ raise # Re-raise the exception to ensure the process exits if launch fails
1354
 
1355
  if __name__ == "__main__":
1356
  main()