acecalisto3 committed on
Commit da7f558 · verified · 1 Parent(s): 8407f80

Update app.py

Files changed (1):
  1. app.py +129 -252
app.py CHANGED
@@ -11,6 +11,7 @@ from datetime import datetime
 from typing import List, Dict, Optional, Union, Tuple
 from pathlib import Path
 from urllib.parse import urlparse, urljoin
+
 import requests
 import validators
 import gradio as gr
@@ -21,8 +22,6 @@ from cleantext import clean
 import qrcode
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-import tarfile
-import gzip
 
 # Setup enhanced logging with more detailed formatting
 logging.basicConfig(
@@ -31,7 +30,8 @@ logging.basicConfig(
     handlers=[
         logging.StreamHandler(),
         logging.FileHandler('app.log', encoding='utf-8')
-    ])
+    ]
+)
 logger = logging.getLogger(__name__)
 
 # Ensure output directories exist with modern structure
@@ -43,13 +43,13 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
 
 class EnhancedURLProcessor:
     """Advanced URL processing with complete content extraction"""
-
+
     def __init__(self):
         self.session = requests.Session()
         self.timeout = 15  # Extended timeout for larger content
         self.max_retries = 3
         self.user_agent = UserAgent()
-
+
         # Enhanced headers for better site compatibility
         self.session.headers.update({
             'User-Agent': self.user_agent.random,
@@ -70,9 +70,11 @@ class EnhancedURLProcessor:
         try:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
+
             parsed = urlparse(url)
             if not all([parsed.scheme, parsed.netloc]):
                 return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
+
             # Try HEAD request first to check accessibility
             try:
                 head_response = self.session.head(url, timeout=5)
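
Note on the hunk above: some servers refuse HEAD outright (405 Method Not Allowed), so a validator built solely on `session.head` can report false negatives. A minimal sketch of a fallback, assuming the same `requests.Session`; the helper name is illustrative and not part of this commit:

```python
import requests

def head_or_get_status(session: requests.Session, url: str, timeout: int = 5) -> int:
    """Return the HTTP status, falling back to a streamed GET when HEAD is refused."""
    resp = session.head(url, timeout=timeout)
    if resp.status_code == 405:  # Method Not Allowed
        resp = session.get(url, timeout=timeout, stream=True)
        resp.close()  # we only need the status; skip downloading the body
    return resp.status_code
```
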
@@ -98,18 +100,19 @@
         """Enhanced content fetcher with retry mechanism and complete character extraction"""
         try:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
-
+
             # Update User-Agent randomly for each request
             self.session.headers.update({'User-Agent': self.user_agent.random})
-
+
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
-
+
             # Detect encoding
             if response.encoding is None:
                 encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
             else:
                 encoding = response.encoding
+
             # Decode content with fallback
             try:
                 raw_content = response.content.decode(encoding, errors='replace')
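
The detect-then-decode pattern in this hunk, reduced to a standalone sketch (assuming the `chardet` package): `response.encoding` is None when the Content-Type carries no charset, which is the case the branch guards against.

```python
import chardet
import requests

resp = requests.get("https://example.com", timeout=15)
encoding = resp.encoding or chardet.detect(resp.content)["encoding"] or "utf-8"
text = resp.content.decode(encoding, errors="replace")  # never raises
```
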
@@ -133,11 +136,13 @@
                 processed_content = self._process_html_content(raw_content, url)
             else:
                 processed_content = raw_content
+
             return {
                 'content': processed_content,
                 'raw_content': raw_content,
                 'metadata': metadata
             }
+
         except requests.exceptions.RequestException as e:
             if retry_count < self.max_retries - 1:
                 logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
@@ -153,7 +158,7 @@
         """Process HTML content while preserving all characters"""
         try:
             soup = BeautifulSoup(content, 'html.parser')
-
+
             # Convert relative URLs to absolute
             for tag in soup.find_all(['a', 'img', 'link', 'script']):
                 for attr in ['href', 'src']:
@@ -162,10 +167,12 @@
                             tag[attr] = urljoin(base_url, tag[attr])
                         except Exception:
                             pass
+
             # Extract all text content
             text_parts = []
             for element in soup.stripped_strings:
                 text_parts.append(str(element))
+
             return '\n'.join(text_parts)
         except Exception as e:
             logger.error(f"HTML processing error: {e}")
@@ -177,7 +184,7 @@ class EnhancedFileProcessor:
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
         self.max_file_size = max_file_size
         self.supported_extensions = {
-            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
+            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
             '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
             '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
             '.pdf', '.doc', '.docx', '.rtf', '.odt'
@@ -197,18 +204,17 @@ class EnhancedFileProcessor:
 
             with tempfile.TemporaryDirectory() as temp_dir:
                 temp_dir_path = Path(temp_dir)
-
+
                 # Handle different archive types
                 if self._is_archive(file.name):
                     dataset.extend(self._process_archive(file.name, temp_dir_path))
-                elif Path(file.name).suffix.lower() in self.supported_extensions:
-                    dataset.extend(self._process_single_file(file))
                 else:
-                    logger.warning(f"Unsupported file type: {file.name}")
+                    dataset.extend(self._process_single_file(file))
 
         except Exception as e:
             logger.error(f"Error processing file: {str(e)}")
             return []
+
         return dataset
 
     def _is_archive(self, filepath: str) -> bool:
@@ -218,14 +224,14 @@ class EnhancedFileProcessor:
         ])
 
     def _process_single_file(self, file) -> List[Dict]:
-        """Process a single file with enhanced character extraction and JSON handling"""
+        """Process a single file with enhanced character extraction"""
         try:
             file_stat = os.stat(file.name)
             file_size = file_stat.st_size
-
+
             # Initialize content storage
             content_parts = []
-
+
             # Process file in chunks for large files
             chunk_size = 10 * 1024 * 1024  # 10MB chunks
             with open(file.name, 'rb') as f:
@@ -233,7 +239,7 @@ class EnhancedFileProcessor:
                     chunk = f.read(chunk_size)
                     if not chunk:
                         break
-
+
                     # Detect encoding for each chunk
                     encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
                     try:
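
Calling `chardet.detect` on each 10MB chunk, as above, can misfire when a multi-byte sequence straddles a chunk boundary. A hedged alternative sketch using chardet's incremental detector, which accumulates evidence across chunks before deciding (not what this commit does):

```python
from chardet.universaldetector import UniversalDetector

def detect_file_encoding(path: str, chunk_size: int = 10 * 1024 * 1024) -> str:
    """Detect one encoding for the whole file instead of one per chunk."""
    detector = UniversalDetector()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    return detector.result["encoding"] or "utf-8"
```
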
@@ -246,43 +252,6 @@ class EnhancedFileProcessor:
             # Combine all chunks
             complete_content = ''.join(content_parts)
 
-            # Check if the content is valid JSON regardless of file extension
-            try:
-                if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
-                    # It's a JSON file by type or extension
-                    json_data = json.loads(complete_content)
-                    return [{
-                        'source': 'json_file',
-                        'filename': os.path.basename(file.name),
-                        'file_size': file_size,
-                        'mime_type': 'application/json',
-                        'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                        'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                        'content': json_data,  # Store the parsed JSON object
-                        'raw_content': complete_content,  # Store the original JSON string
-                        'timestamp': datetime.now().isoformat()
-                    }]
-                else:
-                    # Try to parse as JSON anyway
-                    try:
-                        json_data = json.loads(complete_content)
-                        # If we get here, it's valid JSON despite the extension
-                        return [{
-                            'source': 'json_content',
-                            'filename': os.path.basename(file.name),
-                            'file_size': file_size,
-                            'mime_type': 'application/json',
-                            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                            'content': json_data,  # Store the parsed JSON object
-                            'raw_content': complete_content,  # Store the original JSON string
-                            'timestamp': datetime.now().isoformat()
-                        }]
-                    except json.JSONDecodeError:
-                        logger.warning(f"File {file.name} is not valid JSON.")
-            except Exception as e:
-                logger.error(f"Error during JSON processing: {e}")
-
             return [{
                 'source': 'file',
                 'filename': os.path.basename(file.name),
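
The deleted branch amounts to this sniffing pattern (sketch only; `complete_content` is the variable assembled in the function above):

```python
import json

try:
    content = json.loads(complete_content)  # parsed object when the text is valid JSON
except json.JSONDecodeError:
    content = complete_content  # otherwise fall back to the raw string
```
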
@@ -311,46 +280,22 @@ class EnhancedFileProcessor:
                         if extracted_path.suffix.lower() in self.supported_extensions:
                             with open(extracted_path, 'rb') as f:
                                 dataset.extend(self._process_single_file(f))
-            # Handle TAR archives
-            elif archive_path.lower().endswith(('.tar', '.tar.gz', '.tgz')):
-                try:
-                    with tarfile.open(archive_path, 'r:*') as tar_ref:
-                        for member in tar_ref.getmembers():
-                            if member.isfile():
-                                extracted_path = extract_to / member.name
-                                tar_ref.extract(member, path=extract_to)
-                                if extracted_path.suffix.lower() in self.supported_extensions:
-                                    with open(extracted_path, 'rb') as f:
-                                        dataset.extend(self._process_single_file(f))
-                except tarfile.TarError as e:
-                    logger.error(f"Error processing TAR archive: {e}")
-            # Handle GZIP archives (single file)
-            elif archive_path.lower().endswith('.gz'):
-                extracted_path = extract_to / Path(archive_path).stem
-                try:
-                    with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
-                        outfile.write(gz_file.read())
-                    if extracted_path.suffix.lower() in self.supported_extensions:
-                        with open(extracted_path, 'rb') as f:
-                            dataset.extend(self._process_single_file(f))
-                except gzip.GzipFile as e:
-                    logger.error(f"Error processing GZIP archive: {e}")
-            # TODO: Add support for other archive types (.bz2, .7z, .rar) - may require external libraries
-            elif archive_path.lower().endswith(('.bz2', '.7z', '.rar')):
-                logger.warning(f"Support for {Path(archive_path).suffix} archives is not yet fully implemented.")
 
+            # TODO: Add support for other archive types (tar, 7z, etc.)
+
         except Exception as e:
             logger.error(f"Archive processing error: {e}")
+
         return dataset
 
     def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
-        """Enhanced data chunking with sequence metadata"""
         try:
-            # Convert data to JSON string
+            # Convert data to JSON bytes
             json_str = json.dumps(data, ensure_ascii=False)
-            total_length = len(json_str)
-
-            # Calculate overhead for metadata
+            json_bytes = json_str.encode('utf-8')
+            total_length = len(json_bytes)
+
+            # Calculate metadata overhead in bytes
             metadata_template = {
                 "chunk_index": 0,
                 "total_chunks": 1,
@@ -358,32 +303,48 @@ class EnhancedFileProcessor:
                 "chunk_hash": "",
                 "data": ""
             }
-            overhead = len(json.dumps(metadata_template)) + 20  # Extra padding for safety
-
-            # Calculate effective chunk size
-            effective_chunk_size = max_size - overhead
-
-            if total_length <= effective_chunk_size:
-                # Data fits in one chunk
+            overhead_bytes = len(json.dumps(metadata_template).encode('utf-8')) + 20  # Add padding
+
+            effective_chunk_size = max_size - overhead_bytes
+
+            if effective_chunk_size <= 0:
+                raise ValueError("Max size is too small after accounting for metadata overhead")
+
+            chunks = []
+            start = 0
+            while start < total_length:
+                end = start + effective_chunk_size
+                # Ensure valid Unicode by decoding
+                chunk_str = json_bytes[start:end].decode('utf-8', errors='replace')
                 chunk = {
-                    "chunk_index": 0,
-                    "total_chunks": 1,
+                    "chunk_index": len(chunks),
+                    "total_chunks": -1,  # To be set later
                     "total_length": total_length,
-                    "chunk_hash": hash(json_str) & 0xFFFFFFFF,  # 32-bit hash
-                    "data": json_str
+                    "chunk_hash": hash(chunk_str) & 0xFFFFFFFF,
+                    "data": chunk_str
                 }
-                return [chunk]
-
+                chunks.append(chunk)
+                start = end
+
+            # Update total_chunks in each chunk
+            for i, chunk in enumerate(chunks):
+                chunk["total_chunks"] = len(chunks)
+
+            return chunks
+        except Exception as e:
+            logger.error(f"Error chunking data: {e}")
+            return []
+
             # Calculate number of chunks needed
             num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
             chunk_size = -(-total_length // num_chunks)  # Even distribution
-
+
             chunks = []
             for i in range(num_chunks):
                 start_idx = i * chunk_size
                 end_idx = min(start_idx + chunk_size, total_length)
                 chunk_data = json_str[start_idx:end_idx]
-
+
                 chunk = {
                     "chunk_index": i,
                     "total_chunks": num_chunks,
@@ -392,56 +353,58 @@
                     "data": chunk_data
                 }
                 chunks.append(chunk)
-
+
             return chunks
+
         except Exception as e:
             logger.error(f"Error chunking data: {e}")
             return []
 
-    def generate_stylish_qr(data: Union[str, Dict],
-                            filename: str,
-                            size: int = 10,
-                            border: int = 4,
-                            fill_color: str = "#000000",
-                            back_color: str = "#FFFFFF") -> str:
+    def generate_stylish_qr(data: Union[str, Dict],
+                            filename: str,
+                            size: int = 10,
+                            border: int = 4,
+                            fill_color: str = "#000000",
+                            back_color: str = "#FFFFFF") -> str:
         """Generate a stylish QR code with enhanced visual appeal"""
         try:
             qr = qrcode.QRCode(
                 version=None,
-                error_correction=qrcode.constants.ERROR_CORRECT_H,
+                error_correction=qrcode.constants.ERROR_CORRECT_M,
                 box_size=size,
                 border=border
             )
-
+
             # Add data to QR code
             if isinstance(data, dict):
                 qr.add_data(json.dumps(data, ensure_ascii=False))
             else:
                 qr.add_data(data)
-
+
             qr.make(fit=True)
-
+
             # Create QR code image with custom colors
             qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
-
+
             # Convert to RGBA for transparency support
             qr_image = qr_image.convert('RGBA')
-
+
             # Add subtle gradient overlay
             gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
             draw = ImageDraw.Draw(gradient)
             for i in range(qr_image.width):
                 alpha = int(255 * (1 - i/qr_image.width) * 0.1)  # 10% maximum opacity
                 draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
-
+
             # Combine images
             final_image = Image.alpha_composite(qr_image, gradient)
-
+
             # Save the image
             output_path = QR_CODES_DIR / filename
             final_image.save(output_path, quality=95)
-
+
             return str(output_path)
+
         except Exception as e:
             logger.error(f"QR generation error: {e}")
             return ""
@@ -451,7 +414,7 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
     try:
         file_processor = EnhancedFileProcessor()
         paths = []
-
+
         if combined:
             # Process combined data
             chunks = file_processor.chunk_data(data)
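
A capacity caveat on the hunks above: `max_size=2953` matches the byte capacity of a version-40 QR code at error-correction level L, but `generate_stylish_qr` now uses level M, whose version-40 byte capacity is lower (about 2331 bytes), so chunks near the default limit may overflow. A rough check with the `qrcode` library, as an assumption-laden sketch:

```python
import qrcode

qr = qrcode.QRCode(version=None, error_correction=qrcode.constants.ERROR_CORRECT_M)
qr.add_data("x" * 2300)  # byte-mode payload near the level-M limit
qr.make(fit=True)
print(qr.version)  # expect 40; much larger payloads raise DataOverflowError
```
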
@@ -492,126 +455,15 @@
             )
             if qr_path:
                 paths.append(qr_path)
-            return paths
+
+        return paths
     except Exception as e:
         logger.error(f"QR code generation error: {e}")
         return []
 
-def create_qr_visualizer(qr_paths, metadata=None):
-    """Create an interactive visualization of sequenced QR codes"""
-    if not qr_paths:
-        return None
-
-    # Extract metadata from QR codes if not provided
-    if metadata is None:
-        metadata = []
-        for path in qr_paths:
-            try:
-                img = Image.open(path)
-                qr = qrcode.QRCode()
-                data = qrcode.image.pil.PilImage.get_qr_data(img)
-                if data:
-                    metadata.append(json.loads(data))
-                else:
-                    # If can't extract, add placeholder
-                    metadata.append({"chunk_index": len(metadata), "total_chunks": len(qr_paths)})
-            except Exception as e:
-                logger.error(f"Error extracting QR metadata: {e}")
-                metadata.append({"chunk_index": len(metadata), "total_chunks": len(qr_paths)})
-
-    # Compute optimal grid size
-    total_codes = len(qr_paths)
-    grid_size = math.ceil(math.sqrt(total_codes))
-
-    # Create a composite image with placeholders for disabled QR codes
-    def create_composite(enabled_indices):
-        # Size calculations for the grid
-        qr_size = 200  # Size of each QR code in pixels
-        padding = 20  # Padding between QR codes
-
-        # Create grid for visualization
-        grid_width = grid_size * (qr_size + padding) + padding
-        grid_height = grid_size * (qr_size + padding) + padding
-
-        # Create a white background image
-        composite = Image.new('RGBA', (grid_width, grid_height), (255, 255, 255, 255))
-        draw = ImageDraw.Draw(composite)
-
-        # Load and place QR codes on the grid
-        for i, path in enumerate(qr_paths):
-            # Calculate grid position
-            row = i // grid_size
-            col = i % grid_size
-
-            # Calculate pixel position
-            x = col * (qr_size + padding) + padding
-            y = row * (qr_size + padding) + padding
-
-            if i in enabled_indices:
-                try:
-                    # Load and resize QR code
-                    qr_img = Image.open(path)
-                    qr_img = qr_img.resize((qr_size, qr_size), Image.Resampling.LANCZOS)
-
-                    # Extract metadata for this QR
-                    meta = metadata[i] if i < len(metadata) else {}
-                    chunk_index = meta.get("chunk_index", i)
-                    total_chunks = meta.get("total_chunks", len(qr_paths))
-
-                    # Add visual indicator for sequence position
-                    sequence_indicator = Image.new('RGBA', (qr_size, 30), (26, 54, 93, 200))  # Dark blue
-                    draw_indicator = ImageDraw.Draw(sequence_indicator)
-                    draw_indicator.text((10, 5), f"#{chunk_index+1} of {total_chunks}", fill=(255, 255, 255))
-
-                    # Combine QR with indicator
-                    qr_with_indicator = Image.new('RGBA', (qr_size, qr_size + 30))
-                    qr_with_indicator.paste(qr_img, (0, 0))
-                    qr_with_indicator.paste(sequence_indicator, (0, qr_size), sequence_indicator)
-
-                    # Paste onto composite
-                    composite.paste(qr_with_indicator, (x, y))
-
-                    # Draw connection lines based on sequence
-                    if i > 0:
-                        prev_x = (col - 1) * (qr_size + padding) + padding if col > 0 else x
-                        prev_y = (row * (qr_size + padding)) + padding
-                        draw.line([(prev_x + qr_size // 2, prev_y + qr_size), (x + qr_size // 2, y)], fill=(0, 0, 0, 255), width=2)
-
-        return composite
-
-    # Create a toggleable interface for enabling/disabling QR codes
-    enabled_indices = list(range(total_codes))  # Start with all enabled
-    def toggle_qr(index):
-        if index in enabled_indices:
-            enabled_indices.remove(index)
-        else:
-            enabled_indices.append(index)
-        return create_composite(enabled_indices)
-
-    # Create the initial composite image
-    initial_composite = create_composite(enabled_indices)
-
-    # Display the composite image
-    plt.figure(figsize=(10, 10))
-    plt.imshow(initial_composite)
-    plt.axis('off')
-    plt.show()
-
-    return toggle_qr
-
-# Integrate the visualizer into the main application
-def visualize_qr_codes(qr_paths):
-    """Visualize the generated QR codes with enable/disable functionality"""
-    toggle_function = create_qr_visualizer(qr_paths)
-    return toggle_function
-
-# Add a button in the Gradio interface to trigger visualization
-visualize_btn = gr.Button("🔍 Visualize QR Codes")
-visualize_btn.click(visualize_qr_codes, inputs=output_gallery, outputs=None)
-
 def create_modern_interface():
     """Create a modern and visually appealing Gradio interface"""
-
+
     # Modern CSS styling
     css = """
     /* Modern color scheme */
@@ -624,6 +476,7 @@ def create_modern_interface():
         --error-color: #f56565;
         --warning-color: #ed8936;
     }
+
     /* Container styling */
     .container {
         max-width: 1200px;
@@ -633,6 +486,7 @@ def create_modern_interface():
         border-radius: 1rem;
         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
     }
+
     /* Component styling */
     .input-container {
         background-color: white;
@@ -641,6 +495,7 @@ def create_modern_interface():
         border: 1px solid #e2e8f0;
         margin-bottom: 1rem;
     }
+
     /* Button styling */
     .primary-button {
         background-color: var(--primary-color);
@@ -651,19 +506,23 @@ def create_modern_interface():
         cursor: pointer;
         transition: all 0.2s;
     }
+
     .primary-button:hover {
         background-color: var(--accent-color);
         transform: translateY(-1px);
     }
+
     /* Status messages */
     .status {
         padding: 1rem;
         border-radius: 0.375rem;
         margin: 1rem 0;
     }
+
     .status.success { background-color: #f0fff4; color: var(--success-color); }
     .status.error { background-color: #fff5f5; color: var(--error-color); }
     .status.warning { background-color: #fffaf0; color: var(--warning-color); }
+
     /* Gallery styling */
     .gallery {
         display: grid;
@@ -674,22 +533,27 @@ def create_modern_interface():
         border-radius: 0.5rem;
         border: 1px solid #e2e8f0;
     }
+
     .gallery img {
         width: 100%;
         height: auto;
         border-radius: 0.375rem;
         transition: transform 0.2s;
     }
+
     .gallery img:hover {
         transform: scale(1.05);
     }
     """
+
     # Create interface with modern design
     with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
         gr.Markdown("""
         # 🌐 Advanced Data Processing & QR Code Generator
+
         Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
         """)
+
         with gr.Tab("📝 URL Processing"):
             url_input = gr.Textbox(
                 label="Enter URLs (comma or newline separated)",
@@ -697,12 +561,14 @@ def create_modern_interface():
                 placeholder="https://example1.com\nhttps://example2.com",
                 value=""
             )
+
         with gr.Tab("📁 File Input"):
             file_input = gr.File(
                 label="Upload Files",
-                file_types=["*"],  # Accept all file types
+                file_types=["text/*", "application/zip"],  # Allow all text files and ZIP
                 file_count="multiple"
             )
+
         with gr.Tab("📋 JSON Input"):
             text_input = gr.TextArea(
                 label="Direct JSON Input",
@@ -710,9 +576,11 @@ def create_modern_interface():
                 placeholder="Paste your JSON data here...",
                 value=""
             )
+
         with gr.Row():
             example_btn = gr.Button("📝 Load Example", variant="secondary")
             clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
         with gr.Row():
             combine_data = gr.Checkbox(
                 label="Combine all data into sequence",
@@ -723,6 +591,7 @@ def create_modern_interface():
             "🔄 Process & Generate QR",
             variant="primary"
         )
+
         # Output components
         output_json = gr.JSON(label="Processed Data")
         output_gallery = gr.Gallery(
@@ -791,6 +660,7 @@ def create_modern_interface():
             if urls and urls.strip():
                 url_list = re.split(r'[,\n]', urls)
                 url_list = [url.strip() for url in url_list if url.strip()]
+
                 for url in url_list:
                     validation = url_processor.validate_url(url)
                     if validation['is_valid']:
@@ -823,6 +693,7 @@ def create_modern_interface():
                     return None, [], "❌ Failed to generate QR codes"
             else:
                 return None, [], "⚠️ No valid content to process"
+
         except Exception as e:
             logger.error(f"Processing error: {e}")
             return None, [], f"❌ Error: {str(e)}"
@@ -839,22 +710,28 @@ def create_modern_interface():
         # Add helpful documentation
         gr.Markdown("""
         ### 🚀 Features
-        - **Complete URL Scraping**: Extracts every character from web pages
-        - **Advanced File Processing**: Full content extraction from various text-based files and common archives. Supports flexible JSON handling.
-        - **Smart JSON Handling**: Processes any size JSON with automatic chunking, either via direct input or file upload.
-        - **Sequential QR Codes**: Maintains data integrity across multiple codes
-        - **Modern Design**: Clean, responsive interface with visual feedback
-        ### 💡 Tips
-        1. **URLs**: Enter multiple URLs separated by commas or newlines
-        2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and JSON files.
-        3. **JSON**: Use the example button to see the expected format or upload a .json file. The system will also try to detect JSON content in other file types.
-        4. **QR Codes**: Choose whether to combine data into sequential codes
-        5. **Processing**: Monitor the status for real-time feedback
-        ### 🎨 Output
-        - Generated QR codes are saved in the `output/qr_codes` directory
-        - Each QR code contains metadata for proper sequencing
-        - Hover over QR codes in the gallery to see details
-        """)
+
+        - **Complete URL Scraping**: Extracts every character from web pages
+        - **Advanced File Processing**: Full content extraction from text files and archives
+        - **Smart JSON Handling**: Processes any size JSON with automatic chunking
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes
+        - **Modern Design**: Clean, responsive interface with visual feedback
+
+        ### 💡 Tips
+
+        1. **URLs**: Enter multiple URLs separated by commas or newlines
+        2. **Files**: Upload text files or ZIP archives containing text files
+        3. **JSON**: Use the example button to see the expected format
+        4. **QR Codes**: Choose whether to combine data into sequential codes
+        5. **Processing**: Monitor the status for real-time feedback
+
+        ### 🎨 Output
+
+        - Generated QR codes are saved in the `output/qr_codes` directory
+        - Each QR code contains metadata for proper sequencing
+        - Hover over QR codes in the gallery to see details
+        """)
+
         return interface
 
 def main():
@@ -865,7 +742,7 @@ def main():
 
         # Create and launch interface
         interface = create_modern_interface()
-
+
         # Launch with configuration
         interface.launch(
             share=False,
@@ -878,4 +755,4 @@ def main():
         raise
 
 if __name__ == "__main__":
-    main()
+    main()