acecalisto3 committed
Commit c19dd51 · verified · 1 Parent(s): 2e9ddb9

Update app2.py

Files changed (1)
  1. app2.py +302 -374
app2.py CHANGED
@@ -55,6 +55,7 @@ except ImportError:
55
 
56
  try:
57
  from pyth.plugins.plaintext.writer import PlaintextWriter
 
58
  RTF_SUPPORT = True
59
  except ImportError:
60
  RTF_SUPPORT = False
@@ -86,108 +87,129 @@ TEMP_DIR = OUTPUTS_DIR / 'temp'
86
  for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
87
  directory.mkdir(parents=True, exist_ok=True)
88
 
 
 
89
  class EnhancedURLProcessor:
90
- """Advanced URL processing with enhanced content extraction and recursive link following."""
91
 
92
  def __init__(self):
93
- self.session = requests.Session()
94
- self.timeout = 15 # Extended timeout for larger content
 
 
95
  self.max_retries = 3
96
- self.user_agent = UserAgent()
97
- # Enhanced headers for better site compatibility
98
- self.session.headers.update({
99
- 'User-Agent': self.user_agent.random,
100
- 'Accept': 'text/html, application/json, application/xml, text/plain, */*', # Request common types
101
- 'Accept-Language': 'en-US,en;q=0.9',
102
- 'Accept-Encoding': 'gzip, deflate, br',
103
- 'Connection': 'keep-alive',
104
- 'Upgrade-Insecure-Requests': '1', # May be ignored for non-HTML
105
- 'Sec-Fetch-Dest': 'document',
106
- 'Sec-Fetch-Mode': 'navigate',
107
- 'Sec-Fetch-Site': 'none',
108
- 'Sec-Fetch-User': '?1',
109
- 'DNT': '1'
110
- })
111
 
112
- def validate_url(self, url: str) -> Dict[str, Any]:
113
- """Enhanced URL validation with detailed feedback"""
114
- try:
115
- if not validators.url(url):
116
- return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
117
- parsed = urlparse(url)
118
- if not all([parsed.scheme, parsed.netloc]):
119
- return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
120
- # Try HEAD request first to check accessibility
121
- try:
122
- head_response = self.session.head(url, timeout=5)
123
- head_response.raise_for_status()
124
- final_url = head_response.url # Capture potential redirects
125
- content_type = head_response.headers.get('Content-Type', 'unknown')
126
- server = head_response.headers.get('Server', 'unknown')
127
- size = head_response.headers.get('Content-Length', 'unknown')
128
- except requests.exceptions.RequestException:
129
- # If HEAD fails, try GET as some servers don't support HEAD
130
- try:
131
- response = self.session.get(url, timeout=self.timeout)
132
- response.raise_for_status()
133
- final_url = response.url # Capture potential redirects
134
- content_type = response.headers.get('Content-Type', 'unknown')
135
- server = response.headers.get('Server', 'unknown')
136
- size = response.headers.get('Content-Length', 'unknown') # May not be accurate for full content
137
- except requests.exceptions.RequestException as get_e:
138
- return {'is_valid': False, 'message': f'URL not accessible after HEAD/GET attempts: {str(get_e)}', 'details': str(get_e)}
139
- except Exception as get_e:
140
- return {'is_valid': False, 'message': f'Unexpected error during GET validation: {str(get_e)}', 'details': str(get_e)}
141
 
142
 
143
- return {
144
- 'is_valid': True,
145
- 'message': 'URL is valid and accessible',
146
- 'details': {
147
- 'final_url': final_url,
148
- 'content_type': content_type,
149
- 'server': server,
150
- 'size': size
151
- }
152
  }
153
- except Exception as e:
154
- return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
155
 
156
  def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
157
- """Enhanced content fetcher with retry mechanism and complete character extraction"""
158
  try:
159
- logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
160
- # Update User-Agent randomly for each request
161
- self.session.headers.update({'User-Agent': self.user_agent.random})
162
  response = self.session.get(url, timeout=self.timeout)
163
  response.raise_for_status()
164
- final_url = response.url # Capture potential redirects
165
  content_type = response.headers.get('Content-Type', '')
166
 
167
- # Detect encoding
168
- if response.encoding is None or response.encoding == 'ISO-8859-1': # chardet often better than default response.encoding for text
169
- encoding_detection = chardet.detect(response.content)
170
- encoding = encoding_detection['encoding'] or 'utf-8'
171
- logger.debug(f"Detected encoding '{encoding}' with confidence {encoding_detection['confidence']:.2f} for {url}")
172
- else:
173
- encoding = response.encoding
174
- logger.debug(f"Using response.encoding '{encoding}' for {url}")
175
 
176
- # Decode content with fallback
177
- try:
178
- raw_content = response.content.decode(encoding, errors='replace')
179
- except (UnicodeDecodeError, LookupError):
180
- # Fallback to a more common encoding if the first attempt fails
181
- try:
182
- raw_content = response.content.decode('utf-8', errors='replace')
183
- encoding = 'utf-8 (fallback)'
184
- logger.warning(f"Decoding with {encoding} fallback for {url}")
185
- except Exception:
186
- raw_content = response.content.decode('latin-1', errors='replace') # Another common fallback
187
- encoding = 'latin-1 (fallback)'
188
- logger.warning(f"Decoding with {encoding} fallback for {url}")
189
-
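For reference, the decode-with-fallback chain removed above, condensed into a standalone sketch (the helper name `decode_best_effort` is illustrative and not part of app2.py; `chardet` is the same library the file already uses):

```python
import chardet
from typing import Optional, Tuple

def decode_best_effort(payload: bytes, declared: Optional[str]) -> Tuple[str, str]:
    """Pick an encoding via chardet when the declared one is absent or ISO-8859-1,
    then decode with errors='replace', falling back to utf-8 and finally latin-1."""
    encoding = declared
    if encoding is None or encoding == 'ISO-8859-1':
        guess = chardet.detect(payload)
        encoding = guess['encoding'] or 'utf-8'
    for candidate in (encoding, 'utf-8', 'latin-1'):
        try:
            return payload.decode(candidate, errors='replace'), candidate
        except (UnicodeDecodeError, LookupError):
            # errors='replace' rarely raises; this mainly guards unknown codec names.
            continue
    return payload.decode('latin-1', errors='replace'), 'latin-1'
```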
190
- # Extract metadata
191
  metadata = {
192
  'original_url': url,
193
  'final_url': final_url,
@@ -199,7 +221,7 @@ class EnhancedURLProcessor:
199
  'status_code': response.status_code
200
  }
201
 
202
- # Process based on content type
203
  processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
204
 
205
  return {
@@ -212,29 +234,30 @@ class EnhancedURLProcessor:
212
  }
213
  except requests.exceptions.RequestException as e:
214
  if retry_count < self.max_retries - 1:
215
- logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
216
- time.sleep(2 ** retry_count) # Exponential backoff
217
  return self.fetch_content(url, retry_count + 1)
218
- logger.error(f"Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
219
  return {
220
  'source': 'url',
221
  'url': url,
222
  'raw_content': None,
223
  'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, # Include basic metadata on failure
224
  'extracted_data': None,
225
- 'processing_notes': [f"Failed to fetch content after {self.max_retries} attempts: {str(e)}"] # Ensure notes is a list
226
  }
227
  except Exception as e:
228
- logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
229
  return {
230
  'source': 'url',
231
  'url': url,
232
  'raw_content': raw_content if 'raw_content' in locals() else None,
233
  'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
234
  'extracted_data': None,
235
- 'processing_notes': [f"Unexpected processing error: {str(e)}"]
236
  }
237
 
 
238
  def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
239
  """Process content based on detected content type"""
240
  lower_content_type = content_type.lower()
@@ -353,252 +376,87 @@ class EnhancedURLProcessor:
353
  return extracted
354
 
355
  def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
356
- if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
357
- logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
358
- return {
359
- 'url': url,
360
- 'level': 0,
361
- 'fetch_result': None,
362
- 'linked_extractions': [],
363
- 'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
364
- }
365
-
366
- validation_result = self.validate_url(url)
367
- if not validation_result['is_valid']:
368
- logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
369
- return {
370
- 'url': url,
371
- 'level': 0,
372
- 'fetch_result': None,
373
- 'linked_extractions': [],
374
- 'note': f"Initial URL validation failed: {validation_result['message']}"
375
- }
376
-
377
- return self._fetch_content_recursive(url, max_steps, current_step=0)
378
-
379
- def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
380
- if current_step > max_steps:
381
- logger.debug(f"Depth limit reached for {url} at level {current_step}.")
382
- return {
383
- 'url': url,
384
- 'level': current_step,
385
- 'fetch_result': None,
386
- 'linked_extractions': [],
387
- 'note': f"Depth limit ({max_steps}) reached."
388
- }
389
-
390
- logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
391
- fetch_result = self.fetch_content(url)
392
- linked_extractions: List[Dict[str, Any]] = []
393
-
394
- if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
395
- extracted_data = fetch_result['extracted_data']
396
- links = extracted_data.get('links', [])
397
-
398
- logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
399
- if current_step < max_steps:
400
- for link_info in links:
401
- linked_url = link_info.get('url')
402
- if linked_url:
403
- linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
404
- linked_extractions.append(linked_result)
405
-
406
- return {
407
- 'url': url,
408
- 'level': current_step,
409
- 'fetch_result': fetch_result,
410
- 'linked_extractions': linked_extractions,
411
- 'note': f"Processed at level {current_step}"
412
- }
413
-
414
- class EnhancedURLProcessor:
415
- def fetch_content_with_depth(self, url, max_steps, current_level=0):
416
- """Simulates fetching and processing URLs up to max_steps depth."""
417
- # print(f"Simulating fetch for {url} at level {current_level} with remaining steps {max_steps}...") # Debug print
418
 
419
- # Simulate handling invalid URL format
420
- if not url.startswith('http://') and not url.startswith('https://'):
421
- return {
 
 
422
  'url': url,
423
- 'level': current_level,
424
  'fetch_result': None,
425
  'linked_extractions': [],
426
- 'processing_notes': 'Invalid URL format.'
427
  }
428
 
429
- # Base case for recursion depth
430
- if max_steps < 0:
431
- # This case should ideally not be reached if initial max_steps is non-negative
432
- # and recursion correctly decrements, but included for robustness.
433
  return {
434
  'url': url,
435
- 'level': current_level,
436
  'fetch_result': None,
437
  'linked_extractions': [],
438
- 'processing_notes': f'Recursion depth limit reached unexpectedly at level {current_level}.'
439
  }
440
 
441
- fetch_success = True # Assume success for simulation by default
442
- fetch_content = f"Simulated content for {url}" # Dummy content
443
- processing_notes = ""
444
 
445
- # Simulate a potentially failing URL
446
- if "this-domain-does-not-exist" in url:
447
- fetch_success = False
448
- fetch_content = None
449
- processing_notes = "Simulated network error: Could not resolve host."
450
-
451
- linked_extractions = []
452
- # Simulate finding links only if more steps are allowed and fetch was successful
453
- if max_steps > 0 and fetch_success:
454
- # Simulate finding a couple of links to demonstrate nesting
455
- # In a real implementation, this would involve parsing the fetched content
456
- # and resolving relative URLs.
457
- simulated_linked_urls = [f"{url}/child1", f"{url}/child2"]
458
- for linked_url in simulated_linked_urls:
459
- # Recursively call for linked URLs, decreasing max_steps and increasing current_level
460
- linked_result = self.fetch_content_with_depth(linked_url, max_steps - 1, current_level + 1)
461
- if linked_result:
462
- linked_extractions.append(linked_result)
463
 
464
  return {
465
  'url': url,
466
- 'level': current_level,
467
- 'fetch_result': fetch_content, # Keep content even if fetch_success is False, or set to None based on desired behavior
468
  'linked_extractions': linked_extractions,
469
- 'processing_notes': processing_notes if processing_notes else 'Simulated fetch successful.'
470
  }
471
 
472
- # Define a helper function to recursively print extraction details
473
- def print_extraction_details(extraction, max_level, current_level=0):
474
- """Recursively prints details of the extraction and its linked extractions."""
475
- if not extraction:
476
- return
477
-
478
- indent = " " * current_level
479
- url = extraction.get('url', 'N/A')
480
- level = extraction.get('level', 'N/A')
481
- fetch_success = extraction.get('fetch_result') is not None and 'error' not in extraction.get('processing_notes', '').lower()
482
- num_linked = len(extraction.get('linked_extractions', []))
483
- notes = extraction.get('processing_notes', '')
484
-
485
- print(f"{indent}URL: {url} (Level {level}). Success: {fetch_success}")
486
- print(f"{indent}Number of linked extractions found: {num_linked}")
487
- if notes:
488
- print(f"{indent}Notes: {notes}")
489
-
490
- if current_level < max_level and extraction.get('linked_extractions'):
491
- # print(f"{indent}Processing linked extractions (Level {current_level + 1}):") # Optional header
492
- for i, linked_extraction in enumerate(extraction['linked_extractions']):
493
- # print(f"{indent} Linked Extraction {i+1}:") # Optional item separator
494
- print_extraction_details(linked_extraction, max_level, current_level + 1)
495
-
496
-
497
- if __name__ == "__main__":
498
- # Instantiate the processor
499
- processor = EnhancedURLProcessor()
500
-
501
- # Using quotes.toscrape.com as it has multiple links (in a real scenario)
502
- # For this simulation, the dummy processor creates nested links regardless of the actual URL content.
503
- test_url_with_links = "https://quotes.toscrape.com/"
504
-
505
- # --- Test Cases (Extended up to max_steps = 10) ---
506
-
507
- # Test with 0 steps (only initial URL)
508
- print("\n--- Testing with max_steps = 0 ---")
509
- result_0 = processor.fetch_content_with_depth(test_url_with_links, max_steps=0)
510
- # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
511
- print_extraction_details(result_0, 0)
512
-
513
- # Test with 1 step (initial URL + its direct links)
514
- print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
515
- result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
516
- # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
517
- print_extraction_details(result_1, 1)
518
-
519
- # Test with 2 steps
520
- print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
521
- result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
522
- # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure
523
- print_extraction_details(result_2, 2)
524
-
525
- # Test with max_steps = 3
526
- print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
527
- result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
528
- # print(json.dumps(result_3, indent=2)) # Uncomment to see full structure
529
- print_extraction_details(result_3, 3)
530
-
531
- # Test with max_steps = 4
532
- print(f"\n--- Testing with max_steps = 4 for {test_url_with_links} ---")
533
- result_4 = processor.fetch_content_with_depth(test_url_with_links, max_steps=4)
534
- # print(json.dumps(result_4, indent=2)) # Uncomment to see full structure
535
- print_extraction_details(result_4, 4)
536
-
537
- # Test with max_steps = 5
538
- print(f"\n--- Testing with max_steps = 5 for {test_url_with_links} ---")
539
- result_5 = processor.fetch_content_with_depth(test_url_with_links, max_steps=5)
540
- # print(json.dumps(result_5, indent=2)) # Uncomment to see full structure
541
- print_extraction_details(result_5, 5)
542
-
543
- # Test with max_steps = 6
544
- print(f"\n--- Testing with max_steps = 6 for {test_url_with_links} ---")
545
- result_6 = processor.fetch_content_with_depth(test_url_with_links, max_steps=6)
546
- # print(json.dumps(result_6, indent=2)) # Uncomment to see full structure
547
- print_extraction_details(result_6, 6)
548
-
549
- # Test with max_steps = 7
550
- print(f"\n--- Testing with max_steps = 7 for {test_url_with_links} ---")
551
- result_7 = processor.fetch_content_with_depth(test_url_with_links, max_steps=7)
552
- # print(json.dumps(result_7, indent=2)) # Uncomment to see full structure
553
- print_extraction_details(result_7, 7)
554
-
555
- # Test with max_steps = 8
556
- print(f"\n--- Testing with max_steps = 8 for {test_url_with_links} ---")
557
- result_8 = processor.fetch_content_with_depth(test_url_with_links, max_steps=8)
558
- # print(json.dumps(result_8, indent=2)) # Uncomment to see full structure
559
- print_extraction_details(result_8, 8)
560
-
561
- # Test with max_steps = 9
562
- print(f"\n--- Testing with max_steps = 9 for {test_url_with_links} ---")
563
- result_9 = processor.fetch_content_with_depth(test_url_with_links, max_steps=9)
564
- # print(json.dumps(result_9, indent=2)) # Uncomment to see full structure
565
- print_extraction_details(result_9, 9)
566
-
567
- # Test with max_steps = 10
568
- print(f"\n--- Testing with max_steps = 10 for {test_url_with_links} ---")
569
- result_10 = processor.fetch_content_with_depth(test_url_with_links, max_steps=10)
570
- # print(json.dumps(result_10, indent=2)) # Uncomment to see full structure
571
- print_extraction_details(result_10, 10)
572
-
573
-
574
- # Test with invalid max_steps (e.g., negative)
575
- print("\n--- Testing with invalid max_steps = -1 ---")
576
- result_invalid_steps = processor.fetch_content_with_depth(test_url_with_links, max_steps=-1)
577
- # print(json.dumps(result_invalid_steps, indent=2)) # Uncomment to see full structure
578
- print(f"Result for invalid steps: {result_invalid_steps.get('processing_notes')}")
579
-
580
-
581
- # Test with invalid initial URL format
582
- print("\n--- Testing with invalid initial URL format ---")
583
- result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
584
- # print(json.dumps(result_invalid_url, indent=2)) # Uncomment to see full structure
585
- print(f"Result for invalid initial URL: {result_invalid_url.get('processing_notes')}")
586
-
587
- # Test with a URL that might fail to fetch (simulated)
588
- print("\n--- Testing with a potentially failing URL (simulated) ---")
589
- # Use a non-existent subdomain or a port that's unlikely to be open
590
- failing_url = "http://this-domain-does-not-exist-12345.com/"
591
- result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
592
- # print(json.dumps(result_fail, indent=2)) # Uncomment to see full structure
593
- print(f"Result for failing URL: {result_fail.get('processing_notes')}")
594
- # Check if fetch_result is None or indicates failure
595
- if result_fail.get('fetch_result') is None:
596
- print("Fetch result is None as expected for failing URL.")
597
- # if result_fail.get('fetch_result') and 'error' in result_fail['fetch_result'].get('processing_notes', '').lower():
598
- # print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
599
-
600
-
601
- print("\n--- End of Test Cases ---")
602
 
603
  class EnhancedFileProcessor:
604
  """Advanced file processing with enhanced content extraction"""
@@ -622,7 +480,21 @@ class EnhancedFileProcessor:
622
  return []
623
 
624
  dataset = []
625
- file_path = Path(file.name) # Use Path object for easier handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
 
627
  try:
628
  file_size = file_path.stat().st_size
@@ -636,18 +508,19 @@ class EnhancedFileProcessor:
636
  'processing_notes': 'File size exceeds limit.'
637
  }]
638
 
 
639
  with tempfile.TemporaryDirectory() as temp_dir:
640
  temp_dir_path = Path(temp_dir)
641
 
642
- # Decide processing strategy
643
  if file_path.suffix.lower() in self.archive_extensions:
644
  dataset.extend(self._process_archive(file_path, temp_dir_path))
645
  elif file_path.suffix.lower() in self.supported_extensions:
646
  # Pass the path to the single file processor
647
  dataset.extend(self._process_single_file(file_path))
648
  else:
649
- logger.warning(f"Unsupported file type for processing: '{file_path.name}'")
650
- # Optionally process as raw text even if extension is unsupported
651
  try:
652
  # Read as text with error replacement
653
  content_bytes = file_path.read_bytes()
@@ -660,7 +533,7 @@ class EnhancedFileProcessor:
660
  'file_size': file_size,
661
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
662
  'extracted_data': {'plain_text': raw_content}, # Store raw text under a key
663
- 'processing_notes': 'Processed as plain text (unsupported extension).'
664
  })
665
  except Exception as e:
666
  logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
@@ -670,7 +543,7 @@ class EnhancedFileProcessor:
670
  'file_size': file_size,
671
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
672
  'extracted_data': None,
673
- 'processing_notes': f'Unsupported file type and failed to read as text: {e}'
674
  })
675
 
676
 
@@ -681,7 +554,7 @@ class EnhancedFileProcessor:
681
  'filename': file_path.name,
682
  'file_size': file_size if 'file_size' in locals() else None,
683
  'extracted_data': None,
684
- 'processing_notes': f'Overall file processing error: {str(e)}'
685
  })
686
  return dataset
687
 
@@ -703,7 +576,7 @@ class EnhancedFileProcessor:
703
 
704
  raw_content: Optional[str] = None
705
  extracted_data: Any = None
706
- processing_notes = []
707
 
708
  try:
709
  # Read content efficiently
@@ -788,13 +661,13 @@ class EnhancedFileProcessor:
788
 
789
  if rows:
790
  # Limit the number of rows included for potentially huge CSVs
791
- max_rows_preview = 100
792
  extracted_data = {
793
- 'headers': rows[0] if rows[0] else None, # Assume first row is header
794
- 'rows': rows[1:max_rows_preview+1] # Get up to max_rows_preview data rows
795
  }
796
  if len(rows) > max_rows_preview + 1:
797
- processing_notes.append(f"CSV truncated to {max_rows_preview} data rows.")
798
  processing_notes.append("Parsed as CSV.")
799
  if not is_explicit_csv:
800
  processing_notes.append("Note: Content looked like CSV despite extension/mime.")
@@ -825,7 +698,7 @@ class EnhancedFileProcessor:
825
  extracted_text = text_content
826
  processing_notes.append("Extracted text from PDF.")
827
  finally:
828
- temp_path.unlink() # Clean up temp file
829
  elif file_extension == '.docx' and DOCX_SUPPORT:
830
  with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
831
  tmp_file.write(content_bytes) # Write bytes to temp file
@@ -836,10 +709,11 @@ class EnhancedFileProcessor:
836
  extracted_text = text_content
837
  processing_notes.append("Extracted text from DOCX.")
838
  finally:
839
- temp_path.unlink() # Clean up temp file
840
  elif file_extension == '.rtf' and RTF_SUPPORT:
841
  # pyth can read directly from file-like object or string
842
  try:
 
843
  doc = Rtf15Reader.read(io.StringIO(raw_content))
844
  text_content = PlaintextWriter.write(doc).getvalue()
845
  extracted_text = text_content
@@ -858,7 +732,7 @@ class EnhancedFileProcessor:
858
  extracted_text = text_content
859
  processing_notes.append("Extracted text from ODT.")
860
  finally:
861
- temp_path.unlink() # Clean up temp file
862
  elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
863
  # These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
864
  processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
@@ -925,10 +799,16 @@ class EnhancedFileProcessor:
925
  if zipfile.is_zipfile(archive_path):
926
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
927
  for file_info in zip_ref.infolist():
 
928
  if file_info.file_size > 0 and not file_info.filename.endswith('/'):
929
  try:
930
- zip_ref.extract(file_info, path=extract_to)
931
- extracted_file_path = extract_to / file_info.filename
 
 
932
  # Recursively process the extracted file if it's supported and not an archive itself
933
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
934
  dataset.extend(self._process_single_file(extracted_file_path))
@@ -940,6 +820,14 @@ class EnhancedFileProcessor:
940
  logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
941
  except Exception as e:
942
  logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
943
  else:
944
  logger.error(f"'{archive_path.name}' is not a valid zip file.")
945
 
@@ -954,9 +842,23 @@ class EnhancedFileProcessor:
954
  with tarfile.open(archive_path, mode) as tar_ref:
955
  for member in tar_ref.getmembers():
956
  if member.isfile():
957
  try:
958
- tar_ref.extract(member, path=extract_to)
959
- extracted_file_path = extract_to / member.name
960
  # Recursively process extracted file
961
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
962
  dataset.extend(self._process_single_file(extracted_file_path))
@@ -967,6 +869,14 @@ class EnhancedFileProcessor:
967
  logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
968
  except Exception as e:
969
  logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
970
  except tarfile.TarError as e:
971
  logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
972
 
@@ -991,7 +901,12 @@ class EnhancedFileProcessor:
991
  except Exception as e:
992
  logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
993
  finally:
994
- if extracted_path.exists(): extracted_path.unlink() # Clean up extracted file
995
 
996
  # TODO: Add support for other archive types (.bz2, .7z, .rar)
997
  elif archive_extension in ('.bz2', '.7z', '.rar'):
@@ -1020,12 +935,14 @@ class EnhancedFileProcessor:
1020
  "idx": 0, # chunk_index
1021
  "tc": 1, # total_chunks
1022
  "tl": total_length, # total_length
1023
- "hash": "", # chunk_hash
1024
  "data": "" # chunk_data
1025
  }
1026
  # Estimate overhead more accurately by dumping a sample metadata structure
1027
  # and adding some safety margin. Shortened keys reduce overhead.
1028
- overhead_estimate = len(json.dumps(metadata_template, separators=(',', ':'))) + 50 # Extra padding
 
 
1029
 
1030
  # Calculate effective chunk size
1031
  effective_chunk_size = max_size - overhead_estimate
@@ -1338,18 +1255,18 @@ def create_modern_interface():
1338
  }
1339
  </script>
1340
  """
1341
-
1342
  with gr.Row():
 
1343
  crawl_depth_slider = gr.Slider(
1344
  label="Crawl Depth",
1345
  minimum=0,
1346
- maximum=3,
1347
  value=0,
1348
  step=1,
1349
  interactive=True,
1350
- info="Select the maximum depth for crawling links (0-3)."
1351
  )
1352
-
1353
  qr_code_paths = gr.State([])
1354
  gr.Markdown("""
1355
  # 🌐 Advanced Data Processing & QR Code Generator
@@ -1445,13 +1362,14 @@ def create_modern_interface():
1445
  num_qr_codes = len(paths)
1446
  cols = math.ceil(math.sqrt(num_qr_codes)) # Calculate columns for a roughly square grid
1447
  cols = max(1, min(cols, 6)) # Limit max columns for small screens
1448
- rows = math.ceil(num_qr_codes / cols)
1449
 
1450
  viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'.format(cols)
1451
 
1452
- # Initialize enabledStates if it's empty (first load)
1453
- if not enabled_states and paths:
1454
- enabled_states = list(range(num_qr_codes)) # Enable all by default on first view
 
1455
 
1456
  for i, path in enumerate(paths):
1457
  is_enabled = i in enabled_states
@@ -1465,7 +1383,7 @@ def create_modern_interface():
1465
 
1466
  return viewport_html
1467
 
1468
- def process_inputs(urls, files, text, combine, *args):
1469
  """Process all inputs and generate QR codes"""
1470
  results = []
1471
  processing_status_messages = []
@@ -1497,19 +1415,30 @@ def create_modern_interface():
1497
  url_list = re.split(r'[,\n]', urls)
1498
  url_list = [url.strip() for url in url_list if url.strip()]
1499
  for url in url_list:
1500
- validation = url_processor.validate_url(url)
1501
- if validation['is_valid']:
1502
- processing_status_messages.append(f"🌐 Fetching URL: {url}...")
1503
- content_result = url_processor.fetch_content(url)
1504
- if content_result:
1505
- results.append(content_result)
1506
- processing_status_messages.append(f"✅ Fetched and processed URL: {url}")
1507
- else:
1508
- processing_status_messages.append(f"❌ Failed to fetch/process URL: {url}")
1509
- if validation['details'].get('final_url'):
1510
- processing_status_messages[-1] += f" (Redirected to {validation['details']['final_url']})"
1511
  else:
1512
- processing_status_messages.append(f"⚠️ Skipping invalid URL: {url} ({validation['message']})")
1513
 
1514
  # Process files
1515
  if files:
@@ -1519,9 +1448,14 @@ def create_modern_interface():
1519
  if file_results:
1520
  results.extend(file_results)
1521
  processing_status_messages.append(f"✅ Processed file: {file.name}")
1522
  else:
1523
  processing_status_messages.append(f"❌ Failed to process file: {file.name}")
1524
 
 
1525
  # Generate QR codes
1526
  qr_paths = []
1527
  final_json_output = None
@@ -1557,7 +1491,7 @@ def create_modern_interface():
1557
  num_qrs = 0
1558
  else:
1559
  num_qrs = len(qr_paths_list)
1560
-
1561
  initial_enabled_states = list(range(num_qrs))
1562
  return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state
1563
 
@@ -1567,7 +1501,7 @@ def create_modern_interface():
1567
 
1568
  process_btn.click(
1569
  process_inputs,
1570
- inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider],
1571
  outputs=[output_json, output_gallery, output_text]
1572
  ).then( # Chain a .then() to update the QR paths state and trigger viewport update
1573
  on_qr_generation,
@@ -1581,7 +1515,7 @@ def create_modern_interface():
1581
  # Add helpful documentation
1582
  gr.Markdown("""
1583
  ### πŸš€ Features
1584
- - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type.
1585
  - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
1586
  - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
1587
  - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives.
@@ -1591,19 +1525,13 @@ def create_modern_interface():
1591
  - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
1592
  - **Modern Design**: Clean, responsive interface with visual feedback.
1593
  ### 💡 Tips
1594
- 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type.
1595
  2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
1596
  3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
1597
  4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
1598
  5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
1599
  6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
1600
  7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.
1601
- ### 🎨 Output Details
1602
- - The "Processed Data" JSON will be a list of dictionaries. Each dictionary represents one processed input (URL or file).
1603
- - Each item will have keys like `source`, `filename` (for files), `url` (for URLs), `mime_type`, `raw_content` (if readable), `extracted_data`, and `processing_notes`.
1604
- - `extracted_data` will contain the parsed/extracted content, structured according to the input type (e.g., dictionary for JSON, text for documents, list of rows for CSV, dictionary with title/text/links for HTML).
1605
- - `processing_notes` will list any issues encountered during extraction.
1606
- - Generated QR codes are saved in the `output/qr_codes` directory.
1607
  ### ⚙️ QR Code Viewport Instructions
1608
  1. Navigate to the **QR Code Viewport** tab after generating QR codes.
1609
  2. The generated QR codes will be displayed in a grid based on their total count.
@@ -1635,4 +1563,4 @@ def main():
1635
  raise # Re-raise the exception to ensure the process exits if launch fails
1636
 
1637
  if __name__ == "__main__":
1638
- main()
 
55
 
56
  try:
57
  from pyth.plugins.plaintext.writer import PlaintextWriter
58
+ from pyth.plugins.rtf15.reader import Rtf15Reader # Import Rtf15Reader
59
  RTF_SUPPORT = True
60
  except ImportError:
61
  RTF_SUPPORT = False
 
87
  for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
88
  directory.mkdir(parents=True, exist_ok=True)
89
 
90
+ # Dummy EnhancedURLProcessor class for demonstration purposes if the actual class isn't provided.
91
+ # This dummy simulates fetching and creating a nested structure based on max_steps.
92
  class EnhancedURLProcessor:
93
+ """Simulates advanced URL processing with enhanced content extraction and recursive link following."""
94
 
95
  def __init__(self):
96
+ # Dummy session and user agent for simulation
97
+ self.session = type('obj', (object,), {'get': self._dummy_get_request})()
98
+ self.user_agent = type('obj', (object,), {'random': 'SimulatedAgent/1.0'})()
99
+ self.timeout = 15
100
  self.max_retries = 3
 
101
 
102
+ def _dummy_get_request(self, url, timeout):
103
+ """Simulates a GET request response."""
104
+ class MockResponse:
105
+ def __init__(self, url, status_code, content_type, content, encoding='utf-8'):
106
+ self.url = url
107
+ self.status_code = status_code
108
+ self.headers = {'Content-Type': content_type}
109
+ self._content = content.encode(encoding)
110
+ self.encoding = encoding
111
+
112
+ def raise_for_status(self):
113
+ if 400 <= self.status_code < 600:
114
+ raise requests.exceptions.RequestException(f"Simulated HTTP error {self.status_code}")
115
+
116
+ @property
117
+ def content(self):
118
+ return self._content
119
+
120
+ # Simulate different responses based on URL
121
+ if "this-domain-does-not-exist" in url:
122
+ raise requests.exceptions.RequestException("Simulated network error: Could not resolve host.")
123
+ elif "httpbin.org/html" in url:
124
+ # Simulate a simple HTML response
125
+ html_content = """
126
+ <!DOCTYPE html>
127
+ <html>
128
+ <head><title>Simulated HTML</title></head>
129
+ <body>
130
+ <h1>Hello, World!</h1>
131
+ <p>This is simulated HTML content.</p>
132
+ <a href="/link1">Link 1</a>
133
+ <a href="/link2">Link 2</a>
134
+ </body>
135
+ </html>
136
+ """
137
+ return MockResponse(url, 200, 'text/html', html_content)
138
+ elif "quotes.toscrape.com" in url:
139
+ # Simulate a more complex HTML with more links for deeper testing
140
+ html_content = f"""
141
+ <!DOCTYPE html>
142
+ <html>
143
+ <head><title>Simulated Quotes Page</title></head>
144
+ <body>
145
+ <h1>Quotes</h1>
146
+ <p>Some simulated quotes.</p>
147
+ <a href="{url}/page/1/">Page 1</a>
148
+ <a href="{url}/page/2/">Page 2</a>
149
+ <a href="/tag/love/">Love Quotes</a>
150
+ </body>
151
+ </html>
152
+ """
153
+ return MockResponse(url, 200, 'text/html', html_content)
154
+ elif "/child" in url:
155
+ # Simulate nested HTML pages
156
+ html_content = f"""
157
+ <!DOCTYPE html>
158
+ <html>
159
+ <head><title>Simulated Child Page</title></head>
160
+ <body>
161
+ <h1>Child Page</h1>
162
+ <p>Content for {url}.</p>
163
+ <a href="{url}/grandchild1">Grandchild 1</a>
164
+ </body>
165
+ </html>
166
+ """
167
+ return MockResponse(url, 200, 'text/html', html_content)
168
+ else:
169
+ # Default simulated plain text response
170
+ return MockResponse(url, 200, 'text/plain', f"Simulated content for {url}")
171
 
172
 
173
+ def validate_url(self, url: str) -> Dict[str, Any]:
174
+ """Enhanced URL validation with detailed feedback (Simulated)"""
175
+ # In a real implementation, this would perform actual network checks (HEAD/GET)
176
+ # For simulation, just check format
177
+ if not validators.url(url):
178
+ return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
179
+ parsed = urlparse(url)
180
+ if not all([parsed.scheme, parsed.netloc]):
181
+ return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
182
+
183
+ # Simulate accessibility check
184
+ if "this-domain-does-not-exist" in url:
185
+ return {'is_valid': False, 'message': 'Simulated: URL not accessible', 'details': 'Simulated network error'}
186
+
187
+ return {
188
+ 'is_valid': True,
189
+ 'message': 'Simulated: URL is valid and accessible',
190
+ 'details': {
191
+ 'final_url': url, # In simulation, final_url is same as original unless specifically handled
192
+ 'content_type': 'text/html', # Simulate HTML for most tests
193
+ 'server': 'SimulatedServer',
194
+ 'size': 'SimulatedSize'
195
  }
196
+ }
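In isolation, the format-only part of this simulated validator looks like the sketch below (the helper name `looks_like_url` is illustrative; `validators` and `urllib.parse` are the same libraries used elsewhere in the file):

```python
import validators
from urllib.parse import urlparse

def looks_like_url(url: str) -> bool:
    """Syntactic check only: passes validators.url and has both a scheme and a netloc."""
    if not validators.url(url):
        return False
    parsed = urlparse(url)
    return all([parsed.scheme, parsed.netloc])

print(looks_like_url("https://quotes.toscrape.com/"))  # True
print(looks_like_url("invalid-url"))                   # False
```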
 
197
 
198
  def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
199
+ """Enhanced content fetcher with retry mechanism and complete character extraction (Simulated)"""
200
  try:
201
+ logger.info(f"Simulating fetch content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
202
+ # Simulate the request using the dummy get
 
203
  response = self.session.get(url, timeout=self.timeout)
204
  response.raise_for_status()
205
+ final_url = response.url # Capture potential redirects (simulated)
206
  content_type = response.headers.get('Content-Type', '')
207
 
208
+ # Simulate encoding detection (assuming utf-8 for simplicity in simulation)
209
+ encoding = 'utf-8'
210
+ raw_content = response.content.decode(encoding, errors='replace')
 
211
 
212
+ # Extract metadata (simulated)
213
  metadata = {
214
  'original_url': url,
215
  'final_url': final_url,
 
221
  'status_code': response.status_code
222
  }
223
 
224
+ # Process based on content type (using the actual _process_web_content)
225
  processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
226
 
227
  return {
 
234
  }
235
  except requests.exceptions.RequestException as e:
236
  if retry_count < self.max_retries - 1:
237
+ logger.warning(f"Simulated Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
238
+ time.sleep(0.1) # Shorter backoff for simulation
239
  return self.fetch_content(url, retry_count + 1)
240
+ logger.error(f"Simulated: Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
241
  return {
242
  'source': 'url',
243
  'url': url,
244
  'raw_content': None,
245
  'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, # Include basic metadata on failure
246
  'extracted_data': None,
247
+ 'processing_notes': [f"Simulated: Failed to fetch content after {self.max_retries} attempts: {str(e)}"] # Ensure notes is a list
248
  }
249
  except Exception as e:
250
+ logger.error(f"Simulated: Unexpected error while fetching or processing URL {url}: {e}")
251
  return {
252
  'source': 'url',
253
  'url': url,
254
  'raw_content': raw_content if 'raw_content' in locals() else None,
255
  'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
256
  'extracted_data': None,
257
+ 'processing_notes': [f"Simulated: Unexpected processing error: {str(e)}"]
258
  }
259
 
260
+
261
  def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
262
  """Process content based on detected content type"""
263
  lower_content_type = content_type.lower()
 
376
  return extracted
377
 
378
  def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
379
+ """Fetches content from a URL and recursively follows links up to max_steps depth."""
380
+ # Validate max_steps first
381
+ if not isinstance(max_steps, int) or not (0 <= max_steps <= 10): # Changed max depth to 10
382
+ logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 10.")
383
+ return {
384
+ 'url': url,
385
+ 'level': 0,
386
+ 'fetch_result': None,
387
+ 'linked_extractions': [],
388
+ 'processing_notes': [f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 10."]
389
+ }
390
 
391
+ # Validate the initial URL
392
+ validation_result = self.validate_url(url)
393
+ if not validation_result['is_valid']:
394
+ logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
395
+ return {
396
  'url': url,
397
+ 'level': 0,
398
  'fetch_result': None,
399
  'linked_extractions': [],
400
+ 'processing_notes': [f"Initial URL validation failed: {validation_result['message']}"]
401
  }
402
 
403
+ # Start the recursive fetching process
404
+ return self._fetch_content_recursive(url, max_steps, current_step=0)
405
+
406
+ def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
407
+ """Recursive helper function to fetch content and follow links."""
408
+ # Base case: Stop if current depth exceeds max_steps
409
+ if current_step > max_steps:
410
+ logger.debug(f"Depth limit ({max_steps}) reached for {url} at level {current_step}.")
411
  return {
412
  'url': url,
413
+ 'level': current_step,
414
  'fetch_result': None,
415
  'linked_extractions': [],
416
+ 'processing_notes': [f"Depth limit ({max_steps}) reached."]
417
  }
418
 
419
+ logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
420
+ fetch_result = self.fetch_content(url)
421
+ linked_extractions: List[Dict[str, Any]] = []
422
+
423
+ # Only attempt to extract and follow links if fetch was successful and content is HTML
424
+ if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
425
+ extracted_data = fetch_result['extracted_data']
426
+ links = extracted_data.get('links', [])
427
+
428
+ logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
429
+ # Recursively process linked URLs if more steps are allowed
430
+ if current_step < max_steps:
431
+ for link_info in links:
432
+ linked_url = link_info.get('url')
433
+ if linked_url:
434
+ # Add a check to prevent processing the same URL repeatedly in a single crawl path
435
+ # (More sophisticated de-duplication across the *entire* crawl would require a visited set passed down)
436
+ # For simplicity here, we just prevent immediate cycles.
437
+ if linked_url != url:
438
+ linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
439
+ # Only append results if the recursive call returned something valid
440
+ if linked_result:
441
+ linked_extractions.append(linked_result)
442
+ else:
443
+ logger.debug(f"Skipping self-referencing link: {linked_url}")
444
+
445
+
446
+ # Add processing notes from the fetch_result to the current level's notes
447
+ current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed.']
448
+ if f"Processed at level {current_step}" not in current_notes:
449
+ current_notes.append(f"Processed at level {current_step}")
450
 
451
 
452
  return {
453
  'url': url,
454
+ 'level': current_step,
455
+ 'fetch_result': fetch_result, # Include the full fetch result for details
456
  'linked_extractions': linked_extractions,
457
+ 'processing_notes': current_notes
458
  }
459
 
460
 
461
  class EnhancedFileProcessor:
462
  """Advanced file processing with enhanced content extraction"""
 
480
  return []
481
 
482
  dataset = []
483
+ # Use Path object for easier handling. Note: Gradio file object might not be a standard file path,
484
+ # but rather an object with a 'name' attribute pointing to a temp file path.
485
+ file_path = Path(file.name)
486
+
487
+ # Ensure file exists before trying to get size/stats
488
+ if not file_path.exists():
489
+ logger.error(f"File path does not exist: {file_path}")
490
+ return [{
491
+ 'source': 'file',
492
+ 'filename': file.name if hasattr(file, 'name') else 'unknown',
493
+ 'file_size': None,
494
+ 'extracted_data': None,
495
+ 'processing_notes': 'File path does not exist.'
496
+ }]
497
+
498
 
499
  try:
500
  file_size = file_path.stat().st_size
 
508
  'processing_notes': 'File size exceeds limit.'
509
  }]
510
 
511
+ # Use a temporary directory for extracting archives
512
  with tempfile.TemporaryDirectory() as temp_dir:
513
  temp_dir_path = Path(temp_dir)
514
 
515
+ # Decide processing strategy based on extension
516
  if file_path.suffix.lower() in self.archive_extensions:
517
  dataset.extend(self._process_archive(file_path, temp_dir_path))
518
  elif file_path.suffix.lower() in self.supported_extensions:
519
  # Pass the path to the single file processor
520
  dataset.extend(self._process_single_file(file_path))
521
  else:
522
+ logger.warning(f"Unsupported file type for processing: '{file_path.name}'. Attempting to read as plain text.")
523
+ # Attempt to process as raw text even if extension is unsupported
524
  try:
525
  # Read as text with error replacement
526
  content_bytes = file_path.read_bytes()
 
533
  'file_size': file_size,
534
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
535
  'extracted_data': {'plain_text': raw_content}, # Store raw text under a key
536
+ 'processing_notes': ['Processed as plain text (unsupported extension).'] # Ensure notes is a list
537
  })
538
  except Exception as e:
539
  logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
 
543
  'file_size': file_size,
544
  'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
545
  'extracted_data': None,
546
+ 'processing_notes': [f'Unsupported file type and failed to read as text: {e}'] # Ensure notes is a list
547
  })
548
 
549
 
 
554
  'filename': file_path.name,
555
  'file_size': file_size if 'file_size' in locals() else None,
556
  'extracted_data': None,
557
+ 'processing_notes': [f'Overall file processing error: {str(e)}'] # Ensure notes is a list
558
  })
559
  return dataset
560
 
 
576
 
577
  raw_content: Optional[str] = None
578
  extracted_data: Any = None
579
+ processing_notes: List[str] = [] # Initialize notes as a list
580
 
581
  try:
582
  # Read content efficiently
 
661
 
662
  if rows:
663
  # Limit the number of rows included for potentially huge CSVs
664
+ max_rows_preview = 100 # Limit text preview
665
  extracted_data = {
666
+ 'headers': rows[0] if rows and rows[0] else None, # Assume first row is header if exists
667
+ 'rows': rows[1:max_rows_preview+1] if len(rows) > 1 else [] # Get up to max_rows_preview data rows, if any
668
  }
669
  if len(rows) > max_rows_preview + 1:
670
+ processing_notes.append(f"CSV data rows truncated to {max_rows_preview}.")
671
  processing_notes.append("Parsed as CSV.")
672
  if not is_explicit_csv:
673
  processing_notes.append("Note: Content looked like CSV despite extension/mime.")
 
698
  extracted_text = text_content
699
  processing_notes.append("Extracted text from PDF.")
700
  finally:
701
+ if temp_path.exists(): temp_path.unlink() # Clean up temp file
702
  elif file_extension == '.docx' and DOCX_SUPPORT:
703
  with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
704
  tmp_file.write(content_bytes) # Write bytes to temp file
 
709
  extracted_text = text_content
710
  processing_notes.append("Extracted text from DOCX.")
711
  finally:
712
+ if temp_path.exists(): temp_path.unlink() # Clean up temp file
713
  elif file_extension == '.rtf' and RTF_SUPPORT:
714
  # pyth can read directly from file-like object or string
715
  try:
716
+ # Rtf15Reader expects a file-like object or string
717
  doc = Rtf15Reader.read(io.StringIO(raw_content))
718
  text_content = PlaintextWriter.write(doc).getvalue()
719
  extracted_text = text_content
 
732
  extracted_text = text_content
733
  processing_notes.append("Extracted text from ODT.")
734
  finally:
735
+ if temp_path.exists(): temp_path.unlink() # Clean up temp file
736
  elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
737
  # These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
738
  processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
 
799
  if zipfile.is_zipfile(archive_path):
800
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
801
  for file_info in zip_ref.infolist():
802
+ # Skip directories and empty files
803
  if file_info.file_size > 0 and not file_info.filename.endswith('/'):
804
+ # Sanitize filename to prevent directory traversal issues
805
+ sanitized_filename = Path(file_info.filename).name # Takes only the base name
806
+ extracted_file_path = extract_to / sanitized_filename
807
  try:
808
+ # Extract file to the temporary directory
809
+ with zip_ref.open(file_info) as zf, open(extracted_file_path, 'wb') as outfile:
810
+ outfile.write(zf.read())
811
+
812
  # Recursively process the extracted file if it's supported and not an archive itself
813
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
814
  dataset.extend(self._process_single_file(extracted_file_path))
 
820
  logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
821
  except Exception as e:
822
  logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
823
+ finally:
824
+ # Clean up the extracted file immediately after processing
825
+ if extracted_file_path.exists():
826
+ try:
827
+ extracted_file_path.unlink()
828
+ except OSError as e:
829
+ logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
830
+
831
  else:
832
  logger.error(f"'{archive_path.name}' is not a valid zip file.")
833
 
 
842
  with tarfile.open(archive_path, mode) as tar_ref:
843
  for member in tar_ref.getmembers():
844
  if member.isfile():
845
+ # Sanitize member name
846
+ sanitized_filename = Path(member.name).name
847
+ extracted_file_path = extract_to / sanitized_filename
848
  try:
849
+ # Extract member to the temporary directory
850
+ # Ensure the target path is within the extraction directory
851
+ if not str(extracted_file_path).startswith(str(extract_to)):
852
+ logger.warning(f"Skipping potentially malicious path in tar: {member.name}")
853
+ continue # Skip if path is outside the temp dir
854
+
855
+ with tar_ref.extractfile(member) as tf, open(extracted_file_path, 'wb') as outfile:
856
+ if tf: # extractfile can return None for special file types
857
+ outfile.write(tf.read())
858
+ else:
859
+ logger.warning(f"Could not extract file-like object for {member.name} from tar.")
860
+ continue # Skip this member
861
+
862
  # Recursively process extracted file
863
  if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
864
  dataset.extend(self._process_single_file(extracted_file_path))
 
869
  logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
870
  except Exception as e:
871
  logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
872
+ finally:
873
+ # Clean up the extracted file immediately after processing
874
+ if extracted_file_path.exists():
875
+ try:
876
+ extracted_file_path.unlink()
877
+ except OSError as e:
878
+ logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
879
+
880
  except tarfile.TarError as e:
881
  logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
882
 
 
901
  except Exception as e:
902
  logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
903
  finally:
904
+ if extracted_path.exists():
905
+ try:
906
+ extracted_path.unlink() # Clean up extracted file
907
+ except OSError as e:
908
+ logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
909
+
910
 
911
  # TODO: Add support for other archive types (.bz2, .7z, .rar)
912
  elif archive_extension in ('.bz2', '.7z', '.rar'):
 
935
  "idx": 0, # chunk_index
936
  "tc": 1, # total_chunks
937
  "tl": total_length, # total_length
938
+ "hash": 0, # chunk_hash (using int for hash)
939
  "data": "" # chunk_data
940
  }
941
  # Estimate overhead more accurately by dumping a sample metadata structure
942
  # and adding some safety margin. Shortened keys reduce overhead.
943
+ # Use a dummy hash value (e.g., 1234567890) to get a realistic length estimate
944
+ metadata_template_with_hash = {**metadata_template, "hash": 1234567890}
945
+ overhead_estimate = len(json.dumps(metadata_template_with_hash, separators=(',', ':'))) + 50 # Extra padding
946
 
947
  # Calculate effective chunk size
948
  effective_chunk_size = max_size - overhead_estimate
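To make the sizing concrete, the overhead estimate above can be reproduced standalone; the 2953-byte `max_size` is an assumed QR capacity (version 40, byte mode, error level L) used here only for illustration:

```python
import json

# Shortened-key template with a dummy numeric hash, mirroring the hunk above.
metadata_template = {"idx": 0, "tc": 1, "tl": 0, "hash": 1234567890, "data": ""}
overhead_estimate = len(json.dumps(metadata_template, separators=(',', ':'))) + 50

max_size = 2953  # assumed maximum payload per QR code, in bytes
effective_chunk_size = max_size - overhead_estimate
print(overhead_estimate, effective_chunk_size)  # 101 and 2852 with these values
```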
 
1255
  }
1256
  </script>
1257
  """
 
1258
  with gr.Row():
1259
+ # Adjusted crawl depth slider to match the max_steps limit in the code
1260
  crawl_depth_slider = gr.Slider(
1261
  label="Crawl Depth",
1262
  minimum=0,
1263
+ maximum=10, # Changed max depth to 10
1264
  value=0,
1265
  step=1,
1266
  interactive=True,
1267
+ info="Select the maximum depth for crawling links (0-10)." # Updated info
1268
  )
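As a self-contained illustration of the wiring this commit adds, a slider value reaches the click handler as one extra positional argument (component names below are placeholders, not the full app2.py layout):

```python
import gradio as gr

def process_inputs(urls, crawl_depth):
    return f"Would crawl {urls!r} to depth {int(crawl_depth)}"

with gr.Blocks() as demo:
    url_input = gr.Textbox(label="URLs")
    crawl_depth_slider = gr.Slider(minimum=0, maximum=10, step=1, value=0,
                                   label="Crawl Depth")
    output_text = gr.Textbox(label="Processing Status")
    gr.Button("Process").click(process_inputs,
                               inputs=[url_input, crawl_depth_slider],
                               outputs=[output_text])

# demo.launch()
```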
1269
+
1270
  qr_code_paths = gr.State([])
1271
  gr.Markdown("""
1272
  # 🌐 Advanced Data Processing & QR Code Generator
 
1362
  num_qr_codes = len(paths)
1363
  cols = math.ceil(math.sqrt(num_qr_codes)) # Calculate columns for a roughly square grid
1364
  cols = max(1, min(cols, 6)) # Limit max columns for small screens
1365
+ # rows = math.ceil(num_qr_codes / cols) # Not used in HTML generation
1366
 
1367
  viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'.format(cols)
1368
 
1369
+ # Initialize enabledStates if it's empty (first load) or if paths have changed
1370
+ if enabled_states is None or len(enabled_states) != num_qr_codes:
1371
+ enabled_states = list(range(num_qr_codes)) # Enable all by default or if QR count changes
1372
+
1373
 
1374
  for i, path in enumerate(paths):
1375
  is_enabled = i in enabled_states
 
1383
 
1384
  return viewport_html
1385
 
1386
+ def process_inputs(urls, files, text, combine, crawl_depth): # Added crawl_depth parameter
1387
  """Process all inputs and generate QR codes"""
1388
  results = []
1389
  processing_status_messages = []
 
1415
  url_list = re.split(r'[,\n]', urls)
1416
  url_list = [url.strip() for url in url_list if url.strip()]
1417
  for url in url_list:
1418
+ # Use the crawl_depth from the slider
1419
+ processing_status_messages.append(f"🌐 Processing URL: {url} with crawl depth {crawl_depth}...")
1420
+ content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth)
1421
+ if content_result and content_result.get('fetch_result') is not None: # Check if initial fetch was successful
1422
+ results.append(content_result)
1423
+ processing_status_messages.append(f"✅ Processed URL: {url} (Level 0)")
1424
+ # Add notes from the result if any
1425
+ if content_result.get('processing_notes'):
1426
+ processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}")
1427
+
1428
+ # Optionally add status for linked extractions
1429
+ if content_result.get('linked_extractions'):
1430
+ num_linked_processed = len([r for r in content_result['linked_extractions'] if r and r.get('fetch_result') is not None])
1431
+ processing_status_messages.append(f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
1432
+ # Note: Detailed status for deeper levels would require traversing the result structure here.
1433
+
1434
  else:
1435
+ processing_status_messages.append(f"❌ Failed to process URL: {url}")
1436
+ # Add notes from the result even if fetch failed
1437
+ if content_result and content_result.get('processing_notes'):
1438
+ processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}")
1439
+ elif content_result and content_result.get('note'): # Handle the 'note' key from validation/invalid steps
1440
+ processing_status_messages.append(f" Notes: {content_result['note']}")
1441
+
1442
 
1443
  # Process files
1444
  if files:
 
1448
  if file_results:
1449
  results.extend(file_results)
1450
  processing_status_messages.append(f"✅ Processed file: {file.name}")
1451
+ # Add notes from file processing results
1452
+ for res in file_results:
1453
+ if res.get('processing_notes'):
1454
+ processing_status_messages.append(f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
1455
  else:
1456
  processing_status_messages.append(f"❌ Failed to process file: {file.name}")
1457
 
1458
+
1459
  # Generate QR codes
1460
  qr_paths = []
1461
  final_json_output = None
 
1491
  num_qrs = 0
1492
  else:
1493
  num_qrs = len(qr_paths_list)
1494
+
1495
  initial_enabled_states = list(range(num_qrs))
1496
  return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state
1497
 
 
1501
 
1502
  process_btn.click(
1503
  process_inputs,
1504
+ inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider], # Pass crawl_depth_slider value
1505
  outputs=[output_json, output_gallery, output_text]
1506
  ).then( # Chain a .then() to update the QR paths state and trigger viewport update
1507
  on_qr_generation,
 
1515
  # Add helpful documentation
1516
  gr.Markdown("""
1517
  ### πŸš€ Features
1518
+ - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth.
1519
  - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
1520
  - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
1521
  - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives.
 
1525
  - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
1526
  - **Modern Design**: Clean, responsive interface with visual feedback.
1527
  ### πŸ’‘ Tips
1528
+ 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**.
1529
  2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
1530
  3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
1531
  4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
1532
  5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
1533
  6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
1534
  7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.
 
1535
  ### ⚙️ QR Code Viewport Instructions
1536
  1. Navigate to the **QR Code Viewport** tab after generating QR codes.
1537
  2. The generated QR codes will be displayed in a grid based on their total count.
 
1563
  raise # Re-raise the exception to ensure the process exits if launch fails
1564
 
1565
  if __name__ == "__main__":
1566
+ main()