Update app2.py
app2.py
CHANGED
@@ -55,6 +55,7 @@ except ImportError:

try:
    from pyth.plugins.plaintext.writer import PlaintextWriter
    RTF_SUPPORT = True
except ImportError:
    RTF_SUPPORT = False
@@ -86,108 +87,129 @@ TEMP_DIR = OUTPUTS_DIR / 'temp'
for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

class EnhancedURLProcessor:
-    """

    def __init__(self):
-        self.
        self.max_retries = 3
-        self.user_agent = UserAgent()
-        # Enhanced headers for better site compatibility
-        self.session.headers.update({
-            'User-Agent': self.user_agent.random,
-            'Accept': 'text/html, application/json, application/xml, text/plain, */*', # Request common types
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1', # May be ignored for non-HTML
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'none',
-            'Sec-Fetch-User': '?1',
-            'DNT': '1'
-        })

-    def
-        """


        }
-            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}

    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
-        """Enhanced content fetcher with retry mechanism and complete character extraction"""
        try:
-            logger.info(f"
-            #
-            self.session.headers.update({'User-Agent': self.user_agent.random})
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
-            final_url = response.url # Capture potential redirects
            content_type = response.headers.get('Content-Type', '')

-            #
-                encoding = encoding_detection['encoding'] or 'utf-8'
-                logger.debug(f"Detected encoding '{encoding}' with confidence {encoding_detection['confidence']:.2f} for {url}")
-            else:
-                encoding = response.encoding
-                logger.debug(f"Using response.encoding '{encoding}' for {url}")

-            #
-            try:
-                raw_content = response.content.decode(encoding, errors='replace')
-            except (UnicodeDecodeError, LookupError):
-                # Fallback to a more common encoding if the first attempt fails
-                try:
-                    raw_content = response.content.decode('utf-8', errors='replace')
-                    encoding = 'utf-8 (fallback)'
-                    logger.warning(f"Decoding with {encoding} fallback for {url}")
-                except Exception:
-                    raw_content = response.content.decode('latin-1', errors='replace') # Another common fallback
-                    encoding = 'latin-1 (fallback)'
-                    logger.warning(f"Decoding with {encoding} fallback for {url}")
-            # Extract metadata
            metadata = {
                'original_url': url,
                'final_url': final_url,
@@ -199,7 +221,7 @@ class EnhancedURLProcessor:
                'status_code': response.status_code
            }

-            # Process based on content type
            processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)

            return {
@@ -212,29 +234,30 @@
            }
        except requests.exceptions.RequestException as e:
            if retry_count < self.max_retries - 1:
-                logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
-                time.sleep(
                return self.fetch_content(url, retry_count + 1)
-            logger.error(f"Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
            return {
                'source': 'url',
                'url': url,
                'raw_content': None,
                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, # Include basic metadata on failure
                'extracted_data': None,
-                'processing_notes': [f"Failed to fetch content after {self.max_retries} attempts: {str(e)}"] # Ensure notes is a list
            }
        except Exception as e:
-            logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
            return {
                'source': 'url',
                'url': url,
                'raw_content': raw_content if 'raw_content' in locals() else None,
                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
                'extracted_data': None,
-                'processing_notes': [f"Unexpected processing error: {str(e)}"]
            }

    def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
        """Process content based on detected content type"""
        lower_content_type = content_type.lower()
@@ -353,252 +376,87 @@ class EnhancedURLProcessor:
        return extracted

    def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
-        if not validation_result['is_valid']:
-            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
-            return {
-                'url': url,
-                'level': 0,
-                'fetch_result': None,
-                'linked_extractions': [],
-                'note': f"Initial URL validation failed: {validation_result['message']}"
-            }
-        return self._fetch_content_recursive(url, max_steps, current_step=0)
-    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
-        if current_step > max_steps:
-            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
-            return {
-                'url': url,
-                'level': current_step,
-                'fetch_result': None,
-                'linked_extractions': [],
-                'note': f"Depth limit ({max_steps}) reached."
-            }
-        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
-        fetch_result = self.fetch_content(url)
-        linked_extractions: List[Dict[str, Any]] = []
-        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
-            extracted_data = fetch_result['extracted_data']
-            links = extracted_data.get('links', [])
-            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
-            if current_step < max_steps:
-                for link_info in links:
-                    linked_url = link_info.get('url')
-                    if linked_url:
-                        linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
-                        linked_extractions.append(linked_result)
-        return {
-            'url': url,
-            'level': current_step,
-            'fetch_result': fetch_result,
-            'linked_extractions': linked_extractions,
-            'note': f"Processed at level {current_step}"
-        }
-class EnhancedURLProcessor:
-    def fetch_content_with_depth(self, url, max_steps, current_level=0):
-        """Simulates fetching and processing URLs up to max_steps depth."""
-        # print(f"Simulating fetch for {url} at level {current_level} with remaining steps {max_steps}...") # Debug print

-        #
            'url': url,
-            'level':
            'fetch_result': None,
            'linked_extractions': [],
-            'processing_notes':
        }

-        #
        return {
            'url': url,
-            'level':
            'fetch_result': None,
            'linked_extractions': [],
-            'processing_notes': f
        }

-        # Simulate a potentially failing URL
-        if "this-domain-does-not-exist" in url:
-            fetch_success = False
-            fetch_content = None
-            processing_notes = "Simulated network error: Could not resolve host."
-        linked_extractions = []
-        # Simulate finding links only if more steps are allowed and fetch was successful
-        if max_steps > 0 and fetch_success:
-            # Simulate finding a couple of links to demonstrate nesting
-            # In a real implementation, this would involve parsing the fetched content
-            # and resolving relative URLs.
-            simulated_linked_urls = [f"{url}/child1", f"{url}/child2"]
-            for linked_url in simulated_linked_urls:
-                # Recursively call for linked URLs, decreasing max_steps and increasing current_level
-                linked_result = self.fetch_content_with_depth(linked_url, max_steps - 1, current_level + 1)
-                if linked_result:
-                    linked_extractions.append(linked_result)

        return {
            'url': url,
-            'level':
-            'fetch_result':
            'linked_extractions': linked_extractions,
-            'processing_notes':
        }

-# Define a helper function to recursively print extraction details
-def print_extraction_details(extraction, max_level, current_level=0):
-    """Recursively prints details of the extraction and its linked extractions."""
-    if not extraction:
-        return
-    indent = " " * current_level
-    url = extraction.get('url', 'N/A')
-    level = extraction.get('level', 'N/A')
-    fetch_success = extraction.get('fetch_result') is not None and 'error' not in extraction.get('processing_notes', '').lower()
-    num_linked = len(extraction.get('linked_extractions', []))
-    notes = extraction.get('processing_notes', '')
-    print(f"{indent}URL: {url} (Level {level}). Success: {fetch_success}")
-    print(f"{indent}Number of linked extractions found: {num_linked}")
-    if notes:
-        print(f"{indent}Notes: {notes}")
-    if current_level < max_level and extraction.get('linked_extractions'):
-        # print(f"{indent}Processing linked extractions (Level {current_level + 1}):") # Optional header
-        for i, linked_extraction in enumerate(extraction['linked_extractions']):
-            # print(f"{indent} Linked Extraction {i+1}:") # Optional item separator
-            print_extraction_details(linked_extraction, max_level, current_level + 1)

-if __name__ == "__main__":
-    # Instantiate the processor
-    processor = EnhancedURLProcessor()
-    # Using quotes.toscrape.com as it has multiple links (in a real scenario)
-    # For this simulation, the dummy processor creates nested links regardless of the actual URL content.
-    test_url_with_links = "https://quotes.toscrape.com/"
-    # --- Test Cases (Extended up to max_steps = 10) ---
-    # Test with 0 steps (only initial URL)
-    print("\n--- Testing with max_steps = 0 ---")
-    result_0 = processor.fetch_content_with_depth(test_url_with_links, max_steps=0)
-    # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_0, 0)
-    # Test with 1 step (initial URL + its direct links)
-    print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
-    result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
-    # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_1, 1)
-    # Test with 2 steps
-    print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
-    result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
-    # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_2, 2)
-    # Test with max_steps = 3
-    print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
-    result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
-    # print(json.dumps(result_3, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_3, 3)
-    # Test with max_steps = 4
-    print(f"\n--- Testing with max_steps = 4 for {test_url_with_links} ---")
-    result_4 = processor.fetch_content_with_depth(test_url_with_links, max_steps=4)
-    # print(json.dumps(result_4, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_4, 4)
-    # Test with max_steps = 5
-    print(f"\n--- Testing with max_steps = 5 for {test_url_with_links} ---")
-    result_5 = processor.fetch_content_with_depth(test_url_with_links, max_steps=5)
-    # print(json.dumps(result_5, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_5, 5)
-    # Test with max_steps = 6
-    print(f"\n--- Testing with max_steps = 6 for {test_url_with_links} ---")
-    result_6 = processor.fetch_content_with_depth(test_url_with_links, max_steps=6)
-    # print(json.dumps(result_6, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_6, 6)
-    # Test with max_steps = 7
-    print(f"\n--- Testing with max_steps = 7 for {test_url_with_links} ---")
-    result_7 = processor.fetch_content_with_depth(test_url_with_links, max_steps=7)
-    # print(json.dumps(result_7, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_7, 7)
-    # Test with max_steps = 8
-    print(f"\n--- Testing with max_steps = 8 for {test_url_with_links} ---")
-    result_8 = processor.fetch_content_with_depth(test_url_with_links, max_steps=8)
-    # print(json.dumps(result_8, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_8, 8)
-    # Test with max_steps = 9
-    print(f"\n--- Testing with max_steps = 9 for {test_url_with_links} ---")
-    result_9 = processor.fetch_content_with_depth(test_url_with_links, max_steps=9)
-    # print(json.dumps(result_9, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_9, 9)
-    # Test with max_steps = 10
-    print(f"\n--- Testing with max_steps = 10 for {test_url_with_links} ---")
-    result_10 = processor.fetch_content_with_depth(test_url_with_links, max_steps=10)
-    # print(json.dumps(result_10, indent=2)) # Uncomment to see full structure
-    print_extraction_details(result_10, 10)
-    # Test with invalid max_steps (e.g., negative)
-    print("\n--- Testing with invalid max_steps = -1 ---")
-    result_invalid_steps = processor.fetch_content_with_depth(test_url_with_links, max_steps=-1)
-    # print(json.dumps(result_invalid_steps, indent=2)) # Uncomment to see full structure
-    print(f"Result for invalid steps: {result_invalid_steps.get('processing_notes')}")
-    # Test with invalid initial URL format
-    print("\n--- Testing with invalid initial URL format ---")
-    result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
-    # print(json.dumps(result_invalid_url, indent=2)) # Uncomment to see full structure
-    print(f"Result for invalid initial URL: {result_invalid_url.get('processing_notes')}")
-    # Test with a URL that might fail to fetch (simulated)
-    print("\n--- Testing with a potentially failing URL (simulated) ---")
-    # Use a non-existent subdomain or a port that's unlikely to be open
-    failing_url = "http://this-domain-does-not-exist-12345.com/"
-    result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
-    # print(json.dumps(result_fail, indent=2)) # Uncomment to see full structure
-    print(f"Result for failing URL: {result_fail.get('processing_notes')}")
-    # Check if fetch_result is None or indicates failure
-    if result_fail.get('fetch_result') is None:
-        print("Fetch result is None as expected for failing URL.")
-    # if result_fail.get('fetch_result') and 'error' in result_fail['fetch_result'].get('processing_notes', '').lower():
-    #     print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
-    print("\n--- End of Test Cases ---")

class EnhancedFileProcessor:
    """Advanced file processing with enhanced content extraction"""
@@ -622,7 +480,21 @@ class EnhancedFileProcessor:
        return []

        dataset = []

        try:
            file_size = file_path.stat().st_size
@@ -636,18 +508,19 @@
                'processing_notes': 'File size exceeds limit.'
            }]

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_dir_path = Path(temp_dir)

-            # Decide processing strategy
            if file_path.suffix.lower() in self.archive_extensions:
                dataset.extend(self._process_archive(file_path, temp_dir_path))
            elif file_path.suffix.lower() in self.supported_extensions:
                # Pass the path to the single file processor
                dataset.extend(self._process_single_file(file_path))
            else:
-                logger.warning(f"Unsupported file type for processing: '{file_path.name}'")
-                #
                try:
                    # Read as text with error replacement
                    content_bytes = file_path.read_bytes()
@@ -660,7 +533,7 @@
                        'file_size': file_size,
                        'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
                        'extracted_data': {'plain_text': raw_content}, # Store raw text under a key
-                        'processing_notes': 'Processed as plain text (unsupported extension).'
                    })
                except Exception as e:
                    logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
@@ -670,7 +543,7 @@
                        'file_size': file_size,
                        'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
                        'extracted_data': None,
-                        'processing_notes': f'Unsupported file type and failed to read as text: {e}'
                    })

@@ -681,7 +554,7 @@
                'filename': file_path.name,
                'file_size': file_size if 'file_size' in locals() else None,
                'extracted_data': None,
-                'processing_notes': f'Overall file processing error: {str(e)}'
            })
        return dataset

@@ -703,7 +576,7 @@

        raw_content: Optional[str] = None
        extracted_data: Any = None
-        processing_notes = []

        try:
            # Read content efficiently
@@ -788,13 +661,13 @@

                    if rows:
                        # Limit the number of rows included for potentially huge CSVs
-                        max_rows_preview = 100
                        extracted_data = {
-                            'headers': rows[0] if rows[0] else None, # Assume first row is header
-                            'rows': rows[1:max_rows_preview+1] # Get up to max_rows_preview data rows
                        }
                        if len(rows) > max_rows_preview + 1:
-                            processing_notes.append(f"CSV truncated to {max_rows_preview}
                        processing_notes.append("Parsed as CSV.")
                        if not is_explicit_csv:
                            processing_notes.append("Note: Content looked like CSV despite extension/mime.")
@@ -825,7 +698,7 @@
                            extracted_text = text_content
                            processing_notes.append("Extracted text from PDF.")
                        finally:
-                            temp_path.unlink() # Clean up temp file
                    elif file_extension == '.docx' and DOCX_SUPPORT:
                        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
                            tmp_file.write(content_bytes) # Write bytes to temp file
@@ -836,10 +709,11 @@
                            extracted_text = text_content
                            processing_notes.append("Extracted text from DOCX.")
                        finally:
-                            temp_path.unlink() # Clean up temp file
                    elif file_extension == '.rtf' and RTF_SUPPORT:
                        # pyth can read directly from file-like object or string
                        try:
                            doc = Rtf15Reader.read(io.StringIO(raw_content))
                            text_content = PlaintextWriter.write(doc).getvalue()
                            extracted_text = text_content
@@ -858,7 +732,7 @@
                            extracted_text = text_content
                            processing_notes.append("Extracted text from ODT.")
                        finally:
-                            temp_path.unlink() # Clean up temp file
                    elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
                        # These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
                        processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
@@ -925,10 +799,16 @@
            if zipfile.is_zipfile(archive_path):
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    for file_info in zip_ref.infolist():
                        if file_info.file_size > 0 and not file_info.filename.endswith('/'):
                            try:
-                                extracted_file_path
                                # Recursively process the extracted file if it's supported and not an archive itself
                                if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
                                    dataset.extend(self._process_single_file(extracted_file_path))
@@ -940,6 +820,14 @@
                                    logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
                            except Exception as e:
                                logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
            else:
                logger.error(f"'{archive_path.name}' is not a valid zip file.")

@@ -954,9 +842,23 @@
                with tarfile.open(archive_path, mode) as tar_ref:
                    for member in tar_ref.getmembers():
                        if member.isfile():
                            try:
                                # Recursively process extracted file
                                if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
                                    dataset.extend(self._process_single_file(extracted_file_path))
@@ -967,6 +869,14 @@
                                    logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
                            except Exception as e:
                                logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
            except tarfile.TarError as e:
                logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")

@@ -991,7 +901,12 @@
            except Exception as e:
                logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
            finally:
-                if extracted_path.exists():

        # TODO: Add support for other archive types (.bz2, .7z, .rar)
        elif archive_extension in ('.bz2', '.7z', '.rar'):
@@ -1020,12 +935,14 @@
            "idx": 0, # chunk_index
            "tc": 1, # total_chunks
            "tl": total_length, # total_length
-            "hash":
            "data": "" # chunk_data
        }
        # Estimate overhead more accurately by dumping a sample metadata structure
        # and adding some safety margin. Shortened keys reduce overhead.

        # Calculate effective chunk size
        effective_chunk_size = max_size - overhead_estimate
@@ -1338,18 +1255,18 @@ def create_modern_interface():
        }
        </script>
        """
        with gr.Row():
            crawl_depth_slider = gr.Slider(
                label="Crawl Depth",
                minimum=0,
-                maximum=
                value=0,
                step=1,
                interactive=True,
-                info="Select the maximum depth for crawling links (0-
            )
        qr_code_paths = gr.State([])
        gr.Markdown("""
        # π Advanced Data Processing & QR Code Generator
@@ -1445,13 +1362,14 @@
            num_qr_codes = len(paths)
            cols = math.ceil(math.sqrt(num_qr_codes)) # Calculate columns for a roughly square grid
            cols = max(1, min(cols, 6)) # Limit max columns for small screens
-            rows = math.ceil(num_qr_codes / cols)

            viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'.format(cols)

-            # Initialize enabledStates if it's empty (first load)
-            if
-                enabled_states = list(range(num_qr_codes)) # Enable all by default

            for i, path in enumerate(paths):
                is_enabled = i in enabled_states
@@ -1465,7 +1383,7 @@

            return viewport_html

-        def process_inputs(urls, files, text, combine,
            """Process all inputs and generate QR codes"""
            results = []
            processing_status_messages = []
@@ -1497,19 +1415,30 @@
            url_list = re.split(r'[,\n]', urls)
            url_list = [url.strip() for url in url_list if url.strip()]
            for url in url_list:
                else:
-                    processing_status_messages.append(f"

            # Process files
            if files:
@@ -1519,9 +1448,14 @@
                if file_results:
                    results.extend(file_results)
                    processing_status_messages.append(f"β Processed file: {file.name}")
                else:
                    processing_status_messages.append(f"β Failed to process file: {file.name}")

            # Generate QR codes
            qr_paths = []
            final_json_output = None
@@ -1557,7 +1491,7 @@
                num_qrs = 0
            else:
                num_qrs = len(qr_paths_list)
            initial_enabled_states = list(range(num_qrs))
            return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state

@@ -1567,7 +1501,7 @@

        process_btn.click(
            process_inputs,
-            inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider],
            outputs=[output_json, output_gallery, output_text]
        ).then( # Chain a .then() to update the QR paths state and trigger viewport update
            on_qr_generation,
@@ -1581,7 +1515,7 @@
        # Add helpful documentation
        gr.Markdown("""
        ### π Features
-        - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type.
        - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
        - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
        - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives.
@@ -1591,19 +1525,13 @@
        - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
        - **Modern Design**: Clean, responsive interface with visual feedback.
        ### π‘ Tips
-        1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type
        2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
        3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
        4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
        5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
        6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
        7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.
-        ### π¨ Output Details
-        - The "Processed Data" JSON will be a list of dictionaries. Each dictionary represents one processed input (URL or file).
-        - Each item will have keys like `source`, `filename` (for files), `url` (for URLs), `mime_type`, `raw_content` (if readable), `extracted_data`, and `processing_notes`.
-        - `extracted_data` will contain the parsed/extracted content, structured according to the input type (e.g., dictionary for JSON, text for documents, list of rows for CSV, dictionary with title/text/links for HTML).
-        - `processing_notes` will list any issues encountered during extraction.
-        - Generated QR codes are saved in the `output/qr_codes` directory.
        ### βοΈ QR Code Viewport Instructions
        1. Navigate to the **QR Code Viewport** tab after generating QR codes.
        2. The generated QR codes will be displayed in a grid based on their total count.
@@ -1635,4 +1563,4 @@ def main():
        raise # Re-raise the exception to ensure the process exits if launch fails

if __name__ == "__main__":
-    main()
55 |
|
56 |
try:
|
57 |
from pyth.plugins.plaintext.writer import PlaintextWriter
|
58 |
+
from pyth.plugins.rtf15.reader import Rtf15Reader # Import Rtf15Reader
|
59 |
RTF_SUPPORT = True
|
60 |
except ImportError:
|
61 |
RTF_SUPPORT = False
|
|
|
87 |
for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
|
88 |
directory.mkdir(parents=True, exist_ok=True)
|
89 |
|
90 |
+
# Dummy EnhancedURLProcessor class for demonstration purposes if the actual class isn't provided.
|
91 |
+
# This dummy simulates fetching and creating a nested structure based on max_steps.
|
92 |
class EnhancedURLProcessor:
|
93 |
+
"""Simulates advanced URL processing with enhanced content extraction and recursive link following."""
|
94 |
|
95 |
def __init__(self):
|
96 |
+
# Dummy session and user agent for simulation
|
97 |
+
self.session = type('obj', (object,), {'get': self._dummy_get_request})()
|
98 |
+
self.user_agent = type('obj', (object,), {'random': 'SimulatedAgent/1.0'})()
|
99 |
+
self.timeout = 15
|
100 |
self.max_retries = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
+
def _dummy_get_request(self, url, timeout):
|
103 |
+
"""Simulates a GET request response."""
|
104 |
+
class MockResponse:
|
105 |
+
def __init__(self, url, status_code, content_type, content, encoding='utf-8'):
|
106 |
+
self.url = url
|
107 |
+
self.status_code = status_code
|
108 |
+
self.headers = {'Content-Type': content_type}
|
109 |
+
self._content = content.encode(encoding)
|
110 |
+
self.encoding = encoding
|
111 |
+
|
112 |
+
def raise_for_status(self):
|
113 |
+
if 400 <= self.status_code < 600:
|
114 |
+
raise requests.exceptions.RequestException(f"Simulated HTTP error {self.status_code}")
|
115 |
+
|
116 |
+
@property
|
117 |
+
def content(self):
|
118 |
+
return self._content
|
119 |
+
|
120 |
+
# Simulate different responses based on URL
|
121 |
+
if "this-domain-does-not-exist" in url:
|
122 |
+
raise requests.exceptions.RequestException("Simulated network error: Could not resolve host.")
|
123 |
+
elif "httpbin.org/html" in url:
|
124 |
+
# Simulate a simple HTML response
|
125 |
+
html_content = """
|
126 |
+
<!DOCTYPE html>
|
127 |
+
<html>
|
128 |
+
<head><title>Simulated HTML</title></head>
|
129 |
+
<body>
|
130 |
+
<h1>Hello, World!</h1>
|
131 |
+
<p>This is simulated HTML content.</p>
|
132 |
+
<a href="/link1">Link 1</a>
|
133 |
+
<a href="/link2">Link 2</a>
|
134 |
+
</body>
|
135 |
+
</html>
|
136 |
+
"""
|
137 |
+
return MockResponse(url, 200, 'text/html', html_content)
|
138 |
+
elif "quotes.toscrape.com" in url:
|
139 |
+
# Simulate a more complex HTML with more links for deeper testing
|
140 |
+
html_content = f"""
|
141 |
+
<!DOCTYPE html>
|
142 |
+
<html>
|
143 |
+
<head><title>Simulated Quotes Page</title></head>
|
144 |
+
<body>
|
145 |
+
<h1>Quotes</h1>
|
146 |
+
<p>Some simulated quotes.</p>
|
147 |
+
<a href="{url}/page/1/">Page 1</a>
|
148 |
+
<a href="{url}/page/2/">Page 2</a>
|
149 |
+
<a href="/tag/love/">Love Quotes</a>
|
150 |
+
</body>
|
151 |
+
</html>
|
152 |
+
"""
|
153 |
+
return MockResponse(url, 200, 'text/html', html_content)
|
154 |
+
elif "/child" in url:
|
155 |
+
# Simulate nested HTML pages
|
156 |
+
html_content = f"""
|
157 |
+
<!DOCTYPE html>
|
158 |
+
<html>
|
159 |
+
<head><title>Simulated Child Page</title></head>
|
160 |
+
<body>
|
161 |
+
<h1>Child Page</h1>
|
162 |
+
<p>Content for {url}.</p>
|
163 |
+
<a href="{url}/grandchild1">Grandchild 1</a>
|
164 |
+
</body>
|
165 |
+
</html>
|
166 |
+
"""
|
167 |
+
return MockResponse(url, 200, 'text/html', html_content)
|
168 |
+
else:
|
169 |
+
# Default simulated plain text response
|
170 |
+
return MockResponse(url, 200, 'text/plain', f"Simulated content for {url}")
|
171 |
|
172 |
|
173 |
+
def validate_url(self, url: str) -> Dict[str, Any]:
|
174 |
+
"""Enhanced URL validation with detailed feedback (Simulated)"""
|
175 |
+
# In a real implementation, this would perform actual network checks (HEAD/GET)
|
176 |
+
# For simulation, just check format
|
177 |
+
if not validators.url(url):
|
178 |
+
return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
|
179 |
+
parsed = urlparse(url)
|
180 |
+
if not all([parsed.scheme, parsed.netloc]):
|
181 |
+
return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
|
182 |
+
|
183 |
+
# Simulate accessibility check
|
184 |
+
if "this-domain-does-not-exist" in url:
|
185 |
+
return {'is_valid': False, 'message': 'Simulated: URL not accessible', 'details': 'Simulated network error'}
|
186 |
+
|
187 |
+
return {
|
188 |
+
'is_valid': True,
|
189 |
+
'message': 'Simulated: URL is valid and accessible',
|
190 |
+
'details': {
|
191 |
+
'final_url': url, # In simulation, final_url is same as original unless specifically handled
|
192 |
+
'content_type': 'text/html', # Simulate HTML for most tests
|
193 |
+
'server': 'SimulatedServer',
|
194 |
+
'size': 'SimulatedSize'
|
195 |
}
|
196 |
+
}
|
|
|
197 |
|
198 |
def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
|
199 |
+
"""Enhanced content fetcher with retry mechanism and complete character extraction (Simulated)"""
|
200 |
try:
|
201 |
+
logger.info(f"Simulating fetch content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
|
202 |
+
# Simulate the request using the dummy get
|
|
|
203 |
response = self.session.get(url, timeout=self.timeout)
|
204 |
response.raise_for_status()
|
205 |
+
final_url = response.url # Capture potential redirects (simulated)
|
206 |
content_type = response.headers.get('Content-Type', '')
|
207 |
|
208 |
+
# Simulate encoding detection (assuming utf-8 for simplicity in simulation)
|
209 |
+
encoding = 'utf-8'
|
210 |
+
raw_content = response.content.decode(encoding, errors='replace')
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
+
# Extract metadata (simulated)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
metadata = {
|
214 |
'original_url': url,
|
215 |
'final_url': final_url,
|
|
|
221 |
'status_code': response.status_code
|
222 |
}
|
223 |
|
224 |
+
# Process based on content type (using the actual _process_web_content)
|
225 |
processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
|
226 |
|
227 |
return {
|
|
|
234 |
}
|
235 |
except requests.exceptions.RequestException as e:
|
236 |
if retry_count < self.max_retries - 1:
|
237 |
+
logger.warning(f"Simulated Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
|
238 |
+
time.sleep(0.1) # Shorter backoff for simulation
|
239 |
return self.fetch_content(url, retry_count + 1)
|
240 |
+
logger.error(f"Simulated: Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
|
241 |
return {
|
242 |
'source': 'url',
|
243 |
'url': url,
|
244 |
'raw_content': None,
|
245 |
'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, # Include basic metadata on failure
|
246 |
'extracted_data': None,
|
247 |
+
'processing_notes': [f"Simulated: Failed to fetch content after {self.max_retries} attempts: {str(e)}"] # Ensure notes is a list
|
248 |
}
|
249 |
except Exception as e:
|
250 |
+
logger.error(f"Simulated: Unexpected error while fetching or processing URL {url}: {e}")
|
251 |
return {
|
252 |
'source': 'url',
|
253 |
'url': url,
|
254 |
'raw_content': raw_content if 'raw_content' in locals() else None,
|
255 |
'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
|
256 |
'extracted_data': None,
|
257 |
+
'processing_notes': [f"Simulated: Unexpected processing error: {str(e)}"]
|
258 |
}
|
259 |
|
260 |
+
|
261 |
def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
|
262 |
"""Process content based on detected content type"""
|
263 |
lower_content_type = content_type.lower()
|
|
|
376 |
return extracted
|
377 |
|
378 |
def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
|
379 |
+
"""Fetches content from a URL and recursively follows links up to max_steps depth."""
|
380 |
+
# Validate max_steps first
|
381 |
+
if not isinstance(max_steps, int) or not (0 <= max_steps <= 10): # Changed max depth to 10
|
382 |
+
logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 10.")
|
383 |
+
return {
|
384 |
+
'url': url,
|
385 |
+
'level': 0,
|
386 |
+
'fetch_result': None,
|
387 |
+
'linked_extractions': [],
|
388 |
+
'processing_notes': [f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 10."]
|
389 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
|
391 |
+
# Validate the initial URL
|
392 |
+
validation_result = self.validate_url(url)
|
393 |
+
if not validation_result['is_valid']:
|
394 |
+
logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
|
395 |
+
return {
|
396 |
'url': url,
|
397 |
+
'level': 0,
|
398 |
'fetch_result': None,
|
399 |
'linked_extractions': [],
|
400 |
+
'processing_notes': [f"Initial URL validation failed: {validation_result['message']}"]
|
401 |
}
|
402 |
|
403 |
+
# Start the recursive fetching process
|
404 |
+
return self._fetch_content_recursive(url, max_steps, current_step=0)
|
405 |
+
|
406 |
+
def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
|
407 |
+
"""Recursive helper function to fetch content and follow links."""
|
408 |
+
# Base case: Stop if current depth exceeds max_steps
|
409 |
+
if current_step > max_steps:
|
410 |
+
logger.debug(f"Depth limit ({max_steps}) reached for {url} at level {current_step}.")
|
411 |
return {
|
412 |
'url': url,
|
413 |
+
'level': current_step,
|
414 |
'fetch_result': None,
|
415 |
'linked_extractions': [],
|
416 |
+
'processing_notes': [f"Depth limit ({max_steps}) reached."]
|
417 |
}
|
418 |
|
419 |
+
logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
|
420 |
+
fetch_result = self.fetch_content(url)
|
421 |
+
linked_extractions: List[Dict[str, Any]] = []
|
422 |
+
|
423 |
+
# Only attempt to extract and follow links if fetch was successful and content is HTML
|
424 |
+
if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
|
425 |
+
extracted_data = fetch_result['extracted_data']
|
426 |
+
links = extracted_data.get('links', [])
|
427 |
+
|
428 |
+
logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
|
429 |
+
# Recursively process linked URLs if more steps are allowed
|
430 |
+
if current_step < max_steps:
|
431 |
+
for link_info in links:
|
432 |
+
linked_url = link_info.get('url')
|
433 |
+
if linked_url:
|
434 |
+
# Add a check to prevent processing the same URL repeatedly in a single crawl path
|
435 |
+
# (More sophisticated de-duplication across the *entire* crawl would require a visited set passed down)
|
436 |
+
# For simplicity here, we just prevent immediate cycles.
|
437 |
+
if linked_url != url:
|
438 |
+
linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
|
439 |
+
# Only append results if the recursive call returned something valid
|
440 |
+
if linked_result:
|
441 |
+
linked_extractions.append(linked_result)
|
442 |
+
else:
|
443 |
+
logger.debug(f"Skipping self-referencing link: {linked_url}")
|
444 |
+
|
445 |
+
|
446 |
+
# Add processing notes from the fetch_result to the current level's notes
|
447 |
+
current_notes = fetch_result.get('processing_notes', []) if fetch_result else ['Fetch failed.']
|
448 |
+
if f"Processed at level {current_step}" not in current_notes:
|
449 |
+
current_notes.append(f"Processed at level {current_step}")
|
450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
451 |
|
452 |
return {
|
453 |
'url': url,
|
454 |
+
'level': current_step,
|
455 |
+
'fetch_result': fetch_result, # Include the full fetch result for details
|
456 |
'linked_extractions': linked_extractions,
|
457 |
+
'processing_notes': current_notes
|
458 |
}
|
459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
460 |
|
461 |
class EnhancedFileProcessor:
|
462 |
"""Advanced file processing with enhanced content extraction"""
|
|
|
480 |
return []
|
481 |
|
482 |
dataset = []
|
483 |
+
# Use Path object for easier handling. Note: Gradio file object might not be a standard file path,
|
484 |
+
# but rather an object with a 'name' attribute pointing to a temp file path.
|
485 |
+
file_path = Path(file.name)
|
486 |
+
|
487 |
+
# Ensure file exists before trying to get size/stats
|
488 |
+
if not file_path.exists():
|
489 |
+
logger.error(f"File path does not exist: {file_path}")
|
490 |
+
return [{
|
491 |
+
'source': 'file',
|
492 |
+
'filename': file.name if hasattr(file, 'name') else 'unknown',
|
493 |
+
'file_size': None,
|
494 |
+
'extracted_data': None,
|
495 |
+
'processing_notes': 'File path does not exist.'
|
496 |
+
}]
|
497 |
+
|
498 |
|
499 |
try:
|
500 |
file_size = file_path.stat().st_size
|
|
|
508 |
'processing_notes': 'File size exceeds limit.'
|
509 |
}]
|
510 |
|
511 |
+
# Use a temporary directory for extracting archives
|
512 |
with tempfile.TemporaryDirectory() as temp_dir:
|
513 |
temp_dir_path = Path(temp_dir)
|
514 |
|
515 |
+
# Decide processing strategy based on extension
|
516 |
if file_path.suffix.lower() in self.archive_extensions:
|
517 |
dataset.extend(self._process_archive(file_path, temp_dir_path))
|
518 |
elif file_path.suffix.lower() in self.supported_extensions:
|
519 |
# Pass the path to the single file processor
|
520 |
dataset.extend(self._process_single_file(file_path))
|
521 |
else:
|
522 |
+
logger.warning(f"Unsupported file type for processing: '{file_path.name}'. Attempting to read as plain text.")
|
523 |
+
# Attempt to process as raw text even if extension is unsupported
|
524 |
try:
|
525 |
# Read as text with error replacement
|
526 |
content_bytes = file_path.read_bytes()
|
|
|
533 |
'file_size': file_size,
|
534 |
'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
|
535 |
'extracted_data': {'plain_text': raw_content}, # Store raw text under a key
|
536 |
+
'processing_notes': ['Processed as plain text (unsupported extension).'] # Ensure notes is a list
|
537 |
})
|
538 |
except Exception as e:
|
539 |
logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
|
|
|
543 |
'file_size': file_size,
|
544 |
'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
|
545 |
'extracted_data': None,
|
546 |
+
'processing_notes': [f'Unsupported file type and failed to read as text: {e}'] # Ensure notes is a list
|
547 |
})
|
548 |
|
549 |
|
|
|
554 |
'filename': file_path.name,
|
555 |
'file_size': file_size if 'file_size' in locals() else None,
|
556 |
'extracted_data': None,
|
557 |
+
'processing_notes': [f'Overall file processing error: {str(e)}'] # Ensure notes is a list
|
558 |
})
|
559 |
return dataset
|
560 |
|
|
|
576 |
|
577 |
raw_content: Optional[str] = None
|
578 |
extracted_data: Any = None
|
579 |
+
processing_notes: List[str] = [] # Initialize notes as a list
|
580 |
|
581 |
try:
|
582 |
# Read content efficiently
|
|
|
661 |
|
662 |
if rows:
|
663 |
# Limit the number of rows included for potentially huge CSVs
|
664 |
+
max_rows_preview = 100 # Limit text preview
|
665 |
extracted_data = {
|
666 |
+
'headers': rows[0] if rows and rows[0] else None, # Assume first row is header if exists
|
667 |
+
'rows': rows[1:max_rows_preview+1] if len(rows) > 1 else [] # Get up to max_rows_preview data rows, if any
|
668 |
}
|
669 |
if len(rows) > max_rows_preview + 1:
|
670 |
+
processing_notes.append(f"CSV data rows truncated to {max_rows_preview}.")
|
671 |
processing_notes.append("Parsed as CSV.")
|
672 |
if not is_explicit_csv:
|
673 |
processing_notes.append("Note: Content looked like CSV despite extension/mime.")
|
|
|
698 |
extracted_text = text_content
|
699 |
processing_notes.append("Extracted text from PDF.")
|
700 |
finally:
|
701 |
+
if temp_path.exists(): temp_path.unlink() # Clean up temp file
|
702 |
elif file_extension == '.docx' and DOCX_SUPPORT:
|
703 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
|
704 |
tmp_file.write(content_bytes) # Write bytes to temp file
|
|
|
709 |
extracted_text = text_content
|
710 |
processing_notes.append("Extracted text from DOCX.")
|
711 |
finally:
|
712 |
+
if temp_path.exists(): temp_path.unlink() # Clean up temp file
|
713 |
elif file_extension == '.rtf' and RTF_SUPPORT:
|
714 |
# pyth can read directly from file-like object or string
|
715 |
try:
|
716 |
+
# Rtf15Reader expects a file-like object or string
|
717 |
doc = Rtf15Reader.read(io.StringIO(raw_content))
|
718 |
text_content = PlaintextWriter.write(doc).getvalue()
|
719 |
extracted_text = text_content
|
|
|
732 |
extracted_text = text_content
|
733 |
processing_notes.append("Extracted text from ODT.")
|
734 |
finally:
|
735 |
+
if temp_path.exists(): temp_path.unlink() # Clean up temp file
|
736 |
elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
|
737 |
# These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
|
738 |
processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
|
|
|
799 |
if zipfile.is_zipfile(archive_path):
|
800 |
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
|
801 |
for file_info in zip_ref.infolist():
|
802 |
+
# Skip directories and empty files
|
803 |
if file_info.file_size > 0 and not file_info.filename.endswith('/'):
|
804 |
+
# Sanitize filename to prevent directory traversal issues
|
805 |
+
sanitized_filename = Path(file_info.filename).name # Takes only the base name
|
806 |
+
extracted_file_path = extract_to / sanitized_filename
|
807 |
try:
|
808 |
+
# Extract file to the temporary directory
|
809 |
+
with zip_ref.open(file_info) as zf, open(extracted_file_path, 'wb') as outfile:
|
810 |
+
outfile.write(zf.read())
|
811 |
+
|
812 |
# Recursively process the extracted file if it's supported and not an archive itself
|
813 |
if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
|
814 |
dataset.extend(self._process_single_file(extracted_file_path))
|
|
|
820 |
logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
|
821 |
except Exception as e:
|
822 |
logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
|
823 |
+
finally:
|
824 |
+
# Clean up the extracted file immediately after processing
|
825 |
+
if extracted_file_path.exists():
|
826 |
+
try:
|
827 |
+
extracted_file_path.unlink()
|
828 |
+
except OSError as e:
|
829 |
+
logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
|
830 |
+
|
831 |
else:
|
832 |
logger.error(f"'{archive_path.name}' is not a valid zip file.")
|
833 |
|
|
|
842 |
with tarfile.open(archive_path, mode) as tar_ref:
|
843 |
for member in tar_ref.getmembers():
|
844 |
if member.isfile():
|
845 |
+
# Sanitize member name
|
846 |
+
sanitized_filename = Path(member.name).name
|
847 |
+
extracted_file_path = extract_to / sanitized_filename
|
848 |
try:
|
849 |
+
# Extract member to the temporary directory
|
850 |
+
# Ensure the target path is within the extraction directory
|
851 |
+
if not str(extracted_file_path).startswith(str(extract_to)):
|
852 |
+
logger.warning(f"Skipping potentially malicious path in tar: {member.name}")
|
853 |
+
continue # Skip if path is outside the temp dir
|
854 |
+
|
855 |
+
with tar_ref.extractfile(member) as tf, open(extracted_file_path, 'wb') as outfile:
|
856 |
+
if tf: # extractfile can return None for special file types
|
857 |
+
outfile.write(tf.read())
|
858 |
+
else:
|
859 |
+
logger.warning(f"Could not extract file-like object for {member.name} from tar.")
|
860 |
+
continue # Skip this member
|
861 |
+
|
862 |
# Recursively process extracted file
|
863 |
if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
|
864 |
dataset.extend(self._process_single_file(extracted_file_path))
|
|
|
869 |
logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
|
870 |
except Exception as e:
|
871 |
logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
|
872 |
+
finally:
|
873 |
+
# Clean up the extracted file immediately after processing
|
874 |
+
if extracted_file_path.exists():
|
875 |
+
try:
|
876 |
+
extracted_file_path.unlink()
|
877 |
+
except OSError as e:
|
878 |
+
logger.warning(f"Failed to clean up extracted file {extracted_file_path}: {e}")
|
879 |
+
|
880 |
except tarfile.TarError as e:
|
881 |
logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
|
882 |
|
|
|
901 |
except Exception as e:
|
902 |
logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
|
903 |
finally:
|
904 |
+
if extracted_path.exists():
|
905 |
+
try:
|
906 |
+
extracted_path.unlink() # Clean up extracted file
|
907 |
+
except OSError as e:
|
908 |
+
logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
|
909 |
+
|
910 |
|
911 |
# TODO: Add support for other archive types (.bz2, .7z, .rar)
|
912 |
elif archive_extension in ('.bz2', '.7z', '.rar'):
|
|
|
935 |
"idx": 0, # chunk_index
|
936 |
"tc": 1, # total_chunks
|
937 |
"tl": total_length, # total_length
|
938 |
+
"hash": 0, # chunk_hash (using int for hash)
|
939 |
"data": "" # chunk_data
|
940 |
}
|
941 |
# Estimate overhead more accurately by dumping a sample metadata structure
|
942 |
# and adding some safety margin. Shortened keys reduce overhead.
|
943 |
+
# Use a dummy hash value (e.g., 1234567890) to get a realistic length estimate
|
944 |
+
metadata_template_with_hash = {**metadata_template, "hash": 1234567890}
|
945 |
+
overhead_estimate = len(json.dumps(metadata_template_with_hash, separators=(',', ':'))) + 50 # Extra padding
|
946 |
|
947 |
# Calculate effective chunk size
|
948 |
effective_chunk_size = max_size - overhead_estimate
|
|
|
1255 |
}
|
1256 |
</script>
|
1257 |
"""
|
|
|
1258 |
with gr.Row():
|
1259 |
+
# Adjusted crawl depth slider to match the max_steps limit in the code
|
1260 |
crawl_depth_slider = gr.Slider(
|
1261 |
label="Crawl Depth",
|
1262 |
minimum=0,
|
1263 |
+
maximum=10, # Changed max depth to 10
|
1264 |
value=0,
|
1265 |
step=1,
|
1266 |
interactive=True,
|
1267 |
+
info="Select the maximum depth for crawling links (0-10)." # Updated info
|
1268 |
)
|
1269 |
+
|
1270 |
qr_code_paths = gr.State([])
|
1271 |
gr.Markdown("""
|
1272 |
# π Advanced Data Processing & QR Code Generator
|
|
|
1362 |
num_qr_codes = len(paths)
|
1363 |
cols = math.ceil(math.sqrt(num_qr_codes)) # Calculate columns for a roughly square grid
|
1364 |
cols = max(1, min(cols, 6)) # Limit max columns for small screens
|
1365 |
+
# rows = math.ceil(num_qr_codes / cols) # Not used in HTML generation
|
1366 |
|
1367 |
viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'.format(cols)
|
1368 |
|
1369 |
+
# Initialize enabledStates if it's empty (first load) or if paths have changed
|
1370 |
+
if enabled_states is None or len(enabled_states) != num_qr_codes:
|
1371 |
+
enabled_states = list(range(num_qr_codes)) # Enable all by default or if QR count changes
|
1372 |
+
|
1373 |
|
1374 |
for i, path in enumerate(paths):
|
1375 |
is_enabled = i in enabled_states
|
|
|
1383 |
|
1384 |
return viewport_html
|
1385 |
|
1386 |
+
def process_inputs(urls, files, text, combine, crawl_depth): # Added crawl_depth parameter
|
1387 |
"""Process all inputs and generate QR codes"""
|
1388 |
results = []
|
1389 |
processing_status_messages = []
|
|
|
1415 |
url_list = re.split(r'[,\n]', urls)
|
1416 |
url_list = [url.strip() for url in url_list if url.strip()]
|
1417 |
for url in url_list:
|
1418 |
+
# Use the crawl_depth from the slider
|
1419 |
+
processing_status_messages.append(f"π Processing URL: {url} with crawl depth {crawl_depth}...")
|
1420 |
+
content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth)
|
1421 |
+
if content_result and content_result.get('fetch_result') is not None: # Check if initial fetch was successful
|
1422 |
+
results.append(content_result)
|
1423 |
+
processing_status_messages.append(f"β
Processed URL: {url} (Level 0)")
|
1424 |
+
# Add notes from the result if any
|
1425 |
+
if content_result.get('processing_notes'):
|
1426 |
+
processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}")
|
1427 |
+
|
1428 |
+
# Optionally add status for linked extractions
|
1429 |
+
if content_result.get('linked_extractions'):
|
1430 |
+
num_linked_processed = len([r for r in content_result['linked_extractions'] if r and r.get('fetch_result') is not None])
|
1431 |
+
processing_status_messages.append(f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
|
1432 |
+
# Note: Detailed status for deeper levels would require traversing the result structure here.
|
1433 |
+
|
1434 |
else:
|
1435 |
+
processing_status_messages.append(f"β Failed to process URL: {url}")
|
1436 |
+
# Add notes from the result even if fetch failed
|
1437 |
+
if content_result and content_result.get('processing_notes'):
|
1438 |
+
processing_status_messages.append(f" Notes: {'; '.join(content_result['processing_notes'])}")
|
1439 |
+
elif content_result and content_result.get('note'): # Handle the 'note' key from validation/invalid steps
|
1440 |
+
processing_status_messages.append(f" Notes: {content_result['note']}")
|
1441 |
+
|
1442 |
|
1443 |
# Process files
|
1444 |
if files:
|
|
|
1448 |
if file_results:
|
1449 |
results.extend(file_results)
|
1450 |
processing_status_messages.append(f"β
Processed file: {file.name}")
|
1451 |
+
# Add notes from file processing results
|
1452 |
+
for res in file_results:
|
1453 |
+
if res.get('processing_notes'):
|
1454 |
+
processing_status_messages.append(f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
|
1455 |
else:
|
1456 |
processing_status_messages.append(f"β Failed to process file: {file.name}")
|
1457 |
|
1458 |
+
|
1459 |
# Generate QR codes
|
1460 |
qr_paths = []
|
1461 |
final_json_output = None
|
|
|
1491 |
num_qrs = 0
|
1492 |
else:
|
1493 |
num_qrs = len(qr_paths_list)
|
1494 |
+
|
1495 |
initial_enabled_states = list(range(num_qrs))
|
1496 |
return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state
|
1497 |
|
|
|
1501 |
|
1502 |
process_btn.click(
|
1503 |
process_inputs,
|
1504 |
+
inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider], # Pass crawl_depth_slider value
|
1505 |
outputs=[output_json, output_gallery, output_text]
|
1506 |
).then( # Chain a .then() to update the QR paths state and trigger viewport update
|
1507 |
on_qr_generation,
|
|
|
1515 |
# Add helpful documentation
|
1516 |
gr.Markdown("""
|
1517 |
### π Features
|
1518 |
+
- **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth.
|
1519 |
- **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
|
1520 |
- **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
|
1521 |
- **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives.
|
|
|
1525 |
- **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
|
1526 |
- **Modern Design**: Clean, responsive interface with visual feedback.
|
1527 |
### π‘ Tips
|
1528 |
+
1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type, following links up to the specified **Crawl Depth**.
|
1529 |
2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
|
1530 |
3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
|
1531 |
4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
|
1532 |
5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
|
1533 |
6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
|
1534 |
7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.
|
|
|
|
|
|
|
|
|
|
|
|
|
1535 |
### βοΈ QR Code Viewport Instructions
|
1536 |
1. Navigate to the **QR Code Viewport** tab after generating QR codes.
|
1537 |
2. The generated QR codes will be displayed in a grid based on their total count.
|
|
|
1563 |
raise # Re-raise the exception to ensure the process exits if launch fails
|
1564 |
|
1565 |
if __name__ == "__main__":
|
1566 |
+
main()
|
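
For reference, a minimal usage sketch of the crawl-depth behaviour this change introduces, outside the Gradio UI. It assumes the updated file is importable as a module named app2 and that the optional dependencies are installed; the summarize helper is illustrative and not part of the commit, while the method name, max_steps range (0-10), and result keys come from the diff above.

from app2 import EnhancedURLProcessor

processor = EnhancedURLProcessor()

# Follow links up to two levels below the start URL (the new code accepts 0-10).
result = processor.fetch_content_with_depth("https://quotes.toscrape.com/", max_steps=2)

# Each node carries 'url', 'level', 'fetch_result', 'linked_extractions' and 'processing_notes'.
def summarize(node, indent=0):
    if not node:
        return
    print(" " * indent + f"{node.get('url')} (level {node.get('level')}), "
          f"{len(node.get('linked_extractions', []))} linked")
    for child in node.get('linked_extractions', []):
        summarize(child, indent + 2)

summarize(result)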