acecalisto3 committed
Commit 2e9ddb9 · verified · 1 Parent(s): f6d2e06

Update app2.py

Files changed (1)
  1. app2.py +203 -129
app2.py CHANGED
@@ -353,178 +353,252 @@ class EnhancedURLProcessor:
        return extracted

    def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
-        """
-        Fetches content from a URL and recursively follows links up to a specified depth.
-
-        Args:
-            url: The initial URL to fetch.
-            max_steps: The maximum number of levels to follow links (0-3).
-                0: Only fetch the initial URL.
-                1: Fetch the initial URL and the links found on that page.
-                2: Fetch the initial URL, its links, and the links on those pages.
-                3: Fetch up to the third level of links.
-
-        Returns:
-            A dictionary containing the extraction result for the initial URL and
-            nested results for followed links.
-        """
-        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
-            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
-            return {
-                'url': url,
-                'level': 0,
-                'fetch_result': None,
-                'linked_extractions': [],
-                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
-            }

-        validation_result = self.validate_url(url)
-        if not validation_result['is_valid']:
-            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
            return {
                'url': url,
-                'level': 0,
                'fetch_result': None,
                'linked_extractions': [],
-                'note': f"Initial URL validation failed: {validation_result['message']}"
            }

-
-        return self._fetch_content_recursive(url, max_steps, current_step=0)
-
-    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
-        """Recursive helper to fetch content and follow links."""
-
-        if current_step > max_steps:
-            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
            return {
                'url': url,
-                'level': current_step,
-                'fetch_result': None,  # Indicate no fetch happened at this level
                'linked_extractions': [],
-                'note': f"Depth limit ({max_steps}) reached."
            }

-        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")

-        # Fetch content for the current URL
-        fetch_result = self.fetch_content(url)
-
-        linked_extractions: List[Dict[str, Any]] = []

-        # Only follow links if fetch was successful, content is HTML, and within depth limit
-        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
-            extracted_data = fetch_result['extracted_data']
-            links = extracted_data.get('links', [])  # Ensure links is a list even if missing

-            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")

-            # Recursively fetch linked content if not at max depth
-            if current_step < max_steps:
-                for link_info in links:
-                    linked_url = link_info.get('url')
-                    if linked_url:
-                        # Simple check to avoid re-fetching the same URL repeatedly in a chain
-                        # More sophisticated cycle detection might be needed for complex graphs
-                        if linked_url != urlparse(url)._replace(fragment='').geturl():  # Avoid self-referencing links ignoring fragment
-                            # Recursively call for the linked URL
-                            linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
-                            linked_extractions.append(linked_result)
-                        else:
-                            logger.debug(f"Skipping self-referencing link: {linked_url}")
-                            linked_extractions.append({
-                                'url': linked_url,
-                                'level': current_step + 1,
-                                'fetch_result': None,
-                                'linked_extractions': [],
-                                'note': 'Skipped self-referencing link'
-                            })
-                    else:
-                        linked_extractions.append({
-                            'url': 'Invalid or missing link',
-                            'level': current_step + 1,
-                            'fetch_result': None,
-                            'linked_extractions': [],
-                            'note': 'Link URL not found or invalid'
-                        })
-            else:
-                logger.info(f"Max depth ({max_steps}) reached. Not following links from {url}.")

-        return {
-            'url': url,
-            'level': current_step,
-            'fetch_result': fetch_result,
-            'linked_extractions': linked_extractions,
-            'note': f"Processed at level {current_step}"
-        }

-# --- Example Usage ---
if __name__ == "__main__":
    processor = EnhancedURLProcessor()

-    # --- Test Cases ---

    # Test with 0 steps (only initial URL)
    print("\n--- Testing with max_steps = 0 ---")
-    result_0 = processor.fetch_content_with_depth("https://httpbin.org/html", max_steps=0)
    # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
-
-    print(f"Initial URL ({result_0['url']}) fetched at level {result_0['level']}. Success: {result_0['fetch_result'] is not None}")
-    print(f"Number of linked extractions: {len(result_0['linked_extractions'])}")  # Should be 0

    # Test with 1 step (initial URL + its direct links)
-    # Note: Replace with a real website URL that has internal links for meaningful testing
-    # For demonstration, using a placeholder. A real site like a blog post or news article front page is better.
-    test_url_with_links = "https://quotes.toscrape.com/"  # Example site with links
    print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
    result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
    # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
-
-    print(f"Initial URL ({result_1['url']}) fetched at level {result_1['level']}. Success: {result_1['fetch_result'] is not None}")
-    print(f"Number of direct links found and processed: {len(result_1['linked_extractions'])}")
-    if result_1['linked_extractions']:
-        print(f"First linked URL processed at level 1: {result_1['linked_extractions'][0]['url']}")
-        print(f"Number of links found on the first linked page: {len(result_1['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=1

    # Test with 2 steps
    print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
    result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
    # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure

-    print(f"Initial URL ({result_2['url']}) fetched at level {result_2['level']}. Success: {result_2['fetch_result'] is not None}")
-    print(f"Number of direct links found and processed (Level 1): {len(result_2['linked_extractions'])}")
-    if result_2['linked_extractions']:
-        print(f"First linked URL processed at level 1: {result_2['linked_extractions'][0]['url']}")
-        print(f"Number of links found on the first linked page and processed (Level 2): {len(result_2['linked_extractions'][0]['linked_extractions'])}")
-        if result_2['linked_extractions'][0]['linked_extractions']:
-            print(f"First level 2 linked URL: {result_2['linked_extractions'][0]['linked_extractions'][0]['url']}")
-            print(f"Number of links found on the first level 2 page: {len(result_2['linked_extractions'][0]['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=2
-
-    # Test with max_steps = 3 (will go one level deeper than 2)
-    # print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
-    # result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
    # print(json.dumps(result_3, indent=2)) # Uncomment to see full structure
-    # Add similar print statements for result_3 to show levels 1, 2, and 3 counts
-
-    # Test with invalid max_steps
-    print("\n--- Testing with invalid max_steps = 4 ---")
-    result_invalid = processor.fetch_content_with_depth("https://example.com", max_steps=4)
-    print(f"Result for invalid steps: {result_invalid.get('note')}")
-
-    # Test with invalid initial URL
-    print("\n--- Testing with invalid initial URL ---")
    result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
-    print(f"Result for invalid initial URL: {result_invalid_url.get('note')}")

-    # Test with a URL that might fail to fetch
-    print("\n--- Testing with a potentially failing URL ---")
    # Use a non-existent subdomain or a port that's unlikely to be open
    failing_url = "http://this-domain-does-not-exist-12345.com/"
    result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
-    print(f"Result for failing URL: {result_fail.get('note')}")
-    if result_fail.get('fetch_result'):
-        print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")

class EnhancedFileProcessor:
    """Advanced file processing with enhanced content extraction"""
 
        return extracted

    def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
+        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
+            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
+            }

+        validation_result = self.validate_url(url)
+        if not validation_result['is_valid']:
+            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Initial URL validation failed: {validation_result['message']}"
+            }
+
+        return self._fetch_content_recursive(url, max_steps, current_step=0)
+
+    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
+        if current_step > max_steps:
+            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
+            return {
+                'url': url,
+                'level': current_step,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Depth limit ({max_steps}) reached."
+            }
+
+        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
+        fetch_result = self.fetch_content(url)
+        linked_extractions: List[Dict[str, Any]] = []
+
+        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
+            extracted_data = fetch_result['extracted_data']
+            links = extracted_data.get('links', [])
+
+            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
+            if current_step < max_steps:
+                for link_info in links:
+                    linked_url = link_info.get('url')
+                    if linked_url:
+                        linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
+                        linked_extractions.append(linked_result)
+
+        return {
+            'url': url,
+            'level': current_step,
+            'fetch_result': fetch_result,
+            'linked_extractions': linked_extractions,
+            'note': f"Processed at level {current_step}"
+        }
+
+class EnhancedURLProcessor:
+    def fetch_content_with_depth(self, url, max_steps, current_level=0):
+        """Simulates fetching and processing URLs up to max_steps depth."""
+        # print(f"Simulating fetch for {url} at level {current_level} with remaining steps {max_steps}...") # Debug print
+
+        # Simulate handling invalid URL format
+        if not url.startswith('http://') and not url.startswith('https://'):
            return {
                'url': url,
+                'level': current_level,
                'fetch_result': None,
                'linked_extractions': [],
+                'processing_notes': 'Invalid URL format.'
            }

+        # Base case for recursion depth
+        if max_steps < 0:
+            # This case should ideally not be reached if initial max_steps is non-negative
+            # and recursion correctly decrements, but included for robustness.
            return {
                'url': url,
+                'level': current_level,
+                'fetch_result': None,
                'linked_extractions': [],
+                'processing_notes': f'Recursion depth limit reached unexpectedly at level {current_level}.'
            }

+        fetch_success = True  # Assume success for simulation by default
+        fetch_content = f"Simulated content for {url}"  # Dummy content
+        processing_notes = ""
+
+        # Simulate a potentially failing URL
+        if "this-domain-does-not-exist" in url:
+            fetch_success = False
+            fetch_content = None
+            processing_notes = "Simulated network error: Could not resolve host."
+
+        linked_extractions = []
+        # Simulate finding links only if more steps are allowed and fetch was successful
+        if max_steps > 0 and fetch_success:
+            # Simulate finding a couple of links to demonstrate nesting
+            # In a real implementation, this would involve parsing the fetched content
+            # and resolving relative URLs.
+            simulated_linked_urls = [f"{url}/child1", f"{url}/child2"]
+            for linked_url in simulated_linked_urls:
+                # Recursively call for linked URLs, decreasing max_steps and increasing current_level
+                linked_result = self.fetch_content_with_depth(linked_url, max_steps - 1, current_level + 1)
+                if linked_result:
+                    linked_extractions.append(linked_result)

+        return {
+            'url': url,
+            'level': current_level,
+            'fetch_result': fetch_content,  # Keep content even if fetch_success is False, or set to None based on desired behavior
+            'linked_extractions': linked_extractions,
+            'processing_notes': processing_notes if processing_notes else 'Simulated fetch successful.'
+        }

+# Define a helper function to recursively print extraction details
+def print_extraction_details(extraction, max_level, current_level=0):
+    """Recursively prints details of the extraction and its linked extractions."""
+    if not extraction:
+        return

+    indent = " " * current_level
+    url = extraction.get('url', 'N/A')
+    level = extraction.get('level', 'N/A')
+    fetch_success = extraction.get('fetch_result') is not None and 'error' not in extraction.get('processing_notes', '').lower()
+    num_linked = len(extraction.get('linked_extractions', []))
+    notes = extraction.get('processing_notes', '')

+    print(f"{indent}URL: {url} (Level {level}). Success: {fetch_success}")
+    print(f"{indent}Number of linked extractions found: {num_linked}")
+    if notes:
+        print(f"{indent}Notes: {notes}")

+    if current_level < max_level and extraction.get('linked_extractions'):
+        # print(f"{indent}Processing linked extractions (Level {current_level + 1}):") # Optional header
+        for i, linked_extraction in enumerate(extraction['linked_extractions']):
+            # print(f"{indent} Linked Extraction {i+1}:") # Optional item separator
+            print_extraction_details(linked_extraction, max_level, current_level + 1)

if __name__ == "__main__":
+    # Instantiate the processor
    processor = EnhancedURLProcessor()

+    # Using quotes.toscrape.com as it has multiple links (in a real scenario)
+    # For this simulation, the dummy processor creates nested links regardless of the actual URL content.
+    test_url_with_links = "https://quotes.toscrape.com/"
+
+    # --- Test Cases (Extended up to max_steps = 10) ---

    # Test with 0 steps (only initial URL)
    print("\n--- Testing with max_steps = 0 ---")
+    result_0 = processor.fetch_content_with_depth(test_url_with_links, max_steps=0)
    # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_0, 0)

    # Test with 1 step (initial URL + its direct links)
    print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
    result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
    # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_1, 1)

    # Test with 2 steps
    print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
    result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
    # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_2, 2)
+
+    # Test with max_steps = 3
+    print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
+    result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
    # print(json.dumps(result_3, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_3, 3)
+
+    # Test with max_steps = 4
+    print(f"\n--- Testing with max_steps = 4 for {test_url_with_links} ---")
+    result_4 = processor.fetch_content_with_depth(test_url_with_links, max_steps=4)
+    # print(json.dumps(result_4, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_4, 4)
+
+    # Test with max_steps = 5
+    print(f"\n--- Testing with max_steps = 5 for {test_url_with_links} ---")
+    result_5 = processor.fetch_content_with_depth(test_url_with_links, max_steps=5)
+    # print(json.dumps(result_5, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_5, 5)
+
+    # Test with max_steps = 6
+    print(f"\n--- Testing with max_steps = 6 for {test_url_with_links} ---")
+    result_6 = processor.fetch_content_with_depth(test_url_with_links, max_steps=6)
+    # print(json.dumps(result_6, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_6, 6)
+
+    # Test with max_steps = 7
+    print(f"\n--- Testing with max_steps = 7 for {test_url_with_links} ---")
+    result_7 = processor.fetch_content_with_depth(test_url_with_links, max_steps=7)
+    # print(json.dumps(result_7, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_7, 7)
+
+    # Test with max_steps = 8
+    print(f"\n--- Testing with max_steps = 8 for {test_url_with_links} ---")
+    result_8 = processor.fetch_content_with_depth(test_url_with_links, max_steps=8)
+    # print(json.dumps(result_8, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_8, 8)
+
+    # Test with max_steps = 9
+    print(f"\n--- Testing with max_steps = 9 for {test_url_with_links} ---")
+    result_9 = processor.fetch_content_with_depth(test_url_with_links, max_steps=9)
+    # print(json.dumps(result_9, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_9, 9)
+
+    # Test with max_steps = 10
+    print(f"\n--- Testing with max_steps = 10 for {test_url_with_links} ---")
+    result_10 = processor.fetch_content_with_depth(test_url_with_links, max_steps=10)
+    # print(json.dumps(result_10, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_10, 10)
+
+
+    # Test with invalid max_steps (e.g., negative)
+    print("\n--- Testing with invalid max_steps = -1 ---")
+    result_invalid_steps = processor.fetch_content_with_depth(test_url_with_links, max_steps=-1)
+    # print(json.dumps(result_invalid_steps, indent=2)) # Uncomment to see full structure
+    print(f"Result for invalid steps: {result_invalid_steps.get('processing_notes')}")
+
+
+    # Test with invalid initial URL format
+    print("\n--- Testing with invalid initial URL format ---")
    result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
+    # print(json.dumps(result_invalid_url, indent=2)) # Uncomment to see full structure
+    print(f"Result for invalid initial URL: {result_invalid_url.get('processing_notes')}")

+    # Test with a URL that might fail to fetch (simulated)
+    print("\n--- Testing with a potentially failing URL (simulated) ---")
    # Use a non-existent subdomain or a port that's unlikely to be open
    failing_url = "http://this-domain-does-not-exist-12345.com/"
    result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
+    # print(json.dumps(result_fail, indent=2)) # Uncomment to see full structure
+    print(f"Result for failing URL: {result_fail.get('processing_notes')}")
+    # Check if fetch_result is None or indicates failure
+    if result_fail.get('fetch_result') is None:
+        print("Fetch result is None as expected for failing URL.")
+    # if result_fail.get('fetch_result') and 'error' in result_fail['fetch_result'].get('processing_notes', '').lower():
+    #     print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
+
+
+    print("\n--- End of Test Cases ---")

class EnhancedFileProcessor:
    """Advanced file processing with enhanced content extraction"""