Update app2.py

app2.py CHANGED
@@ -353,178 +353,252 @@ class EnhancedURLProcessor:
         return extracted

     def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
-        """
-            3: Fetch up to the third level of links.
-
-        Returns:
-            A dictionary containing the extraction result for the initial URL and
-            nested results for followed links.
-        """
-        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
-            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
-            return {
-                'url': url,
-                'level': 0,
-                'fetch_result': None,
-                'linked_extractions': [],
-                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
-            }
-
         return {
             'url': url,
-            'level':
             'fetch_result': None,
             'linked_extractions': [],
-            '
         }

-    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
-        """Recursive helper to fetch content and follow links."""
-
-        if current_step > max_steps:
-            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
         return {
             'url': url,
-            'level':
-            'fetch_result': None,
             'linked_extractions': [],
-            '
         }

-        if current_step < max_steps:
-            for link_info in links:
-                linked_url = link_info.get('url')
-                if linked_url:
-                    # Simple check to avoid re-fetching the same URL repeatedly in a chain
-                    # More sophisticated cycle detection might be needed for complex graphs
-                    if linked_url != urlparse(url)._replace(fragment='').geturl(): # Avoid self-referencing links ignoring fragment
-                        # Recursively call for the linked URL
-                        linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
-                        linked_extractions.append(linked_result)
-                    else:
-                        logger.debug(f"Skipping self-referencing link: {linked_url}")
-                        linked_extractions.append({
-                            'url': linked_url,
-                            'level': current_step + 1,
-                            'fetch_result': None,
-                            'linked_extractions': [],
-                            'note': 'Skipped self-referencing link'
-                        })
-                else:
-                    linked_extractions.append({
-                        'url': 'Invalid or missing link',
-                        'level': current_step + 1,
-                        'fetch_result': None,
-                        'linked_extractions': [],
-                        'note': 'Link URL not found or invalid'
-                    })
-        else:
-            logger.info(f"Max depth ({max_steps}) reached. Not following links from {url}.")
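
Note on the removed block: the deleted comment itself concedes that more sophisticated cycle detection might be needed for complex graphs. The equality check only catches a page linking back to itself, not longer cycles such as A -> B -> A, which this code would re-fetch until the depth limit. A minimal sketch of visited-set cycle detection (a standalone illustration, not code from app2.py; `crawl` and the injected `fetch` callable are hypothetical names):

    from urllib.parse import urlparse

    def crawl(url, max_steps, fetch, current_step=0, visited=None):
        """Depth-limited crawl that never revisits a URL already seen on this walk."""
        visited = visited if visited is not None else set()
        normalized = urlparse(url)._replace(fragment='').geturl()  # /page and /page#top count as one node
        if current_step > max_steps or normalized in visited:
            return {'url': url, 'level': current_step, 'fetch_result': None,
                    'linked_extractions': [], 'note': 'skipped: depth limit or already visited'}
        visited.add(normalized)
        links = fetch(url)  # caller-supplied: returns the list of linked URLs on the page
        children = [crawl(u, max_steps, fetch, current_step + 1, visited) for u in links]
        return {'url': url, 'level': current_step, 'fetch_result': f'content of {url}',
                'linked_extractions': children, 'note': f'Processed at level {current_step}'}

Threading one shared set through the recursion costs an entry per fetched URL and bounds the crawl even on cyclic link graphs.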

-        return {
-            'url': url,
-            'level': current_step,
-            'fetch_result': fetch_result,
-            'linked_extractions': linked_extractions,
-            'note': f"Processed at level {current_step}"
-        }

-# --- Example Usage ---
 if __name__ == "__main__":
     processor = EnhancedURLProcessor()

-    #

     # Test with 0 steps (only initial URL)
     print("\n--- Testing with max_steps = 0 ---")
-    result_0 = processor.fetch_content_with_depth(
     # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
-    print(f"Initial URL ({result_0['url']}) fetched at level {result_0['level']}. Success: {result_0['fetch_result'] is not None}")
-    print(f"Number of linked extractions: {len(result_0['linked_extractions'])}") # Should be 0

     # Test with 1 step (initial URL + its direct links)
-    # Note: Replace with a real website URL that has internal links for meaningful testing
-    # For demonstration, using a placeholder. A real site like a blog post or news article front page is better.
-    test_url_with_links = "https://quotes.toscrape.com/" # Example site with links
     print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
     result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
     # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
-    print(f"Initial URL ({result_1['url']}) fetched at level {result_1['level']}. Success: {result_1['fetch_result'] is not None}")
-    print(f"Number of direct links found and processed: {len(result_1['linked_extractions'])}")
-    if result_1['linked_extractions']:
-        print(f"First linked URL processed at level 1: {result_1['linked_extractions'][0]['url']}")
-        print(f"Number of links found on the first linked page: {len(result_1['linked_extractions'][0]['linked_extractions'])}") # Should be 0 for max_steps=1

     # Test with 2 steps
     print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
     result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
     # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure
-    print(f"Initial URL ({result_2['url']}) fetched at level {result_2['level']}. Success: {result_2['fetch_result'] is not None}")
-    print(f"First linked URL processed at level 1: {result_2['linked_extractions'][0]['url']}")
-    print(f"Number of links found on the first linked page and processed (Level 2): {len(result_2['linked_extractions'][0]['linked_extractions'])}")
-    if result_2['linked_extractions'][0]['linked_extractions']:
-        print(f"First level 2 linked URL: {result_2['linked_extractions'][0]['linked_extractions'][0]['url']}")
-        print(f"Number of links found on the first level 2 page: {len(result_2['linked_extractions'][0]['linked_extractions'][0]['linked_extractions'])}") # Should be 0 for max_steps=2

-    # Test with max_steps = 3 (will go one level deeper than 2)
-    # print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
-    # result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
     # print(json.dumps(result_3, indent=2)) # Uncomment to see full structure

-    # Test with
-    print("\n--- Testing with
-    print(
     result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
-    print(

-    # Test with a URL that might fail to fetch
-    print("\n--- Testing with a potentially failing URL ---")
     # Use a non-existent subdomain or a port that's unlikely to be open
     failing_url = "http://this-domain-does-not-exist-12345.com/"
     result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
-    print(

 class EnhancedFileProcessor:
     """Advanced file processing with enhanced content extraction"""

         return extracted

     def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
+        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
+            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
+            }
+
+        validation_result = self.validate_url(url)
+        if not validation_result['is_valid']:
+            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Initial URL validation failed: {validation_result['message']}"
+            }
+
+        return self._fetch_content_recursive(url, max_steps, current_step=0)
+
+    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
+        if current_step > max_steps:
+            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
+            return {
+                'url': url,
+                'level': current_step,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Depth limit ({max_steps}) reached."
+            }
+
+        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
+        fetch_result = self.fetch_content(url)
+        linked_extractions: List[Dict[str, Any]] = []
+
+        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
+            extracted_data = fetch_result['extracted_data']
+            links = extracted_data.get('links', [])
+
+            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
+            if current_step < max_steps:
+                for link_info in links:
+                    linked_url = link_info.get('url')
+                    if linked_url:
+                        linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
+                        linked_extractions.append(linked_result)
+
+        return {
+            'url': url,
+            'level': current_step,
+            'fetch_result': fetch_result,
+            'linked_extractions': linked_extractions,
+            'note': f"Processed at level {current_step}"
+        }
+
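Note: the rewritten recursion drops the old self-referencing-link check entirely, so pages that link to each other are re-fetched until the depth cap (at most 3, enforced by fetch_content_with_depth). Recursion depth is therefore small, but each level still fans out once per link on every page. An iterative breadth-first variant keeps the frontier explicit and avoids deep call stacks; this is a sketch assuming fetch_content behaves as it is used above, not part of this commit:

    from collections import deque

    def crawl_breadth_first(processor, root_url, max_steps):
        """Iterative equivalent of _fetch_content_recursive: a FIFO queue replaces the call stack."""
        root = {'url': root_url, 'level': 0, 'fetch_result': None, 'linked_extractions': [], 'note': ''}
        queue = deque([root])
        while queue:
            node = queue.popleft()
            node['fetch_result'] = processor.fetch_content(node['url'])
            node['note'] = f"Processed at level {node['level']}"
            if node['level'] >= max_steps or not node['fetch_result']:
                continue
            links = node['fetch_result'].get('extracted_data', {}).get('links', [])
            for link_info in links:
                linked_url = link_info.get('url')
                if linked_url:
                    child = {'url': linked_url, 'level': node['level'] + 1,
                             'fetch_result': None, 'linked_extractions': [], 'note': ''}
                    node['linked_extractions'].append(child)  # same nested result shape as the recursive version
                    queue.append(child)
        return root
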
+class EnhancedURLProcessor:
+    def fetch_content_with_depth(self, url, max_steps, current_level=0):
+        """Simulates fetching and processing URLs up to max_steps depth."""
+        # print(f"Simulating fetch for {url} at level {current_level} with remaining steps {max_steps}...") # Debug print
+
+        # Simulate handling invalid URL format
+        if not url.startswith('http://') and not url.startswith('https://'):
             return {
                 'url': url,
+                'level': current_level,
                 'fetch_result': None,
                 'linked_extractions': [],
+                'processing_notes': 'Invalid URL format.'
             }

+        # Base case for recursion depth
+        if max_steps < 0:
+            # This case should ideally not be reached if initial max_steps is non-negative
+            # and recursion correctly decrements, but included for robustness.
             return {
                 'url': url,
+                'level': current_level,
+                'fetch_result': None,
                 'linked_extractions': [],
+                'processing_notes': f'Recursion depth limit reached unexpectedly at level {current_level}.'
             }

+        fetch_success = True # Assume success for simulation by default
+        fetch_content = f"Simulated content for {url}" # Dummy content
+        processing_notes = ""
+
+        # Simulate a potentially failing URL
+        if "this-domain-does-not-exist" in url:
+            fetch_success = False
+            fetch_content = None
+            processing_notes = "Simulated network error: Could not resolve host."
+
+        linked_extractions = []
+        # Simulate finding links only if more steps are allowed and fetch was successful
+        if max_steps > 0 and fetch_success:
+            # Simulate finding a couple of links to demonstrate nesting.
+            # In a real implementation, this would involve parsing the fetched content
+            # and resolving relative URLs.
+            simulated_linked_urls = [f"{url}/child1", f"{url}/child2"]
+            for linked_url in simulated_linked_urls:
+                # Recursively call for linked URLs, decreasing max_steps and increasing current_level
+                linked_result = self.fetch_content_with_depth(linked_url, max_steps - 1, current_level + 1)
+                if linked_result:
+                    linked_extractions.append(linked_result)

+        return {
+            'url': url,
+            'level': current_level,
+            'fetch_result': fetch_content, # Keep content even if fetch_success is False, or set to None based on desired behavior
+            'linked_extractions': linked_extractions,
+            'processing_notes': processing_notes if processing_notes else 'Simulated fetch successful.'
+        }

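Note: the comment inside the simulation ("In a real implementation, this would involve parsing the fetched content and resolving relative URLs") names the crux of real link-following. A minimal sketch of that step using BeautifulSoup and urllib.parse, both common choices for this job; nothing below is taken from app2.py:

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    def extract_links(base_url, html):
        """Collect absolute http(s) URLs from every <a href> in an HTML page."""
        soup = BeautifulSoup(html, "html.parser")
        links = []
        for anchor in soup.find_all("a", href=True):
            absolute = urljoin(base_url, anchor["href"])  # resolve relative hrefs against the page URL
            if absolute.startswith(("http://", "https://")):  # drop mailto:, javascript:, fragment-only hrefs
                links.append(absolute)
        return links

urljoin handles the cases naive string concatenation gets wrong: ../ segments, root-relative /paths, and protocol-relative //host links.
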
+# Define a helper function to recursively print extraction details
+def print_extraction_details(extraction, max_level, current_level=0):
+    """Recursively prints details of the extraction and its linked extractions."""
+    if not extraction:
+        return

+    indent = " " * current_level
+    url = extraction.get('url', 'N/A')
+    level = extraction.get('level', 'N/A')
+    fetch_success = extraction.get('fetch_result') is not None and 'error' not in extraction.get('processing_notes', '').lower()
+    num_linked = len(extraction.get('linked_extractions', []))
+    notes = extraction.get('processing_notes', '')

+    print(f"{indent}URL: {url} (Level {level}). Success: {fetch_success}")
+    print(f"{indent}Number of linked extractions found: {num_linked}")
+    if notes:
+        print(f"{indent}Notes: {notes}")

+    if current_level < max_level and extraction.get('linked_extractions'):
+        # print(f"{indent}Processing linked extractions (Level {current_level + 1}):") # Optional header
+        for i, linked_extraction in enumerate(extraction['linked_extractions']):
+            # print(f"{indent} Linked Extraction {i+1}:") # Optional item separator
+            print_extraction_details(linked_extraction, max_level, current_level + 1)

 if __name__ == "__main__":
+    # Instantiate the processor
     processor = EnhancedURLProcessor()

+    # Using quotes.toscrape.com as it has multiple links (in a real scenario).
+    # For this simulation, the dummy processor creates nested links regardless of the actual URL content.
+    test_url_with_links = "https://quotes.toscrape.com/"
+
+    # --- Test Cases (Extended up to max_steps = 10) ---

     # Test with 0 steps (only initial URL)
     print("\n--- Testing with max_steps = 0 ---")
+    result_0 = processor.fetch_content_with_depth(test_url_with_links, max_steps=0)
     # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_0, 0)

     # Test with 1 step (initial URL + its direct links)
     print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
     result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
     # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_1, 1)

     # Test with 2 steps
     print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
     result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
     # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure
+    print_extraction_details(result_2, 2)
|
525 |
+
# Test with max_steps = 3
|
526 |
+
print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
|
527 |
+
result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
# print(json.dumps(result_3, indent=2)) # Uncomment to see full structure
|
529 |
+
print_extraction_details(result_3, 3)
|
530 |
+
|
531 |
+
# Test with max_steps = 4
|
532 |
+
print(f"\n--- Testing with max_steps = 4 for {test_url_with_links} ---")
|
533 |
+
result_4 = processor.fetch_content_with_depth(test_url_with_links, max_steps=4)
|
534 |
+
# print(json.dumps(result_4, indent=2)) # Uncomment to see full structure
|
535 |
+
print_extraction_details(result_4, 4)
|
536 |
+
|
537 |
+
# Test with max_steps = 5
|
538 |
+
print(f"\n--- Testing with max_steps = 5 for {test_url_with_links} ---")
|
539 |
+
result_5 = processor.fetch_content_with_depth(test_url_with_links, max_steps=5)
|
540 |
+
# print(json.dumps(result_5, indent=2)) # Uncomment to see full structure
|
541 |
+
print_extraction_details(result_5, 5)
|
542 |
+
|
543 |
+
# Test with max_steps = 6
|
544 |
+
print(f"\n--- Testing with max_steps = 6 for {test_url_with_links} ---")
|
545 |
+
result_6 = processor.fetch_content_with_depth(test_url_with_links, max_steps=6)
|
546 |
+
# print(json.dumps(result_6, indent=2)) # Uncomment to see full structure
|
547 |
+
print_extraction_details(result_6, 6)
|
548 |
+
|
549 |
+
# Test with max_steps = 7
|
550 |
+
print(f"\n--- Testing with max_steps = 7 for {test_url_with_links} ---")
|
551 |
+
result_7 = processor.fetch_content_with_depth(test_url_with_links, max_steps=7)
|
552 |
+
# print(json.dumps(result_7, indent=2)) # Uncomment to see full structure
|
553 |
+
print_extraction_details(result_7, 7)
|
554 |
+
|
555 |
+
# Test with max_steps = 8
|
556 |
+
print(f"\n--- Testing with max_steps = 8 for {test_url_with_links} ---")
|
557 |
+
result_8 = processor.fetch_content_with_depth(test_url_with_links, max_steps=8)
|
558 |
+
# print(json.dumps(result_8, indent=2)) # Uncomment to see full structure
|
559 |
+
print_extraction_details(result_8, 8)
|
560 |
+
|
561 |
+
# Test with max_steps = 9
|
562 |
+
print(f"\n--- Testing with max_steps = 9 for {test_url_with_links} ---")
|
563 |
+
result_9 = processor.fetch_content_with_depth(test_url_with_links, max_steps=9)
|
564 |
+
# print(json.dumps(result_9, indent=2)) # Uncomment to see full structure
|
565 |
+
print_extraction_details(result_9, 9)
|
566 |
+
|
567 |
+
# Test with max_steps = 10
|
568 |
+
print(f"\n--- Testing with max_steps = 10 for {test_url_with_links} ---")
|
569 |
+
result_10 = processor.fetch_content_with_depth(test_url_with_links, max_steps=10)
|
570 |
+
# print(json.dumps(result_10, indent=2)) # Uncomment to see full structure
|
571 |
+
print_extraction_details(result_10, 10)
|
+
+    # Test with invalid max_steps (e.g., negative)
+    print("\n--- Testing with invalid max_steps = -1 ---")
+    result_invalid_steps = processor.fetch_content_with_depth(test_url_with_links, max_steps=-1)
+    # print(json.dumps(result_invalid_steps, indent=2)) # Uncomment to see full structure
+    print(f"Result for invalid steps: {result_invalid_steps.get('processing_notes')}")
+
+    # Test with invalid initial URL format
+    print("\n--- Testing with invalid initial URL format ---")
     result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
+    # print(json.dumps(result_invalid_url, indent=2)) # Uncomment to see full structure
+    print(f"Result for invalid initial URL: {result_invalid_url.get('processing_notes')}")

+    # Test with a URL that might fail to fetch (simulated)
+    print("\n--- Testing with a potentially failing URL (simulated) ---")
     # Use a non-existent subdomain or a port that's unlikely to be open
     failing_url = "http://this-domain-does-not-exist-12345.com/"
     result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
+    # print(json.dumps(result_fail, indent=2)) # Uncomment to see full structure
+    print(f"Result for failing URL: {result_fail.get('processing_notes')}")
+    # Check if fetch_result is None or indicates failure
+    if result_fail.get('fetch_result') is None:
+        print("Fetch result is None as expected for failing URL.")
+    # if result_fail.get('fetch_result') and 'error' in result_fail['fetch_result'].get('processing_notes', '').lower():
+    #     print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
+
+    print("\n--- End of Test Cases ---")

 class EnhancedFileProcessor:
     """Advanced file processing with enhanced content extraction"""
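
Note: this hunk calls self.validate_url and self.fetch_content without showing them. From how _fetch_content_recursive consumes the result (fetch_result['extracted_data']['links'] with per-link 'url' keys, and fetch_result['metadata']['content_type']), a shape-compatible stand-in could look like the sketch below; this is an assumption for illustration, not the actual app2.py implementation:

    import logging
    import requests  # third-party; the real fetch layer in app2.py is not shown in this hunk

    def fetch_content_stub(url, timeout=10.0):
        """Returns the dict fields this hunk reads, or None on failure."""
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            logging.error(f"Fetch failed for {url}: {exc}")
            return None
        return {
            'metadata': {'content_type': response.headers.get('Content-Type', '')},
            'extracted_data': {
                # only the 'url' key of each link dict is read by _fetch_content_recursive
                'links': [{'url': u} for u in extract_links(url, response.text)],
            },
        }

Here extract_links refers to the sketch in the earlier note; a real implementation would also need the {'is_valid': ..., 'message': ...} dict that the hunk shows validate_url returning.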