#!/usr/bin/env python3
"""
Example script that demonstrates how to use the web crawler programmatically.

This example:
    1. Initializes the crawler
    2. Adds seed URLs
    3. Starts the crawler with the requested number of workers (default: 2)
    4. Monitors progress for a specific duration
    5. Pauses, resumes, and stops the crawler
    6. Exports crawl data

Usage:
    example.py [--time=<seconds>] [--workers=<num>] [--async]

Options:
    --time=<seconds>    Duration of the crawl in seconds [default: 60]
    --workers=<num>     Number of worker threads [default: 2]
    --async             Use asynchronous mode
"""
import time
import logging
import sys
import json
import os
import signal
import threading

from docopt import docopt

from crawler import Crawler
from models import URLStatus, Priority
import config
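
# docopt is a third-party dependency (typically installed with `pip install docopt`);
# crawler, models, and config are local modules from this project.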

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('example')


def log_stats(crawler, interval=5):
    """Log crawler statistics periodically."""
    stats = crawler.stats
    # 'start_time' is set once the crawler starts; fall back to the current
    # time so this is safe to call before the crawler thread is running.
    elapsed = time.time() - (stats.get('start_time') or time.time())

    logger.info(f"=== Crawler Statistics (after {int(elapsed)}s) ===")
    logger.info(f"Pages crawled: {stats['pages_crawled']}")
    logger.info(f"Pages failed: {stats['pages_failed']}")
    logger.info(f"URLs discovered: {stats['urls_discovered']}")
    logger.info(f"URLs filtered: {stats['urls_filtered']}")
    logger.info(f"Domains crawled: {len(stats['domains_crawled'])}")
    logger.info(f"Frontier size: {crawler.frontier.size()}")

    # Status code distribution
    status_codes = stats['status_codes']
    if status_codes:
        logger.info("Status code distribution:")
        for status, count in sorted(status_codes.items()):
            logger.info(f"  {status}: {count}")

    # Reschedule until the crawler is explicitly stopped. Checking only the
    # stop event (rather than crawler.running) keeps the periodic logging
    # alive even when this is first called before the crawler has started.
    if not crawler.stop_event.is_set():
        timer = threading.Timer(interval, log_stats, args=[crawler, interval])
        timer.daemon = True
        timer.start()


def example_crawl(duration=60, workers=2, async_mode=False):
    """
    Example crawler use.

    Args:
        duration: Duration of the crawl in seconds
        workers: Number of worker threads
        async_mode: Whether to use async mode
    """
    logger.info("Initializing web crawler...")

    # Initialize crawler
    crawler = Crawler()

    # Add seed URLs
    seed_urls = [
        'https://en.wikipedia.org/wiki/Web_crawler',
        'https://en.wikipedia.org/wiki/Search_engine',
        'https://en.wikipedia.org/wiki/Web_indexing',
        'https://python.org',
        'https://www.example.com'
    ]
    logger.info(f"Adding {len(seed_urls)} seed URLs...")
    crawler.add_seed_urls(seed_urls)

    # Set up signal handling
    def signal_handler(sig, frame):
        logger.info("Received interrupt signal, stopping crawler")
        crawler.stop()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # Kick off periodic stats logging; log_stats reschedules itself on a
    # daemon timer every `interval` seconds until the crawler is stopped
    log_stats(crawler, interval=5)

    # Start the crawler in a separate thread
    logger.info(f"Starting crawler with {workers} workers (async={async_mode})...")
    crawler_thread = threading.Thread(
        target=crawler.start,
        kwargs={'num_workers': workers, 'async_mode': async_mode}
    )
    crawler_thread.daemon = True
    crawler_thread.start()

    # Let the crawler run for a while
    logger.info(f"Crawler will run for {duration} seconds...")
    time.sleep(duration // 2)

    # Pause crawler
    logger.info("Pausing crawler for 5 seconds...")
    crawler.pause()
    time.sleep(5)

    # Resume crawler
    logger.info("Resuming crawler...")
    crawler.resume()
    time.sleep(duration // 2)

    # Stop crawler
    logger.info("Stopping crawler...")
    crawler.stop()

    # Wait for crawler to stop
    crawler_thread.join(timeout=10)

    # Export crawl data
    export_dir = os.path.join(config.STORAGE_PATH, 'exports')
    os.makedirs(export_dir, exist_ok=True)
    export_file = os.path.join(export_dir, 'example_crawl_results.json')

    logger.info(f"Exporting crawl data to {export_file}...")
    export_results(crawler, export_file)

    logger.info("Crawl example completed")

    # Print summary
    print_summary(crawler)


def export_results(crawler, output_file):
    """
    Export crawler results to a file.

    Args:
        crawler: Crawler instance
        output_file: Output file path
    """
    try:
        # Get MongoDB collections
        pages_collection = crawler.db.pages_collection
        urls_collection = crawler.db.urls_collection

        # Get data
        pages = list(pages_collection.find({}, {'_id': 0}).limit(1000))
        urls = list(urls_collection.find({}, {'_id': 0}).limit(1000))

        # Prepare export data
        export_data = {
            'metadata': {
                'crawl_duration': time.time() - crawler.stats['start_time'],
                'pages_crawled': crawler.stats['pages_crawled'],
                'urls_discovered': crawler.stats['urls_discovered'],
                'domains_crawled': list(crawler.stats['domains_crawled']),
                'exported_pages': len(pages),
                'exported_urls': len(urls),
                'export_timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            },
            'pages': pages,
            'urls': urls,
            'stats': crawler.stats
        }

        # Convert datetime objects to strings for JSON serialization
        export_data = json.loads(json.dumps(export_data, default=str))

        # Write to file
        with open(output_file, 'w') as f:
            json.dump(export_data, f, indent=2)

        logger.info(f"Exported data to {output_file}")
    except Exception as e:
        logger.error(f"Error exporting results: {e}")


def print_summary(crawler):
    """
    Print a summary of the crawl.

    Args:
        crawler: Crawler instance
    """
    stats = crawler.stats

    print("\n=============== CRAWL SUMMARY ===============")
    print(f"Duration: {time.time() - stats['start_time']:.2f} seconds")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Pages failed: {stats['pages_failed']}")
    print(f"URLs discovered: {stats['urls_discovered']}")
    print(f"URLs filtered: {stats['urls_filtered']}")
    print(f"Domains crawled: {len(stats['domains_crawled'])}")

    if stats['domains_crawled']:
        print("\nTop domains:")
        domain_counts = {}

        # Count pages per domain
        for page in crawler.db.pages_collection.find({}, {'domain': 1}):
            domain = page.get('domain', 'unknown')
            domain_counts[domain] = domain_counts.get(domain, 0) + 1

        # Display top domains
        for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {domain}: {count} pages")

    print("\nHTTP Status Codes:")
    for status, count in sorted(stats['status_codes'].items()):
        print(f"  {status}: {count}")

    print("\nContent Types:")
    for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {content_type}: {count}")

    print("=============================================\n")


if __name__ == '__main__':
    # Parse command-line arguments
    args = docopt(__doc__)
    duration = int(args['--time'])
    workers = int(args['--workers'])
    async_mode = args['--async']

    try:
        example_crawl(duration, workers, async_mode)
    except KeyboardInterrupt:
        logger.info("Example interrupted by user")
    except Exception as e:
        logger.error(f"Error in example: {e}")
        logger.exception(e)