#!/usr/bin/env python3
"""
Example script that demonstrates how to use the web crawler programmatically.
This example:
1. Initializes the crawler
2. Adds seed URLs
3. Starts the crawler with 2 workers
4. Monitors progress for a specific duration
5. Pauses, resumes, and stops the crawler
6. Exports crawl data
Usage:
python example.py [--time=<seconds>] [--workers=<num>] [--async]
Options:
--time=<seconds> Duration of the crawl in seconds [default: 60]
--workers=<num> Number of worker threads [default: 2]
--async Use asynchronous mode
"""
import time
import logging
import sys
import json
import os
import signal
import threading

from docopt import docopt

from crawler import Crawler
from models import URLStatus, Priority
import config

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('example')


def log_stats(crawler, interval=5):
    """Log crawler statistics periodically"""
    stats = crawler.stats
    elapsed = time.time() - stats['start_time']

    logger.info(f"=== Crawler Statistics (after {int(elapsed)}s) ===")
    logger.info(f"Pages crawled: {stats['pages_crawled']}")
    logger.info(f"Pages failed: {stats['pages_failed']}")
    logger.info(f"URLs discovered: {stats['urls_discovered']}")
    logger.info(f"URLs filtered: {stats['urls_filtered']}")
    logger.info(f"Domains crawled: {len(stats['domains_crawled'])}")
    logger.info(f"Frontier size: {crawler.frontier.size()}")

    # Status code distribution
    status_codes = stats['status_codes']
    if status_codes:
        logger.info("Status code distribution:")
        for status, count in sorted(status_codes.items()):
            logger.info(f"  {status}: {count}")

    # Check if crawler is still running
    if crawler.running and not crawler.stop_event.is_set():
        # Schedule next logging
        timer = threading.Timer(interval, log_stats, args=[crawler, interval])
        timer.daemon = True
        timer.start()


def example_crawl(duration=60, workers=2, async_mode=False):
    """
    Example crawler use

    Args:
        duration: Duration of the crawl in seconds
        workers: Number of worker threads
        async_mode: Whether to use async mode
    """
    logger.info("Initializing web crawler...")

    # Initialize crawler
    crawler = Crawler()

    # Add seed URLs
    seed_urls = [
        'https://en.wikipedia.org/wiki/Web_crawler',
        'https://en.wikipedia.org/wiki/Search_engine',
        'https://en.wikipedia.org/wiki/Web_indexing',
        'https://python.org',
        'https://www.example.com'
    ]
    logger.info(f"Adding {len(seed_urls)} seed URLs...")
    crawler.add_seed_urls(seed_urls)

    # Set up signal handling
    def signal_handler(sig, frame):
        logger.info("Received interrupt signal, stopping crawler")
        crawler.stop()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # Start a thread to log stats periodically
    log_stats(crawler, interval=5)

    # Start the crawler in a separate thread
    logger.info(f"Starting crawler with {workers} workers (async={async_mode})...")
    crawler_thread = threading.Thread(
        target=crawler.start,
        kwargs={'num_workers': workers, 'async_mode': async_mode}
    )
    crawler_thread.daemon = True
    crawler_thread.start()

    # Let the crawler run for a while
    logger.info(f"Crawler will run for {duration} seconds...")
    time.sleep(duration // 2)

    # Pause crawler
    logger.info("Pausing crawler for 5 seconds...")
    crawler.pause()
    time.sleep(5)

    # Resume crawler
    logger.info("Resuming crawler...")
    crawler.resume()
    time.sleep(duration // 2)

    # Stop crawler
    logger.info("Stopping crawler...")
    crawler.stop()

    # Wait for crawler to stop
    crawler_thread.join(timeout=10)
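
    # Optional safeguard: if the worker thread is still alive after the join
    # timeout, log a warning instead of blocking the export step (assumes
    # crawler.stop() has already signalled the workers to exit).
    if crawler_thread.is_alive():
        logger.warning("Crawler thread did not stop within 10 seconds")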

    # Export crawl data
    export_dir = os.path.join(config.STORAGE_PATH, 'exports')
    os.makedirs(export_dir, exist_ok=True)
    export_file = os.path.join(export_dir, 'example_crawl_results.json')

    logger.info(f"Exporting crawl data to {export_file}...")
    export_results(crawler, export_file)

    logger.info("Crawl example completed")

    # Print summary
    print_summary(crawler)


def export_results(crawler, output_file):
    """
    Export crawler results to a file

    Args:
        crawler: Crawler instance
        output_file: Output file path
    """
    try:
        # Get MongoDB collections
        pages_collection = crawler.db.pages_collection
        urls_collection = crawler.db.urls_collection

        # Get data
        pages = list(pages_collection.find({}, {'_id': 0}).limit(1000))
        urls = list(urls_collection.find({}, {'_id': 0}).limit(1000))

        # Prepare export data
        export_data = {
            'metadata': {
                'crawl_duration': time.time() - crawler.stats['start_time'],
                'pages_crawled': crawler.stats['pages_crawled'],
                'urls_discovered': crawler.stats['urls_discovered'],
                'domains_crawled': list(crawler.stats['domains_crawled']),
                'exported_pages': len(pages),
                'exported_urls': len(urls),
                'export_timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            },
            'pages': pages,
            'urls': urls,
            'stats': crawler.stats
        }

        # Convert datetime objects to strings for JSON serialization
        export_data = json.loads(json.dumps(export_data, default=str))

        # Write to file
        with open(output_file, 'w') as f:
            json.dump(export_data, f, indent=2)

        logger.info(f"Exported data to {output_file}")
    except Exception as e:
        logger.error(f"Error exporting results: {e}")


def print_summary(crawler):
    """
    Print a summary of the crawl

    Args:
        crawler: Crawler instance
    """
    stats = crawler.stats

    print("\n=============== CRAWL SUMMARY ===============")
    print(f"Duration: {time.time() - stats['start_time']:.2f} seconds")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Pages failed: {stats['pages_failed']}")
    print(f"URLs discovered: {stats['urls_discovered']}")
    print(f"URLs filtered: {stats['urls_filtered']}")
    print(f"Domains crawled: {len(stats['domains_crawled'])}")

    if stats['domains_crawled']:
        print("\nTop domains:")
        domain_counts = {}

        # Count pages per domain
        for page in crawler.db.pages_collection.find({}, {'domain': 1}):
            domain = page.get('domain', 'unknown')
            domain_counts[domain] = domain_counts.get(domain, 0) + 1

        # Display top domains
        for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {domain}: {count} pages")

    print("\nHTTP Status Codes:")
    for status, count in sorted(stats['status_codes'].items()):
        print(f"  {status}: {count}")

    print("\nContent Types:")
    for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {content_type}: {count}")

    print("=============================================\n")


if __name__ == '__main__':
    # Parse command-line arguments
    args = docopt(__doc__)
    duration = int(args['--time'])
    workers = int(args['--workers'])
    async_mode = args['--async']
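
    # Lightweight sanity check on the parsed values (an assumption about what
    # the Crawler expects, not something docopt enforces on its own).
    if duration <= 0 or workers <= 0:
        sys.exit("--time and --workers must be positive integers")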

    try:
        example_crawl(duration, workers, async_mode)
    except KeyboardInterrupt:
        logger.info("Example interrupted by user")
    except Exception as e:
        logger.error(f"Error in example: {e}")
        logger.exception(e)