#!/usr/bin/env python3
"""
Example script that demonstrates how to use the web crawler programmatically.

This example:
1. Initializes the crawler
2. Adds seed URLs
3. Starts the crawler with a configurable number of workers (default: 2)
4. Monitors progress for the requested duration
5. Pauses, resumes, and stops the crawler
6. Exports crawl data

Usage:
    python example.py [--time=<seconds>] [--workers=<num>] [--async]

Options:
    --time=<seconds>    Duration of the crawl in seconds [default: 60]
    --workers=<num>     Number of worker threads [default: 2]
    --async             Use asynchronous mode
"""

import time
import logging
import sys
import json
import os
import signal
import threading
from docopt import docopt

from crawler import Crawler
from models import URLStatus, Priority
import config

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('example')


def log_stats(crawler, interval=5):
    """Log crawler statistics and reschedule itself at the given interval while the crawler runs"""
    stats = crawler.stats
    # start_time is set once the crawl begins; fall back to now so a call before start() cannot fail
    elapsed = time.time() - (stats.get('start_time') or time.time())
    
    logger.info(f"=== Crawler Statistics (after {int(elapsed)}s) ===")
    logger.info(f"Pages crawled: {stats['pages_crawled']}")
    logger.info(f"Pages failed: {stats['pages_failed']}")
    logger.info(f"URLs discovered: {stats['urls_discovered']}")
    logger.info(f"URLs filtered: {stats['urls_filtered']}")
    logger.info(f"Domains crawled: {len(stats['domains_crawled'])}")
    logger.info(f"Frontier size: {crawler.frontier.size()}")
    
    # Status code distribution
    status_codes = stats['status_codes']
    if status_codes:
        logger.info("Status code distribution:")
        for status, count in sorted(status_codes.items()):
            logger.info(f"  {status}: {count}")
    
    # Check if crawler is still running
    if crawler.running and not crawler.stop_event.is_set():
        # Schedule next logging
        timer = threading.Timer(interval, log_stats, args=[crawler, interval])
        timer.daemon = True
        timer.start()


def example_crawl(duration=60, workers=2, async_mode=False):
    """
    Run an example crawl: start the crawler, monitor it, pause and resume, then stop and export results
    
    Args:
        duration: Duration of the crawl in seconds
        workers: Number of worker threads
        async_mode: Whether to use async mode
    """
    logger.info("Initializing web crawler...")
    
    # Initialize crawler
    crawler = Crawler()
    
    # Add seed URLs
    seed_urls = [
        'https://en.wikipedia.org/wiki/Web_crawler',
        'https://en.wikipedia.org/wiki/Search_engine',
        'https://en.wikipedia.org/wiki/Web_indexing',
        'https://python.org',
        'https://www.example.com'
    ]
    logger.info(f"Adding {len(seed_urls)} seed URLs...")
    crawler.add_seed_urls(seed_urls)
    
    # Set up signal handling
    def signal_handler(sig, frame):
        logger.info("Received interrupt signal, stopping crawler")
        crawler.stop()
        sys.exit(0)
    
    signal.signal(signal.SIGINT, signal_handler)
    
    # Log stats once now; log_stats reschedules itself on a daemon timer
    log_stats(crawler, interval=5)
    
    # Start the crawler in a separate thread
    logger.info(f"Starting crawler with {workers} workers (async={async_mode})...")
    crawler_thread = threading.Thread(
        target=crawler.start,
        kwargs={'num_workers': workers, 'async_mode': async_mode}
    )
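    # Daemon thread so the process can still exit if the crawler does not shut down cleanly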
    crawler_thread.daemon = True
    crawler_thread.start()
    
    # Let the crawler run for a while
    logger.info(f"Crawler will run for about {duration} seconds, plus a 5-second pause midway...")
    time.sleep(duration // 2)
    
    # Pause crawler
    logger.info("Pausing crawler for 5 seconds...")
    crawler.pause()
    time.sleep(5)
    
    # Resume crawler
    logger.info("Resuming crawler...")
    crawler.resume()
    time.sleep(duration // 2)
    
    # Stop crawler
    logger.info("Stopping crawler...")
    crawler.stop()
    
    # Give the crawler thread up to 10 seconds to finish
    crawler_thread.join(timeout=10)
    
    # Export crawl data
    export_dir = os.path.join(config.STORAGE_PATH, 'exports')
    os.makedirs(export_dir, exist_ok=True)
    export_file = os.path.join(export_dir, 'example_crawl_results.json')
    
    logger.info(f"Exporting crawl data to {export_file}...")
    export_results(crawler, export_file)
    
    logger.info("Crawl example completed")
    
    # Print summary
    print_summary(crawler)


def export_results(crawler, output_file):
    """
    Export crawler results to a file
    
    Args:
        crawler: Crawler instance
        output_file: Output file path
    """
    try:
        # Get MongoDB collections
        pages_collection = crawler.db.pages_collection
        urls_collection = crawler.db.urls_collection
        
        # Fetch up to 1,000 documents from each collection, excluding the Mongo _id field
        pages = list(pages_collection.find({}, {'_id': 0}).limit(1000))
        urls = list(urls_collection.find({}, {'_id': 0}).limit(1000))
        
        # Prepare export data
        export_data = {
            'metadata': {
                'crawl_duration': time.time() - crawler.stats['start_time'],
                'pages_crawled': crawler.stats['pages_crawled'],
                'urls_discovered': crawler.stats['urls_discovered'],
                'domains_crawled': list(crawler.stats['domains_crawled']),
                'exported_pages': len(pages),
                'exported_urls': len(urls),
                'export_timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            },
            'pages': pages,
            'urls': urls,
            'stats': crawler.stats
        }
        
        # Round-trip through json with default=str so datetimes and other non-serializable values become strings
        export_data = json.loads(json.dumps(export_data, default=str))
        
        # Write to file
        with open(output_file, 'w') as f:
            json.dump(export_data, f, indent=2)
        
        logger.info(f"Exported data to {output_file}")
    except Exception as e:
        logger.error(f"Error exporting results: {e}")


def print_summary(crawler):
    """
    Print a summary of the crawl
    
    Args:
        crawler: Crawler instance
    """
    stats = crawler.stats
    
    print("\n=============== CRAWL SUMMARY ===============")
    print(f"Duration: {time.time() - stats['start_time']:.2f} seconds")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Pages failed: {stats['pages_failed']}")
    print(f"URLs discovered: {stats['urls_discovered']}")
    print(f"URLs filtered: {stats['urls_filtered']}")
    print(f"Domains crawled: {len(stats['domains_crawled'])}")
    
    if stats['domains_crawled']:
        print("\nTop domains:")
        domain_counts = {}
        # Count pages per domain
        for page in crawler.db.pages_collection.find({}, {'domain': 1}):
            domain = page.get('domain', 'unknown')
            domain_counts[domain] = domain_counts.get(domain, 0) + 1
        
        # Display top domains
        for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {domain}: {count} pages")
    
    print("\nHTTP Status Codes:")
    for status, count in sorted(stats['status_codes'].items()):
        print(f"  {status}: {count}")
    
    print("\nContent Types:")
    for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {content_type}: {count}")
    
    print("=============================================\n")


if __name__ == '__main__':
    # Parse command-line arguments
    args = docopt(__doc__)
    
    duration = int(args['--time'])
    workers = int(args['--workers'])
    async_mode = args['--async']
    
    try:
        example_crawl(duration, workers, async_mode)
    except KeyboardInterrupt:
        logger.info("Example interrupted by user")
    except Exception as e:
        # logger.exception logs the message together with the full traceback
        logger.exception(f"Error in example: {e}")