sagarnildass committed
Commit 6f509ec · verified · 1 Parent(s): 9d5c8f4

Upload folder using huggingface_hub

Files changed (48)
  1. .env +1 -0
  2. Dockerfile +89 -0
  3. README.md +339 -7
  4. __pycache__/api.cpython-311.pyc +0 -0
  5. __pycache__/config.cpython-310.pyc +0 -0
  6. __pycache__/config.cpython-311.pyc +0 -0
  7. __pycache__/crawler.cpython-310.pyc +0 -0
  8. __pycache__/crawler.cpython-311.pyc +0 -0
  9. __pycache__/dns_resolver.cpython-310.pyc +0 -0
  10. __pycache__/dns_resolver.cpython-311.pyc +0 -0
  11. __pycache__/downloader.cpython-310.pyc +0 -0
  12. __pycache__/downloader.cpython-311.pyc +0 -0
  13. __pycache__/frontier.cpython-310.pyc +0 -0
  14. __pycache__/frontier.cpython-311.pyc +0 -0
  15. __pycache__/local_config.cpython-310.pyc +0 -0
  16. __pycache__/local_config.cpython-311.pyc +0 -0
  17. __pycache__/models.cpython-310.pyc +0 -0
  18. __pycache__/models.cpython-311.pyc +0 -0
  19. __pycache__/mongo_cleanup.cpython-310.pyc +0 -0
  20. __pycache__/mongo_cleanup.cpython-311.pyc +0 -0
  21. __pycache__/parser.cpython-310.pyc +0 -0
  22. __pycache__/parser.cpython-311.pyc +0 -0
  23. __pycache__/robots.cpython-310.pyc +0 -0
  24. __pycache__/robots.cpython-311.pyc +0 -0
  25. __pycache__/run_crawler.cpython-310.pyc +0 -0
  26. api.py +588 -0
  27. cleanup.py +130 -0
  28. cleanup_all.sh +47 -0
  29. config.py +96 -0
  30. crawl.py +370 -0
  31. crawler.log +0 -0
  32. crawler.py +908 -0
  33. deduplication.py +422 -0
  34. dns_resolver.py +161 -0
  35. docker-compose.yml +79 -0
  36. downloader.py +400 -0
  37. example.py +250 -0
  38. file_cleanup.py +100 -0
  39. frontier.py +319 -0
  40. models.py +167 -0
  41. mongo_cleanup.py +86 -0
  42. parser.py +316 -0
  43. requirements.txt +43 -0
  44. robots.py +203 -0
  45. run_crawler.py +237 -0
  46. seo_analyzer_ui.py +708 -0
  47. storage.py +888 -0
  48. test_crawler.py +219 -0
.env ADDED
@@ -0,0 +1 @@
1
+ DEPLOYMENT=true
Dockerfile ADDED
@@ -0,0 +1,89 @@
1
+ FROM python:3.10-slim
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && apt-get install -y --no-install-recommends \
8
+ build-essential \
9
+ wget \
10
+ curl \
11
+ gnupg \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Install MongoDB
15
+ RUN wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | apt-key add - \
16
+ && echo "deb http://repo.mongodb.org/apt/debian buster/mongodb-org/6.0 main" | tee /etc/apt/sources.list.d/mongodb-org-6.0.list \
17
+ && apt-get update \
18
+ && apt-get install -y mongodb-org \
19
+ && mkdir -p /data/db \
20
+ && apt-get clean \
21
+ && rm -rf /var/lib/apt/lists/*
22
+
23
+ # Install Redis
24
+ RUN apt-get update && apt-get install -y --no-install-recommends \
25
+ redis-server \
26
+ && apt-get clean \
27
+ && rm -rf /var/lib/apt/lists/*
28
+
29
+ # Copy requirements.txt
30
+ COPY requirements.txt .
31
+
32
+ # Install Python dependencies
33
+ RUN pip install --no-cache-dir -r requirements.txt
34
+
35
+ # Copy the crawler code
36
+ COPY . .
37
+
38
+ # Create necessary directories
39
+ RUN mkdir -p /data/storage/html_pages \
40
+ && mkdir -p /data/storage/logs \
41
+ && mkdir -p /data/storage/exports
42
+
43
+ # Expose ports
44
+ # Prometheus metrics port
45
+ EXPOSE 9100
46
+ # MongoDB port
47
+ EXPOSE 27017
48
+ # Redis port
49
+ EXPOSE 6379
50
+
51
+ # Set environment variables
52
+ ENV MONGODB_URI=mongodb://localhost:27017/
53
+ ENV REDIS_URI=redis://localhost:6379/0
54
+ ENV PYTHONUNBUFFERED=1
55
+
56
+ # Create entrypoint script
57
+ RUN echo '#!/bin/bash\n\
58
+ # Start MongoDB\n\
59
+ mongod --fork --logpath /var/log/mongodb.log\n\
60
+ \n\
61
+ # Start Redis\n\
62
+ redis-server --daemonize yes\n\
63
+ \n\
64
+ # Check if services are running\n\
65
+ echo "Waiting for MongoDB to start..."\n\
66
+ until mongo --eval "print(\"MongoDB is ready\")" > /dev/null 2>&1; do\n\
67
+ sleep 1\n\
68
+ done\n\
69
+ \n\
70
+ echo "Waiting for Redis to start..."\n\
71
+ until redis-cli ping > /dev/null 2>&1; do\n\
72
+ sleep 1\n\
73
+ done\n\
74
+ \n\
75
+ echo "All services are running!"\n\
76
+ \n\
77
+ # Execute the provided command or default to help\n\
78
+ if [ $# -eq 0 ]; then\n\
79
+ python crawl.py --help\n\
80
+ else\n\
81
+ exec "$@"\n\
82
+ fi' > /app/entrypoint.sh \
83
+ && chmod +x /app/entrypoint.sh
84
+
85
+ # Set entrypoint
86
+ ENTRYPOINT ["/app/entrypoint.sh"]
87
+
88
+ # Default command is to show help
89
+ CMD ["python", "crawl.py", "--help"]
README.md CHANGED
@@ -1,12 +1,344 @@
1
  ---
2
- title: AI SEO Crawler
3
- emoji: 📊
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.30.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: AI_SEO_Crawler
3
+ app_file: seo_analyzer_ui.py
4
  sdk: gradio
5
  sdk_version: 5.30.0
6
  ---
7
+ # Web Crawler Documentation
8
 
9
+ A scalable web crawler with configurable crawl behavior, politeness controls, and content extraction.
10
+
11
+ ## Table of Contents
12
+
13
+ - [Architecture](#architecture)
14
+ - [Setup](#setup)
15
+ - [Usage](#usage)
16
+ - [Components](#components)
17
+ - [Troubleshooting](#troubleshooting)
18
+
19
+ ## Architecture
20
+
21
+ The web crawler consists of the following key components (a simplified sketch of how they fit together appears after the list):
22
+
23
+ 1. **URL Frontier**: Manages URLs to be crawled with prioritization
24
+ 2. **DNS Resolver**: Caches DNS lookups to improve performance
25
+ 3. **Robots Handler**: Ensures compliance with robots.txt
26
+ 4. **HTML Downloader**: Downloads web pages with error handling
27
+ 5. **HTML Parser**: Extracts URLs and metadata from web pages
28
+ 6. **Storage**: MongoDB-backed storage for URLs and page metadata
29
+ 7. **Crawler**: Main crawler orchestration
30
+ 8. **API**: REST API for controlling the crawler
31
+
32
+ ## Setup
33
+
34
+ ### Requirements
35
+
36
+ - Python 3.8+
37
+ - MongoDB
38
+ - Redis server
39
+
40
+ ### Installation
41
+
42
+ 1. Install MongoDB:
43
+ ```bash
44
+ # For Ubuntu
45
+ sudo apt-get install -y mongodb
46
+ sudo systemctl start mongodb
47
+ sudo systemctl enable mongodb
48
+
49
+ # Verify MongoDB is running
50
+ sudo systemctl status mongodb
51
+ ```
52
+
53
+ 2. Install Redis:
54
+ ```bash
55
+ sudo apt-get install redis-server
56
+ sudo systemctl start redis-server
57
+
58
+ # Verify Redis is running
59
+ redis-cli ping # Should return PONG
60
+ ```
61
+
62
+ 3. Install Python dependencies:
63
+ ```bash
64
+ pip install -r requirements.txt
65
+ ```
66
+
67
+ 4. Create a local configuration file:
68
+ ```bash
69
+ cp config.py local_config.py
70
+ ```
71
+
72
+ 5. Edit `local_config.py` to customize settings:
73
+ ```python
74
+ # Example configuration
75
+ SEED_URLS = ["https://example.com"] # Start URLs
76
+ MAX_DEPTH = 3 # Crawl depth
77
+ MAX_WORKERS = 4 # Number of worker threads
78
+ DELAY_BETWEEN_REQUESTS = 1 # Politeness delay
79
+ ```
80
+
81
+ ## Usage
82
+
83
+ ### Running the Crawler
84
+
85
+ To run the crawler with default settings:
86
+
87
+ ```bash
88
+ cd 4_web_crawler
89
+ python run_crawler.py
90
+ ```
91
+
92
+ To specify custom seed URLs:
93
+
94
+ ```bash
95
+ python run_crawler.py --seed https://example.com https://another-site.com
96
+ ```
97
+
98
+ To limit crawl depth:
99
+
100
+ ```bash
101
+ python run_crawler.py --depth 2
102
+ ```
103
+
104
+ To run with more worker threads:
105
+
106
+ ```bash
107
+ python run_crawler.py --workers 8
108
+ ```
109
+
110
+ ### Sample Commands
111
+
112
+ Here are some common use cases with sample commands:
113
+
114
+ #### Crawl a Single Domain
115
+
116
+ This command crawls only example.com and does not follow external links:
117
+
118
+ ```bash
119
+ python run_crawler.py --seed example.com --domain-filter example.com
120
+ ```
121
+
122
+ #### Fresh Start (Reset Database)
123
+
124
+ This clears both MongoDB and Redis before starting, which resolves duplicate key errors:
125
+
126
+ ```bash
127
+ python run_crawler.py --seed example.com --reset-db
128
+ ```
129
+
130
+ #### Custom Speed and Depth
131
+
132
+ Control the crawler's speed and depth:
133
+
134
+ ```bash
135
+ python run_crawler.py --seed example.com --depth 3 --workers 4 --delay 0.5
136
+ ```
137
+
138
+ #### Crawl Multiple Sites
139
+
140
+ Crawl multiple websites at once:
141
+
142
+ ```bash
143
+ python run_crawler.py --seed example.com blog.example.org docs.example.com
144
+ ```
145
+
146
+ #### Ignore robots.txt Rules
147
+
148
+ Use with caution, as this ignores website crawling policies:
149
+
150
+ ```bash
151
+ python run_crawler.py --seed example.com --ignore-robots
152
+ ```
153
+
154
+ #### Set Custom User Agent
155
+
156
+ Identify the crawler with a specific user agent:
157
+
158
+ ```bash
159
+ python run_crawler.py --seed example.com --user-agent "MyCustomBot/1.0"
160
+ ```
161
+
162
+ #### Crawl sagarnildas.com
163
+
164
+ To specifically crawl sagarnildas.com with optimal settings:
165
+
166
+ ```bash
167
+ python run_crawler.py --seed sagarnildas.com --domain-filter sagarnildas.com --reset-db --workers 2 --depth 3 --verbose
168
+ ```
169
+
170
+ ### Using the API
171
+
172
+ The crawler provides a REST API for control and monitoring:
173
+
174
+ ```bash
175
+ cd 4_web_crawler
176
+ python api.py
177
+ ```
178
+
179
+ The API will be available at http://localhost:8000
180
+
181
+ #### API Endpoints
182
+
183
+ - `GET /status` - Get crawler status
184
+ - `GET /stats` - Get detailed statistics
185
+ - `POST /start` - Start the crawler
186
+ - `POST /stop` - Stop the crawler
187
+ - `POST /seed` - Add seed URLs
188
+ - `GET /pages` - List crawled pages
189
+ - `GET /urls` - List discovered URLs
190
+
191
+ ### Checking Results
192
+
193
+ Monitor the crawler through:
194
+
195
+ 1. Console output:
196
+ ```bash
197
+ tail -f crawler.log
198
+ ```
199
+
200
+ 2. MongoDB collections:
201
+ ```bash
202
+ # Start mongo shell
203
+ mongo
204
+
205
+ # Switch to crawler database
206
+ use crawler
207
+
208
+ # Count discovered URLs
209
+ db.urls.count()
210
+
211
+ # View crawled pages
212
+ db.pages.find().limit(5)
213
+ ```
214
+
215
+ 3. API statistics:
216
+ ```bash
217
+ curl http://localhost:8000/stats
218
+ ```
219
+
220
+ ## Components
221
+
222
+ The crawler has several key components that work together:
223
+
224
+ ### URL Frontier
225
+
226
+ Manages the queue of URLs to be crawled with priority-based scheduling.
227
+
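+ A minimal sketch of the idea using only the standard library; it is an illustration, not the implementation in `frontier.py` (which also handles per-host queues and persistence):
+
+ ```python
+ import heapq
+ import itertools
+
+ class SimpleFrontier:
+     """Toy priority-based URL frontier (illustration only)."""
+
+     def __init__(self):
+         self._heap = []
+         self._seen = set()
+         self._counter = itertools.count()  # tie-breaker keeps FIFO order per priority
+
+     def add_url(self, url, priority=2):    # lower number = higher priority
+         if url in self._seen:
+             return False                   # skip URLs that are already queued
+         self._seen.add(url)
+         heapq.heappush(self._heap, (priority, next(self._counter), url))
+         return True
+
+     def get_next_url(self):
+         return heapq.heappop(self._heap)[2] if self._heap else None
+
+     def size(self):
+         return len(self._heap)
+ ```
+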
228
+ ### DNS Resolver
229
+
230
+ Caches DNS lookups to improve performance and reduce load on DNS servers.
231
+
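+ Conceptually it behaves like the toy resolver below; this is an illustration only, not the code in `dns_resolver.py` (which also tracks hit/miss counts and bounds the cache size):
+
+ ```python
+ import socket
+ import time
+
+ class CachingResolver:
+     """Toy DNS cache with a TTL (illustration only)."""
+
+     def __init__(self, ttl=3600):
+         self.ttl = ttl
+         self._cache = {}  # hostname -> (ip, expiry timestamp)
+
+     def resolve(self, hostname):
+         entry = self._cache.get(hostname)
+         now = time.time()
+         if entry and entry[1] > now:
+             return entry[0]                  # cache hit: reuse the resolved IP
+         ip = socket.gethostbyname(hostname)  # cache miss: perform the real lookup
+         self._cache[hostname] = (ip, now + self.ttl)
+         return ip
+ ```
+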
232
+ ### Robots Handler
233
+
234
+ Ensures compliance with robots.txt rules to be a good web citizen.
235
+
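+ A minimal sketch built on the standard library's `urllib.robotparser`; the project's `robots.py` is more complete (for example, `config.py` exposes `RESPECT_CRAWL_DELAY` for honoring `Crawl-delay`), so treat this purely as an illustration:
+
+ ```python
+ from urllib.parse import urlparse
+ from urllib.robotparser import RobotFileParser
+
+ _parsers = {}  # cache one parsed robots.txt per site
+
+ def is_allowed(url, user_agent="MyBot/1.0"):
+     parts = urlparse(url)
+     site = f"{parts.scheme}://{parts.netloc}"
+     if site not in _parsers:
+         rp = RobotFileParser(site + "/robots.txt")
+         rp.read()                      # fetch and parse robots.txt once per site
+         _parsers[site] = rp
+     return _parsers[site].can_fetch(user_agent, url)
+ ```
+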
236
+ ### HTML Downloader
237
+
238
+ Downloads web pages with error handling, timeouts, and retries.
239
+
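+ A minimal retry-with-timeout sketch using only the standard library; it is an illustration, not the code in `downloader.py`, and the defaults loosely mirror `CRAWL_TIMEOUT`, `RETRY_TIMES`, and `DOWNLOAD_DELAY` in `config.py`:
+
+ ```python
+ import time
+ import urllib.error
+ import urllib.request
+
+ def fetch(url, timeout=30, retries=3, delay=1.0):
+     """Toy fetch with a timeout and simple retries (illustration only)."""
+     for attempt in range(1, retries + 1):
+         try:
+             with urllib.request.urlopen(url, timeout=timeout) as resp:
+                 return resp.read()
+         except (urllib.error.URLError, TimeoutError):
+             if attempt == retries:
+                 raise                      # give up after the last attempt
+             time.sleep(delay * attempt)    # back off a little before retrying
+ ```
+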
240
+ ### HTML Parser
241
+
242
+ Extracts URLs and metadata from web pages.
243
+
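+ A toy link-and-title extractor built on the standard library's `html.parser`; the project's `parser.py` does considerably more, so this is purely an illustration of the idea:
+
+ ```python
+ from html.parser import HTMLParser
+ from urllib.parse import urljoin
+
+ class LinkExtractor(HTMLParser):
+     """Collects absolute out-links and the page title (illustration only)."""
+
+     def __init__(self, base_url):
+         super().__init__()
+         self.base_url = base_url
+         self.links = []
+         self.title = ""
+         self._in_title = False
+
+     def handle_starttag(self, tag, attrs):
+         if tag == "a":
+             href = dict(attrs).get("href")
+             if href:
+                 self.links.append(urljoin(self.base_url, href))  # resolve relative URLs
+         elif tag == "title":
+             self._in_title = True
+
+     def handle_endtag(self, tag):
+         if tag == "title":
+             self._in_title = False
+
+     def handle_data(self, data):
+         if self._in_title:
+             self.title += data
+
+ # Usage: p = LinkExtractor("https://example.com"); p.feed(html_text)
+ # Afterwards, p.links holds absolute URLs and p.title the page title.
+ ```
+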
244
+ ### Crawler
245
+
246
+ The main component that orchestrates the crawling process.
247
+
248
+ ## Troubleshooting
249
+
250
+ ### MongoDB Errors
251
+
252
+ If you see duplicate key errors:
253
+
254
+ ```
255
+ ERROR: Error saving seed URL to database: E11000 duplicate key error
256
+ ```
257
+
258
+ Clean MongoDB collections:
259
+
260
+ ```bash
261
+ cd 4_web_crawler
262
+ python mongo_cleanup.py
263
+ ```
264
+
265
+ ### Redis Connection Issues
266
+
267
+ If the crawler can't connect to Redis:
268
+
269
+ 1. Check if Redis is running:
270
+ ```bash
271
+ sudo systemctl status redis-server
272
+ ```
273
+
274
+ 2. Verify Redis connection:
275
+ ```bash
276
+ redis-cli ping
277
+ ```
278
+
279
+ ### Performance Issues
280
+
281
+ If the crawler is running slowly:
282
+
283
+ 1. Increase worker threads in `local_config.py`:
284
+ ```python
285
+ MAX_WORKERS = 8
286
+ ```
287
+
288
+ 2. Adjust the politeness delay:
289
+ ```python
290
+ DELAY_BETWEEN_REQUESTS = 0.5 # Half-second delay
291
+ ```
292
+
293
+ 3. Optimize DNS caching:
294
+ ```python
295
+ DNS_CACHE_SIZE = 10000
296
+ DNS_CACHE_TTL = 7200 # 2 hours
297
+ ```
298
+
299
+ ### Crawler Not Starting
300
+
301
+ If the crawler won't start:
302
+
303
+ 1. Check for MongoDB connection:
304
+ ```bash
305
+ mongo --eval "db.version()"
306
+ ```
307
+
308
+ 2. Ensure Redis is running:
309
+ ```bash
310
+ redis-cli info
311
+ ```
312
+
313
+ 3. Look for error messages in the logs:
314
+ ```bash
315
+ cat crawler.log
316
+ ```
317
+
318
+ ## Configuration Reference
319
+
320
+ Key configurations in `config.py` or `local_config.py`:
321
+
322
+ ```python
323
+ # General settings
324
+ MAX_WORKERS = 4 # Number of worker threads
325
+ MAX_DEPTH = 3 # Maximum crawl depth
326
+ SEED_URLS = ["https://example.com"] # Initial URLs
327
+
328
+ # Politeness settings
329
+ RESPECT_ROBOTS_TXT = True # Whether to respect robots.txt
330
+ USER_AGENT = "MyBot/1.0" # User agent for requests
331
+ DELAY_BETWEEN_REQUESTS = 1 # Delay between requests to the same domain
332
+
333
+ # Storage settings
334
+ MONGODB_URI = "mongodb://localhost:27017/"
335
+ MONGODB_DB = "crawler"
336
+
337
+ # DNS settings
338
+ DNS_CACHE_SIZE = 10000
339
+ DNS_CACHE_TTL = 3600 # 1 hour
340
+
341
+ # Logging settings
342
+ LOG_LEVEL = "INFO"
343
+ LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
344
+ ```
__pycache__/api.cpython-311.pyc ADDED
Binary file (25.7 kB). View file
 
__pycache__/config.cpython-310.pyc ADDED
Binary file (2.5 kB). View file
 
__pycache__/config.cpython-311.pyc ADDED
Binary file (3.84 kB). View file
 
__pycache__/crawler.cpython-310.pyc ADDED
Binary file (22.3 kB). View file
 
__pycache__/crawler.cpython-311.pyc ADDED
Binary file (40.2 kB). View file
 
__pycache__/dns_resolver.cpython-310.pyc ADDED
Binary file (4.7 kB). View file
 
__pycache__/dns_resolver.cpython-311.pyc ADDED
Binary file (7.84 kB). View file
 
__pycache__/downloader.cpython-310.pyc ADDED
Binary file (10.8 kB). View file
 
__pycache__/downloader.cpython-311.pyc ADDED
Binary file (18.6 kB). View file
 
__pycache__/frontier.cpython-310.pyc ADDED
Binary file (8.74 kB). View file
 
__pycache__/frontier.cpython-311.pyc ADDED
Binary file (20.6 kB). View file
 
__pycache__/local_config.cpython-310.pyc ADDED
Binary file (850 Bytes). View file
 
__pycache__/local_config.cpython-311.pyc ADDED
Binary file (1.31 kB). View file
 
__pycache__/models.cpython-310.pyc ADDED
Binary file (5.57 kB). View file
 
__pycache__/models.cpython-311.pyc ADDED
Binary file (7.77 kB). View file
 
__pycache__/mongo_cleanup.cpython-310.pyc ADDED
Binary file (2.27 kB). View file
 
__pycache__/mongo_cleanup.cpython-311.pyc ADDED
Binary file (4.26 kB). View file
 
__pycache__/parser.cpython-310.pyc ADDED
Binary file (7.95 kB). View file
 
__pycache__/parser.cpython-311.pyc ADDED
Binary file (14.1 kB). View file
 
__pycache__/robots.cpython-310.pyc ADDED
Binary file (4.75 kB). View file
 
__pycache__/robots.cpython-311.pyc ADDED
Binary file (7.92 kB). View file
 
__pycache__/run_crawler.cpython-310.pyc ADDED
Binary file (6.36 kB). View file
 
api.py ADDED
@@ -0,0 +1,588 @@
1
+ """
2
+ Web API for the web crawler.
3
+
4
+ This module provides a FastAPI-based web API for controlling and monitoring the web crawler.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import time
10
+ import json
11
+ import logging
12
+ import datetime
13
+ from typing import List, Dict, Any, Optional
14
+ from fastapi import FastAPI, HTTPException, Query, Path, BackgroundTasks, Depends
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from fastapi.responses import JSONResponse
17
+ from pydantic import BaseModel, HttpUrl, Field
18
+ import uvicorn
19
+
20
+ from crawler import Crawler
21
+ from models import URL, URLStatus, Priority
22
+ import config
23
+
24
+ # Configure logging
25
+ logging.basicConfig(
26
+ level=getattr(logging, config.LOG_LEVEL),
27
+ format=config.LOG_FORMAT
28
+ )
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # Create FastAPI app
32
+ app = FastAPI(
33
+ title="Web Crawler API",
34
+ description="API for controlling and monitoring the web crawler",
35
+ version="1.0.0"
36
+ )
37
+
38
+ # Enable CORS
39
+ app.add_middleware(
40
+ CORSMiddleware,
41
+ allow_origins=["*"],
42
+ allow_credentials=True,
43
+ allow_methods=["*"],
44
+ allow_headers=["*"],
45
+ )
46
+
47
+ # Global crawler instance
48
+ crawler = None
49
+
50
+
51
+ def get_crawler() -> Crawler:
52
+ """Get or initialize the crawler instance"""
53
+ global crawler
54
+ if crawler is None:
55
+ crawler = Crawler()
56
+ return crawler
57
+
58
+
59
+ # API Models
60
+ class SeedURL(BaseModel):
61
+ url: HttpUrl
62
+ priority: Optional[str] = Field(
63
+ default="NORMAL",
64
+ description="URL priority (VERY_HIGH, HIGH, NORMAL, LOW, VERY_LOW)"
65
+ )
66
+
67
+
68
+ class SeedURLs(BaseModel):
69
+ urls: List[SeedURL]
70
+
71
+
72
+ class CrawlerStatus(BaseModel):
73
+ running: bool
74
+ paused: bool
75
+ start_time: Optional[float] = None
76
+ uptime_seconds: Optional[float] = None
77
+ pages_crawled: int
78
+ pages_failed: int
79
+ urls_discovered: int
80
+ urls_filtered: int
81
+ domains_crawled: int
82
+ frontier_size: int
83
+
84
+
85
+ class CrawlerConfig(BaseModel):
86
+ max_depth: int = Field(..., description="Maximum crawl depth")
87
+ max_workers: int = Field(..., description="Maximum number of worker threads")
88
+ delay_between_requests: float = Field(..., description="Delay between requests to the same domain (seconds)")
89
+
90
+
91
+ class PageDetail(BaseModel):
92
+ url: str
93
+ domain: str
94
+ status_code: int
95
+ content_type: str
96
+ content_length: int
97
+ crawled_at: str
98
+ is_seed: bool
99
+ depth: int
100
+ title: Optional[str] = None
101
+ description: Optional[str] = None
102
+
103
+
104
+ class URLDetail(BaseModel):
105
+ url: str
106
+ normalized_url: str
107
+ domain: str
108
+ status: str
109
+ priority: str
110
+ depth: int
111
+ parent_url: Optional[str] = None
112
+ last_crawled: Optional[str] = None
113
+ error: Optional[str] = None
114
+ retries: int
115
+
116
+
117
+ class DomainStats(BaseModel):
118
+ domain: str
119
+ pages_count: int
120
+ successful_requests: int
121
+ failed_requests: int
122
+ avg_page_size: float
123
+ content_types: Dict[str, int]
124
+ status_codes: Dict[str, int]
125
+
126
+
127
+ # API Routes
128
+ @app.get("/")
129
+ async def read_root():
130
+ """Root endpoint"""
131
+ return {
132
+ "name": "Web Crawler API",
133
+ "version": "1.0.0",
134
+ "description": "API for controlling and monitoring the web crawler",
135
+ "endpoints": {
136
+ "GET /": "This help message",
137
+ "GET /status": "Get crawler status",
138
+ "GET /stats": "Get crawler statistics",
139
+ "GET /config": "Get crawler configuration",
140
+ "PUT /config": "Update crawler configuration",
141
+ "POST /start": "Start the crawler",
142
+ "POST /stop": "Stop the crawler",
143
+ "POST /pause": "Pause the crawler",
144
+ "POST /resume": "Resume the crawler",
145
+ "GET /pages": "List crawled pages",
146
+ "GET /pages/{url}": "Get page details",
147
+ "GET /urls": "List discovered URLs",
148
+ "GET /urls/{url}": "Get URL details",
149
+ "POST /seed": "Add seed URLs",
150
+ "GET /domains": "Get domain statistics",
151
+ "GET /domains/{domain}": "Get statistics for a specific domain",
152
+ }
153
+ }
154
+
155
+
156
+ @app.get("/status", response_model=CrawlerStatus)
157
+ async def get_status(crawler: Crawler = Depends(get_crawler)):
158
+ """Get crawler status"""
159
+ status = {
160
+ "running": crawler.running,
161
+ "paused": crawler.paused,
162
+ "start_time": crawler.stats.get('start_time'),
163
+ "uptime_seconds": time.time() - crawler.stats.get('start_time', time.time()) if crawler.running else None,
164
+ "pages_crawled": crawler.stats.get('pages_crawled', 0),
165
+ "pages_failed": crawler.stats.get('pages_failed', 0),
166
+ "urls_discovered": crawler.stats.get('urls_discovered', 0),
167
+ "urls_filtered": crawler.stats.get('urls_filtered', 0),
168
+ "domains_crawled": len(crawler.stats.get('domains_crawled', set())),
169
+ "frontier_size": crawler.frontier.size()
170
+ }
171
+ return status
172
+
173
+
174
+ @app.get("/stats")
175
+ async def get_stats(crawler: Crawler = Depends(get_crawler)):
176
+ """Get detailed crawler statistics"""
177
+ stats = crawler.stats.copy()
178
+
179
+ # Convert sets to lists for JSON serialization
180
+ for key, value in stats.items():
181
+ if isinstance(value, set):
182
+ stats[key] = list(value)
183
+
184
+ # Add uptime
185
+ if stats.get('start_time'):
186
+ stats['uptime_seconds'] = time.time() - stats['start_time']
187
+ stats['uptime_formatted'] = str(datetime.timedelta(seconds=int(stats['uptime_seconds'])))
188
+
189
+ # Add DNS cache statistics if available
190
+ try:
191
+ dns_stats = crawler.dns_resolver.get_stats()
192
+ stats['dns_cache'] = dns_stats
193
+ except Exception as e:
194
+ logger.warning(f"Failed to get DNS stats: {e}")
195
+ stats['dns_cache'] = {'error': 'Stats not available'}
196
+
197
+ # Add frontier statistics if available
198
+ try:
199
+ stats['frontier_size'] = crawler.frontier.size()
200
+ if hasattr(crawler.frontier, 'get_stats'):
201
+ frontier_stats = crawler.frontier.get_stats()
202
+ stats['frontier'] = frontier_stats
203
+ else:
204
+ stats['frontier'] = {'size': crawler.frontier.size()}
205
+ except Exception as e:
206
+ logger.warning(f"Failed to get frontier stats: {e}")
207
+ stats['frontier'] = {'error': 'Stats not available'}
208
+
209
+ return stats
210
+
211
+
212
+ @app.get("/config", response_model=CrawlerConfig)
213
+ async def get_config():
214
+ """Get crawler configuration"""
215
+ return {
216
+ "max_depth": config.MAX_DEPTH,
217
+ "max_workers": config.MAX_WORKERS,
218
+ "delay_between_requests": config.DELAY_BETWEEN_REQUESTS
219
+ }
220
+
221
+
222
+ @app.put("/config", response_model=CrawlerConfig)
223
+ async def update_config(
224
+ crawler_config: CrawlerConfig,
225
+ crawler: Crawler = Depends(get_crawler)
226
+ ):
227
+ """Update crawler configuration"""
228
+ # Update configuration
229
+ config.MAX_DEPTH = crawler_config.max_depth
230
+ config.MAX_WORKERS = crawler_config.max_workers
231
+ config.DELAY_BETWEEN_REQUESTS = crawler_config.delay_between_requests
232
+
233
+ return crawler_config
234
+
235
+
236
+ @app.post("/start")
237
+ async def start_crawler(
238
+ background_tasks: BackgroundTasks,
239
+ num_workers: int = Query(None, description="Number of worker threads"),
240
+ async_mode: bool = Query(False, description="Whether to use async mode"),
241
+ crawler: Crawler = Depends(get_crawler)
242
+ ):
243
+ """Start the crawler"""
244
+ if crawler.running:
245
+ return {"status": "Crawler is already running"}
246
+
247
+ # Start crawler in background
248
+ def start_crawler_task():
249
+ try:
250
+ crawler.start(num_workers=num_workers, async_mode=async_mode)
251
+ except Exception as e:
252
+ logger.error(f"Error starting crawler: {e}")
253
+
254
+ background_tasks.add_task(start_crawler_task)
255
+
256
+ return {"status": "Crawler starting in background"}
257
+
258
+
259
+ @app.post("/stop")
260
+ async def stop_crawler(crawler: Crawler = Depends(get_crawler)):
261
+ """Stop the crawler"""
262
+ if not crawler.running:
263
+ return {"status": "Crawler is not running"}
264
+
265
+ crawler.stop()
266
+ return {"status": "Crawler stopped"}
267
+
268
+
269
+ @app.post("/pause")
270
+ async def pause_crawler(crawler: Crawler = Depends(get_crawler)):
271
+ """Pause the crawler"""
272
+ if not crawler.running:
273
+ return {"status": "Crawler is not running"}
274
+
275
+ if crawler.paused:
276
+ return {"status": "Crawler is already paused"}
277
+
278
+ crawler.pause()
279
+ return {"status": "Crawler paused"}
280
+
281
+
282
+ @app.post("/resume")
283
+ async def resume_crawler(crawler: Crawler = Depends(get_crawler)):
284
+ """Resume the crawler"""
285
+ if not crawler.running:
286
+ return {"status": "Crawler is not running"}
287
+
288
+ if not crawler.paused:
289
+ return {"status": "Crawler is not paused"}
290
+
291
+ crawler.resume()
292
+ return {"status": "Crawler resumed"}
293
+
294
+
295
+ @app.get("/pages")
296
+ async def list_pages(
297
+ limit: int = Query(10, ge=1, le=100, description="Number of pages to return"),
298
+ offset: int = Query(0, ge=0, description="Offset for pagination"),
299
+ domain: Optional[str] = Query(None, description="Filter by domain"),
300
+ status_code: Optional[int] = Query(None, description="Filter by HTTP status code"),
301
+ crawler: Crawler = Depends(get_crawler)
302
+ ):
303
+ """List crawled pages"""
304
+ # Build query
305
+ query = {}
306
+ if domain:
307
+ query['domain'] = domain
308
+ if status_code:
309
+ query['status_code'] = status_code
310
+
311
+ # Execute query
312
+ try:
313
+ pages = list(crawler.db.pages_collection.find(
314
+ query,
315
+ {'_id': 0}
316
+ ).skip(offset).limit(limit))
317
+
318
+ # Count total pages matching query
319
+ total_count = crawler.db.pages_collection.count_documents(query)
320
+
321
+ return {
322
+ "pages": pages,
323
+ "total": total_count,
324
+ "limit": limit,
325
+ "offset": offset
326
+ }
327
+ except Exception as e:
328
+ logger.error(f"Error listing pages: {e}")
329
+ raise HTTPException(status_code=500, detail=str(e))
330
+
331
+
332
+ @app.get("/pages/{url:path}", response_model=PageDetail)
333
+ async def get_page(
334
+ url: str,
335
+ include_content: bool = Query(False, description="Include page content"),
336
+ crawler: Crawler = Depends(get_crawler)
337
+ ):
338
+ """Get page details"""
339
+ try:
340
+ # Decode URL from path parameter
341
+ url = url.replace("___", "/")
342
+
343
+ # Find page in database
344
+ page = crawler.db.pages_collection.find_one({'url': url}, {'_id': 0})
345
+
346
+ if not page:
347
+ raise HTTPException(status_code=404, detail="Page not found")
348
+
349
+ # Load content if requested
350
+ if include_content:
351
+ try:
352
+ if crawler.use_s3:
353
+ content = crawler._load_content_s3(url)
354
+ else:
355
+ content = crawler._load_content_disk(url)
356
+
357
+ if content:
358
+ page['content'] = content
359
+ except Exception as e:
360
+ logger.error(f"Error loading content for {url}: {e}")
361
+ page['content'] = None
362
+
363
+ return page
364
+ except HTTPException:
365
+ raise
366
+ except Exception as e:
367
+ logger.error(f"Error getting page {url}: {e}")
368
+ raise HTTPException(status_code=500, detail=str(e))
369
+
370
+
371
+ @app.get("/urls")
372
+ async def list_urls(
373
+ limit: int = Query(10, ge=1, le=100, description="Number of URLs to return"),
374
+ offset: int = Query(0, ge=0, description="Offset for pagination"),
375
+ status: Optional[str] = Query(None, description="Filter by URL status"),
376
+ domain: Optional[str] = Query(None, description="Filter by domain"),
377
+ priority: Optional[str] = Query(None, description="Filter by priority"),
378
+ crawler: Crawler = Depends(get_crawler)
379
+ ):
380
+ """List discovered URLs"""
381
+ # Build query
382
+ query = {}
383
+ if status:
384
+ query['status'] = status
385
+ if domain:
386
+ query['domain'] = domain
387
+ if priority:
388
+ query['priority'] = priority
389
+
390
+ # Execute query
391
+ try:
392
+ urls = list(crawler.db.urls_collection.find(
393
+ query,
394
+ {'_id': 0}
395
+ ).skip(offset).limit(limit))
396
+
397
+ # Count total URLs matching query
398
+ total_count = crawler.db.urls_collection.count_documents(query)
399
+
400
+ return {
401
+ "urls": urls,
402
+ "total": total_count,
403
+ "limit": limit,
404
+ "offset": offset
405
+ }
406
+ except Exception as e:
407
+ logger.error(f"Error listing URLs: {e}")
408
+ raise HTTPException(status_code=500, detail=str(e))
409
+
410
+
411
+ @app.get("/urls/{url:path}", response_model=URLDetail)
412
+ async def get_url(
413
+ url: str,
414
+ crawler: Crawler = Depends(get_crawler)
415
+ ):
416
+ """Get URL details"""
417
+ try:
418
+ # Decode URL from path parameter
419
+ url = url.replace("___", "/")
420
+
421
+ # Find URL in database
422
+ url_obj = crawler.db.urls_collection.find_one({'url': url}, {'_id': 0})
423
+
424
+ if not url_obj:
425
+ raise HTTPException(status_code=404, detail="URL not found")
426
+
427
+ return url_obj
428
+ except HTTPException:
429
+ raise
430
+ except Exception as e:
431
+ logger.error(f"Error getting URL {url}: {e}")
432
+ raise HTTPException(status_code=500, detail=str(e))
433
+
434
+
435
+ @app.post("/seed")
436
+ async def add_seed_urls(
437
+ seed_urls: SeedURLs,
438
+ crawler: Crawler = Depends(get_crawler)
439
+ ):
440
+ """Add seed URLs to the frontier"""
441
+ try:
442
+ urls_added = 0
443
+ for seed in seed_urls.urls:
444
+ url = str(seed.url)
445
+ priority = getattr(Priority, seed.priority, Priority.NORMAL)
446
+
447
+ # Create URL object
448
+ url_obj = URL(
449
+ url=url,
450
+ status=URLStatus.PENDING,
451
+ priority=priority,
452
+ depth=0 # Seed URLs are at depth 0
453
+ )
454
+
455
+ # Add to frontier
456
+ if crawler.frontier.add_url(url_obj):
457
+ # Save URL to database
458
+ crawler.urls_collection.update_one(
459
+ {'url': url},
460
+ {'$set': url_obj.dict()},
461
+ upsert=True
462
+ )
463
+
464
+ urls_added += 1
465
+ logger.info(f"Added seed URL: {url}")
466
+
467
+ return {"status": "success", "urls_added": urls_added}
468
+ except Exception as e:
469
+ logger.error(f"Error adding seed URLs: {e}")
470
+ raise HTTPException(status_code=500, detail=str(e))
471
+
472
+
473
+ @app.get("/domains")
474
+ async def list_domains(
475
+ limit: int = Query(10, ge=1, le=100, description="Number of domains to return"),
476
+ offset: int = Query(0, ge=0, description="Offset for pagination"),
477
+ crawler: Crawler = Depends(get_crawler)
478
+ ):
479
+ """Get domain statistics"""
480
+ try:
481
+ # Get domains with counts
482
+ domain_counts = crawler.db.pages_collection.aggregate([
483
+ {"$group": {
484
+ "_id": "$domain",
485
+ "pages_count": {"$sum": 1},
486
+ "avg_page_size": {"$avg": "$content_length"}
487
+ }},
488
+ {"$sort": {"pages_count": -1}},
489
+ {"$skip": offset},
490
+ {"$limit": limit}
491
+ ])
492
+
493
+ # Get total domains count
494
+ total_domains = len(crawler.stats.get('domains_crawled', set()))
495
+
496
+ # Format result
497
+ domains = []
498
+ for domain in domain_counts:
499
+ domains.append({
500
+ "domain": domain["_id"],
501
+ "pages_count": domain["pages_count"],
502
+ "avg_page_size": domain["avg_page_size"]
503
+ })
504
+
505
+ return {
506
+ "domains": domains,
507
+ "total": total_domains,
508
+ "limit": limit,
509
+ "offset": offset
510
+ }
511
+ except Exception as e:
512
+ logger.error(f"Error listing domains: {e}")
513
+ raise HTTPException(status_code=500, detail=str(e))
514
+
515
+
516
+ @app.get("/domains/{domain}", response_model=DomainStats)
517
+ async def get_domain_stats(
518
+ domain: str,
519
+ crawler: Crawler = Depends(get_crawler)
520
+ ):
521
+ """Get statistics for a specific domain"""
522
+ try:
523
+ # Get basic domain stats
524
+ domain_stats = crawler.db.pages_collection.aggregate([
525
+ {"$match": {"domain": domain}},
526
+ {"$group": {
527
+ "_id": "$domain",
528
+ "pages_count": {"$sum": 1},
529
+ "successful_requests": {"$sum": {"$cond": [{"$lt": ["$status_code", 400]}, 1, 0]}},
530
+ "failed_requests": {"$sum": {"$cond": [{"$gte": ["$status_code", 400]}, 1, 0]}},
531
+ "avg_page_size": {"$avg": "$content_length"}
532
+ }}
533
+ ]).next()
534
+
535
+ # Get content type distribution
536
+ content_types = crawler.db.pages_collection.aggregate([
537
+ {"$match": {"domain": domain}},
538
+ {"$group": {
539
+ "_id": "$content_type",
540
+ "count": {"$sum": 1}
541
+ }}
542
+ ])
543
+
544
+ content_type_map = {}
545
+ for ct in content_types:
546
+ content_type_map[ct["_id"]] = ct["count"]
547
+
548
+ # Get status code distribution
549
+ status_codes = crawler.db.pages_collection.aggregate([
550
+ {"$match": {"domain": domain}},
551
+ {"$group": {
552
+ "_id": "$status_code",
553
+ "count": {"$sum": 1}
554
+ }}
555
+ ])
556
+
557
+ status_code_map = {}
558
+ for sc in status_codes:
559
+ status_code_map[str(sc["_id"])] = sc["count"]
560
+
561
+ # Format result
562
+ result = {
563
+ "domain": domain,
564
+ "pages_count": domain_stats["pages_count"],
565
+ "successful_requests": domain_stats["successful_requests"],
566
+ "failed_requests": domain_stats["failed_requests"],
567
+ "avg_page_size": domain_stats["avg_page_size"],
568
+ "content_types": content_type_map,
569
+ "status_codes": status_code_map
570
+ }
571
+
572
+ return result
573
+ except StopIteration:
574
+ # Domain not found
575
+ raise HTTPException(status_code=404, detail=f"Domain '{domain}' not found")
576
+ except Exception as e:
577
+ logger.error(f"Error getting domain stats for {domain}: {e}")
578
+ raise HTTPException(status_code=500, detail=str(e))
579
+
580
+
581
+ if __name__ == "__main__":
582
+ # Run the API server
583
+ uvicorn.run(
584
+ "api:app",
585
+ host="0.0.0.0",
586
+ port=8000,
587
+ reload=True
588
+ )
cleanup.py ADDED
@@ -0,0 +1,130 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Cleanup script to remove all web crawler data from MongoDB
4
+ and list files to be removed
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import logging
10
+ import shutil
11
+ from pymongo import MongoClient
12
+
13
+ # Configure logging
14
+ logging.basicConfig(
15
+ level=logging.INFO,
16
+ format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
17
+ )
18
+ logger = logging.getLogger("cleanup")
19
+
20
+ def cleanup_mongodb():
21
+ """Remove all web crawler data from MongoDB"""
22
+ try:
23
+ # Connect to MongoDB
24
+ logger.info("Connecting to MongoDB...")
25
+ client = MongoClient("mongodb://localhost:27017/")
26
+
27
+ # Access crawler database
28
+ db = client["crawler"]
29
+
30
+ # List and drop all collections
31
+ collections = db.list_collection_names()
32
+
33
+ if not collections:
34
+ logger.info("No collections found in the crawler database")
35
+ else:
36
+ logger.info(f"Found {len(collections)} collections to drop: {collections}")
37
+
38
+ for collection in collections:
39
+ logger.info(f"Dropping collection: {collection}")
40
+ db[collection].drop()
41
+
42
+ logger.info("All crawler collections dropped successfully")
43
+
44
+ # Optional: Drop the entire database
45
+ # client.drop_database("crawler")
46
+ # logger.info("Dropped entire crawler database")
47
+
48
+ logger.info("MongoDB cleanup completed")
49
+
50
+ except Exception as e:
51
+ logger.error(f"Error cleaning up MongoDB: {e}")
52
+ return False
53
+
54
+ return True
55
+
56
+ def cleanup_files():
57
+ """List and remove files related to simple_crawler"""
58
+ try:
59
+ crawler_dir = os.path.dirname(os.path.abspath(__file__))
60
+
61
+ # Files directly related to simple_crawler
62
+ simple_crawler_files = [
63
+ os.path.join(crawler_dir, "simple_crawler.py"),
64
+ os.path.join(crawler_dir, "README_SIMPLE.md"),
65
+ os.path.join(crawler_dir, "simple_crawler.log")
66
+ ]
67
+
68
+ # Check storage directories
69
+ storage_dir = os.path.join(crawler_dir, "storage")
70
+ if os.path.exists(storage_dir):
71
+ logger.info(f"Will remove storage directory: {storage_dir}")
72
+ simple_crawler_files.append(storage_dir)
73
+
74
+ # List all files that will be removed
75
+ logger.info("The following files will be removed:")
76
+ for file_path in simple_crawler_files:
77
+ if os.path.exists(file_path):
78
+ logger.info(f" - {file_path}")
79
+ else:
80
+ logger.info(f" - {file_path} (not found)")
81
+
82
+ # Confirm removal
83
+ confirm = input("Do you want to proceed with removal? (y/n): ")
84
+ if confirm.lower() != 'y':
85
+ logger.info("File removal cancelled")
86
+ return False
87
+
88
+ # Remove files and directories
89
+ for file_path in simple_crawler_files:
90
+ if os.path.exists(file_path):
91
+ if os.path.isdir(file_path):
92
+ logger.info(f"Removing directory: {file_path}")
93
+ shutil.rmtree(file_path)
94
+ else:
95
+ logger.info(f"Removing file: {file_path}")
96
+ os.remove(file_path)
97
+
98
+ logger.info("File cleanup completed")
99
+
100
+ except Exception as e:
101
+ logger.error(f"Error cleaning up files: {e}")
102
+ return False
103
+
104
+ return True
105
+
106
+ if __name__ == "__main__":
107
+ print("Web Crawler Cleanup Utility")
108
+ print("---------------------------")
109
+ print("This script will:")
110
+ print("1. Remove all web crawler collections from MongoDB")
111
+ print("2. List and remove files related to simple_crawler")
112
+ print()
113
+
114
+ proceed = input("Do you want to proceed? (y/n): ")
115
+ if proceed.lower() != 'y':
116
+ print("Cleanup cancelled")
117
+ sys.exit(0)
118
+
119
+ # Clean up MongoDB
120
+ print("\nStep 1: Cleaning up MongoDB...")
121
+ mongo_success = cleanup_mongodb()
122
+
123
+ # Clean up files
124
+ print("\nStep 2: Cleaning up files...")
125
+ files_success = cleanup_files()
126
+
127
+ # Summary
128
+ print("\nCleanup Summary:")
129
+ print(f"MongoDB cleanup: {'Completed' if mongo_success else 'Failed'}")
130
+ print(f"File cleanup: {'Completed' if files_success else 'Failed'}")
cleanup_all.sh ADDED
@@ -0,0 +1,47 @@
1
+ #!/bin/bash
2
+ # Master cleanup script for web crawler - runs both MongoDB and file cleanup
3
+
4
+ set -e # Exit on error
5
+
6
+ echo "====================================================="
7
+ echo " WEB CRAWLER COMPLETE CLEANUP "
8
+ echo "====================================================="
9
+ echo
10
+
11
+ # Get script directory
12
+ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
13
+ cd "$SCRIPT_DIR"
14
+
15
+ # Check if scripts exist
16
+ if [ ! -f "./mongo_cleanup.py" ] || [ ! -f "./file_cleanup.py" ]; then
17
+ echo "Error: Required cleanup scripts not found in $SCRIPT_DIR"
18
+ exit 1
19
+ fi
20
+
21
+ # Ensure scripts are executable
22
+ chmod +x ./mongo_cleanup.py
23
+ chmod +x ./file_cleanup.py
24
+
25
+ # Step 1: MongoDB cleanup
26
+ echo "Step 1: MongoDB Cleanup"
27
+ echo "----------------------"
28
+ if [ "$1" == "--force" ]; then
29
+ python3 ./mongo_cleanup.py --force
30
+ else
31
+ python3 ./mongo_cleanup.py
32
+ fi
33
+
34
+ # Step 2: File cleanup
35
+ echo
36
+ echo "Step 2: File Cleanup"
37
+ echo "------------------"
38
+ if [ "$1" == "--force" ]; then
39
+ python3 ./file_cleanup.py --force
40
+ else
41
+ python3 ./file_cleanup.py
42
+ fi
43
+
44
+ echo
45
+ echo "====================================================="
46
+ echo " CLEANUP PROCESS COMPLETED "
47
+ echo "====================================================="
config.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ Configuration settings for the web crawler
3
+ """
4
+
5
+ import os
6
+ from typing import Dict, List, Any, Optional
7
+
8
+ # General settings
9
+ MAX_WORKERS = 100 # Maximum number of worker threads/processes
10
+ MAX_DEPTH = 10 # Maximum depth to crawl from seed URLs
11
+ CRAWL_TIMEOUT = 30 # Timeout for HTTP requests in seconds
12
+ USER_AGENT = "Mozilla/5.0 WebCrawler/1.0 (+https://example.org/bot)"
13
+
14
+ # Politeness settings
15
+ ROBOTSTXT_OBEY = True # Whether to obey robots.txt rules
16
+ DOWNLOAD_DELAY = 1.0 # Delay between requests to the same domain (seconds)
17
+ MAX_REQUESTS_PER_DOMAIN = 10 # Maximum concurrent requests per domain
18
+ RESPECT_CRAWL_DELAY = True # Respect Crawl-delay in robots.txt
19
+ RETRY_TIMES = 3 # Number of retries for failed requests
20
+ RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429] # HTTP codes to retry
21
+
22
+ # URL settings
23
+ ALLOWED_DOMAINS: Optional[List[str]] = None # Domains to restrict crawling to (None = all domains)
24
+ EXCLUDED_DOMAINS: List[str] = [] # Domains to exclude from crawling
25
+ ALLOWED_SCHEMES = ["http", "https"] # URL schemes to allow
26
+ URL_FILTERS = [
27
+ # Only filter out binary and media files
28
+ r".*\.(jpg|jpeg|gif|png|ico|mp3|mp4|wav|avi|mov|mpeg|pdf|zip|rar|gz|exe|dmg|pkg|iso|bin)$",
29
+ ] # Regex patterns to filter out URLs
30
+
31
+ # Storage settings
32
+ MONGODB_URI = "mongodb://localhost:27017/"
33
+ MONGODB_DB = "webcrawler"
34
+ REDIS_URI = "redis://localhost:6379/0"
35
+ STORAGE_PATH = os.path.join(os.path.dirname(__file__), "storage")
36
+ HTML_STORAGE_PATH = os.path.join(STORAGE_PATH, "html")
37
+ LOG_PATH = os.path.join(STORAGE_PATH, "logs")
38
+
39
+ # Frontier settings
40
+ FRONTIER_QUEUE_SIZE = 100000 # Maximum number of URLs in the frontier queue
41
+ PRIORITY_QUEUE_NUM = 5 # Number of priority queues
42
+ HOST_QUEUE_NUM = 1000 # Number of host queues for politeness
43
+
44
+ # Content settings
45
+ MAX_CONTENT_SIZE = 10 * 1024 * 1024 # Maximum size of HTML content to download (10MB)
46
+ ALLOWED_CONTENT_TYPES = [
47
+ "text/html",
48
+ "application/xhtml+xml",
49
+ "text/plain", # Some servers might serve HTML as text/plain
50
+ "application/html",
51
+ "*/*", # Accept any content type
52
+ ] # Allowed content types
53
+
54
+ # DNS settings
55
+ DNS_CACHE_SIZE = 10000 # Maximum number of entries in DNS cache
56
+ DNS_CACHE_TIMEOUT = 3600 # DNS cache timeout in seconds
57
+
58
+ # Logging settings
59
+ LOG_LEVEL = "INFO"
60
+ LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
61
+
62
+ # Seed URLs
63
+ SEED_URLS = [
64
+ "https://en.wikipedia.org/",
65
+ "https://www.nytimes.com/",
66
+ "https://www.bbc.com/",
67
+ "https://www.github.com/",
68
+ "https://www.reddit.com/",
69
+ ]
70
+
71
+ # Override settings with environment variables
72
+ def get_env_settings() -> Dict[str, Any]:
73
+ """Get settings from environment variables"""
74
+ env_settings = {}
75
+
76
+ for key, value in globals().items():
77
+ if key.isupper(): # Only consider uppercase variables as settings
78
+ env_value = os.environ.get(f"WEBCRAWLER_{key}")
79
+ if env_value is not None:
80
+ # Convert to appropriate type based on default value
81
+ if isinstance(value, bool):
82
+ env_settings[key] = env_value.lower() in ("true", "1", "yes")
83
+ elif isinstance(value, int):
84
+ env_settings[key] = int(env_value)
85
+ elif isinstance(value, float):
86
+ env_settings[key] = float(env_value)
87
+ elif isinstance(value, list):
88
+ # Assume comma-separated values
89
+ env_settings[key] = [item.strip() for item in env_value.split(",")]
90
+ else:
91
+ env_settings[key] = env_value
92
+
93
+ return env_settings
94
+
95
+ # Update settings with environment variables
96
+ globals().update(get_env_settings())
crawl.py ADDED
@@ -0,0 +1,370 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command-line interface for the web crawler.
4
+
5
+ Usage:
6
+ crawl.py start [--workers=<num>] [--async] [--seed=<url>...]
7
+ crawl.py stop
8
+ crawl.py pause
9
+ crawl.py resume
10
+ crawl.py stats
11
+ crawl.py clean [--days=<days>]
12
+ crawl.py export [--format=<format>] [--output=<file>]
13
+ crawl.py set-max-depth <depth>
14
+ crawl.py add-seed <url>...
15
+ crawl.py (-h | --help)
16
+ crawl.py --version
17
+
18
+ Options:
19
+ -h --help Show this help message
20
+ --version Show version
21
+ --workers=<num> Number of worker threads [default: 4]
22
+ --async Use asynchronous mode
23
+ --seed=<url> Seed URL(s) to start crawling
24
+ --days=<days> Days threshold for data cleaning [default: 90]
25
+ --format=<format> Export format (json, csv) [default: json]
26
+ --output=<file> Output file path [default: crawl_data.json]
27
+ """
28
+
29
+ import os
30
+ import sys
31
+ import time
32
+ import json
33
+ import signal
34
+ import logging
35
+ import csv
36
+ from typing import List, Dict, Any
37
+ from docopt import docopt
38
+ import datetime
39
+ import traceback
40
+
41
+ from models import URL, URLStatus, Priority
42
+ from crawler import Crawler
43
+ import config
44
+
45
+ # Configure logging
46
+ logging.basicConfig(
47
+ level=getattr(logging, config.LOG_LEVEL),
48
+ format=config.LOG_FORMAT
49
+ )
50
+ logger = logging.getLogger(__name__)
51
+
52
+ # Global crawler instance
53
+ crawler = None
54
+
55
+
56
+ def initialize_crawler() -> Crawler:
57
+ """Initialize the crawler instance"""
58
+ global crawler
59
+ if crawler is None:
60
+ crawler = Crawler()
61
+ return crawler
62
+
63
+
64
+ def start_crawler(workers: int, async_mode: bool, seed_urls: List[str]) -> None:
65
+ """
66
+ Start the crawler
67
+
68
+ Args:
69
+ workers: Number of worker threads
70
+ async_mode: Whether to use async mode
71
+ seed_urls: List of seed URLs to add
72
+ """
73
+ crawler = initialize_crawler()
74
+
75
+ # Add seed URLs if provided
76
+ if seed_urls:
77
+ num_added = crawler.add_seed_urls(seed_urls)
78
+ logger.info(f"Added {num_added} seed URLs")
79
+
80
+ # Start crawler
81
+ try:
82
+ crawler.start(num_workers=workers, async_mode=async_mode)
83
+ except KeyboardInterrupt:
84
+ logger.info("Crawler interrupted by user")
85
+ crawler.stop()
86
+ except Exception as e:
87
+ logger.error(f"Error starting crawler: {e}")
88
+ logger.error(traceback.format_exc())
89
+ crawler.stop()
90
+
91
+
92
+ def stop_crawler() -> None:
93
+ """Stop the crawler"""
94
+ if crawler is None:
95
+ logger.error("Crawler is not running")
96
+ return
97
+
98
+ crawler.stop()
99
+ logger.info("Crawler stopped")
100
+
101
+
102
+ def pause_crawler() -> None:
103
+ """Pause the crawler"""
104
+ if crawler is None:
105
+ logger.error("Crawler is not running")
106
+ return
107
+
108
+ crawler.pause()
109
+ logger.info("Crawler paused")
110
+
111
+
112
+ def resume_crawler() -> None:
113
+ """Resume the crawler"""
114
+ if crawler is None:
115
+ logger.error("Crawler is not running")
116
+ return
117
+
118
+ crawler.resume()
119
+ logger.info("Crawler resumed")
120
+
121
+
122
+ def show_stats() -> None:
123
+ """Show crawler statistics"""
124
+ if crawler is None:
125
+ logger.error("Crawler is not running")
126
+ return
127
+
128
+ # Get crawler stats
129
+ stats = crawler.stats
130
+
131
+ # Calculate elapsed time
132
+ elapsed = time.time() - stats['start_time']
133
+ elapsed_str = str(datetime.timedelta(seconds=int(elapsed)))
134
+
135
+ # Format statistics
136
+ print("\n=== Crawler Statistics ===")
137
+ print(f"Running time: {elapsed_str}")
138
+ print(f"Pages crawled: {stats['pages_crawled']}")
139
+ print(f"Pages failed: {stats['pages_failed']}")
140
+ print(f"URLs discovered: {stats['urls_discovered']}")
141
+ print(f"URLs filtered: {stats['urls_filtered']}")
142
+
143
+ # Calculate pages per second
144
+ pages_per_second = stats['pages_crawled'] / elapsed if elapsed > 0 else 0
145
+ print(f"Crawl rate: {pages_per_second:.2f} pages/second")
146
+
147
+ # Domain statistics
148
+ domains = len(stats['domains_crawled'])
149
+ print(f"Domains crawled: {domains}")
150
+
151
+ # Status code statistics
152
+ print("\n--- HTTP Status Codes ---")
153
+ for status, count in sorted(stats['status_codes'].items()):
154
+ print(f" {status}: {count}")
155
+
156
+ # Content type statistics
157
+ print("\n--- Content Types ---")
158
+ for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:10]:
159
+ print(f" {content_type}: {count}")
160
+
161
+ # Frontier size
162
+ print(f"\nFrontier size: {crawler.frontier.size()}")
163
+
164
+ # DNS cache statistics
165
+ dns_stats = crawler.dns_resolver.get_stats()
166
+ print(f"\nDNS cache: {dns_stats['hit_count']} hits, {dns_stats['miss_count']} misses, {dns_stats['size']} entries")
167
+
168
+ print("\n=========================\n")
169
+
170
+
171
+ def clean_data(days: int) -> None:
172
+ """
173
+ Clean old data
174
+
175
+ Args:
176
+ days: Days threshold for data cleaning
177
+ """
178
+ try:
179
+ if crawler is None:
180
+ initialize_crawler()
181
+
182
+ # Get MongoDB connection
183
+ storage = crawler.mongo_client
184
+
185
+ # Clean old pages
186
+ old_pages = storage.clean_old_pages(days)
187
+
188
+ # Clean failed URLs
189
+ failed_urls = storage.clean_failed_urls()
190
+
191
+ logger.info(f"Cleaned {old_pages} old pages and {failed_urls} failed URLs")
192
+ print(f"Cleaned {old_pages} old pages and {failed_urls} failed URLs")
193
+ except Exception as e:
194
+ logger.error(f"Error cleaning data: {e}")
195
+ print(f"Error cleaning data: {e}")
196
+
197
+
198
+ def export_data(export_format: str, output_file: str) -> None:
199
+ """
200
+ Export crawler data
201
+
202
+ Args:
203
+ export_format: Format to export (json, csv)
204
+ output_file: Output file path
205
+ """
206
+ try:
207
+ if crawler is None:
208
+ initialize_crawler()
209
+
210
+ # Get MongoDB connection
211
+ db = crawler.db
212
+
213
+ # Get data
214
+ pages = list(db.pages_collection.find({}, {'_id': 0}))
215
+ urls = list(db.urls_collection.find({}, {'_id': 0}))
216
+ stats = list(db.stats_collection.find({}, {'_id': 0}))
217
+
218
+ # Prepare export data
219
+ export_data = {
220
+ 'metadata': {
221
+ 'exported_at': datetime.datetime.now().isoformat(),
222
+ 'pages_count': len(pages),
223
+ 'urls_count': len(urls),
224
+ 'stats_count': len(stats),
225
+ },
226
+ 'pages': pages,
227
+ 'urls': urls,
228
+ 'stats': stats
229
+ }
230
+
231
+ # Convert datetime objects to strings
232
+ export_data = json.loads(json.dumps(export_data, default=str))
233
+
234
+ # Export based on format
235
+ if export_format.lower() == 'json':
236
+ with open(output_file, 'w') as f:
237
+ json.dump(export_data, f, indent=2)
238
+ logger.info(f"Data exported to {output_file} in JSON format")
239
+ print(f"Data exported to {output_file} in JSON format")
240
+ elif export_format.lower() == 'csv':
241
+ # Split export into multiple CSV files
242
+ base_name = os.path.splitext(output_file)[0]
243
+
244
+ # Export pages
245
+ pages_file = f"{base_name}_pages.csv"
246
+ if pages:
247
+ with open(pages_file, 'w', newline='') as f:
248
+ writer = csv.DictWriter(f, fieldnames=pages[0].keys())
249
+ writer.writeheader()
250
+ writer.writerows(pages)
251
+
252
+ # Export URLs
253
+ urls_file = f"{base_name}_urls.csv"
254
+ if urls:
255
+ with open(urls_file, 'w', newline='') as f:
256
+ writer = csv.DictWriter(f, fieldnames=urls[0].keys())
257
+ writer.writeheader()
258
+ writer.writerows(urls)
259
+
260
+ # Export stats
261
+ stats_file = f"{base_name}_stats.csv"
262
+ if stats:
263
+ with open(stats_file, 'w', newline='') as f:
264
+ writer = csv.DictWriter(f, fieldnames=stats[0].keys())
265
+ writer.writeheader()
266
+ writer.writerows(stats)
267
+
268
+ logger.info(f"Data exported to {base_name}_*.csv files in CSV format")
269
+ print(f"Data exported to {base_name}_*.csv files in CSV format")
270
+ else:
271
+ logger.error(f"Unsupported export format: {export_format}")
272
+ print(f"Unsupported export format: {export_format}")
273
+ except Exception as e:
274
+ logger.error(f"Error exporting data: {e}")
275
+ print(f"Error exporting data: {e}")
276
+
277
+
278
+ def set_max_depth(depth: int) -> None:
279
+ """
280
+ Set maximum crawl depth
281
+
282
+ Args:
283
+ depth: Maximum crawl depth
284
+ """
285
+ try:
286
+ depth = int(depth)
287
+ if depth < 0:
288
+ logger.error("Depth must be a positive integer")
289
+ print("Depth must be a positive integer")
290
+ return
291
+
292
+ # Update configuration
293
+ config.MAX_DEPTH = depth
294
+
295
+ logger.info(f"Maximum crawl depth set to {depth}")
296
+ print(f"Maximum crawl depth set to {depth}")
297
+ except ValueError:
298
+ logger.error("Depth must be a valid integer")
299
+ print("Depth must be a valid integer")
300
+
301
+
302
+ def add_seed_urls(urls: List[str]) -> None:
303
+ """
304
+ Add seed URLs to the crawler
305
+
306
+ Args:
307
+ urls: List of URLs to add
308
+ """
309
+ if crawler is None:
310
+ initialize_crawler()
311
+
312
+ num_added = crawler.add_seed_urls(urls)
313
+ logger.info(f"Added {num_added} seed URLs")
314
+ print(f"Added {num_added} seed URLs")
315
+
316
+
317
+ def handle_signal(sig, frame):
318
+ """Handle signal interrupts"""
319
+ if sig == signal.SIGINT:
320
+ logger.info("Received SIGINT, stopping crawler")
321
+ stop_crawler()
322
+ sys.exit(0)
323
+ elif sig == signal.SIGTERM:
324
+ logger.info("Received SIGTERM, stopping crawler")
325
+ stop_crawler()
326
+ sys.exit(0)
327
+
328
+
329
+ def main():
330
+ """Main entry point"""
331
+ # Register signal handlers
332
+ signal.signal(signal.SIGINT, handle_signal)
333
+ signal.signal(signal.SIGTERM, handle_signal)
334
+
335
+ # Parse arguments
336
+ args = docopt(__doc__, version='Web Crawler 1.0')
337
+
338
+ # Handle commands
339
+ if args['start']:
340
+ workers = int(args['--workers'])
341
+ async_mode = args['--async']
342
+ seed_urls = args['--seed'] if args['--seed'] else []
343
+ start_crawler(workers, async_mode, seed_urls)
344
+ elif args['stop']:
345
+ stop_crawler()
346
+ elif args['pause']:
347
+ pause_crawler()
348
+ elif args['resume']:
349
+ resume_crawler()
350
+ elif args['stats']:
351
+ show_stats()
352
+ elif args['clean']:
353
+ days = int(args['--days'])
354
+ clean_data(days)
355
+ elif args['export']:
356
+ export_format = args['--format']
357
+ output_file = args['--output']
358
+ export_data(export_format, output_file)
359
+ elif args['set-max-depth']:
360
+ depth = args['<depth>']
361
+ set_max_depth(depth)
362
+ elif args['add-seed']:
363
+ urls = args['<url>']
364
+ add_seed_urls(urls)
365
+ else:
366
+ print(__doc__)
367
+
368
+
369
+ if __name__ == '__main__':
370
+ main()
crawler.log ADDED
File without changes
crawler.py ADDED
@@ -0,0 +1,908 @@
1
+ """
2
+ Main crawler class to coordinate the web crawling process
3
+ """
4
+
5
+ import time
6
+ import logging
7
+ import os
8
+ import asyncio
9
+ import threading
10
+ from typing import List, Dict, Set, Tuple, Optional, Any, Callable
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ import signal
13
+ import json
14
+ from datetime import datetime
15
+ from urllib.parse import urlparse
16
+ import traceback
17
+ from pymongo import MongoClient
18
+ from prometheus_client import Counter, Gauge, Histogram, start_http_server, REGISTRY
19
+ import redis
20
+
21
+ from models import URL, Page, URLStatus, Priority
22
+ from frontier import URLFrontier
23
+ from downloader import HTMLDownloader
24
+ from parser import HTMLParser
25
+ from robots import RobotsHandler
26
+ from dns_resolver import DNSResolver
27
+ import config
28
+ from dotenv import load_dotenv, find_dotenv
29
+
30
+ load_dotenv(find_dotenv())
31
+
32
+
33
+ # Check if we're in deployment mode
34
+ IS_DEPLOYMENT = os.getenv('DEPLOYMENT', 'false').lower() == 'true'
35
+
36
+ # Import local configuration if available
37
+ try:
38
+ import local_config
39
+ # Override config settings with local settings
40
+ for key in dir(local_config):
41
+ if key.isupper():
42
+ setattr(config, key, getattr(local_config, key))
43
+ print(f"Loaded local configuration from {local_config.__file__}")
44
+ except ImportError:
45
+ pass
46
+
47
+ # Configure logging
48
+ logging.basicConfig(
49
+ level=getattr(logging, config.LOG_LEVEL),
50
+ format=config.LOG_FORMAT
51
+ )
52
+ logger = logging.getLogger(__name__)
53
+
54
+
55
+ class Crawler:
56
+ """
57
+ Main crawler class that coordinates the web crawling process
58
+
59
+ Manages:
60
+ - URL Frontier
61
+ - HTML Downloader
62
+ - HTML Parser
63
+ - Content Storage
64
+ - Monitoring and Statistics
65
+ """
66
+
67
+ def __init__(self,
68
+ mongo_uri: Optional[str] = None,
69
+ redis_uri: Optional[str] = None,
70
+ metrics_port: int = 9100,
71
+ storage: Optional[Any] = None):
72
+ """
73
+ Initialize the crawler
74
+
75
+ Args:
76
+ mongo_uri: MongoDB URI for content storage
77
+ redis_uri: Redis URI for URL frontier
78
+ metrics_port: Port for Prometheus metrics server
79
+ storage: Optional storage backend for deployment mode
80
+ """
81
+ self.storage = storage
82
+ self.metrics_port = metrics_port
83
+
84
+ # Initialize database connections only if not using custom storage
85
+ if storage is None:
86
+ self.mongo_uri = mongo_uri or config.MONGODB_URI
87
+ self.redis_uri = redis_uri or config.REDIS_URI
88
+
89
+ # Connect to MongoDB
90
+ self.mongo_client = MongoClient(self.mongo_uri)
91
+ self.db = self.mongo_client[config.MONGODB_DB]
92
+ self.pages_collection = self.db['pages']
93
+ self.urls_collection = self.db['urls']
94
+ self.stats_collection = self.db['stats']
95
+
96
+ # Ensure indexes
97
+ self._create_indexes()
98
+
99
+ # Create frontier with Redis
100
+ self.frontier = URLFrontier(redis_client=redis.from_url(self.redis_uri))
101
+ else:
102
+ # In deployment mode, use in-memory storage
103
+ self.frontier = URLFrontier(use_memory=True)
104
+
105
+ # Create other components that don't need database connections
106
+ self.robots_handler = RobotsHandler()
107
+ self.dns_resolver = DNSResolver()
108
+ self.downloader = HTMLDownloader(self.dns_resolver, self.robots_handler)
109
+ self.parser = HTMLParser()
110
+
111
+ # Initialize statistics
112
+ self.stats = {
113
+ 'pages_crawled': 0,
114
+ 'pages_failed': 0,
115
+ 'urls_discovered': 0,
116
+ 'urls_filtered': 0,
117
+ 'start_time': time.time(),
118
+ 'domains_crawled': set(),
119
+ 'content_types': {},
120
+ 'status_codes': {},
121
+ }
122
+
123
+ # Set up metrics only in local mode
124
+ if not IS_DEPLOYMENT:
125
+ self._setup_metrics()
126
+ else:
127
+ # In deployment mode, use dummy metrics that do nothing
128
+ self.pages_crawled_counter = DummyMetric()
129
+ self.pages_failed_counter = DummyMetric()
130
+ self.urls_discovered_counter = DummyMetric()
131
+ self.urls_filtered_counter = DummyMetric()
132
+ self.frontier_size_gauge = DummyMetric()
133
+ self.active_threads_gauge = DummyMetric()
134
+ self.download_time_histogram = DummyMetric()
135
+ self.page_size_histogram = DummyMetric()
136
+
137
+ # Flag to control crawling
138
+ self.running = False
139
+ self.paused = False
140
+ self.stop_event = threading.Event()
141
+
142
+ # Create storage directories if they don't exist
143
+ os.makedirs(config.HTML_STORAGE_PATH, exist_ok=True)
144
+ os.makedirs(config.LOG_PATH, exist_ok=True)
145
+
146
+ def _create_indexes(self):
147
+ """Create indexes for MongoDB collections"""
148
+ try:
149
+ # Pages collection indexes
150
+ self.pages_collection.create_index('url', unique=True)
151
+ self.pages_collection.create_index('content_hash')
152
+ self.pages_collection.create_index('crawled_at')
153
+
154
+ # URLs collection indexes
155
+ self.urls_collection.create_index('url', unique=True)
156
+ self.urls_collection.create_index('normalized_url', unique=True)
157
+ self.urls_collection.create_index('domain')
158
+ self.urls_collection.create_index('status')
159
+ self.urls_collection.create_index('priority')
160
+
161
+ logger.info("MongoDB indexes created")
162
+ except Exception as e:
163
+ logger.error(f"Error creating MongoDB indexes: {e}")
164
+
165
+ def _setup_metrics(self):
166
+ """Set up Prometheus metrics"""
167
+ # Clean up any existing metrics
168
+ collectors_to_remove = []
169
+ for collector in REGISTRY._collector_to_names:
170
+ for name in REGISTRY._collector_to_names[collector]:
171
+ if name.startswith('crawler_'):
172
+ collectors_to_remove.append(collector)
173
+ break
174
+
175
+ for collector in collectors_to_remove:
176
+ REGISTRY.unregister(collector)
177
+
178
+ # Counters
179
+ self.pages_crawled_counter = Counter('crawler_pages_crawled_total', 'Total pages crawled')
180
+ self.pages_failed_counter = Counter('crawler_pages_failed_total', 'Total pages failed')
181
+ self.urls_discovered_counter = Counter('crawler_urls_discovered_total', 'Total URLs discovered')
182
+ self.urls_filtered_counter = Counter('crawler_urls_filtered_total', 'Total URLs filtered')
183
+
184
+ # Gauges
185
+ self.frontier_size_gauge = Gauge('crawler_frontier_size', 'Size of URL frontier')
186
+ self.active_threads_gauge = Gauge('crawler_active_threads', 'Number of active crawler threads')
187
+
188
+ # Histograms
189
+ self.download_time_histogram = Histogram('crawler_download_time_seconds', 'Time to download pages')
190
+ self.page_size_histogram = Histogram('crawler_page_size_bytes', 'Size of downloaded pages')
191
+
192
+ # Start metrics server
193
+ try:
194
+ start_http_server(self.metrics_port)
195
+ logger.info(f"Metrics server started on port {self.metrics_port}")
196
+ except Exception as e:
197
+ logger.error(f"Error starting metrics server: {e}")
198
+
199
+ def add_seed_urls(self, urls: List[str], priority: Priority = Priority.VERY_HIGH) -> int:
200
+ """
201
+ Add seed URLs to the frontier
202
+
203
+ Args:
204
+ urls: List of URLs to add
205
+ priority: Priority for the seed URLs
206
+
207
+ Returns:
208
+ Number of URLs added
209
+ """
210
+ added = 0
211
+ for url in urls:
212
+ url_obj = URL(
213
+ url=url,
214
+ status=URLStatus.PENDING,
215
+ priority=priority,
216
+ depth=0 # Seed URLs are at depth 0
217
+ )
218
+
219
+ # Save URL based on storage mode
220
+ try:
221
+ if self.storage is not None:
222
+ # Use custom storage in deployment mode
223
+ self.storage.add_url(url_obj)
224
+ else:
225
+ # Use MongoDB in local mode
226
+ self.urls_collection.update_one(
227
+ {'url': url},
228
+ {'$set': url_obj.dict()},
229
+ upsert=True
230
+ )
231
+ except Exception as e:
232
+ logger.error(f"Error saving seed URL to database: {e}")
233
+
234
+ # Add to frontier
235
+ if self.frontier.add_url(url_obj):
236
+ added += 1
237
+ self.urls_discovered_counter.inc()
238
+ logger.info(f"Added seed URL: {url}")
239
+
240
+ return added
241
+
242
+ def start(self, num_workers: int = None, async_mode: bool = False) -> None:
243
+ """
244
+ Start the crawler
245
+
246
+ Args:
247
+ num_workers: Number of worker threads
248
+ async_mode: Whether to use async mode
249
+ """
250
+ if self.running:
251
+ logger.warning("Crawler is already running")
252
+ return
253
+
254
+ num_workers = num_workers or config.MAX_WORKERS
255
+
256
+ # Reset stop event
257
+ self.stop_event.clear()
258
+
259
+ # Add seed URLs if frontier is empty
260
+ if self.frontier.size() == 0:
261
+ logger.info("Adding seed URLs")
262
+ self.add_seed_urls(config.SEED_URLS)
263
+
264
+ # Start crawler
265
+ self.running = True
266
+
267
+ # Register signal handlers
268
+ self._register_signal_handlers()
269
+
270
+ logger.info(f"Starting crawler with {num_workers} workers")
271
+
272
+ if async_mode:
273
+ # Use asyncio for crawler
274
+ try:
275
+ loop = asyncio.get_event_loop()
276
+ loop.run_until_complete(self._crawl_async(num_workers))
277
+ except KeyboardInterrupt:
278
+ logger.info("Crawler stopped by user")
279
+ except Exception as e:
280
+ logger.error(f"Error in async crawler: {e}")
281
+ logger.error(traceback.format_exc())
282
+ finally:
283
+ self._cleanup()
284
+ else:
285
+ # Use threads for crawler
286
+ with ThreadPoolExecutor(max_workers=num_workers) as executor:
287
+ try:
288
+ # Submit worker tasks
289
+ futures = [executor.submit(self._crawl_worker) for _ in range(num_workers)]
290
+
291
+ # Wait for completion
292
+ for future in futures:
293
+ future.result()
294
+ except KeyboardInterrupt:
295
+ logger.info("Crawler stopped by user")
296
+ except Exception as e:
297
+ logger.error(f"Error in threaded crawler: {e}")
298
+ logger.error(traceback.format_exc())
299
+ finally:
300
+ self._cleanup()
301
+
302
+ def _register_signal_handlers(self) -> None:
303
+ """Register signal handlers for graceful shutdown"""
304
+ def signal_handler(sig, frame):
305
+ logger.info(f"Received signal {sig}, shutting down")
306
+ self.stop()
307
+
308
+ signal.signal(signal.SIGINT, signal_handler)
309
+ signal.signal(signal.SIGTERM, signal_handler)
310
+
311
+ def _crawl_worker(self) -> None:
312
+ """Worker function for threaded crawler"""
313
+ try:
314
+ self.active_threads_gauge.inc()
315
+
316
+ while self.running and not self.stop_event.is_set():
317
+ # Check if paused
318
+ if self.paused:
319
+ time.sleep(1)
320
+ continue
321
+
322
+ # Get next URL from frontier
323
+ url_obj = self.frontier.get_next_url()
324
+
325
+ # No URL available, wait and retry
326
+ if url_obj is None:
327
+ time.sleep(1)
328
+ continue
329
+
330
+ try:
331
+ # Process the URL
332
+ self._process_url(url_obj)
333
+
334
+ # Update statistics
335
+ self._update_stats()
336
+
337
+ except Exception as e:
338
+ logger.error(f"Error processing URL {url_obj.url}: {e}")
339
+ logger.error(traceback.format_exc())
340
+
341
+ # Update URL status to failed
342
+ self._mark_url_failed(url_obj, str(e))
343
+ except Exception as e:
344
+ logger.error(f"Unhandled error in worker thread: {e}")
345
+ logger.error(traceback.format_exc())
346
+ finally:
347
+ self.active_threads_gauge.dec()
348
+
349
+ async def _crawl_async(self, num_workers: int) -> None:
350
+ """Async worker function for asyncio crawler"""
351
+ try:
352
+ self.active_threads_gauge.inc(num_workers)
353
+
354
+ # Create tasks
355
+ tasks = [self._async_worker() for _ in range(num_workers)]
356
+
357
+ # Wait for all tasks to complete
358
+ await asyncio.gather(*tasks)
359
+
360
+ except Exception as e:
361
+ logger.error(f"Unhandled error in async crawler: {e}")
362
+ logger.error(traceback.format_exc())
363
+ finally:
364
+ self.active_threads_gauge.dec(num_workers)
365
+
366
+ async def _async_worker(self) -> None:
367
+ """Async worker function"""
368
+ try:
369
+ while self.running and not self.stop_event.is_set():
370
+ # Check if paused
371
+ if self.paused:
372
+ await asyncio.sleep(1)
373
+ continue
374
+
375
+ # Get next URL from frontier
376
+ url_obj = self.frontier.get_next_url()
377
+
378
+ # No URL available, wait and retry
379
+ if url_obj is None:
380
+ await asyncio.sleep(1)
381
+ continue
382
+
383
+ try:
384
+ # Process the URL
385
+ await self._process_url_async(url_obj)
386
+
387
+ # Update statistics
388
+ self._update_stats()
389
+
390
+ except Exception as e:
391
+ logger.error(f"Error processing URL {url_obj.url}: {e}")
392
+ logger.error(traceback.format_exc())
393
+
394
+ # Update URL status to failed
395
+ self._mark_url_failed(url_obj, str(e))
396
+ except Exception as e:
397
+ logger.error(f"Unhandled error in async worker: {e}")
398
+ logger.error(traceback.format_exc())
399
+
400
+ def _process_url(self, url_obj: URL) -> None:
401
+ """
402
+ Process a URL
403
+
404
+ Args:
405
+ url_obj: URL object to process
406
+ """
407
+ url = url_obj.url
408
+ logger.debug(f"Processing URL: {url}")
409
+
410
+ # Download page
411
+ with self.download_time_histogram.time():
412
+ page = self.downloader.download(url_obj)
413
+
414
+ # If download failed
415
+ if page is None:
416
+ self.pages_failed_counter.inc()
417
+ self.stats['pages_failed'] += 1
418
+ self._mark_url_failed(url_obj, url_obj.error or "Download failed")
419
+ return
420
+
421
+ # Record page size
422
+ self.page_size_histogram.observe(page.content_length)
423
+
424
+ # Check for duplicate content
425
+ content_hash = page.content_hash
426
+ duplicate = self._check_duplicate_content(content_hash, url)
427
+
428
+ if duplicate:
429
+ logger.info(f"Duplicate content detected for URL {url}")
430
+ page.is_duplicate = True
431
+
432
+ # Mark URL as duplicate but still store the page
433
+ self._mark_url_completed(url_obj)
434
+ else:
435
+ # Parse page and extract URLs
436
+ extracted_urls, metadata = self.parser.parse(page)
437
+
438
+ # Store page metadata
439
+ page.metadata = metadata
440
+
441
+ # Process extracted URLs
442
+ self._process_extracted_urls(extracted_urls, url_obj, metadata)
443
+
444
+ # Mark URL as completed
445
+ self._mark_url_completed(url_obj)
446
+
447
+ # Store page
448
+ self._store_page(page)
449
+
450
+ # Update statistics
451
+ self.pages_crawled_counter.inc()
452
+ self.stats['pages_crawled'] += 1
453
+
454
+ # Add domain to statistics
455
+ domain = url_obj.domain
456
+ self.stats['domains_crawled'].add(domain)
457
+
458
+ # Update content type statistics
459
+ content_type = page.content_type.split(';')[0].strip()
460
+ self.stats['content_types'][content_type] = self.stats['content_types'].get(content_type, 0) + 1
461
+
462
+ # Update status code statistics
463
+ status_code = page.status_code
464
+ self.stats['status_codes'][str(status_code)] = self.stats['status_codes'].get(str(status_code), 0) + 1
465
+
466
+ async def _process_url_async(self, url_obj: URL) -> None:
467
+ """
468
+ Process a URL asynchronously
469
+
470
+ Args:
471
+ url_obj: URL object to process
472
+ """
473
+ url = url_obj.url
474
+ logger.debug(f"Processing URL (async): {url}")
475
+
476
+ # Download page
477
+ download_start = time.time()
478
+ page = await self.downloader.download_async(url_obj)
479
+ download_time = time.time() - download_start
480
+ self.download_time_histogram.observe(download_time)
481
+
482
+ # If download failed
483
+ if page is None:
484
+ self.pages_failed_counter.inc()
485
+ self.stats['pages_failed'] += 1
486
+ self._mark_url_failed(url_obj, url_obj.error or "Download failed")
487
+ return
488
+
489
+ # Record page size
490
+ self.page_size_histogram.observe(page.content_length)
491
+
492
+ # Check for duplicate content
493
+ content_hash = page.content_hash
494
+ duplicate = self._check_duplicate_content(content_hash, url)
495
+
496
+ if duplicate:
497
+ logger.info(f"Duplicate content detected for URL {url}")
498
+ page.is_duplicate = True
499
+
500
+ # Mark URL as duplicate but still store the page
501
+ self._mark_url_completed(url_obj)
502
+ else:
503
+ # Parse page and extract URLs
504
+ extracted_urls, metadata = self.parser.parse(page)
505
+
506
+ # Store page metadata
507
+ page.metadata = metadata
508
+
509
+ # Process extracted URLs
510
+ self._process_extracted_urls(extracted_urls, url_obj, metadata)
511
+
512
+ # Mark URL as completed
513
+ self._mark_url_completed(url_obj)
514
+
515
+ # Store page
516
+ self._store_page(page)
517
+
518
+ # Update statistics
519
+ self.pages_crawled_counter.inc()
520
+ self.stats['pages_crawled'] += 1
521
+
522
+ def _check_duplicate_content(self, content_hash: str, url: str) -> bool:
523
+ """
524
+ Check if content has been seen before
525
+
526
+ Args:
527
+ content_hash: Hash of the content
528
+ url: URL of the page
529
+
530
+ Returns:
531
+ True if content is a duplicate, False otherwise
532
+ """
533
+ try:
534
+ if self.storage is not None:
535
+ # Use custom storage - simplified duplicate check
536
+ for page in self.storage.pages.values():
537
+ if page.content_hash == content_hash and page.url != url:
538
+ return True
539
+ return False
540
+ else:
541
+ # Use MongoDB
542
+ return self.pages_collection.find_one({
543
+ 'content_hash': content_hash,
544
+ 'url': {'$ne': url}
545
+ }) is not None
546
+ except Exception as e:
547
+ logger.error(f"Error checking for duplicate content: {e}")
548
+ return False
549
+
550
+ def _process_extracted_urls(self, urls: List[str], parent_url_obj: URL, metadata: Dict[str, Any]) -> None:
551
+ """
552
+ Process extracted URLs
553
+
554
+ Args:
555
+ urls: List of URLs to process
556
+ parent_url_obj: Parent URL object
557
+ metadata: Metadata from the parent page
558
+ """
559
+ parent_url = parent_url_obj.url
560
+ parent_depth = parent_url_obj.depth
561
+
562
+ # Check max depth
563
+ if parent_depth >= config.MAX_DEPTH:
564
+ logger.debug(f"Max depth reached for {parent_url}")
565
+ return
566
+
567
+ for url in urls:
568
+ # Calculate priority based on URL and metadata
569
+ priority = self.parser.calculate_priority(url, metadata)
570
+
571
+ # Create URL object
572
+ url_obj = URL(
573
+ url=url,
574
+ status=URLStatus.PENDING,
575
+ priority=priority,
576
+ depth=parent_depth + 1,
577
+ parent_url=parent_url
578
+ )
579
+
580
+ # Add to frontier
581
+ if self.frontier.add_url(url_obj):
582
+ # URL was added to frontier
583
+ self.urls_discovered_counter.inc()
584
+ self.stats['urls_discovered'] += 1
585
+
586
+ # Save URL based on storage mode
587
+ try:
588
+ if self.storage is not None:
589
+ # Use custom storage in deployment mode
590
+ self.storage.add_url(url_obj)
591
+ else:
592
+ # Use MongoDB in local mode
593
+ self.urls_collection.update_one(
594
+ {'url': url},
595
+ {'$set': url_obj.dict()},
596
+ upsert=True
597
+ )
598
+ except Exception as e:
599
+ logger.error(f"Error saving URL to database: {e}")
600
+ else:
601
+ # URL was not added (filtered or duplicate)
602
+ self.urls_filtered_counter.inc()
603
+ self.stats['urls_filtered'] += 1
604
+
605
+ def _mark_url_completed(self, url_obj: URL) -> None:
606
+ """
607
+ Mark URL as completed
608
+
609
+ Args:
610
+ url_obj: URL object to mark as completed
611
+ """
612
+ try:
613
+ url_obj.status = URLStatus.COMPLETED
614
+ url_obj.completed_at = datetime.now()
615
+
616
+ if self.storage is not None:
617
+ # Use custom storage
618
+ self.storage.add_url(url_obj)
619
+ else:
620
+ # Use MongoDB
621
+ self.urls_collection.update_one(
622
+ {'url': url_obj.url},
623
+ {'$set': url_obj.dict()},
624
+ upsert=True
625
+ )
626
+ except Exception as e:
627
+ logger.error(f"Error marking URL as completed: {e}")
628
+
629
+ def _mark_url_failed(self, url_obj: URL, error: str) -> None:
630
+ """
631
+ Mark URL as failed
632
+
633
+ Args:
634
+ url_obj: URL object to mark as failed
635
+ error: Error message
636
+ """
637
+ try:
638
+ url_obj.status = URLStatus.FAILED
639
+ url_obj.error = error
640
+ url_obj.completed_at = datetime.now()
641
+
642
+ if self.storage is not None:
643
+ # Use custom storage
644
+ self.storage.add_url(url_obj)
645
+ else:
646
+ # Use MongoDB
647
+ self.urls_collection.update_one(
648
+ {'url': url_obj.url},
649
+ {'$set': url_obj.dict()},
650
+ upsert=True
651
+ )
652
+
653
+ # If retries not exceeded, add back to frontier with lower priority
654
+ if url_obj.retries < config.RETRY_TIMES:
+ url_obj.retries += 1  # count this attempt so retries eventually stop
655
+ # Lower priority by one level (to a maximum of VERY_LOW)
656
+ new_priority = min(Priority.VERY_LOW, Priority(url_obj.priority + 1))
657
+ url_obj.priority = new_priority
658
+ url_obj.status = URLStatus.PENDING
659
+
660
+ # Add back to frontier
661
+ self.frontier.add_url(url_obj)
662
+
663
+ except Exception as e:
664
+ logger.error(f"Error marking URL as failed: {e}")
665
+
666
+ def _store_page(self, page: Page) -> None:
667
+ """
668
+ Store a page in the database and optionally on disk
669
+
670
+ Args:
671
+ page: Page object to store
672
+ """
673
+ try:
674
+ if self.storage is not None:
675
+ # Use custom storage in deployment mode
676
+ self.storage.add_page(page)
677
+ else:
678
+ # Use MongoDB in local mode
679
+ self.pages_collection.update_one(
680
+ {'url': page.url},
681
+ {'$set': page.dict()},
682
+ upsert=True
683
+ )
684
+
685
+ # Optionally store HTML content on disk
686
+ if not page.is_duplicate:
687
+ # Same on-disk layout in both deployment and local mode
+ domain_dir = os.path.join(config.HTML_STORAGE_PATH, self._extract_domain(page.url))
+ os.makedirs(domain_dir, exist_ok=True)
+
+ # Create filename from URL
+ filename = self._url_to_filename(page.url)
+ filepath = os.path.join(domain_dir, filename)
+
+ # Write HTML to file
+ with open(filepath, 'w', encoding='utf-8') as f:
+ f.write(page.content)
+
+ logger.debug(f"Stored HTML content for {page.url} at {filepath}")
716
+ except Exception as e:
717
+ logger.error(f"Error storing page: {e}")
718
+
719
+ def _extract_domain(self, url: str) -> str:
720
+ """Extract domain from URL"""
721
+ parsed = urlparse(url)
722
+ return parsed.netloc.replace(':', '_')
723
+
724
+ def _url_to_filename(self, url: str) -> str:
725
+ """Convert URL to filename"""
726
+ # Hash the URL to create a safe filename
727
+ url_hash = self._hash_url(url)
728
+ return f"{url_hash}.html"
729
+
730
+ def _hash_url(self, url: str) -> str:
731
+ """Create a hash of a URL"""
732
+ import hashlib
733
+ return hashlib.md5(url.encode('utf-8')).hexdigest()
734
+
735
+ def _update_stats(self) -> None:
736
+ """Update and log statistics"""
737
+ # Update frontier size gauge
738
+ self.frontier_size_gauge.set(self.frontier.size())
739
+
740
+ # Log statistics periodically
741
+ if self.stats['pages_crawled'] % 100 == 0:
742
+ self._log_stats()
743
+
744
+ def _log_stats(self) -> None:
745
+ """Log crawler statistics"""
746
+ # Calculate elapsed time
747
+ elapsed = time.time() - self.stats['start_time']
748
+ hours, remainder = divmod(elapsed, 3600)
749
+ minutes, seconds = divmod(remainder, 60)
750
+
751
+ # Get current statistics
752
+ pages_crawled = self.stats['pages_crawled']
753
+ pages_failed = self.stats['pages_failed']
754
+ urls_discovered = self.stats['urls_discovered']
755
+ urls_filtered = self.stats['urls_filtered']
756
+ domains_crawled = len(self.stats['domains_crawled'])
757
+ frontier_size = self.frontier.size()
758
+
759
+ # Calculate pages per second
760
+ pages_per_second = pages_crawled / elapsed if elapsed > 0 else 0
761
+
762
+ # Log statistics
763
+ logger.info(
764
+ f"Crawler running for {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d} - "
765
+ f"Pages: {pages_crawled} ({pages_per_second:.2f}/s) - "
766
+ f"Failed: {pages_failed} - "
767
+ f"URLs Discovered: {urls_discovered} - "
768
+ f"URLs Filtered: {urls_filtered} - "
769
+ f"Domains: {domains_crawled} - "
770
+ f"Frontier: {frontier_size}"
771
+ )
772
+
773
+ # Save statistics to database
774
+ try:
775
+ stats_copy = self.stats.copy()
776
+ stats_copy['domains_crawled'] = list(stats_copy['domains_crawled'])
777
+ stats_copy['timestamp'] = datetime.now()
778
+
779
+ # stats_collection only exists in local (MongoDB) mode
+ if self.storage is None:
+ self.stats_collection.insert_one(stats_copy)
780
+ except Exception as e:
781
+ logger.error(f"Error saving statistics to database: {e}")
782
+
783
+ def stop(self) -> None:
784
+ """Stop the crawler"""
785
+ if not self.running:
786
+ logger.warning("Crawler is not running")
787
+ return
788
+
789
+ logger.info("Stopping crawler")
790
+ self.stop_event.set()
791
+ self.running = False
792
+
793
+ def pause(self) -> None:
794
+ """Pause the crawler"""
795
+ if not self.running:
796
+ logger.warning("Crawler is not running")
797
+ return
798
+
799
+ logger.info("Pausing crawler")
800
+ self.paused = True
801
+
802
+ def resume(self) -> None:
803
+ """Resume the crawler"""
804
+ if not self.running:
805
+ logger.warning("Crawler is not running")
806
+ return
807
+
808
+ logger.info("Resuming crawler")
809
+ self.paused = False
810
+
811
+ def checkpoint(self) -> bool:
812
+ """
813
+ Save crawler state for recovery
814
+
815
+ Returns:
816
+ True if successful, False otherwise
817
+ """
818
+ logger.info("Creating crawler checkpoint")
819
+
820
+ # Checkpoint the frontier
821
+ frontier_checkpoint = self.frontier.checkpoint()
822
+
823
+ # Save current statistics
824
+ try:
825
+ stats_copy = self.stats.copy()
826
+ stats_copy['domains_crawled'] = list(stats_copy['domains_crawled'])
827
+ stats_copy['checkpoint_time'] = datetime.now().isoformat()  # keep JSON-serializable for json.dump below
828
+
829
+ with open(os.path.join(config.STORAGE_PATH, 'crawler_stats.json'), 'w') as f:
830
+ json.dump(stats_copy, f)
831
+
832
+ logger.info("Crawler checkpoint created")
833
+ return frontier_checkpoint
834
+ except Exception as e:
835
+ logger.error(f"Error creating crawler checkpoint: {e}")
836
+ return False
837
+
838
+ def restore(self) -> bool:
839
+ """
840
+ Restore crawler state from checkpoint
841
+
842
+ Returns:
843
+ True if successful, False otherwise
844
+ """
845
+ logger.info("Restoring crawler from checkpoint")
846
+
847
+ # Restore frontier
848
+ frontier_restored = self.frontier.restore()
849
+
850
+ # Restore statistics
851
+ try:
852
+ stats_path = os.path.join(config.STORAGE_PATH, 'crawler_stats.json')
853
+ if os.path.exists(stats_path):
854
+ with open(stats_path, 'r') as f:
855
+ saved_stats = json.load(f)
856
+
857
+ # Restore stats
858
+ self.stats = saved_stats
859
+ self.stats['domains_crawled'] = set(self.stats['domains_crawled'])
860
+
861
+ logger.info("Crawler statistics restored")
862
+ else:
863
+ logger.warning("No statistics checkpoint found")
864
+
865
+ return frontier_restored
866
+ except Exception as e:
867
+ logger.error(f"Error restoring crawler checkpoint: {e}")
868
+ return False
869
+
870
+ def _cleanup(self) -> None:
871
+ """Clean up resources when crawler stops"""
872
+ # Create final checkpoint
873
+ self.checkpoint()
874
+
875
+ # Log final statistics
876
+ self._log_stats()
877
+
878
+ # Reset flags
879
+ self.running = False
880
+ self.paused = False
881
+
882
+ logger.info("Crawler stopped")
883
+
884
+
885
+ # Dummy metric class for deployment mode
886
+ class DummyMetric:
887
+ """A dummy metric that does nothing"""
888
+ def inc(self, *args, **kwargs): pass
889
+ def dec(self, *args, **kwargs): pass
890
+ def set(self, *args, **kwargs): pass
891
+ def observe(self, *args, **kwargs): pass
892
+ def time(self): return self.Timer()
893
+
894
+ class Timer:
895
+ def __enter__(self): pass
896
+ def __exit__(self, exc_type, exc_val, exc_tb): pass
897
+
898
+
899
+ if __name__ == "__main__":
900
+ # Create and start crawler
901
+ crawler = Crawler()
902
+
903
+ try:
904
+ crawler.start()
905
+ except KeyboardInterrupt:
906
+ logger.info("Crawler interrupted by user")
907
+ finally:
908
+ crawler.stop()
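
The __main__ block above runs the crawler with the defaults from config. As a minimal sketch, the class can also be driven programmatically; the seed URL and worker count below are illustrative, and MongoDB/Redis reachable via config are assumed.

from crawler import Crawler

crawler = Crawler(metrics_port=9100)                 # local mode: MongoDB + Redis from config
crawler.add_seed_urls(["https://example.com/"])      # illustrative seed
try:
    crawler.start(num_workers=4, async_mode=False)   # blocks until stopped or interrupted
except KeyboardInterrupt:
    pass
finally:
    crawler.checkpoint()                             # persist frontier + stats for a later restore()
    crawler.stop()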
deduplication.py ADDED
@@ -0,0 +1,422 @@
1
+ """
2
+ Content deduplication component for the web crawler.
3
+
4
+ Provides functionality to detect duplicate pages efficiently using:
5
+
6
+ 1. Exact content hashing
7
+ 2. Shingling and MinHash for near-duplicate detection
8
+ 3. SimHash for fuzzy matching
9
+ """
10
+
11
+ import hashlib
12
+ import logging
13
+ import time
14
+ from typing import Set, List, Dict, Tuple, Optional, Union
15
+ import random
16
+ import numpy as np
17
+ from collections import defaultdict
18
+ import re
19
+
20
+ import config
21
+
22
+ # Configure logging
23
+ logging.basicConfig(
24
+ level=getattr(logging, config.LOG_LEVEL),
25
+ format=config.LOG_FORMAT
26
+ )
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class ContentDeduplicator:
31
+ """
32
+ Content deduplication using multiple techniques:
33
+ - Exact match (MD5 hash)
34
+ - Near-duplicate detection (MinHash)
35
+ - Fuzzy matching (SimHash)
36
+ """
37
+
38
+ def __init__(self):
39
+ """Initialize the deduplicator"""
40
+ # Exact content hashing
41
+ self.content_hashes = set()
+ self.url_hashes = {}  # URL -> exact content hash, used to report which URL was duplicated
42
+
43
+ # MinHash parameters
44
+ self.num_hashes = 100
45
+ self.minhash_signatures = {} # URL -> MinHash signature
46
+ self.minhash_bands = defaultdict(set) # band_id -> set of URLs
47
+ self.band_size = 5 # Each band contains 5 signatures
48
+ self.shingle_size = 3 # k-shingles of 3 consecutive tokens
49
+
50
+ # SimHash parameters
51
+ self.simhash_dim = 64
52
+ self.simhash_values = {} # URL -> SimHash value
53
+ self.hamming_threshold = 3 # Maximum Hamming distance for similarity
54
+
55
+ # Cache of previously computed duplicates for quick lookups
56
+ self.duplicate_cache = {} # URL -> set of duplicate URLs
57
+
58
+ # Token preprocessing
59
+ self.token_pattern = re.compile(r'\w+')
60
+ self.stop_words = set(['the', 'and', 'a', 'to', 'of', 'in', 'is', 'that', 'for', 'on', 'with'])
61
+
62
+ # Statistics
63
+ self.stats = {
64
+ 'exact_duplicates': 0,
65
+ 'near_duplicates': 0,
66
+ 'fuzzy_duplicates': 0,
67
+ 'processing_time': 0,
68
+ 'total_documents': 0,
69
+ }
70
+
71
+ def is_duplicate(self, url: str, content: str) -> Tuple[bool, Optional[str]]:
72
+ """
73
+ Check if content is a duplicate
74
+
75
+ Args:
76
+ url: URL of the page
77
+ content: Page content
78
+
79
+ Returns:
80
+ (is_duplicate, duplicate_url): Tuple indicating if content is duplicate and what it duplicates
81
+ """
82
+ start_time = time.time()
83
+
84
+ # Check exact match first (fastest)
85
+ content_hash = self._hash_content(content)
86
+ if content_hash in self.content_hashes:
87
+ self.stats['exact_duplicates'] += 1
88
+ processing_time = time.time() - start_time
89
+ self.stats['processing_time'] += processing_time
90
+
91
+ # Find the URL with the same hash
92
+ for existing_url, existing_hash in self._get_hash_map().items():
93
+ if existing_hash == content_hash and existing_url != url:
94
+ logger.debug(f"Exact duplicate detected: {url} duplicates {existing_url}")
95
+ return True, existing_url
96
+
97
+ return True, None
98
+
99
+ # Check cache for quick lookup
100
+ if url in self.duplicate_cache:
101
+ duplicate_url = next(iter(self.duplicate_cache[url]))
102
+ logger.debug(f"Duplicate found in cache: {url} duplicates {duplicate_url}")
103
+ return True, duplicate_url
104
+
105
+ # Only perform more expensive checks if configured to do so
106
+ if config.NEAR_DUPLICATE_DETECTION:
107
+ # Check for near-duplicates using MinHash
108
+ near_duplicate = self._check_minhash(url, content)
109
+ if near_duplicate:
110
+ self.stats['near_duplicates'] += 1
111
+ processing_time = time.time() - start_time
112
+ self.stats['processing_time'] += processing_time
113
+
114
+ logger.debug(f"Near-duplicate detected: {url} is similar to {near_duplicate}")
115
+ self._add_to_duplicate_cache(url, near_duplicate)
116
+ return True, near_duplicate
117
+
118
+ if config.FUZZY_DUPLICATE_DETECTION:
119
+ # Check for fuzzy matches using SimHash
120
+ fuzzy_duplicate = self._check_simhash(url, content)
121
+ if fuzzy_duplicate:
122
+ self.stats['fuzzy_duplicates'] += 1
123
+ processing_time = time.time() - start_time
124
+ self.stats['processing_time'] += processing_time
125
+
126
+ logger.debug(f"Fuzzy duplicate detected: {url} is similar to {fuzzy_duplicate}")
127
+ self._add_to_duplicate_cache(url, fuzzy_duplicate)
128
+ return True, fuzzy_duplicate
129
+
130
+ # Not a duplicate, add to index
131
+ self._add_to_index(url, content, content_hash)
132
+
133
+ self.stats['total_documents'] += 1
134
+ processing_time = time.time() - start_time
135
+ self.stats['processing_time'] += processing_time
136
+
137
+ return False, None
138
+
139
+ def _add_to_duplicate_cache(self, url: str, duplicate_url: str) -> None:
140
+ """Add URL to duplicate cache for faster lookups"""
141
+ if url not in self.duplicate_cache:
142
+ self.duplicate_cache[url] = set()
143
+ self.duplicate_cache[url].add(duplicate_url)
144
+
145
+ # Also add reverse relationship
146
+ if duplicate_url not in self.duplicate_cache:
147
+ self.duplicate_cache[duplicate_url] = set()
148
+ self.duplicate_cache[duplicate_url].add(url)
149
+
150
+ def _get_hash_map(self) -> Dict[str, str]:
151
+ """Get mapping of URLs to their content hashes"""
152
+ return self.url_hashes
153
+
154
+ def _hash_content(self, content: str) -> str:
155
+ """Create MD5 hash of content"""
156
+ return hashlib.md5(content.encode('utf-8')).hexdigest()
157
+
158
+ def _preprocess_content(self, content: str) -> List[str]:
159
+ """
160
+ Preprocess content for tokenization:
161
+ 1. Convert to lowercase
162
+ 2. Remove HTML tags
163
+ 3. Extract tokens
164
+ 4. Remove stop words
165
+ """
166
+ # Remove HTML tags
167
+ content = re.sub(r'<[^>]+>', ' ', content)
168
+
169
+ # Tokenize
170
+ tokens = self.token_pattern.findall(content.lower())
171
+
172
+ # Remove stop words
173
+ tokens = [token for token in tokens if token not in self.stop_words]
174
+
175
+ return tokens
176
+
177
+ def _add_to_index(self, url: str, content: str, content_hash: Optional[str] = None) -> None:
178
+ """
179
+ Add content to the deduplication index
180
+
181
+ Args:
182
+ url: URL of the page
183
+ content: Page content
184
+ content_hash: Optional pre-computed hash
185
+ """
186
+ # Add exact hash
187
+ if content_hash is None:
188
+ content_hash = self._hash_content(content)
189
+ self.content_hashes.add(content_hash)
+ self.url_hashes[url] = content_hash
190
+
191
+ # Add MinHash signature
192
+ if config.NEAR_DUPLICATE_DETECTION:
193
+ signature = self._compute_minhash(content)
194
+ self.minhash_signatures[url] = signature
195
+
196
+ # Add to LSH bands
197
+ for i in range(0, self.num_hashes, self.band_size):
198
+ band = tuple(signature[i:i+self.band_size])
199
+ band_id = hash(band)
200
+ self.minhash_bands[band_id].add(url)
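+ # With num_hashes=100 and band_size=5 this forms 20 bands; two pages with
+ # Jaccard similarity s then share at least one band with probability about
+ # 1 - (1 - s^5)^20, which is why _check_minhash only compares against the
+ # candidates pulled from matching bands rather than every stored signature.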
201
+
202
+ # Add SimHash
203
+ if config.FUZZY_DUPLICATE_DETECTION:
204
+ simhash_value = self._compute_simhash(content)
205
+ self.simhash_values[url] = simhash_value
206
+
207
+ def _create_shingles(self, tokens: List[str], k: int = 3) -> Set[str]:
208
+ """
209
+ Create k-shingles from tokens
210
+
211
+ Args:
212
+ tokens: List of tokens
213
+ k: Size of shingles
214
+
215
+ Returns:
216
+ Set of shingles
217
+ """
218
+ return set(' '.join(tokens[i:i+k]) for i in range(len(tokens) - k + 1))
219
+
220
+ def _compute_minhash(self, content: str) -> List[int]:
221
+ """
222
+ Compute MinHash signature for content
223
+
224
+ Args:
225
+ content: Page content
226
+
227
+ Returns:
228
+ MinHash signature (list of hash values)
229
+ """
230
+ tokens = self._preprocess_content(content)
231
+ shingles = self._create_shingles(tokens, self.shingle_size)
232
+
233
+ # Generate random hash functions
234
+ max_hash = 2**32 - 1
235
+
236
+ # Create signature
237
+ signature = [max_hash] * self.num_hashes
238
+
239
+ # For each shingle, compute hashes and keep minimum values
240
+ for shingle in shingles:
241
+ # Use shingle as seed for random hash functions
242
+ shingle_hash = hash(shingle)
243
+
244
+ for i in range(self.num_hashes):
245
+ # Simple linear hash function: (a*x + b) mod c
246
+ a = i + 1 # Different 'a' for each hash function
247
+ b = i * i # Different 'b' for each hash function
248
+ hash_value = (a * shingle_hash + b) % max_hash
249
+
250
+ # Keep the minimum hash value
251
+ signature[i] = min(signature[i], hash_value)
252
+
253
+ return signature
254
+
255
+ def _check_minhash(self, url: str, content: str) -> Optional[str]:
256
+ """
257
+ Check for near-duplicates using MinHash and LSH
258
+
259
+ Args:
260
+ url: URL of the page
261
+ content: Page content
262
+
263
+ Returns:
264
+ URL of duplicate page if found, None otherwise
265
+ """
266
+ # Compute MinHash signature
267
+ signature = self._compute_minhash(content)
268
+
269
+ # Check each band for potential matches
270
+ candidate_urls = set()
271
+ for i in range(0, self.num_hashes, self.band_size):
272
+ band = tuple(signature[i:i+self.band_size])
273
+ band_id = hash(band)
274
+
275
+ # Get URLs that share this band
276
+ if band_id in self.minhash_bands:
277
+ candidate_urls.update(self.minhash_bands[band_id])
278
+
279
+ # Check Jaccard similarity with candidates
280
+ for candidate_url in candidate_urls:
281
+ if candidate_url == url:
282
+ continue
283
+
284
+ candidate_signature = self.minhash_signatures[candidate_url]
285
+ similarity = self._jaccard_similarity(signature, candidate_signature)
286
+
287
+ if similarity >= config.SIMILARITY_THRESHOLD:
288
+ return candidate_url
289
+
290
+ return None
291
+
292
+ def _jaccard_similarity(self, sig1: List[int], sig2: List[int]) -> float:
293
+ """
294
+ Estimate Jaccard similarity from MinHash signatures
295
+
296
+ Args:
297
+ sig1: First signature
298
+ sig2: Second signature
299
+
300
+ Returns:
301
+ Estimated Jaccard similarity (0-1)
302
+ """
303
+ if len(sig1) != len(sig2):
304
+ raise ValueError("Signatures must have the same length")
305
+
306
+ # Count matching hash values
307
+ matches = sum(1 for i in range(len(sig1)) if sig1[i] == sig2[i])
308
+
309
+ # Estimate similarity
310
+ return matches / len(sig1)
311
+
312
+ def _compute_simhash(self, content: str) -> int:
313
+ """
314
+ Compute SimHash for content
315
+
316
+ Args:
317
+ content: Page content
318
+
319
+ Returns:
320
+ SimHash value
321
+ """
322
+ tokens = self._preprocess_content(content)
323
+
324
+ # Initialize vector
325
+ v = [0] * self.simhash_dim
326
+
327
+ # For each token, compute hash and update vector
328
+ for token in tokens:
329
+ # Compute hash of token
330
+ token_hash = hashlib.md5(token.encode('utf-8')).digest()
331
+
332
+ # Convert to binary representation
333
+ token_bits = ''.join(format(byte, '08b') for byte in token_hash)
334
+
335
+ # Use first self.simhash_dim bits
336
+ token_bits = token_bits[:self.simhash_dim]
337
+
338
+ # Update vector
339
+ for i, bit in enumerate(token_bits):
340
+ if bit == '1':
341
+ v[i] += 1
342
+ else:
343
+ v[i] -= 1
344
+
345
+ # Create fingerprint
346
+ fingerprint = 0
347
+ for i, val in enumerate(v):
348
+ if val > 0:
349
+ fingerprint |= (1 << i)
350
+
351
+ return fingerprint
352
+
353
+ def _check_simhash(self, url: str, content: str) -> Optional[str]:
354
+ """
355
+ Check for fuzzy duplicates using SimHash
356
+
357
+ Args:
358
+ url: URL of the page
359
+ content: Page content
360
+
361
+ Returns:
362
+ URL of duplicate page if found, None otherwise
363
+ """
364
+ # Compute SimHash
365
+ simhash_value = self._compute_simhash(content)
366
+
367
+ # Compare with existing SimHash values
368
+ for existing_url, existing_simhash in self.simhash_values.items():
369
+ if existing_url == url:
370
+ continue
371
+
372
+ # Calculate Hamming distance
373
+ hamming_distance = bin(simhash_value ^ existing_simhash).count('1')
374
+
375
+ if hamming_distance <= self.hamming_threshold:
376
+ return existing_url
377
+
378
+ return None
379
+
380
+ def clear(self) -> None:
381
+ """Clear all indexes and caches"""
382
+ self.content_hashes.clear()
+ self.url_hashes.clear()
383
+ self.minhash_signatures.clear()
384
+ self.minhash_bands.clear()
385
+ self.simhash_values.clear()
386
+ self.duplicate_cache.clear()
387
+
388
+ # Reset statistics
389
+ self.stats = {
390
+ 'exact_duplicates': 0,
391
+ 'near_duplicates': 0,
392
+ 'fuzzy_duplicates': 0,
393
+ 'processing_time': 0,
394
+ 'total_documents': 0,
395
+ }
396
+
397
+ def get_stats(self) -> Dict[str, Union[int, float]]:
398
+ """Get deduplication statistics"""
399
+ stats_copy = self.stats.copy()
400
+
401
+ # Calculate average processing time
402
+ total_docs = self.stats['total_documents']
403
+ if total_docs > 0:
404
+ avg_time = self.stats['processing_time'] / total_docs
405
+ stats_copy['avg_processing_time'] = avg_time
406
+ else:
407
+ stats_copy['avg_processing_time'] = 0
408
+
409
+ # Calculate total duplicates
410
+ total_duplicates = (self.stats['exact_duplicates'] +
411
+ self.stats['near_duplicates'] +
412
+ self.stats['fuzzy_duplicates'])
413
+ stats_copy['total_duplicates'] = total_duplicates
414
+
415
+ # Calculate duplicate percentage
416
+ if total_docs > 0:
417
+ duplicate_percentage = (total_duplicates / total_docs) * 100
418
+ stats_copy['duplicate_percentage'] = duplicate_percentage
419
+ else:
420
+ stats_copy['duplicate_percentage'] = 0
421
+
422
+ return stats_copy
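
A minimal sketch of using ContentDeduplicator on its own. The URLs and HTML snippets are illustrative, and NEAR_DUPLICATE_DETECTION, FUZZY_DUPLICATE_DETECTION and SIMILARITY_THRESHOLD are assumed to be set in config.

from deduplication import ContentDeduplicator

dedup = ContentDeduplicator()

html_a = "<html><body><p>Breaking news: example story</p></body></html>"
html_b = "<html><body><p>Breaking news: example story!</p></body></html>"

print(dedup.is_duplicate("https://example.com/a", html_a))  # (False, None) - first sighting, gets indexed
print(dedup.is_duplicate("https://example.com/b", html_a))  # (True, ...) - exact duplicate of /a
print(dedup.is_duplicate("https://example.com/c", html_b))  # may report a near/fuzzy duplicate, depending on thresholds
print(dedup.get_stats())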
dns_resolver.py ADDED
@@ -0,0 +1,161 @@
1
+ """
2
+ DNS resolver with caching for web crawler
3
+ """
4
+
5
+ import socket
6
+ import logging
7
+ import time
8
+ from typing import Dict, Optional, Tuple
9
+ from urllib.parse import urlparse
10
+ from datetime import datetime, timedelta
11
+ from cachetools import TTLCache
12
+ import threading
13
+ import dns
14
+ import dns.resolver
15
+
16
+ import config
17
+
18
+ # Import local configuration if available
19
+ try:
20
+ import local_config
21
+ # Override config settings with local settings
22
+ for key in dir(local_config):
23
+ if key.isupper():
24
+ setattr(config, key, getattr(local_config, key))
25
+ logging.info("Loaded local configuration")
26
+ except ImportError:
27
+ pass
28
+
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=getattr(logging, config.LOG_LEVEL),
32
+ format=config.LOG_FORMAT
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class DNSResolver:
38
+ """
39
+ DNS resolver with caching to improve performance
40
+
41
+ DNS resolution can be a bottleneck for crawlers due to the synchronous
42
+ nature of many DNS interfaces. This class provides a cached resolver
43
+ to reduce the number of DNS lookups.
44
+ """
45
+
46
+ def __init__(self, cache_size: int = 10000, cache_ttl: int = 3600):
47
+ """
48
+ Initialize DNS resolver
49
+
50
+ Args:
51
+ cache_size: Maximum number of DNS records to cache
52
+ cache_ttl: Time to live for cache entries in seconds
53
+ """
54
+ self.cache = TTLCache(maxsize=cache_size, ttl=cache_ttl)
55
+ self.lock = threading.RLock() # Thread-safe operations
56
+ self.resolver = dns.resolver.Resolver()
57
+ self.resolver.timeout = 3.0 # Timeout for DNS requests in seconds
58
+ self.resolver.lifetime = 5.0 # Total timeout for all DNS requests
59
+
60
+ # Stats tracking
61
+ self.hit_count = 0
62
+ self.miss_count = 0
63
+
64
+ def resolve(self, url: str) -> Optional[str]:
65
+ """
66
+ Resolve a URL to an IP address
67
+
68
+ Args:
69
+ url: URL to resolve
70
+
71
+ Returns:
72
+ IP address or None if resolution fails
73
+ """
74
+ try:
75
+ parsed = urlparse(url)
76
+ hostname = parsed.netloc.split(':')[0] # Remove port if present
77
+
78
+ # Check cache first
79
+ with self.lock:
80
+ if hostname in self.cache:
81
+ logger.debug(f"DNS cache hit for {hostname}")
82
+ self.hit_count += 1
83
+ return self.cache[hostname]
84
+
85
+ # Cache miss - resolve hostname
86
+ ip_address = self._resolve_hostname(hostname)
87
+
88
+ # Update cache
89
+ if ip_address:
90
+ with self.lock:
91
+ self.cache[hostname] = ip_address
92
+ self.miss_count += 1
93
+
94
+ return ip_address
95
+
96
+ except Exception as e:
97
+ logger.warning(f"Error resolving DNS for {url}: {e}")
98
+ return None
99
+
100
+ def _resolve_hostname(self, hostname: str) -> Optional[str]:
101
+ """
102
+ Resolve hostname to IP address
103
+
104
+ Args:
105
+ hostname: Hostname to resolve
106
+
107
+ Returns:
108
+ IP address or None if resolution fails
109
+ """
110
+ try:
111
+ # First try using dnspython for more control
112
+ answers = self.resolver.resolve(hostname, 'A')
113
+ if answers:
114
+ # Return first IP address
115
+ return str(answers[0])
116
+ except dns.exception.DNSException as e:
117
+ logger.debug(f"dnspython DNS resolution failed for {hostname}: {e}")
118
+
119
+ # Fall back to socket.gethostbyname
120
+ try:
121
+ return socket.gethostbyname(hostname)
122
+ except socket.gaierror as e:
123
+ logger.warning(f"Socket DNS resolution failed for {hostname}: {e}")
124
+ return None
125
+
126
+ def bulk_resolve(self, urls: list) -> Dict[str, Optional[str]]:
127
+ """
128
+ Resolve multiple URLs to IP addresses
129
+
130
+ Args:
131
+ urls: List of URLs to resolve
132
+
133
+ Returns:
134
+ Dictionary mapping URLs to IP addresses
135
+ """
136
+ results = {}
137
+ for url in urls:
138
+ results[url] = self.resolve(url)
139
+ return results
140
+
141
+ def clear_cache(self) -> None:
142
+ """Clear the DNS cache"""
143
+ with self.lock:
144
+ self.cache.clear()
145
+
146
+ def get_stats(self) -> Dict[str, int]:
147
+ """
148
+ Get statistics about the DNS cache
149
+
150
+ Returns:
151
+ Dictionary with cache statistics
152
+ """
153
+ with self.lock:
154
+ return {
155
+ 'size': len(self.cache),
156
+ 'max_size': self.cache.maxsize,
157
+ 'ttl': self.cache.ttl,
158
+ 'hit_count': self.hit_count,
159
+ 'miss_count': self.miss_count,
160
+ 'hit_ratio': self.hit_count / (self.hit_count + self.miss_count) if (self.hit_count + self.miss_count) > 0 else 0
161
+ }
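
A minimal sketch of the resolver's caching behaviour; example.com is illustrative, and the second call is served from the in-memory TTL cache because both URLs share a hostname.

from dns_resolver import DNSResolver

resolver = DNSResolver(cache_size=1000, cache_ttl=600)

ip_first = resolver.resolve("https://example.com/page")    # cache miss: dnspython lookup, socket fallback
ip_again = resolver.resolve("https://example.com/other")   # cache hit: same hostname

print(ip_first, ip_again)
print(resolver.get_stats())   # size, hit_count, miss_count, hit_ratio, ...
resolver.clear_cache()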
docker-compose.yml ADDED
@@ -0,0 +1,79 @@
1
+ version: '3'
2
+
3
+ services:
4
+ mongodb:
5
+ image: mongo:6.0
6
+ container_name: crawler-mongodb
7
+ ports:
8
+ - "27017:27017"
9
+ volumes:
10
+ - mongodb_data:/data/db
11
+ restart: unless-stopped
12
+ environment:
13
+ - MONGO_INITDB_DATABASE=webcrawler
14
+ networks:
15
+ - crawler-network
16
+
17
+ redis:
18
+ image: redis:latest
19
+ container_name: crawler-redis
20
+ ports:
21
+ - "6379:6379"
22
+ volumes:
23
+ - redis_data:/data
24
+ restart: unless-stopped
25
+ networks:
26
+ - crawler-network
27
+
28
+ web-crawler:
29
+ build:
30
+ context: .
31
+ dockerfile: Dockerfile
32
+ container_name: web-crawler
33
+ volumes:
34
+ - ./:/app
35
+ - crawler_data:/data/storage
36
+ ports:
37
+ - "9100:9100"
38
+ depends_on:
39
+ - mongodb
40
+ - redis
41
+ environment:
42
+ - MONGODB_URI=mongodb://mongodb:27017/
43
+ - REDIS_URI=redis://redis:6379/0
44
+ - LOG_LEVEL=INFO
45
+ - MAX_WORKERS=4
46
+ networks:
47
+ - crawler-network
48
+ command: python crawl.py start --workers=4
49
+
50
+ crawler-api:
51
+ build:
52
+ context: .
53
+ dockerfile: Dockerfile
54
+ container_name: crawler-api
55
+ volumes:
56
+ - ./:/app
57
+ - crawler_data:/data/storage
58
+ ports:
59
+ - "8000:8000"
60
+ depends_on:
61
+ - mongodb
62
+ - redis
63
+ - web-crawler
64
+ environment:
65
+ - MONGODB_URI=mongodb://mongodb:27017/
66
+ - REDIS_URI=redis://redis:6379/0
67
+ - LOG_LEVEL=INFO
68
+ networks:
69
+ - crawler-network
70
+ command: python api.py
71
+
72
+ networks:
73
+ crawler-network:
74
+ driver: bridge
75
+
76
+ volumes:
77
+ mongodb_data:
78
+ redis_data:
79
+ crawler_data:
downloader.py ADDED
@@ -0,0 +1,400 @@
1
+ """
2
+ HTML Downloader component for web crawler
3
+ """
4
+
5
+ import time
6
+ import logging
7
+ import requests
8
+ from requests.exceptions import RequestException
9
+ from typing import Dict, Optional, Tuple, List, Any
10
+ from urllib.parse import urlparse
11
+ import aiohttp
12
+ import asyncio
13
+ from aiohttp.client_exceptions import ClientError
14
+ import hashlib
15
+ import os
16
+
17
+ from models import URL, Page, calculate_content_hash
18
+ from dns_resolver import DNSResolver
19
+ from robots import RobotsHandler
20
+ import config
21
+
22
+ # Configure logging
23
+ logging.basicConfig(
24
+ level=getattr(logging, config.LOG_LEVEL),
25
+ format=config.LOG_FORMAT
26
+ )
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class HTMLDownloader:
31
+ """
32
+ HTML Downloader responsible for downloading web pages
33
+
34
+ Features:
35
+ - Respects robots.txt rules
36
+ - Uses DNS caching for performance
37
+ - Handles errors and retries
38
+ - Supports both synchronous and asynchronous downloads
39
+ """
40
+
41
+ def __init__(self,
42
+ dns_resolver: Optional[DNSResolver] = None,
43
+ robots_handler: Optional[RobotsHandler] = None,
44
+ user_agent: Optional[str] = None):
45
+ """
46
+ Initialize HTML Downloader
47
+
48
+ Args:
49
+ dns_resolver: DNS resolver for hostname resolution
50
+ robots_handler: Handler for robots.txt
51
+ user_agent: User agent to use for requests
52
+ """
53
+ self.dns_resolver = dns_resolver or DNSResolver()
54
+ self.robots_handler = robots_handler or RobotsHandler()
55
+ self.user_agent = user_agent or config.USER_AGENT
56
+
57
+ # Create request session
58
+ self.session = requests.Session()
59
+ self.session.headers.update({
60
+ 'User-Agent': self.user_agent,
61
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
62
+ 'Accept-Language': 'en-US,en;q=0.5',
63
+ 'Accept-Encoding': 'gzip, deflate, br',
64
+ 'Connection': 'keep-alive',
65
+ 'Upgrade-Insecure-Requests': '1',
66
+ 'Cache-Control': 'max-age=0'
67
+ })
68
+
69
+ def download(self, url_obj: URL) -> Optional[Page]:
70
+ """
71
+ Download an HTML page from a URL
72
+
73
+ Args:
74
+ url_obj: URL object to download
75
+
76
+ Returns:
77
+ Page object or None if download fails
78
+ """
79
+ url = url_obj.url
80
+ try:
81
+ # Check robots.txt first
82
+ if config.ROBOTSTXT_OBEY:
83
+ allowed, crawl_delay = self.robots_handler.can_fetch(url)
84
+ if not allowed:
85
+ logger.info(f"URL not allowed by robots.txt: {url}")
86
+ url_obj.status = "robotstxt_excluded"
87
+ return None
88
+
89
+ # Respect crawl delay if specified
90
+ if crawl_delay and crawl_delay > 0:
91
+ time.sleep(crawl_delay)
92
+
93
+ # Resolve DNS
94
+ ip_address = self.dns_resolver.resolve(url)
95
+ if not ip_address:
96
+ logger.warning(f"Failed to resolve DNS for URL: {url}")
97
+ url_obj.error = "DNS resolution failed"
98
+ return None
99
+
100
+ # Download page with specific headers
101
+ start_time = time.time()
102
+ response = self.session.get(
103
+ url,
104
+ timeout=config.CRAWL_TIMEOUT,
105
+ allow_redirects=True,
106
+ stream=True, # Stream to avoid downloading large files fully
107
+ headers={
108
+ 'User-Agent': self.user_agent,
109
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
110
+ 'Accept-Language': 'en-US,en;q=0.5',
111
+ 'Accept-Encoding': 'gzip', # Only accept gzip to avoid encoding issues
112
+ 'Connection': 'keep-alive'
113
+ }
114
+ )
115
+
116
+ # Log response details
117
+ logger.debug(f"Response status code: {response.status_code}")
118
+ logger.debug(f"Response headers: {dict(response.headers)}")
119
+
120
+ # Check content type
121
+ content_type = response.headers.get('Content-Type', '').lower()
122
+ logger.debug(f"Content type for {url}: {content_type}")
123
+
124
+ is_html = any(allowed_type in content_type for allowed_type in config.ALLOWED_CONTENT_TYPES) or \
125
+ any(allowed_type == '*/*' for allowed_type in config.ALLOWED_CONTENT_TYPES)
126
+
127
+ if not is_html:
128
+ logger.info(f"Skipping non-HTML content ({content_type}): {url}")
129
+ url_obj.error = f"Non-HTML content type: {content_type}"
130
+ return None
131
+
132
+ # Read content (with size limit)
133
+ content = b""
134
+ for chunk in response.iter_content(chunk_size=1024*1024): # 1MB chunks
135
+ content += chunk
136
+ if len(content) > config.MAX_CONTENT_SIZE:
137
+ logger.info(f"Content exceeded max size during download: {url}")
138
+ url_obj.error = f"Content exceeded max size: {len(content)} bytes"
139
+ return None
140
+
141
+ # Log content details
142
+ logger.debug(f"Downloaded content size: {len(content)} bytes")
143
+ logger.debug(f"First 100 bytes (hex): {content[:100].hex()}")
144
+
145
+ # Check for UTF-8 BOM
146
+ if content.startswith(b'\xef\xbb\xbf'):
147
+ content = content[3:]
148
+ logger.debug("Removed UTF-8 BOM from content")
149
+
150
+ # Try to detect encoding from response headers
151
+ encoding = None
152
+ if 'charset=' in content_type:
153
+ encoding = content_type.split('charset=')[-1].strip()
154
+ logger.debug(f"Found encoding in Content-Type header: {encoding}")
155
+
156
+ # Try to detect encoding from content
157
+ try:
158
+ import chardet
159
+ detected = chardet.detect(content)
160
+ if detected['confidence'] > 0.8: # Only use if confidence is high
161
+ encoding = detected['encoding']
162
+ logger.debug(f"Detected encoding using chardet: {encoding} (confidence: {detected['confidence']})")
163
+ except ImportError:
164
+ logger.debug("chardet not available for encoding detection")
165
+
166
+ # Decode content with fallbacks
167
+ html_content = None
168
+ encodings_to_try = [
169
+ encoding,
170
+ 'utf-8',
171
+ 'utf-8-sig',
172
+ 'iso-8859-1',
173
+ 'cp1252',
174
+ 'ascii'
175
+ ]
176
+
177
+ for enc in encodings_to_try:
178
+ if not enc:
179
+ continue
180
+ try:
181
+ html_content = content.decode(enc)
182
+ # Quick validation of HTML content
183
+ if '<!DOCTYPE' in html_content[:1000] or '<html' in html_content[:1000]:
184
+ logger.debug(f"Successfully decoded content using {enc} encoding")
185
+ break
186
+ else:
187
+ logger.debug(f"Decoded with {enc} but content doesn't look like HTML")
188
+ html_content = None
189
+ except UnicodeDecodeError:
190
+ logger.debug(f"Failed to decode content using {enc} encoding")
191
+ continue
192
+
193
+ if html_content is None:
194
+ logger.warning(f"Failed to decode content for URL: {url} with any encoding")
195
+ url_obj.error = "Failed to decode content"
196
+ return None
197
+
198
+ # Additional HTML validation
199
+ if not any(marker in html_content[:1000] for marker in ['<!DOCTYPE', '<html', '<head', '<body']):
200
+ logger.warning(f"Content doesn't appear to be valid HTML for URL: {url}")
201
+ url_obj.error = "Invalid HTML content"
202
+ return None
203
+
204
+ # Calculate hash for duplicate detection
205
+ content_hash = calculate_content_hash(html_content)
206
+
207
+ elapsed_time = time.time() - start_time
208
+
209
+ # Create page object
210
+ page = Page(
211
+ url=url,
212
+ status_code=response.status_code,
213
+ content=html_content,
214
+ content_type=content_type,
215
+ content_length=len(content),
216
+ content_hash=content_hash,
217
+ headers={k.lower(): v for k, v in response.headers.items()},
218
+ crawled_at=time.time(),
219
+ redirect_url=response.url if response.url != url else None,
220
+ elapsed_time=elapsed_time
221
+ )
222
+
223
+ logger.info(f"Downloaded {len(content)} bytes from {url} in {elapsed_time:.2f}s")
224
+ return page
225
+
226
+ except RequestException as e:
227
+ logger.warning(f"Request error for URL {url}: {e}")
228
+ url_obj.error = f"Request error: {str(e)}"
229
+ return None
230
+
231
+ except Exception as e:
232
+ logger.error(f"Unexpected error downloading URL {url}: {e}")
233
+ url_obj.error = f"Unexpected error: {str(e)}"
234
+ return None
235
+
236
+ async def download_async(self, url_obj: URL, session: Optional[aiohttp.ClientSession] = None) -> Optional[Page]:
237
+ """
238
+ Download an HTML page asynchronously
239
+
240
+ Args:
241
+ url_obj: URL object to download
242
+ session: Optional aiohttp session to use
243
+
244
+ Returns:
245
+ Page object or None if download fails
246
+ """
247
+ url = url_obj.url
248
+ own_session = False
249
+
250
+ try:
251
+ # Check robots.txt first (blocking call)
252
+ if config.ROBOTSTXT_OBEY:
253
+ allowed, crawl_delay = self.robots_handler.can_fetch(url)
254
+ if not allowed:
255
+ logger.info(f"URL not allowed by robots.txt: {url}")
256
+ url_obj.status = "robotstxt_excluded"
257
+ return None
258
+
259
+ # Respect crawl delay if specified
260
+ if crawl_delay and crawl_delay > 0:
261
+ await asyncio.sleep(crawl_delay)
262
+
263
+ # Resolve DNS (blocking call, but cached)
264
+ ip_address = self.dns_resolver.resolve(url)
265
+ if not ip_address:
266
+ logger.warning(f"Failed to resolve DNS for URL: {url}")
267
+ url_obj.error = "DNS resolution failed"
268
+ return None
269
+
270
+ # Create session if not provided
271
+ if session is None:
272
+ own_session = True
273
+ session = aiohttp.ClientSession(headers={
274
+ 'User-Agent': self.user_agent,
275
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
276
+ 'Accept-Language': 'en-US,en;q=0.5',
277
+ 'Accept-Encoding': 'gzip, deflate, br',
278
+ 'Connection': 'keep-alive',
279
+ 'Upgrade-Insecure-Requests': '1',
280
+ 'Cache-Control': 'max-age=0'
281
+ })
282
+
283
+ # Download page
284
+ start_time = time.time()
285
+ async with session.get(url, timeout=config.CRAWL_TIMEOUT, allow_redirects=True) as response:
286
+ # Check content type
287
+ content_type = response.headers.get('Content-Type', '').lower()
288
+ is_html = any(allowed_type in content_type for allowed_type in config.ALLOWED_CONTENT_TYPES)
289
+
290
+ if not is_html:
291
+ logger.info(f"Skipping non-HTML content ({content_type}): {url}")
292
+ url_obj.error = f"Non-HTML content type: {content_type}"
293
+ return None
294
+
295
+ # Check content length
296
+ content_length = int(response.headers.get('Content-Length', 0))
297
+ if content_length > config.MAX_CONTENT_SIZE:
298
+ logger.info(f"Skipping large content ({content_length} bytes): {url}")
299
+ url_obj.error = f"Content too large: {content_length} bytes"
300
+ return None
301
+
302
+ # Read content (with size limit)
303
+ content = b""
304
+ async for chunk in response.content.iter_chunked(1024*1024): # 1MB chunks
305
+ content += chunk
306
+ if len(content) > config.MAX_CONTENT_SIZE:
307
+ logger.info(f"Content exceeded max size during download: {url}")
308
+ url_obj.error = f"Content exceeded max size: {len(content)} bytes"
309
+ return None
310
+
311
+ # Decode content
312
+ try:
313
+ html_content = content.decode('utf-8')
314
+ except UnicodeDecodeError:
315
+ try:
316
+ # Try with a more forgiving encoding
317
+ html_content = content.decode('iso-8859-1')
318
+ except UnicodeDecodeError:
319
+ logger.warning(f"Failed to decode content for URL: {url}")
320
+ url_obj.error = "Failed to decode content"
321
+ return None
322
+
323
+ # Calculate hash for duplicate detection
324
+ content_hash = calculate_content_hash(html_content)
325
+
326
+ elapsed_time = time.time() - start_time
327
+
328
+ # Create page object
329
+ page = Page(
330
+ url=url,
331
+ status_code=response.status,
332
+ content=html_content,
333
+ content_type=content_type,
334
+ content_length=len(content),
335
+ content_hash=content_hash,
336
+ headers={k.lower(): v for k, v in response.headers.items()},
337
+ crawled_at=time.time(),
338
+ redirect_url=str(response.url) if str(response.url) != url else None,
339
+ elapsed_time=elapsed_time
340
+ )
341
+
342
+ logger.info(f"Downloaded {len(content)} bytes from {url} in {elapsed_time:.2f}s")
343
+ return page
344
+
345
+ except (ClientError, asyncio.TimeoutError) as e:
346
+ logger.warning(f"Request error for URL {url}: {e}")
347
+ url_obj.error = f"Request error: {str(e)}"
348
+ return None
349
+
350
+ except Exception as e:
351
+ logger.error(f"Unexpected error downloading URL {url}: {e}")
352
+ url_obj.error = f"Unexpected error: {str(e)}"
353
+ return None
354
+
355
+ finally:
356
+ # Close session if we created it
357
+ if own_session and session:
358
+ await session.close()
359
+
360
+ async def bulk_download(self, urls: List[URL], concurrency: int = 10) -> Dict[str, Optional[Page]]:
361
+ """
362
+ Download multiple URLs concurrently
363
+
364
+ Args:
365
+ urls: List of URL objects to download
366
+ concurrency: Maximum number of concurrent downloads
367
+
368
+ Returns:
369
+ Dictionary mapping URL strings to Page objects
370
+ """
371
+ results = {}
372
+
373
+ # Create a session to be shared across requests
374
+ async with aiohttp.ClientSession(headers={
375
+ 'User-Agent': self.user_agent,
376
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
377
+ 'Accept-Language': 'en-US,en;q=0.5',
378
+ 'Accept-Encoding': 'gzip, deflate, br',
379
+ 'Connection': 'keep-alive',
380
+ 'Upgrade-Insecure-Requests': '1',
381
+ 'Cache-Control': 'max-age=0'
382
+ }) as session:
383
+ # Create a semaphore to limit concurrency
384
+ semaphore = asyncio.Semaphore(concurrency)
385
+
386
+ async def download_with_semaphore(url_obj):
387
+ async with semaphore:
388
+ return await self.download_async(url_obj, session)
389
+
390
+ # Create download tasks
391
+ tasks = [download_with_semaphore(url_obj) for url_obj in urls]
392
+
393
+ # Wait for all tasks to complete
394
+ pages = await asyncio.gather(*tasks)
395
+
396
+ # Map results
397
+ for url_obj, page in zip(urls, pages):
398
+ results[url_obj.url] = page
399
+
400
+ return results
example.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example script that demonstrates how to use the web crawler programmatically.
4
+
5
+ This example:
6
+ 1. Initializes the crawler
7
+ 2. Adds seed URLs
8
+ 3. Starts the crawler with 2 workers
9
+ 4. Monitors progress for a specific duration
10
+ 5. Pauses, resumes, and stops the crawler
11
+ 6. Exports crawl data
12
+
13
+ Usage:
14
+ python example.py [--time=<seconds>] [--workers=<num>] [--async]
15
+
16
+ Options:
17
+ --time=<seconds> Duration of the crawl in seconds [default: 60]
18
+ --workers=<num> Number of worker threads [default: 2]
19
+ --async Use asynchronous mode
20
+ """
21
+
22
+ import time
23
+ import logging
24
+ import sys
25
+ import json
26
+ import os
27
+ import signal
28
+ import threading
29
+ from docopt import docopt
30
+
31
+ from crawler import Crawler
32
+ from models import URLStatus, Priority
33
+ import config
34
+
35
+ # Configure logging
36
+ logging.basicConfig(
37
+ level=logging.INFO,
38
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
39
+ )
40
+ logger = logging.getLogger('example')
41
+
42
+
43
+ def log_stats(crawler, interval=5):
44
+ """Log crawler statistics periodically"""
45
+ stats = crawler.stats
46
+ elapsed = time.time() - stats['start_time']
47
+
48
+ logger.info(f"=== Crawler Statistics (after {int(elapsed)}s) ===")
49
+ logger.info(f"Pages crawled: {stats['pages_crawled']}")
50
+ logger.info(f"Pages failed: {stats['pages_failed']}")
51
+ logger.info(f"URLs discovered: {stats['urls_discovered']}")
52
+ logger.info(f"URLs filtered: {stats['urls_filtered']}")
53
+ logger.info(f"Domains crawled: {len(stats['domains_crawled'])}")
54
+ logger.info(f"Frontier size: {crawler.frontier.size()}")
55
+
56
+ # Status code distribution
57
+ status_codes = stats['status_codes']
58
+ if status_codes:
59
+ logger.info("Status code distribution:")
60
+ for status, count in sorted(status_codes.items()):
61
+ logger.info(f" {status}: {count}")
62
+
63
+ # Check if crawler is still running
64
+ if crawler.running and not crawler.stop_event.is_set():
65
+ # Schedule next logging
66
+ timer = threading.Timer(interval, log_stats, args=[crawler, interval])
67
+ timer.daemon = True
68
+ timer.start()
69
+
70
+
71
+ def example_crawl(duration=60, workers=2, async_mode=False):
72
+ """
73
+ Example crawler use
74
+
75
+ Args:
76
+ duration: Duration of the crawl in seconds
77
+ workers: Number of worker threads
78
+ async_mode: Whether to use async mode
79
+ """
80
+ logger.info("Initializing web crawler...")
81
+
82
+ # Initialize crawler
83
+ crawler = Crawler()
84
+
85
+ # Add seed URLs
86
+ seed_urls = [
87
+ 'https://en.wikipedia.org/wiki/Web_crawler',
88
+ 'https://en.wikipedia.org/wiki/Search_engine',
89
+ 'https://en.wikipedia.org/wiki/Web_indexing',
90
+ 'https://python.org',
91
+ 'https://www.example.com'
92
+ ]
93
+ logger.info(f"Adding {len(seed_urls)} seed URLs...")
94
+ crawler.add_seed_urls(seed_urls)
95
+
96
+ # Set up signal handling
97
+ def signal_handler(sig, frame):
98
+ logger.info("Received interrupt signal, stopping crawler")
99
+ crawler.stop()
100
+ sys.exit(0)
101
+
102
+ signal.signal(signal.SIGINT, signal_handler)
103
+
104
+ # Start a thread to log stats periodically
105
+ log_stats(crawler, interval=5)
106
+
107
+ # Start the crawler in a separate thread
108
+ logger.info(f"Starting crawler with {workers} workers (async={async_mode})...")
109
+ crawler_thread = threading.Thread(
110
+ target=crawler.start,
111
+ kwargs={'num_workers': workers, 'async_mode': async_mode}
112
+ )
113
+ crawler_thread.daemon = True
114
+ crawler_thread.start()
115
+
116
+ # Let the crawler run for a while
117
+ logger.info(f"Crawler will run for {duration} seconds...")
118
+ time.sleep(duration // 2)
119
+
120
+ # Pause crawler
121
+ logger.info("Pausing crawler for 5 seconds...")
122
+ crawler.pause()
123
+ time.sleep(5)
124
+
125
+ # Resume crawler
126
+ logger.info("Resuming crawler...")
127
+ crawler.resume()
128
+ time.sleep(duration // 2)
129
+
130
+ # Stop crawler
131
+ logger.info("Stopping crawler...")
132
+ crawler.stop()
133
+
134
+ # Wait for crawler to stop
135
+ crawler_thread.join(timeout=10)
136
+
137
+ # Export crawl data
138
+ export_dir = os.path.join(config.STORAGE_PATH, 'exports')
139
+ os.makedirs(export_dir, exist_ok=True)
140
+ export_file = os.path.join(export_dir, 'example_crawl_results.json')
141
+
142
+ logger.info(f"Exporting crawl data to {export_file}...")
143
+ export_results(crawler, export_file)
144
+
145
+ logger.info("Crawl example completed")
146
+
147
+ # Print summary
148
+ print_summary(crawler)
149
+
150
+
151
+ def export_results(crawler, output_file):
152
+ """
153
+ Export crawler results to a file
154
+
155
+ Args:
156
+ crawler: Crawler instance
157
+ output_file: Output file path
158
+ """
159
+ try:
160
+ # Get MongoDB collections
161
+ pages_collection = crawler.db.pages_collection
162
+ urls_collection = crawler.db.urls_collection
163
+
164
+ # Get data
165
+ pages = list(pages_collection.find({}, {'_id': 0}).limit(1000))
166
+ urls = list(urls_collection.find({}, {'_id': 0}).limit(1000))
167
+
168
+ # Prepare export data
169
+ export_data = {
170
+ 'metadata': {
171
+ 'crawl_duration': time.time() - crawler.stats['start_time'],
172
+ 'pages_crawled': crawler.stats['pages_crawled'],
173
+ 'urls_discovered': crawler.stats['urls_discovered'],
174
+ 'domains_crawled': list(crawler.stats['domains_crawled']),
175
+ 'exported_pages': len(pages),
176
+ 'exported_urls': len(urls),
177
+ 'export_timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
178
+ },
179
+ 'pages': pages,
180
+ 'urls': urls,
181
+ 'stats': crawler.stats
182
+ }
183
+
184
+ # Convert datetime objects to strings for JSON serialization
185
+ export_data = json.loads(json.dumps(export_data, default=str))
186
+
187
+ # Write to file
188
+ with open(output_file, 'w') as f:
189
+ json.dump(export_data, f, indent=2)
190
+
191
+ logger.info(f"Exported data to {output_file}")
192
+ except Exception as e:
193
+ logger.error(f"Error exporting results: {e}")
194
+
195
+
196
+ def print_summary(crawler):
197
+ """
198
+ Print a summary of the crawl
199
+
200
+ Args:
201
+ crawler: Crawler instance
202
+ """
203
+ stats = crawler.stats
204
+
205
+ print("\n=============== CRAWL SUMMARY ===============")
206
+ print(f"Duration: {time.time() - stats['start_time']:.2f} seconds")
207
+ print(f"Pages crawled: {stats['pages_crawled']}")
208
+ print(f"Pages failed: {stats['pages_failed']}")
209
+ print(f"URLs discovered: {stats['urls_discovered']}")
210
+ print(f"URLs filtered: {stats['urls_filtered']}")
211
+ print(f"Domains crawled: {len(stats['domains_crawled'])}")
212
+
213
+ if stats['domains_crawled']:
214
+ print("\nTop domains:")
215
+ domain_counts = {}
216
+ # Count pages per domain
217
+ for page in crawler.db.pages_collection.find({}, {'domain': 1}):
218
+ domain = page.get('domain', 'unknown')
219
+ domain_counts[domain] = domain_counts.get(domain, 0) + 1
220
+
221
+ # Display top domains
222
+ for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
223
+ print(f" {domain}: {count} pages")
224
+
225
+ print("\nHTTP Status Codes:")
226
+ for status, count in sorted(stats['status_codes'].items()):
227
+ print(f" {status}: {count}")
228
+
229
+ print("\nContent Types:")
230
+ for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:5]:
231
+ print(f" {content_type}: {count}")
232
+
233
+ print("=============================================\n")
234
+
235
+
236
+ if __name__ == '__main__':
237
+ # Parse command-line arguments
238
+ args = docopt(__doc__)
239
+
240
+ duration = int(args['--time'])
241
+ workers = int(args['--workers'])
242
+ async_mode = args['--async']
243
+
244
+ try:
245
+ example_crawl(duration, workers, async_mode)
246
+ except KeyboardInterrupt:
247
+ logger.info("Example interrupted by user")
248
+ except Exception as e:
249
+ logger.error(f"Error in example: {e}")
250
+ logger.exception(e)
file_cleanup.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to remove all simple_crawler related files without interactive confirmation
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import logging
9
+ import shutil
10
+
11
+ # Configure logging
12
+ logging.basicConfig(
13
+ level=logging.INFO,
14
+ format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
15
+ )
16
+ logger = logging.getLogger("file_cleanup")
17
+
18
+ def cleanup_files(dry_run=False):
19
+ """List and remove files related to simple_crawler"""
20
+ try:
21
+ crawler_dir = os.path.dirname(os.path.abspath(__file__))
22
+
23
+ # Files directly related to simple_crawler
24
+ simple_crawler_files = [
25
+ os.path.join(crawler_dir, "simple_crawler.py"),
26
+ os.path.join(crawler_dir, "README_SIMPLE.md"),
27
+ os.path.join(crawler_dir, "simple_crawler.log"),
28
+ os.path.join(crawler_dir, "local_config.py")
29
+ ]
30
+
31
+ # Check storage directories
32
+ storage_dir = os.path.join(crawler_dir, "storage")
33
+ if os.path.exists(storage_dir):
34
+ logger.info(f"Adding storage directory to removal list: {storage_dir}")
35
+ simple_crawler_files.append(storage_dir)
36
+
37
+ # Check for any log files with 'crawler' in the name
38
+ for filename in os.listdir(crawler_dir):
39
+ if ('crawler' in filename.lower() or 'crawl' in filename.lower()) and filename.endswith('.log'):
40
+ full_path = os.path.join(crawler_dir, filename)
41
+ if full_path not in simple_crawler_files:
42
+ logger.info(f"Adding log file to removal list: {filename}")
43
+ simple_crawler_files.append(full_path)
44
+
45
+ # List files that will be removed
46
+ logger.info("The following files will be removed:")
47
+ files_to_remove = []
48
+
49
+ for file_path in simple_crawler_files:
50
+ if os.path.exists(file_path):
51
+ logger.info(f" - {file_path}")
52
+ files_to_remove.append(file_path)
53
+ else:
54
+ logger.info(f" - {file_path} (not found)")
55
+
56
+ if dry_run:
57
+ logger.info("Dry run mode - no files will be removed")
58
+ return True
59
+
60
+ # Remove files and directories
61
+ for file_path in files_to_remove:
62
+ if os.path.isdir(file_path):
63
+ logger.info(f"Removing directory: {file_path}")
64
+ shutil.rmtree(file_path)
65
+ else:
66
+ logger.info(f"Removing file: {file_path}")
67
+ os.remove(file_path)
68
+
69
+ logger.info("File cleanup completed")
70
+ return True
71
+
72
+ except Exception as e:
73
+ logger.error(f"Error cleaning up files: {e}")
74
+ return False
75
+
76
+ if __name__ == "__main__":
77
+ print("Simple Crawler File Cleanup")
78
+ print("--------------------------")
79
+ print("This script will remove all files related to simple_crawler")
80
+ print()
81
+
82
+ # Check for dry-run flag
83
+ dry_run = '--dry-run' in sys.argv
84
+
85
+ if '--force' in sys.argv:
86
+ # Non-interactive mode for scripting
87
+ success = cleanup_files(dry_run)
88
+ sys.exit(0 if success else 1)
89
+ else:
90
+ # Interactive mode
91
+ if dry_run:
92
+ print("DRY RUN MODE: Files will be listed but not removed")
93
+
94
+ proceed = input("Do you want to proceed with file cleanup? (y/n): ")
95
+ if proceed.lower() != 'y':
96
+ print("Cleanup cancelled")
97
+ sys.exit(0)
98
+
99
+ success = cleanup_files(dry_run)
100
+ print(f"\nFile cleanup: {'Completed' if success else 'Failed'}")
frontier.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ URL Frontier implementation for web crawler
3
+
4
+ The URL Frontier maintains URLs to be crawled with two main goals:
5
+ 1. Prioritization - Important URLs are crawled first
6
+ 2. Politeness - Avoid overloading web servers with too many requests
7
+ """
8
+
9
+ import time
10
+ import logging
11
+ import heapq
12
+ import pickle
13
+ import threading
14
+ import random
15
+ from typing import Dict, List, Tuple, Optional, Any, Set
16
+ from collections import deque
17
+ import redis
18
+ from redis.exceptions import RedisError
19
+ import mmh3
20
+ import os
21
+ import json
22
+
23
+ from models import URL, Priority, URLStatus
24
+ import config
25
+
26
+ # Import local configuration if available
27
+ try:
28
+ import local_config
29
+ # Override config settings with local settings
30
+ for key in dir(local_config):
31
+ if key.isupper():
32
+ setattr(config, key, getattr(local_config, key))
33
+ logging.info("Loaded local configuration")
34
+ except ImportError:
35
+ pass
36
+
37
+ # Configure logging
38
+ logging.basicConfig(
39
+ level=getattr(logging, config.LOG_LEVEL),
40
+ format=config.LOG_FORMAT
41
+ )
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ class URLFrontier:
46
+ """
47
+ URL Frontier implementation with prioritization and politeness
48
+
49
+ Architecture:
50
+ - Front queues: Priority-based queues
51
+ - Back queues: Host-based queues for politeness
52
+
53
+ This uses Redis for persistent storage to handle large number of URLs
54
+ and enable distributed crawling. In deployment mode, it can also use
55
+ in-memory storage.
56
+ """
57
+
58
+ def __init__(self, redis_client: Optional[redis.Redis] = None, use_memory: bool = False):
59
+ """Initialize the URL Frontier"""
60
+ self.use_memory = use_memory
61
+ if use_memory:
62
+ # Initialize in-memory storage
63
+ self.memory_storage = {
64
+ 'seen_urls': set(),
65
+ 'priority_queues': [[] for _ in range(config.PRIORITY_QUEUE_NUM)],
66
+ 'host_queues': [[] for _ in range(config.HOST_QUEUE_NUM)]
67
+ }
68
+ else:
69
+ # Use Redis
70
+ self.redis = redis_client or redis.from_url(config.REDIS_URI)
71
+
72
+ self.priority_count = config.PRIORITY_QUEUE_NUM # Number of priority queues
73
+ self.host_count = config.HOST_QUEUE_NUM # Number of host queues
74
+ self.url_seen_key = "webcrawler:url_seen" # Bloom filter for seen URLs
75
+ self.priority_queue_key_prefix = "webcrawler:priority_queue:"
76
+ self.host_queue_key_prefix = "webcrawler:host_queue:"
77
+ self.lock = threading.RLock() # Thread-safe operations
78
+
79
+ # Simple mode uses Redis Set instead of Bloom filter
80
+ self.use_simple_mode = getattr(config, 'USE_SIMPLE_URL_SEEN', False)
81
+ logger.info(f"URLFrontier using simple mode: {self.use_simple_mode}")
82
+
83
+ # Ensure directory for checkpoint exists
84
+ if not os.path.exists(config.STORAGE_PATH):
85
+ os.makedirs(config.STORAGE_PATH)
86
+
87
+ # Initialize URL seen storage
88
+ if not self.use_memory:
89
+ self._init_url_seen()
90
+
91
+ def _init_url_seen(self):
92
+ """Initialize URL seen storage based on configuration"""
93
+ try:
94
+ # If using simple mode, just use a Redis set
95
+ if self.use_simple_mode:
96
+ if not self.redis.exists(self.url_seen_key):
97
+ logger.info("Initializing URL seen set")
98
+ self.redis.sadd(self.url_seen_key, "initialized")
99
+ return
100
+
101
+ # Try to use Bloom filter
102
+ if not self.redis.exists(self.url_seen_key):
103
+ logger.info("Initializing URL seen bloom filter")
104
+ try:
105
+ # Use a bloom filter with 100 million items and 0.01 false positive rate
106
+ # This requires approximately 119.5 MB of memory
107
+ self.redis.execute_command("BF.RESERVE", self.url_seen_key, 0.01, 100000000)
108
+ except RedisError as e:
109
+ logger.error(f"Failed to initialize bloom filter: {e}")
110
+ logger.info("Falling back to simple set for URL seen detection")
111
+ self.use_simple_mode = True
112
+ # Initialize a set instead
113
+ if not self.redis.exists(self.url_seen_key):
114
+ self.redis.sadd(self.url_seen_key, "initialized")
115
+ except RedisError as e:
116
+ logger.error(f"Error initializing URL seen: {e}")
117
+ # Fallback to set if bloom filter is not available
118
+ self.use_simple_mode = True
119
+ if not self.redis.exists(self.url_seen_key):
120
+ self.redis.sadd(self.url_seen_key, "initialized")
121
+
122
+ def add_url(self, url_obj: URL) -> bool:
123
+ """Add a URL to the frontier"""
124
+ with self.lock:
125
+ url = url_obj.url
126
+
127
+ # Check if URL has been seen
128
+ if self.use_memory:
129
+ if url in self.memory_storage['seen_urls']:
130
+ return False
131
+ self.memory_storage['seen_urls'].add(url)
132
+ else:
133
+ if self.use_simple_mode:
134
+ if self.redis.sismember(self.url_seen_key, url):
135
+ return False
136
+ self.redis.sadd(self.url_seen_key, url)
137
+ else:
138
+ if self._check_url_seen(url):
139
+ return False
140
+ self._mark_url_seen(url)
141
+
142
+ # Add to priority queue
143
+ priority_index = url_obj.priority.value % self.priority_count
144
+ if self.use_memory:
145
+ self.memory_storage['priority_queues'][priority_index].append(url_obj)
146
+ else:
147
+ priority_key = f"{self.priority_queue_key_prefix}{priority_index}"
148
+ self.redis.rpush(priority_key, url_obj.json())
149
+
150
+ return True
151
+
152
+ def get_next_url(self) -> Optional[URL]:
153
+ """Get the next URL to crawl"""
154
+ with self.lock:
155
+ # Try each priority queue
156
+ for i in range(self.priority_count):
157
+ if self.use_memory:
158
+ queue = self.memory_storage['priority_queues'][i]
159
+ if queue:
160
+ return queue.pop(0)
161
+ else:
162
+ priority_key = f"{self.priority_queue_key_prefix}{i}"
163
+ url_data = self.redis.lpop(priority_key)
164
+ if url_data:
165
+ return URL.parse_raw(url_data)
166
+ return None
167
+
168
+ def _check_url_seen(self, url: str) -> bool:
169
+ """Check if URL has been seen"""
170
+ if self.use_memory:
171
+ return url in self.memory_storage['seen_urls']
172
+ elif self.use_simple_mode:
173
+ return self.redis.sismember(self.url_seen_key, url)
174
+ else:
175
+ # Using Redis Bloom filter
176
+ return bool(self.redis.getbit(self.url_seen_key, self._hash_url(url)))
177
+
178
+ def _mark_url_seen(self, url: str) -> None:
179
+ """Mark URL as seen"""
180
+ if self.use_memory:
181
+ self.memory_storage['seen_urls'].add(url)
182
+ elif self.use_simple_mode:
183
+ self.redis.sadd(self.url_seen_key, url)
184
+ else:
185
+ # Using Redis Bloom filter
186
+ self.redis.setbit(self.url_seen_key, self._hash_url(url), 1)
187
+
188
+ def _hash_url(self, url: str) -> int:
189
+ """Hash URL for Bloom filter"""
190
+ return hash(url) % (1 << 32) # 32-bit hash
191
+
192
+ def size(self) -> int:
193
+ """Get the total size of all queues"""
194
+ if self.use_memory:
195
+ return sum(len(q) for q in self.memory_storage['priority_queues'])
196
+ else:
197
+ total = 0
198
+ for i in range(self.priority_count):
199
+ priority_key = f"{self.priority_queue_key_prefix}{i}"
200
+ total += self.redis.llen(priority_key)
201
+ return total
202
+
203
+ def get_stats(self) -> Dict[str, Any]:
204
+ """Get frontier statistics"""
205
+ stats = {
206
+ "size": self.size(),
207
+ "priority_queues": {},
208
+ "host_queues": {},
209
+ }
210
+
211
+ try:
212
+ # Get priority queue stats
213
+ for priority in range(1, self.priority_count + 1):
214
+ queue_key = f"{self.priority_queue_key_prefix}{priority}"
215
+ stats["priority_queues"][f"priority_{priority}"] = self.redis.llen(queue_key)
216
+
217
+ # Get host queue stats (just count total host queues with items)
218
+ host_queue_count = 0
219
+ for host_id in range(self.host_count):
220
+ queue_key = f"{self.host_queue_key_prefix}{host_id}"
221
+ if self.redis.llen(queue_key) > 0:
222
+ host_queue_count += 1
223
+
224
+ stats["host_queues"]["count_with_items"] = host_queue_count
225
+
226
+ # Add URLs seen count if using simple mode
227
+ if self.use_simple_mode:
228
+ stats["urls_seen"] = self.redis.scard(self.url_seen_key)
229
+
230
+ return stats
231
+ except RedisError as e:
232
+ logger.error(f"Error getting frontier stats: {e}")
233
+ return stats
234
+
235
+ def checkpoint(self) -> bool:
236
+ """Save frontier state"""
237
+ if self.use_memory:
238
+ # No need to checkpoint in-memory storage
239
+ return True
240
+
241
+ try:
242
+ # Save priority queues
243
+ for i in range(self.priority_count):
244
+ priority_key = f"{self.priority_queue_key_prefix}{i}"
245
+ queue_data = []
246
+ while True:
247
+ url_data = self.redis.lpop(priority_key)
248
+ if not url_data:
249
+ break
250
+ queue_data.append(url_data)
251
+
252
+ # Save to file
253
+ checkpoint_file = os.path.join(config.STORAGE_PATH, f"priority_queue_{i}.json")
254
+ with open(checkpoint_file, 'w') as f:
255
+ json.dump(queue_data, f)
256
+
257
+ # Restore queue
258
+ for url_data in reversed(queue_data):
259
+ self.redis.rpush(priority_key, url_data)
260
+
261
+ return True
262
+ except Exception as e:
263
+ logger.error(f"Error creating frontier checkpoint: {e}")
264
+ return False
265
+
266
+ def restore(self) -> bool:
267
+ """Restore frontier state"""
268
+ if self.use_memory:
269
+ # No need to restore in-memory storage
270
+ return True
271
+
272
+ try:
273
+ # Restore priority queues
274
+ for i in range(self.priority_count):
275
+ checkpoint_file = os.path.join(config.STORAGE_PATH, f"priority_queue_{i}.json")
276
+ if os.path.exists(checkpoint_file):
277
+ with open(checkpoint_file, 'r') as f:
278
+ queue_data = json.load(f)
279
+
280
+ # Clear existing queue
281
+ priority_key = f"{self.priority_queue_key_prefix}{i}"
282
+ self.redis.delete(priority_key)
283
+
284
+ # Restore queue
285
+ for url_data in queue_data:
286
+ self.redis.rpush(priority_key, url_data)
287
+
288
+ return True
289
+ except Exception as e:
290
+ logger.error(f"Error restoring frontier checkpoint: {e}")
291
+ return False
292
+
293
+ def clear(self) -> bool:
294
+ """
295
+ Clear all queues in the frontier
296
+
297
+ Returns:
298
+ bool: True if successful, False otherwise
299
+ """
300
+ try:
301
+ # Delete all queue keys
302
+ keys = []
303
+ for priority in range(1, self.priority_count + 1):
304
+ keys.append(f"{self.priority_queue_key_prefix}{priority}")
305
+
306
+ for host_id in range(self.host_count):
307
+ keys.append(f"{self.host_queue_key_prefix}{host_id}")
308
+
309
+ if keys:
310
+ self.redis.delete(*keys)
311
+
312
+ # Reset URL seen filter (optional)
313
+ self.redis.delete(self.url_seen_key)
314
+
315
+ logger.info("Frontier cleared")
316
+ return True
317
+ except Exception as e:
318
+ logger.error(f"Error clearing frontier: {e}")
319
+ return False
models.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data models for the web crawler
3
+ """
4
+
5
+ import time
6
+ import hashlib
7
+ import tldextract
8
+ from urllib.parse import urlparse, urljoin, urlunparse
9
+ from datetime import datetime
10
+ from typing import Dict, List, Any, Optional, Set, Tuple
11
+ from pydantic import BaseModel, Field, HttpUrl, validator
12
+ from enum import Enum
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class URLStatus(str, Enum):
19
+ """Status of a URL in the crawl process"""
20
+ PENDING = "pending" # Not yet processed
21
+ IN_PROGRESS = "in_progress" # Currently being processed
22
+ COMPLETED = "completed" # Successfully processed
23
+ FAILED = "failed" # Failed to process
24
+ FILTERED = "filtered" # Filtered out based on rules
25
+ ROBOTSTXT_EXCLUDED = "robotstxt_excluded" # Excluded by robots.txt
26
+
27
+
28
+ class Priority(int, Enum):
29
+ """Priority levels for URLs"""
30
+ VERY_HIGH = 1
31
+ HIGH = 2
32
+ MEDIUM = 3
33
+ LOW = 4
34
+ VERY_LOW = 5
35
+
36
+
37
+ class URL(BaseModel):
38
+ """URL model with metadata for crawling"""
39
+ url: str
40
+ normalized_url: str = "" # Normalized version of the URL
41
+ domain: str = "" # Domain extracted from the URL
42
+ depth: int = 0 # Depth from seed URL
43
+ discovered_at: datetime = Field(default_factory=datetime.now)
44
+ last_crawled: Optional[datetime] = None
45
+ completed_at: Optional[datetime] = None # When the URL was completed/failed
46
+ status: URLStatus = URLStatus.PENDING
47
+ priority: Priority = Priority.MEDIUM
48
+ parent_url: Optional[str] = None # URL that led to this URL
49
+ retries: int = 0 # Number of times retried
50
+ error: Optional[str] = None # Error message if failed
51
+ metadata: Dict[str, Any] = Field(default_factory=dict) # Additional metadata
52
+
53
+ @validator("normalized_url", pre=True, always=True)
54
+ def set_normalized_url(cls, v, values):
55
+ """Normalize the URL if not already set"""
56
+ if not v and "url" in values:
57
+ return normalize_url(values["url"])
58
+ return v
59
+
60
+ @validator("domain", pre=True, always=True)
61
+ def set_domain(cls, v, values):
62
+ """Extract domain from URL if not already set"""
63
+ if not v and "url" in values:
64
+ parsed = tldextract.extract(values["url"])
65
+ return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
66
+ return v
67
+
68
+
69
+ class RobotsInfo(BaseModel):
70
+ """Information from robots.txt for a domain"""
71
+ domain: str
72
+ allowed: bool = True # Whether crawling is allowed
73
+ crawl_delay: Optional[float] = None # Crawl delay in seconds
74
+ last_fetched: datetime = Field(default_factory=datetime.now)
75
+ user_agents: Dict[str, Dict[str, Any]] = Field(default_factory=dict) # Info per user agent
76
+ status_code: Optional[int] = None # HTTP status code when fetching robots.txt
77
+
78
+
79
+ class Page(BaseModel):
80
+ """Web page model with content and metadata"""
81
+ url: str
82
+ status_code: int
83
+ content: str # HTML content
84
+ content_type: str
85
+ content_length: int
86
+ content_hash: str # Hash of the content for duplicate detection
87
+ headers: Dict[str, str] = Field(default_factory=dict)
88
+ links: List[str] = Field(default_factory=list) # Links extracted from the page
89
+ crawled_at: datetime = Field(default_factory=datetime.now)
90
+ redirect_url: Optional[str] = None # URL after redirects
91
+ elapsed_time: float = 0.0 # Time taken to fetch the page
92
+ is_duplicate: bool = False # Whether this is duplicate content
93
+ metadata: Dict[str, Any] = Field(default_factory=dict) # Additional metadata
94
+
95
+
96
+ class DomainStats(BaseModel):
97
+ """Statistics for a domain"""
98
+ domain: str
99
+ pages_crawled: int = 0
100
+ successful_crawls: int = 0
101
+ failed_crawls: int = 0
102
+ last_crawled: Optional[datetime] = None
103
+ robots_info: Optional[RobotsInfo] = None
104
+ crawl_times: List[float] = Field(default_factory=list) # Recent crawl times
105
+ errors: Dict[int, int] = Field(default_factory=dict) # Status code counts for errors
106
+
107
+
108
+ def normalize_url(url: str) -> str:
109
+ """
110
+ Normalize a URL by:
111
+ 1. Converting to lowercase
112
+ 2. Removing fragments
113
+ 3. Removing default ports
114
+ 4. Sorting query parameters
115
+ 5. Removing trailing slashes
116
+ 6. Adding scheme if missing
117
+ """
118
+ try:
119
+ # Parse URL
120
+ parsed = urlparse(url)
121
+
122
+ # Add scheme if missing
123
+ if not parsed.scheme:
124
+ url = 'http://' + url
125
+ parsed = urlparse(url)
126
+
127
+ # Get domain and path
128
+ domain = parsed.netloc.lower()
129
+ path = parsed.path
130
+
131
+ # Remove default ports
132
+ if ':' in domain:
133
+ domain_parts = domain.split(':')
134
+ if (parsed.scheme == 'http' and domain_parts[1] == '80') or \
135
+ (parsed.scheme == 'https' and domain_parts[1] == '443'):
136
+ domain = domain_parts[0]
137
+
138
+ # Sort query parameters
139
+ query = parsed.query
140
+ if query:
141
+ query_params = sorted(query.split('&'))
142
+ query = '&'.join(query_params)
143
+
144
+ # Remove trailing slashes from path
145
+ while path.endswith('/') and len(path) > 1:
146
+ path = path[:-1]
147
+
148
+ # Add leading slash if missing
149
+ if not path:
150
+ path = '/'
151
+
152
+ # Reconstruct URL
153
+ normalized = f"{parsed.scheme}://{domain}{path}"
154
+ if query:
155
+ normalized += f"?{query}"
156
+
157
+ logger.debug(f"Normalized URL: {url} -> {normalized}")
158
+ return normalized
159
+
160
+ except Exception as e:
161
+ logger.error(f"Error normalizing URL {url}: {e}")
162
+ return url
163
+
164
+
165
+ def calculate_content_hash(content: str) -> str:
166
+ """Calculate hash of content for duplicate detection"""
167
+ return hashlib.md5(content.encode('utf-8')).hexdigest()
mongo_cleanup.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to remove all web crawler data from MongoDB without interactive confirmation
4
+ """
5
+
6
+ import logging
7
+ from pymongo import MongoClient
8
+ import sys
9
+
10
+ # Configure logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
14
+ )
15
+ logger = logging.getLogger("mongo_cleanup")
16
+
17
+ def cleanup_mongodb():
18
+ """Remove all web crawler data from MongoDB"""
19
+ try:
20
+ # Connect to MongoDB
21
+ logger.info("Connecting to MongoDB...")
22
+ client = MongoClient("mongodb://localhost:27017/")
23
+
24
+ # Access crawler database
25
+ db = client["crawler"]
26
+
27
+ # List and drop all collections
28
+ collections = db.list_collection_names()
29
+
30
+ if not collections:
31
+ logger.info("No collections found in the crawler database")
32
+ else:
33
+ logger.info(f"Found {len(collections)} collections to drop: {collections}")
34
+
35
+ for collection in collections:
36
+ logger.info(f"Dropping collection: {collection}")
37
+ db[collection].drop()
38
+
39
+ logger.info("All crawler collections dropped successfully")
40
+
41
+ # Optionally drop the entire database
42
+ logger.info("Dropping entire crawler database")
43
+ client.drop_database("crawler")
44
+
45
+ # Check for any URLs collection in other databases that might be related
46
+ all_dbs = client.list_database_names()
47
+ for db_name in all_dbs:
48
+ if db_name in ['admin', 'config', 'local']:
49
+ continue
50
+
51
+ db = client[db_name]
52
+ if 'urls' in db.list_collection_names() or 'pages' in db.list_collection_names():
53
+ logger.info(f"Found crawler-related collections in database: {db_name}")
54
+
55
+ # Ask for confirmation before dropping collections in other databases
56
+ for collection in ['urls', 'pages', 'domains', 'stats']:
57
+ if collection in db.list_collection_names():
58
+ logger.info(f"Dropping collection {db_name}.{collection}")
59
+ db[collection].drop()
60
+
61
+ logger.info("MongoDB cleanup completed successfully")
62
+ return True
63
+
64
+ except Exception as e:
65
+ logger.error(f"Error cleaning up MongoDB: {e}")
66
+ return False
67
+
68
+ if __name__ == "__main__":
69
+ print("MongoDB Crawler Data Cleanup")
70
+ print("--------------------------")
71
+ print("This script will remove all web crawler collections from MongoDB")
72
+ print()
73
+
74
+ if len(sys.argv) > 1 and sys.argv[1] == '--force':
75
+ # Non-interactive mode for scripting
76
+ success = cleanup_mongodb()
77
+ sys.exit(0 if success else 1)
78
+ else:
79
+ # Interactive mode
80
+ proceed = input("Do you want to proceed with MongoDB cleanup? (y/n): ")
81
+ if proceed.lower() != 'y':
82
+ print("Cleanup cancelled")
83
+ sys.exit(0)
84
+
85
+ success = cleanup_mongodb()
86
+ print(f"\nMongoDB cleanup: {'Completed' if success else 'Failed'}")
parser.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTML Parser and URL Extractor component for web crawler
3
+ """
4
+
5
+ import logging
6
+ import re
7
+ from typing import Dict, List, Set, Tuple, Optional, Any
8
+ from urllib.parse import urlparse, urljoin, unquote
9
+ from bs4 import BeautifulSoup
10
+ import tldextract
11
+ import hashlib
12
+ import os
13
+
14
+ from models import URL, Page, Priority, normalize_url
15
+ import config
16
+
17
+ # Configure logging
18
+ logging.basicConfig(
19
+ level=getattr(logging, config.LOG_LEVEL),
20
+ format=config.LOG_FORMAT
21
+ )
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class HTMLParser:
26
+ """
27
+ Parses HTML content and extracts URLs and other information
28
+ """
29
+
30
+ def __init__(self):
31
+ """Initialize HTML parser"""
32
+ # Compile URL filter regex patterns for efficiency
33
+ self.url_filters = [re.compile(pattern) for pattern in config.URL_FILTERS]
34
+
35
+ def parse(self, page: Page, base_url: Optional[str] = None) -> Tuple[List[str], Dict[str, Any]]:
36
+ """
37
+ Parse HTML content and extract URLs and metadata
38
+
39
+ Args:
40
+ page: Page object containing HTML content
41
+ base_url: Base URL for resolving relative links (defaults to page URL)
42
+
43
+ Returns:
44
+ Tuple of (extracted URLs, metadata)
45
+ """
46
+ if not page or not page.content:
47
+ return [], {}
48
+
49
+ # Use page URL as base URL if not provided
50
+ if not base_url:
51
+ base_url = page.url
52
+
53
+ # Parse HTML content
54
+ soup = BeautifulSoup(page.content, 'html.parser')
55
+
56
+ # Extract URLs
57
+ urls = self._extract_urls(soup, base_url)
58
+
59
+ # Extract metadata
60
+ metadata = self._extract_metadata(soup)
61
+
62
+ return urls, metadata
63
+
64
+ def _extract_urls(self, soup: BeautifulSoup, base_url: str) -> List[str]:
65
+ """
66
+ Extract and normalize URLs from HTML content
67
+
68
+ Args:
69
+ soup: BeautifulSoup object
70
+ base_url: Base URL for resolving relative links
71
+
72
+ Returns:
73
+ List of normalized URLs
74
+ """
75
+ urls = set()
76
+ all_urls = set() # Track all URLs before filtering
77
+ filtered_urls = set() # Track filtered URLs
78
+
79
+ logger.debug(f"Extracting URLs from page: {base_url}")
80
+
81
+ # Extract URLs from <a> tags
82
+ for link in soup.find_all('a', href=True):
83
+ href = link['href'].strip()
84
+ if href and not href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
85
+ # Resolve relative URLs
86
+ try:
87
+ absolute_url = urljoin(base_url, href)
88
+ all_urls.add(absolute_url)
89
+ # Normalize URL
90
+ normalized_url = normalize_url(absolute_url)
91
+ # Apply URL filters
92
+ if self._should_allow_url(normalized_url):
93
+ urls.add(normalized_url)
94
+ else:
95
+ filtered_urls.add(normalized_url)
96
+ except Exception as e:
97
+ logger.debug(f"Error processing URL {href}: {e}")
98
+
99
+ # Extract URLs from other elements like <iframe>, <frame>, <img>, etc.
100
+ for tag_name, attr in [('frame', 'src'), ('iframe', 'src'), ('img', 'src'),
101
+ ('link', 'href'), ('script', 'src'), ('area', 'href')]:
102
+ for tag in soup.find_all(tag_name, attrs={attr: True}):
103
+ url = tag[attr].strip()
104
+ if url and not url.startswith(('#', 'javascript:', 'data:', 'mailto:', 'tel:')):
105
+ try:
106
+ absolute_url = urljoin(base_url, url)
107
+ all_urls.add(absolute_url)
108
+ normalized_url = normalize_url(absolute_url)
109
+ if self._should_allow_url(normalized_url):
110
+ urls.add(normalized_url)
111
+ else:
112
+ filtered_urls.add(normalized_url)
113
+ except Exception as e:
114
+ logger.debug(f"Error processing URL {url}: {e}")
115
+
116
+ # Log statistics
117
+ logger.debug(f"Found {len(all_urls)} total URLs")
118
+ logger.debug(f"Filtered {len(filtered_urls)} URLs")
119
+ logger.debug(f"Accepted {len(urls)} URLs")
120
+
121
+ # Log some example filtered URLs for debugging
122
+ if filtered_urls:
123
+ sample_filtered = list(filtered_urls)[:5]
124
+ logger.debug(f"Sample filtered URLs: {sample_filtered}")
125
+
126
+ # Return list of unique URLs
127
+ return list(urls)
128
+
129
+ def _should_allow_url(self, url: str) -> bool:
130
+ """
131
+ Check if URL should be allowed based on filters
132
+
133
+ Args:
134
+ url: URL to check
135
+
136
+ Returns:
137
+ True if URL should be allowed, False otherwise
138
+ """
139
+ try:
140
+ parsed = urlparse(url)
141
+
142
+ # Check scheme
143
+ if parsed.scheme not in config.ALLOWED_SCHEMES:
144
+ logger.debug(f"URL filtered - invalid scheme: {url}")
145
+ return False
146
+
147
+ # Check domain restrictions
148
+ domain = self._extract_domain(url)
149
+
150
+ # Check allowed domains if set
151
+ if config.ALLOWED_DOMAINS and domain not in config.ALLOWED_DOMAINS:
152
+ logger.debug(f"URL filtered - domain not allowed: {url} (domain: {domain}, allowed: {config.ALLOWED_DOMAINS})")
153
+ return False
154
+
155
+ # Check excluded domains
156
+ if domain in config.EXCLUDED_DOMAINS:
157
+ logger.debug(f"URL filtered - domain excluded: {url}")
158
+ return False
159
+
160
+ # Check URL filters
161
+ for pattern in self.url_filters:
162
+ if pattern.match(url):
163
+ logger.debug(f"URL filtered - pattern match: {url}")
164
+ return False
165
+
166
+ return True
167
+
168
+ except Exception as e:
169
+ logger.debug(f"Error checking URL {url}: {e}")
170
+ return False
171
+
172
+ def _extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
173
+ """
174
+ Extract metadata from HTML content
175
+
176
+ Args:
177
+ soup: BeautifulSoup object
178
+
179
+ Returns:
180
+ Dictionary of metadata
181
+ """
182
+ metadata = {}
183
+
184
+ # Extract title
185
+ title_tag = soup.find('title')
186
+ if title_tag and title_tag.string:
187
+ metadata['title'] = title_tag.string.strip()
188
+
189
+ # Extract meta description
190
+ description_tag = soup.find('meta', attrs={'name': 'description'})
191
+ if description_tag and description_tag.get('content'):
192
+ metadata['description'] = description_tag['content'].strip()
193
+
194
+ # Extract meta keywords
195
+ keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
196
+ if keywords_tag and keywords_tag.get('content'):
197
+ metadata['keywords'] = [k.strip() for k in keywords_tag['content'].split(',')]
198
+
199
+ # Extract canonical URL
200
+ canonical_tag = soup.find('link', attrs={'rel': 'canonical'})
201
+ if canonical_tag and canonical_tag.get('href'):
202
+ metadata['canonical_url'] = canonical_tag['href'].strip()
203
+
204
+ # Extract robots meta
205
+ robots_tag = soup.find('meta', attrs={'name': 'robots'})
206
+ if robots_tag and robots_tag.get('content'):
207
+ metadata['robots'] = robots_tag['content'].strip()
208
+
209
+ # Extract Open Graph metadata
210
+ og_metadata = {}
211
+ for meta_tag in soup.find_all('meta', attrs={'property': re.compile('^og:')}):
212
+ if meta_tag.get('content'):
213
+ property_name = meta_tag['property'][3:] # Remove 'og:' prefix
214
+ og_metadata[property_name] = meta_tag['content'].strip()
215
+
216
+ if og_metadata:
217
+ metadata['open_graph'] = og_metadata
218
+
219
+ # Extract Twitter Card metadata
220
+ twitter_metadata = {}
221
+ for meta_tag in soup.find_all('meta', attrs={'name': re.compile('^twitter:')}):
222
+ if meta_tag.get('content'):
223
+ property_name = meta_tag['name'][8:] # Remove 'twitter:' prefix
224
+ twitter_metadata[property_name] = meta_tag['content'].strip()
225
+
226
+ if twitter_metadata:
227
+ metadata['twitter_card'] = twitter_metadata
228
+
229
+ # Extract schema.org structured data (JSON-LD)
230
+ schema_metadata = []
231
+ for script in soup.find_all('script', attrs={'type': 'application/ld+json'}):
232
+ if script.string:
233
+ try:
234
+ import json
235
+ schema_data = json.loads(script.string)
236
+ schema_metadata.append(schema_data)
237
+ except Exception as e:
238
+ logger.debug(f"Error parsing JSON-LD: {e}")
239
+
240
+ if schema_metadata:
241
+ metadata['structured_data'] = schema_metadata
242
+
243
+ # Extract text content statistics
244
+ text_content = soup.get_text(separator=' ', strip=True)
245
+ if text_content:
246
+ word_count = len(text_content.split())
247
+ metadata['word_count'] = word_count
248
+ metadata['text_length'] = len(text_content)
249
+
250
+ return metadata
251
+
252
+ def _extract_domain(self, url: str) -> str:
253
+ """Extract domain from URL"""
254
+ parsed = tldextract.extract(url)
255
+ return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
256
+
257
+ def calculate_priority(self, url: str, metadata: Dict[str, Any]) -> Priority:
258
+ """
259
+ Calculate priority for a URL based on various factors
260
+
261
+ Args:
262
+ url: URL to calculate priority for
263
+ metadata: Metadata extracted from the page
264
+
265
+ Returns:
266
+ Priority enum value
267
+ """
268
+ # Default priority
269
+ priority = Priority.MEDIUM
270
+
271
+ try:
272
+ # Extract path depth
273
+ parsed = urlparse(url)
274
+ path = parsed.path
275
+ depth = len([p for p in path.split('/') if p])
276
+
277
+ # Prioritize URLs with shorter paths
278
+ if depth <= 1:
279
+ priority = Priority.HIGH
280
+ elif depth <= 3:
281
+ priority = Priority.MEDIUM
282
+ else:
283
+ priority = Priority.LOW
284
+
285
+ # Prioritize URLs with certain keywords in path
286
+ if re.search(r'(article|blog|news|post)', path, re.IGNORECASE):
287
+ priority = Priority.HIGH
288
+
289
+ # Deprioritize URLs with pagination patterns
290
+ if re.search(r'(page|p|pg)=\d+', url, re.IGNORECASE):
291
+ priority = Priority.LOW
292
+
293
+ # Check metadata
294
+ if metadata:
295
+ # Prioritize based on title
296
+ title = metadata.get('title', '')
297
+ if title and len(title) > 10:
298
+ priority = min(priority, Priority.MEDIUM) # Raise priority if it's lower
299
+
300
+ # Prioritize based on description
301
+ description = metadata.get('description', '')
302
+ if description and len(description) > 50:
303
+ priority = min(priority, Priority.MEDIUM) # Raise priority if it's lower
304
+
305
+ # Prioritize based on word count
306
+ word_count = metadata.get('word_count', 0)
307
+ if word_count > 1000:
308
+ priority = min(priority, Priority.HIGH) # High priority for content-rich pages
309
+ elif word_count > 500:
310
+ priority = min(priority, Priority.MEDIUM)
311
+
312
+ return priority
313
+
314
+ except Exception as e:
315
+ logger.debug(f"Error calculating priority for URL {url}: {e}")
316
+ return Priority.MEDIUM
requirements.txt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ requests==2.31.0
3
+ beautifulsoup4==4.12.3
4
+ aiohttp==3.9.3
5
+ lxml==4.9.2
6
+ html5lib==1.1
7
+ pydantic==1.10.7
8
+ pymongo==4.6.1
9
+ redis==5.0.1
10
+ boto3==1.26.123
11
+ docopt==0.6.2
12
+
13
+ # URL and DNS handling
14
+ dnspython==2.3.0
15
+ tldextract==5.1.1
16
+ validators==0.20.0
17
+ robotexclusionrulesparser==1.7.1
18
+ urllib3==1.26.15
19
+
20
+ # Monitoring and metrics
21
+ prometheus-client==0.16.0
22
+
23
+ # HTML processing
24
+ html2text==2020.1.16
25
+
26
+ # Async and concurrency
27
+ anyio==3.6.2
28
+ asyncio==3.4.3
29
+
30
+ # Utilities
31
+ python-dateutil==2.8.2
32
+ pytz==2023.3
33
+ retry==0.9.2
34
+ cryptography==40.0.1
35
+ cachetools==5.3.0
36
+
37
+ # Added from the code block
38
+ openai==1.12.0
39
+ gradio==4.16.0
40
+ chardet==5.2.0
41
+
42
+ # Dotenv
43
+ python-dotenv
robots.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Robots.txt handler for web crawler
3
+ """
4
+
5
+ import time
6
+ import logging
7
+ import requests
8
+ from urllib.parse import urlparse, urljoin
9
+ from typing import Dict, Optional, Tuple
10
+ import tldextract
11
+ from datetime import datetime, timedelta
12
+ from cachetools import TTLCache
13
+ import robotexclusionrulesparser
14
+
15
+ from models import RobotsInfo
16
+ import config
17
+
18
+ # Import local configuration if available
19
+ try:
20
+ import local_config
21
+ # Override config settings with local settings
22
+ for key in dir(local_config):
23
+ if key.isupper():
24
+ setattr(config, key, getattr(local_config, key))
25
+ logging.info("Loaded local configuration")
26
+ except ImportError:
27
+ pass
28
+
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=getattr(logging, config.LOG_LEVEL),
32
+ format=config.LOG_FORMAT
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class RobotsHandler:
38
+ """Handles robots.txt fetching and parsing"""
39
+
40
+ def __init__(self, user_agent: Optional[str] = None, cache_size: int = 1000, cache_ttl: int = 3600):
41
+ """
42
+ Initialize robots handler
43
+
44
+ Args:
45
+ user_agent: User agent to use when fetching robots.txt
46
+ cache_size: Maximum number of robots.txt rules to cache
47
+ cache_ttl: Time to live for cache entries in seconds
48
+ """
49
+ self.user_agent = user_agent or config.USER_AGENT
50
+ self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
51
+
52
+ # Cache of robots.txt rules for domains
53
+ self.robots_cache = TTLCache(maxsize=cache_size, ttl=cache_ttl)
54
+
55
+ # Create request session
56
+ self.session = requests.Session()
57
+ self.session.headers.update({'User-Agent': self.user_agent})
58
+
59
+ def can_fetch(self, url: str) -> Tuple[bool, Optional[float]]:
60
+ """
61
+ Check if URL can be fetched according to robots.txt
62
+
63
+ Args:
64
+ url: URL to check
65
+
66
+ Returns:
67
+ Tuple of (can_fetch, crawl_delay), where crawl_delay is in seconds
68
+ """
69
+ try:
70
+ parsed = urlparse(url)
71
+ base_url = f"{parsed.scheme}://{parsed.netloc}"
72
+ domain = self._get_domain(url)
73
+
74
+ # Check if robots info is in cache
75
+ robots_info = self._get_robots_info(base_url, domain)
76
+
77
+ # Check if allowed
78
+ path = parsed.path or "/"
79
+ allowed = robots_info.allowed
80
+ if allowed:
81
+ allowed = self.parser.is_allowed(self.user_agent, path)
82
+
83
+ # Get crawl delay
84
+ crawl_delay = robots_info.crawl_delay
85
+ if not crawl_delay and hasattr(self.parser, 'get_crawl_delay'):
86
+ try:
87
+ crawl_delay = float(self.parser.get_crawl_delay(self.user_agent) or 0)
88
+ except:
89
+ crawl_delay = 0
90
+
91
+ return allowed, crawl_delay
92
+
93
+ except Exception as e:
94
+ logger.warning(f"Error checking robots.txt for {url}: {e}")
95
+ # In case of error, assume allowed
96
+ return True, None
97
+
98
+ def _get_robots_info(self, base_url: str, domain: str) -> RobotsInfo:
99
+ """
100
+ Get robots.txt info for a domain
101
+
102
+ Args:
103
+ base_url: Base URL of the domain
104
+ domain: Domain name
105
+
106
+ Returns:
107
+ RobotsInfo object
108
+ """
109
+ # Check if in cache
110
+ if domain in self.robots_cache:
111
+ return self.robots_cache[domain]
112
+
113
+ # Fetch robots.txt
114
+ robots_url = urljoin(base_url, "/robots.txt")
115
+ try:
116
+ response = self.session.get(
117
+ robots_url,
118
+ timeout=config.CRAWL_TIMEOUT,
119
+ allow_redirects=True
120
+ )
121
+
122
+ status_code = response.status_code
123
+
124
+ # If robots.txt exists
125
+ if status_code == 200:
126
+ # Parse robots.txt
127
+ self.parser.parse(response.text)
128
+
129
+ # Create simpler user agents info that doesn't depend on get_user_agents
130
+ user_agents = {}
131
+ # Just store info for our specific user agent
132
+ crawl_delay = None
133
+ if hasattr(self.parser, 'get_crawl_delay'):
134
+ try:
135
+ crawl_delay = self.parser.get_crawl_delay(self.user_agent)
136
+ except:
137
+ crawl_delay = None
138
+
139
+ user_agents[self.user_agent] = {
140
+ 'crawl_delay': crawl_delay
141
+ }
142
+
143
+ # Create robots info
144
+ robots_info = RobotsInfo(
145
+ domain=domain,
146
+ allowed=True,
147
+ crawl_delay=crawl_delay,
148
+ last_fetched=datetime.now(),
149
+ user_agents=user_agents,
150
+ status_code=status_code
151
+ )
152
+ else:
153
+ # If no robots.txt or error, assume allowed
154
+ self.parser.parse("") # Parse empty robots.txt
155
+ robots_info = RobotsInfo(
156
+ domain=domain,
157
+ allowed=True,
158
+ crawl_delay=None,
159
+ last_fetched=datetime.now(),
160
+ user_agents={},
161
+ status_code=status_code
162
+ )
163
+
164
+ # Cache robots info
165
+ self.robots_cache[domain] = robots_info
166
+ return robots_info
167
+
168
+ except requests.RequestException as e:
169
+ logger.warning(f"Error fetching robots.txt from {robots_url}: {e}")
170
+
171
+ # In case of error, assume allowed
172
+ self.parser.parse("") # Parse empty robots.txt
173
+ robots_info = RobotsInfo(
174
+ domain=domain,
175
+ allowed=True,
176
+ crawl_delay=None,
177
+ last_fetched=datetime.now(),
178
+ user_agents={},
179
+ status_code=None
180
+ )
181
+
182
+ # Cache robots info
183
+ self.robots_cache[domain] = robots_info
184
+ return robots_info
185
+
186
+ def _get_domain(self, url: str) -> str:
187
+ """Extract domain from URL"""
188
+ parsed = tldextract.extract(url)
189
+ return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
190
+
191
+ def clear_cache(self) -> None:
192
+ """Clear the robots.txt cache"""
193
+ self.robots_cache.clear()
194
+
195
+ def update_cache(self, domain: str) -> None:
196
+ """
197
+ Force update of a domain's robots.txt in the cache
198
+
199
+ Args:
200
+ domain: Domain to update
201
+ """
202
+ if domain in self.robots_cache:
203
+ del self.robots_cache[domain]
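For reference, here is a minimal usage sketch for the handler defined above. It is a sketch only: the `from robots import RobotsHandler` import path and the example URL are assumptions based on this file, not part of the commit.

```python
# Sketch: checking a URL against robots.txt before fetching it.
# Assumes robots.py is on the import path and config.USER_AGENT /
# config.CRAWL_TIMEOUT are configured as in the module above.
from robots import RobotsHandler

handler = RobotsHandler(user_agent="MyCrawler/1.0", cache_size=500, cache_ttl=1800)

allowed, crawl_delay = handler.can_fetch("https://example.com/some/page")
if allowed:
    # Honour any crawl delay advertised by the site before downloading
    print(f"Fetch permitted; crawl delay: {crawl_delay or 0}s")
else:
    print("Blocked by robots.txt")
```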
run_crawler.py ADDED
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Main script to run the web crawler with command line arguments
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import time
9
+ import logging
10
+ import argparse
11
+ import signal
12
+ from urllib.parse import urlparse
13
+
14
+ # Add the current directory to path if needed
15
+ script_dir = os.path.dirname(os.path.abspath(__file__))
16
+ if script_dir not in sys.path:
17
+ sys.path.insert(0, script_dir)
18
+
19
+ # Configure logging - do this first
20
+ logging.basicConfig(
21
+ level=logging.INFO,
22
+ format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
23
+ handlers=[
24
+ logging.StreamHandler(sys.stdout),
25
+ logging.FileHandler(os.path.join(script_dir, 'crawler.log'))
26
+ ]
27
+ )
28
+ logger = logging.getLogger("run_crawler")
29
+
30
+ # Now import the crawler components
31
+ logger.info("Importing crawler modules...")
32
+ try:
33
+ from crawler import Crawler
34
+ from models import Priority
35
+ logger.info("Successfully imported crawler modules")
36
+ except Exception as e:
37
+ logger.error(f"Error importing crawler modules: {e}", exc_info=True)
38
+ sys.exit(1)
39
+
40
+ def parse_arguments():
41
+ """Parse command line arguments"""
42
+ parser = argparse.ArgumentParser(description='Run the web crawler with custom settings')
43
+
44
+ parser.add_argument('--seed', nargs='+', metavar='URL',
45
+ help='One or more seed URLs to start crawling')
46
+
47
+ parser.add_argument('--depth', type=int, default=None,
48
+ help='Maximum crawl depth')
49
+
50
+ parser.add_argument('--workers', type=int, default=None,
51
+ help='Number of worker threads')
52
+
53
+ parser.add_argument('--delay', type=float, default=None,
54
+ help='Delay between requests to the same domain (in seconds)')
55
+
56
+ parser.add_argument('--respect-robots', dest='respect_robots', action='store_true', default=None,
57
+ help='Respect robots.txt rules')
58
+
59
+ parser.add_argument('--ignore-robots', dest='respect_robots', action='store_false',
60
+ help='Ignore robots.txt rules')
61
+
62
+ parser.add_argument('--user-agent', type=str, default=None,
63
+ help='User agent to use for requests')
64
+
65
+ parser.add_argument('--async', dest='async_mode', action='store_true',
66
+ help='Use async mode for requests')
67
+
68
+ parser.add_argument('--domain-filter', type=str, default=None,
69
+ help='Only crawl URLs that match this domain')
70
+
71
+ parser.add_argument('--reset-db', action='store_true',
72
+ help='Reset MongoDB and flush Redis data before starting')
73
+
74
+ parser.add_argument('--verbose', action='store_true',
75
+ help='Enable verbose logging')
76
+
77
+ args = parser.parse_args()
78
+
79
+ # Set log level based on verbose flag
80
+ if args.verbose:
81
+ logger.setLevel(logging.DEBUG)
82
+ logger.debug("Verbose logging enabled")
83
+
84
+ return args
85
+
86
+ def reset_databases():
87
+ """Reset MongoDB and flush Redis data"""
88
+ success = True
89
+
90
+ # Reset MongoDB
91
+ try:
92
+ logger.info("Starting MongoDB cleanup...")
93
+ from mongo_cleanup import cleanup_mongodb
94
+ mongo_success = cleanup_mongodb()
95
+ if not mongo_success:
96
+ logger.warning("MongoDB cleanup may not have been completely successful")
97
+ success = False
98
+ else:
99
+ logger.info("MongoDB cleanup completed successfully")
100
+ except Exception as e:
101
+ logger.error(f"Error cleaning up MongoDB: {e}", exc_info=True)
102
+ success = False
103
+
104
+ # Flush Redis
105
+ try:
106
+ logger.info("Starting Redis flush...")
107
+ import redis
108
+ logger.debug("Connecting to Redis to flush data...")
109
+
110
+ # Set a timeout for Redis connection
111
+ r = redis.Redis(host='localhost', port=6379, db=0, socket_timeout=5)
112
+
113
+ # Check if Redis is available
114
+ try:
115
+ logger.debug("Testing Redis connection...")
116
+ ping_result = r.ping()
117
+ logger.debug(f"Redis ping result: {ping_result}")
118
+
119
+ # If connection works, flush all data
120
+ logger.info("Flushing all Redis data...")
121
+ result = r.flushall()
122
+ logger.info(f"Redis flush result: {result}")
123
+ except redis.ConnectionError as e:
124
+ logger.error(f"Redis connection error: {e}")
125
+ success = False
126
+ except Exception as e:
127
+ logger.error(f"Error flushing Redis: {e}", exc_info=True)
128
+ success = False
129
+
130
+ return success
131
+
132
+ def setup_signal_handlers(crawler_instance):
133
+ """Setup signal handlers for graceful shutdown"""
134
+ def signal_handler(sig, frame):
135
+ logger.info(f"Received signal {sig}, shutting down gracefully...")
136
+ if crawler_instance and crawler_instance.running:
137
+ logger.info("Stopping crawler...")
138
+ crawler_instance.stop()
139
+ sys.exit(0)
140
+
141
+ signal.signal(signal.SIGINT, signal_handler)
142
+ signal.signal(signal.SIGTERM, signal_handler)
143
+
144
+ def run_crawler():
145
+ """Run the crawler with command-line arguments"""
146
+ args = parse_arguments()
147
+ crawler = None
148
+
149
+ try:
150
+ logger.info("Starting the web crawler...")
151
+
152
+ # Reset database if requested
153
+ if args.reset_db:
154
+ logger.info("Resetting MongoDB and flushing Redis data...")
155
+ if not reset_databases():
156
+ logger.warning("Database reset was not completely successful")
157
+
158
+ # Create crawler instance
159
+ logger.info("Creating crawler instance...")
160
+ crawler = Crawler()
161
+ logger.info("Crawler instance created successfully")
162
+
163
+ # Setup signal handlers
164
+ setup_signal_handlers(crawler)
165
+
166
+ # Override settings from command line if provided
167
+ if args.depth is not None:
168
+ import config
169
+ config.MAX_DEPTH = args.depth
170
+ logger.info(f"Setting maximum depth to {args.depth}")
171
+
172
+ if args.delay is not None:
173
+ import config
174
+ config.DELAY_BETWEEN_REQUESTS = args.delay
175
+ logger.info(f"Setting delay between requests to {args.delay} seconds")
176
+
177
+ if args.respect_robots is not None:
178
+ import config
179
+ config.RESPECT_ROBOTS_TXT = args.respect_robots
180
+ logger.info(f"Respect robots.txt: {args.respect_robots}")
181
+
182
+ if args.user_agent is not None:
183
+ import config
184
+ config.USER_AGENT = args.user_agent
185
+ logger.info(f"Using user agent: {args.user_agent}")
186
+
187
+ # Add seed URLs if provided
188
+ if args.seed:
189
+ logger.info(f"Adding {len(args.seed)} seed URLs")
190
+ seed_urls = []
191
+ for url in args.seed:
192
+ if not (url.startswith('http://') or url.startswith('https://')):
193
+ url = 'https://' + url
194
+ seed_urls.append(url)
195
+ logger.debug(f"Added seed URL: {url}")
196
+
197
+ # Add the URLs to the frontier
198
+ logger.info("Adding seed URLs to frontier...")
199
+ added = crawler.add_seed_urls(seed_urls, Priority.VERY_HIGH)
200
+ logger.info(f"Successfully added {added} seed URLs to the frontier")
201
+
202
+ # Apply domain filter if provided
203
+ if args.domain_filter:
204
+ import config
205
+
206
+ # Allow both domain.com or http://domain.com formats
207
+ domain = args.domain_filter
208
+ if domain.startswith('http://') or domain.startswith('https://'):
209
+ domain = urlparse(domain).netloc
210
+
211
+ config.ALLOWED_DOMAINS = [domain]
212
+ logger.info(f"Filtering to domain: {domain}")
213
+
214
+ # Start the crawler
215
+ num_workers = args.workers if args.workers is not None else 4
216
+
217
+ logger.info(f"Starting crawler with {num_workers} workers...")
218
+ crawler.start(num_workers=num_workers, async_mode=args.async_mode)
219
+ # If we get here, crawler has finished or was stopped
220
+ logger.info("Crawler finished")
221
+
222
+ except KeyboardInterrupt:
223
+ logger.info("Crawler interrupted by user")
224
+ if crawler and crawler.running:
225
+ logger.info("Stopping crawler...")
226
+ crawler.stop()
227
+ except Exception as e:
228
+ logger.error(f"Error running crawler: {e}", exc_info=True)
229
+ if crawler and crawler.running:
230
+ try:
231
+ logger.info("Attempting to stop crawler after error...")
232
+ crawler.stop()
233
+ except Exception:
234
+ pass
235
+
236
+ if __name__ == "__main__":
237
+ run_crawler()
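Below is a typical invocation of the script above, together with a sketch of driving the same settings programmatically. The seed URL and values are placeholders, and the config attributes are simply the ones run_crawler() overrides; this is not a documented API.

```python
# Equivalent CLI call (flags defined in parse_arguments() above):
#   python run_crawler.py --seed example.com --depth 2 --workers 4 \
#       --delay 1.0 --respect-robots --domain-filter example.com
#
# Programmatic sketch mirroring what run_crawler() does with those flags.
import config
from crawler import Crawler
from models import Priority

config.MAX_DEPTH = 2                       # --depth
config.DELAY_BETWEEN_REQUESTS = 1.0        # --delay
config.RESPECT_ROBOTS_TXT = True           # --respect-robots
config.ALLOWED_DOMAINS = ["example.com"]   # --domain-filter

crawler = Crawler()
crawler.add_seed_urls(["https://example.com"], Priority.VERY_HIGH)
crawler.start(num_workers=4, async_mode=False)
```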
seo_analyzer_ui.py ADDED
@@ -0,0 +1,708 @@
1
+ """
2
+ SEO Analyzer UI using Gradio, Web Crawler, and OpenAI
3
+ """
4
+
5
+ import gradio as gr
6
+ import logging
7
+ import json
8
+ from typing import Dict, List, Any, Tuple, Optional
9
+ from urllib.parse import urlparse
10
+ import tldextract
11
+ from openai import OpenAI
12
+ import time
13
+ import os
14
+ import threading
15
+ import queue
16
+ import shutil
17
+ import uuid
18
+ from concurrent.futures import ThreadPoolExecutor
19
+ from datetime import datetime
20
+ import tempfile
21
+
22
+ from crawler import Crawler
23
+ from frontier import URLFrontier
24
+ from models import URL, Page
25
+ import config
26
+ from run_crawler import reset_databases
27
+ from dotenv import load_dotenv, find_dotenv
28
+
29
+ load_dotenv(find_dotenv())
30
+
31
+ # Check if we're in deployment mode (e.g., Hugging Face Spaces)
32
+ IS_DEPLOYMENT = os.getenv('DEPLOYMENT', 'false').lower() == 'true'
33
+
34
+ # Custom CSS for better styling
35
+ CUSTOM_CSS = """
36
+ .container {
37
+ max-width: 1200px !important;
38
+ margin: auto;
39
+ padding: 20px;
40
+ }
41
+
42
+ .header {
43
+ text-align: center;
44
+ margin-bottom: 2rem;
45
+ }
46
+
47
+ .header h1 {
48
+ color: #2d3748;
49
+ font-size: 2.5rem;
50
+ font-weight: 700;
51
+ margin-bottom: 1rem;
52
+ }
53
+
54
+ .header p {
55
+ color: #4a5568;
56
+ font-size: 1.1rem;
57
+ max-width: 800px;
58
+ margin: 0 auto;
59
+ }
60
+
61
+ .input-section {
62
+ background: #f7fafc;
63
+ border-radius: 12px;
64
+ padding: 24px;
65
+ margin-bottom: 24px;
66
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
67
+ }
68
+
69
+ .analysis-section {
70
+ background: white;
71
+ border-radius: 12px;
72
+ padding: 24px;
73
+ margin-top: 24px;
74
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
75
+ }
76
+
77
+ .log-section {
78
+ font-family: monospace;
79
+ background: #1a202c;
80
+ color: #e2e8f0;
81
+ padding: 16px;
82
+ border-radius: 8px;
83
+ margin-top: 24px;
84
+ }
85
+
86
+ /* Custom styling for inputs */
87
+ .input-container {
88
+ background: white;
89
+ padding: 16px;
90
+ border-radius: 8px;
91
+ margin-bottom: 16px;
92
+ }
93
+
94
+ /* Custom styling for the slider */
95
+ .slider-container {
96
+ padding: 12px;
97
+ background: white;
98
+ border-radius: 8px;
99
+ }
100
+
101
+ /* Custom styling for buttons */
102
+ .primary-button {
103
+ background: #4299e1 !important;
104
+ color: white !important;
105
+ padding: 12px 24px !important;
106
+ border-radius: 8px !important;
107
+ font-weight: 600 !important;
108
+ transition: all 0.3s ease !important;
109
+ }
110
+
111
+ .primary-button:hover {
112
+ background: #3182ce !important;
113
+ transform: translateY(-1px) !important;
114
+ }
115
+
116
+ /* Markdown output styling */
117
+ .markdown-output {
118
+ font-family: system-ui, -apple-system, sans-serif;
119
+ line-height: 1.6;
120
+ }
121
+
122
+ .markdown-output h1 {
123
+ color: #2d3748;
124
+ border-bottom: 2px solid #e2e8f0;
125
+ padding-bottom: 0.5rem;
126
+ }
127
+
128
+ .markdown-output h2 {
129
+ color: #4a5568;
130
+ margin-top: 2rem;
131
+ }
132
+
133
+ .markdown-output h3 {
134
+ color: #718096;
135
+ margin-top: 1.5rem;
136
+ }
137
+
138
+ /* Progress bar styling */
139
+ .progress-bar {
140
+ height: 8px !important;
141
+ border-radius: 4px !important;
142
+ background: #ebf8ff !important;
143
+ }
144
+
145
+ .progress-bar-fill {
146
+ background: #4299e1 !important;
147
+ border-radius: 4px !important;
148
+ }
149
+
150
+ /* Add some spacing between sections */
151
+ .gap {
152
+ margin: 2rem 0;
153
+ }
154
+ """
155
+
156
+ # Create a custom handler that will store logs in a queue
157
+ class QueueHandler(logging.Handler):
158
+ def __init__(self, log_queue):
159
+ super().__init__()
160
+ self.log_queue = log_queue
161
+
162
+ def emit(self, record):
163
+ log_entry = self.format(record)
164
+ try:
165
+ self.log_queue.put_nowait(f"{datetime.now().strftime('%H:%M:%S')} - {log_entry}")
166
+ except queue.Full:
167
+ pass # Ignore if queue is full
168
+
169
+ # Configure logging
170
+ logging.basicConfig(
171
+ level=getattr(logging, config.LOG_LEVEL),
172
+ format='%(levelname)s - %(message)s'
173
+ )
174
+ logger = logging.getLogger(__name__)
175
+
176
+ logger.info(f"IS_DEPLOYMENT: {IS_DEPLOYMENT}")
177
+
178
+ class InMemoryStorage:
179
+ """Simple in-memory storage for deployment mode"""
180
+ def __init__(self):
181
+ self.urls = {}
182
+ self.pages = {}
183
+
184
+ def reset(self):
185
+ self.urls.clear()
186
+ self.pages.clear()
187
+
188
+ def add_url(self, url_obj):
189
+ self.urls[url_obj.url] = url_obj
190
+
191
+ def add_page(self, page_obj):
192
+ self.pages[page_obj.url] = page_obj
193
+
194
+ def get_url(self, url):
195
+ return self.urls.get(url)
196
+
197
+ def get_page(self, url):
198
+ return self.pages.get(url)
199
+
200
+ class SEOAnalyzer:
201
+ """
202
+ SEO Analyzer that combines web crawler with OpenAI analysis
203
+ """
204
+
205
+ def __init__(self, api_key: str):
206
+ """Initialize SEO Analyzer"""
207
+ self.client = OpenAI(api_key=api_key)
208
+ self.crawler = None
209
+ self.crawled_pages = []
210
+ self.pages_crawled = 0
211
+ self.max_pages = 0
212
+ self.crawl_complete = threading.Event()
213
+ self.log_queue = queue.Queue(maxsize=1000)
214
+ self.session_id = str(uuid.uuid4())
215
+ self.storage = InMemoryStorage() if IS_DEPLOYMENT else None
216
+
217
+ # Add queue handler to logger
218
+ queue_handler = QueueHandler(self.log_queue)
219
+ queue_handler.setFormatter(logging.Formatter('%(levelname)s - %(message)s'))
220
+ logger.addHandler(queue_handler)
221
+
222
+ def _setup_session_storage(self) -> Tuple[str, str, str]:
223
+ """
224
+ Set up session-specific storage directories
225
+
226
+ Returns:
227
+ Tuple of (storage_path, html_path, log_path)
228
+ """
229
+ # Create session-specific paths
230
+ session_storage = os.path.join(config.STORAGE_PATH, self.session_id)
231
+ session_html = os.path.join(session_storage, "html")
232
+ session_logs = os.path.join(session_storage, "logs")
233
+
234
+ # Create directories
235
+ os.makedirs(session_storage, exist_ok=True)
236
+ os.makedirs(session_html, exist_ok=True)
237
+ os.makedirs(session_logs, exist_ok=True)
238
+
239
+ logger.info(f"Created session storage at {session_storage}")
240
+ return session_storage, session_html, session_logs
241
+
242
+ def _cleanup_session_storage(self):
243
+ """Clean up session-specific storage"""
244
+ session_path = os.path.join(config.STORAGE_PATH, self.session_id)
245
+ try:
246
+ if os.path.exists(session_path):
247
+ shutil.rmtree(session_path)
248
+ logger.info(f"Cleaned up session storage at {session_path}")
249
+ except Exception as e:
250
+ logger.error(f"Error cleaning up session storage: {e}")
251
+
252
+ def _reset_storage(self):
253
+ """Reset storage based on deployment mode"""
254
+ if IS_DEPLOYMENT:
255
+ self.storage.reset()
256
+ else:
257
+ reset_databases()
258
+
259
+ def analyze_website(self, url: str, max_pages: int = 10, progress: gr.Progress = gr.Progress()) -> Tuple[str, List[Dict], str]:
260
+ """
261
+ Crawl website and analyze SEO using OpenAI
262
+
263
+ Args:
264
+ url: Seed URL to crawl
265
+ max_pages: Maximum number of pages to crawl
266
+ progress: Gradio progress indicator
267
+
268
+ Returns:
269
+ Tuple of (overall analysis, list of page-specific analyses, log output)
270
+ """
271
+ try:
272
+ # Reset state
273
+ self.crawled_pages = []
274
+ self.pages_crawled = 0
275
+ self.max_pages = max_pages
276
+ self.crawl_complete.clear()
277
+
278
+ # Set up storage
279
+ if IS_DEPLOYMENT:
280
+ # Use temporary directory for file storage in deployment
281
+ temp_dir = tempfile.mkdtemp()
282
+ session_storage = temp_dir
283
+ session_html = os.path.join(temp_dir, "html")
284
+ session_logs = os.path.join(temp_dir, "logs")
285
+ os.makedirs(session_html, exist_ok=True)
286
+ os.makedirs(session_logs, exist_ok=True)
287
+ else:
288
+ session_storage, session_html, session_logs = self._setup_session_storage()
289
+
290
+ # Update config paths for this session
291
+ config.HTML_STORAGE_PATH = session_html
292
+ config.LOG_PATH = session_logs
293
+
294
+ # Clear log queue
295
+ while not self.log_queue.empty():
296
+ self.log_queue.get_nowait()
297
+
298
+ logger.info(f"Starting analysis of {url} with max_pages={max_pages}")
299
+
300
+ # Reset storage
301
+ logger.info("Resetting storage...")
302
+ self._reset_storage()
303
+ logger.info("Storage reset completed")
304
+
305
+ # Create new crawler instance with appropriate storage
306
+ logger.info("Creating crawler instance...")
307
+ if IS_DEPLOYMENT:
308
+ # In deployment mode, use in-memory storage
309
+ self.crawler = Crawler(storage=self.storage)
310
+ # Set frontier to use memory mode
311
+ self.crawler.frontier = URLFrontier(use_memory=True)
312
+ else:
313
+ # In local mode, use MongoDB and Redis
314
+ self.crawler = Crawler()
315
+ logger.info("Crawler instance created successfully")
316
+
317
+ # Extract domain for filtering
318
+ domain = self._extract_domain(url)
319
+ logger.info(f"Analyzing domain: {domain}")
320
+
321
+ # Add seed URL and configure domain filter
322
+ self.crawler.add_seed_urls([url])
323
+ config.ALLOWED_DOMAINS = [domain]
324
+ logger.info("Added seed URL and configured domain filter")
325
+
326
+ # Override the crawler's _process_url method to capture pages
327
+ original_process_url = self.crawler._process_url
328
+ def wrapped_process_url(url_obj):
329
+ if self.pages_crawled >= self.max_pages:
330
+ self.crawler.running = False # Signal crawler to stop
331
+ self.crawl_complete.set()
332
+ return
333
+
334
+ original_process_url(url_obj)
335
+
336
+ # Get the page based on storage mode
337
+ if IS_DEPLOYMENT:
338
+ # In deployment mode, get page from in-memory storage
339
+ page = self.storage.get_page(url_obj.url)
340
+ if page:
341
+ _, metadata = self.crawler.parser.parse(page)
342
+ self.crawled_pages.append({
343
+ 'url': url_obj.url,
344
+ 'content': page.content,
345
+ 'metadata': metadata
346
+ })
347
+ self.pages_crawled += 1
348
+ logger.info(f"Crawled page {self.pages_crawled}/{max_pages}: {url_obj.url}")
349
+ else:
350
+ # In local mode, get page from MongoDB
351
+ page_data = self.crawler.pages_collection.find_one({'url': url_obj.url})
352
+ if page_data and page_data.get('content'):
353
+ _, metadata = self.crawler.parser.parse(Page(**page_data))
354
+ self.crawled_pages.append({
355
+ 'url': url_obj.url,
356
+ 'content': page_data['content'],
357
+ 'metadata': metadata
358
+ })
359
+ self.pages_crawled += 1
360
+ logger.info(f"Crawled page {self.pages_crawled}/{max_pages}: {url_obj.url}")
361
+
362
+ if self.pages_crawled >= self.max_pages:
363
+ self.crawler.running = False # Signal crawler to stop
364
+ self.crawl_complete.set()
365
+
366
+ self.crawler._process_url = wrapped_process_url
367
+
368
+ def run_crawler():
369
+ try:
370
+ # Skip signal handler registration
371
+ self.crawler.running = True
372
+ with ThreadPoolExecutor(max_workers=1) as executor:
373
+ try:
374
+ futures = [executor.submit(self.crawler._crawl_worker)]
375
+ for future in futures:
376
+ future.result()
377
+ except Exception as e:
378
+ logger.error(f"Error in crawler worker: {e}")
379
+ finally:
380
+ self.crawler.running = False
381
+ self.crawl_complete.set()
382
+ except Exception as e:
383
+ logger.error(f"Error in run_crawler: {e}")
384
+ self.crawl_complete.set()
385
+
386
+ # Start crawler in a thread
387
+ crawler_thread = threading.Thread(target=run_crawler)
388
+ crawler_thread.daemon = True
389
+ crawler_thread.start()
390
+
391
+ # Wait for completion or timeout with progress updates
392
+ timeout = 300 # 5 minutes
393
+ start_time = time.time()
394
+ last_progress = 0
395
+ while not self.crawl_complete.is_set() and time.time() - start_time < timeout:
396
+ current_progress = min(0.8, self.pages_crawled / max_pages)
397
+ if current_progress != last_progress:
398
+ progress(current_progress, f"Crawled {self.pages_crawled}/{max_pages} pages")
399
+ last_progress = current_progress
400
+ time.sleep(0.1) # More frequent updates
401
+
402
+ if time.time() - start_time >= timeout:
403
+ logger.warning("Crawler timed out")
404
+ self.crawler.running = False
405
+
406
+ # Wait for thread to finish
407
+ crawler_thread.join(timeout=10)
408
+
409
+ # Restore original method
410
+ self.crawler._process_url = original_process_url
411
+
412
+ # Collect all logs
413
+ logs = []
414
+ while not self.log_queue.empty():
415
+ logs.append(self.log_queue.get_nowait())
416
+ log_output = "\n".join(logs)
417
+
418
+ if not self.crawled_pages:
419
+ self._cleanup_session_storage()
420
+ return "No pages were successfully crawled.", [], log_output
421
+
422
+ logger.info("Starting OpenAI analysis...")
423
+ progress(0.9, "Analyzing crawled pages with OpenAI...")
424
+
425
+ # Analyze crawled pages with OpenAI
426
+ overall_analysis = self._get_overall_analysis(self.crawled_pages)
427
+ progress(0.95, "Generating page-specific analyses...")
428
+ page_analyses = self._get_page_analyses(self.crawled_pages)
429
+
430
+ logger.info("Analysis complete")
431
+ progress(1.0, "Analysis complete")
432
+
433
+ # Format the results
434
+ formatted_analysis = f"""
435
+ # SEO Analysis Report for {domain}
436
+
437
+ ## Overall Analysis
438
+ {overall_analysis}
439
+
440
+ ## Page-Specific Analyses
441
+ """
442
+ for page_analysis in page_analyses:
443
+ formatted_analysis += f"""
444
+ ### {page_analysis['url']}
445
+ {page_analysis['analysis']}
446
+ """
447
+
448
+ # Clean up all resources
449
+ logger.info("Cleaning up resources...")
450
+ if IS_DEPLOYMENT:
451
+ shutil.rmtree(temp_dir, ignore_errors=True)
452
+ self.storage.reset()
453
+ else:
454
+ self._cleanup_session_storage()
455
+ self._reset_storage()
456
+ logger.info("All resources cleaned up")
457
+
458
+ return formatted_analysis, page_analyses, log_output
459
+
460
+ except Exception as e:
461
+ logger.error(f"Error analyzing website: {e}")
462
+ # Clean up all resources even on error
463
+ if IS_DEPLOYMENT:
464
+ shutil.rmtree(temp_dir, ignore_errors=True)
465
+ self.storage.reset()
466
+ else:
467
+ self._cleanup_session_storage()
468
+ self._reset_storage()
469
+ # Collect all logs
470
+ logs = []
471
+ while not self.log_queue.empty():
472
+ logs.append(self.log_queue.get_nowait())
473
+ log_output = "\n".join(logs)
474
+ return f"Error analyzing website: {str(e)}", [], log_output
475
+
476
+ def _extract_domain(self, url: str) -> str:
477
+ """Extract domain from URL"""
478
+ extracted = tldextract.extract(url)
479
+ return f"{extracted.domain}.{extracted.suffix}"
480
+
481
+ def _get_overall_analysis(self, pages: List[Dict]) -> str:
482
+ """Get overall SEO analysis using OpenAI"""
483
+ try:
484
+ # Prepare site overview for analysis
485
+ site_overview = {
486
+ 'num_pages': len(pages),
487
+ 'pages': [{
488
+ 'url': page['url'],
489
+ 'metadata': page['metadata']
490
+ } for page in pages]
491
+ }
492
+
493
+ # Create analysis prompt
494
+ prompt = f"""
495
+ You are an expert SEO consultant. Analyze this website's SEO based on the crawled data:
496
+
497
+ {json.dumps(site_overview, indent=2)}
498
+
499
+ Provide a comprehensive SEO analysis including:
500
+ 1. Overall site structure and navigation
501
+ 2. Common SEO issues across pages
502
+ 3. Content quality and optimization
503
+ 4. Technical SEO recommendations
504
+ 5. Priority improvements
505
+
506
+ Format your response in Markdown.
507
+ """
508
+
509
+ # Get analysis from OpenAI
510
+ response = self.client.chat.completions.create(
511
+ model="gpt-4o-mini",
512
+ messages=[
513
+ {"role": "system", "content": "You are an expert SEO consultant providing detailed website analysis."},
514
+ {"role": "user", "content": prompt}
515
+ ],
516
+ temperature=0.7,
517
+ max_tokens=2000
518
+ )
519
+
520
+ return response.choices[0].message.content
521
+
522
+ except Exception as e:
523
+ logger.error(f"Error getting overall analysis: {e}")
524
+ return f"Error generating overall analysis: {str(e)}"
525
+
526
+ def _get_page_analyses(self, pages: List[Dict]) -> List[Dict]:
527
+ """Get page-specific SEO analyses using OpenAI"""
528
+ page_analyses = []
529
+
530
+ for page in pages:
531
+ try:
532
+ # Create page analysis prompt
533
+ prompt = f"""
534
+ Analyze this page's SEO:
535
+
536
+ URL: {page['url']}
537
+ Metadata: {json.dumps(page['metadata'], indent=2)}
538
+
539
+ Provide specific recommendations for:
540
+ 1. Title and meta description
541
+ 2. Heading structure
542
+ 3. Content optimization
543
+ 4. Internal linking
544
+ 5. Technical improvements
545
+
546
+ Format your response in Markdown.
547
+ """
548
+
549
+ # Get analysis from OpenAI
550
+ response = self.client.chat.completions.create(
551
+ model="gpt-4o-mini",
552
+ messages=[
553
+ {"role": "system", "content": "You are an expert SEO consultant providing detailed page analysis."},
554
+ {"role": "user", "content": prompt}
555
+ ],
556
+ temperature=0.7,
557
+ max_tokens=1000
558
+ )
559
+
560
+ page_analyses.append({
561
+ 'url': page['url'],
562
+ 'analysis': response.choices[0].message.content
563
+ })
564
+
565
+ # Sleep to respect rate limits
566
+ time.sleep(1)
567
+
568
+ except Exception as e:
569
+ logger.error(f"Error analyzing page {page['url']}: {e}")
570
+ page_analyses.append({
571
+ 'url': page['url'],
572
+ 'analysis': f"Error analyzing page: {str(e)}"
573
+ })
574
+
575
+ return page_analyses
576
+
577
+ def create_ui() -> gr.Interface:
578
+ """Create Gradio interface"""
579
+
580
+ def analyze(url: str, api_key: str, max_pages: int, progress: gr.Progress = gr.Progress()) -> Tuple[str, str]:
581
+ """Gradio interface function"""
582
+ try:
583
+ # Initialize analyzer
584
+ analyzer = SEOAnalyzer(api_key)
585
+
586
+ # Run analysis with progress updates
587
+ analysis, _, logs = analyzer.analyze_website(url, max_pages, progress)
588
+
589
+ # Collect all logs
590
+ log_output = ""
591
+ while not analyzer.log_queue.empty():
592
+ try:
593
+ log_output += analyzer.log_queue.get_nowait() + "\n"
594
+ except queue.Empty:
595
+ break
596
+
597
+ # Set progress to complete
598
+ progress(1.0, "Analysis complete")
599
+
600
+ # Return results
601
+ return analysis, log_output
602
+
603
+ except Exception as e:
604
+ error_msg = f"Error: {str(e)}"
605
+ logger.error(error_msg)
606
+ return error_msg, error_msg
607
+
608
+ # Create markdown content for the about section
609
+ about_markdown = """
610
+ # 🔍 SEO Analyzer Pro
611
+
612
+ Analyze your website's SEO performance using advanced crawling and AI technology.
613
+
614
+ ### Features:
615
+ - 🕷️ Intelligent Web Crawling
616
+ - 🧠 AI-Powered Analysis
617
+ - 📊 Comprehensive Reports
618
+ - 🚀 Performance Insights
619
+
620
+ ### How to Use:
621
+ 1. Enter your website URL
622
+ 2. Provide your OpenAI API key
623
+ 3. Choose how many pages to analyze
624
+ 4. Click Analyze and watch the magic happen!
625
+
626
+ ### What You'll Get:
627
+ - Detailed SEO analysis
628
+ - Content quality assessment
629
+ - Technical recommendations
630
+ - Performance insights
631
+ - Actionable improvements
632
+ """
633
+
634
+ # Create the interface with custom styling
635
+ with gr.Blocks(css=CUSTOM_CSS) as iface:
636
+ gr.Markdown(about_markdown)
637
+
638
+ with gr.Row():
639
+ with gr.Column(scale=2):
640
+ with gr.Group(elem_classes="input-section"):
641
+ gr.Markdown("### 📝 Enter Website Details")
642
+ url_input = gr.Textbox(
643
+ label="Website URL",
644
+ placeholder="https://example.com",
645
+ elem_classes="input-container",
646
+ info="Enter the full URL of the website you want to analyze (e.g., https://example.com)"
647
+ )
648
+ api_key = gr.Textbox(
649
+ label="OpenAI API Key",
650
+ placeholder="sk-...",
651
+ type="password",
652
+ elem_classes="input-container",
653
+ info="Your OpenAI API key is required for AI-powered analysis. Keep this secure!"
654
+ )
655
+ max_pages = gr.Slider(
656
+ minimum=1,
657
+ maximum=50,
658
+ value=10,
659
+ step=1,
660
+ label="Maximum Pages to Crawl",
661
+ elem_classes="slider-container",
662
+ info="Choose how many pages to analyze. More pages = more comprehensive analysis but takes longer"
663
+ )
664
+ analyze_btn = gr.Button(
665
+ "🔍 Analyze Website",
666
+ elem_classes="primary-button"
667
+ )
668
+
669
+ with gr.Row():
670
+ with gr.Column():
671
+ with gr.Group(elem_classes="analysis-section"):
672
+ gr.Markdown("### 📊 Analysis Results")
673
+ analysis_output = gr.Markdown(
674
+ label="SEO Analysis",
675
+ elem_classes="markdown-output"
676
+ )
677
+
678
+ with gr.Row():
679
+ with gr.Column():
680
+ with gr.Group(elem_classes="log-section"):
681
+ gr.Markdown("### 📋 Process Logs")
682
+ logs_output = gr.Textbox(
683
+ label="Logs",
684
+ lines=10,
685
+ elem_classes="log-output"
686
+ )
687
+
688
+ # Connect the button click to the analyze function
689
+ analyze_btn.click(
690
+ fn=analyze,
691
+ inputs=[url_input, api_key, max_pages],
692
+ outputs=[analysis_output, logs_output],
693
+ )
694
+
695
+ return iface
696
+
697
+ if __name__ == "__main__":
698
+ # Create base storage directory if it doesn't exist
699
+ os.makedirs(config.STORAGE_PATH, exist_ok=True)
700
+
701
+ # Create and launch UI
702
+ ui = create_ui()
703
+ ui.launch(
704
+ share=False,
705
+ server_name="0.0.0.0",
706
+ show_api=False,
707
+ show_error=True,
708
+ )
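The UI above is a thin wrapper around SEOAnalyzer, which can also be driven without Gradio. A minimal sketch follows; the API key and URL are placeholders, a no-op callable stands in for gr.Progress, and in non-deployment mode MongoDB and Redis must be running as in the rest of the project.

```python
# Sketch: running the SEO analysis headlessly, outside the Gradio UI.
from seo_analyzer_ui import SEOAnalyzer

analyzer = SEOAnalyzer(api_key="sk-...")  # placeholder OpenAI key

report_md, page_reports, logs = analyzer.analyze_website(
    "https://example.com",
    max_pages=5,
    progress=lambda *args, **kwargs: None,  # stand-in for gr.Progress
)
print(report_md)
```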
storage.py ADDED
@@ -0,0 +1,888 @@
1
+ """
2
+ Storage component for the web crawler.
3
+
4
+ Handles storing and retrieving crawled web pages using:
5
+ 1. MongoDB for metadata, URL information, and crawl stats
6
+ 2. Disk-based storage for HTML content
7
+ 3. Optional Amazon S3 integration for scalable storage
8
+ """
9
+
10
+ import os
11
+ import logging
12
+ import time
13
+ import datetime
14
+ import hashlib
15
+ import json
16
+ import gzip
17
+ import shutil
18
+ from typing import Dict, List, Optional, Union, Any, Tuple
19
+ from urllib.parse import urlparse
20
+ import pymongo
21
+ from pymongo import MongoClient, UpdateOne
22
+ from pymongo.errors import PyMongoError, BulkWriteError
23
+ import boto3
24
+ from botocore.exceptions import ClientError
25
+
26
+ from models import Page, URL
27
+ import config
28
+
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=getattr(logging, config.LOG_LEVEL),
32
+ format=config.LOG_FORMAT
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class StorageManager:
38
+ """
39
+ Storage manager for web crawler data
40
+
41
+ Handles:
42
+ - MongoDB for metadata, URL information, and stats
43
+ - Disk-based storage for HTML content
44
+ - Optional Amazon S3 integration
45
+ """
46
+
47
+ def __init__(self,
48
+ mongo_uri: Optional[str] = None,
49
+ use_s3: bool = False,
50
+ compress_html: bool = True,
51
+ max_disk_usage_gb: float = 100.0):
52
+ """
53
+ Initialize the storage manager
54
+
55
+ Args:
56
+ mongo_uri: MongoDB connection URI
57
+ use_s3: Whether to use Amazon S3 for HTML storage
58
+ compress_html: Whether to compress HTML content
59
+ max_disk_usage_gb: Maximum disk space to use in GB
60
+ """
61
+ self.mongo_uri = mongo_uri or config.MONGODB_URI
62
+ self.use_s3 = use_s3
63
+ self.compress_html = compress_html
64
+ self.max_disk_usage_gb = max_disk_usage_gb
65
+
66
+ # Connect to MongoDB
67
+ self.mongo_client = MongoClient(self.mongo_uri)
68
+ self.db = self.mongo_client[config.MONGODB_DB]
69
+
70
+ # MongoDB collections
71
+ self.pages_collection = self.db['pages']
72
+ self.urls_collection = self.db['urls']
73
+ self.stats_collection = self.db['stats']
74
+
75
+ # Create necessary indexes
76
+ self._create_indexes()
77
+
78
+ # S3 client (if enabled)
79
+ self.s3_client = None
80
+ if self.use_s3:
81
+ self._init_s3_client()
82
+
83
+ # Ensure storage directories exist
84
+ self._ensure_directories()
85
+
86
+ # Bulk operation buffers
87
+ self.page_buffer = []
88
+ self.url_buffer = []
89
+ self.max_buffer_size = 100
90
+
91
+ # Statistics
92
+ self.stats = {
93
+ 'pages_stored': 0,
94
+ 'pages_retrieved': 0,
95
+ 'urls_stored': 0,
96
+ 'urls_retrieved': 0,
97
+ 'disk_space_used': 0,
98
+ 's3_objects_stored': 0,
99
+ 'mongodb_size': 0,
100
+ 'storage_errors': 0,
101
+ 'start_time': time.time()
102
+ }
103
+
104
+ def _create_indexes(self) -> None:
105
+ """Create necessary indexes in MongoDB collections"""
106
+ try:
107
+ # Pages collection indexes
108
+ self.pages_collection.create_index('url', unique=True)
109
+ self.pages_collection.create_index('content_hash')
110
+ self.pages_collection.create_index('crawled_at')
111
+ self.pages_collection.create_index('domain')
112
+
113
+ # URLs collection indexes
114
+ self.urls_collection.create_index('url', unique=True)
115
+ self.urls_collection.create_index('normalized_url')
116
+ self.urls_collection.create_index('domain')
117
+ self.urls_collection.create_index('status')
118
+ self.urls_collection.create_index('priority')
119
+ self.urls_collection.create_index('last_crawled')
120
+
121
+ logger.info("MongoDB indexes created")
122
+ except PyMongoError as e:
123
+ logger.error(f"Error creating MongoDB indexes: {e}")
124
+ self.stats['storage_errors'] += 1
125
+
126
+ def _init_s3_client(self) -> None:
127
+ """Initialize AWS S3 client"""
128
+ try:
129
+ self.s3_client = boto3.client(
130
+ 's3',
131
+ aws_access_key_id=config.AWS_ACCESS_KEY,
132
+ aws_secret_access_key=config.AWS_SECRET_KEY,
133
+ region_name=config.AWS_REGION
134
+ )
135
+ logger.info("S3 client initialized")
136
+
137
+ # Create bucket if it doesn't exist
138
+ self._ensure_s3_bucket()
139
+ except Exception as e:
140
+ logger.error(f"Error initializing S3 client: {e}")
141
+ self.use_s3 = False
142
+ self.stats['storage_errors'] += 1
143
+
144
+ def _ensure_s3_bucket(self) -> None:
145
+ """Create S3 bucket if it doesn't exist"""
146
+ if not self.s3_client:
147
+ return
148
+
149
+ try:
150
+ # Check if bucket exists
151
+ self.s3_client.head_bucket(Bucket=config.S3_BUCKET)
152
+ logger.info(f"S3 bucket '{config.S3_BUCKET}' exists")
153
+ except ClientError as e:
154
+ error_code = e.response.get('Error', {}).get('Code')
155
+
156
+ if error_code == '404':
157
+ # Bucket doesn't exist, create it
158
+ try:
159
+ self.s3_client.create_bucket(
160
+ Bucket=config.S3_BUCKET,
161
+ CreateBucketConfiguration={
162
+ 'LocationConstraint': config.AWS_REGION
163
+ }
164
+ )
165
+ logger.info(f"Created S3 bucket '{config.S3_BUCKET}'")
166
+ except ClientError as ce:
167
+ logger.error(f"Error creating S3 bucket: {ce}")
168
+ self.use_s3 = False
169
+ self.stats['storage_errors'] += 1
170
+ else:
171
+ logger.error(f"Error checking S3 bucket: {e}")
172
+ self.use_s3 = False
173
+ self.stats['storage_errors'] += 1
174
+
175
+ def _ensure_directories(self) -> None:
176
+ """Ensure storage directories exist"""
177
+ # Create main storage directory
178
+ os.makedirs(config.STORAGE_PATH, exist_ok=True)
179
+
180
+ # Create HTML storage directory
181
+ os.makedirs(config.HTML_STORAGE_PATH, exist_ok=True)
182
+
183
+ # Create log directory
184
+ os.makedirs(config.LOG_PATH, exist_ok=True)
185
+
186
+ logger.info("Storage directories created")
187
+
188
+ def store_page(self, page: Page, flush: bool = False) -> bool:
189
+ """
190
+ Store a crawled page
191
+
192
+ Args:
193
+ page: Page object to store
194
+ flush: Whether to flush page buffer immediately
195
+
196
+ Returns:
197
+ True if successful, False otherwise
198
+ """
199
+ try:
200
+ # Store page content based on configuration
201
+ if self.use_s3:
202
+ content_stored = self._store_content_s3(page)
203
+ else:
204
+ content_stored = self._store_content_disk(page)
205
+
206
+ if not content_stored:
207
+ logger.warning(f"Failed to store content for {page.url}")
208
+ self.stats['storage_errors'] += 1
209
+ return False
210
+
211
+ # Remove HTML content from page object for MongoDB storage
212
+ page_dict = page.dict(exclude={'content'})
213
+
214
+ # Convert datetime fields to proper format
215
+ if page.crawled_at:
216
+ page_dict['crawled_at'] = page.crawled_at
217
+
218
+ # Add to buffer
219
+ self.page_buffer.append(
220
+ UpdateOne(
221
+ {'url': page.url},
222
+ {'$set': page_dict},
223
+ upsert=True
224
+ )
225
+ )
226
+
227
+ # Update statistics
228
+ self.stats['pages_stored'] += 1
229
+
230
+ # Check if buffer should be flushed
231
+ if flush or len(self.page_buffer) >= self.max_buffer_size:
232
+ return self.flush_page_buffer()
233
+
234
+ return True
235
+ except Exception as e:
236
+ logger.error(f"Error storing page {page.url}: {e}")
237
+ self.stats['storage_errors'] += 1
238
+ return False
239
+
240
+ def _store_content_disk(self, page: Page) -> bool:
241
+ """
242
+ Store page content on disk
243
+
244
+ Args:
245
+ page: Page to store
246
+
247
+ Returns:
248
+ True if successful, False otherwise
249
+ """
250
+ try:
251
+ # Check disk space
252
+ if not self._check_disk_space():
253
+ logger.warning("Disk space limit exceeded")
254
+ return False
255
+
256
+ # Create directory for domain if it doesn't exist
257
+ domain = self._extract_domain(page.url)
258
+ domain_dir = os.path.join(config.HTML_STORAGE_PATH, domain)
259
+ os.makedirs(domain_dir, exist_ok=True)
260
+
261
+ # Create filename
262
+ filename = self._url_to_filename(page.url)
263
+
264
+ # Full path for the file
265
+ if self.compress_html:
266
+ filepath = os.path.join(domain_dir, f"{filename}.gz")
267
+
268
+ # Compress and write HTML to file
269
+ with gzip.open(filepath, 'wt', encoding='utf-8') as f:
270
+ f.write(page.content)
271
+ else:
272
+ filepath = os.path.join(domain_dir, f"{filename}.html")
273
+
274
+ # Write HTML to file
275
+ with open(filepath, 'w', encoding='utf-8') as f:
276
+ f.write(page.content)
277
+
278
+ # Update disk space used
279
+ file_size = os.path.getsize(filepath)
280
+ self.stats['disk_space_used'] += file_size
281
+
282
+ logger.debug(f"Stored HTML content for {page.url} at {filepath}")
283
+ return True
284
+ except Exception as e:
285
+ logger.error(f"Error storing content on disk for {page.url}: {e}")
286
+ self.stats['storage_errors'] += 1
287
+ return False
288
+
289
+ def _store_content_s3(self, page: Page) -> bool:
290
+ """
291
+ Store page content in S3
292
+
293
+ Args:
294
+ page: Page to store
295
+
296
+ Returns:
297
+ True if successful, False otherwise
298
+ """
299
+ if not self.s3_client:
300
+ logger.warning("S3 client not initialized, falling back to disk storage")
301
+ return self._store_content_disk(page)
302
+
303
+ try:
304
+ # Create key for S3 object
305
+ domain = self._extract_domain(page.url)
306
+ filename = self._url_to_filename(page.url)
307
+
308
+ # S3 key
309
+ s3_key = f"{domain}/{filename}"
310
+ if self.compress_html:
311
+ s3_key += ".gz"
312
+
313
+ # Compress content
314
+ content_bytes = gzip.compress(page.content.encode('utf-8'))
315
+ content_type = 'application/gzip'
316
+ else:
317
+ s3_key += ".html"
318
+ content_bytes = page.content.encode('utf-8')
319
+ content_type = 'text/html'
320
+
321
+ # Upload to S3
322
+ self.s3_client.put_object(
323
+ Bucket=config.S3_BUCKET,
324
+ Key=s3_key,
325
+ Body=content_bytes,
326
+ ContentType=content_type,
327
+ Metadata={
328
+ 'url': page.url,
329
+ 'crawled_at': page.crawled_at.isoformat() if page.crawled_at else '',
330
+ 'content_hash': page.content_hash or ''
331
+ }
332
+ )
333
+
334
+ # Update statistics
335
+ self.stats['s3_objects_stored'] += 1
336
+
337
+ logger.debug(f"Stored HTML content for {page.url} in S3 at {s3_key}")
338
+ return True
339
+ except Exception as e:
340
+ logger.error(f"Error storing content in S3 for {page.url}: {e}")
341
+ self.stats['storage_errors'] += 1
342
+
343
+ # Fall back to disk storage
344
+ logger.info(f"Falling back to disk storage for {page.url}")
345
+ return self._store_content_disk(page)
346
+
347
+ def store_url(self, url_obj: URL, flush: bool = False) -> bool:
348
+ """
349
+ Store URL information
350
+
351
+ Args:
352
+ url_obj: URL object to store
353
+ flush: Whether to flush URL buffer immediately
354
+
355
+ Returns:
356
+ True if successful, False otherwise
357
+ """
358
+ try:
359
+ # Convert URL object to dict
360
+ url_dict = url_obj.dict()
361
+
362
+ # Add to buffer
363
+ self.url_buffer.append(
364
+ UpdateOne(
365
+ {'url': url_obj.url},
366
+ {'$set': url_dict},
367
+ upsert=True
368
+ )
369
+ )
370
+
371
+ # Update statistics
372
+ self.stats['urls_stored'] += 1
373
+
374
+ # Check if buffer should be flushed
375
+ if flush or len(self.url_buffer) >= self.max_buffer_size:
376
+ return self.flush_url_buffer()
377
+
378
+ return True
379
+ except Exception as e:
380
+ logger.error(f"Error storing URL {url_obj.url}: {e}")
381
+ self.stats['storage_errors'] += 1
382
+ return False
383
+
384
+ def flush_page_buffer(self) -> bool:
385
+ """
386
+ Flush page buffer to MongoDB
387
+
388
+ Returns:
389
+ True if successful, False otherwise
390
+ """
391
+ if not self.page_buffer:
392
+ return True
393
+
394
+ try:
395
+ # Execute bulk operation
396
+ result = self.pages_collection.bulk_write(self.page_buffer, ordered=False)
397
+
398
+ # Clear buffer
399
+ buffer_size = len(self.page_buffer)
400
+ self.page_buffer = []
401
+
402
+ logger.debug(f"Flushed {buffer_size} pages to MongoDB")
403
+ return True
404
+ except BulkWriteError as e:
405
+ logger.error(f"Error in bulk write for pages: {e.details}")
406
+ self.stats['storage_errors'] += 1
407
+
408
+ # Clear buffer
409
+ self.page_buffer = []
410
+ return False
411
+ except Exception as e:
412
+ logger.error(f"Error flushing page buffer: {e}")
413
+ self.stats['storage_errors'] += 1
414
+
415
+ # Clear buffer
416
+ self.page_buffer = []
417
+ return False
418
+
419
+ def flush_url_buffer(self) -> bool:
420
+ """
421
+ Flush URL buffer to MongoDB
422
+
423
+ Returns:
424
+ True if successful, False otherwise
425
+ """
426
+ if not self.url_buffer:
427
+ return True
428
+
429
+ try:
430
+ # Execute bulk operation
431
+ result = self.urls_collection.bulk_write(self.url_buffer, ordered=False)
432
+
433
+ # Clear buffer
434
+ buffer_size = len(self.url_buffer)
435
+ self.url_buffer = []
436
+
437
+ logger.debug(f"Flushed {buffer_size} URLs to MongoDB")
438
+ return True
439
+ except BulkWriteError as e:
440
+ logger.error(f"Error in bulk write for URLs: {e.details}")
441
+ self.stats['storage_errors'] += 1
442
+
443
+ # Clear buffer
444
+ self.url_buffer = []
445
+ return False
446
+ except Exception as e:
447
+ logger.error(f"Error flushing URL buffer: {e}")
448
+ self.stats['storage_errors'] += 1
449
+
450
+ # Clear buffer
451
+ self.url_buffer = []
452
+ return False
453
+
454
+ def get_page(self, url: str) -> Optional[Page]:
455
+ """
456
+ Retrieve a page by URL
457
+
458
+ Args:
459
+ url: URL of the page to retrieve
460
+
461
+ Returns:
462
+ Page object if found, None otherwise
463
+ """
464
+ try:
465
+ # Get page metadata from MongoDB
466
+ page_doc = self.pages_collection.find_one({'url': url})
467
+
468
+ if not page_doc:
469
+ return None
470
+
471
+ # Create Page object from document
472
+ page = Page(**page_doc)
473
+
474
+ # Load content based on configuration
475
+ if self.use_s3:
476
+ content = self._load_content_s3(url)
477
+ else:
478
+ content = self._load_content_disk(url)
479
+
480
+ if content:
481
+ page.content = content
482
+
483
+ # Update statistics
484
+ self.stats['pages_retrieved'] += 1
485
+
486
+ return page
487
+ except Exception as e:
488
+ logger.error(f"Error retrieving page {url}: {e}")
489
+ self.stats['storage_errors'] += 1
490
+ return None
491
+
492
+ def _load_content_disk(self, url: str) -> Optional[str]:
493
+ """
494
+ Load page content from disk
495
+
496
+ Args:
497
+ url: URL of the page
498
+
499
+ Returns:
500
+ Page content if found, None otherwise
501
+ """
502
+ try:
503
+ # Get domain and filename
504
+ domain = self._extract_domain(url)
505
+ filename = self._url_to_filename(url)
506
+
507
+ # Check for compressed file first
508
+ compressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.gz")
509
+ uncompressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.html")
510
+
511
+ if os.path.exists(compressed_path):
512
+ # Load compressed content
513
+ with gzip.open(compressed_path, 'rt', encoding='utf-8') as f:
514
+ return f.read()
515
+ elif os.path.exists(uncompressed_path):
516
+ # Load uncompressed content
517
+ with open(uncompressed_path, 'r', encoding='utf-8') as f:
518
+ return f.read()
519
+ else:
520
+ logger.warning(f"Content file not found for {url}")
521
+ return None
522
+ except Exception as e:
523
+ logger.error(f"Error loading content from disk for {url}: {e}")
524
+ self.stats['storage_errors'] += 1
525
+ return None
526
+
527
+ def _load_content_s3(self, url: str) -> Optional[str]:
528
+ """
529
+ Load page content from S3
530
+
531
+ Args:
532
+ url: URL of the page
533
+
534
+ Returns:
535
+ Page content if found, None otherwise
536
+ """
537
+ if not self.s3_client:
538
+ logger.warning("S3 client not initialized, falling back to disk loading")
539
+ return self._load_content_disk(url)
540
+
541
+ try:
542
+ # Get domain and filename
543
+ domain = self._extract_domain(url)
544
+ filename = self._url_to_filename(url)
545
+
546
+ # Try both compressed and uncompressed keys
547
+ s3_key_compressed = f"{domain}/{filename}.gz"
548
+ s3_key_uncompressed = f"{domain}/{filename}.html"
549
+
550
+ try:
551
+ # Try compressed file first
552
+ response = self.s3_client.get_object(
553
+ Bucket=config.S3_BUCKET,
554
+ Key=s3_key_compressed
555
+ )
556
+
557
+ # Decompress content
558
+ content_bytes = response['Body'].read()
559
+ return gzip.decompress(content_bytes).decode('utf-8')
560
+ except ClientError as e:
561
+ if e.response['Error']['Code'] == 'NoSuchKey':
562
+ # Try uncompressed file
563
+ try:
564
+ response = self.s3_client.get_object(
565
+ Bucket=config.S3_BUCKET,
566
+ Key=s3_key_uncompressed
567
+ )
568
+ content_bytes = response['Body'].read()
569
+ return content_bytes.decode('utf-8')
570
+ except ClientError as e2:
571
+ if e2.response['Error']['Code'] == 'NoSuchKey':
572
+ logger.warning(f"Content not found in S3 for {url}")
573
+
574
+ # Try loading from disk as fallback
575
+ return self._load_content_disk(url)
576
+ else:
577
+ raise e2
578
+ else:
579
+ raise e
580
+ except Exception as e:
581
+ logger.error(f"Error loading content from S3 for {url}: {e}")
582
+ self.stats['storage_errors'] += 1
583
+
584
+ # Try loading from disk as fallback
585
+ return self._load_content_disk(url)
586
+
587
+ def get_url(self, url: str) -> Optional[URL]:
588
+ """
589
+ Retrieve URL information by URL
590
+
591
+ Args:
592
+ url: URL to retrieve
593
+
594
+ Returns:
595
+ URL object if found, None otherwise
596
+ """
597
+ try:
598
+ # Get URL information from MongoDB
599
+ url_doc = self.urls_collection.find_one({'url': url})
600
+
601
+ if not url_doc:
602
+ return None
603
+
604
+ # Create URL object from document
605
+ url_obj = URL(**url_doc)
606
+
607
+ # Update statistics
608
+ self.stats['urls_retrieved'] += 1
609
+
610
+ return url_obj
611
+ except Exception as e:
612
+ logger.error(f"Error retrieving URL {url}: {e}")
613
+ self.stats['storage_errors'] += 1
614
+ return None
615
+
616
+ def get_urls_by_status(self, status: str, limit: int = 100) -> List[URL]:
617
+ """
618
+ Retrieve URLs by status
619
+
620
+ Args:
621
+ status: Status of URLs to retrieve
622
+ limit: Maximum number of URLs to retrieve
623
+
624
+ Returns:
625
+ List of URL objects
626
+ """
627
+ try:
628
+ # Get URLs from MongoDB
629
+ url_docs = list(self.urls_collection.find({'status': status}).limit(limit))
630
+
631
+ # Create URL objects from documents
632
+ url_objs = [URL(**doc) for doc in url_docs]
633
+
634
+ # Update statistics
635
+ self.stats['urls_retrieved'] += len(url_objs)
636
+
637
+ return url_objs
638
+ except Exception as e:
639
+ logger.error(f"Error retrieving URLs by status {status}: {e}")
640
+ self.stats['storage_errors'] += 1
641
+ return []
642
+
643
+ def get_urls_by_domain(self, domain: str, limit: int = 100) -> List[URL]:
644
+ """
645
+ Retrieve URLs by domain
646
+
647
+ Args:
648
+ domain: Domain of URLs to retrieve
649
+ limit: Maximum number of URLs to retrieve
650
+
651
+ Returns:
652
+ List of URL objects
653
+ """
654
+ try:
655
+ # Get URLs from MongoDB
656
+ url_docs = list(self.urls_collection.find({'domain': domain}).limit(limit))
657
+
658
+ # Create URL objects from documents
659
+ url_objs = [URL(**doc) for doc in url_docs]
660
+
661
+ # Update statistics
662
+ self.stats['urls_retrieved'] += len(url_objs)
663
+
664
+ return url_objs
665
+ except Exception as e:
666
+ logger.error(f"Error retrieving URLs by domain {domain}: {e}")
667
+ self.stats['storage_errors'] += 1
668
+ return []
669
+
670
+ def store_stats(self, stats: Dict[str, Any]) -> bool:
671
+ """
672
+ Store crawler statistics
673
+
674
+ Args:
675
+ stats: Statistics to store
676
+
677
+ Returns:
678
+ True if successful, False otherwise
679
+ """
680
+ try:
681
+ # Create statistics document
682
+ stats_doc = stats.copy()
683
+ stats_doc['timestamp'] = datetime.datetime.now()
684
+
685
+ # Convert sets to lists for MongoDB
686
+ for key, value in stats_doc.items():
687
+ if isinstance(value, set):
688
+ stats_doc[key] = list(value)
689
+
690
+ # Store in MongoDB
691
+ self.stats_collection.insert_one(stats_doc)
692
+
693
+ return True
694
+ except Exception as e:
695
+ logger.error(f"Error storing statistics: {e}")
696
+ self.stats['storage_errors'] += 1
697
+ return False
698
+
699
+ def _check_disk_space(self) -> bool:
700
+ """
701
+ Check if disk space limit is exceeded
702
+
703
+ Returns:
704
+ True if space is available, False otherwise
705
+ """
706
+ # Convert max disk usage to bytes
707
+ max_bytes = self.max_disk_usage_gb * 1024 * 1024 * 1024
708
+
709
+ # Check if limit is exceeded
710
+ return self.stats['disk_space_used'] < max_bytes
711
+
712
+ def _extract_domain(self, url: str) -> str:
713
+ """Extract domain from URL"""
714
+ parsed = urlparse(url)
715
+ return parsed.netloc.replace(':', '_')
716
+
717
+ def _url_to_filename(self, url: str) -> str:
718
+ """Convert URL to filename"""
719
+ # Hash the URL to create a safe filename
720
+ return hashlib.md5(url.encode('utf-8')).hexdigest()
721
+
722
+     def clean_old_pages(self, days: int = 90) -> int:
+         """
+         Remove pages older than a specified number of days
+
+         Args:
+             days: Number of days after which pages are considered old
+
+         Returns:
+             Number of pages removed
+         """
+         try:
+             # Calculate cutoff date
+             cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days)
+
+             # Find old pages
+             old_pages = list(self.pages_collection.find({
+                 'crawled_at': {'$lt': cutoff_date}
+             }, {'url': 1}))
+
+             if not old_pages:
+                 logger.info(f"No pages older than {days} days found")
+                 return 0
+
+             # Remove from database
+             delete_result = self.pages_collection.delete_many({
+                 'crawled_at': {'$lt': cutoff_date}
+             })
+
+             # Remove content files
+             count = 0
+             for page in old_pages:
+                 url = page['url']
+                 domain = self._extract_domain(url)
+                 filename = self._url_to_filename(url)
+
+                 # Remove local copies (compressed and uncompressed)
+                 compressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.gz")
+                 uncompressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.html")
+
+                 if os.path.exists(compressed_path):
+                     os.remove(compressed_path)
+                     count += 1
+
+                 if os.path.exists(uncompressed_path):
+                     os.remove(uncompressed_path)
+                     count += 1
+
+                 # Remove S3 copies if S3 storage is enabled
+                 if self.s3_client:
+                     s3_key_compressed = f"{domain}/{filename}.gz"
+                     s3_key_uncompressed = f"{domain}/{filename}.html"
+
+                     try:
+                         self.s3_client.delete_object(
+                             Bucket=config.S3_BUCKET,
+                             Key=s3_key_compressed
+                         )
+                         count += 1
+                     except Exception:
+                         # Object may already be gone or S3 may be unreachable; skip it
+                         pass
+
+                     try:
+                         self.s3_client.delete_object(
+                             Bucket=config.S3_BUCKET,
+                             Key=s3_key_uncompressed
+                         )
+                         count += 1
+                     except Exception:
+                         # Object may already be gone or S3 may be unreachable; skip it
+                         pass
+
+             logger.info(f"Removed {delete_result.deleted_count} old pages and {count} content files")
+             return delete_result.deleted_count
+         except Exception as e:
+             logger.error(f"Error cleaning old pages: {e}")
+             self.stats['storage_errors'] += 1
+             return 0
+
+     def clean_failed_urls(self, retries: int = 3) -> int:
+         """
+         Remove URLs that have failed repeatedly
+
+         Args:
+             retries: Number of retries after which a URL is considered permanently failed
+
+         Returns:
+             Number of URLs removed
+         """
+         try:
+             # Delete failed URLs with too many retries
+             delete_result = self.urls_collection.delete_many({
+                 'status': 'FAILED',
+                 'retries': {'$gte': retries}
+             })
+
+             logger.info(f"Removed {delete_result.deleted_count} permanently failed URLs")
+             return delete_result.deleted_count
+         except Exception as e:
+             logger.error(f"Error cleaning failed URLs: {e}")
+             self.stats['storage_errors'] += 1
+             return 0
+
+     def calculate_storage_stats(self) -> Dict[str, Any]:
+         """
+         Calculate storage statistics
+
+         Returns:
+             Dictionary of storage statistics
+         """
+         stats = {
+             'timestamp': datetime.datetime.now(),
+             'pages_count': 0,
+             'urls_count': 0,
+             'disk_space_used_mb': 0,
+             's3_objects_count': 0,
+             'mongodb_size_mb': 0,
+         }
+
+         try:
+             # Count pages and URLs
+             stats['pages_count'] = self.pages_collection.count_documents({})
+             stats['urls_count'] = self.urls_collection.count_documents({})
+
+             # Calculate disk space used
+             total_size = 0
+             for root, _, files in os.walk(config.HTML_STORAGE_PATH):
+                 total_size += sum(os.path.getsize(os.path.join(root, name)) for name in files)
+             stats['disk_space_used_mb'] = total_size / (1024 * 1024)
+
+             # Calculate MongoDB size
+             db_stats = self.db.command('dbStats')
+             stats['mongodb_size_mb'] = db_stats['storageSize'] / (1024 * 1024)
+
+             # Count S3 objects if enabled
+             if self.s3_client:
+                 try:
+                     s3_objects = 0
+                     paginator = self.s3_client.get_paginator('list_objects_v2')
+                     for page in paginator.paginate(Bucket=config.S3_BUCKET):
+                         if 'Contents' in page:
+                             s3_objects += len(page['Contents'])
+                     stats['s3_objects_count'] = s3_objects
+                 except Exception as e:
+                     logger.error(f"Error counting S3 objects: {e}")
+
+             # Update internal statistics
+             self.stats['disk_space_used'] = total_size
+             self.stats['mongodb_size'] = db_stats['storageSize']
+
+             return stats
+         except Exception as e:
+             logger.error(f"Error calculating storage statistics: {e}")
+             self.stats['storage_errors'] += 1
+             return stats
+
+     def close(self) -> None:
+         """Close connections and perform cleanup"""
+         # Flush any pending buffers
+         self.flush_page_buffer()
+         self.flush_url_buffer()
+
+         # Close MongoDB connection
+         if self.mongo_client:
+             self.mongo_client.close()
+             logger.info("MongoDB connection closed")
+
+         # Log final statistics
+         logger.info(f"Storage manager closed. Pages stored: {self.stats['pages_stored']}, URLs stored: {self.stats['urls_stored']}")
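A minimal sketch of how the maintenance methods above could be wired into a periodic job. This is not part of the uploaded files; it assumes the class in storage.py is named StorageManager and can be constructed with its defaults, and the function name run_maintenance is illustrative only:

    from storage import StorageManager  # assumed class name

    def run_maintenance():
        storage = StorageManager()
        try:
            removed_pages = storage.clean_old_pages(days=90)
            removed_urls = storage.clean_failed_urls(retries=3)
            snapshot = storage.calculate_storage_stats()
            storage.store_stats(snapshot)
            print(f"Removed {removed_pages} old pages and {removed_urls} failed URLs")
        finally:
            # Flush buffers and close the MongoDB connection
            storage.close()

    if __name__ == "__main__":
        run_maintenance()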
test_crawler.py ADDED
@@ -0,0 +1,219 @@
+ #!/usr/bin/env python3
+ """
+ Test script for the web crawler - exercises the Redis connection, robots.txt
+ handling, DNS resolver, URL frontier and downloader without requiring MongoDB
+ """
+
+ import os
+ import sys
+ import time
+ import logging
+ import threading
+ from urllib.parse import urlparse
+ import redis
+
+ # Make sure we're in the right directory
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ os.chdir(script_dir)
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
+     handlers=[
+         logging.StreamHandler(sys.stdout),
+         logging.FileHandler(os.path.join(script_dir, 'test_crawler.log'))
+     ]
+ )
+ logger = logging.getLogger("test_crawler")
+
+ # Import our modules
+ import config
+ from frontier import URLFrontier
+ from models import URL, Priority, URLStatus
+ from downloader import HTMLDownloader
+ from parser import HTMLParser
+ from robots import RobotsHandler
+ from dns_resolver import DNSResolver
+
+ # Import local configuration if available
+ try:
+     import local_config
+     # Override config settings with local settings
+     for key in dir(local_config):
+         if key.isupper():
+             setattr(config, key, getattr(local_config, key))
+     logger.info("Loaded local configuration")
+ except ImportError:
+     logger.warning("No local_config.py found - using default config")
+
+ def test_redis():
+     """Test Redis connection"""
+     try:
+         logger.info(f"Testing Redis connection to {config.REDIS_URI}")
+         r = redis.from_url(config.REDIS_URI)
+         r.ping()
+         logger.info("Redis connection successful")
+         return True
+     except Exception as e:
+         logger.error(f"Redis connection failed: {e}")
+         return False
+
+ def test_robots_txt():
+     """Test robots.txt handling"""
+     try:
+         logger.info("Testing robots.txt handling")
+         robots_handler = RobotsHandler()
+         test_urls = [
+             "https://www.google.com/",
+             "https://www.github.com/",
+             "https://sagarnildas.com/",
+         ]
+
+         for url in test_urls:
+             logger.info(f"Checking robots.txt for {url}")
+             allowed, crawl_delay = robots_handler.can_fetch(url)
+             logger.info(f"  Allowed: {allowed}, Crawl delay: {crawl_delay}")
+
+         return True
+     except Exception as e:
+         logger.error(f"Error testing robots.txt: {e}")
+         return False
+
+ def test_dns_resolver():
+     """Test DNS resolver"""
+     try:
+         logger.info("Testing DNS resolver")
+         dns_resolver = DNSResolver()
+         test_domains = [
+             "www.google.com",
+             "www.github.com",
+             "example.com",
+         ]
+
+         for domain in test_domains:
+             logger.info(f"Resolving {domain}")
+             ip = dns_resolver.resolve(f"https://{domain}/")
+             logger.info(f"  IP: {ip}")
+
+         return True
+     except Exception as e:
+         logger.error(f"Error testing DNS resolver: {e}")
+         return False
+
+ def test_url_frontier():
+     """Test URL frontier"""
+     try:
+         logger.info("Testing URL frontier")
+         frontier = URLFrontier()
+
+         # Clear frontier
+         frontier.clear()
+
+         # Add some URLs
+         test_urls = [
+             "https://www.google.com/",
+             "https://www.github.com/",
+             "https://sagarnildas.com/",
+         ]
+
+         for i, url in enumerate(test_urls):
+             url_obj = URL(
+                 url=url,
+                 priority=Priority.MEDIUM,
+                 status=URLStatus.PENDING,
+                 depth=0
+             )
+             added = frontier.add_url(url_obj)
+             logger.info(f"Added {url}: {added}")
+
+         # Check size
+         size = frontier.size()
+         logger.info(f"Frontier size: {size}")
+
+         # Get next URL
+         url = frontier.get_next_url()
+         if url:
+             logger.info(f"Next URL: {url.url} (priority: {url.priority})")
+         else:
+             logger.info("No URL available")
+
+         return True
+     except Exception as e:
+         logger.error(f"Error testing URL frontier: {e}")
+         return False
+
+ def test_downloader():
+     """Test HTML downloader"""
+     try:
+         logger.info("Testing HTML downloader")
+         downloader = HTMLDownloader()
+
+         test_urls = [
+             URL(url="https://sagarnildas.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
+             URL(url="https://www.google.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
+         ]
+
+         for url_obj in test_urls:
+             logger.info(f"Downloading {url_obj.url}")
+             page = downloader.download(url_obj)
+             if page:
+                 logger.info(f"  Downloaded {page.content_length} bytes, status: {page.status_code}")
+                 # Test parsing
+                 parser = HTMLParser()
+                 urls, metadata = parser.parse(page)
+                 logger.info(f"  Extracted {len(urls)} URLs and {len(metadata)} metadata items")
+             else:
+                 logger.info(f"  Download failed: {url_obj.error}")
+
+         return True
+     except Exception as e:
+         logger.error(f"Error testing HTML downloader: {e}")
+         return False
+
+ def run_tests():
+     """Run all tests"""
+     logger.info("Starting crawler component tests")
+
+     tests = [
+         ("Redis", test_redis),
+         ("Robots.txt", test_robots_txt),
+         ("DNS Resolver", test_dns_resolver),
+         ("URL Frontier", test_url_frontier),
+         ("HTML Downloader", test_downloader),
+     ]
+
+     results = []
+     for name, test_func in tests:
+         logger.info(f"\n=== Testing {name} ===")
+         start_time = time.time()
+         success = test_func()
+         elapsed = time.time() - start_time
+
+         result = {
+             "name": name,
+             "success": success,
+             "time": elapsed
+         }
+         results.append(result)
+
+         logger.info(f"=== {name} test {'succeeded' if success else 'failed'} in {elapsed:.2f}s ===\n")
+
+     # Print summary
+     logger.info("\n=== Test Summary ===")
+     all_success = True
+     for result in results:
+         status = "SUCCESS" if result["success"] else "FAILED"
+         logger.info(f"{result['name']}: {status} ({result['time']:.2f}s)")
+         if not result["success"]:
+             all_success = False
+
+     if all_success:
+         logger.info("All tests passed!")
+     else:
+         logger.warning("Some tests failed. Check logs for details.")
+
+     return all_success
+
+ if __name__ == "__main__":
+     run_tests()
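If the test script is driven from CI or a shell pipeline, the boolean returned by run_tests() could be propagated as the process exit code. A minimal sketch, not part of the uploaded file, assuming test_crawler.py is importable from the working directory:

    import sys
    from test_crawler import run_tests

    if __name__ == "__main__":
        # Exit 0 when every component test passes, 1 otherwise
        sys.exit(0 if run_tests() else 1)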